aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2010-12-09 12:17:25 -0500
committerThomas Gleixner <tglx@linutronix.de>2010-12-09 12:17:25 -0500
commitd834a9dcecae834cd6b2bc5e50e1907738d9cf6a (patch)
tree0589d753465d3fe359ba451ba6cb7798df03aaa2 /fs
parenta38c5380ef9f088be9f49b6e4c5d80af8b1b5cd4 (diff)
parentf658bcfb2607bf0808966a69cf74135ce98e5c2d (diff)
Merge branch 'x86/amd-nb' into x86/apic-cleanups
Reason: apic cleanup series depends on x86/apic, x86/amd-nb x86/platform Conflicts: arch/x86/include/asm/io_apic.h Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig13
-rw-r--r--fs/9p/Makefile1
-rw-r--r--fs/9p/acl.c392
-rw-r--r--fs/9p/acl.h49
-rw-r--r--fs/9p/fid.c1
-rw-r--r--fs/9p/v9fs.c22
-rw-r--r--fs/9p/v9fs.h10
-rw-r--r--fs/9p/v9fs_vfs.h4
-rw-r--r--fs/9p/vfs_addr.c30
-rw-r--r--fs/9p/vfs_dir.c4
-rw-r--r--fs/9p/vfs_file.c265
-rw-r--r--fs/9p/vfs_inode.c258
-rw-r--r--fs/9p/vfs_super.c36
-rw-r--r--fs/9p/xattr.c52
-rw-r--r--fs/9p/xattr.h6
-rw-r--r--fs/Kconfig8
-rw-r--r--fs/Kconfig.binfmt4
-rw-r--r--fs/Makefile7
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/adfs/super.c17
-rw-r--r--fs/affs/file.c4
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/affs/super.c23
-rw-r--r--fs/afs/dir.c2
-rw-r--r--fs/afs/flock.c5
-rw-r--r--fs/afs/mntpt.c1
-rw-r--r--fs/afs/super.c24
-rw-r--r--fs/afs/write.c19
-rw-r--r--fs/aio.c14
-rw-r--r--fs/anon_inodes.c16
-rw-r--r--fs/autofs/Kconfig21
-rw-r--r--fs/autofs/Makefile7
-rw-r--r--fs/autofs/autofs_i.h165
-rw-r--r--fs/autofs/dirhash.c250
-rw-r--r--fs/autofs/init.c52
-rw-r--r--fs/autofs/inode.c288
-rw-r--r--fs/autofs/root.c643
-rw-r--r--fs/autofs/symlink.c26
-rw-r--r--fs/autofs/waitq.c205
-rw-r--r--fs/autofs4/dev-ioctl.c1
-rw-r--r--fs/autofs4/init.c8
-rw-r--r--fs/autofs4/inode.c1
-rw-r--r--fs/autofs4/root.c14
-rw-r--r--fs/befs/linuxvfs.c11
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/inode.c13
-rw-r--r--fs/binfmt_misc.c12
-rw-r--r--fs/bio.c23
-rw-r--r--fs/block_dev.c44
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c57
-rw-r--r--fs/btrfs/ctree.h100
-rw-r--r--fs/btrfs/dir-item.c2
-rw-r--r--fs/btrfs/disk-io.c51
-rw-r--r--fs/btrfs/extent-tree.c697
-rw-r--r--fs/btrfs/extent_io.c168
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c4
-rw-r--r--fs/btrfs/free-space-cache.c751
-rw-r--r--fs/btrfs/free-space-cache.h18
-rw-r--r--fs/btrfs/inode.c206
-rw-r--r--fs/btrfs/ioctl.c398
-rw-r--r--fs/btrfs/ioctl.h13
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/relocation.c109
-rw-r--r--fs/btrfs/root-tree.c2
-rw-r--r--fs/btrfs/super.c58
-rw-r--r--fs/btrfs/transaction.c234
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c17
-rw-r--r--fs/btrfs/volumes.c11
-rw-r--r--fs/btrfs/volumes.h1
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/zlib.c5
-rw-r--r--fs/buffer.c36
-rw-r--r--fs/cachefiles/daemon.c1
-rw-r--r--fs/ceph/addr.c9
-rw-r--r--fs/ceph/super.c50
-rw-r--r--fs/char_dev.c1
-rw-r--r--fs/cifs/Kconfig3
-rw-r--r--fs/cifs/README5
-rw-r--r--fs/cifs/TODO2
-rw-r--r--fs/cifs/cifs_debug.c12
-rw-r--r--fs/cifs/cifs_debug.h2
-rw-r--r--fs/cifs/cifs_dfs_ref.c24
-rw-r--r--fs/cifs/cifs_fs_sb.h13
-rw-r--r--fs/cifs/cifsacl.c46
-rw-r--r--fs/cifs/cifsencrypt.c555
-rw-r--r--fs/cifs/cifsfs.c124
-rw-r--r--fs/cifs/cifsfs.h10
-rw-r--r--fs/cifs/cifsglob.h139
-rw-r--r--fs/cifs/cifspdu.h14
-rw-r--r--fs/cifs/cifsproto.h31
-rw-r--r--fs/cifs/cifssmb.c30
-rw-r--r--fs/cifs/cn_cifs.h37
-rw-r--r--fs/cifs/connect.c572
-rw-r--r--fs/cifs/dir.c212
-rw-r--r--fs/cifs/file.c840
-rw-r--r--fs/cifs/fscache.c13
-rw-r--r--fs/cifs/inode.c253
-rw-r--r--fs/cifs/ioctl.c27
-rw-r--r--fs/cifs/link.c372
-rw-r--r--fs/cifs/misc.c59
-rw-r--r--fs/cifs/ntlmssp.h15
-rw-r--r--fs/cifs/readdir.c79
-rw-r--r--fs/cifs/sess.c247
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/cifs/xattr.c60
-rw-r--r--fs/coda/cache.c17
-rw-r--r--fs/coda/cnode.c19
-rw-r--r--fs/coda/dir.c157
-rw-r--r--fs/coda/file.c31
-rw-r--r--fs/coda/inode.c65
-rw-r--r--fs/coda/pioctl.c23
-rw-r--r--fs/coda/psdev.c42
-rw-r--r--fs/coda/symlink.c3
-rw-r--r--fs/coda/upcall.c89
-rw-r--r--fs/compat.c46
-rw-r--r--fs/compat_ioctl.c93
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/configfs/mount.c8
-rw-r--r--fs/cramfs/inode.c9
-rw-r--r--fs/dcache.c277
-rw-r--r--fs/debugfs/file.c3
-rw-r--r--fs/debugfs/inode.c9
-rw-r--r--fs/devpts/inode.c32
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/debug_fs.c3
-rw-r--r--fs/dlm/lock.c3
-rw-r--r--fs/dlm/plock.c3
-rw-r--r--fs/dlm/user.c3
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h3
-rw-r--r--fs/ecryptfs/file.c4
-rw-r--r--fs/ecryptfs/inode.c11
-rw-r--r--fs/ecryptfs/keystore.c45
-rw-r--r--fs/ecryptfs/main.c20
-rw-r--r--fs/ecryptfs/miscdev.c1
-rw-r--r--fs/ecryptfs/super.c2
-rw-r--r--fs/efs/super.c8
-rw-r--r--fs/eventfd.c1
-rw-r--r--fs/eventpoll.c38
-rw-r--r--fs/exec.c173
-rw-r--r--fs/exofs/dir.c4
-rw-r--r--fs/exofs/file.c6
-rw-r--r--fs/exofs/inode.c78
-rw-r--r--fs/exofs/ios.c10
-rw-r--r--fs/exofs/namei.c2
-rw-r--r--fs/exofs/super.c10
-rw-r--r--fs/exportfs/expfs.c17
-rw-r--r--fs/ext2/balloc.c3
-rw-r--r--fs/ext2/dir.c2
-rw-r--r--fs/ext2/ext2.h1
-rw-r--r--fs/ext2/inode.c15
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c18
-rw-r--r--fs/ext2/xattr.c2
-rw-r--r--fs/ext3/balloc.c17
-rw-r--r--fs/ext3/fsync.c3
-rw-r--r--fs/ext3/ialloc.c11
-rw-r--r--fs/ext3/inode.c24
-rw-r--r--fs/ext3/namei.c2
-rw-r--r--fs/ext3/resize.c13
-rw-r--r--fs/ext3/super.c62
-rw-r--r--fs/ext4/Makefile2
-rw-r--r--fs/ext4/balloc.c5
-rw-r--r--fs/ext4/block_validity.c7
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h112
-rw-r--r--fs/ext4/ext4_extents.h65
-rw-r--r--fs/ext4/extents.c368
-rw-r--r--fs/ext4/file.c44
-rw-r--r--fs/ext4/fsync.c88
-rw-r--r--fs/ext4/ialloc.c135
-rw-r--r--fs/ext4/inode.c599
-rw-r--r--fs/ext4/mballoc.c555
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c22
-rw-r--r--fs/ext4/namei.c65
-rw-r--r--fs/ext4/page-io.c431
-rw-r--r--fs/ext4/resize.c52
-rw-r--r--fs/ext4/super.c630
-rw-r--r--fs/ext4/xattr.c4
-rw-r--r--fs/ext4/xattr.h10
-rw-r--r--fs/fat/fatent.c3
-rw-r--r--fs/fat/inode.c5
-rw-r--r--fs/fat/misc.c5
-rw-r--r--fs/fat/namei_msdos.c15
-rw-r--r--fs/fat/namei_vfat.c15
-rw-r--r--fs/fcntl.c62
-rw-r--r--fs/fifo.c1
-rw-r--r--fs/file_table.c17
-rw-r--r--fs/freevxfs/vxfs_inode.c1
-rw-r--r--fs/freevxfs/vxfs_lookup.c14
-rw-r--r--fs/freevxfs/vxfs_super.c16
-rw-r--r--fs/fs-writeback.c144
-rw-r--r--fs/fuse/control.c15
-rw-r--r--fs/fuse/cuse.c1
-rw-r--r--fs/fuse/dev.c19
-rw-r--r--fs/fuse/inode.c17
-rw-r--r--fs/gfs2/aops.c3
-rw-r--r--fs/gfs2/export.c46
-rw-r--r--fs/gfs2/file.c4
-rw-r--r--fs/gfs2/glock.c21
-rw-r--r--fs/gfs2/inode.c152
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/log.c19
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/meta_io.c2
-rw-r--r--fs/gfs2/ops_fstype.c52
-rw-r--r--fs/gfs2/ops_inode.c8
-rw-r--r--fs/gfs2/rgrp.c97
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/hfs/hfs_fs.h13
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfs/mdb.c4
-rw-r--r--fs/hfs/super.c16
-rw-r--r--fs/hfsplus/dir.c6
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c2
-rw-r--r--fs/hfsplus/super.c10
-rw-r--r--fs/hostfs/hostfs.h10
-rw-r--r--fs/hostfs/hostfs_kern.c10
-rw-r--r--fs/hostfs/hostfs_user.c14
-rw-r--r--fs/hpfs/Kconfig1
-rw-r--r--fs/hpfs/buffer.c4
-rw-r--r--fs/hpfs/hpfs_fn.h2
-rw-r--r--fs/hpfs/super.c19
-rw-r--r--fs/hppfs/hppfs.c9
-rw-r--r--fs/hugetlbfs/inode.c28
-rw-r--r--fs/inode.c527
-rw-r--r--fs/internal.h7
-rw-r--r--fs/ioctl.c39
-rw-r--r--fs/ioprio.c18
-rw-r--r--fs/isofs/dir.c6
-rw-r--r--fs/isofs/inode.c75
-rw-r--r--fs/isofs/isofs.h1
-rw-r--r--fs/isofs/namei.c8
-rw-r--r--fs/isofs/rock.c10
-rw-r--r--fs/jbd/checkpoint.c4
-rw-r--r--fs/jbd/commit.c40
-rw-r--r--fs/jbd/journal.c44
-rw-r--r--fs/jbd/recovery.c2
-rw-r--r--fs/jbd/transaction.c6
-rw-r--r--fs/jbd2/checkpoint.c13
-rw-r--r--fs/jbd2/commit.c88
-rw-r--r--fs/jbd2/journal.c14
-rw-r--r--fs/jbd2/transaction.c1
-rw-r--r--fs/jffs2/build.c2
-rw-r--r--fs/jffs2/compr.c6
-rw-r--r--fs/jffs2/compr.h4
-rw-r--r--fs/jffs2/compr_lzo.c4
-rw-r--r--fs/jffs2/compr_rtime.c6
-rw-r--r--fs/jffs2/compr_rubin.c11
-rw-r--r--fs/jffs2/compr_zlib.c6
-rw-r--r--fs/jffs2/dir.c7
-rw-r--r--fs/jffs2/erase.c2
-rw-r--r--fs/jffs2/fs.c26
-rw-r--r--fs/jffs2/gc.c7
-rw-r--r--fs/jffs2/jffs2_fs_sb.h1
-rw-r--r--fs/jffs2/nodelist.c8
-rw-r--r--fs/jffs2/nodelist.h3
-rw-r--r--fs/jffs2/scan.c12
-rw-r--r--fs/jffs2/super.c18
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/jfs_logmgr.c6
-rw-r--r--fs/jfs/jfs_mount.c4
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/super.c32
-rw-r--r--fs/libfs.c22
-rw-r--r--fs/lockd/clntlock.c15
-rw-r--r--fs/lockd/clntproc.c13
-rw-r--r--fs/lockd/host.c1
-rw-r--r--fs/lockd/mon.c1
-rw-r--r--fs/lockd/svc.c13
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svclock.c37
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/lockd/svcsubs.c9
-rw-r--r--fs/locks.c250
-rw-r--r--fs/logfs/dev_bdev.c15
-rw-r--r--fs/logfs/dev_mtd.c18
-rw-r--r--fs/logfs/dir.c3
-rw-r--r--fs/logfs/logfs.h22
-rw-r--r--fs/logfs/super.c77
-rw-r--r--fs/minix/inode.c9
-rw-r--r--fs/minix/namei.c2
-rw-r--r--fs/namei.c18
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/ncpfs/dir.c221
-rw-r--r--fs/ncpfs/file.c25
-rw-r--r--fs/ncpfs/inode.c63
-rw-r--r--fs/ncpfs/ioctl.c470
-rw-r--r--fs/ncpfs/ncplib_kernel.c101
-rw-r--r--fs/ncpfs/ncplib_kernel.h15
-rw-r--r--fs/ncpfs/ncpsign_kernel.c10
-rw-r--r--fs/ncpfs/sock.c1
-rw-r--r--fs/nfs/Kconfig19
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/callback.c4
-rw-r--r--fs/nfs/callback_proc.c8
-rw-r--r--fs/nfs/client.c28
-rw-r--r--fs/nfs/delegation.c10
-rw-r--r--fs/nfs/dir.c1015
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/dns_resolve.c6
-rw-r--r--fs/nfs/file.c87
-rw-r--r--fs/nfs/getroot.c3
-rw-r--r--fs/nfs/idmap.c211
-rw-r--r--fs/nfs/inode.c39
-rw-r--r--fs/nfs/internal.h12
-rw-r--r--fs/nfs/mount_clnt.c4
-rw-r--r--fs/nfs/nfs2xdr.c107
-rw-r--r--fs/nfs/nfs3proc.c62
-rw-r--r--fs/nfs/nfs3xdr.c196
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4filelayout.c280
-rw-r--r--fs/nfs/nfs4filelayout.h94
-rw-r--r--fs/nfs/nfs4filelayoutdev.c448
-rw-r--r--fs/nfs/nfs4proc.c497
-rw-r--r--fs/nfs/nfs4state.c52
-rw-r--r--fs/nfs/nfs4xdr.c700
-rw-r--r--fs/nfs/nfsroot.c568
-rw-r--r--fs/nfs/pagelist.c8
-rw-r--r--fs/nfs/pnfs.c783
-rw-r--r--fs/nfs/pnfs.h189
-rw-r--r--fs/nfs/proc.c35
-rw-r--r--fs/nfs/read.c4
-rw-r--r--fs/nfs/super.c168
-rw-r--r--fs/nfs/sysctl.c2
-rw-r--r--fs/nfs/unlink.c259
-rw-r--r--fs/nfs/write.c22
-rw-r--r--fs/nfsd/Kconfig12
-rw-r--r--fs/nfsd/export.c73
-rw-r--r--fs/nfsd/nfs4callback.c245
-rw-r--r--fs/nfsd/nfs4idmap.c105
-rw-r--r--fs/nfsd/nfs4proc.c7
-rw-r--r--fs/nfsd/nfs4state.c553
-rw-r--r--fs/nfsd/nfs4xdr.c18
-rw-r--r--fs/nfsd/nfsctl.c35
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfssvc.c5
-rw-r--r--fs/nfsd/state.h52
-rw-r--r--fs/nfsd/vfs.c16
-rw-r--r--fs/nilfs2/Makefile2
-rw-r--r--fs/nilfs2/bmap.c22
-rw-r--r--fs/nilfs2/bmap.h10
-rw-r--r--fs/nilfs2/btnode.c17
-rw-r--r--fs/nilfs2/cpfile.c72
-rw-r--r--fs/nilfs2/cpfile.h4
-rw-r--r--fs/nilfs2/dat.c92
-rw-r--r--fs/nilfs2/dat.h4
-rw-r--r--fs/nilfs2/export.h17
-rw-r--r--fs/nilfs2/gcdat.c87
-rw-r--r--fs/nilfs2/gcinode.c134
-rw-r--r--fs/nilfs2/ifile.c51
-rw-r--r--fs/nilfs2/ifile.h4
-rw-r--r--fs/nilfs2/inode.c167
-rw-r--r--fs/nilfs2/ioctl.c24
-rw-r--r--fs/nilfs2/mdt.c313
-rw-r--r--fs/nilfs2/mdt.h32
-rw-r--r--fs/nilfs2/namei.c141
-rw-r--r--fs/nilfs2/nilfs.h38
-rw-r--r--fs/nilfs2/page.c55
-rw-r--r--fs/nilfs2/page.h6
-rw-r--r--fs/nilfs2/recovery.c19
-rw-r--r--fs/nilfs2/sb.h10
-rw-r--r--fs/nilfs2/segbuf.c3
-rw-r--r--fs/nilfs2/segment.c104
-rw-r--r--fs/nilfs2/segment.h10
-rw-r--r--fs/nilfs2/sufile.c77
-rw-r--r--fs/nilfs2/sufile.h6
-rw-r--r--fs/nilfs2/super.c639
-rw-r--r--fs/nilfs2/the_nilfs.c346
-rw-r--r--fs/nilfs2/the_nilfs.h101
-rw-r--r--fs/no-block.c1
-rw-r--r--fs/notify/Kconfig2
-rw-r--r--fs/notify/fanotify/fanotify.c27
-rw-r--r--fs/notify/fanotify/fanotify_user.c99
-rw-r--r--fs/notify/fsnotify.c68
-rw-r--r--fs/notify/inode_mark.c11
-rw-r--r--fs/notify/inotify/inotify_user.c3
-rw-r--r--fs/notify/vfsmount_mark.c6
-rw-r--r--fs/ntfs/super.c52
-rw-r--r--fs/ocfs2/aops.c19
-rw-r--r--fs/ocfs2/aops.h3
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c11
-rw-r--r--fs/ocfs2/file.c12
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h6
-rw-r--r--fs/ocfs2/stack_user.c4
-rw-r--r--fs/ocfs2/super.c18
-rw-r--r--fs/omfs/inode.c9
-rw-r--r--fs/open.c6
-rw-r--r--fs/openpromfs/inode.c8
-rw-r--r--fs/partitions/check.c42
-rw-r--r--fs/partitions/check.h3
-rw-r--r--fs/partitions/efi.c25
-rw-r--r--fs/partitions/ldm.c2
-rw-r--r--fs/partitions/ldm.h2
-rw-r--r--fs/pipe.c13
-rw-r--r--fs/proc/Kconfig4
-rw-r--r--fs/proc/base.c119
-rw-r--r--fs/proc/proc_sysctl.c3
-rw-r--r--fs/proc/root.c17
-rw-r--r--fs/proc/softirqs.c4
-rw-r--r--fs/proc/stat.c14
-rw-r--r--fs/proc/task_mmu.c7
-rw-r--r--fs/qnx4/dir.c4
-rw-r--r--fs/qnx4/inode.c15
-rw-r--r--fs/qnx4/namei.c4
-rw-r--r--fs/quota/Kconfig4
-rw-r--r--fs/quota/dquot.c30
-rw-r--r--fs/ramfs/inode.c18
-rw-r--r--fs/read_write.c95
-rw-r--r--fs/reiserfs/Kconfig6
-rw-r--r--fs/reiserfs/README2
-rw-r--r--fs/reiserfs/file.c3
-rw-r--r--fs/reiserfs/inode.c26
-rw-r--r--fs/reiserfs/ioctl.c6
-rw-r--r--fs/reiserfs/journal.c106
-rw-r--r--fs/reiserfs/namei.c2
-rw-r--r--fs/reiserfs/super.c9
-rw-r--r--fs/reiserfs/xattr.c7
-rw-r--r--fs/romfs/super.c18
-rw-r--r--fs/select.c6
-rw-r--r--fs/seq_file.c8
-rw-r--r--fs/signalfd.c11
-rw-r--r--fs/smbfs/Kconfig55
-rw-r--r--fs/smbfs/Makefile18
-rw-r--r--fs/smbfs/cache.c208
-rw-r--r--fs/smbfs/dir.c702
-rw-r--r--fs/smbfs/file.c454
-rw-r--r--fs/smbfs/getopt.c64
-rw-r--r--fs/smbfs/getopt.h14
-rw-r--r--fs/smbfs/inode.c839
-rw-r--r--fs/smbfs/ioctl.c69
-rw-r--r--fs/smbfs/proc.c3507
-rw-r--r--fs/smbfs/proto.h87
-rw-r--r--fs/smbfs/request.c818
-rw-r--r--fs/smbfs/request.h70
-rw-r--r--fs/smbfs/smb_debug.h34
-rw-r--r--fs/smbfs/smbiod.c344
-rw-r--r--fs/smbfs/sock.c386
-rw-r--r--fs/smbfs/symlink.c68
-rw-r--r--fs/squashfs/dir.c3
-rw-r--r--fs/squashfs/super.c15
-rw-r--r--fs/squashfs/xattr.c9
-rw-r--r--fs/squashfs/xattr.h4
-rw-r--r--fs/squashfs/xattr_id.c1
-rw-r--r--fs/super.c119
-rw-r--r--fs/sysfs/bin.c68
-rw-r--r--fs/sysfs/mount.c32
-rw-r--r--fs/sysv/namei.c2
-rw-r--r--fs/sysv/super.c17
-rw-r--r--fs/timerfd.c1
-rw-r--r--fs/ubifs/commit.c4
-rw-r--r--fs/ubifs/debug.c157
-rw-r--r--fs/ubifs/debug.h4
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c7
-rw-r--r--fs/ubifs/gc.c82
-rw-r--r--fs/ubifs/io.c20
-rw-r--r--fs/ubifs/journal.c3
-rw-r--r--fs/ubifs/key.h14
-rw-r--r--fs/ubifs/log.c6
-rw-r--r--fs/ubifs/lpt.c7
-rw-r--r--fs/ubifs/lpt_commit.c3
-rw-r--r--fs/ubifs/master.c3
-rw-r--r--fs/ubifs/misc.h9
-rw-r--r--fs/ubifs/recovery.c11
-rw-r--r--fs/ubifs/replay.c20
-rw-r--r--fs/ubifs/sb.c9
-rw-r--r--fs/ubifs/scan.c6
-rw-r--r--fs/ubifs/shrinker.c2
-rw-r--r--fs/ubifs/super.c93
-rw-r--r--fs/ubifs/tnc.c5
-rw-r--r--fs/ubifs/ubifs.h23
-rw-r--r--fs/udf/Kconfig1
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/udf/super.c17
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/ufs/namei.c2
-rw-r--r--fs/ufs/super.c13
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c240
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h81
-rw-r--r--fs/xfs/linux-2.6/xfs_cred.h28
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c31
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.h23
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c21
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.h6
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c46
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h5
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c44
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c414
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h5
-rw-r--r--fs/xfs/linux-2.6/xfs_version.h29
-rw-r--r--fs/xfs/quota/xfs_dquot.c164
-rw-r--r--fs/xfs/quota/xfs_qm.c221
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c16
-rw-r--r--fs/xfs/xfs_ag.h9
-rw-r--r--fs/xfs/xfs_alloc.c4
-rw-r--r--fs/xfs/xfs_alloc_btree.c33
-rw-r--r--fs/xfs/xfs_attr.c37
-rw-r--r--fs/xfs/xfs_bmap.c44
-rw-r--r--fs/xfs/xfs_bmap.h9
-rw-r--r--fs/xfs/xfs_btree.c56
-rw-r--r--fs/xfs/xfs_btree.h14
-rw-r--r--fs/xfs/xfs_buf_item.c7
-rw-r--r--fs/xfs/xfs_da_btree.c2
-rw-r--r--fs/xfs/xfs_dinode.h5
-rw-r--r--fs/xfs/xfs_dir2_leaf.c2
-rw-r--r--fs/xfs/xfs_filestream.c8
-rw-r--r--fs/xfs/xfs_fs.h7
-rw-r--r--fs/xfs/xfs_fsops.c14
-rw-r--r--fs/xfs/xfs_ialloc.c2
-rw-r--r--fs/xfs/xfs_ialloc_btree.c33
-rw-r--r--fs/xfs/xfs_iget.c4
-rw-r--r--fs/xfs/xfs_inode.c17
-rw-r--r--fs/xfs/xfs_inode.h32
-rw-r--r--fs/xfs/xfs_inode_item.c9
-rw-r--r--fs/xfs/xfs_itable.c3
-rw-r--r--fs/xfs/xfs_log.c18
-rw-r--r--fs/xfs/xfs_log_cil.c232
-rw-r--r--fs/xfs/xfs_log_recover.c25
-rw-r--r--fs/xfs/xfs_mount.c307
-rw-r--r--fs/xfs/xfs_mount.h9
-rw-r--r--fs/xfs/xfs_quota.h20
-rw-r--r--fs/xfs/xfs_refcache.h52
-rw-r--r--fs/xfs/xfs_rename.c14
-rw-r--r--fs/xfs/xfs_rtalloc.c29
-rw-r--r--fs/xfs/xfs_sb.h10
-rw-r--r--fs/xfs/xfs_trans.c91
-rw-r--r--fs/xfs/xfs_trans.h3
-rw-r--r--fs/xfs/xfs_trans_buf.c2
-rw-r--r--fs/xfs/xfs_trans_inode.c30
-rw-r--r--fs/xfs/xfs_types.h2
-rw-r--r--fs/xfs/xfs_utils.c9
-rw-r--r--fs/xfs/xfs_utils.h3
-rw-r--r--fs/xfs/xfs_vnodeops.c65
-rw-r--r--fs/xfs/xfs_vnodeops.h6
550 files changed, 19631 insertions, 20310 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 795233702a4e..7e0511476797 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -17,3 +17,16 @@ config 9P_FSCACHE
17 Choose Y here to enable persistent, read-only local 17 Choose Y here to enable persistent, read-only local
18 caching support for 9p clients using FS-Cache 18 caching support for 9p clients using FS-Cache
19 19
20
21config 9P_FS_POSIX_ACL
22 bool "9P POSIX Access Control Lists"
23 depends on 9P_FS
24 select FS_POSIX_ACL
25 help
26 POSIX Access Control Lists (ACLs) support permissions for users and
27 groups beyond the owner/group/world scheme.
28
29 To learn more about Access Control Lists, visit the POSIX ACLs for
30 Linux website <http://acl.bestbits.at/>.
31
32 If you don't know what Access Control Lists are, say N
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 91fba025fcbe..f8ba37effd1b 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_9P_FS) := 9p.o
13 xattr_user.o 13 xattr_user.o
14 14
159p-$(CONFIG_9P_FSCACHE) += cache.o 159p-$(CONFIG_9P_FSCACHE) += cache.o
169p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 000000000000..12d602351dbe
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,392 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/fs.h>
17#include <net/9p/9p.h>
18#include <net/9p/client.h>
19#include <linux/slab.h>
20#include <linux/sched.h>
21#include <linux/posix_acl_xattr.h>
22#include "xattr.h"
23#include "acl.h"
24#include "v9fs_vfs.h"
25#include "v9fs.h"
26
27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
28{
29 ssize_t size;
30 void *value = NULL;
31 struct posix_acl *acl = NULL;;
32
33 size = v9fs_fid_xattr_get(fid, name, NULL, 0);
34 if (size > 0) {
35 value = kzalloc(size, GFP_NOFS);
36 if (!value)
37 return ERR_PTR(-ENOMEM);
38 size = v9fs_fid_xattr_get(fid, name, value, size);
39 if (size > 0) {
40 acl = posix_acl_from_xattr(value, size);
41 if (IS_ERR(acl))
42 goto err_out;
43 }
44 } else if (size == -ENODATA || size == 0 ||
45 size == -ENOSYS || size == -EOPNOTSUPP) {
46 acl = NULL;
47 } else
48 acl = ERR_PTR(-EIO);
49
50err_out:
51 kfree(value);
52 return acl;
53}
54
55int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
56{
57 int retval = 0;
58 struct posix_acl *pacl, *dacl;
59 struct v9fs_session_info *v9ses;
60
61 v9ses = v9fs_inode2v9ses(inode);
62 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
63 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
64 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
65 return 0;
66 }
67 /* get the default/access acl values and cache them */
68 dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
69 pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
70
71 if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
72 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
73 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
74 posix_acl_release(dacl);
75 posix_acl_release(pacl);
76 } else
77 retval = -EIO;
78
79 return retval;
80}
81
82static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
83{
84 struct posix_acl *acl;
85 /*
86 * 9p Always cache the acl value when
87 * instantiating the inode (v9fs_inode_from_fid)
88 */
89 acl = get_cached_acl(inode, type);
90 BUG_ON(acl == ACL_NOT_CACHED);
91 return acl;
92}
93
94int v9fs_check_acl(struct inode *inode, int mask)
95{
96 struct posix_acl *acl;
97 struct v9fs_session_info *v9ses;
98
99 v9ses = v9fs_inode2v9ses(inode);
100 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
101 /*
102 * On access = client mode get the acl
103 * values from the server
104 */
105 return 0;
106 }
107 acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
108
109 if (IS_ERR(acl))
110 return PTR_ERR(acl);
111 if (acl) {
112 int error = posix_acl_permission(inode, acl, mask);
113 posix_acl_release(acl);
114 return error;
115 }
116 return -EAGAIN;
117}
118
119static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
120{
121 int retval;
122 char *name;
123 size_t size;
124 void *buffer;
125 struct inode *inode = dentry->d_inode;
126
127 set_cached_acl(inode, type, acl);
128 /* Set a setxattr request to server */
129 size = posix_acl_xattr_size(acl->a_count);
130 buffer = kmalloc(size, GFP_KERNEL);
131 if (!buffer)
132 return -ENOMEM;
133 retval = posix_acl_to_xattr(acl, buffer, size);
134 if (retval < 0)
135 goto err_free_out;
136 switch (type) {
137 case ACL_TYPE_ACCESS:
138 name = POSIX_ACL_XATTR_ACCESS;
139 break;
140 case ACL_TYPE_DEFAULT:
141 name = POSIX_ACL_XATTR_DEFAULT;
142 break;
143 default:
144 BUG();
145 }
146 retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
147err_free_out:
148 kfree(buffer);
149 return retval;
150}
151
152int v9fs_acl_chmod(struct dentry *dentry)
153{
154 int retval = 0;
155 struct posix_acl *acl, *clone;
156 struct inode *inode = dentry->d_inode;
157
158 if (S_ISLNK(inode->i_mode))
159 return -EOPNOTSUPP;
160 acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
161 if (acl) {
162 clone = posix_acl_clone(acl, GFP_KERNEL);
163 posix_acl_release(acl);
164 if (!clone)
165 return -ENOMEM;
166 retval = posix_acl_chmod_masq(clone, inode->i_mode);
167 if (!retval)
168 retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone);
169 posix_acl_release(clone);
170 }
171 return retval;
172}
173
174int v9fs_set_create_acl(struct dentry *dentry,
175 struct posix_acl *dpacl, struct posix_acl *pacl)
176{
177 if (dpacl)
178 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
179 if (pacl)
180 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
181 posix_acl_release(dpacl);
182 posix_acl_release(pacl);
183 return 0;
184}
185
186int v9fs_acl_mode(struct inode *dir, mode_t *modep,
187 struct posix_acl **dpacl, struct posix_acl **pacl)
188{
189 int retval = 0;
190 mode_t mode = *modep;
191 struct posix_acl *acl = NULL;
192
193 if (!S_ISLNK(mode)) {
194 acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
195 if (IS_ERR(acl))
196 return PTR_ERR(acl);
197 if (!acl)
198 mode &= ~current_umask();
199 }
200 if (acl) {
201 struct posix_acl *clone;
202
203 if (S_ISDIR(mode))
204 *dpacl = acl;
205 clone = posix_acl_clone(acl, GFP_NOFS);
206 retval = -ENOMEM;
207 if (!clone)
208 goto cleanup;
209
210 retval = posix_acl_create_masq(clone, &mode);
211 if (retval < 0) {
212 posix_acl_release(clone);
213 goto cleanup;
214 }
215 if (retval > 0)
216 *pacl = clone;
217 }
218 *modep = mode;
219 return 0;
220cleanup:
221 posix_acl_release(acl);
222 return retval;
223
224}
225
226static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
227 void *buffer, size_t size, int type)
228{
229 char *full_name;
230
231 switch (type) {
232 case ACL_TYPE_ACCESS:
233 full_name = POSIX_ACL_XATTR_ACCESS;
234 break;
235 case ACL_TYPE_DEFAULT:
236 full_name = POSIX_ACL_XATTR_DEFAULT;
237 break;
238 default:
239 BUG();
240 }
241 return v9fs_xattr_get(dentry, full_name, buffer, size);
242}
243
244static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
245 void *buffer, size_t size, int type)
246{
247 struct v9fs_session_info *v9ses;
248 struct posix_acl *acl;
249 int error;
250
251 if (strcmp(name, "") != 0)
252 return -EINVAL;
253
254 v9ses = v9fs_inode2v9ses(dentry->d_inode);
255 /*
256 * We allow set/get/list of acl when access=client is not specified
257 */
258 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
259 return v9fs_remote_get_acl(dentry, name, buffer, size, type);
260
261 acl = v9fs_get_cached_acl(dentry->d_inode, type);
262 if (IS_ERR(acl))
263 return PTR_ERR(acl);
264 if (acl == NULL)
265 return -ENODATA;
266 error = posix_acl_to_xattr(acl, buffer, size);
267 posix_acl_release(acl);
268
269 return error;
270}
271
272static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
273 const void *value, size_t size,
274 int flags, int type)
275{
276 char *full_name;
277
278 switch (type) {
279 case ACL_TYPE_ACCESS:
280 full_name = POSIX_ACL_XATTR_ACCESS;
281 break;
282 case ACL_TYPE_DEFAULT:
283 full_name = POSIX_ACL_XATTR_DEFAULT;
284 break;
285 default:
286 BUG();
287 }
288 return v9fs_xattr_set(dentry, full_name, value, size, flags);
289}
290
291
292static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
293 const void *value, size_t size,
294 int flags, int type)
295{
296 int retval;
297 struct posix_acl *acl;
298 struct v9fs_session_info *v9ses;
299 struct inode *inode = dentry->d_inode;
300
301 if (strcmp(name, "") != 0)
302 return -EINVAL;
303
304 v9ses = v9fs_inode2v9ses(dentry->d_inode);
305 /*
306 * set the attribute on the remote. Without even looking at the
307 * xattr value. We leave it to the server to validate
308 */
309 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
310 return v9fs_remote_set_acl(dentry, name,
311 value, size, flags, type);
312
313 if (S_ISLNK(inode->i_mode))
314 return -EOPNOTSUPP;
315 if (!is_owner_or_cap(inode))
316 return -EPERM;
317 if (value) {
318 /* update the cached acl value */
319 acl = posix_acl_from_xattr(value, size);
320 if (IS_ERR(acl))
321 return PTR_ERR(acl);
322 else if (acl) {
323 retval = posix_acl_valid(acl);
324 if (retval)
325 goto err_out;
326 }
327 } else
328 acl = NULL;
329
330 switch (type) {
331 case ACL_TYPE_ACCESS:
332 name = POSIX_ACL_XATTR_ACCESS;
333 if (acl) {
334 mode_t mode = inode->i_mode;
335 retval = posix_acl_equiv_mode(acl, &mode);
336 if (retval < 0)
337 goto err_out;
338 else {
339 struct iattr iattr;
340 if (retval == 0) {
341 /*
342 * ACL can be represented
343 * by the mode bits. So don't
344 * update ACL.
345 */
346 acl = NULL;
347 value = NULL;
348 size = 0;
349 }
350 /* Updte the mode bits */
351 iattr.ia_mode = ((mode & S_IALLUGO) |
352 (inode->i_mode & ~S_IALLUGO));
353 iattr.ia_valid = ATTR_MODE;
354 /* FIXME should we update ctime ?
355 * What is the following setxattr update the
356 * mode ?
357 */
358 v9fs_vfs_setattr_dotl(dentry, &iattr);
359 }
360 }
361 break;
362 case ACL_TYPE_DEFAULT:
363 name = POSIX_ACL_XATTR_DEFAULT;
364 if (!S_ISDIR(inode->i_mode)) {
365 retval = -EINVAL;
366 goto err_out;
367 }
368 break;
369 default:
370 BUG();
371 }
372 retval = v9fs_xattr_set(dentry, name, value, size, flags);
373 if (!retval)
374 set_cached_acl(inode, type, acl);
375err_out:
376 posix_acl_release(acl);
377 return retval;
378}
379
380const struct xattr_handler v9fs_xattr_acl_access_handler = {
381 .prefix = POSIX_ACL_XATTR_ACCESS,
382 .flags = ACL_TYPE_ACCESS,
383 .get = v9fs_xattr_get_acl,
384 .set = v9fs_xattr_set_acl,
385};
386
387const struct xattr_handler v9fs_xattr_acl_default_handler = {
388 .prefix = POSIX_ACL_XATTR_DEFAULT,
389 .flags = ACL_TYPE_DEFAULT,
390 .get = v9fs_xattr_get_acl,
391 .set = v9fs_xattr_set_acl,
392};
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
new file mode 100644
index 000000000000..59e18c2e8c7e
--- /dev/null
+++ b/fs/9p/acl.h
@@ -0,0 +1,49 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14#ifndef FS_9P_ACL_H
15#define FS_9P_ACL_H
16
17#ifdef CONFIG_9P_FS_POSIX_ACL
18extern int v9fs_get_acl(struct inode *, struct p9_fid *);
19extern int v9fs_check_acl(struct inode *inode, int mask);
20extern int v9fs_acl_chmod(struct dentry *);
21extern int v9fs_set_create_acl(struct dentry *,
22 struct posix_acl *, struct posix_acl *);
23extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
24 struct posix_acl **dpacl, struct posix_acl **pacl);
25#else
26#define v9fs_check_acl NULL
27static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
28{
29 return 0;
30}
31static inline int v9fs_acl_chmod(struct dentry *dentry)
32{
33 return 0;
34}
35static inline int v9fs_set_create_acl(struct dentry *dentry,
36 struct posix_acl *dpacl,
37 struct posix_acl *pacl)
38{
39 return 0;
40}
41static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
42 struct posix_acl **dpacl,
43 struct posix_acl **pacl)
44{
45 return 0;
46}
47
48#endif
49#endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 6406f896bf95..b00223c99d70 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -149,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
149 switch (access) { 149 switch (access) {
150 case V9FS_ACCESS_SINGLE: 150 case V9FS_ACCESS_SINGLE:
151 case V9FS_ACCESS_USER: 151 case V9FS_ACCESS_USER:
152 case V9FS_ACCESS_CLIENT:
152 uid = current_fsuid(); 153 uid = current_fsuid();
153 any = 0; 154 any = 0;
154 break; 155 break;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 38dc0e067599..2f77cd33ba83 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
193 v9ses->flags |= V9FS_ACCESS_USER; 193 v9ses->flags |= V9FS_ACCESS_USER;
194 else if (strcmp(s, "any") == 0) 194 else if (strcmp(s, "any") == 0)
195 v9ses->flags |= V9FS_ACCESS_ANY; 195 v9ses->flags |= V9FS_ACCESS_ANY;
196 else { 196 else if (strcmp(s, "client") == 0) {
197#ifdef CONFIG_9P_FS_POSIX_ACL
198 v9ses->flags |= V9FS_ACCESS_CLIENT;
199#else
200 P9_DPRINTK(P9_DEBUG_ERROR,
201 "access=client option not supported\n");
202 kfree(s);
203 ret = -EINVAL;
204 goto free_and_return;
205#endif
206 } else {
197 v9ses->flags |= V9FS_ACCESS_SINGLE; 207 v9ses->flags |= V9FS_ACCESS_SINGLE;
198 v9ses->uid = simple_strtoul(s, &e, 10); 208 v9ses->uid = simple_strtoul(s, &e, 10);
199 if (*e != '\0') 209 if (*e != '\0')
@@ -278,6 +288,16 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
278 288
279 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; 289 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
280 290
291 if (!v9fs_proto_dotl(v9ses) &&
292 ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
293 /*
294 * We support ACCESS_CLIENT only for dotl.
295 * Fall back to ACCESS_USER
296 */
297 v9ses->flags &= ~V9FS_ACCESS_MASK;
298 v9ses->flags |= V9FS_ACCESS_USER;
299 }
300 /*FIXME !! */
281 /* for legacy mode, fall back to V9FS_ACCESS_ANY */ 301 /* for legacy mode, fall back to V9FS_ACCESS_ANY */
282 if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && 302 if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
283 ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { 303 ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 4c963c9fc41f..cb6396855e2d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -33,13 +33,17 @@
33 * 33 *
34 * Session flags reflect options selected by users at mount time 34 * Session flags reflect options selected by users at mount time
35 */ 35 */
36#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
37 V9FS_ACCESS_USER | \
38 V9FS_ACCESS_CLIENT)
39#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
40
36enum p9_session_flags { 41enum p9_session_flags {
37 V9FS_PROTO_2000U = 0x01, 42 V9FS_PROTO_2000U = 0x01,
38 V9FS_PROTO_2000L = 0x02, 43 V9FS_PROTO_2000L = 0x02,
39 V9FS_ACCESS_SINGLE = 0x04, 44 V9FS_ACCESS_SINGLE = 0x04,
40 V9FS_ACCESS_USER = 0x08, 45 V9FS_ACCESS_USER = 0x08,
41 V9FS_ACCESS_ANY = 0x0C, 46 V9FS_ACCESS_CLIENT = 0x10
42 V9FS_ACCESS_MASK = 0x0C,
43}; 47};
44 48
45/* possible values of ->cache */ 49/* possible values of ->cache */
@@ -113,8 +117,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses);
113void v9fs_session_cancel(struct v9fs_session_info *v9ses); 117void v9fs_session_cancel(struct v9fs_session_info *v9ses);
114void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); 118void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
115 119
116#define V9FS_MAGIC 0x01021997
117
118/* other default globals */ 120/* other default globals */
119#define V9FS_PORT 564 121#define V9FS_PORT 564
120#define V9FS_DEFUSER "nobody" 122#define V9FS_DEFUSER "nobody"
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 88418c419ea7..bab0eac873f4 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -64,3 +64,7 @@ int v9fs_uflags2omode(int uflags, int extended);
64 64
65ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 65ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
66void v9fs_blank_wstat(struct p9_wstat *wstat); 66void v9fs_blank_wstat(struct p9_wstat *wstat);
67int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
68int v9fs_file_fsync_dotl(struct file *filp, int datasync);
69
70#define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 90e38449f4b3..b7f2a8e3863e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -154,10 +154,40 @@ static int v9fs_launder_page(struct page *page)
154 return 0; 154 return 0;
155} 155}
156 156
157/**
158 * v9fs_direct_IO - 9P address space operation for direct I/O
159 * @rw: direction (read or write)
160 * @iocb: target I/O control block
161 * @iov: array of vectors that define I/O buffer
162 * @pos: offset in file to begin the operation
163 * @nr_segs: size of iovec array
164 *
165 * The presence of v9fs_direct_IO() in the address space ops vector
166 * allowes open() O_DIRECT flags which would have failed otherwise.
167 *
168 * In the non-cached mode, we shunt off direct read and write requests before
169 * the VFS gets them, so this method should never be called.
170 *
171 * Direct IO is not 'yet' supported in the cached mode. Hence when
172 * this routine is called through generic_file_aio_read(), the read/write fails
173 * with an error.
174 *
175 */
176ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
177 loff_t pos, unsigned long nr_segs)
178{
179 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
180 "off/no(%lld/%lu) EINVAL\n",
181 iocb->ki_filp->f_path.dentry->d_name.name,
182 (long long) pos, nr_segs);
183
184 return -EINVAL;
185}
157const struct address_space_operations v9fs_addr_operations = { 186const struct address_space_operations v9fs_addr_operations = {
158 .readpage = v9fs_vfs_readpage, 187 .readpage = v9fs_vfs_readpage,
159 .readpages = v9fs_vfs_readpages, 188 .readpages = v9fs_vfs_readpages,
160 .releasepage = v9fs_release_page, 189 .releasepage = v9fs_release_page,
161 .invalidatepage = v9fs_invalidate_page, 190 .invalidatepage = v9fs_invalidate_page,
162 .launder_page = v9fs_launder_page, 191 .launder_page = v9fs_launder_page,
192 .direct_IO = v9fs_direct_IO,
163}; 193};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 899f168fd19c..b84ebe8cefed 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -242,7 +242,8 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
242 while (rdir->head < rdir->tail) { 242 while (rdir->head < rdir->tail) {
243 243
244 err = p9dirent_read(rdir->buf + rdir->head, 244 err = p9dirent_read(rdir->buf + rdir->head,
245 buflen - rdir->head, &curdirent, 245 rdir->tail - rdir->head,
246 &curdirent,
246 fid->clnt->proto_version); 247 fid->clnt->proto_version);
247 if (err < 0) { 248 if (err < 0) {
248 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); 249 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
@@ -314,4 +315,5 @@ const struct file_operations v9fs_dir_operations_dotl = {
314 .readdir = v9fs_dir_readdir_dotl, 315 .readdir = v9fs_dir_readdir_dotl,
315 .open = v9fs_file_open, 316 .open = v9fs_file_open,
316 .release = v9fs_dir_release, 317 .release = v9fs_dir_release,
318 .fsync = v9fs_file_fsync_dotl,
317}; 319};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index e97c92bd6f16..240c30674396 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -33,6 +33,7 @@
33#include <linux/inet.h> 33#include <linux/inet.h>
34#include <linux/list.h> 34#include <linux/list.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/utsname.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include <linux/idr.h> 38#include <linux/idr.h>
38#include <net/9p/9p.h> 39#include <net/9p/9p.h>
@@ -44,6 +45,7 @@
44#include "cache.h" 45#include "cache.h"
45 46
46static const struct file_operations v9fs_cached_file_operations; 47static const struct file_operations v9fs_cached_file_operations;
48static const struct file_operations v9fs_cached_file_operations_dotl;
47 49
48/** 50/**
49 * v9fs_file_open - open a file (or directory) 51 * v9fs_file_open - open a file (or directory)
@@ -92,6 +94,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
92 /* enable cached file options */ 94 /* enable cached file options */
93 if(file->f_op == &v9fs_file_operations) 95 if(file->f_op == &v9fs_file_operations)
94 file->f_op = &v9fs_cached_file_operations; 96 file->f_op = &v9fs_cached_file_operations;
97 else if (file->f_op == &v9fs_file_operations_dotl)
98 file->f_op = &v9fs_cached_file_operations_dotl;
95 99
96#ifdef CONFIG_9P_FSCACHE 100#ifdef CONFIG_9P_FSCACHE
97 v9fs_cache_inode_set_cookie(inode, file); 101 v9fs_cache_inode_set_cookie(inode, file);
@@ -130,6 +134,206 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
130 return res; 134 return res;
131} 135}
132 136
137static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
138{
139 struct p9_flock flock;
140 struct p9_fid *fid;
141 uint8_t status;
142 int res = 0;
143 unsigned char fl_type;
144
145 fid = filp->private_data;
146 BUG_ON(fid == NULL);
147
148 if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
149 BUG();
150
151 res = posix_lock_file_wait(filp, fl);
152 if (res < 0)
153 goto out;
154
155 /* convert posix lock to p9 tlock args */
156 memset(&flock, 0, sizeof(flock));
157 flock.type = fl->fl_type;
158 flock.start = fl->fl_start;
159 if (fl->fl_end == OFFSET_MAX)
160 flock.length = 0;
161 else
162 flock.length = fl->fl_end - fl->fl_start + 1;
163 flock.proc_id = fl->fl_pid;
164 flock.client_id = utsname()->nodename;
165 if (IS_SETLKW(cmd))
166 flock.flags = P9_LOCK_FLAGS_BLOCK;
167
168 /*
169 * if its a blocked request and we get P9_LOCK_BLOCKED as the status
170 * for lock request, keep on trying
171 */
172 for (;;) {
173 res = p9_client_lock_dotl(fid, &flock, &status);
174 if (res < 0)
175 break;
176
177 if (status != P9_LOCK_BLOCKED)
178 break;
179 if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
180 break;
181 schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
182 }
183
184 /* map 9p status to VFS status */
185 switch (status) {
186 case P9_LOCK_SUCCESS:
187 res = 0;
188 break;
189 case P9_LOCK_BLOCKED:
190 res = -EAGAIN;
191 break;
192 case P9_LOCK_ERROR:
193 case P9_LOCK_GRACE:
194 res = -ENOLCK;
195 break;
196 default:
197 BUG();
198 }
199
200 /*
201 * incase server returned error for lock request, revert
202 * it locally
203 */
204 if (res < 0 && fl->fl_type != F_UNLCK) {
205 fl_type = fl->fl_type;
206 fl->fl_type = F_UNLCK;
207 res = posix_lock_file_wait(filp, fl);
208 fl->fl_type = fl_type;
209 }
210out:
211 return res;
212}
213
214static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
215{
216 struct p9_getlock glock;
217 struct p9_fid *fid;
218 int res = 0;
219
220 fid = filp->private_data;
221 BUG_ON(fid == NULL);
222
223 posix_test_lock(filp, fl);
224 /*
225 * if we have a conflicting lock locally, no need to validate
226 * with server
227 */
228 if (fl->fl_type != F_UNLCK)
229 return res;
230
231 /* convert posix lock to p9 tgetlock args */
232 memset(&glock, 0, sizeof(glock));
233 glock.type = fl->fl_type;
234 glock.start = fl->fl_start;
235 if (fl->fl_end == OFFSET_MAX)
236 glock.length = 0;
237 else
238 glock.length = fl->fl_end - fl->fl_start + 1;
239 glock.proc_id = fl->fl_pid;
240 glock.client_id = utsname()->nodename;
241
242 res = p9_client_getlock_dotl(fid, &glock);
243 if (res < 0)
244 return res;
245 if (glock.type != F_UNLCK) {
246 fl->fl_type = glock.type;
247 fl->fl_start = glock.start;
248 if (glock.length == 0)
249 fl->fl_end = OFFSET_MAX;
250 else
251 fl->fl_end = glock.start + glock.length - 1;
252 fl->fl_pid = glock.proc_id;
253 } else
254 fl->fl_type = F_UNLCK;
255
256 return res;
257}
258
259/**
260 * v9fs_file_lock_dotl - lock a file (or directory)
261 * @filp: file to be locked
262 * @cmd: lock command
263 * @fl: file lock structure
264 *
265 */
266
267static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
268{
269 struct inode *inode = filp->f_path.dentry->d_inode;
270 int ret = -ENOLCK;
271
272 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
273 cmd, fl, filp->f_path.dentry->d_name.name);
274
275 /* No mandatory locks */
276 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
277 goto out_err;
278
279 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
280 filemap_write_and_wait(inode->i_mapping);
281 invalidate_mapping_pages(&inode->i_data, 0, -1);
282 }
283
284 if (IS_SETLK(cmd) || IS_SETLKW(cmd))
285 ret = v9fs_file_do_lock(filp, cmd, fl);
286 else if (IS_GETLK(cmd))
287 ret = v9fs_file_getlock(filp, fl);
288 else
289 ret = -EINVAL;
290out_err:
291 return ret;
292}
293
294/**
295 * v9fs_file_flock_dotl - lock a file
296 * @filp: file to be locked
297 * @cmd: lock command
298 * @fl: file lock structure
299 *
300 */
301
302static int v9fs_file_flock_dotl(struct file *filp, int cmd,
303 struct file_lock *fl)
304{
305 struct inode *inode = filp->f_path.dentry->d_inode;
306 int ret = -ENOLCK;
307
308 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
309 cmd, fl, filp->f_path.dentry->d_name.name);
310
311 /* No mandatory locks */
312 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
313 goto out_err;
314
315 if (!(fl->fl_flags & FL_FLOCK))
316 goto out_err;
317
318 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
319 filemap_write_and_wait(inode->i_mapping);
320 invalidate_mapping_pages(&inode->i_data, 0, -1);
321 }
322 /* Convert flock to posix lock */
323 fl->fl_owner = (fl_owner_t)filp;
324 fl->fl_start = 0;
325 fl->fl_end = OFFSET_MAX;
326 fl->fl_flags |= FL_POSIX;
327 fl->fl_flags ^= FL_FLOCK;
328
329 if (IS_SETLK(cmd) | IS_SETLKW(cmd))
330 ret = v9fs_file_do_lock(filp, cmd, fl);
331 else
332 ret = -EINVAL;
333out_err:
334 return ret;
335}
336
133/** 337/**
134 * v9fs_file_readn - read from a file 338 * v9fs_file_readn - read from a file
135 * @filp: file pointer to read 339 * @filp: file pointer to read
@@ -219,7 +423,9 @@ static ssize_t
219v9fs_file_write(struct file *filp, const char __user * data, 423v9fs_file_write(struct file *filp, const char __user * data,
220 size_t count, loff_t * offset) 424 size_t count, loff_t * offset)
221{ 425{
222 int n, rsize, total = 0; 426 ssize_t retval;
427 size_t total = 0;
428 int n;
223 struct p9_fid *fid; 429 struct p9_fid *fid;
224 struct p9_client *clnt; 430 struct p9_client *clnt;
225 struct inode *inode = filp->f_path.dentry->d_inode; 431 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -232,14 +438,19 @@ v9fs_file_write(struct file *filp, const char __user * data,
232 fid = filp->private_data; 438 fid = filp->private_data;
233 clnt = fid->clnt; 439 clnt = fid->clnt;
234 440
235 rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ; 441 retval = generic_write_checks(filp, &origin, &count, 0);
442 if (retval)
443 goto out;
236 444
237 do { 445 retval = -EINVAL;
238 if (count < rsize) 446 if ((ssize_t) count < 0)
239 rsize = count; 447 goto out;
448 retval = 0;
449 if (!count)
450 goto out;
240 451
241 n = p9_client_write(fid, NULL, data+total, origin+total, 452 do {
242 rsize); 453 n = p9_client_write(fid, NULL, data+total, origin+total, count);
243 if (n <= 0) 454 if (n <= 0)
244 break; 455 break;
245 count -= n; 456 count -= n;
@@ -258,9 +469,11 @@ v9fs_file_write(struct file *filp, const char __user * data,
258 } 469 }
259 470
260 if (n < 0) 471 if (n < 0)
261 return n; 472 retval = n;
262 473 else
263 return total; 474 retval = total;
475out:
476 return retval;
264} 477}
265 478
266static int v9fs_file_fsync(struct file *filp, int datasync) 479static int v9fs_file_fsync(struct file *filp, int datasync)
@@ -278,6 +491,20 @@ static int v9fs_file_fsync(struct file *filp, int datasync)
278 return retval; 491 return retval;
279} 492}
280 493
494int v9fs_file_fsync_dotl(struct file *filp, int datasync)
495{
496 struct p9_fid *fid;
497 int retval;
498
499 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
500 filp, datasync);
501
502 fid = filp->private_data;
503
504 retval = p9_client_fsync(fid, datasync);
505 return retval;
506}
507
281static const struct file_operations v9fs_cached_file_operations = { 508static const struct file_operations v9fs_cached_file_operations = {
282 .llseek = generic_file_llseek, 509 .llseek = generic_file_llseek,
283 .read = do_sync_read, 510 .read = do_sync_read,
@@ -290,6 +517,19 @@ static const struct file_operations v9fs_cached_file_operations = {
290 .fsync = v9fs_file_fsync, 517 .fsync = v9fs_file_fsync,
291}; 518};
292 519
520static const struct file_operations v9fs_cached_file_operations_dotl = {
521 .llseek = generic_file_llseek,
522 .read = do_sync_read,
523 .aio_read = generic_file_aio_read,
524 .write = v9fs_file_write,
525 .open = v9fs_file_open,
526 .release = v9fs_dir_release,
527 .lock = v9fs_file_lock_dotl,
528 .flock = v9fs_file_flock_dotl,
529 .mmap = generic_file_readonly_mmap,
530 .fsync = v9fs_file_fsync_dotl,
531};
532
293const struct file_operations v9fs_file_operations = { 533const struct file_operations v9fs_file_operations = {
294 .llseek = generic_file_llseek, 534 .llseek = generic_file_llseek,
295 .read = v9fs_file_read, 535 .read = v9fs_file_read,
@@ -307,7 +547,8 @@ const struct file_operations v9fs_file_operations_dotl = {
307 .write = v9fs_file_write, 547 .write = v9fs_file_write,
308 .open = v9fs_file_open, 548 .open = v9fs_file_open,
309 .release = v9fs_dir_release, 549 .release = v9fs_dir_release,
310 .lock = v9fs_file_lock, 550 .lock = v9fs_file_lock_dotl,
551 .flock = v9fs_file_flock_dotl,
311 .mmap = generic_file_readonly_mmap, 552 .mmap = generic_file_readonly_mmap,
312 .fsync = v9fs_file_fsync, 553 .fsync = v9fs_file_fsync_dotl,
313}; 554};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9e670d527646..34bf71b56542 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -36,6 +36,7 @@
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/xattr.h> 38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
39#include <net/9p/9p.h> 40#include <net/9p/9p.h>
40#include <net/9p/client.h> 41#include <net/9p/client.h>
41 42
@@ -44,6 +45,7 @@
44#include "fid.h" 45#include "fid.h"
45#include "cache.h" 46#include "cache.h"
46#include "xattr.h" 47#include "xattr.h"
48#include "acl.h"
47 49
48static const struct inode_operations v9fs_dir_inode_operations; 50static const struct inode_operations v9fs_dir_inode_operations;
49static const struct inode_operations v9fs_dir_inode_operations_dotu; 51static const struct inode_operations v9fs_dir_inode_operations_dotu;
@@ -53,6 +55,10 @@ static const struct inode_operations v9fs_file_inode_operations_dotl;
53static const struct inode_operations v9fs_symlink_inode_operations; 55static const struct inode_operations v9fs_symlink_inode_operations;
54static const struct inode_operations v9fs_symlink_inode_operations_dotl; 56static const struct inode_operations v9fs_symlink_inode_operations_dotl;
55 57
58static int
59v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
60 dev_t rdev);
61
56/** 62/**
57 * unixmode2p9mode - convert unix mode bits to plan 9 63 * unixmode2p9mode - convert unix mode bits to plan 9
58 * @v9ses: v9fs session information 64 * @v9ses: v9fs session information
@@ -500,6 +506,11 @@ v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
500 v9fs_vcookie_set_qid(ret, &st->qid); 506 v9fs_vcookie_set_qid(ret, &st->qid);
501 v9fs_cache_inode_get_cookie(ret); 507 v9fs_cache_inode_get_cookie(ret);
502#endif 508#endif
509 err = v9fs_get_acl(ret, fid);
510 if (err) {
511 iput(ret);
512 goto error;
513 }
503 kfree(st); 514 kfree(st);
504 return ret; 515 return ret;
505error: 516error:
@@ -553,13 +564,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
553 return retval; 564 return retval;
554} 565}
555 566
556static int
557v9fs_open_created(struct inode *inode, struct file *file)
558{
559 return 0;
560}
561
562
563/** 567/**
564 * v9fs_create - Create a file 568 * v9fs_create - Create a file
565 * @v9ses: session information 569 * @v9ses: session information
@@ -655,29 +659,37 @@ error:
655 */ 659 */
656 660
657static int 661static int
658v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode, 662v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
659 struct nameidata *nd) 663 struct nameidata *nd)
660{ 664{
661 int err = 0; 665 int err = 0;
662 char *name = NULL; 666 char *name = NULL;
663 gid_t gid; 667 gid_t gid;
664 int flags; 668 int flags;
669 mode_t mode;
665 struct v9fs_session_info *v9ses; 670 struct v9fs_session_info *v9ses;
666 struct p9_fid *fid = NULL; 671 struct p9_fid *fid = NULL;
667 struct p9_fid *dfid, *ofid; 672 struct p9_fid *dfid, *ofid;
668 struct file *filp; 673 struct file *filp;
669 struct p9_qid qid; 674 struct p9_qid qid;
670 struct inode *inode; 675 struct inode *inode;
676 struct posix_acl *pacl = NULL, *dacl = NULL;
671 677
672 v9ses = v9fs_inode2v9ses(dir); 678 v9ses = v9fs_inode2v9ses(dir);
673 if (nd && nd->flags & LOOKUP_OPEN) 679 if (nd && nd->flags & LOOKUP_OPEN)
674 flags = nd->intent.open.flags - 1; 680 flags = nd->intent.open.flags - 1;
675 else 681 else {
676 flags = O_RDWR; 682 /*
683 * create call without LOOKUP_OPEN is due
684 * to mknod of regular files. So use mknod
685 * operation.
686 */
687 return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
688 }
677 689
678 name = (char *) dentry->d_name.name; 690 name = (char *) dentry->d_name.name;
679 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " 691 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
680 "mode:0x%x\n", name, flags, mode); 692 "mode:0x%x\n", name, flags, omode);
681 693
682 dfid = v9fs_fid_lookup(dentry->d_parent); 694 dfid = v9fs_fid_lookup(dentry->d_parent);
683 if (IS_ERR(dfid)) { 695 if (IS_ERR(dfid)) {
@@ -695,6 +707,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
695 } 707 }
696 708
697 gid = v9fs_get_fsgid_for_create(dir); 709 gid = v9fs_get_fsgid_for_create(dir);
710
711 mode = omode;
712 /* Update mode based on ACL value */
713 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
714 if (err) {
715 P9_DPRINTK(P9_DEBUG_VFS,
716 "Failed to get acl values in creat %d\n", err);
717 goto error;
718 }
698 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); 719 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
699 if (err < 0) { 720 if (err < 0) {
700 P9_DPRINTK(P9_DEBUG_VFS, 721 P9_DPRINTK(P9_DEBUG_VFS,
@@ -702,46 +723,52 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
702 err); 723 err);
703 goto error; 724 goto error;
704 } 725 }
726 /* instantiate inode and assign the unopened fid to the dentry */
727 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
728 (nd && nd->flags & LOOKUP_OPEN)) {
729 fid = p9_client_walk(dfid, 1, &name, 1);
730 if (IS_ERR(fid)) {
731 err = PTR_ERR(fid);
732 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
733 err);
734 fid = NULL;
735 goto error;
736 }
705 737
706 /* No need to populate the inode if we are not opening the file AND 738 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
707 * not in cached mode. 739 if (IS_ERR(inode)) {
708 */ 740 err = PTR_ERR(inode);
709 if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) { 741 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
710 /* Not in cached mode. No need to populate inode with stat */ 742 err);
711 dentry->d_op = &v9fs_dentry_operations; 743 goto error;
712 p9_client_clunk(ofid); 744 }
713 d_instantiate(dentry, NULL);
714 return 0;
715 }
716
717 /* Now walk from the parent so we can get an unopened fid. */
718 fid = p9_client_walk(dfid, 1, &name, 1);
719 if (IS_ERR(fid)) {
720 err = PTR_ERR(fid);
721 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
722 fid = NULL;
723 goto error;
724 }
725
726 /* instantiate inode and assign the unopened fid to dentry */
727 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
728 if (IS_ERR(inode)) {
729 err = PTR_ERR(inode);
730 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
731 goto error;
732 }
733 if (v9ses->cache)
734 dentry->d_op = &v9fs_cached_dentry_operations; 745 dentry->d_op = &v9fs_cached_dentry_operations;
735 else 746 d_instantiate(dentry, inode);
747 err = v9fs_fid_add(dentry, fid);
748 if (err < 0)
749 goto error;
750 /* The fid would get clunked via a dput */
751 fid = NULL;
752 } else {
753 /*
754 * Not in cached mode. No need to populate
755 * inode with stat. We need to get an inode
756 * so that we can set the acl with dentry
757 */
758 inode = v9fs_get_inode(dir->i_sb, mode);
759 if (IS_ERR(inode)) {
760 err = PTR_ERR(inode);
761 goto error;
762 }
736 dentry->d_op = &v9fs_dentry_operations; 763 dentry->d_op = &v9fs_dentry_operations;
737 d_instantiate(dentry, inode); 764 d_instantiate(dentry, inode);
738 err = v9fs_fid_add(dentry, fid); 765 }
739 if (err < 0) 766 /* Now set the ACL based on the default value */
740 goto error; 767 v9fs_set_create_acl(dentry, dacl, pacl);
741 768
742 /* if we are opening a file, assign the open fid to the file */ 769 /* if we are opening a file, assign the open fid to the file */
743 if (nd && nd->flags & LOOKUP_OPEN) { 770 if (nd && nd->flags & LOOKUP_OPEN) {
744 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); 771 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
745 if (IS_ERR(filp)) { 772 if (IS_ERR(filp)) {
746 p9_client_clunk(ofid); 773 p9_client_clunk(ofid);
747 return PTR_ERR(filp); 774 return PTR_ERR(filp);
@@ -800,7 +827,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
800 827
801 /* if we are opening a file, assign the open fid to the file */ 828 /* if we are opening a file, assign the open fid to the file */
802 if (nd && nd->flags & LOOKUP_OPEN) { 829 if (nd && nd->flags & LOOKUP_OPEN) {
803 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); 830 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
804 if (IS_ERR(filp)) { 831 if (IS_ERR(filp)) {
805 err = PTR_ERR(filp); 832 err = PTR_ERR(filp);
806 goto error; 833 goto error;
@@ -859,23 +886,28 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
859 * 886 *
860 */ 887 */
861 888
862static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry, 889static int v9fs_vfs_mkdir_dotl(struct inode *dir,
863 int mode) 890 struct dentry *dentry, int omode)
864{ 891{
865 int err; 892 int err;
866 struct v9fs_session_info *v9ses; 893 struct v9fs_session_info *v9ses;
867 struct p9_fid *fid = NULL, *dfid = NULL; 894 struct p9_fid *fid = NULL, *dfid = NULL;
868 gid_t gid; 895 gid_t gid;
869 char *name; 896 char *name;
897 mode_t mode;
870 struct inode *inode; 898 struct inode *inode;
871 struct p9_qid qid; 899 struct p9_qid qid;
872 struct dentry *dir_dentry; 900 struct dentry *dir_dentry;
901 struct posix_acl *dacl = NULL, *pacl = NULL;
873 902
874 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); 903 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
875 err = 0; 904 err = 0;
876 v9ses = v9fs_inode2v9ses(dir); 905 v9ses = v9fs_inode2v9ses(dir);
877 906
878 mode |= S_IFDIR; 907 omode |= S_IFDIR;
908 if (dir->i_mode & S_ISGID)
909 omode |= S_ISGID;
910
879 dir_dentry = v9fs_dentry_from_dir_inode(dir); 911 dir_dentry = v9fs_dentry_from_dir_inode(dir);
880 dfid = v9fs_fid_lookup(dir_dentry); 912 dfid = v9fs_fid_lookup(dir_dentry);
881 if (IS_ERR(dfid)) { 913 if (IS_ERR(dfid)) {
@@ -886,11 +918,14 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
886 } 918 }
887 919
888 gid = v9fs_get_fsgid_for_create(dir); 920 gid = v9fs_get_fsgid_for_create(dir);
889 if (gid < 0) { 921 mode = omode;
890 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); 922 /* Update mode based on ACL value */
923 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
924 if (err) {
925 P9_DPRINTK(P9_DEBUG_VFS,
926 "Failed to get acl values in mkdir %d\n", err);
891 goto error; 927 goto error;
892 } 928 }
893
894 name = (char *) dentry->d_name.name; 929 name = (char *) dentry->d_name.name;
895 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); 930 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
896 if (err < 0) 931 if (err < 0)
@@ -920,7 +955,23 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
920 if (err < 0) 955 if (err < 0)
921 goto error; 956 goto error;
922 fid = NULL; 957 fid = NULL;
958 } else {
959 /*
960 * Not in cached mode. No need to populate
961 * inode with stat. We need to get an inode
962 * so that we can set the acl with dentry
963 */
964 inode = v9fs_get_inode(dir->i_sb, mode);
965 if (IS_ERR(inode)) {
966 err = PTR_ERR(inode);
967 goto error;
968 }
969 dentry->d_op = &v9fs_dentry_operations;
970 d_instantiate(dentry, inode);
923 } 971 }
972 /* Now set the ACL based on the default value */
973 v9fs_set_create_acl(dentry, dacl, pacl);
974
924error: 975error:
925 if (fid) 976 if (fid)
926 p9_client_clunk(fid); 977 p9_client_clunk(fid);
@@ -979,7 +1030,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
979 1030
980 result = v9fs_fid_add(dentry, fid); 1031 result = v9fs_fid_add(dentry, fid);
981 if (result < 0) 1032 if (result < 0)
982 goto error; 1033 goto error_iput;
983 1034
984inst_out: 1035inst_out:
985 if (v9ses->cache) 1036 if (v9ses->cache)
@@ -990,6 +1041,8 @@ inst_out:
990 d_add(dentry, inode); 1041 d_add(dentry, inode);
991 return NULL; 1042 return NULL;
992 1043
1044error_iput:
1045 iput(inode);
993error: 1046error:
994 p9_client_clunk(fid); 1047 p9_client_clunk(fid);
995 1048
@@ -1237,7 +1290,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
1237 * 1290 *
1238 */ 1291 */
1239 1292
1240static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) 1293int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
1241{ 1294{
1242 int retval; 1295 int retval;
1243 struct v9fs_session_info *v9ses; 1296 struct v9fs_session_info *v9ses;
@@ -1279,6 +1332,12 @@ static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
1279 1332
1280 setattr_copy(dentry->d_inode, iattr); 1333 setattr_copy(dentry->d_inode, iattr);
1281 mark_inode_dirty(dentry->d_inode); 1334 mark_inode_dirty(dentry->d_inode);
1335 if (iattr->ia_valid & ATTR_MODE) {
1336 /* We also want to update ACL when we update mode bits */
1337 retval = v9fs_acl_chmod(dentry);
1338 if (retval < 0)
1339 return retval;
1340 }
1282 return 0; 1341 return 0;
1283} 1342}
1284 1343
@@ -1473,7 +1532,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1473 if (IS_ERR(fid)) 1532 if (IS_ERR(fid))
1474 return PTR_ERR(fid); 1533 return PTR_ERR(fid);
1475 1534
1476 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) 1535 if (!v9fs_proto_dotu(v9ses))
1477 return -EBADF; 1536 return -EBADF;
1478 1537
1479 st = p9_client_stat(fid); 1538 st = p9_client_stat(fid);
@@ -1616,11 +1675,6 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
1616 1675
1617 gid = v9fs_get_fsgid_for_create(dir); 1676 gid = v9fs_get_fsgid_for_create(dir);
1618 1677
1619 if (gid < 0) {
1620 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid);
1621 goto error;
1622 }
1623
1624 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ 1678 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
1625 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); 1679 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
1626 1680
@@ -1789,9 +1843,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
1789 kfree(st); 1843 kfree(st);
1790 } else { 1844 } else {
1791 /* Caching disabled. No need to get upto date stat info. 1845 /* Caching disabled. No need to get upto date stat info.
1792 * This dentry will be released immediately. So, just i_count++ 1846 * This dentry will be released immediately. So, just hold the
1847 * inode
1793 */ 1848 */
1794 atomic_inc(&old_dentry->d_inode->i_count); 1849 ihold(old_dentry->d_inode);
1795 } 1850 }
1796 1851
1797 dentry->d_op = old_dentry->d_op; 1852 dentry->d_op = old_dentry->d_op;
@@ -1854,21 +1909,23 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1854 * 1909 *
1855 */ 1910 */
1856static int 1911static int
1857v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode, 1912v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
1858 dev_t rdev) 1913 dev_t rdev)
1859{ 1914{
1860 int err; 1915 int err;
1861 char *name; 1916 char *name;
1917 mode_t mode;
1862 struct v9fs_session_info *v9ses; 1918 struct v9fs_session_info *v9ses;
1863 struct p9_fid *fid = NULL, *dfid = NULL; 1919 struct p9_fid *fid = NULL, *dfid = NULL;
1864 struct inode *inode; 1920 struct inode *inode;
1865 gid_t gid; 1921 gid_t gid;
1866 struct p9_qid qid; 1922 struct p9_qid qid;
1867 struct dentry *dir_dentry; 1923 struct dentry *dir_dentry;
1924 struct posix_acl *dacl = NULL, *pacl = NULL;
1868 1925
1869 P9_DPRINTK(P9_DEBUG_VFS, 1926 P9_DPRINTK(P9_DEBUG_VFS,
1870 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, 1927 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
1871 dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); 1928 dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
1872 1929
1873 if (!new_valid_dev(rdev)) 1930 if (!new_valid_dev(rdev))
1874 return -EINVAL; 1931 return -EINVAL;
@@ -1884,11 +1941,14 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
1884 } 1941 }
1885 1942
1886 gid = v9fs_get_fsgid_for_create(dir); 1943 gid = v9fs_get_fsgid_for_create(dir);
1887 if (gid < 0) { 1944 mode = omode;
1888 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); 1945 /* Update mode based on ACL value */
1946 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
1947 if (err) {
1948 P9_DPRINTK(P9_DEBUG_VFS,
1949 "Failed to get acl values in mknod %d\n", err);
1889 goto error; 1950 goto error;
1890 } 1951 }
1891
1892 name = (char *) dentry->d_name.name; 1952 name = (char *) dentry->d_name.name;
1893 1953
1894 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); 1954 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
@@ -1932,13 +1992,68 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
1932 dentry->d_op = &v9fs_dentry_operations; 1992 dentry->d_op = &v9fs_dentry_operations;
1933 d_instantiate(dentry, inode); 1993 d_instantiate(dentry, inode);
1934 } 1994 }
1935 1995 /* Now set the ACL based on the default value */
1996 v9fs_set_create_acl(dentry, dacl, pacl);
1936error: 1997error:
1937 if (fid) 1998 if (fid)
1938 p9_client_clunk(fid); 1999 p9_client_clunk(fid);
1939 return err; 2000 return err;
1940} 2001}
1941 2002
2003static int
2004v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
2005{
2006 int retval;
2007 struct p9_fid *fid;
2008 char *target = NULL;
2009
2010 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
2011 retval = -EPERM;
2012 fid = v9fs_fid_lookup(dentry);
2013 if (IS_ERR(fid))
2014 return PTR_ERR(fid);
2015
2016 retval = p9_client_readlink(fid, &target);
2017 if (retval < 0)
2018 return retval;
2019
2020 strncpy(buffer, target, buflen);
2021 P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
2022
2023 retval = strnlen(buffer, buflen);
2024 return retval;
2025}
2026
2027/**
2028 * v9fs_vfs_follow_link_dotl - follow a symlink path
2029 * @dentry: dentry for symlink
2030 * @nd: nameidata
2031 *
2032 */
2033
2034static void *
2035v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
2036{
2037 int len = 0;
2038 char *link = __getname();
2039
2040 P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
2041
2042 if (!link)
2043 link = ERR_PTR(-ENOMEM);
2044 else {
2045 len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
2046 if (len < 0) {
2047 __putname(link);
2048 link = ERR_PTR(len);
2049 } else
2050 link[min(len, PATH_MAX-1)] = 0;
2051 }
2052 nd_set_link(nd, link);
2053
2054 return NULL;
2055}
2056
1942static const struct inode_operations v9fs_dir_inode_operations_dotu = { 2057static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1943 .create = v9fs_vfs_create, 2058 .create = v9fs_vfs_create,
1944 .lookup = v9fs_vfs_lookup, 2059 .lookup = v9fs_vfs_lookup,
@@ -1969,7 +2084,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotl = {
1969 .getxattr = generic_getxattr, 2084 .getxattr = generic_getxattr,
1970 .removexattr = generic_removexattr, 2085 .removexattr = generic_removexattr,
1971 .listxattr = v9fs_listxattr, 2086 .listxattr = v9fs_listxattr,
1972 2087 .check_acl = v9fs_check_acl,
1973}; 2088};
1974 2089
1975static const struct inode_operations v9fs_dir_inode_operations = { 2090static const struct inode_operations v9fs_dir_inode_operations = {
@@ -1996,6 +2111,7 @@ static const struct inode_operations v9fs_file_inode_operations_dotl = {
1996 .getxattr = generic_getxattr, 2111 .getxattr = generic_getxattr,
1997 .removexattr = generic_removexattr, 2112 .removexattr = generic_removexattr,
1998 .listxattr = v9fs_listxattr, 2113 .listxattr = v9fs_listxattr,
2114 .check_acl = v9fs_check_acl,
1999}; 2115};
2000 2116
2001static const struct inode_operations v9fs_symlink_inode_operations = { 2117static const struct inode_operations v9fs_symlink_inode_operations = {
@@ -2007,8 +2123,8 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
2007}; 2123};
2008 2124
2009static const struct inode_operations v9fs_symlink_inode_operations_dotl = { 2125static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
2010 .readlink = generic_readlink, 2126 .readlink = v9fs_vfs_readlink_dotl,
2011 .follow_link = v9fs_vfs_follow_link, 2127 .follow_link = v9fs_vfs_follow_link_dotl,
2012 .put_link = v9fs_vfs_put_link, 2128 .put_link = v9fs_vfs_put_link,
2013 .getattr = v9fs_vfs_getattr_dotl, 2129 .getattr = v9fs_vfs_getattr_dotl,
2014 .setattr = v9fs_vfs_setattr_dotl, 2130 .setattr = v9fs_vfs_setattr_dotl,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 1d12ba0ed3db..c55c614500ad 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -39,6 +39,7 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/statfs.h> 41#include <linux/statfs.h>
42#include <linux/magic.h>
42#include <net/9p/9p.h> 43#include <net/9p/9p.h>
43#include <net/9p/client.h> 44#include <net/9p/client.h>
44 45
@@ -46,6 +47,7 @@
46#include "v9fs_vfs.h" 47#include "v9fs_vfs.h"
47#include "fid.h" 48#include "fid.h"
48#include "xattr.h" 49#include "xattr.h"
50#include "acl.h"
49 51
50static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; 52static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
51 53
@@ -66,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
66 * v9fs_fill_super - populate superblock with info 68 * v9fs_fill_super - populate superblock with info
67 * @sb: superblock 69 * @sb: superblock
68 * @v9ses: session information 70 * @v9ses: session information
69 * @flags: flags propagated from v9fs_get_sb() 71 * @flags: flags propagated from v9fs_mount()
70 * 72 *
71 */ 73 */
72 74
@@ -88,22 +90,25 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
88 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 90 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
89 MS_NOATIME; 91 MS_NOATIME;
90 92
93#ifdef CONFIG_9P_FS_POSIX_ACL
94 if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
95 sb->s_flags |= MS_POSIXACL;
96#endif
97
91 save_mount_options(sb, data); 98 save_mount_options(sb, data);
92} 99}
93 100
94/** 101/**
95 * v9fs_get_sb - mount a superblock 102 * v9fs_mount - mount a superblock
96 * @fs_type: file system type 103 * @fs_type: file system type
97 * @flags: mount flags 104 * @flags: mount flags
98 * @dev_name: device name that was mounted 105 * @dev_name: device name that was mounted
99 * @data: mount options 106 * @data: mount options
100 * @mnt: mountpoint record to be instantiated
101 * 107 *
102 */ 108 */
103 109
104static int v9fs_get_sb(struct file_system_type *fs_type, int flags, 110static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
105 const char *dev_name, void *data, 111 const char *dev_name, void *data)
106 struct vfsmount *mnt)
107{ 112{
108 struct super_block *sb = NULL; 113 struct super_block *sb = NULL;
109 struct inode *inode = NULL; 114 struct inode *inode = NULL;
@@ -117,7 +122,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
117 122
118 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); 123 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
119 if (!v9ses) 124 if (!v9ses)
120 return -ENOMEM; 125 return ERR_PTR(-ENOMEM);
121 126
122 fid = v9fs_session_init(v9ses, dev_name, data); 127 fid = v9fs_session_init(v9ses, dev_name, data);
123 if (IS_ERR(fid)) { 128 if (IS_ERR(fid)) {
@@ -149,7 +154,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
149 goto release_sb; 154 goto release_sb;
150 } 155 }
151 sb->s_root = root; 156 sb->s_root = root;
152
153 if (v9fs_proto_dotl(v9ses)) { 157 if (v9fs_proto_dotl(v9ses)) {
154 struct p9_stat_dotl *st = NULL; 158 struct p9_stat_dotl *st = NULL;
155 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 159 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
@@ -174,19 +178,21 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
174 p9stat_free(st); 178 p9stat_free(st);
175 kfree(st); 179 kfree(st);
176 } 180 }
177 181 retval = v9fs_get_acl(inode, fid);
182 if (retval)
183 goto release_sb;
178 v9fs_fid_add(root, fid); 184 v9fs_fid_add(root, fid);
179 185
180 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 186 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
181 simple_set_mnt(mnt, sb); 187 return dget(sb->s_root);
182 return 0;
183 188
184clunk_fid: 189clunk_fid:
185 p9_client_clunk(fid); 190 p9_client_clunk(fid);
186close_session: 191close_session:
187 v9fs_session_close(v9ses); 192 v9fs_session_close(v9ses);
188 kfree(v9ses); 193 kfree(v9ses);
189 return retval; 194 return ERR_PTR(retval);
195
190release_sb: 196release_sb:
191 /* 197 /*
192 * we will do the session_close and root dentry release 198 * we will do the session_close and root dentry release
@@ -196,7 +202,7 @@ release_sb:
196 */ 202 */
197 p9_client_clunk(fid); 203 p9_client_clunk(fid);
198 deactivate_locked_super(sb); 204 deactivate_locked_super(sb);
199 return retval; 205 return ERR_PTR(retval);
200} 206}
201 207
202/** 208/**
@@ -249,7 +255,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
249 if (v9fs_proto_dotl(v9ses)) { 255 if (v9fs_proto_dotl(v9ses)) {
250 res = p9_client_statfs(fid, &rs); 256 res = p9_client_statfs(fid, &rs);
251 if (res == 0) { 257 if (res == 0) {
252 buf->f_type = rs.type; 258 buf->f_type = V9FS_MAGIC;
253 buf->f_bsize = rs.bsize; 259 buf->f_bsize = rs.bsize;
254 buf->f_blocks = rs.blocks; 260 buf->f_blocks = rs.blocks;
255 buf->f_bfree = rs.bfree; 261 buf->f_bfree = rs.bfree;
@@ -292,7 +298,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
292 298
293struct file_system_type v9fs_fs_type = { 299struct file_system_type v9fs_fs_type = {
294 .name = "9p", 300 .name = "9p",
295 .get_sb = v9fs_get_sb, 301 .mount = v9fs_mount,
296 .kill_sb = v9fs_kill_super, 302 .kill_sb = v9fs_kill_super,
297 .owner = THIS_MODULE, 303 .owner = THIS_MODULE,
298 .fs_flags = FS_RENAME_DOES_D_MOVE, 304 .fs_flags = FS_RENAME_DOES_D_MOVE,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index f88e5c2dc873..43ec7df84336 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -21,30 +21,13 @@
21#include "fid.h" 21#include "fid.h"
22#include "xattr.h" 22#include "xattr.h"
23 23
24/* 24ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
25 * v9fs_xattr_get() 25 void *buffer, size_t buffer_size)
26 *
27 * Copy an extended attribute into the buffer
28 * provided, or compute the buffer size required.
29 * Buffer is NULL to compute the size of the buffer required.
30 *
31 * Returns a negative error number on failure, or the number of bytes
32 * used / required on success.
33 */
34ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
35 void *buffer, size_t buffer_size)
36{ 26{
37 ssize_t retval; 27 ssize_t retval;
38 int msize, read_count; 28 int msize, read_count;
39 u64 offset = 0, attr_size; 29 u64 offset = 0, attr_size;
40 struct p9_fid *fid, *attr_fid; 30 struct p9_fid *attr_fid;
41
42 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
43 __func__, name, buffer_size);
44
45 fid = v9fs_fid_lookup(dentry);
46 if (IS_ERR(fid))
47 return PTR_ERR(fid);
48 31
49 attr_fid = p9_client_xattrwalk(fid, name, &attr_size); 32 attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
50 if (IS_ERR(attr_fid)) { 33 if (IS_ERR(attr_fid)) {
@@ -88,6 +71,31 @@ error:
88 71
89} 72}
90 73
74
75/*
76 * v9fs_xattr_get()
77 *
78 * Copy an extended attribute into the buffer
79 * provided, or compute the buffer size required.
80 * Buffer is NULL to compute the size of the buffer required.
81 *
82 * Returns a negative error number on failure, or the number of bytes
83 * used / required on success.
84 */
85ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
86 void *buffer, size_t buffer_size)
87{
88 struct p9_fid *fid;
89
90 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
91 __func__, name, buffer_size);
92 fid = v9fs_fid_lookup(dentry);
93 if (IS_ERR(fid))
94 return PTR_ERR(fid);
95
96 return v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
97}
98
91/* 99/*
92 * v9fs_xattr_set() 100 * v9fs_xattr_set()
93 * 101 *
@@ -156,5 +164,9 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
156 164
157const struct xattr_handler *v9fs_xattr_handlers[] = { 165const struct xattr_handler *v9fs_xattr_handlers[] = {
158 &v9fs_xattr_user_handler, 166 &v9fs_xattr_user_handler,
167#ifdef CONFIG_9P_FS_POSIX_ACL
168 &v9fs_xattr_acl_access_handler,
169 &v9fs_xattr_acl_default_handler,
170#endif
159 NULL 171 NULL
160}; 172};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index 9ddf672ae5c4..eaa837c53bd5 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -15,10 +15,16 @@
15#define FS_9P_XATTR_H 15#define FS_9P_XATTR_H
16 16
17#include <linux/xattr.h> 17#include <linux/xattr.h>
18#include <net/9p/9p.h>
19#include <net/9p/client.h>
18 20
19extern const struct xattr_handler *v9fs_xattr_handlers[]; 21extern const struct xattr_handler *v9fs_xattr_handlers[];
20extern struct xattr_handler v9fs_xattr_user_handler; 22extern struct xattr_handler v9fs_xattr_user_handler;
23extern const struct xattr_handler v9fs_xattr_acl_access_handler;
24extern const struct xattr_handler v9fs_xattr_acl_default_handler;
21 25
26extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
27 void *, size_t);
22extern ssize_t v9fs_xattr_get(struct dentry *, const char *, 28extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
23 void *, size_t); 29 void *, size_t);
24extern int v9fs_xattr_set(struct dentry *, const char *, 30extern int v9fs_xattr_set(struct dentry *, const char *,
diff --git a/fs/Kconfig b/fs/Kconfig
index 3d185308ec88..771f457402d4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig"
47 47
48endif # BLOCK 48endif # BLOCK
49 49
50config EXPORTFS
51 tristate
52
50config FILE_LOCKING 53config FILE_LOCKING
51 bool "Enable POSIX file locking API" if EMBEDDED 54 bool "Enable POSIX file locking API" if EMBEDDED
52 default y 55 default y
@@ -59,7 +62,6 @@ source "fs/notify/Kconfig"
59 62
60source "fs/quota/Kconfig" 63source "fs/quota/Kconfig"
61 64
62source "fs/autofs/Kconfig"
63source "fs/autofs4/Kconfig" 65source "fs/autofs4/Kconfig"
64source "fs/fuse/Kconfig" 66source "fs/fuse/Kconfig"
65 67
@@ -221,9 +223,6 @@ config LOCKD_V4
221 depends on FILE_LOCKING 223 depends on FILE_LOCKING
222 default y 224 default y
223 225
224config EXPORTFS
225 tristate
226
227config NFS_ACL_SUPPORT 226config NFS_ACL_SUPPORT
228 tristate 227 tristate
229 select FS_POSIX_ACL 228 select FS_POSIX_ACL
@@ -234,7 +233,6 @@ config NFS_COMMON
234 default y 233 default y
235 234
236source "net/sunrpc/Kconfig" 235source "net/sunrpc/Kconfig"
237source "fs/smbfs/Kconfig"
238source "fs/ceph/Kconfig" 236source "fs/ceph/Kconfig"
239source "fs/cifs/Kconfig" 237source "fs/cifs/Kconfig"
240source "fs/ncpfs/Kconfig" 238source "fs/ncpfs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bb4cc5b8abc8..79e2ca7973b7 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
42 42
43config CORE_DUMP_DEFAULT_ELF_HEADERS 43config CORE_DUMP_DEFAULT_ELF_HEADERS
44 bool "Write ELF core dumps with partial segments" 44 bool "Write ELF core dumps with partial segments"
45 default n 45 default y
46 depends on BINFMT_ELF && ELF_CORE 46 depends on BINFMT_ELF && ELF_CORE
47 help 47 help
48 ELF core dump files describe each memory mapping of the crashed 48 ELF core dump files describe each memory mapping of the crashed
@@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
60 inherited. See Documentation/filesystems/proc.txt for details. 60 inherited. See Documentation/filesystems/proc.txt for details.
61 61
62 This config option changes the default setting of coredump_filter 62 This config option changes the default setting of coredump_filter
63 seen at boot time. If unsure, say N. 63 seen at boot time. If unsure, say Y.
64 64
65config BINFMT_FLAT 65config BINFMT_FLAT
66 bool "Kernel support for flat binaries" 66 bool "Kernel support for flat binaries"
diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1d..a7f7cef0c0c8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
29obj-$(CONFIG_AIO) += aio.o 29obj-$(CONFIG_AIO) += aio.o
30obj-$(CONFIG_FILE_LOCKING) += locks.o 30obj-$(CONFIG_FILE_LOCKING) += locks.o
31obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o 31obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
32 32obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o
33nfsd-$(CONFIG_NFSD) := nfsctl.o
34obj-y += $(nfsd-y) $(nfsd-m)
35
36obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o 33obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
37obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o 34obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
38obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o 35obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
@@ -91,7 +88,6 @@ obj-$(CONFIG_NFSD) += nfsd/
91obj-$(CONFIG_LOCKD) += lockd/ 88obj-$(CONFIG_LOCKD) += lockd/
92obj-$(CONFIG_NLS) += nls/ 89obj-$(CONFIG_NLS) += nls/
93obj-$(CONFIG_SYSV_FS) += sysv/ 90obj-$(CONFIG_SYSV_FS) += sysv/
94obj-$(CONFIG_SMB_FS) += smbfs/
95obj-$(CONFIG_CIFS) += cifs/ 91obj-$(CONFIG_CIFS) += cifs/
96obj-$(CONFIG_NCP_FS) += ncpfs/ 92obj-$(CONFIG_NCP_FS) += ncpfs/
97obj-$(CONFIG_HPFS_FS) += hpfs/ 93obj-$(CONFIG_HPFS_FS) += hpfs/
@@ -104,7 +100,6 @@ obj-$(CONFIG_UBIFS_FS) += ubifs/
104obj-$(CONFIG_AFFS_FS) += affs/ 100obj-$(CONFIG_AFFS_FS) += affs/
105obj-$(CONFIG_ROMFS_FS) += romfs/ 101obj-$(CONFIG_ROMFS_FS) += romfs/
106obj-$(CONFIG_QNX4FS_FS) += qnx4/ 102obj-$(CONFIG_QNX4FS_FS) += qnx4/
107obj-$(CONFIG_AUTOFS_FS) += autofs/
108obj-$(CONFIG_AUTOFS4_FS) += autofs4/ 103obj-$(CONFIG_AUTOFS4_FS) += autofs4/
109obj-$(CONFIG_ADFS_FS) += adfs/ 104obj-$(CONFIG_ADFS_FS) += adfs/
110obj-$(CONFIG_FUSE_FS) += fuse/ 105obj-$(CONFIG_FUSE_FS) += fuse/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..1dd5f34b3cf2 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
4 help 5 help
5 The Acorn Disc Filing System is the standard file system of the 6 The Acorn Disc Filing System is the standard file system of the
6 RiscOS operating system which runs on Acorn's ARM-based Risc PC 7 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4a3af7075c1d..959dbff2d42d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -352,11 +352,15 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
352 struct adfs_sb_info *asb; 352 struct adfs_sb_info *asb;
353 struct inode *root; 353 struct inode *root;
354 354
355 lock_kernel();
356
355 sb->s_flags |= MS_NODIRATIME; 357 sb->s_flags |= MS_NODIRATIME;
356 358
357 asb = kzalloc(sizeof(*asb), GFP_KERNEL); 359 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
358 if (!asb) 360 if (!asb) {
361 unlock_kernel();
359 return -ENOMEM; 362 return -ENOMEM;
363 }
360 sb->s_fs_info = asb; 364 sb->s_fs_info = asb;
361 365
362 /* set default options */ 366 /* set default options */
@@ -474,6 +478,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
474 goto error; 478 goto error;
475 } else 479 } else
476 sb->s_root->d_op = &adfs_dentry_operations; 480 sb->s_root->d_op = &adfs_dentry_operations;
481 unlock_kernel();
477 return 0; 482 return 0;
478 483
479error_free_bh: 484error_free_bh:
@@ -481,20 +486,20 @@ error_free_bh:
481error: 486error:
482 sb->s_fs_info = NULL; 487 sb->s_fs_info = NULL;
483 kfree(asb); 488 kfree(asb);
489 unlock_kernel();
484 return -EINVAL; 490 return -EINVAL;
485} 491}
486 492
487static int adfs_get_sb(struct file_system_type *fs_type, 493static struct dentry *adfs_mount(struct file_system_type *fs_type,
488 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 494 int flags, const char *dev_name, void *data)
489{ 495{
490 return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super, 496 return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
491 mnt);
492} 497}
493 498
494static struct file_system_type adfs_fs_type = { 499static struct file_system_type adfs_fs_type = {
495 .owner = THIS_MODULE, 500 .owner = THIS_MODULE,
496 .name = "adfs", 501 .name = "adfs",
497 .get_sb = adfs_get_sb, 502 .mount = adfs_mount,
498 .kill_sb = kill_block_super, 503 .kill_sb = kill_block_super,
499 .fs_flags = FS_REQUIRES_DEV, 504 .fs_flags = FS_REQUIRES_DEV,
500}; 505};
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c4a9875bd1a6..0a90dcd46de2 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -894,9 +894,9 @@ affs_truncate(struct inode *inode)
894 if (AFFS_SB(sb)->s_flags & SF_OFS) { 894 if (AFFS_SB(sb)->s_flags & SF_OFS) {
895 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); 895 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
896 u32 tmp; 896 u32 tmp;
897 if (IS_ERR(ext_bh)) { 897 if (IS_ERR(bh)) {
898 affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", 898 affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
899 ext, PTR_ERR(ext_bh)); 899 ext, PTR_ERR(bh));
900 return; 900 return;
901 } 901 }
902 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); 902 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3a0fdec175ba..5d828903ac69 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); 388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
389 mark_buffer_dirty_inode(inode_bh, inode); 389 mark_buffer_dirty_inode(inode_bh, inode);
390 inode->i_nlink = 2; 390 inode->i_nlink = 2;
391 atomic_inc(&inode->i_count); 391 ihold(inode);
392 } 392 }
393 affs_fix_checksum(sb, bh); 393 affs_fix_checksum(sb, bh);
394 mark_buffer_dirty_inode(bh, inode); 394 mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 9581ea94d5a1..0cf7f4384cbd 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,7 +16,6 @@
16#include <linux/parser.h> 16#include <linux/parser.h>
17#include <linux/magic.h> 17#include <linux/magic.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/smp_lock.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include "affs.h" 20#include "affs.h"
22 21
@@ -46,8 +45,6 @@ affs_put_super(struct super_block *sb)
46 struct affs_sb_info *sbi = AFFS_SB(sb); 45 struct affs_sb_info *sbi = AFFS_SB(sb);
47 pr_debug("AFFS: put_super()\n"); 46 pr_debug("AFFS: put_super()\n");
48 47
49 lock_kernel();
50
51 if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt) 48 if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
52 affs_commit_super(sb, 1, 1); 49 affs_commit_super(sb, 1, 1);
53 50
@@ -56,8 +53,6 @@ affs_put_super(struct super_block *sb)
56 affs_brelse(sbi->s_root_bh); 53 affs_brelse(sbi->s_root_bh);
57 kfree(sbi); 54 kfree(sbi);
58 sb->s_fs_info = NULL; 55 sb->s_fs_info = NULL;
59
60 unlock_kernel();
61} 56}
62 57
63static void 58static void
@@ -302,6 +297,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
302 sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); 297 sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
303 if (!sbi) 298 if (!sbi)
304 return -ENOMEM; 299 return -ENOMEM;
300
305 sb->s_fs_info = sbi; 301 sb->s_fs_info = sbi;
306 mutex_init(&sbi->s_bmlock); 302 mutex_init(&sbi->s_bmlock);
307 spin_lock_init(&sbi->symlink_lock); 303 spin_lock_init(&sbi->symlink_lock);
@@ -527,7 +523,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
527 kfree(new_opts); 523 kfree(new_opts);
528 return -EINVAL; 524 return -EINVAL;
529 } 525 }
530 lock_kernel(); 526
531 replace_mount_options(sb, new_opts); 527 replace_mount_options(sb, new_opts);
532 528
533 sbi->s_flags = mount_flags; 529 sbi->s_flags = mount_flags;
@@ -543,17 +539,15 @@ affs_remount(struct super_block *sb, int *flags, char *data)
543 memcpy(sbi->s_volume, volume, 32); 539 memcpy(sbi->s_volume, volume, 32);
544 spin_unlock(&sbi->symlink_lock); 540 spin_unlock(&sbi->symlink_lock);
545 541
546 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 542 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
547 unlock_kernel();
548 return 0; 543 return 0;
549 } 544
550 if (*flags & MS_RDONLY) { 545 if (*flags & MS_RDONLY) {
551 affs_write_super(sb); 546 affs_write_super(sb);
552 affs_free_bitmap(sb); 547 affs_free_bitmap(sb);
553 } else 548 } else
554 res = affs_init_bitmap(sb, flags); 549 res = affs_init_bitmap(sb, flags);
555 550
556 unlock_kernel();
557 return res; 551 return res;
558} 552}
559 553
@@ -579,17 +573,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
579 return 0; 573 return 0;
580} 574}
581 575
582static int affs_get_sb(struct file_system_type *fs_type, 576static struct dentry *affs_mount(struct file_system_type *fs_type,
583 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 577 int flags, const char *dev_name, void *data)
584{ 578{
585 return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super, 579 return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
586 mnt);
587} 580}
588 581
589static struct file_system_type affs_fs_type = { 582static struct file_system_type affs_fs_type = {
590 .owner = THIS_MODULE, 583 .owner = THIS_MODULE,
591 .name = "affs", 584 .name = "affs",
592 .get_sb = affs_get_sb, 585 .mount = affs_mount,
593 .kill_sb = kill_block_super, 586 .kill_sb = kill_block_super,
594 .fs_flags = FS_REQUIRES_DEV, 587 .fs_flags = FS_REQUIRES_DEV,
595}; 588};
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0d38c09bd55e..5439e1bc9a86 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1045,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
1045 if (ret < 0) 1045 if (ret < 0)
1046 goto link_error; 1046 goto link_error;
1047 1047
1048 atomic_inc(&vnode->vfs_inode.i_count); 1048 ihold(&vnode->vfs_inode);
1049 d_instantiate(dentry, &vnode->vfs_inode); 1049 d_instantiate(dentry, &vnode->vfs_inode);
1050 key_put(key); 1050 key_put(key);
1051 _leave(" = 0"); 1051 _leave(" = 0");
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 0931bc1325eb..757d664575dd 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/smp_lock.h>
13#include "internal.h" 12#include "internal.h"
14 13
15#define AFS_LOCK_GRANTED 0 14#define AFS_LOCK_GRANTED 0
@@ -274,7 +273,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
274 273
275 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; 274 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
276 275
277 lock_kernel(); 276 lock_flocks();
278 277
279 /* make sure we've got a callback on this file and that our view of the 278 /* make sure we've got a callback on this file and that our view of the
280 * data version is up to date */ 279 * data version is up to date */
@@ -421,7 +420,7 @@ given_lock:
421 afs_vnode_fetch_status(vnode, NULL, key); 420 afs_vnode_fetch_status(vnode, NULL, key);
422 421
423error: 422error:
424 unlock_kernel(); 423 unlock_flocks();
425 _leave(" = %d", ret); 424 _leave(" = %d", ret);
426 return ret; 425 return ret;
427 426
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6d552686c498..6153417caf57 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -29,6 +29,7 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work);
29 29
30const struct file_operations afs_mntpt_file_operations = { 30const struct file_operations afs_mntpt_file_operations = {
31 .open = afs_mntpt_open, 31 .open = afs_mntpt_open,
32 .llseek = noop_llseek,
32}; 33};
33 34
34const struct inode_operations afs_mntpt_inode_operations = { 35const struct inode_operations afs_mntpt_inode_operations = {
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 77e1e5a61154..27201cffece4 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -19,7 +19,6 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/smp_lock.h>
23#include <linux/fs.h> 22#include <linux/fs.h>
24#include <linux/pagemap.h> 23#include <linux/pagemap.h>
25#include <linux/parser.h> 24#include <linux/parser.h>
@@ -30,9 +29,8 @@
30#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ 29#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
31 30
32static void afs_i_init_once(void *foo); 31static void afs_i_init_once(void *foo);
33static int afs_get_sb(struct file_system_type *fs_type, 32static struct dentry *afs_mount(struct file_system_type *fs_type,
34 int flags, const char *dev_name, 33 int flags, const char *dev_name, void *data);
35 void *data, struct vfsmount *mnt);
36static struct inode *afs_alloc_inode(struct super_block *sb); 34static struct inode *afs_alloc_inode(struct super_block *sb);
37static void afs_put_super(struct super_block *sb); 35static void afs_put_super(struct super_block *sb);
38static void afs_destroy_inode(struct inode *inode); 36static void afs_destroy_inode(struct inode *inode);
@@ -41,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
41struct file_system_type afs_fs_type = { 39struct file_system_type afs_fs_type = {
42 .owner = THIS_MODULE, 40 .owner = THIS_MODULE,
43 .name = "afs", 41 .name = "afs",
44 .get_sb = afs_get_sb, 42 .mount = afs_mount,
45 .kill_sb = kill_anon_super, 43 .kill_sb = kill_anon_super,
46 .fs_flags = 0, 44 .fs_flags = 0,
47}; 45};
@@ -360,11 +358,8 @@ error:
360/* 358/*
361 * get an AFS superblock 359 * get an AFS superblock
362 */ 360 */
363static int afs_get_sb(struct file_system_type *fs_type, 361static struct dentry *afs_mount(struct file_system_type *fs_type,
364 int flags, 362 int flags, const char *dev_name, void *options)
365 const char *dev_name,
366 void *options,
367 struct vfsmount *mnt)
368{ 363{
369 struct afs_mount_params params; 364 struct afs_mount_params params;
370 struct super_block *sb; 365 struct super_block *sb;
@@ -428,12 +423,11 @@ static int afs_get_sb(struct file_system_type *fs_type,
428 ASSERTCMP(sb->s_flags, &, MS_ACTIVE); 423 ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
429 } 424 }
430 425
431 simple_set_mnt(mnt, sb);
432 afs_put_volume(params.volume); 426 afs_put_volume(params.volume);
433 afs_put_cell(params.cell); 427 afs_put_cell(params.cell);
434 kfree(new_opts); 428 kfree(new_opts);
435 _leave(" = 0 [%p]", sb); 429 _leave(" = 0 [%p]", sb);
436 return 0; 430 return dget(sb->s_root);
437 431
438error: 432error:
439 afs_put_volume(params.volume); 433 afs_put_volume(params.volume);
@@ -441,7 +435,7 @@ error:
441 key_put(params.key); 435 key_put(params.key);
442 kfree(new_opts); 436 kfree(new_opts);
443 _leave(" = %d", ret); 437 _leave(" = %d", ret);
444 return ret; 438 return ERR_PTR(ret);
445} 439}
446 440
447/* 441/*
@@ -453,12 +447,8 @@ static void afs_put_super(struct super_block *sb)
453 447
454 _enter(""); 448 _enter("");
455 449
456 lock_kernel();
457
458 afs_put_volume(as->volume); 450 afs_put_volume(as->volume);
459 451
460 unlock_kernel();
461
462 _leave(""); 452 _leave("");
463} 453}
464 454
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 722743b152d8..15690bb1d3b5 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -438,7 +438,6 @@ no_more:
438 */ 438 */
439int afs_writepage(struct page *page, struct writeback_control *wbc) 439int afs_writepage(struct page *page, struct writeback_control *wbc)
440{ 440{
441 struct backing_dev_info *bdi = page->mapping->backing_dev_info;
442 struct afs_writeback *wb; 441 struct afs_writeback *wb;
443 int ret; 442 int ret;
444 443
@@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
455 } 454 }
456 455
457 wbc->nr_to_write -= ret; 456 wbc->nr_to_write -= ret;
458 if (wbc->nonblocking && bdi_write_congested(bdi))
459 wbc->encountered_congestion = 1;
460 457
461 _leave(" = 0"); 458 _leave(" = 0");
462 return 0; 459 return 0;
@@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping,
469 struct writeback_control *wbc, 466 struct writeback_control *wbc,
470 pgoff_t index, pgoff_t end, pgoff_t *_next) 467 pgoff_t index, pgoff_t end, pgoff_t *_next)
471{ 468{
472 struct backing_dev_info *bdi = mapping->backing_dev_info;
473 struct afs_writeback *wb; 469 struct afs_writeback *wb;
474 struct page *page; 470 struct page *page;
475 int ret, n; 471 int ret, n;
@@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping,
529 525
530 wbc->nr_to_write -= ret; 526 wbc->nr_to_write -= ret;
531 527
532 if (wbc->nonblocking && bdi_write_congested(bdi)) {
533 wbc->encountered_congestion = 1;
534 break;
535 }
536
537 cond_resched(); 528 cond_resched();
538 } while (index < end && wbc->nr_to_write > 0); 529 } while (index < end && wbc->nr_to_write > 0);
539 530
@@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping,
548int afs_writepages(struct address_space *mapping, 539int afs_writepages(struct address_space *mapping,
549 struct writeback_control *wbc) 540 struct writeback_control *wbc)
550{ 541{
551 struct backing_dev_info *bdi = mapping->backing_dev_info;
552 pgoff_t start, end, next; 542 pgoff_t start, end, next;
553 int ret; 543 int ret;
554 544
555 _enter(""); 545 _enter("");
556 546
557 if (wbc->nonblocking && bdi_write_congested(bdi)) {
558 wbc->encountered_congestion = 1;
559 _leave(" = 0 [congest]");
560 return 0;
561 }
562
563 if (wbc->range_cyclic) { 547 if (wbc->range_cyclic) {
564 start = mapping->writeback_index; 548 start = mapping->writeback_index;
565 end = -1; 549 end = -1;
566 ret = afs_writepages_region(mapping, wbc, start, end, &next); 550 ret = afs_writepages_region(mapping, wbc, start, end, &next);
567 if (start > 0 && wbc->nr_to_write > 0 && ret == 0 && 551 if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
568 !(wbc->nonblocking && wbc->encountered_congestion))
569 ret = afs_writepages_region(mapping, wbc, 0, start, 552 ret = afs_writepages_region(mapping, wbc, 0, start,
570 &next); 553 &next);
571 mapping->writeback_index = next; 554 mapping->writeback_index = next;
diff --git a/fs/aio.c b/fs/aio.c
index 250b0a73c8a8..8c8f6c5b6d79 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1543,7 +1543,19 @@ static void aio_batch_add(struct address_space *mapping,
1543 } 1543 }
1544 1544
1545 abe = mempool_alloc(abe_pool, GFP_KERNEL); 1545 abe = mempool_alloc(abe_pool, GFP_KERNEL);
1546 BUG_ON(!igrab(mapping->host)); 1546
1547 /*
1548 * we should be using igrab here, but
1549 * we don't want to hammer on the global
1550 * inode spinlock just to take an extra
1551 * reference on a file that we must already
1552 * have a reference to.
1553 *
1554 * When we're called, we always have a reference
1555 * on the file, so we must always have a reference
1556 * on the inode, so ihold() is safe here.
1557 */
1558 ihold(mapping->host);
1547 abe->mapping = mapping; 1559 abe->mapping = mapping;
1548 hlist_add_head(&abe->list, &batch_hash[bucket]); 1560 hlist_add_head(&abe->list, &batch_hash[bucket]);
1549 return; 1561 return;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..57ce55b2564c 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,10 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
26static struct inode *anon_inode_inode; 26static struct inode *anon_inode_inode;
27static const struct file_operations anon_inode_fops; 27static const struct file_operations anon_inode_fops;
28 28
29static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags, 29static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
30 const char *dev_name, void *data, 30 int flags, const char *dev_name, void *data)
31 struct vfsmount *mnt)
32{ 31{
33 return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC, 32 return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
34 mnt);
35} 33}
36 34
37/* 35/*
@@ -45,7 +43,7 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
45 43
46static struct file_system_type anon_inode_fs_type = { 44static struct file_system_type anon_inode_fs_type = {
47 .name = "anon_inodefs", 45 .name = "anon_inodefs",
48 .get_sb = anon_inodefs_get_sb, 46 .mount = anon_inodefs_mount,
49 .kill_sb = kill_anon_super, 47 .kill_sb = kill_anon_super,
50}; 48};
51static const struct dentry_operations anon_inodefs_dentry_operations = { 49static const struct dentry_operations anon_inodefs_dentry_operations = {
@@ -111,10 +109,9 @@ struct file *anon_inode_getfile(const char *name,
111 path.mnt = mntget(anon_inode_mnt); 109 path.mnt = mntget(anon_inode_mnt);
112 /* 110 /*
113 * We know the anon_inode inode count is always greater than zero, 111 * We know the anon_inode inode count is always greater than zero,
114 * so we can avoid doing an igrab() and we can use an open-coded 112 * so ihold() is safe.
115 * atomic_inc().
116 */ 113 */
117 atomic_inc(&anon_inode_inode->i_count); 114 ihold(anon_inode_inode);
118 115
119 path.dentry->d_op = &anon_inodefs_dentry_operations; 116 path.dentry->d_op = &anon_inodefs_dentry_operations;
120 d_instantiate(path.dentry, anon_inode_inode); 117 d_instantiate(path.dentry, anon_inode_inode);
@@ -194,6 +191,7 @@ static struct inode *anon_inode_mkinode(void)
194 if (!inode) 191 if (!inode)
195 return ERR_PTR(-ENOMEM); 192 return ERR_PTR(-ENOMEM);
196 193
194 inode->i_ino = get_next_ino();
197 inode->i_fop = &anon_inode_fops; 195 inode->i_fop = &anon_inode_fops;
198 196
199 inode->i_mapping->a_ops = &anon_aops; 197 inode->i_mapping->a_ops = &anon_aops;
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
deleted file mode 100644
index 5f3bea90911e..000000000000
--- a/fs/autofs/Kconfig
+++ /dev/null
@@ -1,21 +0,0 @@
1config AUTOFS_FS
2 tristate "Kernel automounter support"
3 help
4 The automounter is a tool to automatically mount remote file systems
5 on demand. This implementation is partially kernel-based to reduce
6 overhead in the already-mounted case; this is unlike the BSD
7 automounter (amd), which is a pure user space daemon.
8
9 To use the automounter you need the user-space tools from the autofs
10 package; you can find the location in <file:Documentation/Changes>.
11 You also want to answer Y to "NFS file system support", below.
12
13 If you want to use the newer version of the automounter with more
14 features, say N here and say Y to "Kernel automounter v4 support",
15 below.
16
17 To compile this support as a module, choose M here: the module will be
18 called autofs.
19
20 If you are not a part of a fairly large, distributed network, you
21 probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
deleted file mode 100644
index 453a60f46d05..000000000000
--- a/fs/autofs/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
1#
2# Makefile for the linux autofs-filesystem routines.
3#
4
5obj-$(CONFIG_AUTOFS_FS) += autofs.o
6
7autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
deleted file mode 100644
index 901a3e67ec45..000000000000
--- a/fs/autofs/autofs_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * linux/fs/autofs/autofs_i.h
4 *
5 * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/* Internal header file for autofs */
14
15#include <linux/auto_fs.h>
16
17/* This is the range of ioctl() numbers we claim as ours */
18#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
19#define AUTOFS_IOC_COUNT 32
20
21#include <linux/kernel.h>
22#include <linux/slab.h>
23#include <linux/time.h>
24#include <linux/string.h>
25#include <linux/wait.h>
26#include <linux/dcache.h>
27#include <linux/namei.h>
28#include <linux/mount.h>
29#include <linux/sched.h>
30
31#include <asm/current.h>
32#include <asm/uaccess.h>
33
34#ifdef DEBUG
35#define DPRINTK(D) (printk D)
36#else
37#define DPRINTK(D) ((void)0)
38#endif
39
40/*
41 * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
42 * kernel will keep the negative response cached for up to the time given
43 * here, although the time can be shorter if the kernel throws the dcache
44 * entry away. This probably should be settable from user space.
45 */
46#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ) /* 1 minute */
47
48/* Structures associated with the root directory hash table */
49
50#define AUTOFS_HASH_SIZE 67
51
52struct autofs_dir_ent {
53 int hash;
54 char *name;
55 int len;
56 ino_t ino;
57 struct dentry *dentry;
58 /* Linked list of entries */
59 struct autofs_dir_ent *next;
60 struct autofs_dir_ent **back;
61 /* The following entries are for the expiry system */
62 unsigned long last_usage;
63 struct list_head exp;
64};
65
66struct autofs_dirhash {
67 struct autofs_dir_ent *h[AUTOFS_HASH_SIZE];
68 struct list_head expiry_head;
69};
70
71struct autofs_wait_queue {
72 wait_queue_head_t queue;
73 struct autofs_wait_queue *next;
74 autofs_wqt_t wait_queue_token;
75 /* We use the following to see what we are waiting for */
76 int hash;
77 int len;
78 char *name;
79 /* This is for status reporting upon return */
80 int status;
81 int wait_ctr;
82};
83
84struct autofs_symlink {
85 char *data;
86 int len;
87 time_t mtime;
88};
89
90#define AUTOFS_MAX_SYMLINKS 256
91
92#define AUTOFS_ROOT_INO 1
93#define AUTOFS_FIRST_SYMLINK 2
94#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS)
95
96#define AUTOFS_SYMLINK_BITMAP_LEN \
97 ((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8))
98
99#define AUTOFS_SBI_MAGIC 0x6d4a556d
100
101struct autofs_sb_info {
102 u32 magic;
103 struct file *pipe;
104 struct pid *oz_pgrp;
105 int catatonic;
106 struct super_block *sb;
107 unsigned long exp_timeout;
108 ino_t next_dir_ino;
109 struct autofs_wait_queue *queues; /* Wait queue pointer */
110 struct autofs_dirhash dirhash; /* Root directory hash */
111 struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS];
112 unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN];
113};
114
115static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
116{
117 return (struct autofs_sb_info *)(sb->s_fs_info);
118}
119
120/* autofs_oz_mode(): do we see the man behind the curtain? (The
121 processes which do manipulations for us in user space sees the raw
122 filesystem without "magic".) */
123
124static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
125 return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
126}
127
128/* Hash operations */
129
130void autofs_initialize_hash(struct autofs_dirhash *);
131struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *);
132void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
133void autofs_hash_delete(struct autofs_dir_ent *);
134struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
135void autofs_hash_dputall(struct autofs_dirhash *);
136void autofs_hash_nuke(struct autofs_sb_info *);
137
138/* Expiration-handling functions */
139
140void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *);
141struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt);
142
143/* Operations structures */
144
145extern const struct inode_operations autofs_root_inode_operations;
146extern const struct inode_operations autofs_symlink_inode_operations;
147extern const struct file_operations autofs_root_operations;
148
149/* Initializing function */
150
151int autofs_fill_super(struct super_block *, void *, int);
152void autofs_kill_sb(struct super_block *sb);
153struct inode *autofs_iget(struct super_block *, unsigned long);
154
155/* Queue management functions */
156
157int autofs_wait(struct autofs_sb_info *,struct qstr *);
158int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
159void autofs_catatonic_mode(struct autofs_sb_info *);
160
161#ifdef DEBUG
162void autofs_say(const char *name, int len);
163#else
164#define autofs_say(n,l) ((void)0)
165#endif
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
deleted file mode 100644
index e947915109e5..000000000000
--- a/fs/autofs/dirhash.c
+++ /dev/null
@@ -1,250 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/dirhash.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include "autofs_i.h"
14
15/* Functions for maintenance of expiry queue */
16
17static void autofs_init_usage(struct autofs_dirhash *dh,
18 struct autofs_dir_ent *ent)
19{
20 list_add_tail(&ent->exp, &dh->expiry_head);
21 ent->last_usage = jiffies;
22}
23
24static void autofs_delete_usage(struct autofs_dir_ent *ent)
25{
26 list_del(&ent->exp);
27}
28
29void autofs_update_usage(struct autofs_dirhash *dh,
30 struct autofs_dir_ent *ent)
31{
32 autofs_delete_usage(ent); /* Unlink from current position */
33 autofs_init_usage(dh,ent); /* Relink at queue tail */
34}
35
36struct autofs_dir_ent *autofs_expire(struct super_block *sb,
37 struct autofs_sb_info *sbi,
38 struct vfsmount *mnt)
39{
40 struct autofs_dirhash *dh = &sbi->dirhash;
41 struct autofs_dir_ent *ent;
42 unsigned long timeout = sbi->exp_timeout;
43
44 while (1) {
45 struct path path;
46 int umount_ok;
47
48 if ( list_empty(&dh->expiry_head) || sbi->catatonic )
49 return NULL; /* No entries */
50 /* We keep the list sorted by last_usage and want old stuff */
51 ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp);
52 if (jiffies - ent->last_usage < timeout)
53 break;
54 /* Move to end of list in case expiry isn't desirable */
55 autofs_update_usage(dh, ent);
56
57 /* Check to see that entry is expirable */
58 if ( ent->ino < AUTOFS_FIRST_DIR_INO )
59 return ent; /* Symlinks are always expirable */
60
61 /* Get the dentry for the autofs subdirectory */
62 path.dentry = ent->dentry;
63
64 if (!path.dentry) {
65 /* Should only happen in catatonic mode */
66 printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
67 autofs_delete_usage(ent);
68 continue;
69 }
70
71 if (!path.dentry->d_inode) {
72 dput(path.dentry);
73 printk("autofs: negative dentry on expiry queue: %s\n",
74 ent->name);
75 autofs_delete_usage(ent);
76 continue;
77 }
78
79 /* Make sure entry is mounted and unused; note that dentry will
80 point to the mounted-on-top root. */
81 if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
82 !d_mountpoint(path.dentry)) {
83 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
84 continue;
85 }
86 path.mnt = mnt;
87 path_get(&path);
88 if (!follow_down(&path)) {
89 path_put(&path);
90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
91 continue;
92 }
93 while (d_mountpoint(path.dentry) && follow_down(&path))
94 ;
95 umount_ok = may_umount(path.mnt);
96 path_put(&path);
97
98 if (umount_ok) {
99 DPRINTK(("autofs: signaling expire on %s\n", ent->name));
100 return ent; /* Expirable! */
101 }
102 DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
103 }
104 return NULL; /* No expirable entries */
105}
106
107void autofs_initialize_hash(struct autofs_dirhash *dh) {
108 memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *));
109 INIT_LIST_HEAD(&dh->expiry_head);
110}
111
112struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name)
113{
114 struct autofs_dir_ent *dhn;
115
116 DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash));
117 autofs_say(name->name,name->len);
118
119 for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) {
120 if ( name->hash == dhn->hash &&
121 name->len == dhn->len &&
122 !memcmp(name->name, dhn->name, name->len) )
123 break;
124 }
125
126 return dhn;
127}
128
129void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent)
130{
131 struct autofs_dir_ent **dhnp;
132
133 DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash));
134 autofs_say(ent->name,ent->len);
135
136 autofs_init_usage(dh,ent);
137 if (ent->dentry)
138 dget(ent->dentry);
139
140 dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE];
141 ent->next = *dhnp;
142 ent->back = dhnp;
143 *dhnp = ent;
144 if ( ent->next )
145 ent->next->back = &(ent->next);
146}
147
148void autofs_hash_delete(struct autofs_dir_ent *ent)
149{
150 *(ent->back) = ent->next;
151 if ( ent->next )
152 ent->next->back = ent->back;
153
154 autofs_delete_usage(ent);
155
156 if ( ent->dentry )
157 dput(ent->dentry);
158 kfree(ent->name);
159 kfree(ent);
160}
161
162/*
163 * Used by readdir(). We must validate "ptr", so we can't simply make it
164 * a pointer. Values below 0xffff are reserved; calling with any value
165 * <= 0x10000 will return the first entry found.
166 *
167 * "last" can be NULL or the value returned by the last search *if* we
168 * want the next sequential entry.
169 */
170struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh,
171 off_t *ptr, struct autofs_dir_ent *last)
172{
173 int bucket, ecount, i;
174 struct autofs_dir_ent *ent;
175
176 bucket = (*ptr >> 16) - 1;
177 ecount = *ptr & 0xffff;
178
179 if ( bucket < 0 ) {
180 bucket = ecount = 0;
181 }
182
183 DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount));
184
185 ent = last ? last->next : NULL;
186
187 if ( ent ) {
188 ecount++;
189 } else {
190 while ( bucket < AUTOFS_HASH_SIZE ) {
191 ent = dh->h[bucket];
192 for ( i = ecount ; ent && i ; i-- )
193 ent = ent->next;
194
195 if (ent) {
196 ecount++; /* Point to *next* entry */
197 break;
198 }
199
200 bucket++; ecount = 0;
201 }
202 }
203
204#ifdef DEBUG
205 if ( !ent )
206 printk("autofs_hash_enum: nothing found\n");
207 else {
208 printk("autofs_hash_enum: found hash %08x, name", ent->hash);
209 autofs_say(ent->name,ent->len);
210 }
211#endif
212
213 *ptr = ((bucket+1) << 16) + ecount;
214 return ent;
215}
216
217/* Iterate over all the ents, and remove all dentry pointers. Used on
218 entering catatonic mode, in order to make the filesystem unmountable. */
219void autofs_hash_dputall(struct autofs_dirhash *dh)
220{
221 int i;
222 struct autofs_dir_ent *ent;
223
224 for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
225 for ( ent = dh->h[i] ; ent ; ent = ent->next ) {
226 if ( ent->dentry ) {
227 dput(ent->dentry);
228 ent->dentry = NULL;
229 }
230 }
231 }
232}
233
234/* Delete everything. This is used on filesystem destruction, so we
235 make no attempt to keep the pointers valid */
236void autofs_hash_nuke(struct autofs_sb_info *sbi)
237{
238 int i;
239 struct autofs_dir_ent *ent, *nent;
240
241 for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
242 for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
243 nent = ent->next;
244 if ( ent->dentry )
245 dput(ent->dentry);
246 kfree(ent->name);
247 kfree(ent);
248 }
249 }
250}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
deleted file mode 100644
index cea5219b4f37..000000000000
--- a/fs/autofs/init.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/init.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include "autofs_i.h"
16
17static int autofs_get_sb(struct file_system_type *fs_type,
18 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
19{
20 return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
21}
22
23static struct file_system_type autofs_fs_type = {
24 .owner = THIS_MODULE,
25 .name = "autofs",
26 .get_sb = autofs_get_sb,
27 .kill_sb = autofs_kill_sb,
28};
29
30static int __init init_autofs_fs(void)
31{
32 return register_filesystem(&autofs_fs_type);
33}
34
35static void __exit exit_autofs_fs(void)
36{
37 unregister_filesystem(&autofs_fs_type);
38}
39
40module_init(init_autofs_fs);
41module_exit(exit_autofs_fs);
42
43#ifdef DEBUG
44void autofs_say(const char *name, int len)
45{
46 printk("(%d: ", len);
47 while ( len-- )
48 printk("%c", *name++);
49 printk(")\n");
50}
51#endif
52MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
deleted file mode 100644
index e1734f2d6e26..000000000000
--- a/fs/autofs/inode.c
+++ /dev/null
@@ -1,288 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/inode.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/kernel.h>
14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/file.h>
17#include <linux/parser.h>
18#include <linux/bitops.h>
19#include <linux/magic.h>
20#include "autofs_i.h"
21#include <linux/module.h>
22
23void autofs_kill_sb(struct super_block *sb)
24{
25 struct autofs_sb_info *sbi = autofs_sbi(sb);
26 unsigned int n;
27
28 /*
29 * In the event of a failure in get_sb_nodev the superblock
30 * info is not present so nothing else has been setup, so
31 * just call kill_anon_super when we are called from
32 * deactivate_super.
33 */
34 if (!sbi)
35 goto out_kill_sb;
36
37 if (!sbi->catatonic)
38 autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
39
40 put_pid(sbi->oz_pgrp);
41
42 autofs_hash_nuke(sbi);
43 for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
44 if (test_bit(n, sbi->symlink_bitmap))
45 kfree(sbi->symlink[n].data);
46 }
47
48 kfree(sb->s_fs_info);
49
50out_kill_sb:
51 DPRINTK(("autofs: shutting down\n"));
52 kill_anon_super(sb);
53}
54
55static const struct super_operations autofs_sops = {
56 .statfs = simple_statfs,
57 .show_options = generic_show_options,
58};
59
60enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
61
62static const match_table_t autofs_tokens = {
63 {Opt_fd, "fd=%u"},
64 {Opt_uid, "uid=%u"},
65 {Opt_gid, "gid=%u"},
66 {Opt_pgrp, "pgrp=%u"},
67 {Opt_minproto, "minproto=%u"},
68 {Opt_maxproto, "maxproto=%u"},
69 {Opt_err, NULL}
70};
71
72static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
73 pid_t *pgrp, int *minproto, int *maxproto)
74{
75 char *p;
76 substring_t args[MAX_OPT_ARGS];
77 int option;
78
79 *uid = current_uid();
80 *gid = current_gid();
81 *pgrp = task_pgrp_nr(current);
82
83 *minproto = *maxproto = AUTOFS_PROTO_VERSION;
84
85 *pipefd = -1;
86
87 if (!options)
88 return 1;
89
90 while ((p = strsep(&options, ",")) != NULL) {
91 int token;
92 if (!*p)
93 continue;
94
95 token = match_token(p, autofs_tokens, args);
96 switch (token) {
97 case Opt_fd:
98 if (match_int(&args[0], &option))
99 return 1;
100 *pipefd = option;
101 break;
102 case Opt_uid:
103 if (match_int(&args[0], &option))
104 return 1;
105 *uid = option;
106 break;
107 case Opt_gid:
108 if (match_int(&args[0], &option))
109 return 1;
110 *gid = option;
111 break;
112 case Opt_pgrp:
113 if (match_int(&args[0], &option))
114 return 1;
115 *pgrp = option;
116 break;
117 case Opt_minproto:
118 if (match_int(&args[0], &option))
119 return 1;
120 *minproto = option;
121 break;
122 case Opt_maxproto:
123 if (match_int(&args[0], &option))
124 return 1;
125 *maxproto = option;
126 break;
127 default:
128 return 1;
129 }
130 }
131 return (*pipefd < 0);
132}
133
134int autofs_fill_super(struct super_block *s, void *data, int silent)
135{
136 struct inode * root_inode;
137 struct dentry * root;
138 struct file * pipe;
139 int pipefd;
140 struct autofs_sb_info *sbi;
141 int minproto, maxproto;
142 pid_t pgid;
143
144 save_mount_options(s, data);
145
146 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
147 if (!sbi)
148 goto fail_unlock;
149 DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
150
151 s->s_fs_info = sbi;
152 sbi->magic = AUTOFS_SBI_MAGIC;
153 sbi->pipe = NULL;
154 sbi->catatonic = 1;
155 sbi->exp_timeout = 0;
156 autofs_initialize_hash(&sbi->dirhash);
157 sbi->queues = NULL;
158 memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
159 sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO;
160 s->s_blocksize = 1024;
161 s->s_blocksize_bits = 10;
162 s->s_magic = AUTOFS_SUPER_MAGIC;
163 s->s_op = &autofs_sops;
164 s->s_time_gran = 1;
165 sbi->sb = s;
166
167 root_inode = autofs_iget(s, AUTOFS_ROOT_INO);
168 if (IS_ERR(root_inode))
169 goto fail_free;
170 root = d_alloc_root(root_inode);
171 pipe = NULL;
172
173 if (!root)
174 goto fail_iput;
175
176 /* Can this call block? - WTF cares? s is locked. */
177 if (parse_options(data, &pipefd, &root_inode->i_uid,
178 &root_inode->i_gid, &pgid, &minproto,
179 &maxproto)) {
180 printk("autofs: called with bogus options\n");
181 goto fail_dput;
182 }
183
184 /* Couldn't this be tested earlier? */
185 if (minproto > AUTOFS_PROTO_VERSION ||
186 maxproto < AUTOFS_PROTO_VERSION) {
187 printk("autofs: kernel does not match daemon version\n");
188 goto fail_dput;
189 }
190
191 DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
192 sbi->oz_pgrp = find_get_pid(pgid);
193
194 if (!sbi->oz_pgrp) {
195 printk("autofs: could not find process group %d\n", pgid);
196 goto fail_dput;
197 }
198
199 pipe = fget(pipefd);
200
201 if (!pipe) {
202 printk("autofs: could not open pipe file descriptor\n");
203 goto fail_put_pid;
204 }
205
206 if (!pipe->f_op || !pipe->f_op->write)
207 goto fail_fput;
208 sbi->pipe = pipe;
209 sbi->catatonic = 0;
210
211 /*
212 * Success! Install the root dentry now to indicate completion.
213 */
214 s->s_root = root;
215 return 0;
216
217fail_fput:
218 printk("autofs: pipe file descriptor does not contain proper ops\n");
219 fput(pipe);
220fail_put_pid:
221 put_pid(sbi->oz_pgrp);
222fail_dput:
223 dput(root);
224 goto fail_free;
225fail_iput:
226 printk("autofs: get root dentry failed\n");
227 iput(root_inode);
228fail_free:
229 kfree(sbi);
230 s->s_fs_info = NULL;
231fail_unlock:
232 return -EINVAL;
233}
234
235struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
236{
237 unsigned int n;
238 struct autofs_sb_info *sbi = autofs_sbi(sb);
239 struct inode *inode;
240
241 inode = iget_locked(sb, ino);
242 if (!inode)
243 return ERR_PTR(-ENOMEM);
244 if (!(inode->i_state & I_NEW))
245 return inode;
246
247 /* Initialize to the default case (stub directory) */
248
249 inode->i_op = &simple_dir_inode_operations;
250 inode->i_fop = &simple_dir_operations;
251 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
252 inode->i_nlink = 2;
253 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
254
255 if (ino == AUTOFS_ROOT_INO) {
256 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
257 inode->i_op = &autofs_root_inode_operations;
258 inode->i_fop = &autofs_root_operations;
259 goto done;
260 }
261
262 inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
263 inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
264
265 if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
266 /* Symlink inode - should be in symlink list */
267 struct autofs_symlink *sl;
268
269 n = ino - AUTOFS_FIRST_SYMLINK;
270 if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
271 printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
272 goto done;
273 }
274
275 inode->i_op = &autofs_symlink_inode_operations;
276 sl = &sbi->symlink[n];
277 inode->i_private = sl;
278 inode->i_mode = S_IFLNK | S_IRWXUGO;
279 inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
280 inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
281 inode->i_size = sl->len;
282 inode->i_nlink = 1;
283 }
284
285done:
286 unlock_new_inode(inode);
287 return inode;
288}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
deleted file mode 100644
index 11b1ea786d00..000000000000
--- a/fs/autofs/root.c
+++ /dev/null
@@ -1,643 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/root.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/capability.h>
14#include <linux/errno.h>
15#include <linux/stat.h>
16#include <linux/slab.h>
17#include <linux/param.h>
18#include <linux/time.h>
19#include <linux/compat.h>
20#include <linux/smp_lock.h>
21#include "autofs_i.h"
22
23static int autofs_root_readdir(struct file *,void *,filldir_t);
24static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *);
25static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
26static int autofs_root_unlink(struct inode *,struct dentry *);
27static int autofs_root_rmdir(struct inode *,struct dentry *);
28static int autofs_root_mkdir(struct inode *,struct dentry *,int);
29static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
30static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
31
32const struct file_operations autofs_root_operations = {
33 .llseek = generic_file_llseek,
34 .read = generic_read_dir,
35 .readdir = autofs_root_readdir,
36 .unlocked_ioctl = autofs_root_ioctl,
37#ifdef CONFIG_COMPAT
38 .compat_ioctl = autofs_root_compat_ioctl,
39#endif
40};
41
42const struct inode_operations autofs_root_inode_operations = {
43 .lookup = autofs_root_lookup,
44 .unlink = autofs_root_unlink,
45 .symlink = autofs_root_symlink,
46 .mkdir = autofs_root_mkdir,
47 .rmdir = autofs_root_rmdir,
48};
49
50static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
51{
52 struct autofs_dir_ent *ent = NULL;
53 struct autofs_dirhash *dirhash;
54 struct autofs_sb_info *sbi;
55 struct inode * inode = filp->f_path.dentry->d_inode;
56 off_t onr, nr;
57
58 lock_kernel();
59
60 sbi = autofs_sbi(inode->i_sb);
61 dirhash = &sbi->dirhash;
62 nr = filp->f_pos;
63
64 switch(nr)
65 {
66 case 0:
67 if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
68 goto out;
69 filp->f_pos = ++nr;
70 /* fall through */
71 case 1:
72 if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
73 goto out;
74 filp->f_pos = ++nr;
75 /* fall through */
76 default:
77 while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
78 if (!ent->dentry || d_mountpoint(ent->dentry)) {
79 if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
80 goto out;
81 filp->f_pos = nr;
82 }
83 }
84 break;
85 }
86
87out:
88 unlock_kernel();
89 return 0;
90}
91
92static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi)
93{
94 struct inode * inode;
95 struct autofs_dir_ent *ent;
96 int status = 0;
97
98 if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
99 do {
100 if (status && dentry->d_inode) {
101 if (status != -ENOENT)
102 printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
103 return 0; /* Try to get the kernel to invalidate this dentry */
104 }
105
106 /* Turn this into a real negative dentry? */
107 if (status == -ENOENT) {
108 dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT;
109 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
110 return 1;
111 } else if (status) {
112 /* Return a negative dentry, but leave it "pending" */
113 return 1;
114 }
115 status = autofs_wait(sbi, &dentry->d_name);
116 } while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
117 }
118
119 /* Abuse this field as a pointer to the directory entry, used to
120 find the expire list pointers */
121 dentry->d_time = (unsigned long) ent;
122
123 if (!dentry->d_inode) {
124 inode = autofs_iget(sb, ent->ino);
125 if (IS_ERR(inode)) {
126 /* Failed, but leave pending for next time */
127 return 1;
128 }
129 dentry->d_inode = inode;
130 }
131
132 /* If this is a directory that isn't a mount point, bitch at the
133 daemon and fix it in user space */
134 if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
135 return !autofs_wait(sbi, &dentry->d_name);
136 }
137
138 /* We don't update the usages for the autofs daemon itself, this
139 is necessary for recursive autofs mounts */
140 if (!autofs_oz_mode(sbi)) {
141 autofs_update_usage(&sbi->dirhash,ent);
142 }
143
144 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
145 return 1;
146}
147
148
149/*
150 * Revalidate is called on every cache lookup. Some of those
151 * cache lookups may actually happen while the dentry is not
152 * yet completely filled in, and revalidate has to delay such
153 * lookups..
154 */
155static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
156{
157 struct inode * dir;
158 struct autofs_sb_info *sbi;
159 struct autofs_dir_ent *ent;
160 int res;
161
162 lock_kernel();
163 dir = dentry->d_parent->d_inode;
164 sbi = autofs_sbi(dir->i_sb);
165
166 /* Pending dentry */
167 if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
168 if (autofs_oz_mode(sbi))
169 res = 1;
170 else
171 res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
172 unlock_kernel();
173 return res;
174 }
175
176 /* Negative dentry.. invalidate if "old" */
177 if (!dentry->d_inode) {
178 unlock_kernel();
179 return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT);
180 }
181
182 /* Check for a non-mountpoint directory */
183 if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
184 if (autofs_oz_mode(sbi))
185 res = 1;
186 else
187 res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
188 unlock_kernel();
189 return res;
190 }
191
192 /* Update the usage list */
193 if (!autofs_oz_mode(sbi)) {
194 ent = (struct autofs_dir_ent *) dentry->d_time;
195 if (ent)
196 autofs_update_usage(&sbi->dirhash,ent);
197 }
198 unlock_kernel();
199 return 1;
200}
201
202static const struct dentry_operations autofs_dentry_operations = {
203 .d_revalidate = autofs_revalidate,
204};
205
206static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
207{
208 struct autofs_sb_info *sbi;
209 int oz_mode;
210
211 DPRINTK(("autofs_root_lookup: name = "));
212 lock_kernel();
213 autofs_say(dentry->d_name.name,dentry->d_name.len);
214
215 if (dentry->d_name.len > NAME_MAX) {
216 unlock_kernel();
217 return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */
218 }
219
220 sbi = autofs_sbi(dir->i_sb);
221
222 oz_mode = autofs_oz_mode(sbi);
223 DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
224 "oz_mode = %d\n", task_pid_nr(current),
225 task_pgrp_nr(current), sbi->catatonic,
226 oz_mode));
227
228 /*
229 * Mark the dentry incomplete, but add it. This is needed so
230 * that the VFS layer knows about the dentry, and we can count
231 * on catching any lookups through the revalidate.
232 *
233 * Let all the hard work be done by the revalidate function that
234 * needs to be able to do this anyway..
235 *
236 * We need to do this before we release the directory semaphore.
237 */
238 dentry->d_op = &autofs_dentry_operations;
239 dentry->d_flags |= DCACHE_AUTOFS_PENDING;
240 d_add(dentry, NULL);
241
242 mutex_unlock(&dir->i_mutex);
243 autofs_revalidate(dentry, nd);
244 mutex_lock(&dir->i_mutex);
245
246 /*
247 * If we are still pending, check if we had to handle
248 * a signal. If so we can force a restart..
249 */
250 if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
251 /* See if we were interrupted */
252 if (signal_pending(current)) {
253 sigset_t *sigset = &current->pending.signal;
254 if (sigismember (sigset, SIGKILL) ||
255 sigismember (sigset, SIGQUIT) ||
256 sigismember (sigset, SIGINT)) {
257 unlock_kernel();
258 return ERR_PTR(-ERESTARTNOINTR);
259 }
260 }
261 }
262 unlock_kernel();
263
264 /*
265 * If this dentry is unhashed, then we shouldn't honour this
266 * lookup even if the dentry is positive. Returning ENOENT here
267 * doesn't do the right thing for all system calls, but it should
268 * be OK for the operations we permit from an autofs.
269 */
270 if (dentry->d_inode && d_unhashed(dentry))
271 return ERR_PTR(-ENOENT);
272
273 return NULL;
274}
275
276static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
277{
278 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
279 struct autofs_dirhash *dh = &sbi->dirhash;
280 struct autofs_dir_ent *ent;
281 unsigned int n;
282 int slsize;
283 struct autofs_symlink *sl;
284 struct inode *inode;
285
286 DPRINTK(("autofs_root_symlink: %s <- ", symname));
287 autofs_say(dentry->d_name.name,dentry->d_name.len);
288
289 lock_kernel();
290 if (!autofs_oz_mode(sbi)) {
291 unlock_kernel();
292 return -EACCES;
293 }
294
295 if (autofs_hash_lookup(dh, &dentry->d_name)) {
296 unlock_kernel();
297 return -EEXIST;
298 }
299
300 n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
301 if (n >= AUTOFS_MAX_SYMLINKS) {
302 unlock_kernel();
303 return -ENOSPC;
304 }
305
306 set_bit(n,sbi->symlink_bitmap);
307 sl = &sbi->symlink[n];
308 sl->len = strlen(symname);
309 sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
310 if (!sl->data) {
311 clear_bit(n,sbi->symlink_bitmap);
312 unlock_kernel();
313 return -ENOSPC;
314 }
315
316 ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
317 if (!ent) {
318 kfree(sl->data);
319 clear_bit(n,sbi->symlink_bitmap);
320 unlock_kernel();
321 return -ENOSPC;
322 }
323
324 ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
325 if (!ent->name) {
326 kfree(sl->data);
327 kfree(ent);
328 clear_bit(n,sbi->symlink_bitmap);
329 unlock_kernel();
330 return -ENOSPC;
331 }
332
333 memcpy(sl->data,symname,slsize);
334 sl->mtime = get_seconds();
335
336 ent->ino = AUTOFS_FIRST_SYMLINK + n;
337 ent->hash = dentry->d_name.hash;
338 memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
339 ent->dentry = NULL; /* We don't keep the dentry for symlinks */
340
341 autofs_hash_insert(dh,ent);
342
343 inode = autofs_iget(dir->i_sb, ent->ino);
344 if (IS_ERR(inode))
345 return PTR_ERR(inode);
346
347 d_instantiate(dentry, inode);
348 unlock_kernel();
349 return 0;
350}
351
352/*
353 * NOTE!
354 *
355 * Normal filesystems would do a "d_delete()" to tell the VFS dcache
356 * that the file no longer exists. However, doing that means that the
357 * VFS layer can turn the dentry into a negative dentry, which we
358 * obviously do not want (we're dropping the entry not because it
359 * doesn't exist, but because it has timed out).
360 *
361 * Also see autofs_root_rmdir()..
362 */
363static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
364{
365 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
366 struct autofs_dirhash *dh = &sbi->dirhash;
367 struct autofs_dir_ent *ent;
368 unsigned int n;
369
370 /* This allows root to remove symlinks */
371 lock_kernel();
372 if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
373 unlock_kernel();
374 return -EACCES;
375 }
376
377 ent = autofs_hash_lookup(dh, &dentry->d_name);
378 if (!ent) {
379 unlock_kernel();
380 return -ENOENT;
381 }
382
383 n = ent->ino - AUTOFS_FIRST_SYMLINK;
384 if (n >= AUTOFS_MAX_SYMLINKS) {
385 unlock_kernel();
386 return -EISDIR; /* It's a directory, dummy */
387 }
388 if (!test_bit(n,sbi->symlink_bitmap)) {
389 unlock_kernel();
390 return -EINVAL; /* Nonexistent symlink? Shouldn't happen */
391 }
392
393 dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL;
394 autofs_hash_delete(ent);
395 clear_bit(n,sbi->symlink_bitmap);
396 kfree(sbi->symlink[n].data);
397 d_drop(dentry);
398
399 unlock_kernel();
400 return 0;
401}
402
403static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
404{
405 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
406 struct autofs_dirhash *dh = &sbi->dirhash;
407 struct autofs_dir_ent *ent;
408
409 lock_kernel();
410 if (!autofs_oz_mode(sbi)) {
411 unlock_kernel();
412 return -EACCES;
413 }
414
415 ent = autofs_hash_lookup(dh, &dentry->d_name);
416 if (!ent) {
417 unlock_kernel();
418 return -ENOENT;
419 }
420
421 if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
422 unlock_kernel();
423 return -ENOTDIR; /* Not a directory */
424 }
425
426 if (ent->dentry != dentry) {
427 printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
428 }
429
430 dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
431 autofs_hash_delete(ent);
432 drop_nlink(dir);
433 d_drop(dentry);
434 unlock_kernel();
435
436 return 0;
437}
438
439static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
440{
441 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
442 struct autofs_dirhash *dh = &sbi->dirhash;
443 struct autofs_dir_ent *ent;
444 struct inode *inode;
445 ino_t ino;
446
447 lock_kernel();
448 if (!autofs_oz_mode(sbi)) {
449 unlock_kernel();
450 return -EACCES;
451 }
452
453 ent = autofs_hash_lookup(dh, &dentry->d_name);
454 if (ent) {
455 unlock_kernel();
456 return -EEXIST;
457 }
458
459 if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
460 printk("autofs: Out of inode numbers -- what the heck did you do??\n");
461 unlock_kernel();
462 return -ENOSPC;
463 }
464 ino = sbi->next_dir_ino++;
465
466 ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
467 if (!ent) {
468 unlock_kernel();
469 return -ENOSPC;
470 }
471
472 ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
473 if (!ent->name) {
474 kfree(ent);
475 unlock_kernel();
476 return -ENOSPC;
477 }
478
479 ent->hash = dentry->d_name.hash;
480 memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
481 ent->ino = ino;
482 ent->dentry = dentry;
483 autofs_hash_insert(dh,ent);
484
485 inc_nlink(dir);
486
487 inode = autofs_iget(dir->i_sb, ino);
488 if (IS_ERR(inode)) {
489 drop_nlink(dir);
490 return PTR_ERR(inode);
491 }
492
493 d_instantiate(dentry, inode);
494 unlock_kernel();
495
496 return 0;
497}
498
/* Get/set timeout ioctl() operation */
#ifdef CONFIG_COMPAT
/*
 * Compat variant: the user value is an unsigned int.  Reads the new
 * timeout (in seconds) from @p, writes the previous timeout back to @p,
 * then stores the new one in jiffies.  A value too large to scale by HZ
 * within UINT_MAX is stored as 0.
 *
 * Returns 0, or -EFAULT if either user access fails.
 */
static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
						unsigned int __user *p)
{
	unsigned long ntimeout;

	if (get_user(ntimeout, p))
		return -EFAULT;
	if (put_user(sbi->exp_timeout / HZ, p))
		return -EFAULT;

	sbi->exp_timeout = (ntimeout > UINT_MAX/HZ) ? 0 : ntimeout * HZ;
	return 0;
}
#endif
518
519static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
520 unsigned long __user *p)
521{
522 unsigned long ntimeout;
523
524 if (get_user(ntimeout, p) ||
525 put_user(sbi->exp_timeout / HZ, p))
526 return -EFAULT;
527
528 if (ntimeout > ULONG_MAX/HZ)
529 sbi->exp_timeout = 0;
530 else
531 sbi->exp_timeout = ntimeout * HZ;
532
533 return 0;
534}
535
536/* Return protocol version */
537static inline int autofs_get_protover(int __user *p)
538{
539 return put_user(AUTOFS_PROTO_VERSION, p);
540}
541
542/* Perform an expiry operation */
543static inline int autofs_expire_run(struct super_block *sb,
544 struct autofs_sb_info *sbi,
545 struct vfsmount *mnt,
546 struct autofs_packet_expire __user *pkt_p)
547{
548 struct autofs_dir_ent *ent;
549 struct autofs_packet_expire pkt;
550
551 memset(&pkt,0,sizeof pkt);
552
553 pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
554 pkt.hdr.type = autofs_ptype_expire;
555
556 if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt)))
557 return -EAGAIN;
558
559 pkt.len = ent->len;
560 memcpy(pkt.name, ent->name, pkt.len);
561 pkt.name[pkt.len] = '\0';
562
563 if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
564 return -EFAULT;
565
566 return 0;
567}
568
569/*
570 * ioctl()'s on the root directory is the chief method for the daemon to
571 * generate kernel reactions
572 */
573static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
574 unsigned int cmd, unsigned long arg)
575{
576 struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
577 void __user *argp = (void __user *)arg;
578
579 DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,task_pgrp_nr(current)));
580
581 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
582 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
583 return -ENOTTY;
584
585 if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
586 return -EPERM;
587
588 switch(cmd) {
589 case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
590 return autofs_wait_release(sbi,(autofs_wqt_t)arg,0);
591 case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
592 return autofs_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
593 case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
594 autofs_catatonic_mode(sbi);
595 return 0;
596 case AUTOFS_IOC_PROTOVER: /* Get protocol version */
597 return autofs_get_protover(argp);
598#ifdef CONFIG_COMPAT
599 case AUTOFS_IOC_SETTIMEOUT32:
600 return autofs_compat_get_set_timeout(sbi, argp);
601#endif
602 case AUTOFS_IOC_SETTIMEOUT:
603 return autofs_get_set_timeout(sbi, argp);
604 case AUTOFS_IOC_EXPIRE:
605 return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt,
606 argp);
607 default:
608 return -ENOSYS;
609 }
610
611}
612
613static long autofs_root_ioctl(struct file *filp,
614 unsigned int cmd, unsigned long arg)
615{
616 int ret;
617
618 lock_kernel();
619 ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
620 filp, cmd, arg);
621 unlock_kernel();
622
623 return ret;
624}
625
#ifdef CONFIG_COMPAT
/*
 * compat_ioctl entry point: runs autofs_do_root_ioctl() under the BKL.
 * AUTOFS_IOC_READY and AUTOFS_IOC_FAIL get @arg passed through as is;
 * for every other command it is converted with compat_ptr() first.
 */
static long autofs_root_compat_ioctl(struct file *filp,
				     unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int ret;

	lock_kernel();
	if (cmd != AUTOFS_IOC_READY && cmd != AUTOFS_IOC_FAIL)
		arg = (unsigned long)compat_ptr(arg);
	ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
	unlock_kernel();

	return ret;
}
#endif
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
deleted file mode 100644
index 7ce9cb2c9ce2..000000000000
--- a/fs/autofs/symlink.c
+++ /dev/null
@@ -1,26 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/symlink.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include "autofs_i.h"
14
15/* Nothing to release.. */
16static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
17{
18 char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
19 nd_set_link(nd, s);
20 return NULL;
21}
22
23const struct inode_operations autofs_symlink_inode_operations = {
24 .readlink = generic_readlink,
25 .follow_link = autofs_follow_link
26};
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
deleted file mode 100644
index be46805972f0..000000000000
--- a/fs/autofs/waitq.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/waitq.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/slab.h>
14#include <linux/time.h>
15#include <linux/signal.h>
16#include <linux/file.h>
17#include "autofs_i.h"
18
19/* We make this a static variable rather than a part of the superblock; it
20 is better if we don't reassign numbers easily even across filesystems */
21static autofs_wqt_t autofs_next_wait_queue = 1;
22
23/* These are the signals we allow interrupting a pending mount */
24#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
25
26void autofs_catatonic_mode(struct autofs_sb_info *sbi)
27{
28 struct autofs_wait_queue *wq, *nwq;
29
30 DPRINTK(("autofs: entering catatonic mode\n"));
31
32 sbi->catatonic = 1;
33 wq = sbi->queues;
34 sbi->queues = NULL; /* Erase all wait queues */
35 while ( wq ) {
36 nwq = wq->next;
37 wq->status = -ENOENT; /* Magic is gone - report failure */
38 kfree(wq->name);
39 wq->name = NULL;
40 wake_up(&wq->queue);
41 wq = nwq;
42 }
43 fput(sbi->pipe); /* Close the pipe */
44 sbi->pipe = NULL;
45 autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */
46}
47
48static int autofs_write(struct file *file, const void *addr, int bytes)
49{
50 unsigned long sigpipe, flags;
51 mm_segment_t fs;
52 const char *data = (const char *)addr;
53 ssize_t wr = 0;
54
55 /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
56
57 sigpipe = sigismember(&current->pending.signal, SIGPIPE);
58
59 /* Save pointer to user space and point back to kernel space */
60 fs = get_fs();
61 set_fs(KERNEL_DS);
62
63 while (bytes &&
64 (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
65 data += wr;
66 bytes -= wr;
67 }
68
69 set_fs(fs);
70
71 /* Keep the currently executing process from receiving a
72 SIGPIPE unless it was already supposed to get one */
73 if (wr == -EPIPE && !sigpipe) {
74 spin_lock_irqsave(&current->sighand->siglock, flags);
75 sigdelset(&current->pending.signal, SIGPIPE);
76 recalc_sigpending();
77 spin_unlock_irqrestore(&current->sighand->siglock, flags);
78 }
79
80 return (bytes > 0);
81}
82
83static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq)
84{
85 struct autofs_packet_missing pkt;
86
87 DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token));
88 autofs_say(wq->name,wq->len);
89
90 memset(&pkt,0,sizeof pkt); /* For security reasons */
91
92 pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
93 pkt.hdr.type = autofs_ptype_missing;
94 pkt.wait_queue_token = wq->wait_queue_token;
95 pkt.len = wq->len;
96 memcpy(pkt.name, wq->name, pkt.len);
97 pkt.name[pkt.len] = '\0';
98
99 if ( autofs_write(sbi->pipe,&pkt,sizeof(struct autofs_packet_missing)) )
100 autofs_catatonic_mode(sbi);
101}
102
103int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name)
104{
105 struct autofs_wait_queue *wq;
106 int status;
107
108 /* In catatonic mode, we don't wait for nobody */
109 if ( sbi->catatonic )
110 return -ENOENT;
111
112 /* We shouldn't be able to get here, but just in case */
113 if ( name->len > NAME_MAX )
114 return -ENOENT;
115
116 for ( wq = sbi->queues ; wq ; wq = wq->next ) {
117 if ( wq->hash == name->hash &&
118 wq->len == name->len &&
119 wq->name && !memcmp(wq->name,name->name,name->len) )
120 break;
121 }
122
123 if ( !wq ) {
124 /* Create a new wait queue */
125 wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
126 if ( !wq )
127 return -ENOMEM;
128
129 wq->name = kmalloc(name->len,GFP_KERNEL);
130 if ( !wq->name ) {
131 kfree(wq);
132 return -ENOMEM;
133 }
134 wq->wait_queue_token = autofs_next_wait_queue++;
135 init_waitqueue_head(&wq->queue);
136 wq->hash = name->hash;
137 wq->len = name->len;
138 wq->status = -EINTR; /* Status return if interrupted */
139 memcpy(wq->name, name->name, name->len);
140 wq->next = sbi->queues;
141 sbi->queues = wq;
142
143 /* autofs_notify_daemon() may block */
144 wq->wait_ctr = 2;
145 autofs_notify_daemon(sbi,wq);
146 } else
147 wq->wait_ctr++;
148
149 /* wq->name is NULL if and only if the lock is already released */
150
151 if ( sbi->catatonic ) {
152 /* We might have slept, so check again for catatonic mode */
153 wq->status = -ENOENT;
154 kfree(wq->name);
155 wq->name = NULL;
156 }
157
158 if ( wq->name ) {
159 /* Block all but "shutdown" signals while waiting */
160 sigset_t sigmask;
161
162 siginitsetinv(&sigmask, SHUTDOWN_SIGS);
163 sigprocmask(SIG_BLOCK, &sigmask, &sigmask);
164
165 interruptible_sleep_on(&wq->queue);
166
167 sigprocmask(SIG_SETMASK, &sigmask, NULL);
168 } else {
169 DPRINTK(("autofs_wait: skipped sleeping\n"));
170 }
171
172 status = wq->status;
173
174 if ( ! --wq->wait_ctr ) /* Are we the last process to need status? */
175 kfree(wq);
176
177 return status;
178}
179
180
181int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
182{
183 struct autofs_wait_queue *wq, **wql;
184
185 for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
186 if ( wq->wait_queue_token == wait_queue_token )
187 break;
188 }
189 if ( !wq )
190 return -EINVAL;
191
192 *wql = wq->next; /* Unlink from chain */
193 kfree(wq->name);
194 wq->name = NULL; /* Do not wait on this queue */
195
196 wq->status = status;
197
198 if ( ! --wq->wait_ctr ) /* Is anyone still waiting for this guy? */
199 kfree(wq);
200 else
201 wake_up(&wq->queue);
202
203 return 0;
204}
205
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ba4a38b9c22f..eff9a419469a 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -724,6 +724,7 @@ static const struct file_operations _dev_ioctl_fops = {
724 .unlocked_ioctl = autofs_dev_ioctl, 724 .unlocked_ioctl = autofs_dev_ioctl,
725 .compat_ioctl = autofs_dev_ioctl_compat, 725 .compat_ioctl = autofs_dev_ioctl_compat,
726 .owner = THIS_MODULE, 726 .owner = THIS_MODULE,
727 .llseek = noop_llseek,
727}; 728};
728 729
729static struct miscdevice _autofs_dev_ioctl_misc = { 730static struct miscdevice _autofs_dev_ioctl_misc = {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 9722e4bd8957..c038727b4050 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,16 +14,16 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include "autofs_i.h" 15#include "autofs_i.h"
16 16
17static int autofs_get_sb(struct file_system_type *fs_type, 17static struct dentry *autofs_mount(struct file_system_type *fs_type,
18 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 18 int flags, const char *dev_name, void *data)
19{ 19{
20 return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt); 20 return mount_nodev(fs_type, flags, data, autofs4_fill_super);
21} 21}
22 22
23static struct file_system_type autofs_fs_type = { 23static struct file_system_type autofs_fs_type = {
24 .owner = THIS_MODULE, 24 .owner = THIS_MODULE,
25 .name = "autofs", 25 .name = "autofs",
26 .get_sb = autofs_get_sb, 26 .mount = autofs_mount,
27 .kill_sb = autofs4_kill_sb, 27 .kill_sb = autofs4_kill_sb,
28}; 28};
29 29
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955dac..ac87e49fa706 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
398 inode->i_gid = sb->s_root->d_inode->i_gid; 398 inode->i_gid = sb->s_root->d_inode->i_gid;
399 } 399 }
400 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 400 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
401 inode->i_ino = get_next_ino();
401 402
402 if (S_ISDIR(inf->mode)) { 403 if (S_ISDIR(inf->mode)) {
403 inode->i_nlink = 2; 404 inode->i_nlink = 2;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cb1bd38dc08c..d5c1401f0031 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -19,7 +19,7 @@
19#include <linux/param.h> 19#include <linux/param.h>
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/compat.h> 21#include <linux/compat.h>
22#include <linux/smp_lock.h> 22#include <linux/mutex.h>
23 23
24#include "autofs_i.h" 24#include "autofs_i.h"
25 25
@@ -28,7 +28,9 @@ static int autofs4_dir_unlink(struct inode *,struct dentry *);
28static int autofs4_dir_rmdir(struct inode *,struct dentry *); 28static int autofs4_dir_rmdir(struct inode *,struct dentry *);
29static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 29static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
30static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); 30static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
31#ifdef CONFIG_COMPAT
31static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); 32static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
33#endif
32static int autofs4_dir_open(struct inode *inode, struct file *file); 34static int autofs4_dir_open(struct inode *inode, struct file *file);
33static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 35static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
34static void *autofs4_follow_link(struct dentry *, struct nameidata *); 36static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -978,15 +980,17 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
978 } 980 }
979} 981}
980 982
983static DEFINE_MUTEX(autofs4_ioctl_mutex);
984
981static long autofs4_root_ioctl(struct file *filp, 985static long autofs4_root_ioctl(struct file *filp,
982 unsigned int cmd, unsigned long arg) 986 unsigned int cmd, unsigned long arg)
983{ 987{
984 long ret; 988 long ret;
985 struct inode *inode = filp->f_dentry->d_inode; 989 struct inode *inode = filp->f_dentry->d_inode;
986 990
987 lock_kernel(); 991 mutex_lock(&autofs4_ioctl_mutex);
988 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 992 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
989 unlock_kernel(); 993 mutex_unlock(&autofs4_ioctl_mutex);
990 994
991 return ret; 995 return ret;
992} 996}
@@ -998,13 +1002,13 @@ static long autofs4_root_compat_ioctl(struct file *filp,
998 struct inode *inode = filp->f_path.dentry->d_inode; 1002 struct inode *inode = filp->f_path.dentry->d_inode;
999 int ret; 1003 int ret;
1000 1004
1001 lock_kernel(); 1005 mutex_lock(&autofs4_ioctl_mutex);
1002 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) 1006 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
1003 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 1007 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
1004 else 1008 else
1005 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, 1009 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
1006 (unsigned long)compat_ptr(arg)); 1010 (unsigned long)compat_ptr(arg));
1007 unlock_kernel(); 1011 mutex_unlock(&autofs4_ioctl_mutex);
1008 1012
1009 return ret; 1013 return ret;
1010} 1014}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index dc39d2824885..aa4e7c7ae3c6 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -913,18 +913,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
913 return 0; 913 return 0;
914} 914}
915 915
916static int 916static struct dentry *
917befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, 917befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
918 void *data, struct vfsmount *mnt) 918 void *data)
919{ 919{
920 return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super, 920 return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
921 mnt);
922} 921}
923 922
924static struct file_system_type befs_fs_type = { 923static struct file_system_type befs_fs_type = {
925 .owner = THIS_MODULE, 924 .owner = THIS_MODULE,
926 .name = "befs", 925 .name = "befs",
927 .get_sb = befs_get_sb, 926 .mount = befs_mount,
928 .kill_sb = kill_block_super, 927 .kill_sb = kill_block_super,
929 .fs_flags = FS_REQUIRES_DEV, 928 .fs_flags = FS_REQUIRES_DEV,
930}; 929};
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b779..685ecff3ab31 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
176 inc_nlink(inode); 176 inc_nlink(inode);
177 inode->i_ctime = CURRENT_TIME_SEC; 177 inode->i_ctime = CURRENT_TIME_SEC;
178 mark_inode_dirty(inode); 178 mark_inode_dirty(inode);
179 atomic_inc(&inode->i_count); 179 ihold(inode);
180 d_instantiate(new, inode); 180 d_instantiate(new, inode);
181 mutex_unlock(&info->bfs_lock); 181 mutex_unlock(&info->bfs_lock);
182 return 0; 182 return 0;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index c4daf0f5fc02..76db6d7d49bb 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -12,7 +12,6 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include <linux/vfs.h> 16#include <linux/vfs.h>
18#include <linux/writeback.h> 17#include <linux/writeback.h>
@@ -215,14 +214,10 @@ static void bfs_put_super(struct super_block *s)
215 if (!info) 214 if (!info)
216 return; 215 return;
217 216
218 lock_kernel();
219
220 mutex_destroy(&info->bfs_lock); 217 mutex_destroy(&info->bfs_lock);
221 kfree(info->si_imap); 218 kfree(info->si_imap);
222 kfree(info); 219 kfree(info);
223 s->s_fs_info = NULL; 220 s->s_fs_info = NULL;
224
225 unlock_kernel();
226} 221}
227 222
228static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) 223static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -455,16 +450,16 @@ out:
455 return ret; 450 return ret;
456} 451}
457 452
458static int bfs_get_sb(struct file_system_type *fs_type, 453static struct dentry *bfs_mount(struct file_system_type *fs_type,
459 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 454 int flags, const char *dev_name, void *data)
460{ 455{
461 return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt); 456 return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
462} 457}
463 458
464static struct file_system_type bfs_fs_type = { 459static struct file_system_type bfs_fs_type = {
465 .owner = THIS_MODULE, 460 .owner = THIS_MODULE,
466 .name = "bfs", 461 .name = "bfs",
467 .get_sb = bfs_get_sb, 462 .mount = bfs_mount,
468 .kill_sb = kill_block_super, 463 .kill_sb = kill_block_super,
469 .fs_flags = FS_REQUIRES_DEV, 464 .fs_flags = FS_REQUIRES_DEV,
470}; 465};
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd0cc0bf9a40..1befe2ec8186 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
495 struct inode * inode = new_inode(sb); 495 struct inode * inode = new_inode(sb);
496 496
497 if (inode) { 497 if (inode) {
498 inode->i_ino = get_next_ino();
498 inode->i_mode = mode; 499 inode->i_mode = mode;
499 inode->i_atime = inode->i_mtime = inode->i_ctime = 500 inode->i_atime = inode->i_mtime = inode->i_ctime =
500 current_fs_time(inode->i_sb); 501 current_fs_time(inode->i_sb);
@@ -576,6 +577,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
576static const struct file_operations bm_entry_operations = { 577static const struct file_operations bm_entry_operations = {
577 .read = bm_entry_read, 578 .read = bm_entry_read,
578 .write = bm_entry_write, 579 .write = bm_entry_write,
580 .llseek = default_llseek,
579}; 581};
580 582
581/* /register */ 583/* /register */
@@ -643,6 +645,7 @@ out:
643 645
644static const struct file_operations bm_register_operations = { 646static const struct file_operations bm_register_operations = {
645 .write = bm_register_write, 647 .write = bm_register_write,
648 .llseek = noop_llseek,
646}; 649};
647 650
648/* /status */ 651/* /status */
@@ -680,6 +683,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
680static const struct file_operations bm_status_operations = { 683static const struct file_operations bm_status_operations = {
681 .read = bm_status_read, 684 .read = bm_status_read,
682 .write = bm_status_write, 685 .write = bm_status_write,
686 .llseek = default_llseek,
683}; 687};
684 688
685/* Superblock handling */ 689/* Superblock handling */
@@ -702,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
702 return err; 706 return err;
703} 707}
704 708
705static int bm_get_sb(struct file_system_type *fs_type, 709static struct dentry *bm_mount(struct file_system_type *fs_type,
706 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 710 int flags, const char *dev_name, void *data)
707{ 711{
708 return get_sb_single(fs_type, flags, data, bm_fill_super, mnt); 712 return mount_single(fs_type, flags, data, bm_fill_super);
709} 713}
710 714
711static struct linux_binfmt misc_format = { 715static struct linux_binfmt misc_format = {
@@ -716,7 +720,7 @@ static struct linux_binfmt misc_format = {
716static struct file_system_type bm_fs_type = { 720static struct file_system_type bm_fs_type = {
717 .owner = THIS_MODULE, 721 .owner = THIS_MODULE,
718 .name = "binfmt_misc", 722 .name = "binfmt_misc",
719 .get_sb = bm_get_sb, 723 .mount = bm_mount,
720 .kill_sb = kill_litter_super, 724 .kill_sb = kill_litter_super,
721}; 725};
722 726
diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7c..4bd454fa844e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
370{ 370{
371 struct bio *bio; 371 struct bio *bio;
372 372
373 if (nr_iovecs > UIO_MAXIOV)
374 return NULL;
375
373 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), 376 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
374 gfp_mask); 377 gfp_mask);
375 if (unlikely(!bio)) 378 if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
697static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, 700static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
698 gfp_t gfp_mask) 701 gfp_t gfp_mask)
699{ 702{
700 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); 703 struct bio_map_data *bmd;
701 704
705 if (iov_count > UIO_MAXIOV)
706 return NULL;
707
708 bmd = kmalloc(sizeof(*bmd), gfp_mask);
702 if (!bmd) 709 if (!bmd)
703 return NULL; 710 return NULL;
704 711
@@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
827 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 834 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
828 start = uaddr >> PAGE_SHIFT; 835 start = uaddr >> PAGE_SHIFT;
829 836
837 /*
838 * Overflow, abort
839 */
840 if (end < start)
841 return ERR_PTR(-EINVAL);
842
830 nr_pages += end - start; 843 nr_pages += end - start;
831 len += iov[i].iov_len; 844 len += iov[i].iov_len;
832 } 845 }
@@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
955 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 968 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
956 unsigned long start = uaddr >> PAGE_SHIFT; 969 unsigned long start = uaddr >> PAGE_SHIFT;
957 970
971 /*
972 * Overflow, abort
973 */
974 if (end < start)
975 return ERR_PTR(-EINVAL);
976
958 nr_pages += end - start; 977 nr_pages += end - start;
959 /* 978 /*
960 * buffer must be aligned to at least hardsector size for now 979 * buffer must be aligned to at least hardsector size for now
@@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
982 unsigned long start = uaddr >> PAGE_SHIFT; 1001 unsigned long start = uaddr >> PAGE_SHIFT;
983 const int local_nr_pages = end - start; 1002 const int local_nr_pages = end - start;
984 const int page_limit = cur_page + local_nr_pages; 1003 const int page_limit = cur_page + local_nr_pages;
985 1004
986 ret = get_user_pages_fast(uaddr, local_nr_pages, 1005 ret = get_user_pages_fast(uaddr, local_nr_pages,
987 write_to_vm, &pages[cur_page]); 1006 write_to_vm, &pages[cur_page]);
988 if (ret < local_nr_pages) { 1007 if (ret < local_nr_pages) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 50e8c8582faa..06e8ff12b97c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
48 48
49EXPORT_SYMBOL(I_BDEV); 49EXPORT_SYMBOL(I_BDEV);
50 50
51/*
52 * move the inode from it's current bdi to the a new bdi. if the inode is dirty
53 * we need to move it onto the dirty list of @dst so that the inode is always
54 * on the right list.
55 */
56static void bdev_inode_switch_bdi(struct inode *inode,
57 struct backing_dev_info *dst)
58{
59 spin_lock(&inode_lock);
60 inode->i_data.backing_dev_info = dst;
61 if (inode->i_state & I_DIRTY)
62 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
63 spin_unlock(&inode_lock);
64}
65
51static sector_t max_block(struct block_device *bdev) 66static sector_t max_block(struct block_device *bdev)
52{ 67{
53 sector_t retval = ~((sector_t)0); 68 sector_t retval = ~((sector_t)0);
@@ -370,7 +385,7 @@ int blkdev_fsync(struct file *filp, int datasync)
370 */ 385 */
371 mutex_unlock(&bd_inode->i_mutex); 386 mutex_unlock(&bd_inode->i_mutex);
372 387
373 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); 388 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
374 if (error == -EOPNOTSUPP) 389 if (error == -EOPNOTSUPP)
375 error = 0; 390 error = 0;
376 391
@@ -449,15 +464,15 @@ static const struct super_operations bdev_sops = {
449 .evict_inode = bdev_evict_inode, 464 .evict_inode = bdev_evict_inode,
450}; 465};
451 466
452static int bd_get_sb(struct file_system_type *fs_type, 467static struct dentry *bd_mount(struct file_system_type *fs_type,
453 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 468 int flags, const char *dev_name, void *data)
454{ 469{
455 return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); 470 return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
456} 471}
457 472
458static struct file_system_type bd_type = { 473static struct file_system_type bd_type = {
459 .name = "bdev", 474 .name = "bdev",
460 .get_sb = bd_get_sb, 475 .mount = bd_mount,
461 .kill_sb = kill_anon_super, 476 .kill_sb = kill_anon_super,
462}; 477};
463 478
@@ -550,7 +565,7 @@ EXPORT_SYMBOL(bdget);
550 */ 565 */
551struct block_device *bdgrab(struct block_device *bdev) 566struct block_device *bdgrab(struct block_device *bdev)
552{ 567{
553 atomic_inc(&bdev->bd_inode->i_count); 568 ihold(bdev->bd_inode);
554 return bdev; 569 return bdev;
555} 570}
556 571
@@ -580,7 +595,7 @@ static struct block_device *bd_acquire(struct inode *inode)
580 spin_lock(&bdev_lock); 595 spin_lock(&bdev_lock);
581 bdev = inode->i_bdev; 596 bdev = inode->i_bdev;
582 if (bdev) { 597 if (bdev) {
583 atomic_inc(&bdev->bd_inode->i_count); 598 ihold(bdev->bd_inode);
584 spin_unlock(&bdev_lock); 599 spin_unlock(&bdev_lock);
585 return bdev; 600 return bdev;
586 } 601 }
@@ -591,12 +606,12 @@ static struct block_device *bd_acquire(struct inode *inode)
591 spin_lock(&bdev_lock); 606 spin_lock(&bdev_lock);
592 if (!inode->i_bdev) { 607 if (!inode->i_bdev) {
593 /* 608 /*
594 * We take an additional bd_inode->i_count for inode, 609 * We take an additional reference to bd_inode,
595 * and it's released in clear_inode() of inode. 610 * and it's released in clear_inode() of inode.
596 * So, we can access it via ->i_mapping always 611 * So, we can access it via ->i_mapping always
597 * without igrab(). 612 * without igrab().
598 */ 613 */
599 atomic_inc(&bdev->bd_inode->i_count); 614 ihold(bdev->bd_inode);
600 inode->i_bdev = bdev; 615 inode->i_bdev = bdev;
601 inode->i_mapping = bdev->bd_inode->i_mapping; 616 inode->i_mapping = bdev->bd_inode->i_mapping;
602 list_add(&inode->i_devices, &bdev->bd_inodes); 617 list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -1390,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1390 bdi = blk_get_backing_dev_info(bdev); 1405 bdi = blk_get_backing_dev_info(bdev);
1391 if (bdi == NULL) 1406 if (bdi == NULL)
1392 bdi = &default_backing_dev_info; 1407 bdi = &default_backing_dev_info;
1393 bdev->bd_inode->i_data.backing_dev_info = bdi; 1408 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1394 } 1409 }
1395 if (bdev->bd_invalidated) 1410 if (bdev->bd_invalidated)
1396 rescan_partitions(disk, bdev); 1411 rescan_partitions(disk, bdev);
@@ -1405,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1405 if (ret) 1420 if (ret)
1406 goto out_clear; 1421 goto out_clear;
1407 bdev->bd_contains = whole; 1422 bdev->bd_contains = whole;
1408 bdev->bd_inode->i_data.backing_dev_info = 1423 bdev_inode_switch_bdi(bdev->bd_inode,
1409 whole->bd_inode->i_data.backing_dev_info; 1424 whole->bd_inode->i_data.backing_dev_info);
1410 bdev->bd_part = disk_get_part(disk, partno); 1425 bdev->bd_part = disk_get_part(disk, partno);
1411 if (!(disk->flags & GENHD_FL_UP) || 1426 if (!(disk->flags & GENHD_FL_UP) ||
1412 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1427 !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1439,7 +1454,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1439 disk_put_part(bdev->bd_part); 1454 disk_put_part(bdev->bd_part);
1440 bdev->bd_disk = NULL; 1455 bdev->bd_disk = NULL;
1441 bdev->bd_part = NULL; 1456 bdev->bd_part = NULL;
1442 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1457 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1443 if (bdev != bdev->bd_contains) 1458 if (bdev != bdev->bd_contains)
1444 __blkdev_put(bdev->bd_contains, mode, 1); 1459 __blkdev_put(bdev->bd_contains, mode, 1);
1445 bdev->bd_contains = NULL; 1460 bdev->bd_contains = NULL;
@@ -1533,7 +1548,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1533 disk_put_part(bdev->bd_part); 1548 disk_put_part(bdev->bd_part);
1534 bdev->bd_part = NULL; 1549 bdev->bd_part = NULL;
1535 bdev->bd_disk = NULL; 1550 bdev->bd_disk = NULL;
1536 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1551 bdev_inode_switch_bdi(bdev->bd_inode,
1552 &default_backing_dev_info);
1537 if (bdev != bdev->bd_contains) 1553 if (bdev != bdev->bd_contains)
1538 victim = bdev->bd_contains; 1554 victim = bdev->bd_contains;
1539 bdev->bd_contains = NULL; 1555 bdev->bd_contains = NULL;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 396039b3a8a2..7845d1f7d1d9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -163,7 +163,6 @@ fail:
163 */ 163 */
164static void end_compressed_bio_read(struct bio *bio, int err) 164static void end_compressed_bio_read(struct bio *bio, int err)
165{ 165{
166 struct extent_io_tree *tree;
167 struct compressed_bio *cb = bio->bi_private; 166 struct compressed_bio *cb = bio->bi_private;
168 struct inode *inode; 167 struct inode *inode;
169 struct page *page; 168 struct page *page;
@@ -187,7 +186,6 @@ static void end_compressed_bio_read(struct bio *bio, int err)
187 /* ok, we're the last bio for this extent, lets start 186 /* ok, we're the last bio for this extent, lets start
188 * the decompression. 187 * the decompression.
189 */ 188 */
190 tree = &BTRFS_I(inode)->io_tree;
191 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, 189 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
192 cb->start, 190 cb->start,
193 cb->orig_bio->bi_io_vec, 191 cb->orig_bio->bi_io_vec,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3df14ce2cc2..9ac171599258 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -200,7 +200,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
200 struct extent_buffer **cow_ret, u64 new_root_objectid) 200 struct extent_buffer **cow_ret, u64 new_root_objectid)
201{ 201{
202 struct extent_buffer *cow; 202 struct extent_buffer *cow;
203 u32 nritems;
204 int ret = 0; 203 int ret = 0;
205 int level; 204 int level;
206 struct btrfs_disk_key disk_key; 205 struct btrfs_disk_key disk_key;
@@ -210,7 +209,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
210 WARN_ON(root->ref_cows && trans->transid != root->last_trans); 209 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
211 210
212 level = btrfs_header_level(buf); 211 level = btrfs_header_level(buf);
213 nritems = btrfs_header_nritems(buf);
214 if (level == 0) 212 if (level == 0)
215 btrfs_item_key(buf, &disk_key, 0); 213 btrfs_item_key(buf, &disk_key, 0);
216 else 214 else
@@ -1008,7 +1006,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1008 int wret; 1006 int wret;
1009 int pslot; 1007 int pslot;
1010 int orig_slot = path->slots[level]; 1008 int orig_slot = path->slots[level];
1011 int err_on_enospc = 0;
1012 u64 orig_ptr; 1009 u64 orig_ptr;
1013 1010
1014 if (level == 0) 1011 if (level == 0)
@@ -1071,8 +1068,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1071 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1068 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1072 return 0; 1069 return 0;
1073 1070
1074 if (btrfs_header_nritems(mid) < 2) 1071 btrfs_header_nritems(mid);
1075 err_on_enospc = 1;
1076 1072
1077 left = read_node_slot(root, parent, pslot - 1); 1073 left = read_node_slot(root, parent, pslot - 1);
1078 if (left) { 1074 if (left) {
@@ -1103,8 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1103 wret = push_node_left(trans, root, left, mid, 1); 1099 wret = push_node_left(trans, root, left, mid, 1);
1104 if (wret < 0) 1100 if (wret < 0)
1105 ret = wret; 1101 ret = wret;
1106 if (btrfs_header_nritems(mid) < 2) 1102 btrfs_header_nritems(mid);
1107 err_on_enospc = 1;
1108 } 1103 }
1109 1104
1110 /* 1105 /*
@@ -1224,14 +1219,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1224 int wret; 1219 int wret;
1225 int pslot; 1220 int pslot;
1226 int orig_slot = path->slots[level]; 1221 int orig_slot = path->slots[level];
1227 u64 orig_ptr;
1228 1222
1229 if (level == 0) 1223 if (level == 0)
1230 return 1; 1224 return 1;
1231 1225
1232 mid = path->nodes[level]; 1226 mid = path->nodes[level];
1233 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1227 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1234 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1235 1228
1236 if (level < BTRFS_MAX_LEVEL - 1) 1229 if (level < BTRFS_MAX_LEVEL - 1)
1237 parent = path->nodes[level + 1]; 1230 parent = path->nodes[level + 1];
@@ -1577,13 +1570,33 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1577 blocksize = btrfs_level_size(root, level - 1); 1570 blocksize = btrfs_level_size(root, level - 1);
1578 1571
1579 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 1572 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1580 if (tmp && btrfs_buffer_uptodate(tmp, gen)) { 1573 if (tmp) {
1581 /* 1574 if (btrfs_buffer_uptodate(tmp, 0)) {
1582 * we found an up to date block without sleeping, return 1575 if (btrfs_buffer_uptodate(tmp, gen)) {
1583 * right away 1576 /*
1584 */ 1577 * we found an up to date block without
1585 *eb_ret = tmp; 1578 * sleeping, return
1586 return 0; 1579 * right away
1580 */
1581 *eb_ret = tmp;
1582 return 0;
1583 }
1584 /* the pages were up to date, but we failed
1585 * the generation number check. Do a full
1586 * read for the generation number that is correct.
1587 * We must do this without dropping locks so
1588 * we can trust our generation number
1589 */
1590 free_extent_buffer(tmp);
1591 tmp = read_tree_block(root, blocknr, blocksize, gen);
1592 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1593 *eb_ret = tmp;
1594 return 0;
1595 }
1596 free_extent_buffer(tmp);
1597 btrfs_release_path(NULL, p);
1598 return -EIO;
1599 }
1587 } 1600 }
1588 1601
1589 /* 1602 /*
@@ -1596,8 +1609,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1596 btrfs_unlock_up_safe(p, level + 1); 1609 btrfs_unlock_up_safe(p, level + 1);
1597 btrfs_set_path_blocking(p); 1610 btrfs_set_path_blocking(p);
1598 1611
1599 if (tmp) 1612 free_extent_buffer(tmp);
1600 free_extent_buffer(tmp);
1601 if (p->reada) 1613 if (p->reada)
1602 reada_for_search(root, p, level, slot, key->objectid); 1614 reada_for_search(root, p, level, slot, key->objectid);
1603 1615
@@ -2548,7 +2560,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2548{ 2560{
2549 struct btrfs_disk_key disk_key; 2561 struct btrfs_disk_key disk_key;
2550 struct extent_buffer *right = path->nodes[0]; 2562 struct extent_buffer *right = path->nodes[0];
2551 int slot;
2552 int i; 2563 int i;
2553 int push_space = 0; 2564 int push_space = 0;
2554 int push_items = 0; 2565 int push_items = 0;
@@ -2560,8 +2571,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2560 u32 this_item_size; 2571 u32 this_item_size;
2561 u32 old_left_item_size; 2572 u32 old_left_item_size;
2562 2573
2563 slot = path->slots[1];
2564
2565 if (empty) 2574 if (empty)
2566 nr = min(right_nritems, max_slot); 2575 nr = min(right_nritems, max_slot);
2567 else 2576 else
@@ -3330,7 +3339,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3330{ 3339{
3331 int ret = 0; 3340 int ret = 0;
3332 int slot; 3341 int slot;
3333 int slot_orig;
3334 struct extent_buffer *leaf; 3342 struct extent_buffer *leaf;
3335 struct btrfs_item *item; 3343 struct btrfs_item *item;
3336 u32 nritems; 3344 u32 nritems;
@@ -3340,7 +3348,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3340 unsigned int size_diff; 3348 unsigned int size_diff;
3341 int i; 3349 int i;
3342 3350
3343 slot_orig = path->slots[0];
3344 leaf = path->nodes[0]; 3351 leaf = path->nodes[0];
3345 slot = path->slots[0]; 3352 slot = path->slots[0];
3346 3353
@@ -3445,7 +3452,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
3445{ 3452{
3446 int ret = 0; 3453 int ret = 0;
3447 int slot; 3454 int slot;
3448 int slot_orig;
3449 struct extent_buffer *leaf; 3455 struct extent_buffer *leaf;
3450 struct btrfs_item *item; 3456 struct btrfs_item *item;
3451 u32 nritems; 3457 u32 nritems;
@@ -3454,7 +3460,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
3454 unsigned int old_size; 3460 unsigned int old_size;
3455 int i; 3461 int i;
3456 3462
3457 slot_orig = path->slots[0];
3458 leaf = path->nodes[0]; 3463 leaf = path->nodes[0];
3459 3464
3460 nritems = btrfs_header_nritems(leaf); 3465 nritems = btrfs_header_nritems(leaf);
@@ -3787,7 +3792,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3787 struct btrfs_key *cpu_key, u32 *data_size, 3792 struct btrfs_key *cpu_key, u32 *data_size,
3788 int nr) 3793 int nr)
3789{ 3794{
3790 struct extent_buffer *leaf;
3791 int ret = 0; 3795 int ret = 0;
3792 int slot; 3796 int slot;
3793 int i; 3797 int i;
@@ -3804,7 +3808,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3804 if (ret < 0) 3808 if (ret < 0)
3805 goto out; 3809 goto out;
3806 3810
3807 leaf = path->nodes[0];
3808 slot = path->slots[0]; 3811 slot = path->slots[0];
3809 BUG_ON(slot < 0); 3812 BUG_ON(slot < 0);
3810 3813
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eaf286abad17..8db9234f6b41 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -99,6 +99,9 @@ struct btrfs_ordered_sum;
99 */ 99 */
100#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL 100#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
101 101
102/* For storing free space cache */
103#define BTRFS_FREE_SPACE_OBJECTID -11ULL
104
102/* dummy objectid represents multiple objectids */ 105/* dummy objectid represents multiple objectids */
103#define BTRFS_MULTIPLE_OBJECTIDS -255ULL 106#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
104 107
@@ -265,6 +268,22 @@ struct btrfs_chunk {
265 /* additional stripes go here */ 268 /* additional stripes go here */
266} __attribute__ ((__packed__)); 269} __attribute__ ((__packed__));
267 270
271#define BTRFS_FREE_SPACE_EXTENT 1
272#define BTRFS_FREE_SPACE_BITMAP 2
273
274struct btrfs_free_space_entry {
275 __le64 offset;
276 __le64 bytes;
277 u8 type;
278} __attribute__ ((__packed__));
279
280struct btrfs_free_space_header {
281 struct btrfs_disk_key location;
282 __le64 generation;
283 __le64 num_entries;
284 __le64 num_bitmaps;
285} __attribute__ ((__packed__));
286
268static inline unsigned long btrfs_chunk_item_size(int num_stripes) 287static inline unsigned long btrfs_chunk_item_size(int num_stripes)
269{ 288{
270 BUG_ON(num_stripes == 0); 289 BUG_ON(num_stripes == 0);
@@ -365,8 +384,10 @@ struct btrfs_super_block {
365 384
366 char label[BTRFS_LABEL_SIZE]; 385 char label[BTRFS_LABEL_SIZE];
367 386
387 __le64 cache_generation;
388
368 /* future expansion */ 389 /* future expansion */
369 __le64 reserved[32]; 390 __le64 reserved[31];
370 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 391 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
371} __attribute__ ((__packed__)); 392} __attribute__ ((__packed__));
372 393
@@ -375,13 +396,15 @@ struct btrfs_super_block {
375 * ones specified below then we will fail to mount 396 * ones specified below then we will fail to mount
376 */ 397 */
377#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) 398#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
378#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0) 399#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
400#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
379 401
380#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 402#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
381#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 403#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
382#define BTRFS_FEATURE_INCOMPAT_SUPP \ 404#define BTRFS_FEATURE_INCOMPAT_SUPP \
383 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 405 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
384 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL) 406 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
407 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
385 408
386/* 409/*
387 * A leaf is full of items. offset and size tell us where to find 410 * A leaf is full of items. offset and size tell us where to find
@@ -675,7 +698,8 @@ struct btrfs_block_group_item {
675struct btrfs_space_info { 698struct btrfs_space_info {
676 u64 flags; 699 u64 flags;
677 700
678 u64 total_bytes; /* total bytes in the space */ 701 u64 total_bytes; /* total bytes in the space,
702 this doesn't take mirrors into account */
679 u64 bytes_used; /* total bytes used, 703 u64 bytes_used; /* total bytes used,
680 this does't take mirrors into account */ 704 this does't take mirrors into account */
681 u64 bytes_pinned; /* total bytes pinned, will be freed when the 705 u64 bytes_pinned; /* total bytes pinned, will be freed when the
@@ -687,6 +711,8 @@ struct btrfs_space_info {
687 u64 bytes_may_use; /* number of bytes that may be used for 711 u64 bytes_may_use; /* number of bytes that may be used for
688 delalloc/allocations */ 712 delalloc/allocations */
689 u64 disk_used; /* total bytes used on disk */ 713 u64 disk_used; /* total bytes used on disk */
714 u64 disk_total; /* total bytes on disk, takes mirrors into
715 account */
690 716
691 int full; /* indicates that we cannot allocate any more 717 int full; /* indicates that we cannot allocate any more
692 chunks for this space */ 718 chunks for this space */
@@ -750,6 +776,14 @@ enum btrfs_caching_type {
750 BTRFS_CACHE_FINISHED = 2, 776 BTRFS_CACHE_FINISHED = 2,
751}; 777};
752 778
779enum btrfs_disk_cache_state {
780 BTRFS_DC_WRITTEN = 0,
781 BTRFS_DC_ERROR = 1,
782 BTRFS_DC_CLEAR = 2,
783 BTRFS_DC_SETUP = 3,
784 BTRFS_DC_NEED_WRITE = 4,
785};
786
753struct btrfs_caching_control { 787struct btrfs_caching_control {
754 struct list_head list; 788 struct list_head list;
755 struct mutex mutex; 789 struct mutex mutex;
@@ -763,6 +797,7 @@ struct btrfs_block_group_cache {
763 struct btrfs_key key; 797 struct btrfs_key key;
764 struct btrfs_block_group_item item; 798 struct btrfs_block_group_item item;
765 struct btrfs_fs_info *fs_info; 799 struct btrfs_fs_info *fs_info;
800 struct inode *inode;
766 spinlock_t lock; 801 spinlock_t lock;
767 u64 pinned; 802 u64 pinned;
768 u64 reserved; 803 u64 reserved;
@@ -773,8 +808,11 @@ struct btrfs_block_group_cache {
773 int extents_thresh; 808 int extents_thresh;
774 int free_extents; 809 int free_extents;
775 int total_bitmaps; 810 int total_bitmaps;
776 int ro; 811 int ro:1;
777 int dirty; 812 int dirty:1;
813 int iref:1;
814
815 int disk_cache_state;
778 816
779 /* cache tracking stuff */ 817 /* cache tracking stuff */
780 int cached; 818 int cached;
@@ -863,6 +901,7 @@ struct btrfs_fs_info {
863 struct btrfs_transaction *running_transaction; 901 struct btrfs_transaction *running_transaction;
864 wait_queue_head_t transaction_throttle; 902 wait_queue_head_t transaction_throttle;
865 wait_queue_head_t transaction_wait; 903 wait_queue_head_t transaction_wait;
904 wait_queue_head_t transaction_blocked_wait;
866 wait_queue_head_t async_submit_wait; 905 wait_queue_head_t async_submit_wait;
867 906
868 struct btrfs_super_block super_copy; 907 struct btrfs_super_block super_copy;
@@ -949,6 +988,7 @@ struct btrfs_fs_info {
949 struct btrfs_workers endio_meta_workers; 988 struct btrfs_workers endio_meta_workers;
950 struct btrfs_workers endio_meta_write_workers; 989 struct btrfs_workers endio_meta_write_workers;
951 struct btrfs_workers endio_write_workers; 990 struct btrfs_workers endio_write_workers;
991 struct btrfs_workers endio_freespace_worker;
952 struct btrfs_workers submit_workers; 992 struct btrfs_workers submit_workers;
953 /* 993 /*
954 * fixup workers take dirty pages that didn't properly go through 994 * fixup workers take dirty pages that didn't properly go through
@@ -1192,6 +1232,9 @@ struct btrfs_root {
1192#define BTRFS_MOUNT_NOSSD (1 << 9) 1232#define BTRFS_MOUNT_NOSSD (1 << 9)
1193#define BTRFS_MOUNT_DISCARD (1 << 10) 1233#define BTRFS_MOUNT_DISCARD (1 << 10)
1194#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) 1234#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
1235#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
1236#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
1237#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1195 1238
1196#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1239#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1197#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1240#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1665,6 +1708,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1665 write_eb_member(eb, item, struct btrfs_dir_item, location, key); 1708 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1666} 1709}
1667 1710
1711BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
1712 num_entries, 64);
1713BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
1714 num_bitmaps, 64);
1715BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
1716 generation, 64);
1717
1718static inline void btrfs_free_space_key(struct extent_buffer *eb,
1719 struct btrfs_free_space_header *h,
1720 struct btrfs_disk_key *key)
1721{
1722 read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
1723}
1724
1725static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
1726 struct btrfs_free_space_header *h,
1727 struct btrfs_disk_key *key)
1728{
1729 write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
1730}
1731
1668/* struct btrfs_disk_key */ 1732/* struct btrfs_disk_key */
1669BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, 1733BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1670 objectid, 64); 1734 objectid, 64);
@@ -1876,6 +1940,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1876 incompat_flags, 64); 1940 incompat_flags, 64);
1877BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, 1941BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1878 csum_type, 16); 1942 csum_type, 16);
1943BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
1944 cache_generation, 64);
1879 1945
1880static inline int btrfs_super_csum_size(struct btrfs_super_block *s) 1946static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1881{ 1947{
@@ -1988,6 +2054,12 @@ static inline struct dentry *fdentry(struct file *file)
1988 return file->f_path.dentry; 2054 return file->f_path.dentry;
1989} 2055}
1990 2056
2057static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2058{
2059 return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2060 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2061}
2062
1991/* extent-tree.c */ 2063/* extent-tree.c */
1992void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2064void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2065int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2079,7 +2151,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 2151void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 2152int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2081 struct btrfs_root *root, 2153 struct btrfs_root *root,
2082 int num_items, int *retries); 2154 int num_items);
2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 2155void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2084 struct btrfs_root *root); 2156 struct btrfs_root *root);
2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 2157int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2100,7 +2172,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 2172int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root, 2173 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv, 2174 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries); 2175 u64 num_bytes);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2176int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root, 2177 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv, 2178 struct btrfs_block_rsv *block_rsv,
@@ -2115,6 +2187,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache); 2187 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root, 2188int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache); 2189 struct btrfs_block_group_cache *cache);
2190void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
2118/* ctree.c */ 2191/* ctree.c */
2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2192int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2120 int level, int *slot); 2193 int level, int *slot);
@@ -2373,7 +2446,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2373 u32 min_type); 2446 u32 min_type);
2374 2447
2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2448int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput); 2449int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
2450 int sync);
2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2451int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2378 struct extent_state **cached_state); 2452 struct extent_state **cached_state);
2379int btrfs_writepages(struct address_space *mapping, 2453int btrfs_writepages(struct address_space *mapping,
@@ -2426,6 +2500,10 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode, 2500int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size, 2501 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint); 2502 loff_t actual_len, u64 *alloc_hint);
2503int btrfs_prealloc_file_range_trans(struct inode *inode,
2504 struct btrfs_trans_handle *trans, int mode,
2505 u64 start, u64 num_bytes, u64 min_size,
2506 loff_t actual_len, u64 *alloc_hint);
2429extern const struct dentry_operations btrfs_dentry_operations; 2507extern const struct dentry_operations btrfs_dentry_operations;
2430 2508
2431/* ioctl.c */ 2509/* ioctl.c */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e9103b3baa49..f0cad5ae5be7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -427,5 +427,5 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
427 ret = btrfs_truncate_item(trans, root, path, 427 ret = btrfs_truncate_item(trans, root, path,
428 item_len - sub_item_len, 1); 428 item_len - sub_item_len, 1);
429 } 429 }
430 return 0; 430 return ret;
431} 431}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f048..fb827d0d7181 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -338,7 +338,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
338 struct extent_io_tree *tree; 338 struct extent_io_tree *tree;
339 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 339 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
340 u64 found_start; 340 u64 found_start;
341 int found_level;
342 unsigned long len; 341 unsigned long len;
343 struct extent_buffer *eb; 342 struct extent_buffer *eb;
344 int ret; 343 int ret;
@@ -369,8 +368,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
369 WARN_ON(1); 368 WARN_ON(1);
370 goto err; 369 goto err;
371 } 370 }
372 found_level = btrfs_header_level(eb);
373
374 csum_tree_block(root, eb, 0); 371 csum_tree_block(root, eb, 0);
375err: 372err:
376 free_extent_buffer(eb); 373 free_extent_buffer(eb);
@@ -481,9 +478,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
481 end_io_wq->work.flags = 0; 478 end_io_wq->work.flags = 0;
482 479
483 if (bio->bi_rw & REQ_WRITE) { 480 if (bio->bi_rw & REQ_WRITE) {
484 if (end_io_wq->metadata) 481 if (end_io_wq->metadata == 1)
485 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 482 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
486 &end_io_wq->work); 483 &end_io_wq->work);
484 else if (end_io_wq->metadata == 2)
485 btrfs_queue_worker(&fs_info->endio_freespace_worker,
486 &end_io_wq->work);
487 else 487 else
488 btrfs_queue_worker(&fs_info->endio_write_workers, 488 btrfs_queue_worker(&fs_info->endio_write_workers,
489 &end_io_wq->work); 489 &end_io_wq->work);
@@ -497,6 +497,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
497 } 497 }
498} 498}
499 499
500/*
501 * For the metadata arg you want
502 *
503 * 0 - if data
504 * 1 - if normal metadata
505 * 2 - if writing to the free space cache area
506 */
500int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 507int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
501 int metadata) 508 int metadata)
502{ 509{
@@ -533,11 +540,9 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
533 540
534static void run_one_async_start(struct btrfs_work *work) 541static void run_one_async_start(struct btrfs_work *work)
535{ 542{
536 struct btrfs_fs_info *fs_info;
537 struct async_submit_bio *async; 543 struct async_submit_bio *async;
538 544
539 async = container_of(work, struct async_submit_bio, work); 545 async = container_of(work, struct async_submit_bio, work);
540 fs_info = BTRFS_I(async->inode)->root->fs_info;
541 async->submit_bio_start(async->inode, async->rw, async->bio, 546 async->submit_bio_start(async->inode, async->rw, async->bio,
542 async->mirror_num, async->bio_flags, 547 async->mirror_num, async->bio_flags,
543 async->bio_offset); 548 async->bio_offset);
@@ -850,12 +855,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
850 u32 blocksize, u64 parent_transid) 855 u32 blocksize, u64 parent_transid)
851{ 856{
852 struct extent_buffer *buf = NULL; 857 struct extent_buffer *buf = NULL;
853 struct inode *btree_inode = root->fs_info->btree_inode;
854 struct extent_io_tree *io_tree;
855 int ret; 858 int ret;
856 859
857 io_tree = &BTRFS_I(btree_inode)->io_tree;
858
859 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 860 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
860 if (!buf) 861 if (!buf)
861 return NULL; 862 return NULL;
@@ -1377,7 +1378,6 @@ static int bio_ready_for_csum(struct bio *bio)
1377 u64 start = 0; 1378 u64 start = 0;
1378 struct page *page; 1379 struct page *page;
1379 struct extent_io_tree *io_tree = NULL; 1380 struct extent_io_tree *io_tree = NULL;
1380 struct btrfs_fs_info *info = NULL;
1381 struct bio_vec *bvec; 1381 struct bio_vec *bvec;
1382 int i; 1382 int i;
1383 int ret; 1383 int ret;
@@ -1396,7 +1396,6 @@ static int bio_ready_for_csum(struct bio *bio)
1396 buf_len = page->private >> 2; 1396 buf_len = page->private >> 2;
1397 start = page_offset(page) + bvec->bv_offset; 1397 start = page_offset(page) + bvec->bv_offset;
1398 io_tree = &BTRFS_I(page->mapping->host)->io_tree; 1398 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1399 info = BTRFS_I(page->mapping->host)->root->fs_info;
1400 } 1399 }
1401 /* are we fully contained in this bio? */ 1400 /* are we fully contained in this bio? */
1402 if (buf_len <= length) 1401 if (buf_len <= length)
@@ -1680,12 +1679,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1680 1679
1681 init_waitqueue_head(&fs_info->transaction_throttle); 1680 init_waitqueue_head(&fs_info->transaction_throttle);
1682 init_waitqueue_head(&fs_info->transaction_wait); 1681 init_waitqueue_head(&fs_info->transaction_wait);
1682 init_waitqueue_head(&fs_info->transaction_blocked_wait);
1683 init_waitqueue_head(&fs_info->async_submit_wait); 1683 init_waitqueue_head(&fs_info->async_submit_wait);
1684 1684
1685 __setup_root(4096, 4096, 4096, 4096, tree_root, 1685 __setup_root(4096, 4096, 4096, 4096, tree_root,
1686 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1686 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1687 1687
1688
1689 bh = btrfs_read_dev_super(fs_devices->latest_bdev); 1688 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1690 if (!bh) 1689 if (!bh)
1691 goto fail_iput; 1690 goto fail_iput;
@@ -1775,6 +1774,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1775 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 1774 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1776 fs_info->thread_pool_size, 1775 fs_info->thread_pool_size,
1777 &fs_info->generic_worker); 1776 &fs_info->generic_worker);
1777 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
1778 1, &fs_info->generic_worker);
1778 1779
1779 /* 1780 /*
1780 * endios are largely parallel and should have a very 1781 * endios are largely parallel and should have a very
@@ -1795,6 +1796,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1795 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1796 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1796 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1797 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1797 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1798 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1799 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1798 1800
1799 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1801 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1800 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1802 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1993,6 +1995,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1993 if (!(sb->s_flags & MS_RDONLY)) { 1995 if (!(sb->s_flags & MS_RDONLY)) {
1994 down_read(&fs_info->cleanup_work_sem); 1996 down_read(&fs_info->cleanup_work_sem);
1995 btrfs_orphan_cleanup(fs_info->fs_root); 1997 btrfs_orphan_cleanup(fs_info->fs_root);
1998 btrfs_orphan_cleanup(fs_info->tree_root);
1996 up_read(&fs_info->cleanup_work_sem); 1999 up_read(&fs_info->cleanup_work_sem);
1997 } 2000 }
1998 2001
@@ -2035,6 +2038,7 @@ fail_sb_buffer:
2035 btrfs_stop_workers(&fs_info->endio_meta_workers); 2038 btrfs_stop_workers(&fs_info->endio_meta_workers);
2036 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2039 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2037 btrfs_stop_workers(&fs_info->endio_write_workers); 2040 btrfs_stop_workers(&fs_info->endio_write_workers);
2041 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2038 btrfs_stop_workers(&fs_info->submit_workers); 2042 btrfs_stop_workers(&fs_info->submit_workers);
2039fail_iput: 2043fail_iput:
2040 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2044 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -2063,7 +2067,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2063 if (uptodate) { 2067 if (uptodate) {
2064 set_buffer_uptodate(bh); 2068 set_buffer_uptodate(bh);
2065 } else { 2069 } else {
2066 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 2070 if (printk_ratelimit()) {
2067 printk(KERN_WARNING "lost page write due to " 2071 printk(KERN_WARNING "lost page write due to "
2068 "I/O error on %s\n", 2072 "I/O error on %s\n",
2069 bdevname(bh->b_bdev, b)); 2073 bdevname(bh->b_bdev, b));
@@ -2200,21 +2204,10 @@ static int write_dev_supers(struct btrfs_device *device,
2200 bh->b_end_io = btrfs_end_buffer_write_sync; 2204 bh->b_end_io = btrfs_end_buffer_write_sync;
2201 } 2205 }
2202 2206
2203 if (i == last_barrier && do_barriers && device->barriers) { 2207 if (i == last_barrier && do_barriers)
2204 ret = submit_bh(WRITE_BARRIER, bh); 2208 ret = submit_bh(WRITE_FLUSH_FUA, bh);
2205 if (ret == -EOPNOTSUPP) { 2209 else
2206 printk("btrfs: disabling barriers on dev %s\n",
2207 device->name);
2208 set_buffer_uptodate(bh);
2209 device->barriers = 0;
2210 /* one reference for submit_bh */
2211 get_bh(bh);
2212 lock_buffer(bh);
2213 ret = submit_bh(WRITE_SYNC, bh);
2214 }
2215 } else {
2216 ret = submit_bh(WRITE_SYNC, bh); 2210 ret = submit_bh(WRITE_SYNC, bh);
2217 }
2218 2211
2219 if (ret) 2212 if (ret)
2220 errors++; 2213 errors++;
@@ -2421,6 +2414,7 @@ int close_ctree(struct btrfs_root *root)
2421 fs_info->closing = 1; 2414 fs_info->closing = 1;
2422 smp_mb(); 2415 smp_mb();
2423 2416
2417 btrfs_put_block_group_cache(fs_info);
2424 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2418 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2425 ret = btrfs_commit_super(root); 2419 ret = btrfs_commit_super(root);
2426 if (ret) 2420 if (ret)
@@ -2467,6 +2461,7 @@ int close_ctree(struct btrfs_root *root)
2467 btrfs_stop_workers(&fs_info->endio_meta_workers); 2461 btrfs_stop_workers(&fs_info->endio_meta_workers);
2468 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2462 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2469 btrfs_stop_workers(&fs_info->endio_write_workers); 2463 btrfs_stop_workers(&fs_info->endio_write_workers);
2464 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2470 btrfs_stop_workers(&fs_info->submit_workers); 2465 btrfs_stop_workers(&fs_info->submit_workers);
2471 2466
2472 btrfs_close_devices(fs_info->fs_devices); 2467 btrfs_close_devices(fs_info->fs_devices);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a57..0c097f3aec41 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -242,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
242 return NULL; 242 return NULL;
243 } 243 }
244 244
245 /* We're loading it the fast way, so we don't have a caching_ctl. */
246 if (!cache->caching_ctl) {
247 spin_unlock(&cache->lock);
248 return NULL;
249 }
250
245 ctl = cache->caching_ctl; 251 ctl = cache->caching_ctl;
246 atomic_inc(&ctl->count); 252 atomic_inc(&ctl->count);
247 spin_unlock(&cache->lock); 253 spin_unlock(&cache->lock);
@@ -421,7 +427,9 @@ err:
421 return 0; 427 return 0;
422} 428}
423 429
424static int cache_block_group(struct btrfs_block_group_cache *cache) 430static int cache_block_group(struct btrfs_block_group_cache *cache,
431 struct btrfs_trans_handle *trans,
432 int load_cache_only)
425{ 433{
426 struct btrfs_fs_info *fs_info = cache->fs_info; 434 struct btrfs_fs_info *fs_info = cache->fs_info;
427 struct btrfs_caching_control *caching_ctl; 435 struct btrfs_caching_control *caching_ctl;
@@ -432,6 +440,36 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
432 if (cache->cached != BTRFS_CACHE_NO) 440 if (cache->cached != BTRFS_CACHE_NO)
433 return 0; 441 return 0;
434 442
443 /*
444 * We can't do the read from on-disk cache during a commit since we need
445 * to have the normal tree locking.
446 */
447 if (!trans->transaction->in_commit) {
448 spin_lock(&cache->lock);
449 if (cache->cached != BTRFS_CACHE_NO) {
450 spin_unlock(&cache->lock);
451 return 0;
452 }
453 cache->cached = BTRFS_CACHE_STARTED;
454 spin_unlock(&cache->lock);
455
456 ret = load_free_space_cache(fs_info, cache);
457
458 spin_lock(&cache->lock);
459 if (ret == 1) {
460 cache->cached = BTRFS_CACHE_FINISHED;
461 cache->last_byte_to_unpin = (u64)-1;
462 } else {
463 cache->cached = BTRFS_CACHE_NO;
464 }
465 spin_unlock(&cache->lock);
466 if (ret == 1)
467 return 0;
468 }
469
470 if (load_cache_only)
471 return 0;
472
435 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); 473 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
436 BUG_ON(!caching_ctl); 474 BUG_ON(!caching_ctl);
437 475
@@ -509,7 +547,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
509 547
510 rcu_read_lock(); 548 rcu_read_lock();
511 list_for_each_entry_rcu(found, head, list) { 549 list_for_each_entry_rcu(found, head, list) {
512 if (found->flags == flags) { 550 if (found->flags & flags) {
513 rcu_read_unlock(); 551 rcu_read_unlock();
514 return found; 552 return found;
515 } 553 }
@@ -542,6 +580,15 @@ static u64 div_factor(u64 num, int factor)
542 return num; 580 return num;
543} 581}
544 582
583static u64 div_factor_fine(u64 num, int factor)
584{
585 if (factor == 100)
586 return num;
587 num *= factor;
588 do_div(num, 100);
589 return num;
590}
591
545u64 btrfs_find_block_group(struct btrfs_root *root, 592u64 btrfs_find_block_group(struct btrfs_root *root,
546 u64 search_start, u64 search_hint, int owner) 593 u64 search_start, u64 search_hint, int owner)
547{ 594{
@@ -1695,8 +1742,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1695static void btrfs_issue_discard(struct block_device *bdev, 1742static void btrfs_issue_discard(struct block_device *bdev,
1696 u64 start, u64 len) 1743 u64 start, u64 len)
1697{ 1744{
1698 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1745 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
1699 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1700} 1746}
1701 1747
1702static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1748static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -2688,6 +2734,109 @@ next_block_group(struct btrfs_root *root,
2688 return cache; 2734 return cache;
2689} 2735}
2690 2736
2737static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2738 struct btrfs_trans_handle *trans,
2739 struct btrfs_path *path)
2740{
2741 struct btrfs_root *root = block_group->fs_info->tree_root;
2742 struct inode *inode = NULL;
2743 u64 alloc_hint = 0;
2744 int num_pages = 0;
2745 int retries = 0;
2746 int ret = 0;
2747
2748 /*
2749 * If this block group is smaller than 100 megs don't bother caching the
2750 * block group.
2751 */
2752 if (block_group->key.offset < (100 * 1024 * 1024)) {
2753 spin_lock(&block_group->lock);
2754 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2755 spin_unlock(&block_group->lock);
2756 return 0;
2757 }
2758
2759again:
2760 inode = lookup_free_space_inode(root, block_group, path);
2761 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2762 ret = PTR_ERR(inode);
2763 btrfs_release_path(root, path);
2764 goto out;
2765 }
2766
2767 if (IS_ERR(inode)) {
2768 BUG_ON(retries);
2769 retries++;
2770
2771 if (block_group->ro)
2772 goto out_free;
2773
2774 ret = create_free_space_inode(root, trans, block_group, path);
2775 if (ret)
2776 goto out_free;
2777 goto again;
2778 }
2779
2780 /*
2781 * We want to set the generation to 0, that way if anything goes wrong
2782 * from here on out we know not to trust this cache when we load up next
2783 * time.
2784 */
2785 BTRFS_I(inode)->generation = 0;
2786 ret = btrfs_update_inode(trans, root, inode);
2787 WARN_ON(ret);
2788
2789 if (i_size_read(inode) > 0) {
2790 ret = btrfs_truncate_free_space_cache(root, trans, path,
2791 inode);
2792 if (ret)
2793 goto out_put;
2794 }
2795
2796 spin_lock(&block_group->lock);
2797 if (block_group->cached != BTRFS_CACHE_FINISHED) {
2798 spin_unlock(&block_group->lock);
2799 goto out_put;
2800 }
2801 spin_unlock(&block_group->lock);
2802
2803 num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2804 if (!num_pages)
2805 num_pages = 1;
2806
2807 /*
2808 * Just to make absolutely sure we have enough space, we're going to
2809 * preallocate 16 pages worth of space for each block group. In
2810 * practice we ought to use at most 8, but we need extra space so we can
2811 * add our header and have a terminator between the extents and the
2812 * bitmaps.
2813 */
2814 num_pages *= 16;
2815 num_pages *= PAGE_CACHE_SIZE;
2816
2817 ret = btrfs_check_data_free_space(inode, num_pages);
2818 if (ret)
2819 goto out_put;
2820
2821 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2822 num_pages, num_pages,
2823 &alloc_hint);
2824 btrfs_free_reserved_data_space(inode, num_pages);
2825out_put:
2826 iput(inode);
2827out_free:
2828 btrfs_release_path(root, path);
2829out:
2830 spin_lock(&block_group->lock);
2831 if (ret)
2832 block_group->disk_cache_state = BTRFS_DC_ERROR;
2833 else
2834 block_group->disk_cache_state = BTRFS_DC_SETUP;
2835 spin_unlock(&block_group->lock);
2836
2837 return ret;
2838}
2839
2691int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2840int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2692 struct btrfs_root *root) 2841 struct btrfs_root *root)
2693{ 2842{
@@ -2700,6 +2849,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2700 if (!path) 2849 if (!path)
2701 return -ENOMEM; 2850 return -ENOMEM;
2702 2851
2852again:
2853 while (1) {
2854 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2855 while (cache) {
2856 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2857 break;
2858 cache = next_block_group(root, cache);
2859 }
2860 if (!cache) {
2861 if (last == 0)
2862 break;
2863 last = 0;
2864 continue;
2865 }
2866 err = cache_save_setup(cache, trans, path);
2867 last = cache->key.objectid + cache->key.offset;
2868 btrfs_put_block_group(cache);
2869 }
2870
2703 while (1) { 2871 while (1) {
2704 if (last == 0) { 2872 if (last == 0) {
2705 err = btrfs_run_delayed_refs(trans, root, 2873 err = btrfs_run_delayed_refs(trans, root,
@@ -2709,6 +2877,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2709 2877
2710 cache = btrfs_lookup_first_block_group(root->fs_info, last); 2878 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2711 while (cache) { 2879 while (cache) {
2880 if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2881 btrfs_put_block_group(cache);
2882 goto again;
2883 }
2884
2712 if (cache->dirty) 2885 if (cache->dirty)
2713 break; 2886 break;
2714 cache = next_block_group(root, cache); 2887 cache = next_block_group(root, cache);
@@ -2720,6 +2893,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2720 continue; 2893 continue;
2721 } 2894 }
2722 2895
2896 if (cache->disk_cache_state == BTRFS_DC_SETUP)
2897 cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2723 cache->dirty = 0; 2898 cache->dirty = 0;
2724 last = cache->key.objectid + cache->key.offset; 2899 last = cache->key.objectid + cache->key.offset;
2725 2900
@@ -2728,6 +2903,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2728 btrfs_put_block_group(cache); 2903 btrfs_put_block_group(cache);
2729 } 2904 }
2730 2905
2906 while (1) {
2907 /*
2908 * I don't think this is needed since we're just marking our
2909 * preallocated extent as written, but just in case it can't
2910 * hurt.
2911 */
2912 if (last == 0) {
2913 err = btrfs_run_delayed_refs(trans, root,
2914 (unsigned long)-1);
2915 BUG_ON(err);
2916 }
2917
2918 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2919 while (cache) {
2920 /*
2921 * Really this shouldn't happen, but it could if we
2922 * couldn't write the entire preallocated extent and
2923 * splitting the extent resulted in a new block.
2924 */
2925 if (cache->dirty) {
2926 btrfs_put_block_group(cache);
2927 goto again;
2928 }
2929 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2930 break;
2931 cache = next_block_group(root, cache);
2932 }
2933 if (!cache) {
2934 if (last == 0)
2935 break;
2936 last = 0;
2937 continue;
2938 }
2939
2940 btrfs_write_out_cache(root, trans, cache, path);
2941
2942 /*
2943 * If we didn't have an error then the cache state is still
2944 * NEED_WRITE, so we can set it to WRITTEN.
2945 */
2946 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2947 cache->disk_cache_state = BTRFS_DC_WRITTEN;
2948 last = cache->key.objectid + cache->key.offset;
2949 btrfs_put_block_group(cache);
2950 }
2951
2731 btrfs_free_path(path); 2952 btrfs_free_path(path);
2732 return 0; 2953 return 0;
2733} 2954}
@@ -2763,6 +2984,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2763 if (found) { 2984 if (found) {
2764 spin_lock(&found->lock); 2985 spin_lock(&found->lock);
2765 found->total_bytes += total_bytes; 2986 found->total_bytes += total_bytes;
2987 found->disk_total += total_bytes * factor;
2766 found->bytes_used += bytes_used; 2988 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor; 2989 found->disk_used += bytes_used * factor;
2768 found->full = 0; 2990 found->full = 0;
@@ -2782,6 +3004,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2782 BTRFS_BLOCK_GROUP_SYSTEM | 3004 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA); 3005 BTRFS_BLOCK_GROUP_METADATA);
2784 found->total_bytes = total_bytes; 3006 found->total_bytes = total_bytes;
3007 found->disk_total = total_bytes * factor;
2785 found->bytes_used = bytes_used; 3008 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor; 3009 found->disk_used = bytes_used * factor;
2787 found->bytes_pinned = 0; 3010 found->bytes_pinned = 0;
@@ -2883,11 +3106,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
2883 struct btrfs_space_info *data_sinfo; 3106 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root; 3107 struct btrfs_root *root = BTRFS_I(inode)->root;
2885 u64 used; 3108 u64 used;
2886 int ret = 0, committed = 0; 3109 int ret = 0, committed = 0, alloc_chunk = 1;
2887 3110
2888 /* make sure bytes are sectorsize aligned */ 3111 /* make sure bytes are sectorsize aligned */
2889 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3112 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2890 3113
3114 if (root == root->fs_info->tree_root) {
3115 alloc_chunk = 0;
3116 committed = 1;
3117 }
3118
2891 data_sinfo = BTRFS_I(inode)->space_info; 3119 data_sinfo = BTRFS_I(inode)->space_info;
2892 if (!data_sinfo) 3120 if (!data_sinfo)
2893 goto alloc; 3121 goto alloc;
@@ -2906,7 +3134,7 @@ again:
2906 * if we don't have enough free bytes in this space then we need 3134 * if we don't have enough free bytes in this space then we need
2907 * to alloc a new chunk. 3135 * to alloc a new chunk.
2908 */ 3136 */
2909 if (!data_sinfo->full) { 3137 if (!data_sinfo->full && alloc_chunk) {
2910 u64 alloc_target; 3138 u64 alloc_target;
2911 3139
2912 data_sinfo->force_alloc = 1; 3140 data_sinfo->force_alloc = 1;
@@ -2998,10 +3226,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
2998 rcu_read_unlock(); 3226 rcu_read_unlock();
2999} 3227}
3000 3228
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo, 3229static int should_alloc_chunk(struct btrfs_root *root,
3002 u64 alloc_bytes) 3230 struct btrfs_space_info *sinfo, u64 alloc_bytes)
3003{ 3231{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3232 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3233 u64 thresh;
3005 3234
3006 if (sinfo->bytes_used + sinfo->bytes_reserved + 3235 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3236 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
@@ -3011,6 +3240,12 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3011 alloc_bytes < div_factor(num_bytes, 8)) 3240 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0; 3241 return 0;
3013 3242
3243 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3244 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3245
3246 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3247 return 0;
3248
3014 return 1; 3249 return 1;
3015} 3250}
3016 3251
@@ -3042,13 +3277,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3042 goto out; 3277 goto out;
3043 } 3278 }
3044 3279
3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { 3280 if (!force && !should_alloc_chunk(extent_root, space_info,
3281 alloc_bytes)) {
3046 spin_unlock(&space_info->lock); 3282 spin_unlock(&space_info->lock);
3047 goto out; 3283 goto out;
3048 } 3284 }
3049 spin_unlock(&space_info->lock); 3285 spin_unlock(&space_info->lock);
3050 3286
3051 /* 3287 /*
3288 * If we have mixed data/metadata chunks we want to make sure we keep
3289 * allocating mixed chunks instead of individual chunks.
3290 */
3291 if (btrfs_mixed_space_info(space_info))
3292 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3293
3294 /*
3052 * if we're doing a data chunk, go ahead and make sure that 3295 * if we're doing a data chunk, go ahead and make sure that
3053 * we keep a reasonable number of metadata chunks allocated in the 3296 * we keep a reasonable number of metadata chunks allocated in the
3054 * FS as well. 3297 * FS as well.
@@ -3073,55 +3316,25 @@ out:
3073 return ret; 3316 return ret;
3074} 3317}
3075 3318
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/* 3319/*
3109 * shrink metadata reservation for delalloc 3320 * shrink metadata reservation for delalloc
3110 */ 3321 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans, 3322static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim) 3323 struct btrfs_root *root, u64 to_reclaim, int sync)
3113{ 3324{
3114 struct btrfs_block_rsv *block_rsv; 3325 struct btrfs_block_rsv *block_rsv;
3326 struct btrfs_space_info *space_info;
3115 u64 reserved; 3327 u64 reserved;
3116 u64 max_reclaim; 3328 u64 max_reclaim;
3117 u64 reclaimed = 0; 3329 u64 reclaimed = 0;
3118 int pause = 1; 3330 int pause = 1;
3119 int ret; 3331 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3120 3332
3121 block_rsv = &root->fs_info->delalloc_block_rsv; 3333 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock); 3334 space_info = block_rsv->space_info;
3123 reserved = block_rsv->reserved; 3335
3124 spin_unlock(&block_rsv->lock); 3336 smp_mb();
3337 reserved = space_info->bytes_reserved;
3125 3338
3126 if (reserved == 0) 3339 if (reserved == 0)
3127 return 0; 3340 return 0;
@@ -3129,104 +3342,169 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3129 max_reclaim = min(reserved, to_reclaim); 3342 max_reclaim = min(reserved, to_reclaim);
3130 3343
3131 while (1) { 3344 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); 3345 /* have the flusher threads jump in and do some IO */
3133 if (!ret) { 3346 smp_mb();
3134 __set_current_state(TASK_INTERRUPTIBLE); 3347 nr_pages = min_t(unsigned long, nr_pages,
3135 schedule_timeout(pause); 3348 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3136 pause <<= 1; 3349 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142 3350
3143 spin_lock(&block_rsv->lock); 3351 spin_lock(&space_info->lock);
3144 if (reserved > block_rsv->reserved) 3352 if (reserved > space_info->bytes_reserved)
3145 reclaimed = reserved - block_rsv->reserved; 3353 reclaimed += reserved - space_info->bytes_reserved;
3146 reserved = block_rsv->reserved; 3354 reserved = space_info->bytes_reserved;
3147 spin_unlock(&block_rsv->lock); 3355 spin_unlock(&space_info->lock);
3148 3356
3149 if (reserved == 0 || reclaimed >= max_reclaim) 3357 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break; 3358 break;
3151 3359
3152 if (trans && trans->transaction->blocked) 3360 if (trans && trans->transaction->blocked)
3153 return -EAGAIN; 3361 return -EAGAIN;
3362
3363 __set_current_state(TASK_INTERRUPTIBLE);
3364 schedule_timeout(pause);
3365 pause <<= 1;
3366 if (pause > HZ / 10)
3367 pause = HZ / 10;
3368
3154 } 3369 }
3155 return reclaimed >= to_reclaim; 3370 return reclaimed >= to_reclaim;
3156} 3371}
3157 3372
3158static int should_retry_reserve(struct btrfs_trans_handle *trans, 3373/*
3159 struct btrfs_root *root, 3374 * Retries tells us how many times we've called reserve_metadata_bytes. The
3160 struct btrfs_block_rsv *block_rsv, 3375 * idea is if this is the first call (retries == 0) then we will add to our
3161 u64 num_bytes, int *retries) 3376 * reserved count if we can't make the allocation in order to hold our place
3377 * while we go and try and free up space. That way for retries > 1 we don't try
3378 * and add space, we just check to see if the amount of unused space is >= the
3379 * total space, meaning that our reservation is valid.
3380 *
3381 * However if we don't intend to retry this reservation, pass -1 as retries so
3382 * that it short circuits this logic.
3383 */
3384static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3385 struct btrfs_root *root,
3386 struct btrfs_block_rsv *block_rsv,
3387 u64 orig_bytes, int flush)
3162{ 3388{
3163 struct btrfs_space_info *space_info = block_rsv->space_info; 3389 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret; 3390 u64 unused;
3391 u64 num_bytes = orig_bytes;
3392 int retries = 0;
3393 int ret = 0;
3394 bool reserved = false;
3395 bool committed = false;
3165 3396
3166 if ((*retries) > 2) 3397again:
3167 return -ENOSPC; 3398 ret = -ENOSPC;
3399 if (reserved)
3400 num_bytes = 0;
3168 3401
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); 3402 spin_lock(&space_info->lock);
3170 if (ret) 3403 unused = space_info->bytes_used + space_info->bytes_reserved +
3171 return 1; 3404 space_info->bytes_pinned + space_info->bytes_readonly +
3405 space_info->bytes_may_use;
3172 3406
3173 if (trans && trans->transaction->in_commit) 3407 /*
3174 return -ENOSPC; 3408 * The idea here is that if we've not already over-reserved the block group
3409 * then we can go ahead and save our reservation first and then start
3410 * flushing if we need to. Otherwise if we've already overcommitted
3411 * lets start flushing stuff first and then come back and try to make
3412 * our reservation.
3413 */
3414 if (unused <= space_info->total_bytes) {
3415 unused -= space_info->total_bytes;
3416 if (unused >= num_bytes) {
3417 if (!reserved)
3418 space_info->bytes_reserved += orig_bytes;
3419 ret = 0;
3420 } else {
3421 /*
3422 * Ok set num_bytes to orig_bytes since we aren't
3423 * overcommitted, this way we only try and reclaim what
3424 * we need.
3425 */
3426 num_bytes = orig_bytes;
3427 }
3428 } else {
3429 /*
3430 * Ok we're over committed, set num_bytes to the overcommitted
3431 * amount plus the amount of bytes that we need for this
3432 * reservation.
3433 */
3434 num_bytes = unused - space_info->total_bytes +
3435 (orig_bytes * (retries + 1));
3436 }
3175 3437
3176 ret = shrink_delalloc(trans, root, num_bytes); 3438 /*
3177 if (ret) 3439 * Couldn't make our reservation, save our place so while we're trying
3178 return ret; 3440 * to reclaim space we can actually use it instead of somebody else
3441 * stealing it from us.
3442 */
3443 if (ret && !reserved) {
3444 space_info->bytes_reserved += orig_bytes;
3445 reserved = true;
3446 }
3179 3447
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock); 3448 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188 3449
3189 if (trans) 3450 if (!ret)
3190 return -EAGAIN; 3451 return 0;
3191 3452
3192 trans = btrfs_join_transaction(root, 1); 3453 if (!flush)
3193 BUG_ON(IS_ERR(trans)); 3454 goto out;
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196 3455
3197 return 1; 3456 /*
3198} 3457 * We do synchronous shrinking since we don't actually unreserve
3458 * metadata until after the IO is completed.
3459 */
3460 ret = shrink_delalloc(trans, root, num_bytes, 1);
3461 if (ret > 0)
3462 return 0;
3463 else if (ret < 0)
3464 goto out;
3199 3465
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, 3466 /*
3201 u64 num_bytes) 3467 * So if we were overcommitted it's possible that somebody else flushed
3202{ 3468 * out enough space and we simply didn't have enough space to reclaim,
3203 struct btrfs_space_info *space_info = block_rsv->space_info; 3469 * so go back around and try again.
3204 u64 unused; 3470 */
3205 int ret = -ENOSPC; 3471 if (retries < 2) {
3472 retries++;
3473 goto again;
3474 }
3206 3475
3207 spin_lock(&space_info->lock); 3476 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved + 3477 /*
3209 space_info->bytes_pinned + space_info->bytes_readonly; 3478 * Not enough space to be reclaimed, don't bother committing the
3479 * transaction.
3480 */
3481 if (space_info->bytes_pinned < orig_bytes)
3482 ret = -ENOSPC;
3483 spin_unlock(&space_info->lock);
3484 if (ret)
3485 goto out;
3210 3486
3211 if (unused < space_info->total_bytes) 3487 ret = -EAGAIN;
3212 unused = space_info->total_bytes - unused; 3488 if (trans || committed)
3213 else 3489 goto out;
3214 unused = 0;
3215 3490
3216 if (unused >= num_bytes) { 3491 ret = -ENOSPC;
3217 if (block_rsv->priority >= 10) { 3492 trans = btrfs_join_transaction(root, 1);
3218 space_info->bytes_reserved += num_bytes; 3493 if (IS_ERR(trans))
3219 ret = 0; 3494 goto out;
3220 } else { 3495 ret = btrfs_commit_transaction(trans, root);
3221 if ((unused + block_rsv->reserved) * 3496 if (!ret) {
3222 block_rsv->priority >= 3497 trans = NULL;
3223 (num_bytes + block_rsv->reserved) * 10) { 3498 committed = true;
3224 space_info->bytes_reserved += num_bytes; 3499 goto again;
3225 ret = 0; 3500 }
3226 } 3501
3227 } 3502out:
3503 if (reserved) {
3504 spin_lock(&space_info->lock);
3505 space_info->bytes_reserved -= orig_bytes;
3506 spin_unlock(&space_info->lock);
3228 } 3507 }
3229 spin_unlock(&space_info->lock);
3230 3508
3231 return ret; 3509 return ret;
3232} 3510}
@@ -3328,18 +3606,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{ 3606{
3329 struct btrfs_block_rsv *block_rsv; 3607 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info; 3608 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332 3609
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 3610 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv) 3611 if (!block_rsv)
3335 return NULL; 3612 return NULL;
3336 3613
3337 btrfs_init_block_rsv(block_rsv); 3614 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info, 3615 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA); 3616 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv; 3617 return block_rsv;
3344} 3618}
3345 3619
@@ -3370,23 +3644,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3644int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root, 3645 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv, 3646 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries) 3647 u64 num_bytes)
3374{ 3648{
3375 int ret; 3649 int ret;
3376 3650
3377 if (num_bytes == 0) 3651 if (num_bytes == 0)
3378 return 0; 3652 return 0;
3379again: 3653
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes); 3654 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
3381 if (!ret) { 3655 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3656 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0; 3657 return 0;
3384 } 3658 }
3385 3659
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret; 3660 return ret;
3391} 3661}
3392 3662
@@ -3421,7 +3691,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3421 return 0; 3691 return 0;
3422 3692
3423 if (block_rsv->refill_used) { 3693 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes); 3694 ret = reserve_metadata_bytes(trans, root, block_rsv,
3695 num_bytes, 0);
3425 if (!ret) { 3696 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0); 3697 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0; 3698 return 0;
@@ -3500,6 +3771,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3500 3771
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3772 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock); 3773 spin_lock(&sinfo->lock);
3774 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3775 data_used = 0;
3503 meta_used = sinfo->bytes_used; 3776 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock); 3777 spin_unlock(&sinfo->lock);
3505 3778
@@ -3527,7 +3800,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3527 block_rsv->size = num_bytes; 3800 block_rsv->size = num_bytes;
3528 3801
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 3802 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly; 3803 sinfo->bytes_reserved + sinfo->bytes_readonly +
3804 sinfo->bytes_may_use;
3531 3805
3532 if (sinfo->total_bytes > num_bytes) { 3806 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes; 3807 num_bytes = sinfo->total_bytes - num_bytes;
@@ -3598,7 +3872,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3598 3872
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 3873int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root, 3874 struct btrfs_root *root,
3601 int num_items, int *retries) 3875 int num_items)
3602{ 3876{
3603 u64 num_bytes; 3877 u64 num_bytes;
3604 int ret; 3878 int ret;
@@ -3608,7 +3882,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3608 3882
3609 num_bytes = calc_trans_metadata_size(root, num_items); 3883 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, 3884 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries); 3885 num_bytes);
3612 if (!ret) { 3886 if (!ret) {
3613 trans->bytes_reserved += num_bytes; 3887 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv; 3888 trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3682,14 +3956,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 3956 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve; 3957 u64 to_reserve;
3684 int nr_extents; 3958 int nr_extents;
3685 int retries = 0;
3686 int ret; 3959 int ret;
3687 3960
3688 if (btrfs_transaction_in_commit(root->fs_info)) 3961 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1); 3962 schedule_timeout(1);
3690 3963
3691 num_bytes = ALIGN(num_bytes, root->sectorsize); 3964 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again: 3965
3693 spin_lock(&BTRFS_I(inode)->accounting_lock); 3966 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 3967 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) { 3968 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
@@ -3699,18 +3972,14 @@ again:
3699 nr_extents = 0; 3972 nr_extents = 0;
3700 to_reserve = 0; 3973 to_reserve = 0;
3701 } 3974 }
3975 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3702 3976
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes); 3977 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve); 3978 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3705 if (ret) { 3979 if (ret)
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret; 3980 return ret;
3712 }
3713 3981
3982 spin_lock(&BTRFS_I(inode)->accounting_lock);
3714 BTRFS_I(inode)->reserved_extents += nr_extents; 3983 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 3984 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock); 3985 spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -3718,7 +3987,7 @@ again:
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1); 3987 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719 3988
3720 if (block_rsv->size > 512 * 1024 * 1024) 3989 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve); 3990 shrink_delalloc(NULL, root, to_reserve, 0);
3722 3991
3723 return 0; 3992 return 0;
3724} 3993}
@@ -3777,12 +4046,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3777 struct btrfs_root *root, 4046 struct btrfs_root *root,
3778 u64 bytenr, u64 num_bytes, int alloc) 4047 u64 bytenr, u64 num_bytes, int alloc)
3779{ 4048{
3780 struct btrfs_block_group_cache *cache; 4049 struct btrfs_block_group_cache *cache = NULL;
3781 struct btrfs_fs_info *info = root->fs_info; 4050 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3783 u64 total = num_bytes; 4051 u64 total = num_bytes;
3784 u64 old_val; 4052 u64 old_val;
3785 u64 byte_in_group; 4053 u64 byte_in_group;
4054 int factor;
3786 4055
3787 /* block accounting for super block */ 4056 /* block accounting for super block */
3788 spin_lock(&info->delalloc_lock); 4057 spin_lock(&info->delalloc_lock);
@@ -3804,11 +4073,25 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3804 factor = 2; 4073 factor = 2;
3805 else 4074 else
3806 factor = 1; 4075 factor = 1;
4076 /*
4077 * If this block group has free space cache written out, we
4078 * need to make sure to load it if we are removing space. This
4079 * is because we need the unpinning stage to actually add the
4080 * space back to the block group, otherwise we will leak space.
4081 */
4082 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4083 cache_block_group(cache, trans, 1);
4084
3807 byte_in_group = bytenr - cache->key.objectid; 4085 byte_in_group = bytenr - cache->key.objectid;
3808 WARN_ON(byte_in_group > cache->key.offset); 4086 WARN_ON(byte_in_group > cache->key.offset);
3809 4087
3810 spin_lock(&cache->space_info->lock); 4088 spin_lock(&cache->space_info->lock);
3811 spin_lock(&cache->lock); 4089 spin_lock(&cache->lock);
4090
4091 if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
4092 cache->disk_cache_state < BTRFS_DC_CLEAR)
4093 cache->disk_cache_state = BTRFS_DC_CLEAR;
4094
3812 cache->dirty = 1; 4095 cache->dirty = 1;
3813 old_val = btrfs_block_group_used(&cache->item); 4096 old_val = btrfs_block_group_used(&cache->item);
3814 num_bytes = min(total, cache->key.offset - byte_in_group); 4097 num_bytes = min(total, cache->key.offset - byte_in_group);
@@ -4555,6 +4838,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4555 bool found_uncached_bg = false; 4838 bool found_uncached_bg = false;
4556 bool failed_cluster_refill = false; 4839 bool failed_cluster_refill = false;
4557 bool failed_alloc = false; 4840 bool failed_alloc = false;
4841 bool use_cluster = true;
4558 u64 ideal_cache_percent = 0; 4842 u64 ideal_cache_percent = 0;
4559 u64 ideal_cache_offset = 0; 4843 u64 ideal_cache_offset = 0;
4560 4844
@@ -4569,16 +4853,24 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4569 return -ENOSPC; 4853 return -ENOSPC;
4570 } 4854 }
4571 4855
4856 /*
4857 * If the space info is for both data and metadata it means we have a
4858 * small filesystem and we can't use the clustering stuff.
4859 */
4860 if (btrfs_mixed_space_info(space_info))
4861 use_cluster = false;
4862
4572 if (orig_root->ref_cows || empty_size) 4863 if (orig_root->ref_cows || empty_size)
4573 allowed_chunk_alloc = 1; 4864 allowed_chunk_alloc = 1;
4574 4865
4575 if (data & BTRFS_BLOCK_GROUP_METADATA) { 4866 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
4576 last_ptr = &root->fs_info->meta_alloc_cluster; 4867 last_ptr = &root->fs_info->meta_alloc_cluster;
4577 if (!btrfs_test_opt(root, SSD)) 4868 if (!btrfs_test_opt(root, SSD))
4578 empty_cluster = 64 * 1024; 4869 empty_cluster = 64 * 1024;
4579 } 4870 }
4580 4871
4581 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { 4872 if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
4873 btrfs_test_opt(root, SSD)) {
4582 last_ptr = &root->fs_info->data_alloc_cluster; 4874 last_ptr = &root->fs_info->data_alloc_cluster;
4583 } 4875 }
4584 4876
@@ -4642,6 +4934,10 @@ have_block_group:
4642 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4934 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4643 u64 free_percent; 4935 u64 free_percent;
4644 4936
4937 ret = cache_block_group(block_group, trans, 1);
4938 if (block_group->cached == BTRFS_CACHE_FINISHED)
4939 goto have_block_group;
4940
4645 free_percent = btrfs_block_group_used(&block_group->item); 4941 free_percent = btrfs_block_group_used(&block_group->item);
4646 free_percent *= 100; 4942 free_percent *= 100;
4647 free_percent = div64_u64(free_percent, 4943 free_percent = div64_u64(free_percent,
@@ -4662,7 +4958,7 @@ have_block_group:
4662 if (loop > LOOP_CACHING_NOWAIT || 4958 if (loop > LOOP_CACHING_NOWAIT ||
4663 (loop > LOOP_FIND_IDEAL && 4959 (loop > LOOP_FIND_IDEAL &&
4664 atomic_read(&space_info->caching_threads) < 2)) { 4960 atomic_read(&space_info->caching_threads) < 2)) {
4665 ret = cache_block_group(block_group); 4961 ret = cache_block_group(block_group, trans, 0);
4666 BUG_ON(ret); 4962 BUG_ON(ret);
4667 } 4963 }
4668 found_uncached_bg = true; 4964 found_uncached_bg = true;
@@ -5219,7 +5515,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5219 u64 num_bytes = ins->offset; 5515 u64 num_bytes = ins->offset;
5220 5516
5221 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5517 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5222 cache_block_group(block_group); 5518 cache_block_group(block_group, trans, 0);
5223 caching_ctl = get_caching_control(block_group); 5519 caching_ctl = get_caching_control(block_group);
5224 5520
5225 if (!caching_ctl) { 5521 if (!caching_ctl) {
@@ -5309,7 +5605,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5309 block_rsv = get_block_rsv(trans, root); 5605 block_rsv = get_block_rsv(trans, root);
5310 5606
5311 if (block_rsv->size == 0) { 5607 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize); 5608 ret = reserve_metadata_bytes(trans, root, block_rsv,
5609 blocksize, 0);
5313 if (ret) 5610 if (ret)
5314 return ERR_PTR(ret); 5611 return ERR_PTR(ret);
5315 return block_rsv; 5612 return block_rsv;
@@ -5319,11 +5616,6 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5319 if (!ret) 5616 if (!ret)
5320 return block_rsv; 5617 return block_rsv;
5321 5618
5322 WARN_ON(1);
5323 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5324 block_rsv->size, block_rsv->reserved,
5325 block_rsv->freed[0], block_rsv->freed[1]);
5326
5327 return ERR_PTR(-ENOSPC); 5619 return ERR_PTR(-ENOSPC);
5328} 5620}
5329 5621
@@ -5422,7 +5714,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5422 u64 generation; 5714 u64 generation;
5423 u64 refs; 5715 u64 refs;
5424 u64 flags; 5716 u64 flags;
5425 u64 last = 0;
5426 u32 nritems; 5717 u32 nritems;
5427 u32 blocksize; 5718 u32 blocksize;
5428 struct btrfs_key key; 5719 struct btrfs_key key;
@@ -5490,7 +5781,6 @@ reada:
5490 generation); 5781 generation);
5491 if (ret) 5782 if (ret)
5492 break; 5783 break;
5493 last = bytenr + blocksize;
5494 nread++; 5784 nread++;
5495 } 5785 }
5496 wc->reada_slot = slot; 5786 wc->reada_slot = slot;
@@ -7814,6 +8104,40 @@ out:
7814 return ret; 8104 return ret;
7815} 8105}
7816 8106
8107void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8108{
8109 struct btrfs_block_group_cache *block_group;
8110 u64 last = 0;
8111
8112 while (1) {
8113 struct inode *inode;
8114
8115 block_group = btrfs_lookup_first_block_group(info, last);
8116 while (block_group) {
8117 spin_lock(&block_group->lock);
8118 if (block_group->iref)
8119 break;
8120 spin_unlock(&block_group->lock);
8121 block_group = next_block_group(info->tree_root,
8122 block_group);
8123 }
8124 if (!block_group) {
8125 if (last == 0)
8126 break;
8127 last = 0;
8128 continue;
8129 }
8130
8131 inode = block_group->inode;
8132 block_group->iref = 0;
8133 block_group->inode = NULL;
8134 spin_unlock(&block_group->lock);
8135 iput(inode);
8136 last = block_group->key.objectid + block_group->key.offset;
8137 btrfs_put_block_group(block_group);
8138 }
8139}
8140
7817int btrfs_free_block_groups(struct btrfs_fs_info *info) 8141int btrfs_free_block_groups(struct btrfs_fs_info *info)
7818{ 8142{
7819 struct btrfs_block_group_cache *block_group; 8143 struct btrfs_block_group_cache *block_group;
@@ -7897,6 +8221,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7897 struct btrfs_key key; 8221 struct btrfs_key key;
7898 struct btrfs_key found_key; 8222 struct btrfs_key found_key;
7899 struct extent_buffer *leaf; 8223 struct extent_buffer *leaf;
8224 int need_clear = 0;
8225 u64 cache_gen;
7900 8226
7901 root = info->extent_root; 8227 root = info->extent_root;
7902 key.objectid = 0; 8228 key.objectid = 0;
@@ -7906,6 +8232,15 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7906 if (!path) 8232 if (!path)
7907 return -ENOMEM; 8233 return -ENOMEM;
7908 8234
8235 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
8236 if (cache_gen != 0 &&
8237 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
8238 need_clear = 1;
8239 if (btrfs_test_opt(root, CLEAR_CACHE))
8240 need_clear = 1;
8241 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
8242 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
8243
7909 while (1) { 8244 while (1) {
7910 ret = find_first_block_group(root, path, &key); 8245 ret = find_first_block_group(root, path, &key);
7911 if (ret > 0) 8246 if (ret > 0)
@@ -7928,6 +8263,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7928 INIT_LIST_HEAD(&cache->list); 8263 INIT_LIST_HEAD(&cache->list);
7929 INIT_LIST_HEAD(&cache->cluster_list); 8264 INIT_LIST_HEAD(&cache->cluster_list);
7930 8265
8266 if (need_clear)
8267 cache->disk_cache_state = BTRFS_DC_CLEAR;
8268
7931 /* 8269 /*
7932 * we only want to have 32k of ram per block group for keeping 8270 * we only want to have 32k of ram per block group for keeping
7933 * track of free space, and if we pass 1/2 of that we want to 8271 * track of free space, and if we pass 1/2 of that we want to
@@ -8032,6 +8370,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8032 cache->key.offset = size; 8370 cache->key.offset = size;
8033 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8371 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8034 cache->sectorsize = root->sectorsize; 8372 cache->sectorsize = root->sectorsize;
8373 cache->fs_info = root->fs_info;
8035 8374
8036 /* 8375 /*
8037 * we only want to have 32k of ram per block group for keeping track 8376 * we only want to have 32k of ram per block group for keeping track
@@ -8088,8 +8427,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8088 struct btrfs_path *path; 8427 struct btrfs_path *path;
8089 struct btrfs_block_group_cache *block_group; 8428 struct btrfs_block_group_cache *block_group;
8090 struct btrfs_free_cluster *cluster; 8429 struct btrfs_free_cluster *cluster;
8430 struct btrfs_root *tree_root = root->fs_info->tree_root;
8091 struct btrfs_key key; 8431 struct btrfs_key key;
8432 struct inode *inode;
8092 int ret; 8433 int ret;
8434 int factor;
8093 8435
8094 root = root->fs_info->extent_root; 8436 root = root->fs_info->extent_root;
8095 8437
@@ -8098,6 +8440,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8098 BUG_ON(!block_group->ro); 8440 BUG_ON(!block_group->ro);
8099 8441
8100 memcpy(&key, &block_group->key, sizeof(key)); 8442 memcpy(&key, &block_group->key, sizeof(key));
8443 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8444 BTRFS_BLOCK_GROUP_RAID1 |
8445 BTRFS_BLOCK_GROUP_RAID10))
8446 factor = 2;
8447 else
8448 factor = 1;
8101 8449
8102 /* make sure this block group isn't part of an allocation cluster */ 8450 /* make sure this block group isn't part of an allocation cluster */
8103 cluster = &root->fs_info->data_alloc_cluster; 8451 cluster = &root->fs_info->data_alloc_cluster;
@@ -8117,6 +8465,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8117 path = btrfs_alloc_path(); 8465 path = btrfs_alloc_path();
8118 BUG_ON(!path); 8466 BUG_ON(!path);
8119 8467
8468 inode = lookup_free_space_inode(root, block_group, path);
8469 if (!IS_ERR(inode)) {
8470 btrfs_orphan_add(trans, inode);
8471 clear_nlink(inode);
8472 /* One for the block groups ref */
8473 spin_lock(&block_group->lock);
8474 if (block_group->iref) {
8475 block_group->iref = 0;
8476 block_group->inode = NULL;
8477 spin_unlock(&block_group->lock);
8478 iput(inode);
8479 } else {
8480 spin_unlock(&block_group->lock);
8481 }
8482 /* One for our lookup ref */
8483 iput(inode);
8484 }
8485
8486 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8487 key.offset = block_group->key.objectid;
8488 key.type = 0;
8489
8490 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8491 if (ret < 0)
8492 goto out;
8493 if (ret > 0)
8494 btrfs_release_path(tree_root, path);
8495 if (ret == 0) {
8496 ret = btrfs_del_item(trans, tree_root, path);
8497 if (ret)
8498 goto out;
8499 btrfs_release_path(tree_root, path);
8500 }
8501
8120 spin_lock(&root->fs_info->block_group_cache_lock); 8502 spin_lock(&root->fs_info->block_group_cache_lock);
8121 rb_erase(&block_group->cache_node, 8503 rb_erase(&block_group->cache_node,
8122 &root->fs_info->block_group_cache_tree); 8504 &root->fs_info->block_group_cache_tree);
@@ -8138,8 +8520,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8138 spin_lock(&block_group->space_info->lock); 8520 spin_lock(&block_group->space_info->lock);
8139 block_group->space_info->total_bytes -= block_group->key.offset; 8521 block_group->space_info->total_bytes -= block_group->key.offset;
8140 block_group->space_info->bytes_readonly -= block_group->key.offset; 8522 block_group->space_info->bytes_readonly -= block_group->key.offset;
8523 block_group->space_info->disk_total -= block_group->key.offset * factor;
8141 spin_unlock(&block_group->space_info->lock); 8524 spin_unlock(&block_group->space_info->lock);
8142 8525
8526 memcpy(&key, &block_group->key, sizeof(key));
8527
8143 btrfs_clear_space_info_full(root->fs_info); 8528 btrfs_clear_space_info_full(root->fs_info);
8144 8529
8145 btrfs_put_block_group(block_group); 8530 btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d74e6af9b53a..eac10e3260a9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,7 +104,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
104 struct address_space *mapping, gfp_t mask) 104 struct address_space *mapping, gfp_t mask)
105{ 105{
106 tree->state = RB_ROOT; 106 tree->state = RB_ROOT;
107 tree->buffer = RB_ROOT; 107 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
108 tree->ops = NULL; 108 tree->ops = NULL;
109 tree->dirty_bytes = 0; 109 tree->dirty_bytes = 0;
110 spin_lock_init(&tree->lock); 110 spin_lock_init(&tree->lock);
@@ -235,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
235 return ret; 235 return ret;
236} 236}
237 237
238static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
239 u64 offset, struct rb_node *node)
240{
241 struct rb_root *root = &tree->buffer;
242 struct rb_node **p = &root->rb_node;
243 struct rb_node *parent = NULL;
244 struct extent_buffer *eb;
245
246 while (*p) {
247 parent = *p;
248 eb = rb_entry(parent, struct extent_buffer, rb_node);
249
250 if (offset < eb->start)
251 p = &(*p)->rb_left;
252 else if (offset > eb->start)
253 p = &(*p)->rb_right;
254 else
255 return eb;
256 }
257
258 rb_link_node(node, parent, p);
259 rb_insert_color(node, root);
260 return NULL;
261}
262
263static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
264 u64 offset)
265{
266 struct rb_root *root = &tree->buffer;
267 struct rb_node *n = root->rb_node;
268 struct extent_buffer *eb;
269
270 while (n) {
271 eb = rb_entry(n, struct extent_buffer, rb_node);
272 if (offset < eb->start)
273 n = n->rb_left;
274 else if (offset > eb->start)
275 n = n->rb_right;
276 else
277 return eb;
278 }
279 return NULL;
280}
281
282static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 238static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
283 struct extent_state *other) 239 struct extent_state *other)
284{ 240{
@@ -1901,10 +1857,8 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1901 struct page *page = bvec->bv_page; 1857 struct page *page = bvec->bv_page;
1902 struct extent_io_tree *tree = bio->bi_private; 1858 struct extent_io_tree *tree = bio->bi_private;
1903 u64 start; 1859 u64 start;
1904 u64 end;
1905 1860
1906 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1861 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1907 end = start + bvec->bv_len - 1;
1908 1862
1909 bio->bi_private = NULL; 1863 bio->bi_private = NULL;
1910 1864
@@ -2204,7 +2158,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2204 u64 last_byte = i_size_read(inode); 2158 u64 last_byte = i_size_read(inode);
2205 u64 block_start; 2159 u64 block_start;
2206 u64 iosize; 2160 u64 iosize;
2207 u64 unlock_start;
2208 sector_t sector; 2161 sector_t sector;
2209 struct extent_state *cached_state = NULL; 2162 struct extent_state *cached_state = NULL;
2210 struct extent_map *em; 2163 struct extent_map *em;
@@ -2329,7 +2282,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2329 if (tree->ops && tree->ops->writepage_end_io_hook) 2282 if (tree->ops && tree->ops->writepage_end_io_hook)
2330 tree->ops->writepage_end_io_hook(page, start, 2283 tree->ops->writepage_end_io_hook(page, start,
2331 page_end, NULL, 1); 2284 page_end, NULL, 1);
2332 unlock_start = page_end + 1;
2333 goto done; 2285 goto done;
2334 } 2286 }
2335 2287
@@ -2340,7 +2292,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2340 if (tree->ops && tree->ops->writepage_end_io_hook) 2292 if (tree->ops && tree->ops->writepage_end_io_hook)
2341 tree->ops->writepage_end_io_hook(page, cur, 2293 tree->ops->writepage_end_io_hook(page, cur,
2342 page_end, NULL, 1); 2294 page_end, NULL, 1);
2343 unlock_start = page_end + 1;
2344 break; 2295 break;
2345 } 2296 }
2346 em = epd->get_extent(inode, page, pg_offset, cur, 2297 em = epd->get_extent(inode, page, pg_offset, cur,
@@ -2387,7 +2338,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2387 2338
2388 cur += iosize; 2339 cur += iosize;
2389 pg_offset += iosize; 2340 pg_offset += iosize;
2390 unlock_start = cur;
2391 continue; 2341 continue;
2392 } 2342 }
2393 /* leave this out until we have a page_mkwrite call */ 2343 /* leave this out until we have a page_mkwrite call */
@@ -2473,7 +2423,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2473 pgoff_t index; 2423 pgoff_t index;
2474 pgoff_t end; /* Inclusive */ 2424 pgoff_t end; /* Inclusive */
2475 int scanned = 0; 2425 int scanned = 0;
2476 int range_whole = 0;
2477 2426
2478 pagevec_init(&pvec, 0); 2427 pagevec_init(&pvec, 0);
2479 if (wbc->range_cyclic) { 2428 if (wbc->range_cyclic) {
@@ -2482,8 +2431,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2482 } else { 2431 } else {
2483 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2432 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2484 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2433 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2485 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2486 range_whole = 1;
2487 scanned = 1; 2434 scanned = 1;
2488 } 2435 }
2489retry: 2436retry:
@@ -2823,6 +2770,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
2823 NULL, 1, 2770 NULL, 1,
2824 end_bio_extent_preparewrite, 0, 2771 end_bio_extent_preparewrite, 0,
2825 0, 0); 2772 0, 0);
2773 if (ret && !err)
2774 err = ret;
2826 iocount++; 2775 iocount++;
2827 block_start = block_start + iosize; 2776 block_start = block_start + iosize;
2828 } else { 2777 } else {
@@ -3104,6 +3053,39 @@ static void __free_extent_buffer(struct extent_buffer *eb)
3104 kmem_cache_free(extent_buffer_cache, eb); 3053 kmem_cache_free(extent_buffer_cache, eb);
3105} 3054}
3106 3055
3056/*
3057 * Helper for releasing extent buffer page.
3058 */
3059static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3060 unsigned long start_idx)
3061{
3062 unsigned long index;
3063 struct page *page;
3064
3065 if (!eb->first_page)
3066 return;
3067
3068 index = num_extent_pages(eb->start, eb->len);
3069 if (start_idx >= index)
3070 return;
3071
3072 do {
3073 index--;
3074 page = extent_buffer_page(eb, index);
3075 if (page)
3076 page_cache_release(page);
3077 } while (index != start_idx);
3078}
3079
/*
 * Tear down an extent buffer: release every page it references
 * (starting from page index 0), then free the buffer itself.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	const unsigned long first_idx = 0;

	btrfs_release_extent_buffer_page(eb, first_idx);
	__free_extent_buffer(eb);
}
3088
3107struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3089struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3108 u64 start, unsigned long len, 3090 u64 start, unsigned long len,
3109 struct page *page0, 3091 struct page *page0,
@@ -3117,16 +3099,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3117 struct page *p; 3099 struct page *p;
3118 struct address_space *mapping = tree->mapping; 3100 struct address_space *mapping = tree->mapping;
3119 int uptodate = 1; 3101 int uptodate = 1;
3102 int ret;
3120 3103
3121 spin_lock(&tree->buffer_lock); 3104 rcu_read_lock();
3122 eb = buffer_search(tree, start); 3105 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3123 if (eb) { 3106 if (eb && atomic_inc_not_zero(&eb->refs)) {
3124 atomic_inc(&eb->refs); 3107 rcu_read_unlock();
3125 spin_unlock(&tree->buffer_lock);
3126 mark_page_accessed(eb->first_page); 3108 mark_page_accessed(eb->first_page);
3127 return eb; 3109 return eb;
3128 } 3110 }
3129 spin_unlock(&tree->buffer_lock); 3111 rcu_read_unlock();
3130 3112
3131 eb = __alloc_extent_buffer(tree, start, len, mask); 3113 eb = __alloc_extent_buffer(tree, start, len, mask);
3132 if (!eb) 3114 if (!eb)
@@ -3165,26 +3147,31 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3165 if (uptodate) 3147 if (uptodate)
3166 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3148 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3167 3149
3150 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3151 if (ret)
3152 goto free_eb;
3153
3168 spin_lock(&tree->buffer_lock); 3154 spin_lock(&tree->buffer_lock);
3169 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3155 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3170 if (exists) { 3156 if (ret == -EEXIST) {
3157 exists = radix_tree_lookup(&tree->buffer,
3158 start >> PAGE_CACHE_SHIFT);
3171 /* add one reference for the caller */ 3159 /* add one reference for the caller */
3172 atomic_inc(&exists->refs); 3160 atomic_inc(&exists->refs);
3173 spin_unlock(&tree->buffer_lock); 3161 spin_unlock(&tree->buffer_lock);
3162 radix_tree_preload_end();
3174 goto free_eb; 3163 goto free_eb;
3175 } 3164 }
3176 /* add one reference for the tree */ 3165 /* add one reference for the tree */
3177 atomic_inc(&eb->refs); 3166 atomic_inc(&eb->refs);
3178 spin_unlock(&tree->buffer_lock); 3167 spin_unlock(&tree->buffer_lock);
3168 radix_tree_preload_end();
3179 return eb; 3169 return eb;
3180 3170
3181free_eb: 3171free_eb:
3182 if (!atomic_dec_and_test(&eb->refs)) 3172 if (!atomic_dec_and_test(&eb->refs))
3183 return exists; 3173 return exists;
3184 for (index = 1; index < i; index++) 3174 btrfs_release_extent_buffer(eb);
3185 page_cache_release(extent_buffer_page(eb, index));
3186 page_cache_release(extent_buffer_page(eb, 0));
3187 __free_extent_buffer(eb);
3188 return exists; 3175 return exists;
3189} 3176}
3190 3177
@@ -3194,16 +3181,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3194{ 3181{
3195 struct extent_buffer *eb; 3182 struct extent_buffer *eb;
3196 3183
3197 spin_lock(&tree->buffer_lock); 3184 rcu_read_lock();
3198 eb = buffer_search(tree, start); 3185 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3199 if (eb) 3186 if (eb && atomic_inc_not_zero(&eb->refs)) {
3200 atomic_inc(&eb->refs); 3187 rcu_read_unlock();
3201 spin_unlock(&tree->buffer_lock);
3202
3203 if (eb)
3204 mark_page_accessed(eb->first_page); 3188 mark_page_accessed(eb->first_page);
3189 return eb;
3190 }
3191 rcu_read_unlock();
3205 3192
3206 return eb; 3193 return NULL;
3207} 3194}
3208 3195
3209void free_extent_buffer(struct extent_buffer *eb) 3196void free_extent_buffer(struct extent_buffer *eb)
@@ -3833,34 +3820,45 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3833 } 3820 }
3834} 3821}
3835 3822
3823static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3824{
3825 struct extent_buffer *eb =
3826 container_of(head, struct extent_buffer, rcu_head);
3827
3828 btrfs_release_extent_buffer(eb);
3829}
3830
3836int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3831int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3837{ 3832{
3838 u64 start = page_offset(page); 3833 u64 start = page_offset(page);
3839 struct extent_buffer *eb; 3834 struct extent_buffer *eb;
3840 int ret = 1; 3835 int ret = 1;
3841 unsigned long i;
3842 unsigned long num_pages;
3843 3836
3844 spin_lock(&tree->buffer_lock); 3837 spin_lock(&tree->buffer_lock);
3845 eb = buffer_search(tree, start); 3838 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3846 if (!eb) 3839 if (!eb)
3847 goto out; 3840 goto out;
3848 3841
3849 if (atomic_read(&eb->refs) > 1) { 3842 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3850 ret = 0; 3843 ret = 0;
3851 goto out; 3844 goto out;
3852 } 3845 }
3853 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3846
3847 /*
3848 * set @eb->refs to 0 if it is already 1, and then release the @eb.
3849 * Or go back.
3850 */
3851 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
3854 ret = 0; 3852 ret = 0;
3855 goto out; 3853 goto out;
3856 } 3854 }
3857 /* at this point we can safely release the extent buffer */ 3855
3858 num_pages = num_extent_pages(eb->start, eb->len); 3856 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3859 for (i = 0; i < num_pages; i++)
3860 page_cache_release(extent_buffer_page(eb, i));
3861 rb_erase(&eb->rb_node, &tree->buffer);
3862 __free_extent_buffer(eb);
3863out: 3857out:
3864 spin_unlock(&tree->buffer_lock); 3858 spin_unlock(&tree->buffer_lock);
3859
3860 /* at this point we can safely release the extent buffer */
3861 if (atomic_read(&eb->refs) == 0)
3862 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3865 return ret; 3863 return ret;
3866} 3864}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5691c7b590da..1c6d4f342ef7 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -85,7 +85,7 @@ struct extent_io_ops {
85 85
86struct extent_io_tree { 86struct extent_io_tree {
87 struct rb_root state; 87 struct rb_root state;
88 struct rb_root buffer; 88 struct radix_tree_root buffer;
89 struct address_space *mapping; 89 struct address_space *mapping;
90 u64 dirty_bytes; 90 u64 dirty_bytes;
91 spinlock_t lock; 91 spinlock_t lock;
@@ -123,7 +123,7 @@ struct extent_buffer {
123 unsigned long bflags; 123 unsigned long bflags;
124 atomic_t refs; 124 atomic_t refs;
125 struct list_head leak_list; 125 struct list_head leak_list;
126 struct rb_node rb_node; 126 struct rcu_head rcu_head;
127 127
128 /* the spinlock is used to protect most operations */ 128 /* the spinlock is used to protect most operations */
129 spinlock_t lock; 129 spinlock_t lock;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 454ca52d6451..23cb8da3ff66 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
335 goto out; 335 goto out;
336 } 336 }
337 if (IS_ERR(rb_node)) { 337 if (IS_ERR(rb_node)) {
338 em = ERR_PTR(PTR_ERR(rb_node)); 338 em = ERR_CAST(rb_node);
339 goto out; 339 goto out;
340 } 340 }
341 em = rb_entry(rb_node, struct extent_map, rb_node); 341 em = rb_entry(rb_node, struct extent_map, rb_node);
@@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
384 goto out; 384 goto out;
385 } 385 }
386 if (IS_ERR(rb_node)) { 386 if (IS_ERR(rb_node)) {
387 em = ERR_PTR(PTR_ERR(rb_node)); 387 em = ERR_CAST(rb_node);
388 goto out; 388 goto out;
389 } 389 }
390 em = rb_entry(rb_node, struct extent_map, rb_node); 390 em = rb_entry(rb_node, struct extent_map, rb_node);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f488fac04d99..22ee0dc2e6b8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,10 +23,761 @@
23#include "ctree.h" 23#include "ctree.h"
24#include "free-space-cache.h" 24#include "free-space-cache.h"
25#include "transaction.h" 25#include "transaction.h"
26#include "disk-io.h"
26 27
27#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 28#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
28#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 29#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
29 30
31static void recalculate_thresholds(struct btrfs_block_group_cache
32 *block_group);
33static int link_free_space(struct btrfs_block_group_cache *block_group,
34 struct btrfs_free_space *info);
35
36struct inode *lookup_free_space_inode(struct btrfs_root *root,
37 struct btrfs_block_group_cache
38 *block_group, struct btrfs_path *path)
39{
40 struct btrfs_key key;
41 struct btrfs_key location;
42 struct btrfs_disk_key disk_key;
43 struct btrfs_free_space_header *header;
44 struct extent_buffer *leaf;
45 struct inode *inode = NULL;
46 int ret;
47
48 spin_lock(&block_group->lock);
49 if (block_group->inode)
50 inode = igrab(block_group->inode);
51 spin_unlock(&block_group->lock);
52 if (inode)
53 return inode;
54
55 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
56 key.offset = block_group->key.objectid;
57 key.type = 0;
58
59 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
60 if (ret < 0)
61 return ERR_PTR(ret);
62 if (ret > 0) {
63 btrfs_release_path(root, path);
64 return ERR_PTR(-ENOENT);
65 }
66
67 leaf = path->nodes[0];
68 header = btrfs_item_ptr(leaf, path->slots[0],
69 struct btrfs_free_space_header);
70 btrfs_free_space_key(leaf, header, &disk_key);
71 btrfs_disk_key_to_cpu(&location, &disk_key);
72 btrfs_release_path(root, path);
73
74 inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
75 if (!inode)
76 return ERR_PTR(-ENOENT);
77 if (IS_ERR(inode))
78 return inode;
79 if (is_bad_inode(inode)) {
80 iput(inode);
81 return ERR_PTR(-ENOENT);
82 }
83
84 spin_lock(&block_group->lock);
85 if (!root->fs_info->closing) {
86 block_group->inode = igrab(inode);
87 block_group->iref = 1;
88 }
89 spin_unlock(&block_group->lock);
90
91 return inode;
92}
93
94int create_free_space_inode(struct btrfs_root *root,
95 struct btrfs_trans_handle *trans,
96 struct btrfs_block_group_cache *block_group,
97 struct btrfs_path *path)
98{
99 struct btrfs_key key;
100 struct btrfs_disk_key disk_key;
101 struct btrfs_free_space_header *header;
102 struct btrfs_inode_item *inode_item;
103 struct extent_buffer *leaf;
104 u64 objectid;
105 int ret;
106
107 ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
108 if (ret < 0)
109 return ret;
110
111 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
112 if (ret)
113 return ret;
114
115 leaf = path->nodes[0];
116 inode_item = btrfs_item_ptr(leaf, path->slots[0],
117 struct btrfs_inode_item);
118 btrfs_item_key(leaf, &disk_key, path->slots[0]);
119 memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
120 sizeof(*inode_item));
121 btrfs_set_inode_generation(leaf, inode_item, trans->transid);
122 btrfs_set_inode_size(leaf, inode_item, 0);
123 btrfs_set_inode_nbytes(leaf, inode_item, 0);
124 btrfs_set_inode_uid(leaf, inode_item, 0);
125 btrfs_set_inode_gid(leaf, inode_item, 0);
126 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
127 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
128 BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
129 btrfs_set_inode_nlink(leaf, inode_item, 1);
130 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
131 btrfs_set_inode_block_group(leaf, inode_item,
132 block_group->key.objectid);
133 btrfs_mark_buffer_dirty(leaf);
134 btrfs_release_path(root, path);
135
136 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
137 key.offset = block_group->key.objectid;
138 key.type = 0;
139
140 ret = btrfs_insert_empty_item(trans, root, path, &key,
141 sizeof(struct btrfs_free_space_header));
142 if (ret < 0) {
143 btrfs_release_path(root, path);
144 return ret;
145 }
146 leaf = path->nodes[0];
147 header = btrfs_item_ptr(leaf, path->slots[0],
148 struct btrfs_free_space_header);
149 memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
150 btrfs_set_free_space_key(leaf, header, &disk_key);
151 btrfs_mark_buffer_dirty(leaf);
152 btrfs_release_path(root, path);
153
154 return 0;
155}
156
157int btrfs_truncate_free_space_cache(struct btrfs_root *root,
158 struct btrfs_trans_handle *trans,
159 struct btrfs_path *path,
160 struct inode *inode)
161{
162 loff_t oldsize;
163 int ret = 0;
164
165 trans->block_rsv = root->orphan_block_rsv;
166 ret = btrfs_block_rsv_check(trans, root,
167 root->orphan_block_rsv,
168 0, 5);
169 if (ret)
170 return ret;
171
172 oldsize = i_size_read(inode);
173 btrfs_i_size_write(inode, 0);
174 truncate_pagecache(inode, oldsize, 0);
175
176 /*
177 * We don't need an orphan item because truncating the free space cache
178 * will never be split across transactions.
179 */
180 ret = btrfs_truncate_inode_items(trans, root, inode,
181 0, BTRFS_EXTENT_DATA_KEY);
182 if (ret) {
183 WARN_ON(1);
184 return ret;
185 }
186
187 return btrfs_update_inode(trans, root, inode);
188}
189
190static int readahead_cache(struct inode *inode)
191{
192 struct file_ra_state *ra;
193 unsigned long last_index;
194
195 ra = kzalloc(sizeof(*ra), GFP_NOFS);
196 if (!ra)
197 return -ENOMEM;
198
199 file_ra_state_init(ra, inode->i_mapping);
200 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
201
202 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
203
204 kfree(ra);
205
206 return 0;
207}
208
209int load_free_space_cache(struct btrfs_fs_info *fs_info,
210 struct btrfs_block_group_cache *block_group)
211{
212 struct btrfs_root *root = fs_info->tree_root;
213 struct inode *inode;
214 struct btrfs_free_space_header *header;
215 struct extent_buffer *leaf;
216 struct page *page;
217 struct btrfs_path *path;
218 u32 *checksums = NULL, *crc;
219 char *disk_crcs = NULL;
220 struct btrfs_key key;
221 struct list_head bitmaps;
222 u64 num_entries;
223 u64 num_bitmaps;
224 u64 generation;
225 u32 cur_crc = ~(u32)0;
226 pgoff_t index = 0;
227 unsigned long first_page_offset;
228 int num_checksums;
229 int ret = 0;
230
231 /*
232 * If we're unmounting then just return, since this does a search on the
233 * normal root and not the commit root and we could deadlock.
234 */
235 smp_mb();
236 if (fs_info->closing)
237 return 0;
238
239 /*
240 * If this block group has been marked to be cleared for one reason or
241 * another then we can't trust the on disk cache, so just return.
242 */
243 spin_lock(&block_group->lock);
244 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
245 spin_unlock(&block_group->lock);
246 return 0;
247 }
248 spin_unlock(&block_group->lock);
249
250 INIT_LIST_HEAD(&bitmaps);
251
252 path = btrfs_alloc_path();
253 if (!path)
254 return 0;
255
256 inode = lookup_free_space_inode(root, block_group, path);
257 if (IS_ERR(inode)) {
258 btrfs_free_path(path);
259 return 0;
260 }
261
262 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) {
264 btrfs_free_path(path);
265 goto out;
266 }
267
268 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
269 key.offset = block_group->key.objectid;
270 key.type = 0;
271
272 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
273 if (ret) {
274 btrfs_free_path(path);
275 goto out;
276 }
277
278 leaf = path->nodes[0];
279 header = btrfs_item_ptr(leaf, path->slots[0],
280 struct btrfs_free_space_header);
281 num_entries = btrfs_free_space_entries(leaf, header);
282 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
283 generation = btrfs_free_space_generation(leaf, header);
284 btrfs_free_path(path);
285
286 if (BTRFS_I(inode)->generation != generation) {
287 printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
288 " not match free space cache generation (%llu) for "
289 "block group %llu\n",
290 (unsigned long long)BTRFS_I(inode)->generation,
291 (unsigned long long)generation,
292 (unsigned long long)block_group->key.objectid);
293 goto out;
294 }
295
296 if (!num_entries)
297 goto out;
298
299 /* Setup everything for doing checksumming */
300 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
301 checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
302 if (!checksums)
303 goto out;
304 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
305 disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
306 if (!disk_crcs)
307 goto out;
308
309 ret = readahead_cache(inode);
310 if (ret) {
311 ret = 0;
312 goto out;
313 }
314
315 while (1) {
316 struct btrfs_free_space_entry *entry;
317 struct btrfs_free_space *e;
318 void *addr;
319 unsigned long offset = 0;
320 unsigned long start_offset = 0;
321 int need_loop = 0;
322
323 if (!num_entries && !num_bitmaps)
324 break;
325
326 if (index == 0) {
327 start_offset = first_page_offset;
328 offset = start_offset;
329 }
330
331 page = grab_cache_page(inode->i_mapping, index);
332 if (!page) {
333 ret = 0;
334 goto free_cache;
335 }
336
337 if (!PageUptodate(page)) {
338 btrfs_readpage(NULL, page);
339 lock_page(page);
340 if (!PageUptodate(page)) {
341 unlock_page(page);
342 page_cache_release(page);
343 printk(KERN_ERR "btrfs: error reading free "
344 "space cache: %llu\n",
345 (unsigned long long)
346 block_group->key.objectid);
347 goto free_cache;
348 }
349 }
350 addr = kmap(page);
351
352 if (index == 0) {
353 u64 *gen;
354
355 memcpy(disk_crcs, addr, first_page_offset);
356 gen = addr + (sizeof(u32) * num_checksums);
357 if (*gen != BTRFS_I(inode)->generation) {
358 printk(KERN_ERR "btrfs: space cache generation"
359 " (%llu) does not match inode (%llu) "
360 "for block group %llu\n",
361 (unsigned long long)*gen,
362 (unsigned long long)
363 BTRFS_I(inode)->generation,
364 (unsigned long long)
365 block_group->key.objectid);
366 kunmap(page);
367 unlock_page(page);
368 page_cache_release(page);
369 goto free_cache;
370 }
371 crc = (u32 *)disk_crcs;
372 }
373 entry = addr + start_offset;
374
375 /* First lets check our crc before we do anything fun */
376 cur_crc = ~(u32)0;
377 cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
378 PAGE_CACHE_SIZE - start_offset);
379 btrfs_csum_final(cur_crc, (char *)&cur_crc);
380 if (cur_crc != *crc) {
381 printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
382 "block group %llu\n", index,
383 (unsigned long long)block_group->key.objectid);
384 kunmap(page);
385 unlock_page(page);
386 page_cache_release(page);
387 goto free_cache;
388 }
389 crc++;
390
391 while (1) {
392 if (!num_entries)
393 break;
394
395 need_loop = 1;
396 e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
397 if (!e) {
398 kunmap(page);
399 unlock_page(page);
400 page_cache_release(page);
401 goto free_cache;
402 }
403
404 e->offset = le64_to_cpu(entry->offset);
405 e->bytes = le64_to_cpu(entry->bytes);
406 if (!e->bytes) {
407 kunmap(page);
408 kfree(e);
409 unlock_page(page);
410 page_cache_release(page);
411 goto free_cache;
412 }
413
414 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
415 spin_lock(&block_group->tree_lock);
416 ret = link_free_space(block_group, e);
417 spin_unlock(&block_group->tree_lock);
418 BUG_ON(ret);
419 } else {
420 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
421 if (!e->bitmap) {
422 kunmap(page);
423 kfree(e);
424 unlock_page(page);
425 page_cache_release(page);
426 goto free_cache;
427 }
428 spin_lock(&block_group->tree_lock);
429 ret = link_free_space(block_group, e);
430 block_group->total_bitmaps++;
431 recalculate_thresholds(block_group);
432 spin_unlock(&block_group->tree_lock);
433 list_add_tail(&e->list, &bitmaps);
434 }
435
436 num_entries--;
437 offset += sizeof(struct btrfs_free_space_entry);
438 if (offset + sizeof(struct btrfs_free_space_entry) >=
439 PAGE_CACHE_SIZE)
440 break;
441 entry++;
442 }
443
444 /*
445 * We read an entry out of this page, we need to move on to the
446 * next page.
447 */
448 if (need_loop) {
449 kunmap(page);
450 goto next;
451 }
452
453 /*
454 * We add the bitmaps at the end of the entries in order that
455 * the bitmap entries are added to the cache.
456 */
457 e = list_entry(bitmaps.next, struct btrfs_free_space, list);
458 list_del_init(&e->list);
459 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
460 kunmap(page);
461 num_bitmaps--;
462next:
463 unlock_page(page);
464 page_cache_release(page);
465 index++;
466 }
467
468 ret = 1;
469out:
470 kfree(checksums);
471 kfree(disk_crcs);
472 iput(inode);
473 return ret;
474
475free_cache:
476 /* This cache is bogus, make sure it gets cleared */
477 spin_lock(&block_group->lock);
478 block_group->disk_cache_state = BTRFS_DC_CLEAR;
479 spin_unlock(&block_group->lock);
480 btrfs_remove_free_space_cache(block_group);
481 goto out;
482}
483
484int btrfs_write_out_cache(struct btrfs_root *root,
485 struct btrfs_trans_handle *trans,
486 struct btrfs_block_group_cache *block_group,
487 struct btrfs_path *path)
488{
489 struct btrfs_free_space_header *header;
490 struct extent_buffer *leaf;
491 struct inode *inode;
492 struct rb_node *node;
493 struct list_head *pos, *n;
494 struct page *page;
495 struct extent_state *cached_state = NULL;
496 struct list_head bitmap_list;
497 struct btrfs_key key;
498 u64 bytes = 0;
499 u32 *crc, *checksums;
500 pgoff_t index = 0, last_index = 0;
501 unsigned long first_page_offset;
502 int num_checksums;
503 int entries = 0;
504 int bitmaps = 0;
505 int ret = 0;
506
507 root = root->fs_info->tree_root;
508
509 INIT_LIST_HEAD(&bitmap_list);
510
511 spin_lock(&block_group->lock);
512 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
513 spin_unlock(&block_group->lock);
514 return 0;
515 }
516 spin_unlock(&block_group->lock);
517
518 inode = lookup_free_space_inode(root, block_group, path);
519 if (IS_ERR(inode))
520 return 0;
521
522 if (!i_size_read(inode)) {
523 iput(inode);
524 return 0;
525 }
526
527 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
528 filemap_write_and_wait(inode->i_mapping);
529 btrfs_wait_ordered_range(inode, inode->i_size &
530 ~(root->sectorsize - 1), (u64)-1);
531
532 /* We need a checksum per page. */
533 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
534 crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
535 if (!crc) {
536 iput(inode);
537 return 0;
538 }
539
540 /* Since the first page has all of our checksums and our generation we
541 * need to calculate the offset into the page that we can start writing
542 * our entries.
543 */
544 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
545
546 node = rb_first(&block_group->free_space_offset);
547 if (!node)
548 goto out_free;
549
550 /*
551 * Lock all pages first so we can lock the extent safely.
552 *
553 * NOTE: Because we hold the ref the entire time we're going to write to
554 * the page find_get_page should never fail, so we don't do a check
555 * after find_get_page at this point. Just putting this here so people
556 * know and don't freak out.
557 */
558 while (index <= last_index) {
559 page = grab_cache_page(inode->i_mapping, index);
560 if (!page) {
561 pgoff_t i = 0;
562
563 while (i < index) {
564 page = find_get_page(inode->i_mapping, i);
565 unlock_page(page);
566 page_cache_release(page);
567 page_cache_release(page);
568 i++;
569 }
570 goto out_free;
571 }
572 index++;
573 }
574
575 index = 0;
576 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
577 0, &cached_state, GFP_NOFS);
578
579 /* Write out the extent entries */
580 do {
581 struct btrfs_free_space_entry *entry;
582 void *addr;
583 unsigned long offset = 0;
584 unsigned long start_offset = 0;
585
586 if (index == 0) {
587 start_offset = first_page_offset;
588 offset = start_offset;
589 }
590
591 page = find_get_page(inode->i_mapping, index);
592
593 addr = kmap(page);
594 entry = addr + start_offset;
595
596 memset(addr, 0, PAGE_CACHE_SIZE);
597 while (1) {
598 struct btrfs_free_space *e;
599
600 e = rb_entry(node, struct btrfs_free_space, offset_index);
601 entries++;
602
603 entry->offset = cpu_to_le64(e->offset);
604 entry->bytes = cpu_to_le64(e->bytes);
605 if (e->bitmap) {
606 entry->type = BTRFS_FREE_SPACE_BITMAP;
607 list_add_tail(&e->list, &bitmap_list);
608 bitmaps++;
609 } else {
610 entry->type = BTRFS_FREE_SPACE_EXTENT;
611 }
612 node = rb_next(node);
613 if (!node)
614 break;
615 offset += sizeof(struct btrfs_free_space_entry);
616 if (offset + sizeof(struct btrfs_free_space_entry) >=
617 PAGE_CACHE_SIZE)
618 break;
619 entry++;
620 }
621 *crc = ~(u32)0;
622 *crc = btrfs_csum_data(root, addr + start_offset, *crc,
623 PAGE_CACHE_SIZE - start_offset);
624 kunmap(page);
625
626 btrfs_csum_final(*crc, (char *)crc);
627 crc++;
628
629 bytes += PAGE_CACHE_SIZE;
630
631 ClearPageChecked(page);
632 set_page_extent_mapped(page);
633 SetPageUptodate(page);
634 set_page_dirty(page);
635
636 /*
637 * We need to release our reference we got for grab_cache_page,
638 * except for the first page which will hold our checksums, we
639 * do that below.
640 */
641 if (index != 0) {
642 unlock_page(page);
643 page_cache_release(page);
644 }
645
646 page_cache_release(page);
647
648 index++;
649 } while (node);
650
651 /* Write out the bitmaps */
652 list_for_each_safe(pos, n, &bitmap_list) {
653 void *addr;
654 struct btrfs_free_space *entry =
655 list_entry(pos, struct btrfs_free_space, list);
656
657 page = find_get_page(inode->i_mapping, index);
658
659 addr = kmap(page);
660 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
661 *crc = ~(u32)0;
662 *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
663 kunmap(page);
664 btrfs_csum_final(*crc, (char *)crc);
665 crc++;
666 bytes += PAGE_CACHE_SIZE;
667
668 ClearPageChecked(page);
669 set_page_extent_mapped(page);
670 SetPageUptodate(page);
671 set_page_dirty(page);
672 unlock_page(page);
673 page_cache_release(page);
674 page_cache_release(page);
675 list_del_init(&entry->list);
676 index++;
677 }
678
679 /* Zero out the rest of the pages just to make sure */
680 while (index <= last_index) {
681 void *addr;
682
683 page = find_get_page(inode->i_mapping, index);
684
685 addr = kmap(page);
686 memset(addr, 0, PAGE_CACHE_SIZE);
687 kunmap(page);
688 ClearPageChecked(page);
689 set_page_extent_mapped(page);
690 SetPageUptodate(page);
691 set_page_dirty(page);
692 unlock_page(page);
693 page_cache_release(page);
694 page_cache_release(page);
695 bytes += PAGE_CACHE_SIZE;
696 index++;
697 }
698
699 btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
700
701 /* Write the checksums and trans id to the first page */
702 {
703 void *addr;
704 u64 *gen;
705
706 page = find_get_page(inode->i_mapping, 0);
707
708 addr = kmap(page);
709 memcpy(addr, checksums, sizeof(u32) * num_checksums);
710 gen = addr + (sizeof(u32) * num_checksums);
711 *gen = trans->transid;
712 kunmap(page);
713 ClearPageChecked(page);
714 set_page_extent_mapped(page);
715 SetPageUptodate(page);
716 set_page_dirty(page);
717 unlock_page(page);
718 page_cache_release(page);
719 page_cache_release(page);
720 }
721 BTRFS_I(inode)->generation = trans->transid;
722
723 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
724 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
725
726 filemap_write_and_wait(inode->i_mapping);
727
728 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
729 key.offset = block_group->key.objectid;
730 key.type = 0;
731
732 ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
733 if (ret < 0) {
734 ret = 0;
735 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
736 EXTENT_DIRTY | EXTENT_DELALLOC |
737 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
738 goto out_free;
739 }
740 leaf = path->nodes[0];
741 if (ret > 0) {
742 struct btrfs_key found_key;
743 BUG_ON(!path->slots[0]);
744 path->slots[0]--;
745 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
746 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
747 found_key.offset != block_group->key.objectid) {
748 ret = 0;
749 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
750 EXTENT_DIRTY | EXTENT_DELALLOC |
751 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
752 GFP_NOFS);
753 btrfs_release_path(root, path);
754 goto out_free;
755 }
756 }
757 header = btrfs_item_ptr(leaf, path->slots[0],
758 struct btrfs_free_space_header);
759 btrfs_set_free_space_entries(leaf, header, entries);
760 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
761 btrfs_set_free_space_generation(leaf, header, trans->transid);
762 btrfs_mark_buffer_dirty(leaf);
763 btrfs_release_path(root, path);
764
765 ret = 1;
766
767out_free:
768 if (ret == 0) {
769 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
770 spin_lock(&block_group->lock);
771 block_group->disk_cache_state = BTRFS_DC_ERROR;
772 spin_unlock(&block_group->lock);
773 BTRFS_I(inode)->generation = 0;
774 }
775 kfree(checksums);
776 btrfs_update_inode(trans, root, inode);
777 iput(inode);
778 return ret;
779}
780
30static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, 781static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
31 u64 offset) 782 u64 offset)
32{ 783{
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 890a8e79011b..e49ca5c321b5 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,24 @@ struct btrfs_free_space {
27 struct list_head list; 27 struct list_head list;
28}; 28};
29 29
30struct inode *lookup_free_space_inode(struct btrfs_root *root,
31 struct btrfs_block_group_cache
32 *block_group, struct btrfs_path *path);
33int create_free_space_inode(struct btrfs_root *root,
34 struct btrfs_trans_handle *trans,
35 struct btrfs_block_group_cache *block_group,
36 struct btrfs_path *path);
37
38int btrfs_truncate_free_space_cache(struct btrfs_root *root,
39 struct btrfs_trans_handle *trans,
40 struct btrfs_path *path,
41 struct inode *inode);
42int load_free_space_cache(struct btrfs_fs_info *fs_info,
43 struct btrfs_block_group_cache *block_group);
44int btrfs_write_out_cache(struct btrfs_root *root,
45 struct btrfs_trans_handle *trans,
46 struct btrfs_block_group_cache *block_group,
47 struct btrfs_path *path);
30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 48int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytenr, u64 size); 49 u64 bytenr, u64 size);
32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 50int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03864406af3..558cac2dfa54 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -319,8 +319,6 @@ static noinline int compress_file_range(struct inode *inode,
319 struct btrfs_root *root = BTRFS_I(inode)->root; 319 struct btrfs_root *root = BTRFS_I(inode)->root;
320 struct btrfs_trans_handle *trans; 320 struct btrfs_trans_handle *trans;
321 u64 num_bytes; 321 u64 num_bytes;
322 u64 orig_start;
323 u64 disk_num_bytes;
324 u64 blocksize = root->sectorsize; 322 u64 blocksize = root->sectorsize;
325 u64 actual_end; 323 u64 actual_end;
326 u64 isize = i_size_read(inode); 324 u64 isize = i_size_read(inode);
@@ -335,8 +333,6 @@ static noinline int compress_file_range(struct inode *inode,
335 int i; 333 int i;
336 int will_compress; 334 int will_compress;
337 335
338 orig_start = start;
339
340 actual_end = min_t(u64, isize, end + 1); 336 actual_end = min_t(u64, isize, end + 1);
341again: 337again:
342 will_compress = 0; 338 will_compress = 0;
@@ -371,7 +367,6 @@ again:
371 total_compressed = min(total_compressed, max_uncompressed); 367 total_compressed = min(total_compressed, max_uncompressed);
372 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 368 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
373 num_bytes = max(blocksize, num_bytes); 369 num_bytes = max(blocksize, num_bytes);
374 disk_num_bytes = num_bytes;
375 total_in = 0; 370 total_in = 0;
376 ret = 0; 371 ret = 0;
377 372
@@ -467,7 +462,6 @@ again:
467 if (total_compressed >= total_in) { 462 if (total_compressed >= total_in) {
468 will_compress = 0; 463 will_compress = 0;
469 } else { 464 } else {
470 disk_num_bytes = total_compressed;
471 num_bytes = total_in; 465 num_bytes = total_in;
472 } 466 }
473 } 467 }
@@ -757,20 +751,17 @@ static noinline int cow_file_range(struct inode *inode,
757 u64 disk_num_bytes; 751 u64 disk_num_bytes;
758 u64 cur_alloc_size; 752 u64 cur_alloc_size;
759 u64 blocksize = root->sectorsize; 753 u64 blocksize = root->sectorsize;
760 u64 actual_end;
761 u64 isize = i_size_read(inode);
762 struct btrfs_key ins; 754 struct btrfs_key ins;
763 struct extent_map *em; 755 struct extent_map *em;
764 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 756 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
765 int ret = 0; 757 int ret = 0;
766 758
759 BUG_ON(root == root->fs_info->tree_root);
767 trans = btrfs_join_transaction(root, 1); 760 trans = btrfs_join_transaction(root, 1);
768 BUG_ON(!trans); 761 BUG_ON(!trans);
769 btrfs_set_trans_block_group(trans, inode); 762 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 763 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
771 764
772 actual_end = min_t(u64, isize, end + 1);
773
774 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 765 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
775 num_bytes = max(blocksize, num_bytes); 766 num_bytes = max(blocksize, num_bytes);
776 disk_num_bytes = num_bytes; 767 disk_num_bytes = num_bytes;
@@ -1035,10 +1026,16 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1035 int type; 1026 int type;
1036 int nocow; 1027 int nocow;
1037 int check_prev = 1; 1028 int check_prev = 1;
1029 bool nolock = false;
1038 1030
1039 path = btrfs_alloc_path(); 1031 path = btrfs_alloc_path();
1040 BUG_ON(!path); 1032 BUG_ON(!path);
1041 trans = btrfs_join_transaction(root, 1); 1033 if (root == root->fs_info->tree_root) {
1034 nolock = true;
1035 trans = btrfs_join_transaction_nolock(root, 1);
1036 } else {
1037 trans = btrfs_join_transaction(root, 1);
1038 }
1042 BUG_ON(!trans); 1039 BUG_ON(!trans);
1043 1040
1044 cow_start = (u64)-1; 1041 cow_start = (u64)-1;
@@ -1211,8 +1208,13 @@ out_check:
1211 BUG_ON(ret); 1208 BUG_ON(ret);
1212 } 1209 }
1213 1210
1214 ret = btrfs_end_transaction(trans, root); 1211 if (nolock) {
1215 BUG_ON(ret); 1212 ret = btrfs_end_transaction_nolock(trans, root);
1213 BUG_ON(ret);
1214 } else {
1215 ret = btrfs_end_transaction(trans, root);
1216 BUG_ON(ret);
1217 }
1216 btrfs_free_path(path); 1218 btrfs_free_path(path);
1217 return 0; 1219 return 0;
1218} 1220}
@@ -1289,6 +1291,8 @@ static int btrfs_set_bit_hook(struct inode *inode,
1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1291 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1290 struct btrfs_root *root = BTRFS_I(inode)->root; 1292 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start; 1293 u64 len = state->end + 1 - state->start;
1294 int do_list = (root->root_key.objectid !=
1295 BTRFS_ROOT_TREE_OBJECTID);
1292 1296
1293 if (*bits & EXTENT_FIRST_DELALLOC) 1297 if (*bits & EXTENT_FIRST_DELALLOC)
1294 *bits &= ~EXTENT_FIRST_DELALLOC; 1298 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1298,7 +1302,7 @@ static int btrfs_set_bit_hook(struct inode *inode,
1298 spin_lock(&root->fs_info->delalloc_lock); 1302 spin_lock(&root->fs_info->delalloc_lock);
1299 BTRFS_I(inode)->delalloc_bytes += len; 1303 BTRFS_I(inode)->delalloc_bytes += len;
1300 root->fs_info->delalloc_bytes += len; 1304 root->fs_info->delalloc_bytes += len;
1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1305 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1306 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1303 &root->fs_info->delalloc_inodes); 1307 &root->fs_info->delalloc_inodes);
1304 } 1308 }
@@ -1321,6 +1325,8 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1325 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1322 struct btrfs_root *root = BTRFS_I(inode)->root; 1326 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start; 1327 u64 len = state->end + 1 - state->start;
1328 int do_list = (root->root_key.objectid !=
1329 BTRFS_ROOT_TREE_OBJECTID);
1324 1330
1325 if (*bits & EXTENT_FIRST_DELALLOC) 1331 if (*bits & EXTENT_FIRST_DELALLOC)
1326 *bits &= ~EXTENT_FIRST_DELALLOC; 1332 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1330,14 +1336,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1330 if (*bits & EXTENT_DO_ACCOUNTING) 1336 if (*bits & EXTENT_DO_ACCOUNTING)
1331 btrfs_delalloc_release_metadata(inode, len); 1337 btrfs_delalloc_release_metadata(inode, len);
1332 1338
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) 1339 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1340 && do_list)
1334 btrfs_free_reserved_data_space(inode, len); 1341 btrfs_free_reserved_data_space(inode, len);
1335 1342
1336 spin_lock(&root->fs_info->delalloc_lock); 1343 spin_lock(&root->fs_info->delalloc_lock);
1337 root->fs_info->delalloc_bytes -= len; 1344 root->fs_info->delalloc_bytes -= len;
1338 BTRFS_I(inode)->delalloc_bytes -= len; 1345 BTRFS_I(inode)->delalloc_bytes -= len;
1339 1346
1340 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1347 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1348 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1349 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1343 } 1350 }
@@ -1372,7 +1379,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1372 1379
1373 if (map_length < length + size) 1380 if (map_length < length + size)
1374 return 1; 1381 return 1;
1375 return 0; 1382 return ret;
1376} 1383}
1377 1384
1378/* 1385/*
@@ -1426,7 +1433,10 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1426 1433
1427 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1434 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1428 1435
1429 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1436 if (root == root->fs_info->tree_root)
1437 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1438 else
1439 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1430 BUG_ON(ret); 1440 BUG_ON(ret);
1431 1441
1432 if (!(rw & REQ_WRITE)) { 1442 if (!(rw & REQ_WRITE)) {
@@ -1662,6 +1672,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1662 struct extent_state *cached_state = NULL; 1672 struct extent_state *cached_state = NULL;
1663 int compressed = 0; 1673 int compressed = 0;
1664 int ret; 1674 int ret;
1675 bool nolock = false;
1665 1676
1666 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 1677 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1667 end - start + 1); 1678 end - start + 1);
@@ -1669,11 +1680,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1669 return 0; 1680 return 0;
1670 BUG_ON(!ordered_extent); 1681 BUG_ON(!ordered_extent);
1671 1682
1683 nolock = (root == root->fs_info->tree_root);
1684
1672 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1685 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1673 BUG_ON(!list_empty(&ordered_extent->list)); 1686 BUG_ON(!list_empty(&ordered_extent->list));
1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1687 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1675 if (!ret) { 1688 if (!ret) {
1676 trans = btrfs_join_transaction(root, 1); 1689 if (nolock)
1690 trans = btrfs_join_transaction_nolock(root, 1);
1691 else
1692 trans = btrfs_join_transaction(root, 1);
1693 BUG_ON(!trans);
1677 btrfs_set_trans_block_group(trans, inode); 1694 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1695 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1679 ret = btrfs_update_inode(trans, root, inode); 1696 ret = btrfs_update_inode(trans, root, inode);
@@ -1686,7 +1703,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1686 ordered_extent->file_offset + ordered_extent->len - 1, 1703 ordered_extent->file_offset + ordered_extent->len - 1,
1687 0, &cached_state, GFP_NOFS); 1704 0, &cached_state, GFP_NOFS);
1688 1705
1689 trans = btrfs_join_transaction(root, 1); 1706 if (nolock)
1707 trans = btrfs_join_transaction_nolock(root, 1);
1708 else
1709 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode); 1710 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1711 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1692 1712
@@ -1700,6 +1720,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1700 ordered_extent->len); 1720 ordered_extent->len);
1701 BUG_ON(ret); 1721 BUG_ON(ret);
1702 } else { 1722 } else {
1723 BUG_ON(root == root->fs_info->tree_root);
1703 ret = insert_reserved_file_extent(trans, inode, 1724 ret = insert_reserved_file_extent(trans, inode,
1704 ordered_extent->file_offset, 1725 ordered_extent->file_offset,
1705 ordered_extent->start, 1726 ordered_extent->start,
@@ -1724,9 +1745,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1724 ret = btrfs_update_inode(trans, root, inode); 1745 ret = btrfs_update_inode(trans, root, inode);
1725 BUG_ON(ret); 1746 BUG_ON(ret);
1726out: 1747out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1748 if (nolock) {
1728 if (trans) 1749 if (trans)
1729 btrfs_end_transaction(trans, root); 1750 btrfs_end_transaction_nolock(trans, root);
1751 } else {
1752 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1753 if (trans)
1754 btrfs_end_transaction(trans, root);
1755 }
1756
1730 /* once for us */ 1757 /* once for us */
1731 btrfs_put_ordered_extent(ordered_extent); 1758 btrfs_put_ordered_extent(ordered_extent);
1732 /* once for the tree */ 1759 /* once for the tree */
@@ -2237,7 +2264,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2237{ 2264{
2238 struct btrfs_path *path; 2265 struct btrfs_path *path;
2239 struct extent_buffer *leaf; 2266 struct extent_buffer *leaf;
2240 struct btrfs_item *item;
2241 struct btrfs_key key, found_key; 2267 struct btrfs_key key, found_key;
2242 struct btrfs_trans_handle *trans; 2268 struct btrfs_trans_handle *trans;
2243 struct inode *inode; 2269 struct inode *inode;
@@ -2275,7 +2301,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2275 2301
2276 /* pull out the item */ 2302 /* pull out the item */
2277 leaf = path->nodes[0]; 2303 leaf = path->nodes[0];
2278 item = btrfs_item_nr(leaf, path->slots[0]);
2279 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2304 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2280 2305
2281 /* make sure the item matches what we want */ 2306 /* make sure the item matches what we want */
@@ -2651,7 +2676,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2651 2676
2652 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2677 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2653 dir, index); 2678 dir, index);
2654 BUG_ON(ret); 2679 if (ret == -ENOENT)
2680 ret = 0;
2655err: 2681err:
2656 btrfs_free_path(path); 2682 btrfs_free_path(path);
2657 if (ret) 2683 if (ret)
@@ -2672,8 +2698,8 @@ static int check_path_shared(struct btrfs_root *root,
2672{ 2698{
2673 struct extent_buffer *eb; 2699 struct extent_buffer *eb;
2674 int level; 2700 int level;
2675 int ret;
2676 u64 refs = 1; 2701 u64 refs = 1;
2702 int uninitialized_var(ret);
2677 2703
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2704 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level]) 2705 if (!path->nodes[level])
@@ -2686,7 +2712,7 @@ static int check_path_shared(struct btrfs_root *root,
2686 if (refs > 1) 2712 if (refs > 1)
2687 return 1; 2713 return 1;
2688 } 2714 }
2689 return 0; 2715 return ret; /* XXX callers? */
2690} 2716}
2691 2717
2692/* 2718/*
@@ -3196,7 +3222,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3196 3222
3197 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3223 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3198 3224
3199 if (root->ref_cows) 3225 if (root->ref_cows || root == root->fs_info->tree_root)
3200 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3226 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
3201 3227
3202 path = btrfs_alloc_path(); 3228 path = btrfs_alloc_path();
@@ -3344,7 +3370,8 @@ delete:
3344 } else { 3370 } else {
3345 break; 3371 break;
3346 } 3372 }
3347 if (found_extent && root->ref_cows) { 3373 if (found_extent && (root->ref_cows ||
3374 root == root->fs_info->tree_root)) {
3348 btrfs_set_path_blocking(path); 3375 btrfs_set_path_blocking(path);
3349 ret = btrfs_free_extent(trans, root, extent_start, 3376 ret = btrfs_free_extent(trans, root, extent_start,
3350 extent_num_bytes, 0, 3377 extent_num_bytes, 0,
@@ -3675,7 +3702,8 @@ void btrfs_evict_inode(struct inode *inode)
3675 int ret; 3702 int ret;
3676 3703
3677 truncate_inode_pages(&inode->i_data, 0); 3704 truncate_inode_pages(&inode->i_data, 0);
3678 if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0) 3705 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3706 root == root->fs_info->tree_root))
3679 goto no_delete; 3707 goto no_delete;
3680 3708
3681 if (is_bad_inode(inode)) { 3709 if (is_bad_inode(inode)) {
@@ -3849,7 +3877,7 @@ again:
3849 p = &root->inode_tree.rb_node; 3877 p = &root->inode_tree.rb_node;
3850 parent = NULL; 3878 parent = NULL;
3851 3879
3852 if (hlist_unhashed(&inode->i_hash)) 3880 if (inode_unhashed(inode))
3853 return; 3881 return;
3854 3882
3855 spin_lock(&root->inode_lock); 3883 spin_lock(&root->inode_lock);
@@ -3888,7 +3916,14 @@ static void inode_tree_del(struct inode *inode)
3888 } 3916 }
3889 spin_unlock(&root->inode_lock); 3917 spin_unlock(&root->inode_lock);
3890 3918
3891 if (empty && btrfs_root_refs(&root->root_item) == 0) { 3919 /*
3920 * Free space cache has inodes in the tree root, but the tree root has a
3921 * root_refs of 0, so this could end up dropping the tree root as a
3922 * snapshot, so we need the extra !root->fs_info->tree_root check to
3923 * make sure we don't drop it.
3924 */
3925 if (empty && btrfs_root_refs(&root->root_item) == 0 &&
3926 root != root->fs_info->tree_root) {
3892 synchronize_srcu(&root->fs_info->subvol_srcu); 3927 synchronize_srcu(&root->fs_info->subvol_srcu);
3893 spin_lock(&root->inode_lock); 3928 spin_lock(&root->inode_lock);
3894 empty = RB_EMPTY_ROOT(&root->inode_tree); 3929 empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -4282,14 +4317,24 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4282 struct btrfs_root *root = BTRFS_I(inode)->root; 4317 struct btrfs_root *root = BTRFS_I(inode)->root;
4283 struct btrfs_trans_handle *trans; 4318 struct btrfs_trans_handle *trans;
4284 int ret = 0; 4319 int ret = 0;
4320 bool nolock = false;
4285 4321
4286 if (BTRFS_I(inode)->dummy_inode) 4322 if (BTRFS_I(inode)->dummy_inode)
4287 return 0; 4323 return 0;
4288 4324
4325 smp_mb();
4326 nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
4327
4289 if (wbc->sync_mode == WB_SYNC_ALL) { 4328 if (wbc->sync_mode == WB_SYNC_ALL) {
4290 trans = btrfs_join_transaction(root, 1); 4329 if (nolock)
4330 trans = btrfs_join_transaction_nolock(root, 1);
4331 else
4332 trans = btrfs_join_transaction(root, 1);
4291 btrfs_set_trans_block_group(trans, inode); 4333 btrfs_set_trans_block_group(trans, inode);
4292 ret = btrfs_commit_transaction(trans, root); 4334 if (nolock)
4335 ret = btrfs_end_transaction_nolock(trans, root);
4336 else
4337 ret = btrfs_commit_transaction(trans, root);
4293 } 4338 }
4294 return ret; 4339 return ret;
4295} 4340}
@@ -4758,7 +4803,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4758 } 4803 }
4759 4804
4760 btrfs_set_trans_block_group(trans, dir); 4805 btrfs_set_trans_block_group(trans, dir);
4761 atomic_inc(&inode->i_count); 4806 ihold(inode);
4762 4807
4763 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4808 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
4764 4809
@@ -5645,7 +5690,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5645 struct btrfs_root *root = BTRFS_I(inode)->root; 5690 struct btrfs_root *root = BTRFS_I(inode)->root;
5646 struct btrfs_dio_private *dip; 5691 struct btrfs_dio_private *dip;
5647 struct bio_vec *bvec = bio->bi_io_vec; 5692 struct bio_vec *bvec = bio->bi_io_vec;
5648 u64 start;
5649 int skip_sum; 5693 int skip_sum;
5650 int write = rw & REQ_WRITE; 5694 int write = rw & REQ_WRITE;
5651 int ret = 0; 5695 int ret = 0;
@@ -5671,7 +5715,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5671 dip->inode = inode; 5715 dip->inode = inode;
5672 dip->logical_offset = file_offset; 5716 dip->logical_offset = file_offset;
5673 5717
5674 start = dip->logical_offset;
5675 dip->bytes = 0; 5718 dip->bytes = 0;
5676 do { 5719 do {
5677 dip->bytes += bvec->bv_len; 5720 dip->bytes += bvec->bv_len;
@@ -6308,6 +6351,21 @@ void btrfs_destroy_inode(struct inode *inode)
6308 spin_unlock(&root->fs_info->ordered_extent_lock); 6351 spin_unlock(&root->fs_info->ordered_extent_lock);
6309 } 6352 }
6310 6353
6354 if (root == root->fs_info->tree_root) {
6355 struct btrfs_block_group_cache *block_group;
6356
6357 block_group = btrfs_lookup_block_group(root->fs_info,
6358 BTRFS_I(inode)->block_group);
6359 if (block_group && block_group->inode == inode) {
6360 spin_lock(&block_group->lock);
6361 block_group->inode = NULL;
6362 spin_unlock(&block_group->lock);
6363 btrfs_put_block_group(block_group);
6364 } else if (block_group) {
6365 btrfs_put_block_group(block_group);
6366 }
6367 }
6368
6311 spin_lock(&root->orphan_lock); 6369 spin_lock(&root->orphan_lock);
6312 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6370 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
6313 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6371 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
@@ -6340,7 +6398,8 @@ int btrfs_drop_inode(struct inode *inode)
6340{ 6398{
6341 struct btrfs_root *root = BTRFS_I(inode)->root; 6399 struct btrfs_root *root = BTRFS_I(inode)->root;
6342 6400
6343 if (btrfs_root_refs(&root->root_item) == 0) 6401 if (btrfs_root_refs(&root->root_item) == 0 &&
6402 root != root->fs_info->tree_root)
6344 return 1; 6403 return 1;
6345 else 6404 else
6346 return generic_drop_inode(inode); 6405 return generic_drop_inode(inode);
@@ -6609,7 +6668,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
6609 return 0; 6668 return 0;
6610} 6669}
6611 6670
6612int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) 6671int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
6672 int sync)
6613{ 6673{
6614 struct btrfs_inode *binode; 6674 struct btrfs_inode *binode;
6615 struct inode *inode = NULL; 6675 struct inode *inode = NULL;
@@ -6631,7 +6691,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6631 spin_unlock(&root->fs_info->delalloc_lock); 6691 spin_unlock(&root->fs_info->delalloc_lock);
6632 6692
6633 if (inode) { 6693 if (inode) {
6634 write_inode_now(inode, 0); 6694 if (sync) {
6695 filemap_write_and_wait(inode->i_mapping);
6696 /*
6697 * We have to do this because compression doesn't
6698 * actually set PG_writeback until it submits the pages
6699 * for IO, which happens in an async thread, so we could
6700 * race and not actually wait for any writeback pages
6701 * because they've not been submitted yet. Technically
6702 * this could still be the case for the ordered stuff
6703 * since the async thread may not have started to do its
6704 * work yet. If this becomes the case then we need to
6705 * figure out a way to make sure that in writepage we
6706 * wait for any async pages to be submitted before
6707 * returning so that fdatawait does what its supposed to
6708 * do.
6709 */
6710 btrfs_wait_ordered_range(inode, 0, (u64)-1);
6711 } else {
6712 filemap_flush(inode->i_mapping);
6713 }
6635 if (delay_iput) 6714 if (delay_iput)
6636 btrfs_add_delayed_iput(inode); 6715 btrfs_add_delayed_iput(inode);
6637 else 6716 else
@@ -6757,27 +6836,33 @@ out_unlock:
6757 return err; 6836 return err;
6758} 6837}
6759 6838
6760int btrfs_prealloc_file_range(struct inode *inode, int mode, 6839static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6761 u64 start, u64 num_bytes, u64 min_size, 6840 u64 start, u64 num_bytes, u64 min_size,
6762 loff_t actual_len, u64 *alloc_hint) 6841 loff_t actual_len, u64 *alloc_hint,
6842 struct btrfs_trans_handle *trans)
6763{ 6843{
6764 struct btrfs_trans_handle *trans;
6765 struct btrfs_root *root = BTRFS_I(inode)->root; 6844 struct btrfs_root *root = BTRFS_I(inode)->root;
6766 struct btrfs_key ins; 6845 struct btrfs_key ins;
6767 u64 cur_offset = start; 6846 u64 cur_offset = start;
6768 int ret = 0; 6847 int ret = 0;
6848 bool own_trans = true;
6769 6849
6850 if (trans)
6851 own_trans = false;
6770 while (num_bytes > 0) { 6852 while (num_bytes > 0) {
6771 trans = btrfs_start_transaction(root, 3); 6853 if (own_trans) {
6772 if (IS_ERR(trans)) { 6854 trans = btrfs_start_transaction(root, 3);
6773 ret = PTR_ERR(trans); 6855 if (IS_ERR(trans)) {
6774 break; 6856 ret = PTR_ERR(trans);
6857 break;
6858 }
6775 } 6859 }
6776 6860
6777 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 6861 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6778 0, *alloc_hint, (u64)-1, &ins, 1); 6862 0, *alloc_hint, (u64)-1, &ins, 1);
6779 if (ret) { 6863 if (ret) {
6780 btrfs_end_transaction(trans, root); 6864 if (own_trans)
6865 btrfs_end_transaction(trans, root);
6781 break; 6866 break;
6782 } 6867 }
6783 6868
@@ -6810,11 +6895,30 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode,
6810 ret = btrfs_update_inode(trans, root, inode); 6895 ret = btrfs_update_inode(trans, root, inode);
6811 BUG_ON(ret); 6896 BUG_ON(ret);
6812 6897
6813 btrfs_end_transaction(trans, root); 6898 if (own_trans)
6899 btrfs_end_transaction(trans, root);
6814 } 6900 }
6815 return ret; 6901 return ret;
6816} 6902}
6817 6903
6904int btrfs_prealloc_file_range(struct inode *inode, int mode,
6905 u64 start, u64 num_bytes, u64 min_size,
6906 loff_t actual_len, u64 *alloc_hint)
6907{
6908 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
6909 min_size, actual_len, alloc_hint,
6910 NULL);
6911}
6912
6913int btrfs_prealloc_file_range_trans(struct inode *inode,
6914 struct btrfs_trans_handle *trans, int mode,
6915 u64 start, u64 num_bytes, u64 min_size,
6916 loff_t actual_len, u64 *alloc_hint)
6917{
6918 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
6919 min_size, actual_len, alloc_hint, trans);
6920}
6921
6818static long btrfs_fallocate(struct inode *inode, int mode, 6922static long btrfs_fallocate(struct inode *inode, int mode,
6819 loff_t offset, loff_t len) 6923 loff_t offset, loff_t len)
6820{ 6924{
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9254b3d58dbe..463d91b4dd3a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -224,7 +224,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
224 224
225static noinline int create_subvol(struct btrfs_root *root, 225static noinline int create_subvol(struct btrfs_root *root,
226 struct dentry *dentry, 226 struct dentry *dentry,
227 char *name, int namelen) 227 char *name, int namelen,
228 u64 *async_transid)
228{ 229{
229 struct btrfs_trans_handle *trans; 230 struct btrfs_trans_handle *trans;
230 struct btrfs_key key; 231 struct btrfs_key key;
@@ -338,13 +339,19 @@ static noinline int create_subvol(struct btrfs_root *root,
338 339
339 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 340 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
340fail: 341fail:
341 err = btrfs_commit_transaction(trans, root); 342 if (async_transid) {
343 *async_transid = trans->transid;
344 err = btrfs_commit_transaction_async(trans, root, 1);
345 } else {
346 err = btrfs_commit_transaction(trans, root);
347 }
342 if (err && !ret) 348 if (err && !ret)
343 ret = err; 349 ret = err;
344 return ret; 350 return ret;
345} 351}
346 352
347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) 353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
354 char *name, int namelen, u64 *async_transid)
348{ 355{
349 struct inode *inode; 356 struct inode *inode;
350 struct btrfs_pending_snapshot *pending_snapshot; 357 struct btrfs_pending_snapshot *pending_snapshot;
@@ -373,7 +380,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
373 380
374 list_add(&pending_snapshot->list, 381 list_add(&pending_snapshot->list,
375 &trans->transaction->pending_snapshots); 382 &trans->transaction->pending_snapshots);
376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); 383 if (async_transid) {
384 *async_transid = trans->transid;
385 ret = btrfs_commit_transaction_async(trans,
386 root->fs_info->extent_root, 1);
387 } else {
388 ret = btrfs_commit_transaction(trans,
389 root->fs_info->extent_root);
390 }
377 BUG_ON(ret); 391 BUG_ON(ret);
378 392
379 ret = pending_snapshot->error; 393 ret = pending_snapshot->error;
@@ -395,6 +409,76 @@ fail:
395 return ret; 409 return ret;
396} 410}
397 411
412/* copy of check_sticky in fs/namei.c()
413* It's inline, so penalty for filesystems that don't use sticky bit is
414* minimal.
415*/
416static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
417{
418 uid_t fsuid = current_fsuid();
419
420 if (!(dir->i_mode & S_ISVTX))
421 return 0;
422 if (inode->i_uid == fsuid)
423 return 0;
424 if (dir->i_uid == fsuid)
425 return 0;
426 return !capable(CAP_FOWNER);
427}
428
429/* copy of may_delete in fs/namei.c()
430 * Check whether we can remove a link victim from directory dir, check
431 * whether the type of victim is right.
432 * 1. We can't do it if dir is read-only (done in permission())
433 * 2. We should have write and exec permissions on dir
434 * 3. We can't remove anything from append-only dir
435 * 4. We can't do anything with immutable dir (done in permission())
436 * 5. If the sticky bit on dir is set we should either
437 * a. be owner of dir, or
438 * b. be owner of victim, or
439 * c. have CAP_FOWNER capability
440 * 6. If the victim is append-only or immutable we can't do antyhing with
441 * links pointing to it.
442 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
443 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
444 * 9. We can't remove a root or mountpoint.
445 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
446 * nfs_async_unlink().
447 */
448
449static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
450{
451 int error;
452
453 if (!victim->d_inode)
454 return -ENOENT;
455
456 BUG_ON(victim->d_parent->d_inode != dir);
457 audit_inode_child(victim, dir);
458
459 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
460 if (error)
461 return error;
462 if (IS_APPEND(dir))
463 return -EPERM;
464 if (btrfs_check_sticky(dir, victim->d_inode)||
465 IS_APPEND(victim->d_inode)||
466 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
467 return -EPERM;
468 if (isdir) {
469 if (!S_ISDIR(victim->d_inode->i_mode))
470 return -ENOTDIR;
471 if (IS_ROOT(victim))
472 return -EBUSY;
473 } else if (S_ISDIR(victim->d_inode->i_mode))
474 return -EISDIR;
475 if (IS_DEADDIR(dir))
476 return -ENOENT;
477 if (victim->d_flags & DCACHE_NFSFS_RENAMED)
478 return -EBUSY;
479 return 0;
480}
481
398/* copy of may_create in fs/namei.c() */ 482/* copy of may_create in fs/namei.c() */
399static inline int btrfs_may_create(struct inode *dir, struct dentry *child) 483static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
400{ 484{
@@ -412,7 +496,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
412 */ 496 */
413static noinline int btrfs_mksubvol(struct path *parent, 497static noinline int btrfs_mksubvol(struct path *parent,
414 char *name, int namelen, 498 char *name, int namelen,
415 struct btrfs_root *snap_src) 499 struct btrfs_root *snap_src,
500 u64 *async_transid)
416{ 501{
417 struct inode *dir = parent->dentry->d_inode; 502 struct inode *dir = parent->dentry->d_inode;
418 struct dentry *dentry; 503 struct dentry *dentry;
@@ -443,10 +528,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
443 goto out_up_read; 528 goto out_up_read;
444 529
445 if (snap_src) { 530 if (snap_src) {
446 error = create_snapshot(snap_src, dentry); 531 error = create_snapshot(snap_src, dentry,
532 name, namelen, async_transid);
447 } else { 533 } else {
448 error = create_subvol(BTRFS_I(dir)->root, dentry, 534 error = create_subvol(BTRFS_I(dir)->root, dentry,
449 name, namelen); 535 name, namelen, async_transid);
450 } 536 }
451 if (!error) 537 if (!error)
452 fsnotify_mkdir(dir, dentry); 538 fsnotify_mkdir(dir, dentry);
@@ -708,7 +794,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
708 char *sizestr; 794 char *sizestr;
709 char *devstr = NULL; 795 char *devstr = NULL;
710 int ret = 0; 796 int ret = 0;
711 int namelen;
712 int mod = 0; 797 int mod = 0;
713 798
714 if (root->fs_info->sb->s_flags & MS_RDONLY) 799 if (root->fs_info->sb->s_flags & MS_RDONLY)
@@ -722,7 +807,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
722 return PTR_ERR(vol_args); 807 return PTR_ERR(vol_args);
723 808
724 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 809 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
725 namelen = strlen(vol_args->name);
726 810
727 mutex_lock(&root->fs_info->volume_mutex); 811 mutex_lock(&root->fs_info->volume_mutex);
728 sizestr = vol_args->name; 812 sizestr = vol_args->name;
@@ -801,11 +885,13 @@ out_unlock:
801 return ret; 885 return ret;
802} 886}
803 887
804static noinline int btrfs_ioctl_snap_create(struct file *file, 888static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
805 void __user *arg, int subvol) 889 char *name,
890 unsigned long fd,
891 int subvol,
892 u64 *transid)
806{ 893{
807 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 894 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
808 struct btrfs_ioctl_vol_args *vol_args;
809 struct file *src_file; 895 struct file *src_file;
810 int namelen; 896 int namelen;
811 int ret = 0; 897 int ret = 0;
@@ -813,23 +899,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
813 if (root->fs_info->sb->s_flags & MS_RDONLY) 899 if (root->fs_info->sb->s_flags & MS_RDONLY)
814 return -EROFS; 900 return -EROFS;
815 901
816 vol_args = memdup_user(arg, sizeof(*vol_args)); 902 namelen = strlen(name);
817 if (IS_ERR(vol_args)) 903 if (strchr(name, '/')) {
818 return PTR_ERR(vol_args);
819
820 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
821 namelen = strlen(vol_args->name);
822 if (strchr(vol_args->name, '/')) {
823 ret = -EINVAL; 904 ret = -EINVAL;
824 goto out; 905 goto out;
825 } 906 }
826 907
827 if (subvol) { 908 if (subvol) {
828 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 909 ret = btrfs_mksubvol(&file->f_path, name, namelen,
829 NULL); 910 NULL, transid);
830 } else { 911 } else {
831 struct inode *src_inode; 912 struct inode *src_inode;
832 src_file = fget(vol_args->fd); 913 src_file = fget(fd);
833 if (!src_file) { 914 if (!src_file) {
834 ret = -EINVAL; 915 ret = -EINVAL;
835 goto out; 916 goto out;
@@ -843,12 +924,56 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
843 fput(src_file); 924 fput(src_file);
844 goto out; 925 goto out;
845 } 926 }
846 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 927 ret = btrfs_mksubvol(&file->f_path, name, namelen,
847 BTRFS_I(src_inode)->root); 928 BTRFS_I(src_inode)->root,
929 transid);
848 fput(src_file); 930 fput(src_file);
849 } 931 }
850out: 932out:
933 return ret;
934}
935
936static noinline int btrfs_ioctl_snap_create(struct file *file,
937 void __user *arg, int subvol,
938 int async)
939{
940 struct btrfs_ioctl_vol_args *vol_args = NULL;
941 struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
942 char *name;
943 u64 fd;
944 u64 transid = 0;
945 int ret;
946
947 if (async) {
948 async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
949 if (IS_ERR(async_vol_args))
950 return PTR_ERR(async_vol_args);
951
952 name = async_vol_args->name;
953 fd = async_vol_args->fd;
954 async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
955 } else {
956 vol_args = memdup_user(arg, sizeof(*vol_args));
957 if (IS_ERR(vol_args))
958 return PTR_ERR(vol_args);
959 name = vol_args->name;
960 fd = vol_args->fd;
961 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
962 }
963
964 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
965 subvol, &transid);
966
967 if (!ret && async) {
968 if (copy_to_user(arg +
969 offsetof(struct btrfs_ioctl_async_vol_args,
970 transid), &transid, sizeof(transid)))
971 return -EFAULT;
972 }
973
851 kfree(vol_args); 974 kfree(vol_args);
975 kfree(async_vol_args);
976
852 return ret; 977 return ret;
853} 978}
854 979
@@ -1073,14 +1198,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
1073 if (!capable(CAP_SYS_ADMIN)) 1198 if (!capable(CAP_SYS_ADMIN))
1074 return -EPERM; 1199 return -EPERM;
1075 1200
1076 args = kmalloc(sizeof(*args), GFP_KERNEL); 1201 args = memdup_user(argp, sizeof(*args));
1077 if (!args) 1202 if (IS_ERR(args))
1078 return -ENOMEM; 1203 return PTR_ERR(args);
1079 1204
1080 if (copy_from_user(args, argp, sizeof(*args))) {
1081 kfree(args);
1082 return -EFAULT;
1083 }
1084 inode = fdentry(file)->d_inode; 1205 inode = fdentry(file)->d_inode;
1085 ret = search_ioctl(inode, args); 1206 ret = search_ioctl(inode, args);
1086 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 1207 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
@@ -1188,14 +1309,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1188 if (!capable(CAP_SYS_ADMIN)) 1309 if (!capable(CAP_SYS_ADMIN))
1189 return -EPERM; 1310 return -EPERM;
1190 1311
1191 args = kmalloc(sizeof(*args), GFP_KERNEL); 1312 args = memdup_user(argp, sizeof(*args));
1192 if (!args) 1313 if (IS_ERR(args))
1193 return -ENOMEM; 1314 return PTR_ERR(args);
1194 1315
1195 if (copy_from_user(args, argp, sizeof(*args))) {
1196 kfree(args);
1197 return -EFAULT;
1198 }
1199 inode = fdentry(file)->d_inode; 1316 inode = fdentry(file)->d_inode;
1200 1317
1201 if (args->treeid == 0) 1318 if (args->treeid == 0)
@@ -1227,9 +1344,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1227 int ret; 1344 int ret;
1228 int err = 0; 1345 int err = 0;
1229 1346
1230 if (!capable(CAP_SYS_ADMIN))
1231 return -EPERM;
1232
1233 vol_args = memdup_user(arg, sizeof(*vol_args)); 1347 vol_args = memdup_user(arg, sizeof(*vol_args));
1234 if (IS_ERR(vol_args)) 1348 if (IS_ERR(vol_args))
1235 return PTR_ERR(vol_args); 1349 return PTR_ERR(vol_args);
@@ -1259,13 +1373,51 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1259 } 1373 }
1260 1374
1261 inode = dentry->d_inode; 1375 inode = dentry->d_inode;
1376 dest = BTRFS_I(inode)->root;
1377 if (!capable(CAP_SYS_ADMIN)){
1378 /*
1379 * Regular user. Only allow this with a special mount
1380 * option, when the user has write+exec access to the
1381 * subvol root, and when rmdir(2) would have been
1382 * allowed.
1383 *
1384 * Note that this is _not_ check that the subvol is
1385 * empty or doesn't contain data that we wouldn't
1386 * otherwise be able to delete.
1387 *
1388 * Users who want to delete empty subvols should try
1389 * rmdir(2).
1390 */
1391 err = -EPERM;
1392 if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
1393 goto out_dput;
1394
1395 /*
1396 * Do not allow deletion if the parent dir is the same
1397 * as the dir to be deleted. That means the ioctl
1398 * must be called on the dentry referencing the root
1399 * of the subvol, not a random directory contained
1400 * within it.
1401 */
1402 err = -EINVAL;
1403 if (root == dest)
1404 goto out_dput;
1405
1406 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
1407 if (err)
1408 goto out_dput;
1409
1410 /* check if subvolume may be deleted by a non-root user */
1411 err = btrfs_may_delete(dir, dentry, 1);
1412 if (err)
1413 goto out_dput;
1414 }
1415
1262 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 1416 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
1263 err = -EINVAL; 1417 err = -EINVAL;
1264 goto out_dput; 1418 goto out_dput;
1265 } 1419 }
1266 1420
1267 dest = BTRFS_I(inode)->root;
1268
1269 mutex_lock(&inode->i_mutex); 1421 mutex_lock(&inode->i_mutex);
1270 err = d_invalidate(dentry); 1422 err = d_invalidate(dentry);
1271 if (err) 1423 if (err)
@@ -1304,7 +1456,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1304 BUG_ON(ret); 1456 BUG_ON(ret);
1305 } 1457 }
1306 1458
1307 ret = btrfs_commit_transaction(trans, root); 1459 ret = btrfs_end_transaction(trans, root);
1308 BUG_ON(ret); 1460 BUG_ON(ret);
1309 inode->i_flags |= S_DEAD; 1461 inode->i_flags |= S_DEAD;
1310out_up_write: 1462out_up_write:
@@ -1502,11 +1654,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1502 path->reada = 2; 1654 path->reada = 2;
1503 1655
1504 if (inode < src) { 1656 if (inode < src) {
1505 mutex_lock(&inode->i_mutex); 1657 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
1506 mutex_lock(&src->i_mutex); 1658 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
1507 } else { 1659 } else {
1508 mutex_lock(&src->i_mutex); 1660 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
1509 mutex_lock(&inode->i_mutex); 1661 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1510 } 1662 }
1511 1663
1512 /* determine range to clone */ 1664 /* determine range to clone */
@@ -1530,13 +1682,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1530 while (1) { 1682 while (1) {
1531 struct btrfs_ordered_extent *ordered; 1683 struct btrfs_ordered_extent *ordered;
1532 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1684 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1533 ordered = btrfs_lookup_first_ordered_extent(inode, off+len); 1685 ordered = btrfs_lookup_first_ordered_extent(src, off+len);
1534 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) 1686 if (!ordered &&
1687 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
1688 EXTENT_DELALLOC, 0, NULL))
1535 break; 1689 break;
1536 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1690 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1537 if (ordered) 1691 if (ordered)
1538 btrfs_put_ordered_extent(ordered); 1692 btrfs_put_ordered_extent(ordered);
1539 btrfs_wait_ordered_range(src, off, off+len); 1693 btrfs_wait_ordered_range(src, off, len);
1540 } 1694 }
1541 1695
1542 /* clone data */ 1696 /* clone data */
@@ -1605,7 +1759,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1605 } 1759 }
1606 btrfs_release_path(root, path); 1760 btrfs_release_path(root, path);
1607 1761
1608 if (key.offset + datal < off || 1762 if (key.offset + datal <= off ||
1609 key.offset >= off+len) 1763 key.offset >= off+len)
1610 goto next; 1764 goto next;
1611 1765
@@ -1879,6 +2033,22 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1879 return 0; 2033 return 0;
1880} 2034}
1881 2035
2036static void get_block_group_info(struct list_head *groups_list,
2037 struct btrfs_ioctl_space_info *space)
2038{
2039 struct btrfs_block_group_cache *block_group;
2040
2041 space->total_bytes = 0;
2042 space->used_bytes = 0;
2043 space->flags = 0;
2044 list_for_each_entry(block_group, groups_list, list) {
2045 space->flags = block_group->flags;
2046 space->total_bytes += block_group->key.offset;
2047 space->used_bytes +=
2048 btrfs_block_group_used(&block_group->item);
2049 }
2050}
2051
1882long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) 2052long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1883{ 2053{
1884 struct btrfs_ioctl_space_args space_args; 2054 struct btrfs_ioctl_space_args space_args;
@@ -1887,27 +2057,56 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1887 struct btrfs_ioctl_space_info *dest_orig; 2057 struct btrfs_ioctl_space_info *dest_orig;
1888 struct btrfs_ioctl_space_info *user_dest; 2058 struct btrfs_ioctl_space_info *user_dest;
1889 struct btrfs_space_info *info; 2059 struct btrfs_space_info *info;
2060 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2061 BTRFS_BLOCK_GROUP_SYSTEM,
2062 BTRFS_BLOCK_GROUP_METADATA,
2063 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2064 int num_types = 4;
1890 int alloc_size; 2065 int alloc_size;
1891 int ret = 0; 2066 int ret = 0;
1892 int slot_count = 0; 2067 int slot_count = 0;
2068 int i, c;
1893 2069
1894 if (copy_from_user(&space_args, 2070 if (copy_from_user(&space_args,
1895 (struct btrfs_ioctl_space_args __user *)arg, 2071 (struct btrfs_ioctl_space_args __user *)arg,
1896 sizeof(space_args))) 2072 sizeof(space_args)))
1897 return -EFAULT; 2073 return -EFAULT;
1898 2074
1899 /* first we count slots */ 2075 for (i = 0; i < num_types; i++) {
1900 rcu_read_lock(); 2076 struct btrfs_space_info *tmp;
1901 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) 2077
1902 slot_count++; 2078 info = NULL;
1903 rcu_read_unlock(); 2079 rcu_read_lock();
2080 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
2081 list) {
2082 if (tmp->flags == types[i]) {
2083 info = tmp;
2084 break;
2085 }
2086 }
2087 rcu_read_unlock();
2088
2089 if (!info)
2090 continue;
2091
2092 down_read(&info->groups_sem);
2093 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2094 if (!list_empty(&info->block_groups[c]))
2095 slot_count++;
2096 }
2097 up_read(&info->groups_sem);
2098 }
1904 2099
1905 /* space_slots == 0 means they are asking for a count */ 2100 /* space_slots == 0 means they are asking for a count */
1906 if (space_args.space_slots == 0) { 2101 if (space_args.space_slots == 0) {
1907 space_args.total_spaces = slot_count; 2102 space_args.total_spaces = slot_count;
1908 goto out; 2103 goto out;
1909 } 2104 }
2105
2106 slot_count = min_t(int, space_args.space_slots, slot_count);
2107
1910 alloc_size = sizeof(*dest) * slot_count; 2108 alloc_size = sizeof(*dest) * slot_count;
2109
1911 /* we generally have at most 6 or so space infos, one for each raid 2110 /* we generally have at most 6 or so space infos, one for each raid
1912 * level. So, a whole page should be more than enough for everyone 2111 * level. So, a whole page should be more than enough for everyone
1913 */ 2112 */
@@ -1921,27 +2120,34 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1921 dest_orig = dest; 2120 dest_orig = dest;
1922 2121
1923 /* now we have a buffer to copy into */ 2122 /* now we have a buffer to copy into */
1924 rcu_read_lock(); 2123 for (i = 0; i < num_types; i++) {
1925 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { 2124 struct btrfs_space_info *tmp;
1926 /* make sure we don't copy more than we allocated 2125
1927 * in our buffer 2126 info = NULL;
1928 */ 2127 rcu_read_lock();
1929 if (slot_count == 0) 2128 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
1930 break; 2129 list) {
1931 slot_count--; 2130 if (tmp->flags == types[i]) {
1932 2131 info = tmp;
1933 /* make sure userland has enough room in their buffer */ 2132 break;
1934 if (space_args.total_spaces >= space_args.space_slots) 2133 }
1935 break; 2134 }
2135 rcu_read_unlock();
1936 2136
1937 space.flags = info->flags; 2137 if (!info)
1938 space.total_bytes = info->total_bytes; 2138 continue;
1939 space.used_bytes = info->bytes_used; 2139 down_read(&info->groups_sem);
1940 memcpy(dest, &space, sizeof(space)); 2140 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
1941 dest++; 2141 if (!list_empty(&info->block_groups[c])) {
1942 space_args.total_spaces++; 2142 get_block_group_info(&info->block_groups[c],
2143 &space);
2144 memcpy(dest, &space, sizeof(space));
2145 dest++;
2146 space_args.total_spaces++;
2147 }
2148 }
2149 up_read(&info->groups_sem);
1943 } 2150 }
1944 rcu_read_unlock();
1945 2151
1946 user_dest = (struct btrfs_ioctl_space_info *) 2152 user_dest = (struct btrfs_ioctl_space_info *)
1947 (arg + sizeof(struct btrfs_ioctl_space_args)); 2153 (arg + sizeof(struct btrfs_ioctl_space_args));
@@ -1984,6 +2190,36 @@ long btrfs_ioctl_trans_end(struct file *file)
1984 return 0; 2190 return 0;
1985} 2191}
1986 2192
2193static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
2194{
2195 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2196 struct btrfs_trans_handle *trans;
2197 u64 transid;
2198
2199 trans = btrfs_start_transaction(root, 0);
2200 transid = trans->transid;
2201 btrfs_commit_transaction_async(trans, root, 0);
2202
2203 if (argp)
2204 if (copy_to_user(argp, &transid, sizeof(transid)))
2205 return -EFAULT;
2206 return 0;
2207}
2208
2209static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
2210{
2211 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2212 u64 transid;
2213
2214 if (argp) {
2215 if (copy_from_user(&transid, argp, sizeof(transid)))
2216 return -EFAULT;
2217 } else {
2218 transid = 0; /* current trans */
2219 }
2220 return btrfs_wait_for_commit(root, transid);
2221}
2222
1987long btrfs_ioctl(struct file *file, unsigned int 2223long btrfs_ioctl(struct file *file, unsigned int
1988 cmd, unsigned long arg) 2224 cmd, unsigned long arg)
1989{ 2225{
@@ -1998,9 +2234,11 @@ long btrfs_ioctl(struct file *file, unsigned int
1998 case FS_IOC_GETVERSION: 2234 case FS_IOC_GETVERSION:
1999 return btrfs_ioctl_getversion(file, argp); 2235 return btrfs_ioctl_getversion(file, argp);
2000 case BTRFS_IOC_SNAP_CREATE: 2236 case BTRFS_IOC_SNAP_CREATE:
2001 return btrfs_ioctl_snap_create(file, argp, 0); 2237 return btrfs_ioctl_snap_create(file, argp, 0, 0);
2238 case BTRFS_IOC_SNAP_CREATE_ASYNC:
2239 return btrfs_ioctl_snap_create(file, argp, 0, 1);
2002 case BTRFS_IOC_SUBVOL_CREATE: 2240 case BTRFS_IOC_SUBVOL_CREATE:
2003 return btrfs_ioctl_snap_create(file, argp, 1); 2241 return btrfs_ioctl_snap_create(file, argp, 1, 0);
2004 case BTRFS_IOC_SNAP_DESTROY: 2242 case BTRFS_IOC_SNAP_DESTROY:
2005 return btrfs_ioctl_snap_destroy(file, argp); 2243 return btrfs_ioctl_snap_destroy(file, argp);
2006 case BTRFS_IOC_DEFAULT_SUBVOL: 2244 case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -2034,6 +2272,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2034 case BTRFS_IOC_SYNC: 2272 case BTRFS_IOC_SYNC:
2035 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2273 btrfs_sync_fs(file->f_dentry->d_sb, 1);
2036 return 0; 2274 return 0;
2275 case BTRFS_IOC_START_SYNC:
2276 return btrfs_ioctl_start_sync(file, argp);
2277 case BTRFS_IOC_WAIT_SYNC:
2278 return btrfs_ioctl_wait_sync(file, argp);
2037 } 2279 }
2038 2280
2039 return -ENOTTY; 2281 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 424694aa517f..17c99ebdf960 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,14 +22,21 @@
22 22
23#define BTRFS_IOCTL_MAGIC 0x94 23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255 24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 4087
26 25
27/* this should be 4k */ 26/* this should be 4k */
27#define BTRFS_PATH_NAME_MAX 4087
28struct btrfs_ioctl_vol_args { 28struct btrfs_ioctl_vol_args {
29 __s64 fd; 29 __s64 fd;
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_SNAPSHOT_NAME_MAX 4079
34struct btrfs_ioctl_async_vol_args {
35 __s64 fd;
36 __u64 transid;
37 char name[BTRFS_SNAPSHOT_NAME_MAX + 1];
38};
39
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080 40#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args { 41struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid; 42 __u64 treeid;
@@ -178,4 +185,8 @@ struct btrfs_ioctl_space_args {
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) 185#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ 186#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args) 187 struct btrfs_ioctl_space_args)
188#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
189#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
190#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
191 struct btrfs_ioctl_async_vol_args)
181#endif 192#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e56c72bc5add..f4621f6deca1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -526,7 +526,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
526{ 526{
527 u64 end; 527 u64 end;
528 u64 orig_end; 528 u64 orig_end;
529 u64 wait_end;
530 struct btrfs_ordered_extent *ordered; 529 struct btrfs_ordered_extent *ordered;
531 int found; 530 int found;
532 531
@@ -537,7 +536,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
537 if (orig_end > INT_LIMIT(loff_t)) 536 if (orig_end > INT_LIMIT(loff_t))
538 orig_end = INT_LIMIT(loff_t); 537 orig_end = INT_LIMIT(loff_t);
539 } 538 }
540 wait_end = orig_end;
541again: 539again:
542 /* start IO across the range first to instantiate any delalloc 540 /* start IO across the range first to instantiate any delalloc
543 * extents 541 * extents
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b37d723b9d4a..045c9c2b2d7e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -29,6 +29,7 @@
29#include "locking.h" 29#include "locking.h"
30#include "btrfs_inode.h" 30#include "btrfs_inode.h"
31#include "async-thread.h" 31#include "async-thread.h"
32#include "free-space-cache.h"
32 33
33/* 34/*
34 * backref_node, mapping_node and tree_block start with this 35 * backref_node, mapping_node and tree_block start with this
@@ -178,8 +179,6 @@ struct reloc_control {
178 u64 search_start; 179 u64 search_start;
179 u64 extents_found; 180 u64 extents_found;
180 181
181 int block_rsv_retries;
182
183 unsigned int stage:8; 182 unsigned int stage:8;
184 unsigned int create_reloc_tree:1; 183 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1; 184 unsigned int merge_reloc_tree:1;
@@ -2133,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2133 LIST_HEAD(reloc_roots); 2132 LIST_HEAD(reloc_roots);
2134 u64 num_bytes = 0; 2133 u64 num_bytes = 0;
2135 int ret; 2134 int ret;
2136 int retries = 0;
2137 2135
2138 mutex_lock(&root->fs_info->trans_mutex); 2136 mutex_lock(&root->fs_info->trans_mutex);
2139 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2137 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
@@ -2143,7 +2141,7 @@ again:
2143 if (!err) { 2141 if (!err) {
2144 num_bytes = rc->merging_rsv_size; 2142 num_bytes = rc->merging_rsv_size;
2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2143 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2146 num_bytes, &retries); 2144 num_bytes);
2147 if (ret) 2145 if (ret)
2148 err = ret; 2146 err = ret;
2149 } 2147 }
@@ -2155,7 +2153,6 @@ again:
2155 btrfs_end_transaction(trans, rc->extent_root); 2153 btrfs_end_transaction(trans, rc->extent_root);
2156 btrfs_block_rsv_release(rc->extent_root, 2154 btrfs_block_rsv_release(rc->extent_root,
2157 rc->block_rsv, num_bytes); 2155 rc->block_rsv, num_bytes);
2158 retries = 0;
2159 goto again; 2156 goto again;
2160 } 2157 }
2161 } 2158 }
@@ -2405,15 +2402,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2402 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2406 2403
2407 trans->block_rsv = rc->block_rsv; 2404 trans->block_rsv = rc->block_rsv;
2408 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, 2405 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
2409 &rc->block_rsv_retries);
2410 if (ret) { 2406 if (ret) {
2411 if (ret == -EAGAIN) 2407 if (ret == -EAGAIN)
2412 rc->commit_transaction = 1; 2408 rc->commit_transaction = 1;
2413 return ret; 2409 return ret;
2414 } 2410 }
2415 2411
2416 rc->block_rsv_retries = 0;
2417 return 0; 2412 return 0;
2418} 2413}
2419 2414
@@ -3099,6 +3094,8 @@ static int add_tree_block(struct reloc_control *rc,
3099 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3094 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3100 ret = get_ref_objectid_v0(rc, path, extent_key, 3095 ret = get_ref_objectid_v0(rc, path, extent_key,
3101 &ref_owner, NULL); 3096 &ref_owner, NULL);
3097 if (ret < 0)
3098 return ret;
3102 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); 3099 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3103 level = (int)ref_owner; 3100 level = (int)ref_owner;
3104 /* FIXME: get real generation */ 3101 /* FIXME: get real generation */
@@ -3191,6 +3188,54 @@ static int block_use_full_backref(struct reloc_control *rc,
3191 return ret; 3188 return ret;
3192} 3189}
3193 3190
3191static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3192 struct inode *inode, u64 ino)
3193{
3194 struct btrfs_key key;
3195 struct btrfs_path *path;
3196 struct btrfs_root *root = fs_info->tree_root;
3197 struct btrfs_trans_handle *trans;
3198 unsigned long nr;
3199 int ret = 0;
3200
3201 if (inode)
3202 goto truncate;
3203
3204 key.objectid = ino;
3205 key.type = BTRFS_INODE_ITEM_KEY;
3206 key.offset = 0;
3207
3208 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3209 if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
3210 if (inode && !IS_ERR(inode))
3211 iput(inode);
3212 return -ENOENT;
3213 }
3214
3215truncate:
3216 path = btrfs_alloc_path();
3217 if (!path) {
3218 ret = -ENOMEM;
3219 goto out;
3220 }
3221
3222 trans = btrfs_join_transaction(root, 0);
3223 if (IS_ERR(trans)) {
3224 btrfs_free_path(path);
3225 goto out;
3226 }
3227
3228 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3229
3230 btrfs_free_path(path);
3231 nr = trans->blocks_used;
3232 btrfs_end_transaction(trans, root);
3233 btrfs_btree_balance_dirty(root, nr);
3234out:
3235 iput(inode);
3236 return ret;
3237}
3238
3194/* 3239/*
3195 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY 3240 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3196 * this function scans fs tree to find blocks reference the data extent 3241 * this function scans fs tree to find blocks reference the data extent
@@ -3217,15 +3262,27 @@ static int find_data_references(struct reloc_control *rc,
3217 int counted; 3262 int counted;
3218 int ret; 3263 int ret;
3219 3264
3220 path = btrfs_alloc_path();
3221 if (!path)
3222 return -ENOMEM;
3223
3224 ref_root = btrfs_extent_data_ref_root(leaf, ref); 3265 ref_root = btrfs_extent_data_ref_root(leaf, ref);
3225 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); 3266 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3226 ref_offset = btrfs_extent_data_ref_offset(leaf, ref); 3267 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3227 ref_count = btrfs_extent_data_ref_count(leaf, ref); 3268 ref_count = btrfs_extent_data_ref_count(leaf, ref);
3228 3269
3270 /*
3271 * This is an extent belonging to the free space cache, lets just delete
3272 * it and redo the search.
3273 */
3274 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3275 ret = delete_block_group_cache(rc->extent_root->fs_info,
3276 NULL, ref_objectid);
3277 if (ret != -ENOENT)
3278 return ret;
3279 ret = 0;
3280 }
3281
3282 path = btrfs_alloc_path();
3283 if (!path)
3284 return -ENOMEM;
3285
3229 root = read_fs_root(rc->extent_root->fs_info, ref_root); 3286 root = read_fs_root(rc->extent_root->fs_info, ref_root);
3230 if (IS_ERR(root)) { 3287 if (IS_ERR(root)) {
3231 err = PTR_ERR(root); 3288 err = PTR_ERR(root);
@@ -3554,8 +3611,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3554 * is no reservation in transaction handle. 3611 * is no reservation in transaction handle.
3555 */ 3612 */
3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3613 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3557 rc->extent_root->nodesize * 256, 3614 rc->extent_root->nodesize * 256);
3558 &rc->block_rsv_retries);
3559 if (ret) 3615 if (ret)
3560 return ret; 3616 return ret;
3561 3617
@@ -3567,7 +3623,6 @@ int prepare_to_relocate(struct reloc_control *rc)
3567 rc->extents_found = 0; 3623 rc->extents_found = 0;
3568 rc->nodes_relocated = 0; 3624 rc->nodes_relocated = 0;
3569 rc->merging_rsv_size = 0; 3625 rc->merging_rsv_size = 0;
3570 rc->block_rsv_retries = 0;
3571 3626
3572 rc->create_reloc_tree = 1; 3627 rc->create_reloc_tree = 1;
3573 set_reloc_control(rc); 3628 set_reloc_control(rc);
@@ -3860,6 +3915,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3860{ 3915{
3861 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3916 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3862 struct reloc_control *rc; 3917 struct reloc_control *rc;
3918 struct inode *inode;
3919 struct btrfs_path *path;
3863 int ret; 3920 int ret;
3864 int rw = 0; 3921 int rw = 0;
3865 int err = 0; 3922 int err = 0;
@@ -3882,6 +3939,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3882 rw = 1; 3939 rw = 1;
3883 } 3940 }
3884 3941
3942 path = btrfs_alloc_path();
3943 if (!path) {
3944 err = -ENOMEM;
3945 goto out;
3946 }
3947
3948 inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
3949 path);
3950 btrfs_free_path(path);
3951
3952 if (!IS_ERR(inode))
3953 ret = delete_block_group_cache(fs_info, inode, 0);
3954 else
3955 ret = PTR_ERR(inode);
3956
3957 if (ret && ret != -ENOENT) {
3958 err = ret;
3959 goto out;
3960 }
3961
3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3962 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3886 if (IS_ERR(rc->data_inode)) { 3963 if (IS_ERR(rc->data_inode)) {
3887 err = PTR_ERR(rc->data_inode); 3964 err = PTR_ERR(rc->data_inode);
@@ -4143,7 +4220,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4143 btrfs_add_ordered_sum(inode, ordered, sums); 4220 btrfs_add_ordered_sum(inode, ordered, sums);
4144 } 4221 }
4145 btrfs_put_ordered_extent(ordered); 4222 btrfs_put_ordered_extent(ordered);
4146 return 0; 4223 return ret;
4147} 4224}
4148 4225
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 4226void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2d958be761c8..6a1086e83ffc 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -181,7 +181,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
181int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) 181int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
182{ 182{
183 struct btrfs_root *dead_root; 183 struct btrfs_root *dead_root;
184 struct btrfs_item *item;
185 struct btrfs_root_item *ri; 184 struct btrfs_root_item *ri;
186 struct btrfs_key key; 185 struct btrfs_key key;
187 struct btrfs_key found_key; 186 struct btrfs_key found_key;
@@ -214,7 +213,6 @@ again:
214 nritems = btrfs_header_nritems(leaf); 213 nritems = btrfs_header_nritems(leaf);
215 slot = path->slots[0]; 214 slot = path->slots[0];
216 } 215 }
217 item = btrfs_item_nr(leaf, slot);
218 btrfs_item_key_to_cpu(leaf, &key, slot); 216 btrfs_item_key_to_cpu(leaf, &key, slot);
219 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) 217 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
220 goto next; 218 goto next;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1776dbd8dc98..8299a25ffc8f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,6 +61,8 @@ static void btrfs_put_super(struct super_block *sb)
61 61
62 ret = close_ctree(root); 62 ret = close_ctree(root);
63 sb->s_fs_info = NULL; 63 sb->s_fs_info = NULL;
64
65 (void)ret; /* FIXME: need to fix VFS to return error? */
64} 66}
65 67
66enum { 68enum {
@@ -68,7 +70,8 @@ enum {
68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 70 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 71 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 72 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
71 Opt_discard, Opt_err, 73 Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
74 Opt_user_subvol_rm_allowed,
72}; 75};
73 76
74static match_table_t tokens = { 77static match_table_t tokens = {
@@ -92,6 +95,9 @@ static match_table_t tokens = {
92 {Opt_flushoncommit, "flushoncommit"}, 95 {Opt_flushoncommit, "flushoncommit"},
93 {Opt_ratio, "metadata_ratio=%d"}, 96 {Opt_ratio, "metadata_ratio=%d"},
94 {Opt_discard, "discard"}, 97 {Opt_discard, "discard"},
98 {Opt_space_cache, "space_cache"},
99 {Opt_clear_cache, "clear_cache"},
100 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
95 {Opt_err, NULL}, 101 {Opt_err, NULL},
96}; 102};
97 103
@@ -235,6 +241,16 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
235 case Opt_discard: 241 case Opt_discard:
236 btrfs_set_opt(info->mount_opt, DISCARD); 242 btrfs_set_opt(info->mount_opt, DISCARD);
237 break; 243 break;
244 case Opt_space_cache:
245 printk(KERN_INFO "btrfs: enabling disk space caching\n");
246 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
247 case Opt_clear_cache:
248 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
249 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
250 break;
251 case Opt_user_subvol_rm_allowed:
252 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
253 break;
238 case Opt_err: 254 case Opt_err:
239 printk(KERN_INFO "btrfs: unrecognized mount option " 255 printk(KERN_INFO "btrfs: unrecognized mount option "
240 "'%s'\n", p); 256 "'%s'\n", p);
@@ -380,7 +396,7 @@ static struct dentry *get_default_root(struct super_block *sb,
380find_root: 396find_root:
381 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 397 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
382 if (IS_ERR(new_root)) 398 if (IS_ERR(new_root))
383 return ERR_PTR(PTR_ERR(new_root)); 399 return ERR_CAST(new_root);
384 400
385 if (btrfs_root_refs(&new_root->root_item) == 0) 401 if (btrfs_root_refs(&new_root->root_item) == 0)
386 return ERR_PTR(-ENOENT); 402 return ERR_PTR(-ENOENT);
@@ -436,7 +452,6 @@ static int btrfs_fill_super(struct super_block *sb,
436{ 452{
437 struct inode *inode; 453 struct inode *inode;
438 struct dentry *root_dentry; 454 struct dentry *root_dentry;
439 struct btrfs_super_block *disk_super;
440 struct btrfs_root *tree_root; 455 struct btrfs_root *tree_root;
441 struct btrfs_key key; 456 struct btrfs_key key;
442 int err; 457 int err;
@@ -458,7 +473,6 @@ static int btrfs_fill_super(struct super_block *sb,
458 return PTR_ERR(tree_root); 473 return PTR_ERR(tree_root);
459 } 474 }
460 sb->s_fs_info = tree_root; 475 sb->s_fs_info = tree_root;
461 disk_super = &tree_root->fs_info->super_copy;
462 476
463 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 477 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
464 key.type = BTRFS_INODE_ITEM_KEY; 478 key.type = BTRFS_INODE_ITEM_KEY;
@@ -560,8 +574,8 @@ static int btrfs_test_super(struct super_block *s, void *data)
560 * Note: This is based on get_sb_bdev from fs/super.c with a few additions 574 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
561 * for multiple device setup. Make sure to keep it in sync. 575 * for multiple device setup. Make sure to keep it in sync.
562 */ 576 */
563static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 577static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
564 const char *dev_name, void *data, struct vfsmount *mnt) 578 const char *dev_name, void *data)
565{ 579{
566 struct block_device *bdev = NULL; 580 struct block_device *bdev = NULL;
567 struct super_block *s; 581 struct super_block *s;
@@ -571,7 +585,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
571 char *subvol_name = NULL; 585 char *subvol_name = NULL;
572 u64 subvol_objectid = 0; 586 u64 subvol_objectid = 0;
573 int error = 0; 587 int error = 0;
574 int found = 0;
575 588
576 if (!(flags & MS_RDONLY)) 589 if (!(flags & MS_RDONLY))
577 mode |= FMODE_WRITE; 590 mode |= FMODE_WRITE;
@@ -580,7 +593,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
580 &subvol_name, &subvol_objectid, 593 &subvol_name, &subvol_objectid,
581 &fs_devices); 594 &fs_devices);
582 if (error) 595 if (error)
583 return error; 596 return ERR_PTR(error);
584 597
585 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); 598 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
586 if (error) 599 if (error)
@@ -607,7 +620,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
607 goto error_close_devices; 620 goto error_close_devices;
608 } 621 }
609 622
610 found = 1;
611 btrfs_close_devices(fs_devices); 623 btrfs_close_devices(fs_devices);
612 } else { 624 } else {
613 char b[BDEVNAME_SIZE]; 625 char b[BDEVNAME_SIZE];
@@ -629,7 +641,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
629 if (IS_ERR(root)) { 641 if (IS_ERR(root)) {
630 error = PTR_ERR(root); 642 error = PTR_ERR(root);
631 deactivate_locked_super(s); 643 deactivate_locked_super(s);
632 goto error; 644 goto error_free_subvol_name;
633 } 645 }
634 /* if they gave us a subvolume name bind mount into that */ 646 /* if they gave us a subvolume name bind mount into that */
635 if (strcmp(subvol_name, ".")) { 647 if (strcmp(subvol_name, ".")) {
@@ -643,24 +655,21 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
643 deactivate_locked_super(s); 655 deactivate_locked_super(s);
644 error = PTR_ERR(new_root); 656 error = PTR_ERR(new_root);
645 dput(root); 657 dput(root);
646 goto error_close_devices; 658 goto error_free_subvol_name;
647 } 659 }
648 if (!new_root->d_inode) { 660 if (!new_root->d_inode) {
649 dput(root); 661 dput(root);
650 dput(new_root); 662 dput(new_root);
651 deactivate_locked_super(s); 663 deactivate_locked_super(s);
652 error = -ENXIO; 664 error = -ENXIO;
653 goto error_close_devices; 665 goto error_free_subvol_name;
654 } 666 }
655 dput(root); 667 dput(root);
656 root = new_root; 668 root = new_root;
657 } 669 }
658 670
659 mnt->mnt_sb = s;
660 mnt->mnt_root = root;
661
662 kfree(subvol_name); 671 kfree(subvol_name);
663 return 0; 672 return root;
664 673
665error_s: 674error_s:
666 error = PTR_ERR(s); 675 error = PTR_ERR(s);
@@ -668,8 +677,7 @@ error_close_devices:
668 btrfs_close_devices(fs_devices); 677 btrfs_close_devices(fs_devices);
669error_free_subvol_name: 678error_free_subvol_name:
670 kfree(subvol_name); 679 kfree(subvol_name);
671error: 680 return ERR_PTR(error);
672 return error;
673} 681}
674 682
675static int btrfs_remount(struct super_block *sb, int *flags, char *data) 683static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -716,18 +724,25 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
716 struct list_head *head = &root->fs_info->space_info; 724 struct list_head *head = &root->fs_info->space_info;
717 struct btrfs_space_info *found; 725 struct btrfs_space_info *found;
718 u64 total_used = 0; 726 u64 total_used = 0;
727 u64 total_used_data = 0;
719 int bits = dentry->d_sb->s_blocksize_bits; 728 int bits = dentry->d_sb->s_blocksize_bits;
720 __be32 *fsid = (__be32 *)root->fs_info->fsid; 729 __be32 *fsid = (__be32 *)root->fs_info->fsid;
721 730
722 rcu_read_lock(); 731 rcu_read_lock();
723 list_for_each_entry_rcu(found, head, list) 732 list_for_each_entry_rcu(found, head, list) {
733 if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
734 BTRFS_BLOCK_GROUP_SYSTEM))
735 total_used_data += found->disk_total;
736 else
737 total_used_data += found->disk_used;
724 total_used += found->disk_used; 738 total_used += found->disk_used;
739 }
725 rcu_read_unlock(); 740 rcu_read_unlock();
726 741
727 buf->f_namelen = BTRFS_NAME_LEN; 742 buf->f_namelen = BTRFS_NAME_LEN;
728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 743 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
729 buf->f_bfree = buf->f_blocks - (total_used >> bits); 744 buf->f_bfree = buf->f_blocks - (total_used >> bits);
730 buf->f_bavail = buf->f_bfree; 745 buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
731 buf->f_bsize = dentry->d_sb->s_blocksize; 746 buf->f_bsize = dentry->d_sb->s_blocksize;
732 buf->f_type = BTRFS_SUPER_MAGIC; 747 buf->f_type = BTRFS_SUPER_MAGIC;
733 748
@@ -746,7 +761,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
746static struct file_system_type btrfs_fs_type = { 761static struct file_system_type btrfs_fs_type = {
747 .owner = THIS_MODULE, 762 .owner = THIS_MODULE,
748 .name = "btrfs", 763 .name = "btrfs",
749 .get_sb = btrfs_get_sb, 764 .mount = btrfs_mount,
750 .kill_sb = kill_anon_super, 765 .kill_sb = kill_anon_super,
751 .fs_flags = FS_REQUIRES_DEV, 766 .fs_flags = FS_REQUIRES_DEV,
752}; 767};
@@ -815,6 +830,7 @@ static const struct file_operations btrfs_ctl_fops = {
815 .unlocked_ioctl = btrfs_control_ioctl, 830 .unlocked_ioctl = btrfs_control_ioctl,
816 .compat_ioctl = btrfs_control_ioctl, 831 .compat_ioctl = btrfs_control_ioctl,
817 .owner = THIS_MODULE, 832 .owner = THIS_MODULE,
833 .llseek = noop_llseek,
818}; 834};
819 835
820static struct miscdevice btrfs_misc = { 836static struct miscdevice btrfs_misc = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66e4c66cc63b..1fffbc017bdf 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,6 +163,7 @@ enum btrfs_trans_type {
163 TRANS_START, 163 TRANS_START,
164 TRANS_JOIN, 164 TRANS_JOIN,
165 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166 TRANS_JOIN_NOLOCK,
166}; 167};
167 168
168static int may_wait_transaction(struct btrfs_root *root, int type) 169static int may_wait_transaction(struct btrfs_root *root, int type)
@@ -179,14 +180,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
179{ 180{
180 struct btrfs_trans_handle *h; 181 struct btrfs_trans_handle *h;
181 struct btrfs_transaction *cur_trans; 182 struct btrfs_transaction *cur_trans;
182 int retries = 0;
183 int ret; 183 int ret;
184again: 184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 186 if (!h)
187 return ERR_PTR(-ENOMEM); 187 return ERR_PTR(-ENOMEM);
188 188
189 mutex_lock(&root->fs_info->trans_mutex); 189 if (type != TRANS_JOIN_NOLOCK)
190 mutex_lock(&root->fs_info->trans_mutex);
190 if (may_wait_transaction(root, type)) 191 if (may_wait_transaction(root, type))
191 wait_current_trans(root); 192 wait_current_trans(root);
192 193
@@ -195,7 +196,8 @@ again:
195 196
196 cur_trans = root->fs_info->running_transaction; 197 cur_trans = root->fs_info->running_transaction;
197 cur_trans->use_count++; 198 cur_trans->use_count++;
198 mutex_unlock(&root->fs_info->trans_mutex); 199 if (type != TRANS_JOIN_NOLOCK)
200 mutex_unlock(&root->fs_info->trans_mutex);
199 201
200 h->transid = cur_trans->transid; 202 h->transid = cur_trans->transid;
201 h->transaction = cur_trans; 203 h->transaction = cur_trans;
@@ -212,8 +214,7 @@ again:
212 } 214 }
213 215
214 if (num_items > 0) { 216 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items, 217 ret = btrfs_trans_reserve_metadata(h, root, num_items);
216 &retries);
217 if (ret == -EAGAIN) { 218 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root); 219 btrfs_commit_transaction(h, root);
219 goto again; 220 goto again;
@@ -224,9 +225,11 @@ again:
224 } 225 }
225 } 226 }
226 227
227 mutex_lock(&root->fs_info->trans_mutex); 228 if (type != TRANS_JOIN_NOLOCK)
229 mutex_lock(&root->fs_info->trans_mutex);
228 record_root_in_trans(h, root); 230 record_root_in_trans(h, root);
229 mutex_unlock(&root->fs_info->trans_mutex); 231 if (type != TRANS_JOIN_NOLOCK)
232 mutex_unlock(&root->fs_info->trans_mutex);
230 233
231 if (!current->journal_info && type != TRANS_USERSPACE) 234 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h; 235 current->journal_info = h;
@@ -244,6 +247,12 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
244 return start_transaction(root, 0, TRANS_JOIN); 247 return start_transaction(root, 0, TRANS_JOIN);
245} 248}
246 249
250struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
251 int num_blocks)
252{
253 return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
254}
255
247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 256struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
248 int num_blocks) 257 int num_blocks)
249{ 258{
@@ -270,6 +279,58 @@ static noinline int wait_for_commit(struct btrfs_root *root,
270 return 0; 279 return 0;
271} 280}
272 281
282int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
283{
284 struct btrfs_transaction *cur_trans = NULL, *t;
285 int ret;
286
287 mutex_lock(&root->fs_info->trans_mutex);
288
289 ret = 0;
290 if (transid) {
291 if (transid <= root->fs_info->last_trans_committed)
292 goto out_unlock;
293
294 /* find specified transaction */
295 list_for_each_entry(t, &root->fs_info->trans_list, list) {
296 if (t->transid == transid) {
297 cur_trans = t;
298 break;
299 }
300 if (t->transid > transid)
301 break;
302 }
303 ret = -EINVAL;
304 if (!cur_trans)
305 goto out_unlock; /* bad transid */
306 } else {
307 /* find newest transaction that is committing | committed */
308 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
309 list) {
310 if (t->in_commit) {
311 if (t->commit_done)
312 goto out_unlock;
313 cur_trans = t;
314 break;
315 }
316 }
317 if (!cur_trans)
318 goto out_unlock; /* nothing committing|committed */
319 }
320
321 cur_trans->use_count++;
322 mutex_unlock(&root->fs_info->trans_mutex);
323
324 wait_for_commit(root, cur_trans);
325
326 mutex_lock(&root->fs_info->trans_mutex);
327 put_transaction(cur_trans);
328 ret = 0;
329out_unlock:
330 mutex_unlock(&root->fs_info->trans_mutex);
331 return ret;
332}
333
273#if 0 334#if 0
274/* 335/*
275 * rate limit against the drop_snapshot code. This helps to slow down new 336 * rate limit against the drop_snapshot code. This helps to slow down new
@@ -348,7 +409,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
348} 409}
349 410
350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 411static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
351 struct btrfs_root *root, int throttle) 412 struct btrfs_root *root, int throttle, int lock)
352{ 413{
353 struct btrfs_transaction *cur_trans = trans->transaction; 414 struct btrfs_transaction *cur_trans = trans->transaction;
354 struct btrfs_fs_info *info = root->fs_info; 415 struct btrfs_fs_info *info = root->fs_info;
@@ -376,26 +437,29 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
376 437
377 btrfs_trans_release_metadata(trans, root); 438 btrfs_trans_release_metadata(trans, root);
378 439
379 if (!root->fs_info->open_ioctl_trans && 440 if (lock && !root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root)) 441 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1; 442 trans->transaction->blocked = 1;
382 443
383 if (cur_trans->blocked && !cur_trans->in_commit) { 444 if (lock && cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle) 445 if (throttle)
385 return btrfs_commit_transaction(trans, root); 446 return btrfs_commit_transaction(trans, root);
386 else 447 else
387 wake_up_process(info->transaction_kthread); 448 wake_up_process(info->transaction_kthread);
388 } 449 }
389 450
390 mutex_lock(&info->trans_mutex); 451 if (lock)
452 mutex_lock(&info->trans_mutex);
391 WARN_ON(cur_trans != info->running_transaction); 453 WARN_ON(cur_trans != info->running_transaction);
392 WARN_ON(cur_trans->num_writers < 1); 454 WARN_ON(cur_trans->num_writers < 1);
393 cur_trans->num_writers--; 455 cur_trans->num_writers--;
394 456
457 smp_mb();
395 if (waitqueue_active(&cur_trans->writer_wait)) 458 if (waitqueue_active(&cur_trans->writer_wait))
396 wake_up(&cur_trans->writer_wait); 459 wake_up(&cur_trans->writer_wait);
397 put_transaction(cur_trans); 460 put_transaction(cur_trans);
398 mutex_unlock(&info->trans_mutex); 461 if (lock)
462 mutex_unlock(&info->trans_mutex);
399 463
400 if (current->journal_info == trans) 464 if (current->journal_info == trans)
401 current->journal_info = NULL; 465 current->journal_info = NULL;
@@ -411,13 +475,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
411int btrfs_end_transaction(struct btrfs_trans_handle *trans, 475int btrfs_end_transaction(struct btrfs_trans_handle *trans,
412 struct btrfs_root *root) 476 struct btrfs_root *root)
413{ 477{
414 return __btrfs_end_transaction(trans, root, 0); 478 return __btrfs_end_transaction(trans, root, 0, 1);
415} 479}
416 480
417int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 481int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 482 struct btrfs_root *root)
419{ 483{
420 return __btrfs_end_transaction(trans, root, 1); 484 return __btrfs_end_transaction(trans, root, 1, 1);
485}
486
487int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
488 struct btrfs_root *root)
489{
490 return __btrfs_end_transaction(trans, root, 0, 0);
421} 491}
422 492
423/* 493/*
@@ -836,7 +906,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
836 struct extent_buffer *tmp; 906 struct extent_buffer *tmp;
837 struct extent_buffer *old; 907 struct extent_buffer *old;
838 int ret; 908 int ret;
839 int retries = 0;
840 u64 to_reserve = 0; 909 u64 to_reserve = 0;
841 u64 index = 0; 910 u64 index = 0;
842 u64 objectid; 911 u64 objectid;
@@ -858,7 +927,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
858 927
859 if (to_reserve > 0) { 928 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 929 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries); 930 to_reserve);
862 if (ret) { 931 if (ret) {
863 pending->error = ret; 932 pending->error = ret;
864 goto fail; 933 goto fail;
@@ -966,6 +1035,8 @@ static void update_super_roots(struct btrfs_root *root)
966 super->root = root_item->bytenr; 1035 super->root = root_item->bytenr;
967 super->generation = root_item->generation; 1036 super->generation = root_item->generation;
968 super->root_level = root_item->level; 1037 super->root_level = root_item->level;
1038 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
1039 super->cache_generation = root_item->generation;
969} 1040}
970 1041
971int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1042int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -988,11 +1059,127 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
988 return ret; 1059 return ret;
989} 1060}
990 1061
1062/*
1063 * wait for the current transaction commit to start and block subsequent
1064 * transaction joins
1065 */
1066static void wait_current_trans_commit_start(struct btrfs_root *root,
1067 struct btrfs_transaction *trans)
1068{
1069 DEFINE_WAIT(wait);
1070
1071 if (trans->in_commit)
1072 return;
1073
1074 while (1) {
1075 prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
1076 TASK_UNINTERRUPTIBLE);
1077 if (trans->in_commit) {
1078 finish_wait(&root->fs_info->transaction_blocked_wait,
1079 &wait);
1080 break;
1081 }
1082 mutex_unlock(&root->fs_info->trans_mutex);
1083 schedule();
1084 mutex_lock(&root->fs_info->trans_mutex);
1085 finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
1086 }
1087}
1088
1089/*
1090 * wait for the current transaction to start and then become unblocked.
1091 * caller holds ref.
1092 */
1093static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1094 struct btrfs_transaction *trans)
1095{
1096 DEFINE_WAIT(wait);
1097
1098 if (trans->commit_done || (trans->in_commit && !trans->blocked))
1099 return;
1100
1101 while (1) {
1102 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
1103 TASK_UNINTERRUPTIBLE);
1104 if (trans->commit_done ||
1105 (trans->in_commit && !trans->blocked)) {
1106 finish_wait(&root->fs_info->transaction_wait,
1107 &wait);
1108 break;
1109 }
1110 mutex_unlock(&root->fs_info->trans_mutex);
1111 schedule();
1112 mutex_lock(&root->fs_info->trans_mutex);
1113 finish_wait(&root->fs_info->transaction_wait,
1114 &wait);
1115 }
1116}
1117
1118/*
1119 * commit transactions asynchronously. once btrfs_commit_transaction_async
1120 * returns, any subsequent transaction will not be allowed to join.
1121 */
1122struct btrfs_async_commit {
1123 struct btrfs_trans_handle *newtrans;
1124 struct btrfs_root *root;
1125 struct delayed_work work;
1126};
1127
1128static void do_async_commit(struct work_struct *work)
1129{
1130 struct btrfs_async_commit *ac =
1131 container_of(work, struct btrfs_async_commit, work.work);
1132
1133 btrfs_commit_transaction(ac->newtrans, ac->root);
1134 kfree(ac);
1135}
1136
1137int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1138 struct btrfs_root *root,
1139 int wait_for_unblock)
1140{
1141 struct btrfs_async_commit *ac;
1142 struct btrfs_transaction *cur_trans;
1143
1144 ac = kmalloc(sizeof(*ac), GFP_NOFS);
1145 BUG_ON(!ac);
1146
1147 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1148 ac->root = root;
1149 ac->newtrans = btrfs_join_transaction(root, 0);
1150
1151 /* take transaction reference */
1152 mutex_lock(&root->fs_info->trans_mutex);
1153 cur_trans = trans->transaction;
1154 cur_trans->use_count++;
1155 mutex_unlock(&root->fs_info->trans_mutex);
1156
1157 btrfs_end_transaction(trans, root);
1158 schedule_delayed_work(&ac->work, 0);
1159
1160 /* wait for transaction to start and unblock */
1161 mutex_lock(&root->fs_info->trans_mutex);
1162 if (wait_for_unblock)
1163 wait_current_trans_commit_start_and_unblock(root, cur_trans);
1164 else
1165 wait_current_trans_commit_start(root, cur_trans);
1166 put_transaction(cur_trans);
1167 mutex_unlock(&root->fs_info->trans_mutex);
1168
1169 return 0;
1170}
1171
1172/*
1173 * btrfs_transaction state sequence:
1174 * in_commit = 0, blocked = 0 (initial)
1175 * in_commit = 1, blocked = 1
1176 * blocked = 0
1177 * commit_done = 1
1178 */
991int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1179int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
992 struct btrfs_root *root) 1180 struct btrfs_root *root)
993{ 1181{
994 unsigned long joined = 0; 1182 unsigned long joined = 0;
995 unsigned long timeout = 1;
996 struct btrfs_transaction *cur_trans; 1183 struct btrfs_transaction *cur_trans;
997 struct btrfs_transaction *prev_trans = NULL; 1184 struct btrfs_transaction *prev_trans = NULL;
998 DEFINE_WAIT(wait); 1185 DEFINE_WAIT(wait);
@@ -1039,6 +1226,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1039 1226
1040 trans->transaction->in_commit = 1; 1227 trans->transaction->in_commit = 1;
1041 trans->transaction->blocked = 1; 1228 trans->transaction->blocked = 1;
1229 wake_up(&root->fs_info->transaction_blocked_wait);
1230
1042 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1231 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1043 prev_trans = list_entry(cur_trans->list.prev, 1232 prev_trans = list_entry(cur_trans->list.prev,
1044 struct btrfs_transaction, list); 1233 struct btrfs_transaction, list);
@@ -1063,11 +1252,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1063 snap_pending = 1; 1252 snap_pending = 1;
1064 1253
1065 WARN_ON(cur_trans != trans->transaction); 1254 WARN_ON(cur_trans != trans->transaction);
1066 if (cur_trans->num_writers > 1)
1067 timeout = MAX_SCHEDULE_TIMEOUT;
1068 else if (should_grow)
1069 timeout = 1;
1070
1071 mutex_unlock(&root->fs_info->trans_mutex); 1255 mutex_unlock(&root->fs_info->trans_mutex);
1072 1256
1073 if (flush_on_commit || snap_pending) { 1257 if (flush_on_commit || snap_pending) {
@@ -1089,8 +1273,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1089 TASK_UNINTERRUPTIBLE); 1273 TASK_UNINTERRUPTIBLE);
1090 1274
1091 smp_mb(); 1275 smp_mb();
1092 if (cur_trans->num_writers > 1 || should_grow) 1276 if (cur_trans->num_writers > 1)
1093 schedule_timeout(timeout); 1277 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1278 else if (should_grow)
1279 schedule_timeout(1);
1094 1280
1095 mutex_lock(&root->fs_info->trans_mutex); 1281 mutex_lock(&root->fs_info->trans_mutex);
1096 finish_wait(&cur_trans->writer_wait, &wait); 1282 finish_wait(&cur_trans->writer_wait, &wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e104986d0bfd..f104b57ad4ef 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -87,12 +87,17 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
87 87
88int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
89 struct btrfs_root *root); 89 struct btrfs_root *root);
90int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
91 int num_items); 93 int num_items);
92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 94struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
93 int num_blocks); 95 int num_blocks);
96struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
97 int num_blocks);
94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 98struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
95 int num_blocks); 99 int num_blocks);
100int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 101int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root); 102 struct btrfs_root *root);
98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 103int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -104,6 +109,9 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
104int btrfs_clean_old_snapshots(struct btrfs_root *root); 109int btrfs_clean_old_snapshots(struct btrfs_root *root);
105int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 110int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
106 struct btrfs_root *root); 111 struct btrfs_root *root);
112int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
113 struct btrfs_root *root,
114 int wait_for_unblock);
107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 115int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 116 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 117int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f7ac8e013ed7..992ab425599d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
36 int ret = 0; 36 int ret = 0;
37 int wret; 37 int wret;
38 int level; 38 int level;
39 int orig_level;
40 int is_extent = 0; 39 int is_extent = 0;
41 int next_key_ret = 0; 40 int next_key_ret = 0;
42 u64 last_ret = 0; 41 u64 last_ret = 0;
@@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
64 return -ENOMEM; 63 return -ENOMEM;
65 64
66 level = btrfs_header_level(root->node); 65 level = btrfs_header_level(root->node);
67 orig_level = level;
68 66
69 if (level == 0) 67 if (level == 0)
70 goto out; 68 goto out;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fb102a9aee9c..a29f19384a27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -786,7 +786,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
786{ 786{
787 struct inode *dir; 787 struct inode *dir;
788 int ret; 788 int ret;
789 struct btrfs_key location;
790 struct btrfs_inode_ref *ref; 789 struct btrfs_inode_ref *ref;
791 struct btrfs_dir_item *di; 790 struct btrfs_dir_item *di;
792 struct inode *inode; 791 struct inode *inode;
@@ -795,10 +794,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
795 unsigned long ref_ptr; 794 unsigned long ref_ptr;
796 unsigned long ref_end; 795 unsigned long ref_end;
797 796
798 location.objectid = key->objectid;
799 location.type = BTRFS_INODE_ITEM_KEY;
800 location.offset = 0;
801
802 /* 797 /*
803 * it is possible that we didn't log all the parent directories 798 * it is possible that we didn't log all the parent directories
804 * for a given inode. If we don't find the dir, just don't 799 * for a given inode. If we don't find the dir, just don't
@@ -1583,7 +1578,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1583 struct btrfs_path *path; 1578 struct btrfs_path *path;
1584 struct btrfs_root *root = wc->replay_dest; 1579 struct btrfs_root *root = wc->replay_dest;
1585 struct btrfs_key key; 1580 struct btrfs_key key;
1586 u32 item_size;
1587 int level; 1581 int level;
1588 int i; 1582 int i;
1589 int ret; 1583 int ret;
@@ -1601,7 +1595,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1601 nritems = btrfs_header_nritems(eb); 1595 nritems = btrfs_header_nritems(eb);
1602 for (i = 0; i < nritems; i++) { 1596 for (i = 0; i < nritems; i++) {
1603 btrfs_item_key_to_cpu(eb, &key, i); 1597 btrfs_item_key_to_cpu(eb, &key, i);
1604 item_size = btrfs_item_size_nr(eb, i);
1605 1598
1606 /* inode keys are done during the first stage */ 1599 /* inode keys are done during the first stage */
1607 if (key.type == BTRFS_INODE_ITEM_KEY && 1600 if (key.type == BTRFS_INODE_ITEM_KEY &&
@@ -1668,7 +1661,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1668 struct walk_control *wc) 1661 struct walk_control *wc)
1669{ 1662{
1670 u64 root_owner; 1663 u64 root_owner;
1671 u64 root_gen;
1672 u64 bytenr; 1664 u64 bytenr;
1673 u64 ptr_gen; 1665 u64 ptr_gen;
1674 struct extent_buffer *next; 1666 struct extent_buffer *next;
@@ -1698,7 +1690,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1698 1690
1699 parent = path->nodes[*level]; 1691 parent = path->nodes[*level];
1700 root_owner = btrfs_header_owner(parent); 1692 root_owner = btrfs_header_owner(parent);
1701 root_gen = btrfs_header_generation(parent);
1702 1693
1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1704 1695
@@ -1749,7 +1740,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1749 struct walk_control *wc) 1740 struct walk_control *wc)
1750{ 1741{
1751 u64 root_owner; 1742 u64 root_owner;
1752 u64 root_gen;
1753 int i; 1743 int i;
1754 int slot; 1744 int slot;
1755 int ret; 1745 int ret;
@@ -1757,8 +1747,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1747 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1758 slot = path->slots[i]; 1748 slot = path->slots[i];
1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 1749 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1760 struct extent_buffer *node;
1761 node = path->nodes[i];
1762 path->slots[i]++; 1750 path->slots[i]++;
1763 *level = i; 1751 *level = i;
1764 WARN_ON(*level == 0); 1752 WARN_ON(*level == 0);
@@ -1771,7 +1759,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1771 parent = path->nodes[*level + 1]; 1759 parent = path->nodes[*level + 1];
1772 1760
1773 root_owner = btrfs_header_owner(parent); 1761 root_owner = btrfs_header_owner(parent);
1774 root_gen = btrfs_header_generation(parent);
1775 wc->process_func(root, path->nodes[*level], wc, 1762 wc->process_func(root, path->nodes[*level], wc,
1776 btrfs_header_generation(path->nodes[*level])); 1763 btrfs_header_generation(path->nodes[*level]));
1777 if (wc->free) { 1764 if (wc->free) {
@@ -2273,7 +2260,7 @@ fail:
2273 } 2260 }
2274 btrfs_end_log_trans(root); 2261 btrfs_end_log_trans(root);
2275 2262
2276 return 0; 2263 return err;
2277} 2264}
2278 2265
2279/* see comments for btrfs_del_dir_entries_in_log */ 2266/* see comments for btrfs_del_dir_entries_in_log */
@@ -2729,7 +2716,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2729 struct btrfs_key max_key; 2716 struct btrfs_key max_key;
2730 struct btrfs_root *log = root->log_root; 2717 struct btrfs_root *log = root->log_root;
2731 struct extent_buffer *src = NULL; 2718 struct extent_buffer *src = NULL;
2732 u32 size;
2733 int err = 0; 2719 int err = 0;
2734 int ret; 2720 int ret;
2735 int nritems; 2721 int nritems;
@@ -2793,7 +2779,6 @@ again:
2793 break; 2779 break;
2794 2780
2795 src = path->nodes[0]; 2781 src = path->nodes[0];
2796 size = btrfs_item_size_nr(src, path->slots[0]);
2797 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2782 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2798 ins_nr++; 2783 ins_nr++;
2799 goto next_slot; 2784 goto next_slot;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b2..cc04dc1445d6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -398,7 +398,6 @@ static noinline int device_list_add(const char *path,
398 device->work.func = pending_bios_fn; 398 device->work.func = pending_bios_fn;
399 memcpy(device->uuid, disk_super->dev_item.uuid, 399 memcpy(device->uuid, disk_super->dev_item.uuid,
400 BTRFS_UUID_SIZE); 400 BTRFS_UUID_SIZE);
401 device->barriers = 1;
402 spin_lock_init(&device->io_lock); 401 spin_lock_init(&device->io_lock);
403 device->name = kstrdup(path, GFP_NOFS); 402 device->name = kstrdup(path, GFP_NOFS);
404 if (!device->name) { 403 if (!device->name) {
@@ -462,7 +461,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
462 device->devid = orig_dev->devid; 461 device->devid = orig_dev->devid;
463 device->work.func = pending_bios_fn; 462 device->work.func = pending_bios_fn;
464 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 463 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
465 device->barriers = 1;
466 spin_lock_init(&device->io_lock); 464 spin_lock_init(&device->io_lock);
467 INIT_LIST_HEAD(&device->dev_list); 465 INIT_LIST_HEAD(&device->dev_list);
468 INIT_LIST_HEAD(&device->dev_alloc_list); 466 INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -1489,7 +1487,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1489 trans = btrfs_start_transaction(root, 0); 1487 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1488 lock_chunks(root);
1491 1489
1492 device->barriers = 1;
1493 device->writeable = 1; 1490 device->writeable = 1;
1494 device->work.func = pending_bios_fn; 1491 device->work.func = pending_bios_fn;
1495 generate_random_uuid(device->uuid); 1492 generate_random_uuid(device->uuid);
@@ -1901,7 +1898,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
1901 u64 size_to_free; 1898 u64 size_to_free;
1902 struct btrfs_path *path; 1899 struct btrfs_path *path;
1903 struct btrfs_key key; 1900 struct btrfs_key key;
1904 struct btrfs_chunk *chunk;
1905 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 1901 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1906 struct btrfs_trans_handle *trans; 1902 struct btrfs_trans_handle *trans;
1907 struct btrfs_key found_key; 1903 struct btrfs_key found_key;
@@ -1965,9 +1961,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
1965 if (found_key.objectid != key.objectid) 1961 if (found_key.objectid != key.objectid)
1966 break; 1962 break;
1967 1963
1968 chunk = btrfs_item_ptr(path->nodes[0],
1969 path->slots[0],
1970 struct btrfs_chunk);
1971 /* chunk zero is special */ 1964 /* chunk zero is special */
1972 if (found_key.offset == 0) 1965 if (found_key.offset == 0)
1973 break; 1966 break;
@@ -3034,8 +3027,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3034 } 3027 }
3035 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3028 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3036 dev = multi->stripes[dev_nr].dev; 3029 dev = multi->stripes[dev_nr].dev;
3037 BUG_ON(rw == WRITE && !dev->writeable); 3030 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3038 if (dev && dev->bdev) {
3039 bio->bi_bdev = dev->bdev; 3031 bio->bi_bdev = dev->bdev;
3040 if (async_submit) 3032 if (async_submit)
3041 schedule_bio(root, dev, rw, bio); 3033 schedule_bio(root, dev, rw, bio);
@@ -3084,7 +3076,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3084 return NULL; 3076 return NULL;
3085 list_add(&device->dev_list, 3077 list_add(&device->dev_list,
3086 &fs_devices->devices); 3078 &fs_devices->devices);
3087 device->barriers = 1;
3088 device->dev_root = root->fs_info->dev_root; 3079 device->dev_root = root->fs_info->dev_root;
3089 device->devid = devid; 3080 device->devid = devid;
3090 device->work.func = pending_bios_fn; 3081 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..2b638b6e4eea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,7 +42,6 @@ struct btrfs_device {
42 int running_pending; 42 int running_pending;
43 u64 generation; 43 u64 generation;
44 44
45 int barriers;
46 int writeable; 45 int writeable;
47 int in_fs_metadata; 46 int in_fs_metadata;
48 47
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 88ecbb215878..698fdd2c739c 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -178,7 +178,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
178 struct inode *inode = dentry->d_inode; 178 struct inode *inode = dentry->d_inode;
179 struct btrfs_root *root = BTRFS_I(inode)->root; 179 struct btrfs_root *root = BTRFS_I(inode)->root;
180 struct btrfs_path *path; 180 struct btrfs_path *path;
181 struct btrfs_item *item;
182 struct extent_buffer *leaf; 181 struct extent_buffer *leaf;
183 struct btrfs_dir_item *di; 182 struct btrfs_dir_item *di;
184 int ret = 0, slot, advance; 183 int ret = 0, slot, advance;
@@ -234,7 +233,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
234 } 233 }
235 advance = 1; 234 advance = 1;
236 235
237 item = btrfs_item_nr(leaf, slot);
238 btrfs_item_key_to_cpu(leaf, &found_key, slot); 236 btrfs_item_key_to_cpu(leaf, &found_key, slot);
239 237
240 /* check to make sure this item is what we want */ 238 /* check to make sure this item is what we want */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 3e2b90eaa239..b9cd5445f71c 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -199,8 +199,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
199 int nr_pages = 0; 199 int nr_pages = 0;
200 struct page *in_page = NULL; 200 struct page *in_page = NULL;
201 struct page *out_page = NULL; 201 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left; 202 unsigned long bytes_left;
205 203
206 *out_pages = 0; 204 *out_pages = 0;
@@ -233,9 +231,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 231 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 232 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235 233
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) { 234 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 235 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) { 236 if (ret != Z_OK) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1c..5930e382959b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
156 if (uptodate) { 156 if (uptodate) {
157 set_buffer_uptodate(bh); 157 set_buffer_uptodate(bh);
158 } else { 158 } else {
159 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { 159 if (!quiet_error(bh)) {
160 buffer_io_error(bh); 160 buffer_io_error(bh);
161 printk(KERN_WARNING "lost page write due to " 161 printk(KERN_WARNING "lost page write due to "
162 "I/O error on %s\n", 162 "I/O error on %s\n",
@@ -905,7 +905,6 @@ try_again:
905 905
906 bh->b_state = 0; 906 bh->b_state = 0;
907 atomic_set(&bh->b_count, 0); 907 atomic_set(&bh->b_count, 0);
908 bh->b_private = NULL;
909 bh->b_size = size; 908 bh->b_size = size;
910 909
911 /* Link the buffer to its page */ 910 /* Link the buffer to its page */
@@ -1706,7 +1705,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1706 * and kswapd activity, but those code paths have their own 1705 * and kswapd activity, but those code paths have their own
1707 * higher-level throttling. 1706 * higher-level throttling.
1708 */ 1707 */
1709 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1708 if (wbc->sync_mode != WB_SYNC_NONE) {
1710 lock_buffer(bh); 1709 lock_buffer(bh);
1711 } else if (!trylock_buffer(bh)) { 1710 } else if (!trylock_buffer(bh)) {
1712 redirty_page_for_writepage(wbc, page); 1711 redirty_page_for_writepage(wbc, page);
@@ -1834,9 +1833,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1834} 1833}
1835EXPORT_SYMBOL(page_zero_new_buffers); 1834EXPORT_SYMBOL(page_zero_new_buffers);
1836 1835
1837int block_prepare_write(struct page *page, unsigned from, unsigned to, 1836int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1838 get_block_t *get_block) 1837 get_block_t *get_block)
1839{ 1838{
1839 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1840 unsigned to = from + len;
1840 struct inode *inode = page->mapping->host; 1841 struct inode *inode = page->mapping->host;
1841 unsigned block_start, block_end; 1842 unsigned block_start, block_end;
1842 sector_t block; 1843 sector_t block;
@@ -1916,7 +1917,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
1916 } 1917 }
1917 return err; 1918 return err;
1918} 1919}
1919EXPORT_SYMBOL(block_prepare_write); 1920EXPORT_SYMBOL(__block_write_begin);
1920 1921
1921static int __block_commit_write(struct inode *inode, struct page *page, 1922static int __block_commit_write(struct inode *inode, struct page *page,
1922 unsigned from, unsigned to) 1923 unsigned from, unsigned to)
@@ -1953,15 +1954,6 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1953 return 0; 1954 return 0;
1954} 1955}
1955 1956
1956int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1957 get_block_t *get_block)
1958{
1959 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
1960
1961 return block_prepare_write(page, start, start + len, get_block);
1962}
1963EXPORT_SYMBOL(__block_write_begin);
1964
1965/* 1957/*
1966 * block_write_begin takes care of the basic task of block allocation and 1958 * block_write_begin takes care of the basic task of block allocation and
1967 * bringing partial write blocks uptodate first. 1959 * bringing partial write blocks uptodate first.
@@ -2379,7 +2371,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2379 else 2371 else
2380 end = PAGE_CACHE_SIZE; 2372 end = PAGE_CACHE_SIZE;
2381 2373
2382 ret = block_prepare_write(page, 0, end, get_block); 2374 ret = __block_write_begin(page, 0, end, get_block);
2383 if (!ret) 2375 if (!ret)
2384 ret = block_commit_write(page, 0, end); 2376 ret = block_commit_write(page, 0, end);
2385 2377
@@ -2466,11 +2458,10 @@ int nobh_write_begin(struct address_space *mapping,
2466 *fsdata = NULL; 2458 *fsdata = NULL;
2467 2459
2468 if (page_has_buffers(page)) { 2460 if (page_has_buffers(page)) {
2469 unlock_page(page); 2461 ret = __block_write_begin(page, pos, len, get_block);
2470 page_cache_release(page); 2462 if (unlikely(ret))
2471 *pagep = NULL; 2463 goto out_release;
2472 return block_write_begin(mapping, pos, len, flags, pagep, 2464 return ret;
2473 get_block);
2474 } 2465 }
2475 2466
2476 if (PageMappedToDisk(page)) 2467 if (PageMappedToDisk(page))
@@ -2891,7 +2882,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2891 2882
2892 if (err == -EOPNOTSUPP) { 2883 if (err == -EOPNOTSUPP) {
2893 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2884 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2894 set_bit(BH_Eopnotsupp, &bh->b_state);
2895 } 2885 }
2896 2886
2897 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) 2887 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3031,10 +3021,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3031 bh->b_end_io = end_buffer_write_sync; 3021 bh->b_end_io = end_buffer_write_sync;
3032 ret = submit_bh(rw, bh); 3022 ret = submit_bh(rw, bh);
3033 wait_on_buffer(bh); 3023 wait_on_buffer(bh);
3034 if (buffer_eopnotsupp(bh)) {
3035 clear_buffer_eopnotsupp(bh);
3036 ret = -EOPNOTSUPP;
3037 }
3038 if (!ret && !buffer_uptodate(bh)) 3024 if (!ret && !buffer_uptodate(bh))
3039 ret = -EIO; 3025 ret = -EIO;
3040 } else { 3026 } else {
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 727caedcdd92..0a1467b15516 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -55,6 +55,7 @@ const struct file_operations cachefiles_daemon_fops = {
55 .read = cachefiles_daemon_read, 55 .read = cachefiles_daemon_read,
56 .write = cachefiles_daemon_write, 56 .write = cachefiles_daemon_write,
57 .poll = cachefiles_daemon_poll, 57 .poll = cachefiles_daemon_poll,
58 .llseek = noop_llseek,
58}; 59};
59 60
60struct cachefiles_daemon_cmd { 61struct cachefiles_daemon_cmd {
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 51bcc5ce3230..e9c874abc9e1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -591,7 +591,6 @@ static int ceph_writepages_start(struct address_space *mapping,
591 struct writeback_control *wbc) 591 struct writeback_control *wbc)
592{ 592{
593 struct inode *inode = mapping->host; 593 struct inode *inode = mapping->host;
594 struct backing_dev_info *bdi = mapping->backing_dev_info;
595 struct ceph_inode_info *ci = ceph_inode(inode); 594 struct ceph_inode_info *ci = ceph_inode(inode);
596 struct ceph_fs_client *fsc; 595 struct ceph_fs_client *fsc;
597 pgoff_t index, start, end; 596 pgoff_t index, start, end;
@@ -633,13 +632,6 @@ static int ceph_writepages_start(struct address_space *mapping,
633 632
634 pagevec_init(&pvec, 0); 633 pagevec_init(&pvec, 0);
635 634
636 /* ?? */
637 if (wbc->nonblocking && bdi_write_congested(bdi)) {
638 dout(" writepages congested\n");
639 wbc->encountered_congestion = 1;
640 goto out_final;
641 }
642
643 /* where to start/end? */ 635 /* where to start/end? */
644 if (wbc->range_cyclic) { 636 if (wbc->range_cyclic) {
645 start = mapping->writeback_index; /* Start from prev offset */ 637 start = mapping->writeback_index; /* Start from prev offset */
@@ -885,7 +877,6 @@ out:
885 rc = 0; /* vfs expects us to return 0 */ 877 rc = 0; /* vfs expects us to return 0 */
886 ceph_put_snap_context(snapc); 878 ceph_put_snap_context(snapc);
887 dout("writepages done, rc = %d\n", rc); 879 dout("writepages done, rc = %d\n", rc);
888out_final:
889 return rc; 880 return rc;
890} 881}
891 882
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d6e0e0421891..08b460ae0539 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -635,7 +635,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
635/* 635/*
636 * mount: join the ceph cluster, and open root directory. 636 * mount: join the ceph cluster, and open root directory.
637 */ 637 */
638static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt, 638static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
639 const char *path) 639 const char *path)
640{ 640{
641 int err; 641 int err;
@@ -678,16 +678,14 @@ static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
678 } 678 }
679 } 679 }
680 680
681 mnt->mnt_root = root;
682 mnt->mnt_sb = fsc->sb;
683
684 fsc->mount_state = CEPH_MOUNT_MOUNTED; 681 fsc->mount_state = CEPH_MOUNT_MOUNTED;
685 dout("mount success\n"); 682 dout("mount success\n");
686 err = 0; 683 mutex_unlock(&fsc->client->mount_mutex);
684 return root;
687 685
688out: 686out:
689 mutex_unlock(&fsc->client->mount_mutex); 687 mutex_unlock(&fsc->client->mount_mutex);
690 return err; 688 return ERR_PTR(err);
691 689
692fail: 690fail:
693 if (first) { 691 if (first) {
@@ -777,41 +775,45 @@ static int ceph_register_bdi(struct super_block *sb,
777 return err; 775 return err;
778} 776}
779 777
780static int ceph_get_sb(struct file_system_type *fs_type, 778static struct dentry *ceph_mount(struct file_system_type *fs_type,
781 int flags, const char *dev_name, void *data, 779 int flags, const char *dev_name, void *data)
782 struct vfsmount *mnt)
783{ 780{
784 struct super_block *sb; 781 struct super_block *sb;
785 struct ceph_fs_client *fsc; 782 struct ceph_fs_client *fsc;
783 struct dentry *res;
786 int err; 784 int err;
787 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 785 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
788 const char *path = NULL; 786 const char *path = NULL;
789 struct ceph_mount_options *fsopt = NULL; 787 struct ceph_mount_options *fsopt = NULL;
790 struct ceph_options *opt = NULL; 788 struct ceph_options *opt = NULL;
791 789
792 dout("ceph_get_sb\n"); 790 dout("ceph_mount\n");
793 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); 791 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
794 if (err < 0) 792 if (err < 0) {
793 res = ERR_PTR(err);
795 goto out_final; 794 goto out_final;
795 }
796 796
797 /* create client (which we may/may not use) */ 797 /* create client (which we may/may not use) */
798 fsc = create_fs_client(fsopt, opt); 798 fsc = create_fs_client(fsopt, opt);
799 if (IS_ERR(fsc)) { 799 if (IS_ERR(fsc)) {
800 err = PTR_ERR(fsc); 800 res = ERR_CAST(fsc);
801 kfree(fsopt); 801 kfree(fsopt);
802 kfree(opt); 802 kfree(opt);
803 goto out_final; 803 goto out_final;
804 } 804 }
805 805
806 err = ceph_mdsc_init(fsc); 806 err = ceph_mdsc_init(fsc);
807 if (err < 0) 807 if (err < 0) {
808 res = ERR_PTR(err);
808 goto out; 809 goto out;
810 }
809 811
810 if (ceph_test_opt(fsc->client, NOSHARE)) 812 if (ceph_test_opt(fsc->client, NOSHARE))
811 compare_super = NULL; 813 compare_super = NULL;
812 sb = sget(fs_type, compare_super, ceph_set_super, fsc); 814 sb = sget(fs_type, compare_super, ceph_set_super, fsc);
813 if (IS_ERR(sb)) { 815 if (IS_ERR(sb)) {
814 err = PTR_ERR(sb); 816 res = ERR_CAST(sb);
815 goto out; 817 goto out;
816 } 818 }
817 819
@@ -823,16 +825,18 @@ static int ceph_get_sb(struct file_system_type *fs_type,
823 } else { 825 } else {
824 dout("get_sb using new client %p\n", fsc); 826 dout("get_sb using new client %p\n", fsc);
825 err = ceph_register_bdi(sb, fsc); 827 err = ceph_register_bdi(sb, fsc);
826 if (err < 0) 828 if (err < 0) {
829 res = ERR_PTR(err);
827 goto out_splat; 830 goto out_splat;
831 }
828 } 832 }
829 833
830 err = ceph_mount(fsc, mnt, path); 834 res = ceph_real_mount(fsc, path);
831 if (err < 0) 835 if (IS_ERR(res))
832 goto out_splat; 836 goto out_splat;
833 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, 837 dout("root %p inode %p ino %llx.%llx\n", res,
834 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode)); 838 res->d_inode, ceph_vinop(res->d_inode));
835 return 0; 839 return res;
836 840
837out_splat: 841out_splat:
838 ceph_mdsc_close_sessions(fsc->mdsc); 842 ceph_mdsc_close_sessions(fsc->mdsc);
@@ -843,8 +847,8 @@ out:
843 ceph_mdsc_destroy(fsc); 847 ceph_mdsc_destroy(fsc);
844 destroy_fs_client(fsc); 848 destroy_fs_client(fsc);
845out_final: 849out_final:
846 dout("ceph_get_sb fail %d\n", err); 850 dout("ceph_mount fail %ld\n", PTR_ERR(res));
847 return err; 851 return res;
848} 852}
849 853
850static void ceph_kill_sb(struct super_block *s) 854static void ceph_kill_sb(struct super_block *s)
@@ -860,7 +864,7 @@ static void ceph_kill_sb(struct super_block *s)
860static struct file_system_type ceph_fs_type = { 864static struct file_system_type ceph_fs_type = {
861 .owner = THIS_MODULE, 865 .owner = THIS_MODULE,
862 .name = "ceph", 866 .name = "ceph",
863 .get_sb = ceph_get_sb, 867 .mount = ceph_mount,
864 .kill_sb = ceph_kill_sb, 868 .kill_sb = ceph_kill_sb,
865 .fs_flags = FS_RENAME_DOES_D_MOVE, 869 .fs_flags = FS_RENAME_DOES_D_MOVE,
866}; 870};
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 143d393881cb..e5b9df993b93 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -456,6 +456,7 @@ static void cdev_purge(struct cdev *cdev)
456 */ 456 */
457const struct file_operations def_chr_fops = { 457const struct file_operations def_chr_fops = {
458 .open = chrdev_open, 458 .open = chrdev_open,
459 .llseek = noop_llseek,
459}; 460};
460 461
461static struct kobject *exact_match(dev_t dev, int *part, void *data) 462static struct kobject *exact_match(dev_t dev, int *part, void *data)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb2..0ed213970ced 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,9 @@ config CIFS
2 tristate "CIFS support (advanced network filesystem, SMBFS successor)" 2 tristate "CIFS support (advanced network filesystem, SMBFS successor)"
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select CRYPTO
6 select CRYPTO_MD5
7 select CRYPTO_ARC4
5 help 8 help
6 This is the client VFS module for the Common Internet File System 9 This is the client VFS module for the Common Internet File System
7 (CIFS) protocol which is the successor to the Server Message Block 10 (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/README b/fs/cifs/README
index 7099a526f775..ee68d1036544 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -527,6 +527,11 @@ A partial list of the supported mount options follows:
527 SFU does). In the future the bottom 9 bits of the 527 SFU does). In the future the bottom 9 bits of the
528 mode also will be emulated using queries of the security 528 mode also will be emulated using queries of the security
529 descriptor (ACL). 529 descriptor (ACL).
530 mfsymlinks Enable support for Minshall+French symlinks
531 (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
532 This option is ignored when specified together with the
533 'sfu' option. Minshall+French symlinks are used even if
534 the server supports the CIFS Unix Extensions.
530 sign Must use packet signing (helps avoid unwanted data modification 535 sign Must use packet signing (helps avoid unwanted data modification
531 by intermediate systems in the route). Note that signing 536 by intermediate systems in the route). Note that signing
532 does not work with lanman or plaintext authentication. 537 does not work with lanman or plaintext authentication.
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e52..355abcdcda98 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
81 81
82v) mount check for unmatched uids 82v) mount check for unmatched uids
83 83
84w) Add support for new vfs entry points for setlease and fallocate 84w) Add support for new vfs entry point for fallocate
85 85
86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
87processes can proceed better in parallel (on the server) 87processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index eb1ba493489f..103ab8b605b0 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -148,7 +148,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
148 seq_printf(m, "Servers:"); 148 seq_printf(m, "Servers:");
149 149
150 i = 0; 150 i = 0;
151 read_lock(&cifs_tcp_ses_lock); 151 spin_lock(&cifs_tcp_ses_lock);
152 list_for_each(tmp1, &cifs_tcp_ses_list) { 152 list_for_each(tmp1, &cifs_tcp_ses_list) {
153 server = list_entry(tmp1, struct TCP_Server_Info, 153 server = list_entry(tmp1, struct TCP_Server_Info,
154 tcp_ses_list); 154 tcp_ses_list);
@@ -230,7 +230,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
230 spin_unlock(&GlobalMid_Lock); 230 spin_unlock(&GlobalMid_Lock);
231 } 231 }
232 } 232 }
233 read_unlock(&cifs_tcp_ses_lock); 233 spin_unlock(&cifs_tcp_ses_lock);
234 seq_putc(m, '\n'); 234 seq_putc(m, '\n');
235 235
236 /* BB add code to dump additional info such as TCP session info now */ 236 /* BB add code to dump additional info such as TCP session info now */
@@ -270,7 +270,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
270 atomic_set(&totBufAllocCount, 0); 270 atomic_set(&totBufAllocCount, 0);
271 atomic_set(&totSmBufAllocCount, 0); 271 atomic_set(&totSmBufAllocCount, 0);
272#endif /* CONFIG_CIFS_STATS2 */ 272#endif /* CONFIG_CIFS_STATS2 */
273 read_lock(&cifs_tcp_ses_lock); 273 spin_lock(&cifs_tcp_ses_lock);
274 list_for_each(tmp1, &cifs_tcp_ses_list) { 274 list_for_each(tmp1, &cifs_tcp_ses_list) {
275 server = list_entry(tmp1, struct TCP_Server_Info, 275 server = list_entry(tmp1, struct TCP_Server_Info,
276 tcp_ses_list); 276 tcp_ses_list);
@@ -303,7 +303,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
303 } 303 }
304 } 304 }
305 } 305 }
306 read_unlock(&cifs_tcp_ses_lock); 306 spin_unlock(&cifs_tcp_ses_lock);
307 } 307 }
308 308
309 return count; 309 return count;
@@ -343,7 +343,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
343 GlobalCurrentXid, GlobalMaxActiveXid); 343 GlobalCurrentXid, GlobalMaxActiveXid);
344 344
345 i = 0; 345 i = 0;
346 read_lock(&cifs_tcp_ses_lock); 346 spin_lock(&cifs_tcp_ses_lock);
347 list_for_each(tmp1, &cifs_tcp_ses_list) { 347 list_for_each(tmp1, &cifs_tcp_ses_list) {
348 server = list_entry(tmp1, struct TCP_Server_Info, 348 server = list_entry(tmp1, struct TCP_Server_Info,
349 tcp_ses_list); 349 tcp_ses_list);
@@ -397,7 +397,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
397 } 397 }
398 } 398 }
399 } 399 }
400 read_unlock(&cifs_tcp_ses_lock); 400 spin_unlock(&cifs_tcp_ses_lock);
401 401
402 seq_putc(m, '\n'); 402 seq_putc(m, '\n');
403 return 0; 403 return 0;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index aa316891ac0c..8942b28cf807 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -34,7 +34,7 @@ void cifs_dump_mids(struct TCP_Server_Info *);
34extern int traceSMB; /* flag which enables the function below */ 34extern int traceSMB; /* flag which enables the function below */
35void dump_smb(struct smb_hdr *, int); 35void dump_smb(struct smb_hdr *, int);
36#define CIFS_INFO 0x01 36#define CIFS_INFO 0x01
37#define CIFS_RC 0x02 37#define CIFS_RC 0x02
38#define CIFS_TIMER 0x04 38#define CIFS_TIMER 0x04
39 39
40/* 40/*
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d6ced7aa23cf..c68a056f27fd 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -44,8 +44,7 @@ static void cifs_dfs_expire_automounts(struct work_struct *work)
44void cifs_dfs_release_automount_timer(void) 44void cifs_dfs_release_automount_timer(void)
45{ 45{
46 BUG_ON(!list_empty(&cifs_dfs_automount_list)); 46 BUG_ON(!list_empty(&cifs_dfs_automount_list));
47 cancel_delayed_work(&cifs_dfs_automount_task); 47 cancel_delayed_work_sync(&cifs_dfs_automount_task);
48 flush_scheduled_work();
49} 48}
50 49
51/** 50/**
@@ -306,6 +305,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
306 int xid, i; 305 int xid, i;
307 int rc = 0; 306 int rc = 0;
308 struct vfsmount *mnt = ERR_PTR(-ENOENT); 307 struct vfsmount *mnt = ERR_PTR(-ENOENT);
308 struct tcon_link *tlink;
309 309
310 cFYI(1, "in %s", __func__); 310 cFYI(1, "in %s", __func__);
311 BUG_ON(IS_ROOT(dentry)); 311 BUG_ON(IS_ROOT(dentry));
@@ -315,14 +315,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
315 dput(nd->path.dentry); 315 dput(nd->path.dentry);
316 nd->path.dentry = dget(dentry); 316 nd->path.dentry = dget(dentry);
317 317
318 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
319 ses = cifs_sb->tcon->ses;
320
321 if (!ses) {
322 rc = -EINVAL;
323 goto out_err;
324 }
325
326 /* 318 /*
327 * The MSDFS spec states that paths in DFS referral requests and 319 * The MSDFS spec states that paths in DFS referral requests and
328 * responses must be prefixed by a single '\' character instead of 320 * responses must be prefixed by a single '\' character instead of
@@ -335,10 +327,20 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
335 goto out_err; 327 goto out_err;
336 } 328 }
337 329
338 rc = get_dfs_path(xid, ses , full_path + 1, cifs_sb->local_nls, 330 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
331 tlink = cifs_sb_tlink(cifs_sb);
332 if (IS_ERR(tlink)) {
333 rc = PTR_ERR(tlink);
334 goto out_err;
335 }
336 ses = tlink_tcon(tlink)->ses;
337
338 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
339 &num_referrals, &referrals, 339 &num_referrals, &referrals,
340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
341 341
342 cifs_put_tlink(tlink);
343
342 for (i = 0; i < num_referrals; i++) { 344 for (i = 0; i < num_referrals; i++) {
343 int len; 345 int len;
344 dump_referral(referrals+i); 346 dump_referral(referrals+i);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9e771450c3b8..e9a393c9c2ca 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,6 +15,8 @@
15 * the GNU Lesser General Public License for more details. 15 * the GNU Lesser General Public License for more details.
16 * 16 *
17 */ 17 */
18#include <linux/rbtree.h>
19
18#ifndef _CIFS_FS_SB_H 20#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 21#define _CIFS_FS_SB_H
20 22
@@ -36,23 +38,28 @@
36#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ 38#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
37#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ 39#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/
38#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ 40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
39 43
40struct cifs_sb_info { 44struct cifs_sb_info {
41 struct cifsTconInfo *tcon; /* primary mount */ 45 struct rb_root tlink_tree;
42 struct list_head nested_tcon_q; 46 spinlock_t tlink_tree_lock;
47 struct tcon_link *master_tlink;
43 struct nls_table *local_nls; 48 struct nls_table *local_nls;
44 unsigned int rsize; 49 unsigned int rsize;
45 unsigned int wsize; 50 unsigned int wsize;
51 atomic_t active;
46 uid_t mnt_uid; 52 uid_t mnt_uid;
47 gid_t mnt_gid; 53 gid_t mnt_gid;
48 mode_t mnt_file_mode; 54 mode_t mnt_file_mode;
49 mode_t mnt_dir_mode; 55 mode_t mnt_dir_mode;
50 int mnt_cifs_flags; 56 unsigned int mnt_cifs_flags;
51 int prepathlen; 57 int prepathlen;
52 char *prepath; /* relative path under the share to mount to */ 58 char *prepath; /* relative path under the share to mount to */
53#ifdef CONFIG_CIFS_DFS_UPCALL 59#ifdef CONFIG_CIFS_DFS_UPCALL
54 char *mountdata; /* mount options received at mount time */ 60 char *mountdata; /* mount options received at mount time */
55#endif 61#endif
56 struct backing_dev_info bdi; 62 struct backing_dev_info bdi;
63 struct delayed_work prune_tlinks;
57}; 64};
58#endif /* _CIFS_FS_SB_H */ 65#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 85d7cf7ff2c8..c9b4792ae825 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -557,11 +557,16 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
557{ 557{
558 struct cifs_ntsd *pntsd = NULL; 558 struct cifs_ntsd *pntsd = NULL;
559 int xid, rc; 559 int xid, rc;
560 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
561
562 if (IS_ERR(tlink))
563 return NULL;
560 564
561 xid = GetXid(); 565 xid = GetXid();
562 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 566 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
563 FreeXid(xid); 567 FreeXid(xid);
564 568
569 cifs_put_tlink(tlink);
565 570
566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); 571 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
567 return pntsd; 572 return pntsd;
@@ -574,10 +579,16 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
574 int oplock = 0; 579 int oplock = 0;
575 int xid, rc; 580 int xid, rc;
576 __u16 fid; 581 __u16 fid;
582 struct cifsTconInfo *tcon;
583 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
584
585 if (IS_ERR(tlink))
586 return NULL;
577 587
588 tcon = tlink_tcon(tlink);
578 xid = GetXid(); 589 xid = GetXid();
579 590
580 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0, 591 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
581 &fid, &oplock, NULL, cifs_sb->local_nls, 592 &fid, &oplock, NULL, cifs_sb->local_nls,
582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 593 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
583 if (rc) { 594 if (rc) {
@@ -585,11 +596,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
585 goto out; 596 goto out;
586 } 597 }
587 598
588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 599 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); 600 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
590 601
591 CIFSSMBClose(xid, cifs_sb->tcon, fid); 602 CIFSSMBClose(xid, tcon, fid);
592 out: 603 out:
604 cifs_put_tlink(tlink);
593 FreeXid(xid); 605 FreeXid(xid);
594 return pntsd; 606 return pntsd;
595} 607}
@@ -603,7 +615,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
603 struct cifsFileInfo *open_file = NULL; 615 struct cifsFileInfo *open_file = NULL;
604 616
605 if (inode) 617 if (inode)
606 open_file = find_readable_file(CIFS_I(inode)); 618 open_file = find_readable_file(CIFS_I(inode), true);
607 if (!open_file) 619 if (!open_file)
608 return get_cifs_acl_by_path(cifs_sb, path, pacllen); 620 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
609 621
@@ -616,10 +628,15 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
616 struct cifs_ntsd *pnntsd, u32 acllen) 628 struct cifs_ntsd *pnntsd, u32 acllen)
617{ 629{
618 int xid, rc; 630 int xid, rc;
631 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
632
633 if (IS_ERR(tlink))
634 return PTR_ERR(tlink);
619 635
620 xid = GetXid(); 636 xid = GetXid();
621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 637 rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
622 FreeXid(xid); 638 FreeXid(xid);
639 cifs_put_tlink(tlink);
623 640
624 cFYI(DBG2, "SetCIFSACL rc = %d", rc); 641 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
625 return rc; 642 return rc;
@@ -631,10 +648,16 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
631 int oplock = 0; 648 int oplock = 0;
632 int xid, rc; 649 int xid, rc;
633 __u16 fid; 650 __u16 fid;
651 struct cifsTconInfo *tcon;
652 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
634 653
654 if (IS_ERR(tlink))
655 return PTR_ERR(tlink);
656
657 tcon = tlink_tcon(tlink);
635 xid = GetXid(); 658 xid = GetXid();
636 659
637 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0, 660 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
638 &fid, &oplock, NULL, cifs_sb->local_nls, 661 &fid, &oplock, NULL, cifs_sb->local_nls,
639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 662 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
640 if (rc) { 663 if (rc) {
@@ -642,12 +665,13 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
642 goto out; 665 goto out;
643 } 666 }
644 667
645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 668 rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
646 cFYI(DBG2, "SetCIFSACL rc = %d", rc); 669 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
647 670
648 CIFSSMBClose(xid, cifs_sb->tcon, fid); 671 CIFSSMBClose(xid, tcon, fid);
649 out: 672out:
650 FreeXid(xid); 673 FreeXid(xid);
674 cifs_put_tlink(tlink);
651 return rc; 675 return rc;
652} 676}
653 677
@@ -661,7 +685,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
661 685
662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); 686 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
663 687
664 open_file = find_readable_file(CIFS_I(inode)); 688 open_file = find_readable_file(CIFS_I(inode), true);
665 if (!open_file) 689 if (!open_file)
666 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); 690 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
667 691
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 35042d8f7338..f856732161ab 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
27#include "md5.h" 27#include "md5.h"
28#include "cifs_unicode.h" 28#include "cifs_unicode.h"
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "ntlmssp.h"
30#include <linux/ctype.h> 31#include <linux/ctype.h>
31#include <linux/random.h> 32#include <linux/random.h>
32 33
@@ -42,18 +43,32 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
42 unsigned char *p24); 43 unsigned char *p24);
43 44
44static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 45static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
45 const struct mac_key *key, char *signature) 46 struct TCP_Server_Info *server, char *signature)
46{ 47{
47 struct MD5Context context; 48 int rc;
48 49
49 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) 50 if (cifs_pdu == NULL || signature == NULL || server == NULL)
50 return -EINVAL; 51 return -EINVAL;
51 52
52 cifs_MD5_init(&context); 53 if (!server->secmech.sdescmd5) {
53 cifs_MD5_update(&context, (char *)&key->data, key->len); 54 cERROR(1, "%s: Can't generate signature\n", __func__);
54 cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); 55 return -1;
56 }
57
58 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
59 if (rc) {
60 cERROR(1, "%s: Oould not init md5\n", __func__);
61 return rc;
62 }
63
64 crypto_shash_update(&server->secmech.sdescmd5->shash,
65 server->session_key.response, server->session_key.len);
66
67 crypto_shash_update(&server->secmech.sdescmd5->shash,
68 cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
69
70 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
55 71
56 cifs_MD5_final(signature, &context);
57 return 0; 72 return 0;
58} 73}
59 74
@@ -78,8 +93,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
78 server->sequence_number++; 93 server->sequence_number++;
79 spin_unlock(&GlobalMid_Lock); 94 spin_unlock(&GlobalMid_Lock);
80 95
81 rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key, 96 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
82 smb_signature);
83 if (rc) 97 if (rc)
84 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 98 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
85 else 99 else
@@ -89,16 +103,28 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
89} 103}
90 104
91static int cifs_calc_signature2(const struct kvec *iov, int n_vec, 105static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
92 const struct mac_key *key, char *signature) 106 struct TCP_Server_Info *server, char *signature)
93{ 107{
94 struct MD5Context context;
95 int i; 108 int i;
109 int rc;
96 110
97 if ((iov == NULL) || (signature == NULL) || (key == NULL)) 111 if (iov == NULL || signature == NULL || server == NULL)
98 return -EINVAL; 112 return -EINVAL;
99 113
100 cifs_MD5_init(&context); 114 if (!server->secmech.sdescmd5) {
101 cifs_MD5_update(&context, (char *)&key->data, key->len); 115 cERROR(1, "%s: Can't generate signature\n", __func__);
116 return -1;
117 }
118
119 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
120 if (rc) {
121 cERROR(1, "%s: Oould not init md5\n", __func__);
122 return rc;
123 }
124
125 crypto_shash_update(&server->secmech.sdescmd5->shash,
126 server->session_key.response, server->session_key.len);
127
102 for (i = 0; i < n_vec; i++) { 128 for (i = 0; i < n_vec; i++) {
103 if (iov[i].iov_len == 0) 129 if (iov[i].iov_len == 0)
104 continue; 130 continue;
@@ -111,18 +137,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
111 if (i == 0) { 137 if (i == 0) {
112 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ 138 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
113 break; /* nothing to sign or corrupt header */ 139 break; /* nothing to sign or corrupt header */
114 cifs_MD5_update(&context, iov[0].iov_base+4, 140 crypto_shash_update(&server->secmech.sdescmd5->shash,
115 iov[0].iov_len-4); 141 iov[i].iov_base + 4, iov[i].iov_len - 4);
116 } else 142 } else
117 cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len); 143 crypto_shash_update(&server->secmech.sdescmd5->shash,
144 iov[i].iov_base, iov[i].iov_len);
118 } 145 }
119 146
120 cifs_MD5_final(signature, &context); 147 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
121 148
122 return 0; 149 return rc;
123} 150}
124 151
125
126int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, 152int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
127 __u32 *pexpected_response_sequence_number) 153 __u32 *pexpected_response_sequence_number)
128{ 154{
@@ -145,8 +171,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
145 server->sequence_number++; 171 server->sequence_number++;
146 spin_unlock(&GlobalMid_Lock); 172 spin_unlock(&GlobalMid_Lock);
147 173
148 rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key, 174 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
149 smb_signature);
150 if (rc) 175 if (rc)
151 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 176 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
152 else 177 else
@@ -156,14 +181,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
156} 181}
157 182
158int cifs_verify_signature(struct smb_hdr *cifs_pdu, 183int cifs_verify_signature(struct smb_hdr *cifs_pdu,
159 const struct mac_key *mac_key, 184 struct TCP_Server_Info *server,
160 __u32 expected_sequence_number) 185 __u32 expected_sequence_number)
161{ 186{
162 unsigned int rc; 187 unsigned int rc;
163 char server_response_sig[8]; 188 char server_response_sig[8];
164 char what_we_think_sig_should_be[20]; 189 char what_we_think_sig_should_be[20];
165 190
166 if ((cifs_pdu == NULL) || (mac_key == NULL)) 191 if (cifs_pdu == NULL || server == NULL)
167 return -EINVAL; 192 return -EINVAL;
168 193
169 if (cifs_pdu->Command == SMB_COM_NEGOTIATE) 194 if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +217,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
192 cpu_to_le32(expected_sequence_number); 217 cpu_to_le32(expected_sequence_number);
193 cifs_pdu->Signature.Sequence.Reserved = 0; 218 cifs_pdu->Signature.Sequence.Reserved = 0;
194 219
195 rc = cifs_calculate_signature(cifs_pdu, mac_key, 220 rc = cifs_calculate_signature(cifs_pdu, server,
196 what_we_think_sig_should_be); 221 what_we_think_sig_should_be);
197 222
198 if (rc) 223 if (rc)
@@ -208,18 +233,28 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
208 233
209} 234}
210 235
211/* We fill in key by putting in 40 byte array which was allocated by caller */ 236/* first calculate 24 bytes ntlm response and then 16 byte session key */
212int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 237int setup_ntlm_response(struct cifsSesInfo *ses)
213 const char *password)
214{ 238{
215 char temp_key[16]; 239 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
216 if ((key == NULL) || (rn == NULL)) 240 char temp_key[CIFS_SESS_KEY_SIZE];
241
242 if (!ses)
217 return -EINVAL; 243 return -EINVAL;
218 244
219 E_md4hash(password, temp_key); 245 ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
220 mdfour(key->data.ntlm, temp_key, 16); 246 if (!ses->auth_key.response) {
221 memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE); 247 cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
222 key->len = 40; 248 return -ENOMEM;
249 }
250 ses->auth_key.len = temp_len;
251
252 SMBNTencrypt(ses->password, ses->server->cryptkey,
253 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
254
255 E_md4hash(ses->password, temp_key);
256 mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
257
223 return 0; 258 return 0;
224} 259}
225 260
@@ -262,109 +297,457 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
262} 297}
263#endif /* CIFS_WEAK_PW_HASH */ 298#endif /* CIFS_WEAK_PW_HASH */
264 299
265static int calc_ntlmv2_hash(struct cifsSesInfo *ses, 300/* Build a proper attribute value/target info pairs blob.
301 * Fill in netbios and dns domain name and workstation name
302 * and client time (total five av pairs and + one end of fields indicator.
303 * Allocate domain name which gets freed when session struct is deallocated.
304 */
305static int
306build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
307{
308 unsigned int dlen;
309 unsigned int wlen;
310 unsigned int size = 6 * sizeof(struct ntlmssp2_name);
311 __le64 curtime;
312 char *defdmname = "WORKGROUP";
313 unsigned char *blobptr;
314 struct ntlmssp2_name *attrptr;
315
316 if (!ses->domainName) {
317 ses->domainName = kstrdup(defdmname, GFP_KERNEL);
318 if (!ses->domainName)
319 return -ENOMEM;
320 }
321
322 dlen = strlen(ses->domainName);
323 wlen = strlen(ses->server->hostname);
324
325 /* The length of this blob is a size which is
326 * six times the size of a structure which holds name/size +
327 * two times the unicode length of a domain name +
328 * two times the unicode length of a server name +
329 * size of a timestamp (which is 8 bytes).
330 */
331 ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
332 ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
333 if (!ses->auth_key.response) {
334 ses->auth_key.len = 0;
335 cERROR(1, "Challenge target info allocation failure");
336 return -ENOMEM;
337 }
338
339 blobptr = ses->auth_key.response;
340 attrptr = (struct ntlmssp2_name *) blobptr;
341
342 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
343 attrptr->length = cpu_to_le16(2 * dlen);
344 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
345 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
346
347 blobptr += 2 * dlen;
348 attrptr = (struct ntlmssp2_name *) blobptr;
349
350 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
351 attrptr->length = cpu_to_le16(2 * wlen);
352 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
353 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
354
355 blobptr += 2 * wlen;
356 attrptr = (struct ntlmssp2_name *) blobptr;
357
358 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
359 attrptr->length = cpu_to_le16(2 * dlen);
360 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
361 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
362
363 blobptr += 2 * dlen;
364 attrptr = (struct ntlmssp2_name *) blobptr;
365
366 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
367 attrptr->length = cpu_to_le16(2 * wlen);
368 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
369 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
370
371 blobptr += 2 * wlen;
372 attrptr = (struct ntlmssp2_name *) blobptr;
373
374 attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
375 attrptr->length = cpu_to_le16(sizeof(__le64));
376 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
377 curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
378 memcpy(blobptr, &curtime, sizeof(__le64));
379
380 return 0;
381}
382
383/* Server has provided av pairs/target info in the type 2 challenge
384 * packet and we have plucked it and stored within smb session.
385 * We parse that blob here to find netbios domain name to be used
386 * as part of ntlmv2 authentication (in Target String), if not already
387 * specified on the command line.
388 * If this function returns without any error but without fetching
389 * domain name, authentication may fail against some server but
390 * may not fail against other (those who are not very particular
391 * about target string i.e. for some, just user name might suffice.
392 */
393static int
394find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
395{
396 unsigned int attrsize;
397 unsigned int type;
398 unsigned int onesize = sizeof(struct ntlmssp2_name);
399 unsigned char *blobptr;
400 unsigned char *blobend;
401 struct ntlmssp2_name *attrptr;
402
403 if (!ses->auth_key.len || !ses->auth_key.response)
404 return 0;
405
406 blobptr = ses->auth_key.response;
407 blobend = blobptr + ses->auth_key.len;
408
409 while (blobptr + onesize < blobend) {
410 attrptr = (struct ntlmssp2_name *) blobptr;
411 type = le16_to_cpu(attrptr->type);
412 if (type == NTLMSSP_AV_EOL)
413 break;
414 blobptr += 2; /* advance attr type */
415 attrsize = le16_to_cpu(attrptr->length);
416 blobptr += 2; /* advance attr size */
417 if (blobptr + attrsize > blobend)
418 break;
419 if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
420 if (!attrsize)
421 break;
422 if (!ses->domainName) {
423 ses->domainName =
424 kmalloc(attrsize + 1, GFP_KERNEL);
425 if (!ses->domainName)
426 return -ENOMEM;
427 cifs_from_ucs2(ses->domainName,
428 (__le16 *)blobptr, attrsize, attrsize,
429 nls_cp, false);
430 break;
431 }
432 }
433 blobptr += attrsize; /* advance attr value */
434 }
435
436 return 0;
437}
438
439static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
266 const struct nls_table *nls_cp) 440 const struct nls_table *nls_cp)
267{ 441{
268 int rc = 0; 442 int rc = 0;
269 int len; 443 int len;
270 char nt_hash[16]; 444 char nt_hash[CIFS_NTHASH_SIZE];
271 struct HMACMD5Context *pctxt;
272 wchar_t *user; 445 wchar_t *user;
273 wchar_t *domain; 446 wchar_t *domain;
447 wchar_t *server;
274 448
275 pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL); 449 if (!ses->server->secmech.sdeschmacmd5) {
276 450 cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
277 if (pctxt == NULL) 451 return -1;
278 return -ENOMEM; 452 }
279 453
280 /* calculate md4 hash of password */ 454 /* calculate md4 hash of password */
281 E_md4hash(ses->password, nt_hash); 455 E_md4hash(ses->password, nt_hash);
282 456
283 /* convert Domainname to unicode and uppercase */ 457 crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
284 hmac_md5_init_limK_to_64(nt_hash, 16, pctxt); 458 CIFS_NTHASH_SIZE);
459
460 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
461 if (rc) {
462 cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
463 return rc;
464 }
285 465
286 /* convert ses->userName to unicode and uppercase */ 466 /* convert ses->userName to unicode and uppercase */
287 len = strlen(ses->userName); 467 len = strlen(ses->userName);
288 user = kmalloc(2 + (len * 2), GFP_KERNEL); 468 user = kmalloc(2 + (len * 2), GFP_KERNEL);
289 if (user == NULL) 469 if (user == NULL) {
470 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
471 rc = -ENOMEM;
290 goto calc_exit_2; 472 goto calc_exit_2;
473 }
291 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); 474 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
292 UniStrupr(user); 475 UniStrupr(user);
293 hmac_md5_update((char *)user, 2*len, pctxt); 476
477 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
478 (char *)user, 2 * len);
294 479
295 /* convert ses->domainName to unicode and uppercase */ 480 /* convert ses->domainName to unicode and uppercase */
296 if (ses->domainName) { 481 if (ses->domainName) {
297 len = strlen(ses->domainName); 482 len = strlen(ses->domainName);
298 483
299 domain = kmalloc(2 + (len * 2), GFP_KERNEL); 484 domain = kmalloc(2 + (len * 2), GFP_KERNEL);
300 if (domain == NULL) 485 if (domain == NULL) {
486 cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
487 rc = -ENOMEM;
301 goto calc_exit_1; 488 goto calc_exit_1;
489 }
302 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, 490 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
303 nls_cp); 491 nls_cp);
304 /* the following line was removed since it didn't work well 492 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
305 with lower cased domain name that passed as an option. 493 (char *)domain, 2 * len);
306 Maybe converting the domain name earlier makes sense */
307 /* UniStrupr(domain); */
308
309 hmac_md5_update((char *)domain, 2*len, pctxt);
310
311 kfree(domain); 494 kfree(domain);
495 } else if (ses->serverName) {
496 len = strlen(ses->serverName);
497
498 server = kmalloc(2 + (len * 2), GFP_KERNEL);
499 if (server == NULL) {
500 cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
501 rc = -ENOMEM;
502 goto calc_exit_1;
503 }
504 len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
505 nls_cp);
506 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
507 (char *)server, 2 * len);
508 kfree(server);
312 } 509 }
510
511 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
512 ntlmv2_hash);
513
313calc_exit_1: 514calc_exit_1:
314 kfree(user); 515 kfree(user);
315calc_exit_2: 516calc_exit_2:
316 /* BB FIXME what about bytes 24 through 40 of the signing key? 517 return rc;
317 compare with the NTLM example */ 518}
318 hmac_md5_final(ses->server->ntlmv2_hash, pctxt); 519
520static int
521CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
522{
523 int rc;
524 unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
525
526 if (!ses->server->secmech.sdeschmacmd5) {
527 cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
528 return -1;
529 }
530
531 crypto_shash_setkey(ses->server->secmech.hmacmd5,
532 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
533
534 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
535 if (rc) {
536 cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
537 return rc;
538 }
539
540 if (ses->server->secType == RawNTLMSSP)
541 memcpy(ses->auth_key.response + offset,
542 ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
543 else
544 memcpy(ses->auth_key.response + offset,
545 ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
546 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
547 ses->auth_key.response + offset, ses->auth_key.len - offset);
548
549 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
550 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
319 551
320 kfree(pctxt);
321 return rc; 552 return rc;
322} 553}
323 554
324void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, 555
325 const struct nls_table *nls_cp) 556int
557setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
326{ 558{
327 int rc; 559 int rc;
328 struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; 560 int baselen;
329 struct HMACMD5Context context; 561 unsigned int tilen;
562 struct ntlmv2_resp *buf;
563 char ntlmv2_hash[16];
564 unsigned char *tiblob = NULL; /* target info blob */
565
566 if (ses->server->secType == RawNTLMSSP) {
567 if (!ses->domainName) {
568 rc = find_domain_name(ses, nls_cp);
569 if (rc) {
570 cERROR(1, "error %d finding domain name", rc);
571 goto setup_ntlmv2_rsp_ret;
572 }
573 }
574 } else {
575 rc = build_avpair_blob(ses, nls_cp);
576 if (rc) {
577 cERROR(1, "error %d building av pair blob", rc);
578 goto setup_ntlmv2_rsp_ret;
579 }
580 }
330 581
582 baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
583 tilen = ses->auth_key.len;
584 tiblob = ses->auth_key.response;
585
586 ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
587 if (!ses->auth_key.response) {
588 rc = ENOMEM;
589 ses->auth_key.len = 0;
590 cERROR(1, "%s: Can't allocate auth blob", __func__);
591 goto setup_ntlmv2_rsp_ret;
592 }
593 ses->auth_key.len += baselen;
594
595 buf = (struct ntlmv2_resp *)
596 (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
331 buf->blob_signature = cpu_to_le32(0x00000101); 597 buf->blob_signature = cpu_to_le32(0x00000101);
332 buf->reserved = 0; 598 buf->reserved = 0;
333 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 599 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
334 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); 600 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
335 buf->reserved2 = 0; 601 buf->reserved2 = 0;
336 buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
337 buf->names[0].length = 0;
338 buf->names[1].type = 0;
339 buf->names[1].length = 0;
340 602
341 /* calculate buf->ntlmv2_hash */ 603 memcpy(ses->auth_key.response + baselen, tiblob, tilen);
342 rc = calc_ntlmv2_hash(ses, nls_cp); 604
343 if (rc) 605 /* calculate ntlmv2_hash */
606 rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
607 if (rc) {
344 cERROR(1, "could not get v2 hash rc %d", rc); 608 cERROR(1, "could not get v2 hash rc %d", rc);
345 CalcNTLMv2_response(ses, resp_buf); 609 goto setup_ntlmv2_rsp_ret;
610 }
611
612 /* calculate first part of the client response (CR1) */
613 rc = CalcNTLMv2_response(ses, ntlmv2_hash);
614 if (rc) {
615 cERROR(1, "Could not calculate CR1 rc: %d", rc);
616 goto setup_ntlmv2_rsp_ret;
617 }
618
619 /* now calculate the session key for NTLMv2 */
620 crypto_shash_setkey(ses->server->secmech.hmacmd5,
621 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
622
623 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
624 if (rc) {
625 cERROR(1, "%s: Could not init hmacmd5\n", __func__);
626 goto setup_ntlmv2_rsp_ret;
627 }
628
629 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
630 ses->auth_key.response + CIFS_SESS_KEY_SIZE,
631 CIFS_HMAC_MD5_HASH_SIZE);
632
633 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
634 ses->auth_key.response);
635
636setup_ntlmv2_rsp_ret:
637 kfree(tiblob);
638
639 return rc;
640}
346 641
347 /* now calculate the MAC key for NTLMv2 */ 642int
348 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); 643calc_seckey(struct cifsSesInfo *ses)
349 hmac_md5_update(resp_buf, 16, &context); 644{
350 hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context); 645 int rc;
646 struct crypto_blkcipher *tfm_arc4;
647 struct scatterlist sgin, sgout;
648 struct blkcipher_desc desc;
649 unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
650
651 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
652
653 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
654 if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
655 cERROR(1, "could not allocate crypto API arc4\n");
656 return PTR_ERR(tfm_arc4);
657 }
658
659 desc.tfm = tfm_arc4;
660
661 crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
662 CIFS_SESS_KEY_SIZE);
663
664 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
665 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
666
667 rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
668 if (rc) {
669 cERROR(1, "could not encrypt session key rc: %d\n", rc);
670 crypto_free_blkcipher(tfm_arc4);
671 return rc;
672 }
673
674 /* make secondary_key/nonce as session key */
675 memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
676 /* and make len as that of session key only */
677 ses->auth_key.len = CIFS_SESS_KEY_SIZE;
351 678
352 memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf, 679 crypto_free_blkcipher(tfm_arc4);
353 sizeof(struct ntlmv2_resp)); 680
354 ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp); 681 return 0;
355} 682}
356 683
357void CalcNTLMv2_response(const struct cifsSesInfo *ses, 684void
358 char *v2_session_response) 685cifs_crypto_shash_release(struct TCP_Server_Info *server)
359{ 686{
360 struct HMACMD5Context context; 687 if (server->secmech.md5)
361 /* rest of v2 struct already generated */ 688 crypto_free_shash(server->secmech.md5);
362 memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
363 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
364 689
365 hmac_md5_update(v2_session_response+8, 690 if (server->secmech.hmacmd5)
366 sizeof(struct ntlmv2_resp) - 8, &context); 691 crypto_free_shash(server->secmech.hmacmd5);
367 692
368 hmac_md5_final(v2_session_response, &context); 693 kfree(server->secmech.sdeschmacmd5);
369/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ 694
695 kfree(server->secmech.sdescmd5);
696}
697
698int
699cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
700{
701 int rc;
702 unsigned int size;
703
704 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
705 if (!server->secmech.hmacmd5 ||
706 IS_ERR(server->secmech.hmacmd5)) {
707 cERROR(1, "could not allocate crypto hmacmd5\n");
708 return PTR_ERR(server->secmech.hmacmd5);
709 }
710
711 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
712 if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) {
713 cERROR(1, "could not allocate crypto md5\n");
714 rc = PTR_ERR(server->secmech.md5);
715 goto crypto_allocate_md5_fail;
716 }
717
718 size = sizeof(struct shash_desc) +
719 crypto_shash_descsize(server->secmech.hmacmd5);
720 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
721 if (!server->secmech.sdeschmacmd5) {
722 cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
723 rc = -ENOMEM;
724 goto crypto_allocate_hmacmd5_sdesc_fail;
725 }
726 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
727 server->secmech.sdeschmacmd5->shash.flags = 0x0;
728
729
730 size = sizeof(struct shash_desc) +
731 crypto_shash_descsize(server->secmech.md5);
732 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
733 if (!server->secmech.sdescmd5) {
734 cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
735 rc = -ENOMEM;
736 goto crypto_allocate_md5_sdesc_fail;
737 }
738 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
739 server->secmech.sdescmd5->shash.flags = 0x0;
740
741 return 0;
742
743crypto_allocate_md5_sdesc_fail:
744 kfree(server->secmech.sdeschmacmd5);
745
746crypto_allocate_hmacmd5_sdesc_fail:
747 crypto_free_shash(server->secmech.md5);
748
749crypto_allocate_md5_fail:
750 crypto_free_shash(server->secmech.hmacmd5);
751
752 return rc;
370} 753}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b7431afdd76d..9c3789762ab7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,7 +35,7 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/smp_lock.h> 38#include <net/ipv6.h>
39#include "cifsfs.h" 39#include "cifsfs.h"
40#include "cifspdu.h" 40#include "cifspdu.h"
41#define DECLARE_GLOBALS_HERE 41#define DECLARE_GLOBALS_HERE
@@ -82,6 +82,24 @@ extern mempool_t *cifs_sm_req_poolp;
82extern mempool_t *cifs_req_poolp; 82extern mempool_t *cifs_req_poolp;
83extern mempool_t *cifs_mid_poolp; 83extern mempool_t *cifs_mid_poolp;
84 84
85void
86cifs_sb_active(struct super_block *sb)
87{
88 struct cifs_sb_info *server = CIFS_SB(sb);
89
90 if (atomic_inc_return(&server->active) == 1)
91 atomic_inc(&sb->s_active);
92}
93
94void
95cifs_sb_deactive(struct super_block *sb)
96{
97 struct cifs_sb_info *server = CIFS_SB(sb);
98
99 if (atomic_dec_and_test(&server->active))
100 deactivate_super(sb);
101}
102
85static int 103static int
86cifs_read_super(struct super_block *sb, void *data, 104cifs_read_super(struct super_block *sb, void *data,
87 const char *devname, int silent) 105 const char *devname, int silent)
@@ -97,6 +115,9 @@ cifs_read_super(struct super_block *sb, void *data,
97 if (cifs_sb == NULL) 115 if (cifs_sb == NULL)
98 return -ENOMEM; 116 return -ENOMEM;
99 117
118 spin_lock_init(&cifs_sb->tlink_tree_lock);
119 cifs_sb->tlink_tree = RB_ROOT;
120
100 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 121 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
101 if (rc) { 122 if (rc) {
102 kfree(cifs_sb); 123 kfree(cifs_sb);
@@ -136,9 +157,6 @@ cifs_read_super(struct super_block *sb, void *data,
136 sb->s_magic = CIFS_MAGIC_NUMBER; 157 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 158 sb->s_op = &cifs_super_ops;
138 sb->s_bdi = &cifs_sb->bdi; 159 sb->s_bdi = &cifs_sb->bdi;
139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
140 sb->s_blocksize =
141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
142 sb->s_blocksize = CIFS_MAX_MSGSIZE; 160 sb->s_blocksize = CIFS_MAX_MSGSIZE;
143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 161 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
144 inode = cifs_root_iget(sb, ROOT_I); 162 inode = cifs_root_iget(sb, ROOT_I);
@@ -200,8 +218,6 @@ cifs_put_super(struct super_block *sb)
200 return; 218 return;
201 } 219 }
202 220
203 lock_kernel();
204
205 rc = cifs_umount(sb, cifs_sb); 221 rc = cifs_umount(sb, cifs_sb);
206 if (rc) 222 if (rc)
207 cERROR(1, "cifs_umount failed with return code %d", rc); 223 cERROR(1, "cifs_umount failed with return code %d", rc);
@@ -215,8 +231,6 @@ cifs_put_super(struct super_block *sb)
215 unload_nls(cifs_sb->local_nls); 231 unload_nls(cifs_sb->local_nls);
216 bdi_destroy(&cifs_sb->bdi); 232 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 233 kfree(cifs_sb);
218
219 unlock_kernel();
220} 234}
221 235
222static int 236static int
@@ -224,7 +238,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
224{ 238{
225 struct super_block *sb = dentry->d_sb; 239 struct super_block *sb = dentry->d_sb;
226 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 240 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
227 struct cifsTconInfo *tcon = cifs_sb->tcon; 241 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
228 int rc = -EOPNOTSUPP; 242 int rc = -EOPNOTSUPP;
229 int xid; 243 int xid;
230 244
@@ -304,12 +318,10 @@ cifs_alloc_inode(struct super_block *sb)
304 return NULL; 318 return NULL;
305 cifs_inode->cifsAttrs = 0x20; /* default */ 319 cifs_inode->cifsAttrs = 0x20; /* default */
306 cifs_inode->time = 0; 320 cifs_inode->time = 0;
307 cifs_inode->write_behind_rc = 0;
308 /* Until the file is open and we have gotten oplock 321 /* Until the file is open and we have gotten oplock
309 info back from the server, can not assume caching of 322 info back from the server, can not assume caching of
310 file data or metadata */ 323 file data or metadata */
311 cifs_inode->clientCanCacheRead = false; 324 cifs_set_oplock_level(cifs_inode, 0);
312 cifs_inode->clientCanCacheAll = false;
313 cifs_inode->delete_pending = false; 325 cifs_inode->delete_pending = false;
314 cifs_inode->invalid_mapping = false; 326 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 327 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
@@ -366,14 +378,36 @@ static int
366cifs_show_options(struct seq_file *s, struct vfsmount *m) 378cifs_show_options(struct seq_file *s, struct vfsmount *m)
367{ 379{
368 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); 380 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
369 struct cifsTconInfo *tcon = cifs_sb->tcon; 381 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
382 struct sockaddr *srcaddr;
383 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
370 384
371 seq_printf(s, ",unc=%s", tcon->treeName); 385 seq_printf(s, ",unc=%s", tcon->treeName);
372 if (tcon->ses->userName) 386
387 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
388 seq_printf(s, ",multiuser");
389 else if (tcon->ses->userName)
373 seq_printf(s, ",username=%s", tcon->ses->userName); 390 seq_printf(s, ",username=%s", tcon->ses->userName);
391
374 if (tcon->ses->domainName) 392 if (tcon->ses->domainName)
375 seq_printf(s, ",domain=%s", tcon->ses->domainName); 393 seq_printf(s, ",domain=%s", tcon->ses->domainName);
376 394
395 if (srcaddr->sa_family != AF_UNSPEC) {
396 struct sockaddr_in *saddr4;
397 struct sockaddr_in6 *saddr6;
398 saddr4 = (struct sockaddr_in *)srcaddr;
399 saddr6 = (struct sockaddr_in6 *)srcaddr;
400 if (srcaddr->sa_family == AF_INET6)
401 seq_printf(s, ",srcaddr=%pI6c",
402 &saddr6->sin6_addr);
403 else if (srcaddr->sa_family == AF_INET)
404 seq_printf(s, ",srcaddr=%pI4",
405 &saddr4->sin_addr.s_addr);
406 else
407 seq_printf(s, ",srcaddr=BAD-AF:%i",
408 (int)(srcaddr->sa_family));
409 }
410
377 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); 411 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
378 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) 412 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
379 seq_printf(s, ",forceuid"); 413 seq_printf(s, ",forceuid");
@@ -422,6 +456,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
422 seq_printf(s, ",dynperm"); 456 seq_printf(s, ",dynperm");
423 if (m->mnt_sb->s_flags & MS_POSIXACL) 457 if (m->mnt_sb->s_flags & MS_POSIXACL)
424 seq_printf(s, ",acl"); 458 seq_printf(s, ",acl");
459 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
460 seq_printf(s, ",mfsymlinks");
425 461
426 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 462 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
427 seq_printf(s, ",wsize=%d", cifs_sb->wsize); 463 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
@@ -437,20 +473,18 @@ static void cifs_umount_begin(struct super_block *sb)
437 if (cifs_sb == NULL) 473 if (cifs_sb == NULL)
438 return; 474 return;
439 475
440 tcon = cifs_sb->tcon; 476 tcon = cifs_sb_master_tcon(cifs_sb);
441 if (tcon == NULL)
442 return;
443 477
444 read_lock(&cifs_tcp_ses_lock); 478 spin_lock(&cifs_tcp_ses_lock);
445 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { 479 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
446 /* we have other mounts to same share or we have 480 /* we have other mounts to same share or we have
447 already tried to force umount this and woken up 481 already tried to force umount this and woken up
448 all waiting network requests, nothing to do */ 482 all waiting network requests, nothing to do */
449 read_unlock(&cifs_tcp_ses_lock); 483 spin_unlock(&cifs_tcp_ses_lock);
450 return; 484 return;
451 } else if (tcon->tc_count == 1) 485 } else if (tcon->tc_count == 1)
452 tcon->tidStatus = CifsExiting; 486 tcon->tidStatus = CifsExiting;
453 read_unlock(&cifs_tcp_ses_lock); 487 spin_unlock(&cifs_tcp_ses_lock);
454 488
455 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 489 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
456 /* cancel_notify_requests(tcon); */ 490 /* cancel_notify_requests(tcon); */
@@ -509,28 +543,29 @@ static const struct super_operations cifs_super_ops = {
509#endif 543#endif
510}; 544};
511 545
512static int 546static struct dentry *
513cifs_get_sb(struct file_system_type *fs_type, 547cifs_do_mount(struct file_system_type *fs_type,
514 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 548 int flags, const char *dev_name, void *data)
515{ 549{
516 int rc; 550 int rc;
517 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 551 struct super_block *sb;
552
553 sb = sget(fs_type, NULL, set_anon_super, NULL);
518 554
519 cFYI(1, "Devname: %s flags: %d ", dev_name, flags); 555 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
520 556
521 if (IS_ERR(sb)) 557 if (IS_ERR(sb))
522 return PTR_ERR(sb); 558 return ERR_CAST(sb);
523 559
524 sb->s_flags = flags; 560 sb->s_flags = flags;
525 561
526 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0); 562 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
527 if (rc) { 563 if (rc) {
528 deactivate_locked_super(sb); 564 deactivate_locked_super(sb);
529 return rc; 565 return ERR_PTR(rc);
530 } 566 }
531 sb->s_flags |= MS_ACTIVE; 567 sb->s_flags |= MS_ACTIVE;
532 simple_set_mnt(mnt, sb); 568 return dget(sb->s_root);
533 return 0;
534} 569}
535 570
536static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 571static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -565,9 +600,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
565 600
566static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 601static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
567{ 602{
568 /* note that this is called by vfs setlease with the BKL held 603 /* note that this is called by vfs setlease with lock_flocks held
569 although I doubt that BKL is needed here in cifs */ 604 to protect *lease from going away */
570 struct inode *inode = file->f_path.dentry->d_inode; 605 struct inode *inode = file->f_path.dentry->d_inode;
606 struct cifsFileInfo *cfile = file->private_data;
571 607
572 if (!(S_ISREG(inode->i_mode))) 608 if (!(S_ISREG(inode->i_mode)))
573 return -EINVAL; 609 return -EINVAL;
@@ -578,8 +614,8 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
578 ((arg == F_WRLCK) && 614 ((arg == F_WRLCK) &&
579 (CIFS_I(inode)->clientCanCacheAll))) 615 (CIFS_I(inode)->clientCanCacheAll)))
580 return generic_setlease(file, arg, lease); 616 return generic_setlease(file, arg, lease);
581 else if (CIFS_SB(inode->i_sb)->tcon->local_lease && 617 else if (tlink_tcon(cfile->tlink)->local_lease &&
582 !CIFS_I(inode)->clientCanCacheRead) 618 !CIFS_I(inode)->clientCanCacheRead)
583 /* If the server claims to support oplock on this 619 /* If the server claims to support oplock on this
584 file, then we still need to check oplock even 620 file, then we still need to check oplock even
585 if the local_lease mount option is set, but there 621 if the local_lease mount option is set, but there
@@ -595,7 +631,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
595struct file_system_type cifs_fs_type = { 631struct file_system_type cifs_fs_type = {
596 .owner = THIS_MODULE, 632 .owner = THIS_MODULE,
597 .name = "cifs", 633 .name = "cifs",
598 .get_sb = cifs_get_sb, 634 .mount = cifs_do_mount,
599 .kill_sb = kill_anon_super, 635 .kill_sb = kill_anon_super,
600 /* .fs_flags */ 636 /* .fs_flags */
601}; 637};
@@ -898,8 +934,8 @@ init_cifs(void)
898 GlobalTotalActiveXid = 0; 934 GlobalTotalActiveXid = 0;
899 GlobalMaxActiveXid = 0; 935 GlobalMaxActiveXid = 0;
900 memset(Local_System_Name, 0, 15); 936 memset(Local_System_Name, 0, 15);
901 rwlock_init(&GlobalSMBSeslock); 937 spin_lock_init(&cifs_tcp_ses_lock);
902 rwlock_init(&cifs_tcp_ses_lock); 938 spin_lock_init(&cifs_file_list_lock);
903 spin_lock_init(&GlobalMid_Lock); 939 spin_lock_init(&GlobalMid_Lock);
904 940
905 if (cifs_max_pending < 2) { 941 if (cifs_max_pending < 2) {
@@ -912,11 +948,11 @@ init_cifs(void)
912 948
913 rc = cifs_fscache_register(); 949 rc = cifs_fscache_register();
914 if (rc) 950 if (rc)
915 goto out; 951 goto out_clean_proc;
916 952
917 rc = cifs_init_inodecache(); 953 rc = cifs_init_inodecache();
918 if (rc) 954 if (rc)
919 goto out_clean_proc; 955 goto out_unreg_fscache;
920 956
921 rc = cifs_init_mids(); 957 rc = cifs_init_mids();
922 if (rc) 958 if (rc)
@@ -938,19 +974,19 @@ init_cifs(void)
938 return 0; 974 return 0;
939 975
940#ifdef CONFIG_CIFS_UPCALL 976#ifdef CONFIG_CIFS_UPCALL
941 out_unregister_filesystem: 977out_unregister_filesystem:
942 unregister_filesystem(&cifs_fs_type); 978 unregister_filesystem(&cifs_fs_type);
943#endif 979#endif
944 out_destroy_request_bufs: 980out_destroy_request_bufs:
945 cifs_destroy_request_bufs(); 981 cifs_destroy_request_bufs();
946 out_destroy_mids: 982out_destroy_mids:
947 cifs_destroy_mids(); 983 cifs_destroy_mids();
948 out_destroy_inodecache: 984out_destroy_inodecache:
949 cifs_destroy_inodecache(); 985 cifs_destroy_inodecache();
950 out_clean_proc: 986out_unreg_fscache:
951 cifs_proc_clean();
952 cifs_fscache_unregister(); 987 cifs_fscache_unregister();
953 out: 988out_clean_proc:
989 cifs_proc_clean();
954 return rc; 990 return rc;
955} 991}
956 992
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d82f5fb4761e..897b2b2b28b5 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -42,10 +42,8 @@ extern const struct address_space_operations cifs_addr_ops;
42extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
43 43
44/* Functions related to super block operations */ 44/* Functions related to super block operations */
45/* extern const struct super_operations cifs_super_ops;*/ 45extern void cifs_sb_active(struct super_block *sb);
46extern void cifs_read_inode(struct inode *); 46extern void cifs_sb_deactive(struct super_block *sb);
47/*extern void cifs_delete_inode(struct inode *);*/ /* BB not needed yet */
48/* extern void cifs_write_inode(struct inode *); */ /* BB not needed yet */
49 47
50/* Functions related to inodes */ 48/* Functions related to inodes */
51extern const struct inode_operations cifs_dir_inode_ops; 49extern const struct inode_operations cifs_dir_inode_ops;
@@ -104,7 +102,7 @@ extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
104extern int cifs_symlink(struct inode *inode, struct dentry *direntry, 102extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
105 const char *symname); 103 const char *symname);
106extern int cifs_removexattr(struct dentry *, const char *); 104extern int cifs_removexattr(struct dentry *, const char *);
107extern int cifs_setxattr(struct dentry *, const char *, const void *, 105extern int cifs_setxattr(struct dentry *, const char *, const void *,
108 size_t, int); 106 size_t, int);
109extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); 107extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
110extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 108extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
@@ -114,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
114extern const struct export_operations cifs_export_ops; 112extern const struct export_operations cifs_export_ops;
115#endif /* EXPERIMENTAL */ 113#endif /* EXPERIMENTAL */
116 114
117#define CIFS_VERSION "1.65" 115#define CIFS_VERSION "1.68"
118#endif /* _CIFSFS_H */ 116#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac6..b577bf0a1bb3 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include "cifs_fs_sb.h" 26#include "cifs_fs_sb.h"
27#include "cifsacl.h" 27#include "cifsacl.h"
28#include <crypto/internal/hash.h>
29#include <linux/scatterlist.h>
30
28/* 31/*
29 * The sizes of various internal tables and strings 32 * The sizes of various internal tables and strings
30 */ 33 */
@@ -74,7 +77,7 @@
74 * CIFS vfs client Status information (based on what we know.) 77 * CIFS vfs client Status information (based on what we know.)
75 */ 78 */
76 79
77 /* associated with each tcp and smb session */ 80/* associated with each tcp and smb session */
78enum statusEnum { 81enum statusEnum {
79 CifsNew = 0, 82 CifsNew = 0,
80 CifsGood, 83 CifsGood,
@@ -97,16 +100,31 @@ enum protocolEnum {
97 /* Netbios frames protocol not supported at this time */ 100 /* Netbios frames protocol not supported at this time */
98}; 101};
99 102
100struct mac_key { 103struct session_key {
101 unsigned int len; 104 unsigned int len;
102 union { 105 char *response;
103 char ntlm[CIFS_SESS_KEY_SIZE + 16]; 106};
104 char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */ 107
105 struct { 108/* crypto security descriptor definition */
106 char key[16]; 109struct sdesc {
107 struct ntlmv2_resp resp; 110 struct shash_desc shash;
108 } ntlmv2; 111 char ctx[];
109 } data; 112};
113
114/* crypto hashing related structure/fields, not specific to a sec mech */
115struct cifs_secmech {
116 struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
117 struct crypto_shash *md5; /* md5 hash function */
118 struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
119 struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
120};
121
122/* per smb session structure/fields */
123struct ntlmssp_auth {
124 __u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */
125 __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
126 unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
127 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
110}; 128};
111 129
112struct cifs_cred { 130struct cifs_cred {
@@ -139,6 +157,7 @@ struct TCP_Server_Info {
139 struct sockaddr_in sockAddr; 157 struct sockaddr_in sockAddr;
140 struct sockaddr_in6 sockAddr6; 158 struct sockaddr_in6 sockAddr6;
141 } addr; 159 } addr;
160 struct sockaddr_storage srcaddr; /* locally bind to this IP */
142 wait_queue_head_t response_q; 161 wait_queue_head_t response_q;
143 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 162 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
144 struct list_head pending_mid_q; 163 struct list_head pending_mid_q;
@@ -178,19 +197,20 @@ struct TCP_Server_Info {
178 int capabilities; /* allow selective disabling of caps by smb sess */ 197 int capabilities; /* allow selective disabling of caps by smb sess */
179 int timeAdj; /* Adjust for difference in server time zone in sec */ 198 int timeAdj; /* Adjust for difference in server time zone in sec */
180 __u16 CurrentMid; /* multiplex id - rotating counter */ 199 __u16 CurrentMid; /* multiplex id - rotating counter */
181 char cryptKey[CIFS_CRYPTO_KEY_SIZE]; 200 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
182 /* 16th byte of RFC1001 workstation name is always null */ 201 /* 16th byte of RFC1001 workstation name is always null */
183 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 202 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
184 __u32 sequence_number; /* needed for CIFS PDU signature */ 203 __u32 sequence_number; /* needed for CIFS PDU signature */
185 struct mac_key mac_signing_key; 204 struct session_key session_key;
186 char ntlmv2_hash[16];
187 unsigned long lstrp; /* when we got last response from this server */ 205 unsigned long lstrp; /* when we got last response from this server */
188 u16 dialect; /* dialect index that server chose */ 206 u16 dialect; /* dialect index that server chose */
207 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
189 /* extended security flavors that server supports */ 208 /* extended security flavors that server supports */
190 bool sec_kerberos; /* supports plain Kerberos */ 209 bool sec_kerberos; /* supports plain Kerberos */
191 bool sec_mskerberos; /* supports legacy MS Kerberos */ 210 bool sec_mskerberos; /* supports legacy MS Kerberos */
192 bool sec_kerberosu2u; /* supports U2U Kerberos */ 211 bool sec_kerberosu2u; /* supports U2U Kerberos */
193 bool sec_ntlmssp; /* supports NTLMSSP */ 212 bool sec_ntlmssp; /* supports NTLMSSP */
213 bool session_estab; /* mark when very first sess is established */
194#ifdef CONFIG_CIFS_FSCACHE 214#ifdef CONFIG_CIFS_FSCACHE
195 struct fscache_cookie *fscache; /* client index cache cookie */ 215 struct fscache_cookie *fscache; /* client index cache cookie */
196#endif 216#endif
@@ -222,6 +242,8 @@ struct cifsSesInfo {
222 char userName[MAX_USERNAME_SIZE + 1]; 242 char userName[MAX_USERNAME_SIZE + 1];
223 char *domainName; 243 char *domainName;
224 char *password; 244 char *password;
245 struct session_key auth_key;
246 struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
225 bool need_reconnect:1; /* connection reset, uid now invalid */ 247 bool need_reconnect:1; /* connection reset, uid now invalid */
226}; 248};
227/* no more than one of the following three session flags may be set */ 249/* no more than one of the following three session flags may be set */
@@ -308,6 +330,45 @@ struct cifsTconInfo {
308}; 330};
309 331
310/* 332/*
333 * This is a refcounted and timestamped container for a tcon pointer. The
334 * container holds a tcon reference. It is considered safe to free one of
335 * these when the tl_count goes to 0. The tl_time is the time of the last
336 * "get" on the container.
337 */
338struct tcon_link {
339 struct rb_node tl_rbnode;
340 uid_t tl_uid;
341 unsigned long tl_flags;
342#define TCON_LINK_MASTER 0
343#define TCON_LINK_PENDING 1
344#define TCON_LINK_IN_TREE 2
345 unsigned long tl_time;
346 atomic_t tl_count;
347 struct cifsTconInfo *tl_tcon;
348};
349
350extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
351
352static inline struct cifsTconInfo *
353tlink_tcon(struct tcon_link *tlink)
354{
355 return tlink->tl_tcon;
356}
357
358extern void cifs_put_tlink(struct tcon_link *tlink);
359
360static inline struct tcon_link *
361cifs_get_tlink(struct tcon_link *tlink)
362{
363 if (tlink && !IS_ERR(tlink))
364 atomic_inc(&tlink->tl_count);
365 return tlink;
366}
367
368/* This function is always expected to succeed */
369extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
370
371/*
311 * This info hangs off the cifsFileInfo structure, pointed to by llist. 372 * This info hangs off the cifsFileInfo structure, pointed to by llist.
312 * This is used to track byte stream locks on the file 373 * This is used to track byte stream locks on the file
313 */ 374 */
@@ -345,34 +406,29 @@ struct cifsFileInfo {
345 __u16 netfid; /* file id from remote */ 406 __u16 netfid; /* file id from remote */
346 /* BB add lock scope info here if needed */ ; 407 /* BB add lock scope info here if needed */ ;
347 /* lock scope id (0 if none) */ 408 /* lock scope id (0 if none) */
348 struct file *pfile; /* needed for writepage */ 409 struct dentry *dentry;
349 struct inode *pInode; /* needed for oplock break */ 410 unsigned int f_flags;
350 struct vfsmount *mnt; 411 struct tcon_link *tlink;
351 struct mutex lock_mutex; 412 struct mutex lock_mutex;
352 struct list_head llist; /* list of byte range locks we have. */ 413 struct list_head llist; /* list of byte range locks we have. */
353 bool closePend:1; /* file is marked to close */
354 bool invalidHandle:1; /* file closed via session abend */ 414 bool invalidHandle:1; /* file closed via session abend */
355 bool oplock_break_cancelled:1; 415 bool oplock_break_cancelled:1;
356 atomic_t count; /* reference count */ 416 int count; /* refcount protected by cifs_file_list_lock */
357 struct mutex fh_mutex; /* prevents reopen race after dead ses*/ 417 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
358 struct cifs_search_info srch_inf; 418 struct cifs_search_info srch_inf;
359 struct work_struct oplock_break; /* work for oplock breaks */ 419 struct work_struct oplock_break; /* work for oplock breaks */
360}; 420};
361 421
362/* Take a reference on the file private data */ 422/*
423 * Take a reference on the file private data. Must be called with
424 * cifs_file_list_lock held.
425 */
363static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) 426static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
364{ 427{
365 atomic_inc(&cifs_file->count); 428 ++cifs_file->count;
366} 429}
367 430
368/* Release a reference on the file private data */ 431void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
369static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
370{
371 if (atomic_dec_and_test(&cifs_file->count)) {
372 iput(cifs_file->pInode);
373 kfree(cifs_file);
374 }
375}
376 432
377/* 433/*
378 * One of these for each file inode 434 * One of these for each file inode
@@ -382,7 +438,6 @@ struct cifsInodeInfo {
382 struct list_head lockList; 438 struct list_head lockList;
383 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 439 /* BB add in lists for dirty pages i.e. write caching info for oplock */
384 struct list_head openFileList; 440 struct list_head openFileList;
385 int write_behind_rc;
386 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 441 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
387 unsigned long time; /* jiffies of last update/check of inode */ 442 unsigned long time; /* jiffies of last update/check of inode */
388 bool clientCanCacheRead:1; /* read oplock */ 443 bool clientCanCacheRead:1; /* read oplock */
@@ -474,16 +529,16 @@ struct oplock_q_entry {
474 529
475/* for pending dnotify requests */ 530/* for pending dnotify requests */
476struct dir_notify_req { 531struct dir_notify_req {
477 struct list_head lhead; 532 struct list_head lhead;
478 __le16 Pid; 533 __le16 Pid;
479 __le16 PidHigh; 534 __le16 PidHigh;
480 __u16 Mid; 535 __u16 Mid;
481 __u16 Tid; 536 __u16 Tid;
482 __u16 Uid; 537 __u16 Uid;
483 __u16 netfid; 538 __u16 netfid;
484 __u32 filter; /* CompletionFilter (for multishot) */ 539 __u32 filter; /* CompletionFilter (for multishot) */
485 int multishot; 540 int multishot;
486 struct file *pfile; 541 struct file *pfile;
487}; 542};
488 543
489struct dfs_info3_param { 544struct dfs_info3_param {
@@ -633,7 +688,7 @@ require use of the stronger protocol */
633 * GlobalMid_Lock protects: 688 * GlobalMid_Lock protects:
634 * list operations on pending_mid_q and oplockQ 689 * list operations on pending_mid_q and oplockQ
635 * updates to XID counters, multiplex id and SMB sequence numbers 690 * updates to XID counters, multiplex id and SMB sequence numbers
636 * GlobalSMBSesLock protects: 691 * cifs_file_list_lock protects:
637 * list operations on tcp and SMB session lists and tCon lists 692 * list operations on tcp and SMB session lists and tCon lists
638 * f_owner.lock protects certain per file struct operations 693 * f_owner.lock protects certain per file struct operations
639 * mapping->page_lock protects certain per page operations 694 * mapping->page_lock protects certain per page operations
@@ -667,7 +722,7 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
667 * the reference counters for the server, smb session, and tcon. Finally, 722 * the reference counters for the server, smb session, and tcon. Finally,
668 * changes to the tcon->tidStatus should be done while holding this lock. 723 * changes to the tcon->tidStatus should be done while holding this lock.
669 */ 724 */
670GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; 725GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
671 726
672/* 727/*
673 * This lock protects the cifs_file->llist and cifs_file->flist 728 * This lock protects the cifs_file->llist and cifs_file->flist
@@ -676,7 +731,7 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
676 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then 731 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
677 * the cifs_tcp_ses_lock must be grabbed first and released last. 732 * the cifs_tcp_ses_lock must be grabbed first and released last.
678 */ 733 */
679GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; 734GLOBAL_EXTERN spinlock_t cifs_file_list_lock;
680 735
681/* Outstanding dir notify requests */ 736/* Outstanding dir notify requests */
682GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 737GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db11..de36b09763a8 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -131,9 +131,20 @@
131#define CIFS_CRYPTO_KEY_SIZE (8) 131#define CIFS_CRYPTO_KEY_SIZE (8)
132 132
133/* 133/*
134 * Size of the ntlm client response
135 */
136#define CIFS_AUTH_RESP_SIZE (24)
137
138/*
134 * Size of the session key (crypto key encrypted with the password 139 * Size of the session key (crypto key encrypted with the password
135 */ 140 */
136#define CIFS_SESS_KEY_SIZE (24) 141#define CIFS_SESS_KEY_SIZE (16)
142
143#define CIFS_CLIENT_CHALLENGE_SIZE (8)
144#define CIFS_SERVER_CHALLENGE_SIZE (8)
145#define CIFS_HMAC_MD5_HASH_SIZE (16)
146#define CIFS_CPHTXT_SIZE (16)
147#define CIFS_NTHASH_SIZE (16)
137 148
138/* 149/*
139 * Maximum user name length 150 * Maximum user name length
@@ -663,7 +674,6 @@ struct ntlmv2_resp {
663 __le64 time; 674 __le64 time;
664 __u64 client_chal; /* random */ 675 __u64 client_chal; /* random */
665 __u32 reserved2; 676 __u32 reserved2;
666 struct ntlmssp2_name names[2];
667 /* array of name entries could follow ending in minimum 4 byte struct */ 677 /* array of name entries could follow ending in minimum 4 byte struct */
668} __attribute__((packed)); 678} __attribute__((packed));
669 679
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1d60c655e3e0..7ed69b6b5fe6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,9 +78,9 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
78extern bool is_valid_oplock_break(struct smb_hdr *smb, 78extern bool is_valid_oplock_break(struct smb_hdr *smb,
79 struct TCP_Server_Info *); 79 struct TCP_Server_Info *);
80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); 81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
82#ifdef CONFIG_CIFS_EXPERIMENTAL 82#ifdef CONFIG_CIFS_EXPERIMENTAL
83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *); 83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
84#endif 84#endif
85extern unsigned int smbCalcSize(struct smb_hdr *ptr); 85extern unsigned int smbCalcSize(struct smb_hdr *ptr);
86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
@@ -104,13 +104,14 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
104extern u64 cifs_UnixTimeToNT(struct timespec); 104extern u64 cifs_UnixTimeToNT(struct timespec);
105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
106 int offset); 106 int offset);
107extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
107 108
108extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode, 109extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
109 __u16 fileHandle, struct file *file, 110 struct file *file, struct tcon_link *tlink,
110 struct vfsmount *mnt, unsigned int oflags); 111 __u32 oplock);
111extern int cifs_posix_open(char *full_path, struct inode **pinode, 112extern int cifs_posix_open(char *full_path, struct inode **pinode,
112 struct super_block *sb, 113 struct super_block *sb,
113 int mode, int oflags, 114 int mode, unsigned int f_flags,
114 __u32 *poplock, __u16 *pnetfid, int xid); 115 __u32 *poplock, __u16 *pnetfid, int xid);
115void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr); 116void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
116extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 117extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
@@ -362,13 +363,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
362extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, 363extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
363 __u32 *); 364 __u32 *);
364extern int cifs_verify_signature(struct smb_hdr *, 365extern int cifs_verify_signature(struct smb_hdr *,
365 const struct mac_key *mac_key, 366 struct TCP_Server_Info *server,
366 __u32 expected_sequence_number); 367 __u32 expected_sequence_number);
367extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 368extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
368 const char *pass); 369extern int setup_ntlm_response(struct cifsSesInfo *);
369extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); 370extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
370extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, 371extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
371 const struct nls_table *); 372extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
373extern int calc_seckey(struct cifsSesInfo *);
374
372#ifdef CONFIG_CIFS_WEAK_PW_HASH 375#ifdef CONFIG_CIFS_WEAK_PW_HASH
373extern void calc_lanman_hash(const char *password, const char *cryptkey, 376extern void calc_lanman_hash(const char *password, const char *cryptkey,
374 bool encrypt, char *lnm_session_key); 377 bool encrypt, char *lnm_session_key);
@@ -408,4 +411,8 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
408extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, 411extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
409 const int netfid, __u64 *pExtAttrBits, __u64 *pMask); 412 const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
410extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); 413extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
414extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
415extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
416 const unsigned char *path,
417 struct cifs_sb_info *cifs_sb, int xid);
411#endif /* _CIFSPROTO_H */ 418#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7e83b356cc9e..2f2632b6df5a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -91,13 +91,13 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
91 struct list_head *tmp1; 91 struct list_head *tmp1;
92 92
93/* list all files open on tree connection and mark them invalid */ 93/* list all files open on tree connection and mark them invalid */
94 write_lock(&GlobalSMBSeslock); 94 spin_lock(&cifs_file_list_lock);
95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { 95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
96 open_file = list_entry(tmp, struct cifsFileInfo, tlist); 96 open_file = list_entry(tmp, struct cifsFileInfo, tlist);
97 open_file->invalidHandle = true; 97 open_file->invalidHandle = true;
98 open_file->oplock_break_cancelled = true; 98 open_file->oplock_break_cancelled = true;
99 } 99 }
100 write_unlock(&GlobalSMBSeslock); 100 spin_unlock(&cifs_file_list_lock);
101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted 101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted
102 to this tcon */ 102 to this tcon */
103} 103}
@@ -503,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
503 503
504 if (rsp->EncryptionKeyLength == 504 if (rsp->EncryptionKeyLength ==
505 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { 505 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
506 memcpy(server->cryptKey, rsp->EncryptionKey, 506 memcpy(ses->server->cryptkey, rsp->EncryptionKey,
507 CIFS_CRYPTO_KEY_SIZE); 507 CIFS_CRYPTO_KEY_SIZE);
508 } else if (server->secMode & SECMODE_PW_ENCRYPT) { 508 } else if (server->secMode & SECMODE_PW_ENCRYPT) {
509 rc = -EIO; /* need cryptkey unless plain text */ 509 rc = -EIO; /* need cryptkey unless plain text */
@@ -574,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
574 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 574 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
575 server->timeAdj *= 60; 575 server->timeAdj *= 60;
576 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { 576 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
577 memcpy(server->cryptKey, pSMBr->u.EncryptionKey, 577 memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
578 CIFS_CRYPTO_KEY_SIZE); 578 CIFS_CRYPTO_KEY_SIZE);
579 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) 579 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
580 && (pSMBr->EncryptionKeyLength == 0)) { 580 && (pSMBr->EncryptionKeyLength == 0)) {
@@ -593,9 +593,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
593 rc = -EIO; 593 rc = -EIO;
594 goto neg_err_exit; 594 goto neg_err_exit;
595 } 595 }
596 read_lock(&cifs_tcp_ses_lock); 596 spin_lock(&cifs_tcp_ses_lock);
597 if (server->srv_count > 1) { 597 if (server->srv_count > 1) {
598 read_unlock(&cifs_tcp_ses_lock); 598 spin_unlock(&cifs_tcp_ses_lock);
599 if (memcmp(server->server_GUID, 599 if (memcmp(server->server_GUID,
600 pSMBr->u.extended_response. 600 pSMBr->u.extended_response.
601 GUID, 16) != 0) { 601 GUID, 16) != 0) {
@@ -605,7 +605,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
605 16); 605 16);
606 } 606 }
607 } else { 607 } else {
608 read_unlock(&cifs_tcp_ses_lock); 608 spin_unlock(&cifs_tcp_ses_lock);
609 memcpy(server->server_GUID, 609 memcpy(server->server_GUID,
610 pSMBr->u.extended_response.GUID, 16); 610 pSMBr->u.extended_response.GUID, 16);
611 } 611 }
@@ -620,13 +620,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
620 rc = 0; 620 rc = 0;
621 else 621 else
622 rc = -EINVAL; 622 rc = -EINVAL;
623 623 if (server->secType == Kerberos) {
624 if (server->sec_kerberos || server->sec_mskerberos) 624 if (!server->sec_kerberos &&
625 server->secType = Kerberos; 625 !server->sec_mskerberos)
626 else if (server->sec_ntlmssp) 626 rc = -EOPNOTSUPP;
627 server->secType = RawNTLMSSP; 627 } else if (server->secType == RawNTLMSSP) {
628 else 628 if (!server->sec_ntlmssp)
629 rc = -EOPNOTSUPP; 629 rc = -EOPNOTSUPP;
630 } else
631 rc = -EOPNOTSUPP;
630 } 632 }
631 } else 633 } else
632 server->capabilities &= ~CAP_EXTENDED_SECURITY; 634 server->capabilities &= ~CAP_EXTENDED_SECURITY;
diff --git a/fs/cifs/cn_cifs.h b/fs/cifs/cn_cifs.h
deleted file mode 100644
index ea59ccac2eb1..000000000000
--- a/fs/cifs/cn_cifs.h
+++ /dev/null
@@ -1,37 +0,0 @@
1/*
2 * fs/cifs/cn_cifs.h
3 *
4 * Copyright (c) International Business Machines Corp., 2002
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifndef _CN_CIFS_H
23#define _CN_CIFS_H
24#ifdef CONFIG_CIFS_UPCALL
25#include <linux/types.h>
26#include <linux/connector.h>
27
28struct cifs_upcall {
29 char signature[4]; /* CIFS */
30 enum command {
31 CIFS_GET_IP = 0x00000001, /* get ip address for hostname */
32 CIFS_GET_SECBLOB = 0x00000002, /* get SPNEGO wrapped blob */
33 } command;
34 /* union cifs upcall data follows */
35};
36#endif /* CIFS_UPCALL */
37#endif /* _CN_CIFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 88c84a38bccb..251a17c03545 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -47,7 +47,6 @@
47#include "ntlmssp.h" 47#include "ntlmssp.h"
48#include "nterr.h" 48#include "nterr.h"
49#include "rfc1002pdu.h" 49#include "rfc1002pdu.h"
50#include "cn_cifs.h"
51#include "fscache.h" 50#include "fscache.h"
52 51
53#define CIFS_PORT 445 52#define CIFS_PORT 445
@@ -100,16 +99,25 @@ struct smb_vol {
100 bool noautotune:1; 99 bool noautotune:1;
101 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ 100 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
102 bool fsc:1; /* enable fscache */ 101 bool fsc:1; /* enable fscache */
102 bool mfsymlinks:1; /* use Minshall+French Symlinks */
103 bool multiuser:1;
103 unsigned int rsize; 104 unsigned int rsize;
104 unsigned int wsize; 105 unsigned int wsize;
105 bool sockopt_tcp_nodelay:1; 106 bool sockopt_tcp_nodelay:1;
106 unsigned short int port; 107 unsigned short int port;
107 char *prepath; 108 char *prepath;
109 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
108 struct nls_table *local_nls; 110 struct nls_table *local_nls;
109}; 111};
110 112
113/* FIXME: should these be tunable? */
114#define TLINK_ERROR_EXPIRE (1 * HZ)
115#define TLINK_IDLE_EXPIRE (600 * HZ)
116
111static int ipv4_connect(struct TCP_Server_Info *server); 117static int ipv4_connect(struct TCP_Server_Info *server);
112static int ipv6_connect(struct TCP_Server_Info *server); 118static int ipv6_connect(struct TCP_Server_Info *server);
119static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
120static void cifs_prune_tlinks(struct work_struct *work);
113 121
114/* 122/*
115 * cifs tcp session reconnection 123 * cifs tcp session reconnection
@@ -143,7 +151,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
143 151
144 /* before reconnecting the tcp session, mark the smb session (uid) 152 /* before reconnecting the tcp session, mark the smb session (uid)
145 and the tid bad so they are not used until reconnected */ 153 and the tid bad so they are not used until reconnected */
146 read_lock(&cifs_tcp_ses_lock); 154 spin_lock(&cifs_tcp_ses_lock);
147 list_for_each(tmp, &server->smb_ses_list) { 155 list_for_each(tmp, &server->smb_ses_list) {
148 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 156 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
149 ses->need_reconnect = true; 157 ses->need_reconnect = true;
@@ -153,7 +161,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
153 tcon->need_reconnect = true; 161 tcon->need_reconnect = true;
154 } 162 }
155 } 163 }
156 read_unlock(&cifs_tcp_ses_lock); 164 spin_unlock(&cifs_tcp_ses_lock);
157 /* do not want to be sending data on a socket we are freeing */ 165 /* do not want to be sending data on a socket we are freeing */
158 mutex_lock(&server->srv_mutex); 166 mutex_lock(&server->srv_mutex);
159 if (server->ssocket) { 167 if (server->ssocket) {
@@ -166,6 +174,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
166 sock_release(server->ssocket); 174 sock_release(server->ssocket);
167 server->ssocket = NULL; 175 server->ssocket = NULL;
168 } 176 }
177 server->sequence_number = 0;
178 server->session_estab = false;
179 kfree(server->session_key.response);
180 server->session_key.response = NULL;
181 server->session_key.len = 0;
169 182
170 spin_lock(&GlobalMid_Lock); 183 spin_lock(&GlobalMid_Lock);
171 list_for_each(tmp, &server->pending_mid_q) { 184 list_for_each(tmp, &server->pending_mid_q) {
@@ -198,7 +211,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
198 spin_lock(&GlobalMid_Lock); 211 spin_lock(&GlobalMid_Lock);
199 if (server->tcpStatus != CifsExiting) 212 if (server->tcpStatus != CifsExiting)
200 server->tcpStatus = CifsGood; 213 server->tcpStatus = CifsGood;
201 server->sequence_number = 0;
202 spin_unlock(&GlobalMid_Lock); 214 spin_unlock(&GlobalMid_Lock);
203 /* atomic_set(&server->inFlight,0);*/ 215 /* atomic_set(&server->inFlight,0);*/
204 wake_up(&server->response_q); 216 wake_up(&server->response_q);
@@ -629,9 +641,9 @@ multi_t2_fnd:
629 } /* end while !EXITING */ 641 } /* end while !EXITING */
630 642
631 /* take it off the list, if it's not already */ 643 /* take it off the list, if it's not already */
632 write_lock(&cifs_tcp_ses_lock); 644 spin_lock(&cifs_tcp_ses_lock);
633 list_del_init(&server->tcp_ses_list); 645 list_del_init(&server->tcp_ses_list);
634 write_unlock(&cifs_tcp_ses_lock); 646 spin_unlock(&cifs_tcp_ses_lock);
635 647
636 spin_lock(&GlobalMid_Lock); 648 spin_lock(&GlobalMid_Lock);
637 server->tcpStatus = CifsExiting; 649 server->tcpStatus = CifsExiting;
@@ -669,7 +681,7 @@ multi_t2_fnd:
669 * BB: we shouldn't have to do any of this. It shouldn't be 681 * BB: we shouldn't have to do any of this. It shouldn't be
670 * possible to exit from the thread with active SMB sessions 682 * possible to exit from the thread with active SMB sessions
671 */ 683 */
672 read_lock(&cifs_tcp_ses_lock); 684 spin_lock(&cifs_tcp_ses_lock);
673 if (list_empty(&server->pending_mid_q)) { 685 if (list_empty(&server->pending_mid_q)) {
674 /* loop through server session structures attached to this and 686 /* loop through server session structures attached to this and
675 mark them dead */ 687 mark them dead */
@@ -679,7 +691,7 @@ multi_t2_fnd:
679 ses->status = CifsExiting; 691 ses->status = CifsExiting;
680 ses->server = NULL; 692 ses->server = NULL;
681 } 693 }
682 read_unlock(&cifs_tcp_ses_lock); 694 spin_unlock(&cifs_tcp_ses_lock);
683 } else { 695 } else {
684 /* although we can not zero the server struct pointer yet, 696 /* although we can not zero the server struct pointer yet,
685 since there are active requests which may depnd on them, 697 since there are active requests which may depnd on them,
@@ -702,7 +714,7 @@ multi_t2_fnd:
702 } 714 }
703 } 715 }
704 spin_unlock(&GlobalMid_Lock); 716 spin_unlock(&GlobalMid_Lock);
705 read_unlock(&cifs_tcp_ses_lock); 717 spin_unlock(&cifs_tcp_ses_lock);
706 /* 1/8th of sec is more than enough time for them to exit */ 718 /* 1/8th of sec is more than enough time for them to exit */
707 msleep(125); 719 msleep(125);
708 } 720 }
@@ -725,12 +737,12 @@ multi_t2_fnd:
725 if a crazy root user tried to kill cifsd 737 if a crazy root user tried to kill cifsd
726 kernel thread explicitly this might happen) */ 738 kernel thread explicitly this might happen) */
727 /* BB: This shouldn't be necessary, see above */ 739 /* BB: This shouldn't be necessary, see above */
728 read_lock(&cifs_tcp_ses_lock); 740 spin_lock(&cifs_tcp_ses_lock);
729 list_for_each(tmp, &server->smb_ses_list) { 741 list_for_each(tmp, &server->smb_ses_list) {
730 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 742 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
731 ses->server = NULL; 743 ses->server = NULL;
732 } 744 }
733 read_unlock(&cifs_tcp_ses_lock); 745 spin_unlock(&cifs_tcp_ses_lock);
734 746
735 kfree(server->hostname); 747 kfree(server->hostname);
736 task_to_wake = xchg(&server->tsk, NULL); 748 task_to_wake = xchg(&server->tsk, NULL);
@@ -1046,6 +1058,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1046 "long\n"); 1058 "long\n");
1047 return 1; 1059 return 1;
1048 } 1060 }
1061 } else if (strnicmp(data, "srcaddr", 7) == 0) {
1062 vol->srcaddr.ss_family = AF_UNSPEC;
1063
1064 if (!value || !*value) {
1065 printk(KERN_WARNING "CIFS: srcaddr value"
1066 " not specified.\n");
1067 return 1; /* needs_arg; */
1068 }
1069 i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
1070 value, strlen(value));
1071 if (i == 0) {
1072 printk(KERN_WARNING "CIFS: Could not parse"
1073 " srcaddr: %s\n",
1074 value);
1075 return 1;
1076 }
1049 } else if (strnicmp(data, "prefixpath", 10) == 0) { 1077 } else if (strnicmp(data, "prefixpath", 10) == 0) {
1050 if (!value || !*value) { 1078 if (!value || !*value) {
1051 printk(KERN_WARNING 1079 printk(KERN_WARNING
@@ -1325,6 +1353,10 @@ cifs_parse_mount_options(char *options, const char *devname,
1325 "/proc/fs/cifs/LookupCacheEnabled to 0\n"); 1353 "/proc/fs/cifs/LookupCacheEnabled to 0\n");
1326 } else if (strnicmp(data, "fsc", 3) == 0) { 1354 } else if (strnicmp(data, "fsc", 3) == 0) {
1327 vol->fsc = true; 1355 vol->fsc = true;
1356 } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
1357 vol->mfsymlinks = true;
1358 } else if (strnicmp(data, "multiuser", 8) == 0) {
1359 vol->multiuser = true;
1328 } else 1360 } else
1329 printk(KERN_WARNING "CIFS: Unknown mount option %s\n", 1361 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
1330 data); 1362 data);
@@ -1356,6 +1388,13 @@ cifs_parse_mount_options(char *options, const char *devname,
1356 return 1; 1388 return 1;
1357 } 1389 }
1358 } 1390 }
1391
1392 if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
1393 cERROR(1, "Multiuser mounts currently require krb5 "
1394 "authentication!");
1395 return 1;
1396 }
1397
1359 if (vol->UNCip == NULL) 1398 if (vol->UNCip == NULL)
1360 vol->UNCip = &vol->UNC[2]; 1399 vol->UNCip = &vol->UNC[2];
1361 1400
@@ -1374,8 +1413,36 @@ cifs_parse_mount_options(char *options, const char *devname,
1374 return 0; 1413 return 0;
1375} 1414}
1376 1415
1416/** Returns true if srcaddr isn't specified and rhs isn't
1417 * specified, or if srcaddr is specified and
1418 * matches the IP address of the rhs argument.
1419 */
1420static bool
1421srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1422{
1423 switch (srcaddr->sa_family) {
1424 case AF_UNSPEC:
1425 return (rhs->sa_family == AF_UNSPEC);
1426 case AF_INET: {
1427 struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
1428 struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
1429 return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
1430 }
1431 case AF_INET6: {
1432 struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
1433 struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs;
1434 return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
1435 }
1436 default:
1437 WARN_ON(1);
1438 return false; /* don't expect to be here */
1439 }
1440}
1441
1442
1377static bool 1443static bool
1378match_address(struct TCP_Server_Info *server, struct sockaddr *addr) 1444match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
1445 struct sockaddr *srcaddr)
1379{ 1446{
1380 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; 1447 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1381 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; 1448 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
@@ -1402,6 +1469,9 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
1402 break; 1469 break;
1403 } 1470 }
1404 1471
1472 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
1473 return false;
1474
1405 return true; 1475 return true;
1406} 1476}
1407 1477
@@ -1458,29 +1528,21 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1458{ 1528{
1459 struct TCP_Server_Info *server; 1529 struct TCP_Server_Info *server;
1460 1530
1461 write_lock(&cifs_tcp_ses_lock); 1531 spin_lock(&cifs_tcp_ses_lock);
1462 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 1532 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1463 /* 1533 if (!match_address(server, addr,
1464 * the demux thread can exit on its own while still in CifsNew 1534 (struct sockaddr *)&vol->srcaddr))
1465 * so don't accept any sockets in that state. Since the
1466 * tcpStatus never changes back to CifsNew it's safe to check
1467 * for this without a lock.
1468 */
1469 if (server->tcpStatus == CifsNew)
1470 continue;
1471
1472 if (!match_address(server, addr))
1473 continue; 1535 continue;
1474 1536
1475 if (!match_security(server, vol)) 1537 if (!match_security(server, vol))
1476 continue; 1538 continue;
1477 1539
1478 ++server->srv_count; 1540 ++server->srv_count;
1479 write_unlock(&cifs_tcp_ses_lock); 1541 spin_unlock(&cifs_tcp_ses_lock);
1480 cFYI(1, "Existing tcp session with server found"); 1542 cFYI(1, "Existing tcp session with server found");
1481 return server; 1543 return server;
1482 } 1544 }
1483 write_unlock(&cifs_tcp_ses_lock); 1545 spin_unlock(&cifs_tcp_ses_lock);
1484 return NULL; 1546 return NULL;
1485} 1547}
1486 1548
@@ -1489,21 +1551,26 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1489{ 1551{
1490 struct task_struct *task; 1552 struct task_struct *task;
1491 1553
1492 write_lock(&cifs_tcp_ses_lock); 1554 spin_lock(&cifs_tcp_ses_lock);
1493 if (--server->srv_count > 0) { 1555 if (--server->srv_count > 0) {
1494 write_unlock(&cifs_tcp_ses_lock); 1556 spin_unlock(&cifs_tcp_ses_lock);
1495 return; 1557 return;
1496 } 1558 }
1497 1559
1498 list_del_init(&server->tcp_ses_list); 1560 list_del_init(&server->tcp_ses_list);
1499 write_unlock(&cifs_tcp_ses_lock); 1561 spin_unlock(&cifs_tcp_ses_lock);
1500 1562
1501 spin_lock(&GlobalMid_Lock); 1563 spin_lock(&GlobalMid_Lock);
1502 server->tcpStatus = CifsExiting; 1564 server->tcpStatus = CifsExiting;
1503 spin_unlock(&GlobalMid_Lock); 1565 spin_unlock(&GlobalMid_Lock);
1504 1566
1567 cifs_crypto_shash_release(server);
1505 cifs_fscache_release_client_cookie(server); 1568 cifs_fscache_release_client_cookie(server);
1506 1569
1570 kfree(server->session_key.response);
1571 server->session_key.response = NULL;
1572 server->session_key.len = 0;
1573
1507 task = xchg(&server->tsk, NULL); 1574 task = xchg(&server->tsk, NULL);
1508 if (task) 1575 if (task)
1509 force_sig(SIGKILL, task); 1576 force_sig(SIGKILL, task);
@@ -1556,10 +1623,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1556 goto out_err; 1623 goto out_err;
1557 } 1624 }
1558 1625
1626 rc = cifs_crypto_shash_allocate(tcp_ses);
1627 if (rc) {
1628 cERROR(1, "could not setup hash structures rc %d", rc);
1629 goto out_err;
1630 }
1631
1559 tcp_ses->hostname = extract_hostname(volume_info->UNC); 1632 tcp_ses->hostname = extract_hostname(volume_info->UNC);
1560 if (IS_ERR(tcp_ses->hostname)) { 1633 if (IS_ERR(tcp_ses->hostname)) {
1561 rc = PTR_ERR(tcp_ses->hostname); 1634 rc = PTR_ERR(tcp_ses->hostname);
1562 goto out_err; 1635 goto out_err_crypto_release;
1563 } 1636 }
1564 1637
1565 tcp_ses->noblocksnd = volume_info->noblocksnd; 1638 tcp_ses->noblocksnd = volume_info->noblocksnd;
@@ -1574,6 +1647,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1574 volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1647 volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1575 memcpy(tcp_ses->server_RFC1001_name, 1648 memcpy(tcp_ses->server_RFC1001_name,
1576 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1649 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1650 tcp_ses->session_estab = false;
1577 tcp_ses->sequence_number = 0; 1651 tcp_ses->sequence_number = 0;
1578 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 1652 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1579 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 1653 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
@@ -1584,6 +1658,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1584 * no need to spinlock this init of tcpStatus or srv_count 1658 * no need to spinlock this init of tcpStatus or srv_count
1585 */ 1659 */
1586 tcp_ses->tcpStatus = CifsNew; 1660 tcp_ses->tcpStatus = CifsNew;
1661 memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
1662 sizeof(tcp_ses->srcaddr));
1587 ++tcp_ses->srv_count; 1663 ++tcp_ses->srv_count;
1588 1664
1589 if (addr.ss_family == AF_INET6) { 1665 if (addr.ss_family == AF_INET6) {
@@ -1600,7 +1676,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1600 } 1676 }
1601 if (rc < 0) { 1677 if (rc < 0) {
1602 cERROR(1, "Error connecting to socket. Aborting operation"); 1678 cERROR(1, "Error connecting to socket. Aborting operation");
1603 goto out_err; 1679 goto out_err_crypto_release;
1604 } 1680 }
1605 1681
1606 /* 1682 /*
@@ -1614,18 +1690,21 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1614 rc = PTR_ERR(tcp_ses->tsk); 1690 rc = PTR_ERR(tcp_ses->tsk);
1615 cERROR(1, "error %d create cifsd thread", rc); 1691 cERROR(1, "error %d create cifsd thread", rc);
1616 module_put(THIS_MODULE); 1692 module_put(THIS_MODULE);
1617 goto out_err; 1693 goto out_err_crypto_release;
1618 } 1694 }
1619 1695
1620 /* thread spawned, put it on the list */ 1696 /* thread spawned, put it on the list */
1621 write_lock(&cifs_tcp_ses_lock); 1697 spin_lock(&cifs_tcp_ses_lock);
1622 list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); 1698 list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
1623 write_unlock(&cifs_tcp_ses_lock); 1699 spin_unlock(&cifs_tcp_ses_lock);
1624 1700
1625 cifs_fscache_get_client_cookie(tcp_ses); 1701 cifs_fscache_get_client_cookie(tcp_ses);
1626 1702
1627 return tcp_ses; 1703 return tcp_ses;
1628 1704
1705out_err_crypto_release:
1706 cifs_crypto_shash_release(tcp_ses);
1707
1629out_err: 1708out_err:
1630 if (tcp_ses) { 1709 if (tcp_ses) {
1631 if (!IS_ERR(tcp_ses->hostname)) 1710 if (!IS_ERR(tcp_ses->hostname))
@@ -1642,7 +1721,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1642{ 1721{
1643 struct cifsSesInfo *ses; 1722 struct cifsSesInfo *ses;
1644 1723
1645 write_lock(&cifs_tcp_ses_lock); 1724 spin_lock(&cifs_tcp_ses_lock);
1646 list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { 1725 list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
1647 switch (server->secType) { 1726 switch (server->secType) {
1648 case Kerberos: 1727 case Kerberos:
@@ -1662,10 +1741,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1662 continue; 1741 continue;
1663 } 1742 }
1664 ++ses->ses_count; 1743 ++ses->ses_count;
1665 write_unlock(&cifs_tcp_ses_lock); 1744 spin_unlock(&cifs_tcp_ses_lock);
1666 return ses; 1745 return ses;
1667 } 1746 }
1668 write_unlock(&cifs_tcp_ses_lock); 1747 spin_unlock(&cifs_tcp_ses_lock);
1669 return NULL; 1748 return NULL;
1670} 1749}
1671 1750
@@ -1676,14 +1755,14 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1676 struct TCP_Server_Info *server = ses->server; 1755 struct TCP_Server_Info *server = ses->server;
1677 1756
1678 cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count); 1757 cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
1679 write_lock(&cifs_tcp_ses_lock); 1758 spin_lock(&cifs_tcp_ses_lock);
1680 if (--ses->ses_count > 0) { 1759 if (--ses->ses_count > 0) {
1681 write_unlock(&cifs_tcp_ses_lock); 1760 spin_unlock(&cifs_tcp_ses_lock);
1682 return; 1761 return;
1683 } 1762 }
1684 1763
1685 list_del_init(&ses->smb_ses_list); 1764 list_del_init(&ses->smb_ses_list);
1686 write_unlock(&cifs_tcp_ses_lock); 1765 spin_unlock(&cifs_tcp_ses_lock);
1687 1766
1688 if (ses->status == CifsGood) { 1767 if (ses->status == CifsGood) {
1689 xid = GetXid(); 1768 xid = GetXid();
@@ -1760,10 +1839,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1760 goto get_ses_fail; 1839 goto get_ses_fail;
1761 } 1840 }
1762 if (volume_info->domainname) { 1841 if (volume_info->domainname) {
1763 int len = strlen(volume_info->domainname); 1842 ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL);
1764 ses->domainName = kmalloc(len + 1, GFP_KERNEL); 1843 if (!ses->domainName)
1765 if (ses->domainName) 1844 goto get_ses_fail;
1766 strcpy(ses->domainName, volume_info->domainname);
1767 } 1845 }
1768 ses->cred_uid = volume_info->cred_uid; 1846 ses->cred_uid = volume_info->cred_uid;
1769 ses->linux_uid = volume_info->linux_uid; 1847 ses->linux_uid = volume_info->linux_uid;
@@ -1778,9 +1856,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1778 goto get_ses_fail; 1856 goto get_ses_fail;
1779 1857
1780 /* success, put it on the list */ 1858 /* success, put it on the list */
1781 write_lock(&cifs_tcp_ses_lock); 1859 spin_lock(&cifs_tcp_ses_lock);
1782 list_add(&ses->smb_ses_list, &server->smb_ses_list); 1860 list_add(&ses->smb_ses_list, &server->smb_ses_list);
1783 write_unlock(&cifs_tcp_ses_lock); 1861 spin_unlock(&cifs_tcp_ses_lock);
1784 1862
1785 FreeXid(xid); 1863 FreeXid(xid);
1786 return ses; 1864 return ses;
@@ -1797,7 +1875,7 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1797 struct list_head *tmp; 1875 struct list_head *tmp;
1798 struct cifsTconInfo *tcon; 1876 struct cifsTconInfo *tcon;
1799 1877
1800 write_lock(&cifs_tcp_ses_lock); 1878 spin_lock(&cifs_tcp_ses_lock);
1801 list_for_each(tmp, &ses->tcon_list) { 1879 list_for_each(tmp, &ses->tcon_list) {
1802 tcon = list_entry(tmp, struct cifsTconInfo, tcon_list); 1880 tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
1803 if (tcon->tidStatus == CifsExiting) 1881 if (tcon->tidStatus == CifsExiting)
@@ -1806,10 +1884,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1806 continue; 1884 continue;
1807 1885
1808 ++tcon->tc_count; 1886 ++tcon->tc_count;
1809 write_unlock(&cifs_tcp_ses_lock); 1887 spin_unlock(&cifs_tcp_ses_lock);
1810 return tcon; 1888 return tcon;
1811 } 1889 }
1812 write_unlock(&cifs_tcp_ses_lock); 1890 spin_unlock(&cifs_tcp_ses_lock);
1813 return NULL; 1891 return NULL;
1814} 1892}
1815 1893
@@ -1820,14 +1898,14 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1820 struct cifsSesInfo *ses = tcon->ses; 1898 struct cifsSesInfo *ses = tcon->ses;
1821 1899
1822 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count); 1900 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
1823 write_lock(&cifs_tcp_ses_lock); 1901 spin_lock(&cifs_tcp_ses_lock);
1824 if (--tcon->tc_count > 0) { 1902 if (--tcon->tc_count > 0) {
1825 write_unlock(&cifs_tcp_ses_lock); 1903 spin_unlock(&cifs_tcp_ses_lock);
1826 return; 1904 return;
1827 } 1905 }
1828 1906
1829 list_del_init(&tcon->tcon_list); 1907 list_del_init(&tcon->tcon_list);
1830 write_unlock(&cifs_tcp_ses_lock); 1908 spin_unlock(&cifs_tcp_ses_lock);
1831 1909
1832 xid = GetXid(); 1910 xid = GetXid();
1833 CIFSSMBTDis(xid, tcon); 1911 CIFSSMBTDis(xid, tcon);
@@ -1900,9 +1978,9 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
1900 tcon->nocase = volume_info->nocase; 1978 tcon->nocase = volume_info->nocase;
1901 tcon->local_lease = volume_info->local_lease; 1979 tcon->local_lease = volume_info->local_lease;
1902 1980
1903 write_lock(&cifs_tcp_ses_lock); 1981 spin_lock(&cifs_tcp_ses_lock);
1904 list_add(&tcon->tcon_list, &ses->tcon_list); 1982 list_add(&tcon->tcon_list, &ses->tcon_list);
1905 write_unlock(&cifs_tcp_ses_lock); 1983 spin_unlock(&cifs_tcp_ses_lock);
1906 1984
1907 cifs_fscache_get_super_cookie(tcon); 1985 cifs_fscache_get_super_cookie(tcon);
1908 1986
@@ -1913,6 +1991,23 @@ out_fail:
1913 return ERR_PTR(rc); 1991 return ERR_PTR(rc);
1914} 1992}
1915 1993
1994void
1995cifs_put_tlink(struct tcon_link *tlink)
1996{
1997 if (!tlink || IS_ERR(tlink))
1998 return;
1999
2000 if (!atomic_dec_and_test(&tlink->tl_count) ||
2001 test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) {
2002 tlink->tl_time = jiffies;
2003 return;
2004 }
2005
2006 if (!IS_ERR(tlink_tcon(tlink)))
2007 cifs_put_tcon(tlink_tcon(tlink));
2008 kfree(tlink);
2009 return;
2010}
1916 2011
1917int 2012int
1918get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, 2013get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
@@ -1997,6 +2092,33 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
1997 2092
1998} 2093}
1999 2094
2095static int
2096bind_socket(struct TCP_Server_Info *server)
2097{
2098 int rc = 0;
2099 if (server->srcaddr.ss_family != AF_UNSPEC) {
2100 /* Bind to the specified local IP address */
2101 struct socket *socket = server->ssocket;
2102 rc = socket->ops->bind(socket,
2103 (struct sockaddr *) &server->srcaddr,
2104 sizeof(server->srcaddr));
2105 if (rc < 0) {
2106 struct sockaddr_in *saddr4;
2107 struct sockaddr_in6 *saddr6;
2108 saddr4 = (struct sockaddr_in *)&server->srcaddr;
2109 saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
2110 if (saddr6->sin6_family == AF_INET6)
2111 cERROR(1, "cifs: "
2112 "Failed to bind to: %pI6c, error: %d\n",
2113 &saddr6->sin6_addr, rc);
2114 else
2115 cERROR(1, "cifs: "
2116 "Failed to bind to: %pI4, error: %d\n",
2117 &saddr4->sin_addr.s_addr, rc);
2118 }
2119 }
2120 return rc;
2121}
2000 2122
2001static int 2123static int
2002ipv4_connect(struct TCP_Server_Info *server) 2124ipv4_connect(struct TCP_Server_Info *server)
@@ -2022,6 +2144,10 @@ ipv4_connect(struct TCP_Server_Info *server)
2022 cifs_reclassify_socket4(socket); 2144 cifs_reclassify_socket4(socket);
2023 } 2145 }
2024 2146
2147 rc = bind_socket(server);
2148 if (rc < 0)
2149 return rc;
2150
2025 /* user overrode default port */ 2151 /* user overrode default port */
2026 if (server->addr.sockAddr.sin_port) { 2152 if (server->addr.sockAddr.sin_port) {
2027 rc = socket->ops->connect(socket, (struct sockaddr *) 2153 rc = socket->ops->connect(socket, (struct sockaddr *)
@@ -2184,6 +2310,10 @@ ipv6_connect(struct TCP_Server_Info *server)
2184 cifs_reclassify_socket6(socket); 2310 cifs_reclassify_socket6(socket);
2185 } 2311 }
2186 2312
2313 rc = bind_socket(server);
2314 if (rc < 0)
2315 return rc;
2316
2187 /* user overrode default port */ 2317 /* user overrode default port */
2188 if (server->addr.sockAddr6.sin6_port) { 2318 if (server->addr.sockAddr6.sin6_port) {
2189 rc = socket->ops->connect(socket, 2319 rc = socket->ops->connect(socket,
@@ -2383,6 +2513,8 @@ convert_delimiter(char *path, char delim)
2383static void setup_cifs_sb(struct smb_vol *pvolume_info, 2513static void setup_cifs_sb(struct smb_vol *pvolume_info,
2384 struct cifs_sb_info *cifs_sb) 2514 struct cifs_sb_info *cifs_sb)
2385{ 2515{
2516 INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
2517
2386 if (pvolume_info->rsize > CIFSMaxBufSize) { 2518 if (pvolume_info->rsize > CIFSMaxBufSize) {
2387 cERROR(1, "rsize %d too large, using MaxBufSize", 2519 cERROR(1, "rsize %d too large, using MaxBufSize",
2388 pvolume_info->rsize); 2520 pvolume_info->rsize);
@@ -2462,10 +2594,21 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2462 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; 2594 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
2463 if (pvolume_info->fsc) 2595 if (pvolume_info->fsc)
2464 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; 2596 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
2597 if (pvolume_info->multiuser)
2598 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
2599 CIFS_MOUNT_NO_PERM);
2465 if (pvolume_info->direct_io) { 2600 if (pvolume_info->direct_io) {
2466 cFYI(1, "mounting share using direct i/o"); 2601 cFYI(1, "mounting share using direct i/o");
2467 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2602 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
2468 } 2603 }
2604 if (pvolume_info->mfsymlinks) {
2605 if (pvolume_info->sfu_emul) {
2606 cERROR(1, "mount option mfsymlinks ignored if sfu "
2607 "mount option is used");
2608 } else {
2609 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
2610 }
2611 }
2469 2612
2470 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 2613 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
2471 cERROR(1, "mount option dynperm ignored if cifsacl " 2614 cERROR(1, "mount option dynperm ignored if cifsacl "
@@ -2552,6 +2695,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2552 struct TCP_Server_Info *srvTcp; 2695 struct TCP_Server_Info *srvTcp;
2553 char *full_path; 2696 char *full_path;
2554 char *mount_data = mount_data_global; 2697 char *mount_data = mount_data_global;
2698 struct tcon_link *tlink;
2555#ifdef CONFIG_CIFS_DFS_UPCALL 2699#ifdef CONFIG_CIFS_DFS_UPCALL
2556 struct dfs_info3_param *referrals = NULL; 2700 struct dfs_info3_param *referrals = NULL;
2557 unsigned int num_referrals = 0; 2701 unsigned int num_referrals = 0;
@@ -2563,6 +2707,7 @@ try_mount_again:
2563 pSesInfo = NULL; 2707 pSesInfo = NULL;
2564 srvTcp = NULL; 2708 srvTcp = NULL;
2565 full_path = NULL; 2709 full_path = NULL;
2710 tlink = NULL;
2566 2711
2567 xid = GetXid(); 2712 xid = GetXid();
2568 2713
@@ -2638,8 +2783,6 @@ try_mount_again:
2638 goto remote_path_check; 2783 goto remote_path_check;
2639 } 2784 }
2640 2785
2641 cifs_sb->tcon = tcon;
2642
2643 /* do not care if following two calls succeed - informational */ 2786 /* do not care if following two calls succeed - informational */
2644 if (!tcon->ipc) { 2787 if (!tcon->ipc) {
2645 CIFSSMBQFSDeviceInfo(xid, tcon); 2788 CIFSSMBQFSDeviceInfo(xid, tcon);
@@ -2748,6 +2891,30 @@ remote_path_check:
2748#endif 2891#endif
2749 } 2892 }
2750 2893
2894 if (rc)
2895 goto mount_fail_check;
2896
2897 /* now, hang the tcon off of the superblock */
2898 tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
2899 if (tlink == NULL) {
2900 rc = -ENOMEM;
2901 goto mount_fail_check;
2902 }
2903
2904 tlink->tl_uid = pSesInfo->linux_uid;
2905 tlink->tl_tcon = tcon;
2906 tlink->tl_time = jiffies;
2907 set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
2908 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
2909
2910 cifs_sb->master_tlink = tlink;
2911 spin_lock(&cifs_sb->tlink_tree_lock);
2912 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
2913 spin_unlock(&cifs_sb->tlink_tree_lock);
2914
2915 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
2916 TLINK_IDLE_EXPIRE);
2917
2751mount_fail_check: 2918mount_fail_check:
2752 /* on error free sesinfo and tcon struct if needed */ 2919 /* on error free sesinfo and tcon struct if needed */
2753 if (rc) { 2920 if (rc) {
@@ -2825,14 +2992,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2825#ifdef CONFIG_CIFS_WEAK_PW_HASH 2992#ifdef CONFIG_CIFS_WEAK_PW_HASH
2826 if ((global_secflags & CIFSSEC_MAY_LANMAN) && 2993 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2827 (ses->server->secType == LANMAN)) 2994 (ses->server->secType == LANMAN))
2828 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2995 calc_lanman_hash(tcon->password, ses->server->cryptkey,
2829 ses->server->secMode & 2996 ses->server->secMode &
2830 SECMODE_PW_ENCRYPT ? true : false, 2997 SECMODE_PW_ENCRYPT ? true : false,
2831 bcc_ptr); 2998 bcc_ptr);
2832 else 2999 else
2833#endif /* CIFS_WEAK_PW_HASH */ 3000#endif /* CIFS_WEAK_PW_HASH */
2834 SMBNTencrypt(tcon->password, ses->server->cryptKey, 3001 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
2835 bcc_ptr);
2836 3002
2837 bcc_ptr += CIFS_SESS_KEY_SIZE; 3003 bcc_ptr += CIFS_SESS_KEY_SIZE;
2838 if (ses->capabilities & CAP_UNICODE) { 3004 if (ses->capabilities & CAP_UNICODE) {
@@ -2934,19 +3100,32 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2934int 3100int
2935cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) 3101cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2936{ 3102{
2937 int rc = 0; 3103 struct rb_root *root = &cifs_sb->tlink_tree;
3104 struct rb_node *node;
3105 struct tcon_link *tlink;
2938 char *tmp; 3106 char *tmp;
2939 3107
2940 if (cifs_sb->tcon) 3108 cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
2941 cifs_put_tcon(cifs_sb->tcon); 3109
3110 spin_lock(&cifs_sb->tlink_tree_lock);
3111 while ((node = rb_first(root))) {
3112 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3113 cifs_get_tlink(tlink);
3114 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3115 rb_erase(node, root);
3116
3117 spin_unlock(&cifs_sb->tlink_tree_lock);
3118 cifs_put_tlink(tlink);
3119 spin_lock(&cifs_sb->tlink_tree_lock);
3120 }
3121 spin_unlock(&cifs_sb->tlink_tree_lock);
2942 3122
2943 cifs_sb->tcon = NULL;
2944 tmp = cifs_sb->prepath; 3123 tmp = cifs_sb->prepath;
2945 cifs_sb->prepathlen = 0; 3124 cifs_sb->prepathlen = 0;
2946 cifs_sb->prepath = NULL; 3125 cifs_sb->prepath = NULL;
2947 kfree(tmp); 3126 kfree(tmp);
2948 3127
2949 return rc; 3128 return 0;
2950} 3129}
2951 3130
2952int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses) 3131int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
@@ -2997,6 +3176,16 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2997 if (rc) { 3176 if (rc) {
2998 cERROR(1, "Send error in SessSetup = %d", rc); 3177 cERROR(1, "Send error in SessSetup = %d", rc);
2999 } else { 3178 } else {
3179 mutex_lock(&ses->server->srv_mutex);
3180 if (!server->session_estab) {
3181 server->session_key.response = ses->auth_key.response;
3182 server->session_key.len = ses->auth_key.len;
3183 server->sequence_number = 0x2;
3184 server->session_estab = true;
3185 ses->auth_key.response = NULL;
3186 }
3187 mutex_unlock(&server->srv_mutex);
3188
3000 cFYI(1, "CIFS Session Established successfully"); 3189 cFYI(1, "CIFS Session Established successfully");
3001 spin_lock(&GlobalMid_Lock); 3190 spin_lock(&GlobalMid_Lock);
3002 ses->status = CifsGood; 3191 ses->status = CifsGood;
@@ -3004,6 +3193,263 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
3004 spin_unlock(&GlobalMid_Lock); 3193 spin_unlock(&GlobalMid_Lock);
3005 } 3194 }
3006 3195
3196 kfree(ses->auth_key.response);
3197 ses->auth_key.response = NULL;
3198 ses->auth_key.len = 0;
3199 kfree(ses->ntlmssp);
3200 ses->ntlmssp = NULL;
3201
3007 return rc; 3202 return rc;
3008} 3203}
3009 3204
3205static struct cifsTconInfo *
3206cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
3207{
3208 struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
3209 struct cifsSesInfo *ses;
3210 struct cifsTconInfo *tcon = NULL;
3211 struct smb_vol *vol_info;
3212 char username[MAX_USERNAME_SIZE + 1];
3213
3214 vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
3215 if (vol_info == NULL) {
3216 tcon = ERR_PTR(-ENOMEM);
3217 goto out;
3218 }
3219
3220 snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid);
3221 vol_info->username = username;
3222 vol_info->local_nls = cifs_sb->local_nls;
3223 vol_info->linux_uid = fsuid;
3224 vol_info->cred_uid = fsuid;
3225 vol_info->UNC = master_tcon->treeName;
3226 vol_info->retry = master_tcon->retry;
3227 vol_info->nocase = master_tcon->nocase;
3228 vol_info->local_lease = master_tcon->local_lease;
3229 vol_info->no_linux_ext = !master_tcon->unix_ext;
3230
3231 /* FIXME: allow for other secFlg settings */
3232 vol_info->secFlg = CIFSSEC_MUST_KRB5;
3233
3234 /* get a reference for the same TCP session */
3235 spin_lock(&cifs_tcp_ses_lock);
3236 ++master_tcon->ses->server->srv_count;
3237 spin_unlock(&cifs_tcp_ses_lock);
3238
3239 ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
3240 if (IS_ERR(ses)) {
3241 tcon = (struct cifsTconInfo *)ses;
3242 cifs_put_tcp_session(master_tcon->ses->server);
3243 goto out;
3244 }
3245
3246 tcon = cifs_get_tcon(ses, vol_info);
3247 if (IS_ERR(tcon)) {
3248 cifs_put_smb_ses(ses);
3249 goto out;
3250 }
3251
3252 if (ses->capabilities & CAP_UNIX)
3253 reset_cifs_unix_caps(0, tcon, NULL, vol_info);
3254out:
3255 kfree(vol_info);
3256
3257 return tcon;
3258}
3259
3260static inline struct tcon_link *
3261cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
3262{
3263 return cifs_sb->master_tlink;
3264}
3265
3266struct cifsTconInfo *
3267cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
3268{
3269 return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
3270}
3271
3272static int
3273cifs_sb_tcon_pending_wait(void *unused)
3274{
3275 schedule();
3276 return signal_pending(current) ? -ERESTARTSYS : 0;
3277}
3278
3279/* find and return a tlink with given uid */
3280static struct tcon_link *
3281tlink_rb_search(struct rb_root *root, uid_t uid)
3282{
3283 struct rb_node *node = root->rb_node;
3284 struct tcon_link *tlink;
3285
3286 while (node) {
3287 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3288
3289 if (tlink->tl_uid > uid)
3290 node = node->rb_left;
3291 else if (tlink->tl_uid < uid)
3292 node = node->rb_right;
3293 else
3294 return tlink;
3295 }
3296 return NULL;
3297}
3298
3299/* insert a tcon_link into the tree */
3300static void
3301tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
3302{
3303 struct rb_node **new = &(root->rb_node), *parent = NULL;
3304 struct tcon_link *tlink;
3305
3306 while (*new) {
3307 tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
3308 parent = *new;
3309
3310 if (tlink->tl_uid > new_tlink->tl_uid)
3311 new = &((*new)->rb_left);
3312 else
3313 new = &((*new)->rb_right);
3314 }
3315
3316 rb_link_node(&new_tlink->tl_rbnode, parent, new);
3317 rb_insert_color(&new_tlink->tl_rbnode, root);
3318}
3319
3320/*
3321 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
3322 * current task.
3323 *
3324 * If the superblock doesn't refer to a multiuser mount, then just return
3325 * the master tcon for the mount.
3326 *
3327 * First, search the rbtree for an existing tcon for this fsuid. If one
3328 * exists, then check to see if it's pending construction. If it is then wait
3329 * for construction to complete. Once it's no longer pending, check to see if
3330 * it failed and either return an error or retry construction, depending on
3331 * the timeout.
3332 *
3333 * If one doesn't exist then insert a new tcon_link struct into the tree and
3334 * try to construct a new one.
3335 */
3336struct tcon_link *
3337cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3338{
3339 int ret;
3340 uid_t fsuid = current_fsuid();
3341 struct tcon_link *tlink, *newtlink;
3342
3343 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
3344 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
3345
3346 spin_lock(&cifs_sb->tlink_tree_lock);
3347 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3348 if (tlink)
3349 cifs_get_tlink(tlink);
3350 spin_unlock(&cifs_sb->tlink_tree_lock);
3351
3352 if (tlink == NULL) {
3353 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
3354 if (newtlink == NULL)
3355 return ERR_PTR(-ENOMEM);
3356 newtlink->tl_uid = fsuid;
3357 newtlink->tl_tcon = ERR_PTR(-EACCES);
3358 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
3359 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
3360 cifs_get_tlink(newtlink);
3361
3362 spin_lock(&cifs_sb->tlink_tree_lock);
3363 /* was one inserted after previous search? */
3364 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3365 if (tlink) {
3366 cifs_get_tlink(tlink);
3367 spin_unlock(&cifs_sb->tlink_tree_lock);
3368 kfree(newtlink);
3369 goto wait_for_construction;
3370 }
3371 tlink = newtlink;
3372 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
3373 spin_unlock(&cifs_sb->tlink_tree_lock);
3374 } else {
3375wait_for_construction:
3376 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
3377 cifs_sb_tcon_pending_wait,
3378 TASK_INTERRUPTIBLE);
3379 if (ret) {
3380 cifs_put_tlink(tlink);
3381 return ERR_PTR(ret);
3382 }
3383
3384 /* if it's good, return it */
3385 if (!IS_ERR(tlink->tl_tcon))
3386 return tlink;
3387
3388 /* return error if we tried this already recently */
3389 if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
3390 cifs_put_tlink(tlink);
3391 return ERR_PTR(-EACCES);
3392 }
3393
3394 if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
3395 goto wait_for_construction;
3396 }
3397
3398 tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid);
3399 clear_bit(TCON_LINK_PENDING, &tlink->tl_flags);
3400 wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
3401
3402 if (IS_ERR(tlink->tl_tcon)) {
3403 cifs_put_tlink(tlink);
3404 return ERR_PTR(-EACCES);
3405 }
3406
3407 return tlink;
3408}
3409
3410/*
3411 * periodic workqueue job that scans tcon_tree for a superblock and closes
3412 * out tcons.
3413 */
3414static void
3415cifs_prune_tlinks(struct work_struct *work)
3416{
3417 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
3418 prune_tlinks.work);
3419 struct rb_root *root = &cifs_sb->tlink_tree;
3420 struct rb_node *node = rb_first(root);
3421 struct rb_node *tmp;
3422 struct tcon_link *tlink;
3423
3424 /*
3425 * Because we drop the spinlock in the loop in order to put the tlink
3426 * it's not guarded against removal of links from the tree. The only
3427 * places that remove entries from the tree are this function and
3428 * umounts. Because this function is non-reentrant and is canceled
3429 * before umount can proceed, this is safe.
3430 */
3431 spin_lock(&cifs_sb->tlink_tree_lock);
3432 node = rb_first(root);
3433 while (node != NULL) {
3434 tmp = node;
3435 node = rb_next(tmp);
3436 tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
3437
3438 if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
3439 atomic_read(&tlink->tl_count) != 0 ||
3440 time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
3441 continue;
3442
3443 cifs_get_tlink(tlink);
3444 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3445 rb_erase(tmp, root);
3446
3447 spin_unlock(&cifs_sb->tlink_tree_lock);
3448 cifs_put_tlink(tlink);
3449 spin_lock(&cifs_sb->tlink_tree_lock);
3450 }
3451 spin_unlock(&cifs_sb->tlink_tree_lock);
3452
3453 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
3454 TLINK_IDLE_EXPIRE);
3455}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f9ed0751cc12..3840eddbfb7a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -54,18 +54,18 @@ build_path_from_dentry(struct dentry *direntry)
54 int dfsplen; 54 int dfsplen;
55 char *full_path; 55 char *full_path;
56 char dirsep; 56 char dirsep;
57 struct cifs_sb_info *cifs_sb; 57 struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
58 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
58 59
59 if (direntry == NULL) 60 if (direntry == NULL)
60 return NULL; /* not much we can do if dentry is freed and 61 return NULL; /* not much we can do if dentry is freed and
61 we need to reopen the file after it was closed implicitly 62 we need to reopen the file after it was closed implicitly
62 when the server crashed */ 63 when the server crashed */
63 64
64 cifs_sb = CIFS_SB(direntry->d_sb);
65 dirsep = CIFS_DIR_SEP(cifs_sb); 65 dirsep = CIFS_DIR_SEP(cifs_sb);
66 pplen = cifs_sb->prepathlen; 66 pplen = cifs_sb->prepathlen;
67 if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS)) 67 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
68 dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1); 68 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
69 else 69 else
70 dfsplen = 0; 70 dfsplen = 0;
71cifs_bp_rename_retry: 71cifs_bp_rename_retry:
@@ -117,7 +117,7 @@ cifs_bp_rename_retry:
117 /* BB test paths to Windows with '/' in the midst of prepath */ 117 /* BB test paths to Windows with '/' in the midst of prepath */
118 118
119 if (dfsplen) { 119 if (dfsplen) {
120 strncpy(full_path, cifs_sb->tcon->treeName, dfsplen); 120 strncpy(full_path, tcon->treeName, dfsplen);
121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { 121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
122 int i; 122 int i;
123 for (i = 0; i < dfsplen; i++) { 123 for (i = 0; i < dfsplen; i++) {
@@ -130,135 +130,6 @@ cifs_bp_rename_retry:
130 return full_path; 130 return full_path;
131} 131}
132 132
133struct cifsFileInfo *
134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
135 struct file *file, struct vfsmount *mnt, unsigned int oflags)
136{
137 int oplock = 0;
138 struct cifsFileInfo *pCifsFile;
139 struct cifsInodeInfo *pCifsInode;
140 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
141
142 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
143 if (pCifsFile == NULL)
144 return pCifsFile;
145
146 if (oplockEnabled)
147 oplock = REQ_OPLOCK;
148
149 pCifsFile->netfid = fileHandle;
150 pCifsFile->pid = current->tgid;
151 pCifsFile->pInode = igrab(newinode);
152 pCifsFile->mnt = mnt;
153 pCifsFile->pfile = file;
154 pCifsFile->invalidHandle = false;
155 pCifsFile->closePend = false;
156 mutex_init(&pCifsFile->fh_mutex);
157 mutex_init(&pCifsFile->lock_mutex);
158 INIT_LIST_HEAD(&pCifsFile->llist);
159 atomic_set(&pCifsFile->count, 1);
160 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
161
162 write_lock(&GlobalSMBSeslock);
163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
164 pCifsInode = CIFS_I(newinode);
165 if (pCifsInode) {
166 /* if readable file instance put first in list*/
167 if (oflags & FMODE_READ)
168 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
169 else
170 list_add_tail(&pCifsFile->flist,
171 &pCifsInode->openFileList);
172
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true;
179 }
180 write_unlock(&GlobalSMBSeslock);
181
182 file->private_data = pCifsFile;
183
184 return pCifsFile;
185}
186
187int cifs_posix_open(char *full_path, struct inode **pinode,
188 struct super_block *sb, int mode, int oflags,
189 __u32 *poplock, __u16 *pnetfid, int xid)
190{
191 int rc;
192 FILE_UNIX_BASIC_INFO *presp_data;
193 __u32 posix_flags = 0;
194 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
195 struct cifs_fattr fattr;
196
197 cFYI(1, "posix open %s", full_path);
198
199 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
200 if (presp_data == NULL)
201 return -ENOMEM;
202
203/* So far cifs posix extensions can only map the following flags.
204 There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
205 so far we do not seem to need them, and we can treat them as local only */
206 if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
207 (FMODE_READ | FMODE_WRITE))
208 posix_flags = SMB_O_RDWR;
209 else if (oflags & FMODE_READ)
210 posix_flags = SMB_O_RDONLY;
211 else if (oflags & FMODE_WRITE)
212 posix_flags = SMB_O_WRONLY;
213 if (oflags & O_CREAT)
214 posix_flags |= SMB_O_CREAT;
215 if (oflags & O_EXCL)
216 posix_flags |= SMB_O_EXCL;
217 if (oflags & O_TRUNC)
218 posix_flags |= SMB_O_TRUNC;
219 /* be safe and imply O_SYNC for O_DSYNC */
220 if (oflags & O_DSYNC)
221 posix_flags |= SMB_O_SYNC;
222 if (oflags & O_DIRECTORY)
223 posix_flags |= SMB_O_DIRECTORY;
224 if (oflags & O_NOFOLLOW)
225 posix_flags |= SMB_O_NOFOLLOW;
226 if (oflags & O_DIRECT)
227 posix_flags |= SMB_O_DIRECT;
228
229 mode &= ~current_umask();
230 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
231 pnetfid, presp_data, poplock, full_path,
232 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
233 CIFS_MOUNT_MAP_SPECIAL_CHR);
234 if (rc)
235 goto posix_open_ret;
236
237 if (presp_data->Type == cpu_to_le32(-1))
238 goto posix_open_ret; /* open ok, caller does qpathinfo */
239
240 if (!pinode)
241 goto posix_open_ret; /* caller does not need info */
242
243 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
244
245 /* get new inode and set it up */
246 if (*pinode == NULL) {
247 cifs_fill_uniqueid(sb, &fattr);
248 *pinode = cifs_iget(sb, &fattr);
249 if (!*pinode) {
250 rc = -ENOMEM;
251 goto posix_open_ret;
252 }
253 } else {
254 cifs_fattr_to_inode(*pinode, &fattr);
255 }
256
257posix_open_ret:
258 kfree(presp_data);
259 return rc;
260}
261
262static void setup_cifs_dentry(struct cifsTconInfo *tcon, 133static void setup_cifs_dentry(struct cifsTconInfo *tcon,
263 struct dentry *direntry, 134 struct dentry *direntry,
264 struct inode *newinode) 135 struct inode *newinode)
@@ -291,6 +162,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
291 int desiredAccess = GENERIC_READ | GENERIC_WRITE; 162 int desiredAccess = GENERIC_READ | GENERIC_WRITE;
292 __u16 fileHandle; 163 __u16 fileHandle;
293 struct cifs_sb_info *cifs_sb; 164 struct cifs_sb_info *cifs_sb;
165 struct tcon_link *tlink;
294 struct cifsTconInfo *tcon; 166 struct cifsTconInfo *tcon;
295 char *full_path = NULL; 167 char *full_path = NULL;
296 FILE_ALL_INFO *buf = NULL; 168 FILE_ALL_INFO *buf = NULL;
@@ -300,21 +172,26 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
300 xid = GetXid(); 172 xid = GetXid();
301 173
302 cifs_sb = CIFS_SB(inode->i_sb); 174 cifs_sb = CIFS_SB(inode->i_sb);
303 tcon = cifs_sb->tcon; 175 tlink = cifs_sb_tlink(cifs_sb);
304 176 if (IS_ERR(tlink)) {
305 full_path = build_path_from_dentry(direntry); 177 FreeXid(xid);
306 if (full_path == NULL) { 178 return PTR_ERR(tlink);
307 rc = -ENOMEM;
308 goto cifs_create_out;
309 } 179 }
180 tcon = tlink_tcon(tlink);
310 181
311 if (oplockEnabled) 182 if (oplockEnabled)
312 oplock = REQ_OPLOCK; 183 oplock = REQ_OPLOCK;
313 184
314 if (nd && (nd->flags & LOOKUP_OPEN)) 185 if (nd && (nd->flags & LOOKUP_OPEN))
315 oflags = nd->intent.open.flags; 186 oflags = nd->intent.open.file->f_flags;
316 else 187 else
317 oflags = FMODE_READ | SMB_O_CREAT; 188 oflags = O_RDONLY | O_CREAT;
189
190 full_path = build_path_from_dentry(direntry);
191 if (full_path == NULL) {
192 rc = -ENOMEM;
193 goto cifs_create_out;
194 }
318 195
319 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 196 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
320 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 197 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -344,9 +221,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
344 /* if the file is going to stay open, then we 221 /* if the file is going to stay open, then we
345 need to set the desired access properly */ 222 need to set the desired access properly */
346 desiredAccess = 0; 223 desiredAccess = 0;
347 if (oflags & FMODE_READ) 224 if (OPEN_FMODE(oflags) & FMODE_READ)
348 desiredAccess |= GENERIC_READ; /* is this too little? */ 225 desiredAccess |= GENERIC_READ; /* is this too little? */
349 if (oflags & FMODE_WRITE) 226 if (OPEN_FMODE(oflags) & FMODE_WRITE)
350 desiredAccess |= GENERIC_WRITE; 227 desiredAccess |= GENERIC_WRITE;
351 228
352 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 229 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -375,7 +252,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
375 if (!tcon->unix_ext && (mode & S_IWUGO) == 0) 252 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
376 create_options |= CREATE_OPTION_READONLY; 253 create_options |= CREATE_OPTION_READONLY;
377 254
378 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 255 if (tcon->ses->capabilities & CAP_NT_SMBS)
379 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 256 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
380 desiredAccess, create_options, 257 desiredAccess, create_options,
381 &fileHandle, &oplock, buf, cifs_sb->local_nls, 258 &fileHandle, &oplock, buf, cifs_sb->local_nls,
@@ -467,8 +344,7 @@ cifs_create_set_dentry:
467 goto cifs_create_out; 344 goto cifs_create_out;
468 } 345 }
469 346
470 pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp, 347 pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock);
471 nd->path.mnt, oflags);
472 if (pfile_info == NULL) { 348 if (pfile_info == NULL) {
473 fput(filp); 349 fput(filp);
474 CIFSSMBClose(xid, tcon, fileHandle); 350 CIFSSMBClose(xid, tcon, fileHandle);
@@ -481,6 +357,7 @@ cifs_create_set_dentry:
481cifs_create_out: 357cifs_create_out:
482 kfree(buf); 358 kfree(buf);
483 kfree(full_path); 359 kfree(full_path);
360 cifs_put_tlink(tlink);
484 FreeXid(xid); 361 FreeXid(xid);
485 return rc; 362 return rc;
486} 363}
@@ -491,6 +368,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
491 int rc = -EPERM; 368 int rc = -EPERM;
492 int xid; 369 int xid;
493 struct cifs_sb_info *cifs_sb; 370 struct cifs_sb_info *cifs_sb;
371 struct tcon_link *tlink;
494 struct cifsTconInfo *pTcon; 372 struct cifsTconInfo *pTcon;
495 char *full_path = NULL; 373 char *full_path = NULL;
496 struct inode *newinode = NULL; 374 struct inode *newinode = NULL;
@@ -503,10 +381,14 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
503 if (!old_valid_dev(device_number)) 381 if (!old_valid_dev(device_number))
504 return -EINVAL; 382 return -EINVAL;
505 383
506 xid = GetXid();
507
508 cifs_sb = CIFS_SB(inode->i_sb); 384 cifs_sb = CIFS_SB(inode->i_sb);
509 pTcon = cifs_sb->tcon; 385 tlink = cifs_sb_tlink(cifs_sb);
386 if (IS_ERR(tlink))
387 return PTR_ERR(tlink);
388
389 pTcon = tlink_tcon(tlink);
390
391 xid = GetXid();
510 392
511 full_path = build_path_from_dentry(direntry); 393 full_path = build_path_from_dentry(direntry);
512 if (full_path == NULL) { 394 if (full_path == NULL) {
@@ -606,6 +488,7 @@ mknod_out:
606 kfree(full_path); 488 kfree(full_path);
607 kfree(buf); 489 kfree(buf);
608 FreeXid(xid); 490 FreeXid(xid);
491 cifs_put_tlink(tlink);
609 return rc; 492 return rc;
610} 493}
611 494
@@ -619,6 +502,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
619 __u16 fileHandle = 0; 502 __u16 fileHandle = 0;
620 bool posix_open = false; 503 bool posix_open = false;
621 struct cifs_sb_info *cifs_sb; 504 struct cifs_sb_info *cifs_sb;
505 struct tcon_link *tlink;
622 struct cifsTconInfo *pTcon; 506 struct cifsTconInfo *pTcon;
623 struct cifsFileInfo *cfile; 507 struct cifsFileInfo *cfile;
624 struct inode *newInode = NULL; 508 struct inode *newInode = NULL;
@@ -633,7 +517,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
633 /* check whether path exists */ 517 /* check whether path exists */
634 518
635 cifs_sb = CIFS_SB(parent_dir_inode->i_sb); 519 cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
636 pTcon = cifs_sb->tcon; 520 tlink = cifs_sb_tlink(cifs_sb);
521 if (IS_ERR(tlink)) {
522 FreeXid(xid);
523 return (struct dentry *)tlink;
524 }
525 pTcon = tlink_tcon(tlink);
637 526
638 /* 527 /*
639 * Don't allow the separator character in a path component. 528 * Don't allow the separator character in a path component.
@@ -644,8 +533,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
644 for (i = 0; i < direntry->d_name.len; i++) 533 for (i = 0; i < direntry->d_name.len; i++)
645 if (direntry->d_name.name[i] == '\\') { 534 if (direntry->d_name.name[i] == '\\') {
646 cFYI(1, "Invalid file name"); 535 cFYI(1, "Invalid file name");
647 FreeXid(xid); 536 rc = -EINVAL;
648 return ERR_PTR(-EINVAL); 537 goto lookup_out;
649 } 538 }
650 } 539 }
651 540
@@ -655,7 +544,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
655 */ 544 */
656 if (nd && (nd->flags & LOOKUP_EXCL)) { 545 if (nd && (nd->flags & LOOKUP_EXCL)) {
657 d_instantiate(direntry, NULL); 546 d_instantiate(direntry, NULL);
658 return NULL; 547 rc = 0;
548 goto lookup_out;
659 } 549 }
660 550
661 /* can not grab the rename sem here since it would 551 /* can not grab the rename sem here since it would
@@ -663,8 +553,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
663 in which we already have the sb rename sem */ 553 in which we already have the sb rename sem */
664 full_path = build_path_from_dentry(direntry); 554 full_path = build_path_from_dentry(direntry);
665 if (full_path == NULL) { 555 if (full_path == NULL) {
666 FreeXid(xid); 556 rc = -ENOMEM;
667 return ERR_PTR(-ENOMEM); 557 goto lookup_out;
668 } 558 }
669 559
670 if (direntry->d_inode != NULL) { 560 if (direntry->d_inode != NULL) {
@@ -687,11 +577,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
687 if (pTcon->unix_ext) { 577 if (pTcon->unix_ext) {
688 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && 578 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
689 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 579 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
690 (nd->intent.open.flags & O_CREAT)) { 580 (nd->intent.open.file->f_flags & O_CREAT)) {
691 rc = cifs_posix_open(full_path, &newInode, 581 rc = cifs_posix_open(full_path, &newInode,
692 parent_dir_inode->i_sb, 582 parent_dir_inode->i_sb,
693 nd->intent.open.create_mode, 583 nd->intent.open.create_mode,
694 nd->intent.open.flags, &oplock, 584 nd->intent.open.file->f_flags, &oplock,
695 &fileHandle, xid); 585 &fileHandle, xid);
696 /* 586 /*
697 * The check below works around a bug in POSIX 587 * The check below works around a bug in POSIX
@@ -727,9 +617,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
727 goto lookup_out; 617 goto lookup_out;
728 } 618 }
729 619
730 cfile = cifs_new_fileinfo(newInode, fileHandle, filp, 620 cfile = cifs_new_fileinfo(fileHandle, filp, tlink,
731 nd->path.mnt, 621 oplock);
732 nd->intent.open.flags);
733 if (cfile == NULL) { 622 if (cfile == NULL) {
734 fput(filp); 623 fput(filp);
735 CIFSSMBClose(xid, pTcon, fileHandle); 624 CIFSSMBClose(xid, pTcon, fileHandle);
@@ -759,6 +648,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
759 648
760lookup_out: 649lookup_out:
761 kfree(full_path); 650 kfree(full_path);
651 cifs_put_tlink(tlink);
762 FreeXid(xid); 652 FreeXid(xid);
763 return ERR_PTR(rc); 653 return ERR_PTR(rc);
764} 654}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index de748c652d11..06c3e83fa387 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -60,34 +60,32 @@ static inline int cifs_convert_flags(unsigned int flags)
60 FILE_READ_DATA); 60 FILE_READ_DATA);
61} 61}
62 62
63static inline fmode_t cifs_posix_convert_flags(unsigned int flags) 63static u32 cifs_posix_convert_flags(unsigned int flags)
64{ 64{
65 fmode_t posix_flags = 0; 65 u32 posix_flags = 0;
66 66
67 if ((flags & O_ACCMODE) == O_RDONLY) 67 if ((flags & O_ACCMODE) == O_RDONLY)
68 posix_flags = FMODE_READ; 68 posix_flags = SMB_O_RDONLY;
69 else if ((flags & O_ACCMODE) == O_WRONLY) 69 else if ((flags & O_ACCMODE) == O_WRONLY)
70 posix_flags = FMODE_WRITE; 70 posix_flags = SMB_O_WRONLY;
71 else if ((flags & O_ACCMODE) == O_RDWR) { 71 else if ((flags & O_ACCMODE) == O_RDWR)
72 /* GENERIC_ALL is too much permission to request 72 posix_flags = SMB_O_RDWR;
73 can cause unnecessary access denied on create */ 73
74 /* return GENERIC_ALL; */ 74 if (flags & O_CREAT)
75 posix_flags = FMODE_READ | FMODE_WRITE; 75 posix_flags |= SMB_O_CREAT;
76 } 76 if (flags & O_EXCL)
77 /* can not map O_CREAT or O_EXCL or O_TRUNC flags when 77 posix_flags |= SMB_O_EXCL;
78 reopening a file. They had their effect on the original open */ 78 if (flags & O_TRUNC)
79 if (flags & O_APPEND) 79 posix_flags |= SMB_O_TRUNC;
80 posix_flags |= (fmode_t)O_APPEND; 80 /* be safe and imply O_SYNC for O_DSYNC */
81 if (flags & O_DSYNC) 81 if (flags & O_DSYNC)
82 posix_flags |= (fmode_t)O_DSYNC; 82 posix_flags |= SMB_O_SYNC;
83 if (flags & __O_SYNC)
84 posix_flags |= (fmode_t)__O_SYNC;
85 if (flags & O_DIRECTORY) 83 if (flags & O_DIRECTORY)
86 posix_flags |= (fmode_t)O_DIRECTORY; 84 posix_flags |= SMB_O_DIRECTORY;
87 if (flags & O_NOFOLLOW) 85 if (flags & O_NOFOLLOW)
88 posix_flags |= (fmode_t)O_NOFOLLOW; 86 posix_flags |= SMB_O_NOFOLLOW;
89 if (flags & O_DIRECT) 87 if (flags & O_DIRECT)
90 posix_flags |= (fmode_t)O_DIRECT; 88 posix_flags |= SMB_O_DIRECT;
91 89
92 return posix_flags; 90 return posix_flags;
93} 91}
@@ -106,66 +104,8 @@ static inline int cifs_get_disposition(unsigned int flags)
106 return FILE_OPEN; 104 return FILE_OPEN;
107} 105}
108 106
109/* all arguments to this function must be checked for validity in caller */
110static inline int
111cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
112 struct cifsInodeInfo *pCifsInode, __u32 oplock,
113 u16 netfid)
114{
115
116 write_lock(&GlobalSMBSeslock);
117
118 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
119 if (pCifsInode == NULL) {
120 write_unlock(&GlobalSMBSeslock);
121 return -EINVAL;
122 }
123
124 if (pCifsInode->clientCanCacheRead) {
125 /* we have the inode open somewhere else
126 no need to discard cache data */
127 goto psx_client_can_cache;
128 }
129
130 /* BB FIXME need to fix this check to move it earlier into posix_open
131 BB fIX following section BB FIXME */
132
133 /* if not oplocked, invalidate inode pages if mtime or file
134 size changed */
135/* temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
136 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
137 (file->f_path.dentry->d_inode->i_size ==
138 (loff_t)le64_to_cpu(buf->EndOfFile))) {
139 cFYI(1, "inode unchanged on server");
140 } else {
141 if (file->f_path.dentry->d_inode->i_mapping) {
142 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
143 if (rc != 0)
144 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
145 }
146 cFYI(1, "invalidating remote inode since open detected it "
147 "changed");
148 invalidate_remote_inode(file->f_path.dentry->d_inode);
149 } */
150
151psx_client_can_cache:
152 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
153 pCifsInode->clientCanCacheAll = true;
154 pCifsInode->clientCanCacheRead = true;
155 cFYI(1, "Exclusive Oplock granted on inode %p",
156 file->f_path.dentry->d_inode);
157 } else if ((oplock & 0xF) == OPLOCK_READ)
158 pCifsInode->clientCanCacheRead = true;
159
160 /* will have to change the unlock if we reenable the
161 filemap_fdatawrite (which does not seem necessary */
162 write_unlock(&GlobalSMBSeslock);
163 return 0;
164}
165
166/* all arguments to this function must be checked for validity in caller */
167static inline int cifs_open_inode_helper(struct inode *inode, 107static inline int cifs_open_inode_helper(struct inode *inode,
168 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, 108 struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
169 char *full_path, int xid) 109 char *full_path, int xid)
170{ 110{
171 struct cifsInodeInfo *pCifsInode = CIFS_I(inode); 111 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
@@ -191,8 +131,7 @@ static inline int cifs_open_inode_helper(struct inode *inode,
191 /* BB no need to lock inode until after invalidate 131 /* BB no need to lock inode until after invalidate
192 since namei code should already have it locked? */ 132 since namei code should already have it locked? */
193 rc = filemap_write_and_wait(inode->i_mapping); 133 rc = filemap_write_and_wait(inode->i_mapping);
194 if (rc != 0) 134 mapping_set_error(inode->i_mapping, rc);
195 pCifsInode->write_behind_rc = rc;
196 } 135 }
197 cFYI(1, "invalidating remote inode since open detected it " 136 cFYI(1, "invalidating remote inode since open detected it "
198 "changed"); 137 "changed");
@@ -207,16 +146,166 @@ client_can_cache:
207 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, 146 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
208 xid, NULL); 147 xid, NULL);
209 148
210 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { 149 cifs_set_oplock_level(pCifsInode, oplock);
211 pCifsInode->clientCanCacheAll = true; 150
212 pCifsInode->clientCanCacheRead = true; 151 return rc;
213 cFYI(1, "Exclusive Oplock granted on inode %p", inode); 152}
214 } else if ((*oplock & 0xF) == OPLOCK_READ) 153
215 pCifsInode->clientCanCacheRead = true; 154int cifs_posix_open(char *full_path, struct inode **pinode,
155 struct super_block *sb, int mode, unsigned int f_flags,
156 __u32 *poplock, __u16 *pnetfid, int xid)
157{
158 int rc;
159 FILE_UNIX_BASIC_INFO *presp_data;
160 __u32 posix_flags = 0;
161 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
162 struct cifs_fattr fattr;
163 struct tcon_link *tlink;
164 struct cifsTconInfo *tcon;
165
166 cFYI(1, "posix open %s", full_path);
167
168 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
169 if (presp_data == NULL)
170 return -ENOMEM;
171
172 tlink = cifs_sb_tlink(cifs_sb);
173 if (IS_ERR(tlink)) {
174 rc = PTR_ERR(tlink);
175 goto posix_open_ret;
176 }
177
178 tcon = tlink_tcon(tlink);
179 mode &= ~current_umask();
180
181 posix_flags = cifs_posix_convert_flags(f_flags);
182 rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
183 poplock, full_path, cifs_sb->local_nls,
184 cifs_sb->mnt_cifs_flags &
185 CIFS_MOUNT_MAP_SPECIAL_CHR);
186 cifs_put_tlink(tlink);
187
188 if (rc)
189 goto posix_open_ret;
190
191 if (presp_data->Type == cpu_to_le32(-1))
192 goto posix_open_ret; /* open ok, caller does qpathinfo */
193
194 if (!pinode)
195 goto posix_open_ret; /* caller does not need info */
196
197 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
198
199 /* get new inode and set it up */
200 if (*pinode == NULL) {
201 cifs_fill_uniqueid(sb, &fattr);
202 *pinode = cifs_iget(sb, &fattr);
203 if (!*pinode) {
204 rc = -ENOMEM;
205 goto posix_open_ret;
206 }
207 } else {
208 cifs_fattr_to_inode(*pinode, &fattr);
209 }
216 210
211posix_open_ret:
212 kfree(presp_data);
217 return rc; 213 return rc;
218} 214}
219 215
216struct cifsFileInfo *
217cifs_new_fileinfo(__u16 fileHandle, struct file *file,
218 struct tcon_link *tlink, __u32 oplock)
219{
220 struct dentry *dentry = file->f_path.dentry;
221 struct inode *inode = dentry->d_inode;
222 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
223 struct cifsFileInfo *pCifsFile;
224
225 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
226 if (pCifsFile == NULL)
227 return pCifsFile;
228
229 pCifsFile->count = 1;
230 pCifsFile->netfid = fileHandle;
231 pCifsFile->pid = current->tgid;
232 pCifsFile->uid = current_fsuid();
233 pCifsFile->dentry = dget(dentry);
234 pCifsFile->f_flags = file->f_flags;
235 pCifsFile->invalidHandle = false;
236 pCifsFile->tlink = cifs_get_tlink(tlink);
237 mutex_init(&pCifsFile->fh_mutex);
238 mutex_init(&pCifsFile->lock_mutex);
239 INIT_LIST_HEAD(&pCifsFile->llist);
240 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
241
242 spin_lock(&cifs_file_list_lock);
243 list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
244 /* if readable file instance put first in list*/
245 if (file->f_mode & FMODE_READ)
246 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
247 else
248 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
249 spin_unlock(&cifs_file_list_lock);
250
251 cifs_set_oplock_level(pCifsInode, oplock);
252
253 file->private_data = pCifsFile;
254 return pCifsFile;
255}
256
257/*
258 * Release a reference on the file private data. This may involve closing
259 * the filehandle out on the server. Must be called without holding
260 * cifs_file_list_lock.
261 */
262void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
263{
264 struct inode *inode = cifs_file->dentry->d_inode;
265 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
266 struct cifsInodeInfo *cifsi = CIFS_I(inode);
267 struct cifsLockInfo *li, *tmp;
268
269 spin_lock(&cifs_file_list_lock);
270 if (--cifs_file->count > 0) {
271 spin_unlock(&cifs_file_list_lock);
272 return;
273 }
274
275 /* remove it from the lists */
276 list_del(&cifs_file->flist);
277 list_del(&cifs_file->tlist);
278
279 if (list_empty(&cifsi->openFileList)) {
280 cFYI(1, "closing last open instance for inode %p",
281 cifs_file->dentry->d_inode);
282 cifs_set_oplock_level(cifsi, 0);
283 }
284 spin_unlock(&cifs_file_list_lock);
285
286 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
287 int xid, rc;
288
289 xid = GetXid();
290 rc = CIFSSMBClose(xid, tcon, cifs_file->netfid);
291 FreeXid(xid);
292 }
293
294 /* Delete any outstanding lock records. We'll lose them when the file
295 * is closed anyway.
296 */
297 mutex_lock(&cifs_file->lock_mutex);
298 list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
299 list_del(&li->llist);
300 kfree(li);
301 }
302 mutex_unlock(&cifs_file->lock_mutex);
303
304 cifs_put_tlink(cifs_file->tlink);
305 dput(cifs_file->dentry);
306 kfree(cifs_file);
307}
308
220int cifs_open(struct inode *inode, struct file *file) 309int cifs_open(struct inode *inode, struct file *file)
221{ 310{
222 int rc = -EACCES; 311 int rc = -EACCES;
@@ -224,6 +313,7 @@ int cifs_open(struct inode *inode, struct file *file)
224 __u32 oplock; 313 __u32 oplock;
225 struct cifs_sb_info *cifs_sb; 314 struct cifs_sb_info *cifs_sb;
226 struct cifsTconInfo *tcon; 315 struct cifsTconInfo *tcon;
316 struct tcon_link *tlink;
227 struct cifsFileInfo *pCifsFile = NULL; 317 struct cifsFileInfo *pCifsFile = NULL;
228 struct cifsInodeInfo *pCifsInode; 318 struct cifsInodeInfo *pCifsInode;
229 char *full_path = NULL; 319 char *full_path = NULL;
@@ -235,7 +325,12 @@ int cifs_open(struct inode *inode, struct file *file)
235 xid = GetXid(); 325 xid = GetXid();
236 326
237 cifs_sb = CIFS_SB(inode->i_sb); 327 cifs_sb = CIFS_SB(inode->i_sb);
238 tcon = cifs_sb->tcon; 328 tlink = cifs_sb_tlink(cifs_sb);
329 if (IS_ERR(tlink)) {
330 FreeXid(xid);
331 return PTR_ERR(tlink);
332 }
333 tcon = tlink_tcon(tlink);
239 334
240 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 335 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
241 336
@@ -257,27 +352,15 @@ int cifs_open(struct inode *inode, struct file *file)
257 (tcon->ses->capabilities & CAP_UNIX) && 352 (tcon->ses->capabilities & CAP_UNIX) &&
258 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 353 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
259 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 354 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
260 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
261 oflags |= SMB_O_CREAT;
262 /* can not refresh inode info since size could be stale */ 355 /* can not refresh inode info since size could be stale */
263 rc = cifs_posix_open(full_path, &inode, inode->i_sb, 356 rc = cifs_posix_open(full_path, &inode, inode->i_sb,
264 cifs_sb->mnt_file_mode /* ignored */, 357 cifs_sb->mnt_file_mode /* ignored */,
265 oflags, &oplock, &netfid, xid); 358 file->f_flags, &oplock, &netfid, xid);
266 if (rc == 0) { 359 if (rc == 0) {
267 cFYI(1, "posix open succeeded"); 360 cFYI(1, "posix open succeeded");
268 /* no need for special case handling of setting mode
269 on read only files needed here */
270
271 rc = cifs_posix_open_inode_helper(inode, file,
272 pCifsInode, oplock, netfid);
273 if (rc != 0) {
274 CIFSSMBClose(xid, tcon, netfid);
275 goto out;
276 }
277 361
278 pCifsFile = cifs_new_fileinfo(inode, netfid, file, 362 pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
279 file->f_path.mnt, 363 oplock);
280 oflags);
281 if (pCifsFile == NULL) { 364 if (pCifsFile == NULL) {
282 CIFSSMBClose(xid, tcon, netfid); 365 CIFSSMBClose(xid, tcon, netfid);
283 rc = -ENOMEM; 366 rc = -ENOMEM;
@@ -345,7 +428,7 @@ int cifs_open(struct inode *inode, struct file *file)
345 goto out; 428 goto out;
346 } 429 }
347 430
348 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 431 if (tcon->ses->capabilities & CAP_NT_SMBS)
349 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 432 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
350 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, 433 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
351 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags 434 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
@@ -365,12 +448,11 @@ int cifs_open(struct inode *inode, struct file *file)
365 goto out; 448 goto out;
366 } 449 }
367 450
368 rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid); 451 rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
369 if (rc != 0) 452 if (rc != 0)
370 goto out; 453 goto out;
371 454
372 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, 455 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
373 file->f_flags);
374 if (pCifsFile == NULL) { 456 if (pCifsFile == NULL) {
375 rc = -ENOMEM; 457 rc = -ENOMEM;
376 goto out; 458 goto out;
@@ -402,6 +484,7 @@ out:
402 kfree(buf); 484 kfree(buf);
403 kfree(full_path); 485 kfree(full_path);
404 FreeXid(xid); 486 FreeXid(xid);
487 cifs_put_tlink(tlink);
405 return rc; 488 return rc;
406} 489}
407 490
@@ -416,14 +499,13 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
416 return rc; 499 return rc;
417} 500}
418 501
419static int cifs_reopen_file(struct file *file, bool can_flush) 502static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
420{ 503{
421 int rc = -EACCES; 504 int rc = -EACCES;
422 int xid; 505 int xid;
423 __u32 oplock; 506 __u32 oplock;
424 struct cifs_sb_info *cifs_sb; 507 struct cifs_sb_info *cifs_sb;
425 struct cifsTconInfo *tcon; 508 struct cifsTconInfo *tcon;
426 struct cifsFileInfo *pCifsFile;
427 struct cifsInodeInfo *pCifsInode; 509 struct cifsInodeInfo *pCifsInode;
428 struct inode *inode; 510 struct inode *inode;
429 char *full_path = NULL; 511 char *full_path = NULL;
@@ -431,11 +513,6 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
431 int disposition = FILE_OPEN; 513 int disposition = FILE_OPEN;
432 __u16 netfid; 514 __u16 netfid;
433 515
434 if (file->private_data)
435 pCifsFile = file->private_data;
436 else
437 return -EBADF;
438
439 xid = GetXid(); 516 xid = GetXid();
440 mutex_lock(&pCifsFile->fh_mutex); 517 mutex_lock(&pCifsFile->fh_mutex);
441 if (!pCifsFile->invalidHandle) { 518 if (!pCifsFile->invalidHandle) {
@@ -445,39 +522,24 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
445 return rc; 522 return rc;
446 } 523 }
447 524
448 if (file->f_path.dentry == NULL) { 525 inode = pCifsFile->dentry->d_inode;
449 cERROR(1, "no valid name if dentry freed");
450 dump_stack();
451 rc = -EBADF;
452 goto reopen_error_exit;
453 }
454
455 inode = file->f_path.dentry->d_inode;
456 if (inode == NULL) {
457 cERROR(1, "inode not valid");
458 dump_stack();
459 rc = -EBADF;
460 goto reopen_error_exit;
461 }
462
463 cifs_sb = CIFS_SB(inode->i_sb); 526 cifs_sb = CIFS_SB(inode->i_sb);
464 tcon = cifs_sb->tcon; 527 tcon = tlink_tcon(pCifsFile->tlink);
465 528
466/* can not grab rename sem here because various ops, including 529/* can not grab rename sem here because various ops, including
467 those that already have the rename sem can end up causing writepage 530 those that already have the rename sem can end up causing writepage
468 to get called and if the server was down that means we end up here, 531 to get called and if the server was down that means we end up here,
469 and we can never tell if the caller already has the rename_sem */ 532 and we can never tell if the caller already has the rename_sem */
470 full_path = build_path_from_dentry(file->f_path.dentry); 533 full_path = build_path_from_dentry(pCifsFile->dentry);
471 if (full_path == NULL) { 534 if (full_path == NULL) {
472 rc = -ENOMEM; 535 rc = -ENOMEM;
473reopen_error_exit:
474 mutex_unlock(&pCifsFile->fh_mutex); 536 mutex_unlock(&pCifsFile->fh_mutex);
475 FreeXid(xid); 537 FreeXid(xid);
476 return rc; 538 return rc;
477 } 539 }
478 540
479 cFYI(1, "inode = 0x%p file flags 0x%x for %s", 541 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
480 inode, file->f_flags, full_path); 542 inode, pCifsFile->f_flags, full_path);
481 543
482 if (oplockEnabled) 544 if (oplockEnabled)
483 oplock = REQ_OPLOCK; 545 oplock = REQ_OPLOCK;
@@ -487,8 +549,14 @@ reopen_error_exit:
487 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 549 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
488 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 550 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
489 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 551 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
490 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 552
491 /* can not refresh inode info since size could be stale */ 553 /*
554 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
555 * original open. Must mask them off for a reopen.
556 */
557 unsigned int oflags = pCifsFile->f_flags &
558 ~(O_CREAT | O_EXCL | O_TRUNC);
559
492 rc = cifs_posix_open(full_path, NULL, inode->i_sb, 560 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
493 cifs_sb->mnt_file_mode /* ignored */, 561 cifs_sb->mnt_file_mode /* ignored */,
494 oflags, &oplock, &netfid, xid); 562 oflags, &oplock, &netfid, xid);
@@ -500,7 +568,7 @@ reopen_error_exit:
500 in the reconnect path it is important to retry hard */ 568 in the reconnect path it is important to retry hard */
501 } 569 }
502 570
503 desiredAccess = cifs_convert_flags(file->f_flags); 571 desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
504 572
505 /* Can not refresh inode by passing in file_info buf to be returned 573 /* Can not refresh inode by passing in file_info buf to be returned
506 by SMBOpen and then calling get_inode_info with returned buf 574 by SMBOpen and then calling get_inode_info with returned buf
@@ -516,49 +584,38 @@ reopen_error_exit:
516 mutex_unlock(&pCifsFile->fh_mutex); 584 mutex_unlock(&pCifsFile->fh_mutex);
517 cFYI(1, "cifs_open returned 0x%x", rc); 585 cFYI(1, "cifs_open returned 0x%x", rc);
518 cFYI(1, "oplock: %d", oplock); 586 cFYI(1, "oplock: %d", oplock);
519 } else { 587 goto reopen_error_exit;
520reopen_success:
521 pCifsFile->netfid = netfid;
522 pCifsFile->invalidHandle = false;
523 mutex_unlock(&pCifsFile->fh_mutex);
524 pCifsInode = CIFS_I(inode);
525 if (pCifsInode) {
526 if (can_flush) {
527 rc = filemap_write_and_wait(inode->i_mapping);
528 if (rc != 0)
529 CIFS_I(inode)->write_behind_rc = rc;
530 /* temporarily disable caching while we
531 go to server to get inode info */
532 pCifsInode->clientCanCacheAll = false;
533 pCifsInode->clientCanCacheRead = false;
534 if (tcon->unix_ext)
535 rc = cifs_get_inode_info_unix(&inode,
536 full_path, inode->i_sb, xid);
537 else
538 rc = cifs_get_inode_info(&inode,
539 full_path, NULL, inode->i_sb,
540 xid, NULL);
541 } /* else we are writing out data to server already
542 and could deadlock if we tried to flush data, and
543 since we do not know if we have data that would
544 invalidate the current end of file on the server
545 we can not go to the server to get the new inod
546 info */
547 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
548 pCifsInode->clientCanCacheAll = true;
549 pCifsInode->clientCanCacheRead = true;
550 cFYI(1, "Exclusive Oplock granted on inode %p",
551 file->f_path.dentry->d_inode);
552 } else if ((oplock & 0xF) == OPLOCK_READ) {
553 pCifsInode->clientCanCacheRead = true;
554 pCifsInode->clientCanCacheAll = false;
555 } else {
556 pCifsInode->clientCanCacheRead = false;
557 pCifsInode->clientCanCacheAll = false;
558 }
559 cifs_relock_file(pCifsFile);
560 }
561 } 588 }
589
590reopen_success:
591 pCifsFile->netfid = netfid;
592 pCifsFile->invalidHandle = false;
593 mutex_unlock(&pCifsFile->fh_mutex);
594 pCifsInode = CIFS_I(inode);
595
596 if (can_flush) {
597 rc = filemap_write_and_wait(inode->i_mapping);
598 mapping_set_error(inode->i_mapping, rc);
599
600 if (tcon->unix_ext)
601 rc = cifs_get_inode_info_unix(&inode,
602 full_path, inode->i_sb, xid);
603 else
604 rc = cifs_get_inode_info(&inode,
605 full_path, NULL, inode->i_sb,
606 xid, NULL);
607 } /* else we are writing out data to server already
608 and could deadlock if we tried to flush data, and
609 since we do not know if we have data that would
610 invalidate the current end of file on the server
611 we can not go to the server to get the new inod
612 info */
613
614 cifs_set_oplock_level(pCifsInode, oplock);
615
616 cifs_relock_file(pCifsFile);
617
618reopen_error_exit:
562 kfree(full_path); 619 kfree(full_path);
563 FreeXid(xid); 620 FreeXid(xid);
564 return rc; 621 return rc;
@@ -566,79 +623,11 @@ reopen_success:
566 623
567int cifs_close(struct inode *inode, struct file *file) 624int cifs_close(struct inode *inode, struct file *file)
568{ 625{
569 int rc = 0; 626 cifsFileInfo_put(file->private_data);
570 int xid, timeout; 627 file->private_data = NULL;
571 struct cifs_sb_info *cifs_sb;
572 struct cifsTconInfo *pTcon;
573 struct cifsFileInfo *pSMBFile = file->private_data;
574
575 xid = GetXid();
576 628
577 cifs_sb = CIFS_SB(inode->i_sb); 629 /* return code from the ->release op is always ignored */
578 pTcon = cifs_sb->tcon; 630 return 0;
579 if (pSMBFile) {
580 struct cifsLockInfo *li, *tmp;
581 write_lock(&GlobalSMBSeslock);
582 pSMBFile->closePend = true;
583 if (pTcon) {
584 /* no sense reconnecting to close a file that is
585 already closed */
586 if (!pTcon->need_reconnect) {
587 write_unlock(&GlobalSMBSeslock);
588 timeout = 2;
589 while ((atomic_read(&pSMBFile->count) != 1)
590 && (timeout <= 2048)) {
591 /* Give write a better chance to get to
592 server ahead of the close. We do not
593 want to add a wait_q here as it would
594 increase the memory utilization as
595 the struct would be in each open file,
596 but this should give enough time to
597 clear the socket */
598 cFYI(DBG2, "close delay, write pending");
599 msleep(timeout);
600 timeout *= 4;
601 }
602 if (!pTcon->need_reconnect &&
603 !pSMBFile->invalidHandle)
604 rc = CIFSSMBClose(xid, pTcon,
605 pSMBFile->netfid);
606 } else
607 write_unlock(&GlobalSMBSeslock);
608 } else
609 write_unlock(&GlobalSMBSeslock);
610
611 /* Delete any outstanding lock records.
612 We'll lose them when the file is closed anyway. */
613 mutex_lock(&pSMBFile->lock_mutex);
614 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
615 list_del(&li->llist);
616 kfree(li);
617 }
618 mutex_unlock(&pSMBFile->lock_mutex);
619
620 write_lock(&GlobalSMBSeslock);
621 list_del(&pSMBFile->flist);
622 list_del(&pSMBFile->tlist);
623 write_unlock(&GlobalSMBSeslock);
624 cifsFileInfo_put(file->private_data);
625 file->private_data = NULL;
626 } else
627 rc = -EBADF;
628
629 read_lock(&GlobalSMBSeslock);
630 if (list_empty(&(CIFS_I(inode)->openFileList))) {
631 cFYI(1, "closing last open instance for inode %p", inode);
632 /* if the file is not open we do not know if we can cache info
633 on this inode, much less write behind and read ahead */
634 CIFS_I(inode)->clientCanCacheRead = false;
635 CIFS_I(inode)->clientCanCacheAll = false;
636 }
637 read_unlock(&GlobalSMBSeslock);
638 if ((rc == 0) && CIFS_I(inode)->write_behind_rc)
639 rc = CIFS_I(inode)->write_behind_rc;
640 FreeXid(xid);
641 return rc;
642} 631}
643 632
644int cifs_closedir(struct inode *inode, struct file *file) 633int cifs_closedir(struct inode *inode, struct file *file)
@@ -653,25 +642,21 @@ int cifs_closedir(struct inode *inode, struct file *file)
653 xid = GetXid(); 642 xid = GetXid();
654 643
655 if (pCFileStruct) { 644 if (pCFileStruct) {
656 struct cifsTconInfo *pTcon; 645 struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
657 struct cifs_sb_info *cifs_sb =
658 CIFS_SB(file->f_path.dentry->d_sb);
659
660 pTcon = cifs_sb->tcon;
661 646
662 cFYI(1, "Freeing private data in close dir"); 647 cFYI(1, "Freeing private data in close dir");
663 write_lock(&GlobalSMBSeslock); 648 spin_lock(&cifs_file_list_lock);
664 if (!pCFileStruct->srch_inf.endOfSearch && 649 if (!pCFileStruct->srch_inf.endOfSearch &&
665 !pCFileStruct->invalidHandle) { 650 !pCFileStruct->invalidHandle) {
666 pCFileStruct->invalidHandle = true; 651 pCFileStruct->invalidHandle = true;
667 write_unlock(&GlobalSMBSeslock); 652 spin_unlock(&cifs_file_list_lock);
668 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 653 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
669 cFYI(1, "Closing uncompleted readdir with rc %d", 654 cFYI(1, "Closing uncompleted readdir with rc %d",
670 rc); 655 rc);
671 /* not much we can do if it fails anyway, ignore rc */ 656 /* not much we can do if it fails anyway, ignore rc */
672 rc = 0; 657 rc = 0;
673 } else 658 } else
674 write_unlock(&GlobalSMBSeslock); 659 spin_unlock(&cifs_file_list_lock);
675 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 660 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
676 if (ptmp) { 661 if (ptmp) {
677 cFYI(1, "closedir free smb buf in srch struct"); 662 cFYI(1, "closedir free smb buf in srch struct");
@@ -681,6 +666,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
681 else 666 else
682 cifs_buf_release(ptmp); 667 cifs_buf_release(ptmp);
683 } 668 }
669 cifs_put_tlink(pCFileStruct->tlink);
684 kfree(file->private_data); 670 kfree(file->private_data);
685 file->private_data = NULL; 671 file->private_data = NULL;
686 } 672 }
@@ -767,13 +753,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
767 cFYI(1, "Unknown type of lock"); 753 cFYI(1, "Unknown type of lock");
768 754
769 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 755 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
770 tcon = cifs_sb->tcon; 756 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
771
772 if (file->private_data == NULL) {
773 rc = -EBADF;
774 FreeXid(xid);
775 return rc;
776 }
777 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 757 netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
778 758
779 if ((tcon->ses->capabilities & CAP_UNIX) && 759 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -949,6 +929,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
949ssize_t cifs_user_write(struct file *file, const char __user *write_data, 929ssize_t cifs_user_write(struct file *file, const char __user *write_data,
950 size_t write_size, loff_t *poffset) 930 size_t write_size, loff_t *poffset)
951{ 931{
932 struct inode *inode = file->f_path.dentry->d_inode;
952 int rc = 0; 933 int rc = 0;
953 unsigned int bytes_written = 0; 934 unsigned int bytes_written = 0;
954 unsigned int total_written; 935 unsigned int total_written;
@@ -956,18 +937,18 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
956 struct cifsTconInfo *pTcon; 937 struct cifsTconInfo *pTcon;
957 int xid, long_op; 938 int xid, long_op;
958 struct cifsFileInfo *open_file; 939 struct cifsFileInfo *open_file;
959 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 940 struct cifsInodeInfo *cifsi = CIFS_I(inode);
960 941
961 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 942 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
962 943
963 pTcon = cifs_sb->tcon;
964
965 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size, 944 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
966 *poffset, file->f_path.dentry->d_name.name); */ 945 *poffset, file->f_path.dentry->d_name.name); */
967 946
968 if (file->private_data == NULL) 947 if (file->private_data == NULL)
969 return -EBADF; 948 return -EBADF;
949
970 open_file = file->private_data; 950 open_file = file->private_data;
951 pTcon = tlink_tcon(open_file->tlink);
971 952
972 rc = generic_write_checks(file, poffset, &write_size, 0); 953 rc = generic_write_checks(file, poffset, &write_size, 0);
973 if (rc) 954 if (rc)
@@ -988,19 +969,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
988 we blocked so return what we managed to write */ 969 we blocked so return what we managed to write */
989 return total_written; 970 return total_written;
990 } 971 }
991 if (open_file->closePend) {
992 FreeXid(xid);
993 if (total_written)
994 return total_written;
995 else
996 return -EBADF;
997 }
998 if (open_file->invalidHandle) { 972 if (open_file->invalidHandle) {
999 /* we could deadlock if we called 973 /* we could deadlock if we called
1000 filemap_fdatawait from here so tell 974 filemap_fdatawait from here so tell
1001 reopen_file not to flush data to server 975 reopen_file not to flush data to server
1002 now */ 976 now */
1003 rc = cifs_reopen_file(file, false); 977 rc = cifs_reopen_file(open_file, false);
1004 if (rc != 0) 978 if (rc != 0)
1005 break; 979 break;
1006 } 980 }
@@ -1029,27 +1003,24 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1029 1003
1030 cifs_stats_bytes_written(pTcon, total_written); 1004 cifs_stats_bytes_written(pTcon, total_written);
1031 1005
1032 /* since the write may have blocked check these pointers again */
1033 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
1034 struct inode *inode = file->f_path.dentry->d_inode;
1035/* Do not update local mtime - server will set its actual value on write 1006/* Do not update local mtime - server will set its actual value on write
1036 * inode->i_ctime = inode->i_mtime = 1007 * inode->i_ctime = inode->i_mtime =
1037 * current_fs_time(inode->i_sb);*/ 1008 * current_fs_time(inode->i_sb);*/
1038 if (total_written > 0) { 1009 if (total_written > 0) {
1039 spin_lock(&inode->i_lock); 1010 spin_lock(&inode->i_lock);
1040 if (*poffset > file->f_path.dentry->d_inode->i_size) 1011 if (*poffset > inode->i_size)
1041 i_size_write(file->f_path.dentry->d_inode, 1012 i_size_write(inode, *poffset);
1042 *poffset); 1013 spin_unlock(&inode->i_lock);
1043 spin_unlock(&inode->i_lock);
1044 }
1045 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1046 } 1014 }
1015 mark_inode_dirty_sync(inode);
1016
1047 FreeXid(xid); 1017 FreeXid(xid);
1048 return total_written; 1018 return total_written;
1049} 1019}
1050 1020
1051static ssize_t cifs_write(struct file *file, const char *write_data, 1021static ssize_t cifs_write(struct cifsFileInfo *open_file,
1052 size_t write_size, loff_t *poffset) 1022 const char *write_data, size_t write_size,
1023 loff_t *poffset)
1053{ 1024{
1054 int rc = 0; 1025 int rc = 0;
1055 unsigned int bytes_written = 0; 1026 unsigned int bytes_written = 0;
@@ -1057,19 +1028,15 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1057 struct cifs_sb_info *cifs_sb; 1028 struct cifs_sb_info *cifs_sb;
1058 struct cifsTconInfo *pTcon; 1029 struct cifsTconInfo *pTcon;
1059 int xid, long_op; 1030 int xid, long_op;
1060 struct cifsFileInfo *open_file; 1031 struct dentry *dentry = open_file->dentry;
1061 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 1032 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
1062
1063 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1064 1033
1065 pTcon = cifs_sb->tcon; 1034 cifs_sb = CIFS_SB(dentry->d_sb);
1066 1035
1067 cFYI(1, "write %zd bytes to offset %lld of %s", write_size, 1036 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1068 *poffset, file->f_path.dentry->d_name.name); 1037 *poffset, dentry->d_name.name);
1069 1038
1070 if (file->private_data == NULL) 1039 pTcon = tlink_tcon(open_file->tlink);
1071 return -EBADF;
1072 open_file = file->private_data;
1073 1040
1074 xid = GetXid(); 1041 xid = GetXid();
1075 1042
@@ -1078,28 +1045,12 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1078 total_written += bytes_written) { 1045 total_written += bytes_written) {
1079 rc = -EAGAIN; 1046 rc = -EAGAIN;
1080 while (rc == -EAGAIN) { 1047 while (rc == -EAGAIN) {
1081 if (file->private_data == NULL) {
1082 /* file has been closed on us */
1083 FreeXid(xid);
1084 /* if we have gotten here we have written some data
1085 and blocked, and the file has been freed on us
1086 while we blocked so return what we managed to
1087 write */
1088 return total_written;
1089 }
1090 if (open_file->closePend) {
1091 FreeXid(xid);
1092 if (total_written)
1093 return total_written;
1094 else
1095 return -EBADF;
1096 }
1097 if (open_file->invalidHandle) { 1048 if (open_file->invalidHandle) {
1098 /* we could deadlock if we called 1049 /* we could deadlock if we called
1099 filemap_fdatawait from here so tell 1050 filemap_fdatawait from here so tell
1100 reopen_file not to flush data to 1051 reopen_file not to flush data to
1101 server now */ 1052 server now */
1102 rc = cifs_reopen_file(file, false); 1053 rc = cifs_reopen_file(open_file, false);
1103 if (rc != 0) 1054 if (rc != 0)
1104 break; 1055 break;
1105 } 1056 }
@@ -1146,43 +1097,41 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1146 1097
1147 cifs_stats_bytes_written(pTcon, total_written); 1098 cifs_stats_bytes_written(pTcon, total_written);
1148 1099
1149 /* since the write may have blocked check these pointers again */ 1100 if (total_written > 0) {
1150 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { 1101 spin_lock(&dentry->d_inode->i_lock);
1151/*BB We could make this contingent on superblock ATIME flag too */ 1102 if (*poffset > dentry->d_inode->i_size)
1152/* file->f_path.dentry->d_inode->i_ctime = 1103 i_size_write(dentry->d_inode, *poffset);
1153 file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;*/ 1104 spin_unlock(&dentry->d_inode->i_lock);
1154 if (total_written > 0) {
1155 spin_lock(&file->f_path.dentry->d_inode->i_lock);
1156 if (*poffset > file->f_path.dentry->d_inode->i_size)
1157 i_size_write(file->f_path.dentry->d_inode,
1158 *poffset);
1159 spin_unlock(&file->f_path.dentry->d_inode->i_lock);
1160 }
1161 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1162 } 1105 }
1106 mark_inode_dirty_sync(dentry->d_inode);
1163 FreeXid(xid); 1107 FreeXid(xid);
1164 return total_written; 1108 return total_written;
1165} 1109}
1166 1110
1167#ifdef CONFIG_CIFS_EXPERIMENTAL 1111#ifdef CONFIG_CIFS_EXPERIMENTAL
1168struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) 1112struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1113 bool fsuid_only)
1169{ 1114{
1170 struct cifsFileInfo *open_file = NULL; 1115 struct cifsFileInfo *open_file = NULL;
1116 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1117
1118 /* only filter by fsuid on multiuser mounts */
1119 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1120 fsuid_only = false;
1171 1121
1172 read_lock(&GlobalSMBSeslock); 1122 spin_lock(&cifs_file_list_lock);
1173 /* we could simply get the first_list_entry since write-only entries 1123 /* we could simply get the first_list_entry since write-only entries
1174 are always at the end of the list but since the first entry might 1124 are always at the end of the list but since the first entry might
1175 have a close pending, we go through the whole list */ 1125 have a close pending, we go through the whole list */
1176 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1126 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1177 if (open_file->closePend) 1127 if (fsuid_only && open_file->uid != current_fsuid())
1178 continue; 1128 continue;
1179 if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || 1129 if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
1180 (open_file->pfile->f_flags & O_RDONLY))) {
1181 if (!open_file->invalidHandle) { 1130 if (!open_file->invalidHandle) {
1182 /* found a good file */ 1131 /* found a good file */
1183 /* lock it so it will not be closed on us */ 1132 /* lock it so it will not be closed on us */
1184 cifsFileInfo_get(open_file); 1133 cifsFileInfo_get(open_file);
1185 read_unlock(&GlobalSMBSeslock); 1134 spin_unlock(&cifs_file_list_lock);
1186 return open_file; 1135 return open_file;
1187 } /* else might as well continue, and look for 1136 } /* else might as well continue, and look for
1188 another, or simply have the caller reopen it 1137 another, or simply have the caller reopen it
@@ -1190,14 +1139,16 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1190 } else /* write only file */ 1139 } else /* write only file */
1191 break; /* write only files are last so must be done */ 1140 break; /* write only files are last so must be done */
1192 } 1141 }
1193 read_unlock(&GlobalSMBSeslock); 1142 spin_unlock(&cifs_file_list_lock);
1194 return NULL; 1143 return NULL;
1195} 1144}
1196#endif 1145#endif
1197 1146
1198struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) 1147struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1148 bool fsuid_only)
1199{ 1149{
1200 struct cifsFileInfo *open_file; 1150 struct cifsFileInfo *open_file;
1151 struct cifs_sb_info *cifs_sb;
1201 bool any_available = false; 1152 bool any_available = false;
1202 int rc; 1153 int rc;
1203 1154
@@ -1211,53 +1162,41 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1211 return NULL; 1162 return NULL;
1212 } 1163 }
1213 1164
1214 read_lock(&GlobalSMBSeslock); 1165 cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1166
1167 /* only filter by fsuid on multiuser mounts */
1168 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1169 fsuid_only = false;
1170
1171 spin_lock(&cifs_file_list_lock);
1215refind_writable: 1172refind_writable:
1216 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1173 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1217 if (open_file->closePend || 1174 if (!any_available && open_file->pid != current->tgid)
1218 (!any_available && open_file->pid != current->tgid))
1219 continue; 1175 continue;
1220 1176 if (fsuid_only && open_file->uid != current_fsuid())
1221 if (open_file->pfile && 1177 continue;
1222 ((open_file->pfile->f_flags & O_RDWR) || 1178 if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
1223 (open_file->pfile->f_flags & O_WRONLY))) {
1224 cifsFileInfo_get(open_file); 1179 cifsFileInfo_get(open_file);
1225 1180
1226 if (!open_file->invalidHandle) { 1181 if (!open_file->invalidHandle) {
1227 /* found a good writable file */ 1182 /* found a good writable file */
1228 read_unlock(&GlobalSMBSeslock); 1183 spin_unlock(&cifs_file_list_lock);
1229 return open_file; 1184 return open_file;
1230 } 1185 }
1231 1186
1232 read_unlock(&GlobalSMBSeslock); 1187 spin_unlock(&cifs_file_list_lock);
1188
1233 /* Had to unlock since following call can block */ 1189 /* Had to unlock since following call can block */
1234 rc = cifs_reopen_file(open_file->pfile, false); 1190 rc = cifs_reopen_file(open_file, false);
1235 if (!rc) { 1191 if (!rc)
1236 if (!open_file->closePend) 1192 return open_file;
1237 return open_file;
1238 else { /* start over in case this was deleted */
1239 /* since the list could be modified */
1240 read_lock(&GlobalSMBSeslock);
1241 cifsFileInfo_put(open_file);
1242 goto refind_writable;
1243 }
1244 }
1245 1193
1246 /* if it fails, try another handle if possible - 1194 /* if it fails, try another handle if possible */
1247 (we can not do this if closePending since
1248 loop could be modified - in which case we
1249 have to start at the beginning of the list
1250 again. Note that it would be bad
1251 to hold up writepages here (rather than
1252 in caller) with continuous retries */
1253 cFYI(1, "wp failed on reopen file"); 1195 cFYI(1, "wp failed on reopen file");
1254 read_lock(&GlobalSMBSeslock);
1255 /* can not use this handle, no write
1256 pending on this one after all */
1257 cifsFileInfo_put(open_file); 1196 cifsFileInfo_put(open_file);
1258 1197
1259 if (open_file->closePend) /* list could have changed */ 1198 spin_lock(&cifs_file_list_lock);
1260 goto refind_writable; 1199
1261 /* else we simply continue to the next entry. Thus 1200 /* else we simply continue to the next entry. Thus
1262 we do not loop on reopen errors. If we 1201 we do not loop on reopen errors. If we
1263 can not reopen the file, for example if we 1202 can not reopen the file, for example if we
@@ -1272,7 +1211,7 @@ refind_writable:
1272 any_available = true; 1211 any_available = true;
1273 goto refind_writable; 1212 goto refind_writable;
1274 } 1213 }
1275 read_unlock(&GlobalSMBSeslock); 1214 spin_unlock(&cifs_file_list_lock);
1276 return NULL; 1215 return NULL;
1277} 1216}
1278 1217
@@ -1284,7 +1223,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1284 int rc = -EFAULT; 1223 int rc = -EFAULT;
1285 int bytes_written = 0; 1224 int bytes_written = 0;
1286 struct cifs_sb_info *cifs_sb; 1225 struct cifs_sb_info *cifs_sb;
1287 struct cifsTconInfo *pTcon;
1288 struct inode *inode; 1226 struct inode *inode;
1289 struct cifsFileInfo *open_file; 1227 struct cifsFileInfo *open_file;
1290 1228
@@ -1293,7 +1231,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1293 1231
1294 inode = page->mapping->host; 1232 inode = page->mapping->host;
1295 cifs_sb = CIFS_SB(inode->i_sb); 1233 cifs_sb = CIFS_SB(inode->i_sb);
1296 pTcon = cifs_sb->tcon;
1297 1234
1298 offset += (loff_t)from; 1235 offset += (loff_t)from;
1299 write_data = kmap(page); 1236 write_data = kmap(page);
@@ -1314,10 +1251,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1314 if (mapping->host->i_size - offset < (loff_t)to) 1251 if (mapping->host->i_size - offset < (loff_t)to)
1315 to = (unsigned)(mapping->host->i_size - offset); 1252 to = (unsigned)(mapping->host->i_size - offset);
1316 1253
1317 open_file = find_writable_file(CIFS_I(mapping->host)); 1254 open_file = find_writable_file(CIFS_I(mapping->host), false);
1318 if (open_file) { 1255 if (open_file) {
1319 bytes_written = cifs_write(open_file->pfile, write_data, 1256 bytes_written = cifs_write(open_file, write_data,
1320 to-from, &offset); 1257 to - from, &offset);
1321 cifsFileInfo_put(open_file); 1258 cifsFileInfo_put(open_file);
1322 /* Does mm or vfs already set times? */ 1259 /* Does mm or vfs already set times? */
1323 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); 1260 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1337,7 +1274,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1337static int cifs_writepages(struct address_space *mapping, 1274static int cifs_writepages(struct address_space *mapping,
1338 struct writeback_control *wbc) 1275 struct writeback_control *wbc)
1339{ 1276{
1340 struct backing_dev_info *bdi = mapping->backing_dev_info;
1341 unsigned int bytes_to_write; 1277 unsigned int bytes_to_write;
1342 unsigned int bytes_written; 1278 unsigned int bytes_written;
1343 struct cifs_sb_info *cifs_sb; 1279 struct cifs_sb_info *cifs_sb;
@@ -1352,6 +1288,7 @@ static int cifs_writepages(struct address_space *mapping,
1352 int nr_pages; 1288 int nr_pages;
1353 __u64 offset = 0; 1289 __u64 offset = 0;
1354 struct cifsFileInfo *open_file; 1290 struct cifsFileInfo *open_file;
1291 struct cifsTconInfo *tcon;
1355 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host); 1292 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
1356 struct page *page; 1293 struct page *page;
1357 struct pagevec pvec; 1294 struct pagevec pvec;
@@ -1368,26 +1305,29 @@ static int cifs_writepages(struct address_space *mapping,
1368 if (cifs_sb->wsize < PAGE_CACHE_SIZE) 1305 if (cifs_sb->wsize < PAGE_CACHE_SIZE)
1369 return generic_writepages(mapping, wbc); 1306 return generic_writepages(mapping, wbc);
1370 1307
1371 if ((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
1372 if (cifs_sb->tcon->ses->server->secMode &
1373 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
1374 if (!experimEnabled)
1375 return generic_writepages(mapping, wbc);
1376
1377 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL); 1308 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
1378 if (iov == NULL) 1309 if (iov == NULL)
1379 return generic_writepages(mapping, wbc); 1310 return generic_writepages(mapping, wbc);
1380 1311
1381
1382 /* 1312 /*
1383 * BB: Is this meaningful for a non-block-device file system? 1313 * if there's no open file, then this is likely to fail too,
1384 * If it is, we should test it again after we do I/O 1314 * but it'll at least handle the return. Maybe it should be
1315 * a BUG() instead?
1385 */ 1316 */
1386 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1317 open_file = find_writable_file(CIFS_I(mapping->host), false);
1387 wbc->encountered_congestion = 1; 1318 if (!open_file) {
1388 kfree(iov); 1319 kfree(iov);
1389 return 0; 1320 return generic_writepages(mapping, wbc);
1321 }
1322
1323 tcon = tlink_tcon(open_file->tlink);
1324 if (!experimEnabled && tcon->ses->server->secMode &
1325 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1326 cifsFileInfo_put(open_file);
1327 kfree(iov);
1328 return generic_writepages(mapping, wbc);
1390 } 1329 }
1330 cifsFileInfo_put(open_file);
1391 1331
1392 xid = GetXid(); 1332 xid = GetXid();
1393 1333
@@ -1492,38 +1432,29 @@ retry:
1492 break; 1432 break;
1493 } 1433 }
1494 if (n_iov) { 1434 if (n_iov) {
1495 /* Search for a writable handle every time we call 1435 open_file = find_writable_file(CIFS_I(mapping->host),
1496 * CIFSSMBWrite2. We can't rely on the last handle 1436 false);
1497 * we used to still be valid
1498 */
1499 open_file = find_writable_file(CIFS_I(mapping->host));
1500 if (!open_file) { 1437 if (!open_file) {
1501 cERROR(1, "No writable handles for inode"); 1438 cERROR(1, "No writable handles for inode");
1502 rc = -EBADF; 1439 rc = -EBADF;
1503 } else { 1440 } else {
1504 long_op = cifs_write_timeout(cifsi, offset); 1441 long_op = cifs_write_timeout(cifsi, offset);
1505 rc = CIFSSMBWrite2(xid, cifs_sb->tcon, 1442 rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
1506 open_file->netfid,
1507 bytes_to_write, offset, 1443 bytes_to_write, offset,
1508 &bytes_written, iov, n_iov, 1444 &bytes_written, iov, n_iov,
1509 long_op); 1445 long_op);
1510 cifsFileInfo_put(open_file); 1446 cifsFileInfo_put(open_file);
1511 cifs_update_eof(cifsi, offset, bytes_written); 1447 cifs_update_eof(cifsi, offset, bytes_written);
1448 }
1512 1449
1513 if (rc || bytes_written < bytes_to_write) { 1450 if (rc || bytes_written < bytes_to_write) {
1514 cERROR(1, "Write2 ret %d, wrote %d", 1451 cERROR(1, "Write2 ret %d, wrote %d",
1515 rc, bytes_written); 1452 rc, bytes_written);
1516 /* BB what if continued retry is 1453 mapping_set_error(mapping, rc);
1517 requested via mount flags? */ 1454 } else {
1518 if (rc == -ENOSPC) 1455 cifs_stats_bytes_written(tcon, bytes_written);
1519 set_bit(AS_ENOSPC, &mapping->flags);
1520 else
1521 set_bit(AS_EIO, &mapping->flags);
1522 } else {
1523 cifs_stats_bytes_written(cifs_sb->tcon,
1524 bytes_written);
1525 }
1526 } 1456 }
1457
1527 for (i = 0; i < n_iov; i++) { 1458 for (i = 0; i < n_iov; i++) {
1528 page = pvec.pages[first + i]; 1459 page = pvec.pages[first + i];
1529 /* Should we also set page error on 1460 /* Should we also set page error on
@@ -1624,7 +1555,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1624 /* BB check if anything else missing out of ppw 1555 /* BB check if anything else missing out of ppw
1625 such as updating last write time */ 1556 such as updating last write time */
1626 page_data = kmap(page); 1557 page_data = kmap(page);
1627 rc = cifs_write(file, page_data + offset, copied, &pos); 1558 rc = cifs_write(file->private_data, page_data + offset,
1559 copied, &pos);
1628 /* if (rc < 0) should we set writebehind rc? */ 1560 /* if (rc < 0) should we set writebehind rc? */
1629 kunmap(page); 1561 kunmap(page);
1630 1562
@@ -1663,11 +1595,10 @@ int cifs_fsync(struct file *file, int datasync)
1663 1595
1664 rc = filemap_write_and_wait(inode->i_mapping); 1596 rc = filemap_write_and_wait(inode->i_mapping);
1665 if (rc == 0) { 1597 if (rc == 0) {
1666 rc = CIFS_I(inode)->write_behind_rc; 1598 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1667 CIFS_I(inode)->write_behind_rc = 0; 1599
1668 tcon = CIFS_SB(inode->i_sb)->tcon; 1600 tcon = tlink_tcon(smbfile->tlink);
1669 if (!rc && tcon && smbfile && 1601 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
1670 !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
1671 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); 1602 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
1672 } 1603 }
1673 1604
@@ -1712,21 +1643,8 @@ int cifs_flush(struct file *file, fl_owner_t id)
1712 struct inode *inode = file->f_path.dentry->d_inode; 1643 struct inode *inode = file->f_path.dentry->d_inode;
1713 int rc = 0; 1644 int rc = 0;
1714 1645
1715 /* Rather than do the steps manually: 1646 if (file->f_mode & FMODE_WRITE)
1716 lock the inode for writing 1647 rc = filemap_write_and_wait(inode->i_mapping);
1717 loop through pages looking for write behind data (dirty pages)
1718 coalesce into contiguous 16K (or smaller) chunks to write to server
1719 send to server (prefer in parallel)
1720 deal with writebehind errors
1721 unlock inode for writing
1722 filemapfdatawrite appears easier for the time being */
1723
1724 rc = filemap_fdatawrite(inode->i_mapping);
1725 /* reset wb rc if we were able to write out dirty pages */
1726 if (!rc) {
1727 rc = CIFS_I(inode)->write_behind_rc;
1728 CIFS_I(inode)->write_behind_rc = 0;
1729 }
1730 1648
1731 cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc); 1649 cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
1732 1650
@@ -1750,7 +1668,6 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1750 1668
1751 xid = GetXid(); 1669 xid = GetXid();
1752 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1670 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1753 pTcon = cifs_sb->tcon;
1754 1671
1755 if (file->private_data == NULL) { 1672 if (file->private_data == NULL) {
1756 rc = -EBADF; 1673 rc = -EBADF;
@@ -1758,6 +1675,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1758 return rc; 1675 return rc;
1759 } 1676 }
1760 open_file = file->private_data; 1677 open_file = file->private_data;
1678 pTcon = tlink_tcon(open_file->tlink);
1761 1679
1762 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1680 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1763 cFYI(1, "attempting read on write only file instance"); 1681 cFYI(1, "attempting read on write only file instance");
@@ -1771,9 +1689,8 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1771 smb_read_data = NULL; 1689 smb_read_data = NULL;
1772 while (rc == -EAGAIN) { 1690 while (rc == -EAGAIN) {
1773 int buf_type = CIFS_NO_BUFFER; 1691 int buf_type = CIFS_NO_BUFFER;
1774 if ((open_file->invalidHandle) && 1692 if (open_file->invalidHandle) {
1775 (!open_file->closePend)) { 1693 rc = cifs_reopen_file(open_file, true);
1776 rc = cifs_reopen_file(file, true);
1777 if (rc != 0) 1694 if (rc != 0)
1778 break; 1695 break;
1779 } 1696 }
@@ -1831,7 +1748,6 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1831 1748
1832 xid = GetXid(); 1749 xid = GetXid();
1833 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1750 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1834 pTcon = cifs_sb->tcon;
1835 1751
1836 if (file->private_data == NULL) { 1752 if (file->private_data == NULL) {
1837 rc = -EBADF; 1753 rc = -EBADF;
@@ -1839,6 +1755,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1839 return rc; 1755 return rc;
1840 } 1756 }
1841 open_file = file->private_data; 1757 open_file = file->private_data;
1758 pTcon = tlink_tcon(open_file->tlink);
1842 1759
1843 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1760 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1844 cFYI(1, "attempting read on write only file instance"); 1761 cFYI(1, "attempting read on write only file instance");
@@ -1857,9 +1774,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1857 } 1774 }
1858 rc = -EAGAIN; 1775 rc = -EAGAIN;
1859 while (rc == -EAGAIN) { 1776 while (rc == -EAGAIN) {
1860 if ((open_file->invalidHandle) && 1777 if (open_file->invalidHandle) {
1861 (!open_file->closePend)) { 1778 rc = cifs_reopen_file(open_file, true);
1862 rc = cifs_reopen_file(file, true);
1863 if (rc != 0) 1779 if (rc != 0)
1864 break; 1780 break;
1865 } 1781 }
@@ -1974,7 +1890,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1974 } 1890 }
1975 open_file = file->private_data; 1891 open_file = file->private_data;
1976 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1892 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1977 pTcon = cifs_sb->tcon; 1893 pTcon = tlink_tcon(open_file->tlink);
1978 1894
1979 /* 1895 /*
1980 * Reads as many pages as possible from fscache. Returns -ENOBUFS 1896 * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2022,9 +1938,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2022 read_size, contig_pages); 1938 read_size, contig_pages);
2023 rc = -EAGAIN; 1939 rc = -EAGAIN;
2024 while (rc == -EAGAIN) { 1940 while (rc == -EAGAIN) {
2025 if ((open_file->invalidHandle) && 1941 if (open_file->invalidHandle) {
2026 (!open_file->closePend)) { 1942 rc = cifs_reopen_file(open_file, true);
2027 rc = cifs_reopen_file(file, true);
2028 if (rc != 0) 1943 if (rc != 0)
2029 break; 1944 break;
2030 } 1945 }
@@ -2173,18 +2088,14 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
2173{ 2088{
2174 struct cifsFileInfo *open_file; 2089 struct cifsFileInfo *open_file;
2175 2090
2176 read_lock(&GlobalSMBSeslock); 2091 spin_lock(&cifs_file_list_lock);
2177 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 2092 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
2178 if (open_file->closePend) 2093 if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
2179 continue; 2094 spin_unlock(&cifs_file_list_lock);
2180 if (open_file->pfile &&
2181 ((open_file->pfile->f_flags & O_RDWR) ||
2182 (open_file->pfile->f_flags & O_WRONLY))) {
2183 read_unlock(&GlobalSMBSeslock);
2184 return 1; 2095 return 1;
2185 } 2096 }
2186 } 2097 }
2187 read_unlock(&GlobalSMBSeslock); 2098 spin_unlock(&cifs_file_list_lock);
2188 return 0; 2099 return 0;
2189} 2100}
2190 2101
@@ -2310,10 +2221,9 @@ void cifs_oplock_break(struct work_struct *work)
2310{ 2221{
2311 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, 2222 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2312 oplock_break); 2223 oplock_break);
2313 struct inode *inode = cfile->pInode; 2224 struct inode *inode = cfile->dentry->d_inode;
2314 struct cifsInodeInfo *cinode = CIFS_I(inode); 2225 struct cifsInodeInfo *cinode = CIFS_I(inode);
2315 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb); 2226 int rc = 0;
2316 int rc, waitrc = 0;
2317 2227
2318 if (inode && S_ISREG(inode->i_mode)) { 2228 if (inode && S_ISREG(inode->i_mode)) {
2319 if (cinode->clientCanCacheRead) 2229 if (cinode->clientCanCacheRead)
@@ -2322,13 +2232,10 @@ void cifs_oplock_break(struct work_struct *work)
2322 break_lease(inode, O_WRONLY); 2232 break_lease(inode, O_WRONLY);
2323 rc = filemap_fdatawrite(inode->i_mapping); 2233 rc = filemap_fdatawrite(inode->i_mapping);
2324 if (cinode->clientCanCacheRead == 0) { 2234 if (cinode->clientCanCacheRead == 0) {
2325 waitrc = filemap_fdatawait(inode->i_mapping); 2235 rc = filemap_fdatawait(inode->i_mapping);
2236 mapping_set_error(inode->i_mapping, rc);
2326 invalidate_remote_inode(inode); 2237 invalidate_remote_inode(inode);
2327 } 2238 }
2328 if (!rc)
2329 rc = waitrc;
2330 if (rc)
2331 cinode->write_behind_rc = rc;
2332 cFYI(1, "Oplock flush inode %p rc %d", inode, rc); 2239 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
2333 } 2240 }
2334 2241
@@ -2338,33 +2245,34 @@ void cifs_oplock_break(struct work_struct *work)
2338 * not bother sending an oplock release if session to server still is 2245 * not bother sending an oplock release if session to server still is
2339 * disconnected since oplock already released by the server 2246 * disconnected since oplock already released by the server
2340 */ 2247 */
2341 if (!cfile->closePend && !cfile->oplock_break_cancelled) { 2248 if (!cfile->oplock_break_cancelled) {
2342 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0, 2249 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
2343 LOCKING_ANDX_OPLOCK_RELEASE, false); 2250 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
2344 cFYI(1, "Oplock release rc = %d", rc); 2251 cFYI(1, "Oplock release rc = %d", rc);
2345 } 2252 }
2346 2253
2347 /* 2254 /*
2348 * We might have kicked in before is_valid_oplock_break() 2255 * We might have kicked in before is_valid_oplock_break()
2349 * finished grabbing reference for us. Make sure it's done by 2256 * finished grabbing reference for us. Make sure it's done by
2350 * waiting for GlobalSMSSeslock. 2257 * waiting for cifs_file_list_lock.
2351 */ 2258 */
2352 write_lock(&GlobalSMBSeslock); 2259 spin_lock(&cifs_file_list_lock);
2353 write_unlock(&GlobalSMBSeslock); 2260 spin_unlock(&cifs_file_list_lock);
2354 2261
2355 cifs_oplock_break_put(cfile); 2262 cifs_oplock_break_put(cfile);
2356} 2263}
2357 2264
2265/* must be called while holding cifs_file_list_lock */
2358void cifs_oplock_break_get(struct cifsFileInfo *cfile) 2266void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2359{ 2267{
2360 mntget(cfile->mnt); 2268 cifs_sb_active(cfile->dentry->d_sb);
2361 cifsFileInfo_get(cfile); 2269 cifsFileInfo_get(cfile);
2362} 2270}
2363 2271
2364void cifs_oplock_break_put(struct cifsFileInfo *cfile) 2272void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2365{ 2273{
2366 mntput(cfile->mnt);
2367 cifsFileInfo_put(cfile); 2274 cifsFileInfo_put(cfile);
2275 cifs_sb_deactive(cfile->dentry->d_sb);
2368} 2276}
2369 2277
2370const struct address_space_operations cifs_addr_ops = { 2278const struct address_space_operations cifs_addr_ops = {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 9f3f5c4be161..a2ad94efcfe6 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -62,15 +62,15 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
62{ 62{
63 struct cifsInodeInfo *cifsi = CIFS_I(inode); 63 struct cifsInodeInfo *cifsi = CIFS_I(inode);
64 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 64 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
65 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
65 66
66 if (cifsi->fscache) 67 if (cifsi->fscache)
67 return; 68 return;
68 69
69 cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, 70 cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
70 &cifs_fscache_inode_object_def, 71 &cifs_fscache_inode_object_def, cifsi);
71 cifsi); 72 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
72 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", 73 cifsi->fscache);
73 cifs_sb->tcon->fscache, cifsi->fscache);
74} 74}
75 75
76void cifs_fscache_release_inode_cookie(struct inode *inode) 76void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -117,7 +117,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
117 /* retire the current fscache cache and get a new one */ 117 /* retire the current fscache cache and get a new one */
118 fscache_relinquish_cookie(cifsi->fscache, 1); 118 fscache_relinquish_cookie(cifsi->fscache, 1);
119 119
120 cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, 120 cifsi->fscache = fscache_acquire_cookie(
121 cifs_sb_master_tcon(cifs_sb)->fscache,
121 &cifs_fscache_inode_object_def, 122 &cifs_fscache_inode_object_def,
122 cifsi); 123 cifsi);
123 cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p", 124 cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 53cce8cc2224..ef3a55bf86b6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -52,7 +52,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
52 52
53 53
54 /* check if server can support readpages */ 54 /* check if server can support readpages */
55 if (cifs_sb->tcon->ses->server->maxBuf < 55 if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
56 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) 56 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
57 inode->i_data.a_ops = &cifs_addr_ops_smallbuf; 57 inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
58 else 58 else
@@ -288,8 +288,8 @@ int cifs_get_file_info_unix(struct file *filp)
288 struct cifs_fattr fattr; 288 struct cifs_fattr fattr;
289 struct inode *inode = filp->f_path.dentry->d_inode; 289 struct inode *inode = filp->f_path.dentry->d_inode;
290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
291 struct cifsTconInfo *tcon = cifs_sb->tcon;
292 struct cifsFileInfo *cfile = filp->private_data; 291 struct cifsFileInfo *cfile = filp->private_data;
292 struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
293 293
294 xid = GetXid(); 294 xid = GetXid();
295 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); 295 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -313,15 +313,21 @@ int cifs_get_inode_info_unix(struct inode **pinode,
313 FILE_UNIX_BASIC_INFO find_data; 313 FILE_UNIX_BASIC_INFO find_data;
314 struct cifs_fattr fattr; 314 struct cifs_fattr fattr;
315 struct cifsTconInfo *tcon; 315 struct cifsTconInfo *tcon;
316 struct tcon_link *tlink;
316 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 317 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
317 318
318 tcon = cifs_sb->tcon;
319 cFYI(1, "Getting info on %s", full_path); 319 cFYI(1, "Getting info on %s", full_path);
320 320
321 tlink = cifs_sb_tlink(cifs_sb);
322 if (IS_ERR(tlink))
323 return PTR_ERR(tlink);
324 tcon = tlink_tcon(tlink);
325
321 /* could have done a find first instead but this returns more info */ 326 /* could have done a find first instead but this returns more info */
322 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, 327 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
323 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 328 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
324 CIFS_MOUNT_MAP_SPECIAL_CHR); 329 CIFS_MOUNT_MAP_SPECIAL_CHR);
330 cifs_put_tlink(tlink);
325 331
326 if (!rc) { 332 if (!rc) {
327 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); 333 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
@@ -332,6 +338,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
332 return rc; 338 return rc;
333 } 339 }
334 340
341 /* check for Minshall+French symlinks */
342 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
343 int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
344 if (tmprc)
345 cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
346 }
347
335 if (*pinode == NULL) { 348 if (*pinode == NULL) {
336 /* get new inode */ 349 /* get new inode */
337 cifs_fill_uniqueid(sb, &fattr); 350 cifs_fill_uniqueid(sb, &fattr);
@@ -353,7 +366,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
353 int rc; 366 int rc;
354 int oplock = 0; 367 int oplock = 0;
355 __u16 netfid; 368 __u16 netfid;
356 struct cifsTconInfo *pTcon = cifs_sb->tcon; 369 struct tcon_link *tlink;
370 struct cifsTconInfo *tcon;
357 char buf[24]; 371 char buf[24];
358 unsigned int bytes_read; 372 unsigned int bytes_read;
359 char *pbuf; 373 char *pbuf;
@@ -372,7 +386,12 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
372 return -EINVAL; /* EOPNOTSUPP? */ 386 return -EINVAL; /* EOPNOTSUPP? */
373 } 387 }
374 388
375 rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ, 389 tlink = cifs_sb_tlink(cifs_sb);
390 if (IS_ERR(tlink))
391 return PTR_ERR(tlink);
392 tcon = tlink_tcon(tlink);
393
394 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
376 CREATE_NOT_DIR, &netfid, &oplock, NULL, 395 CREATE_NOT_DIR, &netfid, &oplock, NULL,
377 cifs_sb->local_nls, 396 cifs_sb->local_nls,
378 cifs_sb->mnt_cifs_flags & 397 cifs_sb->mnt_cifs_flags &
@@ -380,7 +399,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
380 if (rc == 0) { 399 if (rc == 0) {
381 int buf_type = CIFS_NO_BUFFER; 400 int buf_type = CIFS_NO_BUFFER;
382 /* Read header */ 401 /* Read header */
383 rc = CIFSSMBRead(xid, pTcon, netfid, 402 rc = CIFSSMBRead(xid, tcon, netfid,
384 24 /* length */, 0 /* offset */, 403 24 /* length */, 0 /* offset */,
385 &bytes_read, &pbuf, &buf_type); 404 &bytes_read, &pbuf, &buf_type);
386 if ((rc == 0) && (bytes_read >= 8)) { 405 if ((rc == 0) && (bytes_read >= 8)) {
@@ -422,8 +441,9 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
422 fattr->cf_dtype = DT_REG; 441 fattr->cf_dtype = DT_REG;
423 rc = -EOPNOTSUPP; /* or some unknown SFU type */ 442 rc = -EOPNOTSUPP; /* or some unknown SFU type */
424 } 443 }
425 CIFSSMBClose(xid, pTcon, netfid); 444 CIFSSMBClose(xid, tcon, netfid);
426 } 445 }
446 cifs_put_tlink(tlink);
427 return rc; 447 return rc;
428} 448}
429 449
@@ -441,11 +461,19 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
441 ssize_t rc; 461 ssize_t rc;
442 char ea_value[4]; 462 char ea_value[4];
443 __u32 mode; 463 __u32 mode;
464 struct tcon_link *tlink;
465 struct cifsTconInfo *tcon;
466
467 tlink = cifs_sb_tlink(cifs_sb);
468 if (IS_ERR(tlink))
469 return PTR_ERR(tlink);
470 tcon = tlink_tcon(tlink);
444 471
445 rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS", 472 rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
446 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 473 ea_value, 4 /* size of buf */, cifs_sb->local_nls,
447 cifs_sb->mnt_cifs_flags & 474 cifs_sb->mnt_cifs_flags &
448 CIFS_MOUNT_MAP_SPECIAL_CHR); 475 CIFS_MOUNT_MAP_SPECIAL_CHR);
476 cifs_put_tlink(tlink);
449 if (rc < 0) 477 if (rc < 0)
450 return (int)rc; 478 return (int)rc;
451 else if (rc > 3) { 479 else if (rc > 3) {
@@ -468,6 +496,8 @@ static void
468cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, 496cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
469 struct cifs_sb_info *cifs_sb, bool adjust_tz) 497 struct cifs_sb_info *cifs_sb, bool adjust_tz)
470{ 498{
499 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
500
471 memset(fattr, 0, sizeof(*fattr)); 501 memset(fattr, 0, sizeof(*fattr));
472 fattr->cf_cifsattrs = le32_to_cpu(info->Attributes); 502 fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
473 if (info->DeletePending) 503 if (info->DeletePending)
@@ -482,8 +512,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
482 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); 512 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
483 513
484 if (adjust_tz) { 514 if (adjust_tz) {
485 fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj; 515 fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
486 fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj; 516 fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
487 } 517 }
488 518
489 fattr->cf_eof = le64_to_cpu(info->EndOfFile); 519 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
@@ -515,8 +545,8 @@ int cifs_get_file_info(struct file *filp)
515 struct cifs_fattr fattr; 545 struct cifs_fattr fattr;
516 struct inode *inode = filp->f_path.dentry->d_inode; 546 struct inode *inode = filp->f_path.dentry->d_inode;
517 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 547 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
518 struct cifsTconInfo *tcon = cifs_sb->tcon;
519 struct cifsFileInfo *cfile = filp->private_data; 548 struct cifsFileInfo *cfile = filp->private_data;
549 struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
520 550
521 xid = GetXid(); 551 xid = GetXid();
522 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); 552 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -554,26 +584,33 @@ int cifs_get_inode_info(struct inode **pinode,
554{ 584{
555 int rc = 0, tmprc; 585 int rc = 0, tmprc;
556 struct cifsTconInfo *pTcon; 586 struct cifsTconInfo *pTcon;
587 struct tcon_link *tlink;
557 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 588 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
558 char *buf = NULL; 589 char *buf = NULL;
559 bool adjustTZ = false; 590 bool adjustTZ = false;
560 struct cifs_fattr fattr; 591 struct cifs_fattr fattr;
561 592
562 pTcon = cifs_sb->tcon; 593 tlink = cifs_sb_tlink(cifs_sb);
594 if (IS_ERR(tlink))
595 return PTR_ERR(tlink);
596 pTcon = tlink_tcon(tlink);
597
563 cFYI(1, "Getting info on %s", full_path); 598 cFYI(1, "Getting info on %s", full_path);
564 599
565 if ((pfindData == NULL) && (*pinode != NULL)) { 600 if ((pfindData == NULL) && (*pinode != NULL)) {
566 if (CIFS_I(*pinode)->clientCanCacheRead) { 601 if (CIFS_I(*pinode)->clientCanCacheRead) {
567 cFYI(1, "No need to revalidate cached inode sizes"); 602 cFYI(1, "No need to revalidate cached inode sizes");
568 return rc; 603 goto cgii_exit;
569 } 604 }
570 } 605 }
571 606
572 /* if file info not passed in then get it from server */ 607 /* if file info not passed in then get it from server */
573 if (pfindData == NULL) { 608 if (pfindData == NULL) {
574 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 609 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
575 if (buf == NULL) 610 if (buf == NULL) {
576 return -ENOMEM; 611 rc = -ENOMEM;
612 goto cgii_exit;
613 }
577 pfindData = (FILE_ALL_INFO *)buf; 614 pfindData = (FILE_ALL_INFO *)buf;
578 615
579 /* could do find first instead but this returns more info */ 616 /* could do find first instead but this returns more info */
@@ -661,6 +698,13 @@ int cifs_get_inode_info(struct inode **pinode,
661 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) 698 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
662 cifs_sfu_mode(&fattr, full_path, cifs_sb, xid); 699 cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
663 700
701 /* check for Minshall+French symlinks */
702 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
703 tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
704 if (tmprc)
705 cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
706 }
707
664 if (!*pinode) { 708 if (!*pinode) {
665 *pinode = cifs_iget(sb, &fattr); 709 *pinode = cifs_iget(sb, &fattr);
666 if (!*pinode) 710 if (!*pinode)
@@ -671,6 +715,7 @@ int cifs_get_inode_info(struct inode **pinode,
671 715
672cgii_exit: 716cgii_exit:
673 kfree(buf); 717 kfree(buf);
718 cifs_put_tlink(tlink);
674 return rc; 719 return rc;
675} 720}
676 721
@@ -683,6 +728,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
683 int pplen = cifs_sb->prepathlen; 728 int pplen = cifs_sb->prepathlen;
684 int dfsplen; 729 int dfsplen;
685 char *full_path = NULL; 730 char *full_path = NULL;
731 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
686 732
687 /* if no prefix path, simply set path to the root of share to "" */ 733 /* if no prefix path, simply set path to the root of share to "" */
688 if (pplen == 0) { 734 if (pplen == 0) {
@@ -692,8 +738,8 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
692 return full_path; 738 return full_path;
693 } 739 }
694 740
695 if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS)) 741 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
696 dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1); 742 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
697 else 743 else
698 dfsplen = 0; 744 dfsplen = 0;
699 745
@@ -702,7 +748,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
702 return full_path; 748 return full_path;
703 749
704 if (dfsplen) { 750 if (dfsplen) {
705 strncpy(full_path, cifs_sb->tcon->treeName, dfsplen); 751 strncpy(full_path, tcon->treeName, dfsplen);
706 /* switch slash direction in prepath depending on whether 752 /* switch slash direction in prepath depending on whether
707 * windows or posix style path names 753 * windows or posix style path names
708 */ 754 */
@@ -818,18 +864,18 @@ retry_iget5_locked:
818struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) 864struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
819{ 865{
820 int xid; 866 int xid;
821 struct cifs_sb_info *cifs_sb; 867 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
822 struct inode *inode = NULL; 868 struct inode *inode = NULL;
823 long rc; 869 long rc;
824 char *full_path; 870 char *full_path;
871 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
825 872
826 cifs_sb = CIFS_SB(sb);
827 full_path = cifs_build_path_to_root(cifs_sb); 873 full_path = cifs_build_path_to_root(cifs_sb);
828 if (full_path == NULL) 874 if (full_path == NULL)
829 return ERR_PTR(-ENOMEM); 875 return ERR_PTR(-ENOMEM);
830 876
831 xid = GetXid(); 877 xid = GetXid();
832 if (cifs_sb->tcon->unix_ext) 878 if (tcon->unix_ext)
833 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 879 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
834 else 880 else
835 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 881 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -840,10 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
840 886
841#ifdef CONFIG_CIFS_FSCACHE 887#ifdef CONFIG_CIFS_FSCACHE
842 /* populate tcon->resource_id */ 888 /* populate tcon->resource_id */
843 cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid; 889 tcon->resource_id = CIFS_I(inode)->uniqueid;
844#endif 890#endif
845 891
846 if (rc && cifs_sb->tcon->ipc) { 892 if (rc && tcon->ipc) {
847 cFYI(1, "ipc connection - fake read inode"); 893 cFYI(1, "ipc connection - fake read inode");
848 inode->i_mode |= S_IFDIR; 894 inode->i_mode |= S_IFDIR;
849 inode->i_nlink = 2; 895 inode->i_nlink = 2;
@@ -879,7 +925,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
879 struct cifsFileInfo *open_file; 925 struct cifsFileInfo *open_file;
880 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 926 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
881 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 927 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
882 struct cifsTconInfo *pTcon = cifs_sb->tcon; 928 struct tcon_link *tlink = NULL;
929 struct cifsTconInfo *pTcon;
883 FILE_BASIC_INFO info_buf; 930 FILE_BASIC_INFO info_buf;
884 931
885 if (attrs == NULL) 932 if (attrs == NULL)
@@ -918,13 +965,22 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
918 /* 965 /*
919 * If the file is already open for write, just use that fileid 966 * If the file is already open for write, just use that fileid
920 */ 967 */
921 open_file = find_writable_file(cifsInode); 968 open_file = find_writable_file(cifsInode, true);
922 if (open_file) { 969 if (open_file) {
923 netfid = open_file->netfid; 970 netfid = open_file->netfid;
924 netpid = open_file->pid; 971 netpid = open_file->pid;
972 pTcon = tlink_tcon(open_file->tlink);
925 goto set_via_filehandle; 973 goto set_via_filehandle;
926 } 974 }
927 975
976 tlink = cifs_sb_tlink(cifs_sb);
977 if (IS_ERR(tlink)) {
978 rc = PTR_ERR(tlink);
979 tlink = NULL;
980 goto out;
981 }
982 pTcon = tlink_tcon(tlink);
983
928 /* 984 /*
929 * NT4 apparently returns success on this call, but it doesn't 985 * NT4 apparently returns success on this call, but it doesn't
930 * really work. 986 * really work.
@@ -968,6 +1024,8 @@ set_via_filehandle:
968 else 1024 else
969 cifsFileInfo_put(open_file); 1025 cifsFileInfo_put(open_file);
970out: 1026out:
1027 if (tlink != NULL)
1028 cifs_put_tlink(tlink);
971 return rc; 1029 return rc;
972} 1030}
973 1031
@@ -985,10 +1043,16 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
985 struct inode *inode = dentry->d_inode; 1043 struct inode *inode = dentry->d_inode;
986 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1044 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
987 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1045 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
988 struct cifsTconInfo *tcon = cifs_sb->tcon; 1046 struct tcon_link *tlink;
1047 struct cifsTconInfo *tcon;
989 __u32 dosattr, origattr; 1048 __u32 dosattr, origattr;
990 FILE_BASIC_INFO *info_buf = NULL; 1049 FILE_BASIC_INFO *info_buf = NULL;
991 1050
1051 tlink = cifs_sb_tlink(cifs_sb);
1052 if (IS_ERR(tlink))
1053 return PTR_ERR(tlink);
1054 tcon = tlink_tcon(tlink);
1055
992 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 1056 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
993 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, 1057 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
994 &netfid, &oplock, NULL, cifs_sb->local_nls, 1058 &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -1057,6 +1121,7 @@ out_close:
1057 CIFSSMBClose(xid, tcon, netfid); 1121 CIFSSMBClose(xid, tcon, netfid);
1058out: 1122out:
1059 kfree(info_buf); 1123 kfree(info_buf);
1124 cifs_put_tlink(tlink);
1060 return rc; 1125 return rc;
1061 1126
1062 /* 1127 /*
@@ -1096,12 +1161,18 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1096 struct cifsInodeInfo *cifs_inode; 1161 struct cifsInodeInfo *cifs_inode;
1097 struct super_block *sb = dir->i_sb; 1162 struct super_block *sb = dir->i_sb;
1098 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 1163 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
1099 struct cifsTconInfo *tcon = cifs_sb->tcon; 1164 struct tcon_link *tlink;
1165 struct cifsTconInfo *tcon;
1100 struct iattr *attrs = NULL; 1166 struct iattr *attrs = NULL;
1101 __u32 dosattr = 0, origattr = 0; 1167 __u32 dosattr = 0, origattr = 0;
1102 1168
1103 cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry); 1169 cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
1104 1170
1171 tlink = cifs_sb_tlink(cifs_sb);
1172 if (IS_ERR(tlink))
1173 return PTR_ERR(tlink);
1174 tcon = tlink_tcon(tlink);
1175
1105 xid = GetXid(); 1176 xid = GetXid();
1106 1177
1107 /* Unlink can be called from rename so we can not take the 1178 /* Unlink can be called from rename so we can not take the
@@ -1109,8 +1180,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1109 full_path = build_path_from_dentry(dentry); 1180 full_path = build_path_from_dentry(dentry);
1110 if (full_path == NULL) { 1181 if (full_path == NULL) {
1111 rc = -ENOMEM; 1182 rc = -ENOMEM;
1112 FreeXid(xid); 1183 goto unlink_out;
1113 return rc;
1114 } 1184 }
1115 1185
1116 if ((tcon->ses->capabilities & CAP_UNIX) && 1186 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1176,10 +1246,11 @@ out_reval:
1176 dir->i_ctime = dir->i_mtime = current_fs_time(sb); 1246 dir->i_ctime = dir->i_mtime = current_fs_time(sb);
1177 cifs_inode = CIFS_I(dir); 1247 cifs_inode = CIFS_I(dir);
1178 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ 1248 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
1179 1249unlink_out:
1180 kfree(full_path); 1250 kfree(full_path);
1181 kfree(attrs); 1251 kfree(attrs);
1182 FreeXid(xid); 1252 FreeXid(xid);
1253 cifs_put_tlink(tlink);
1183 return rc; 1254 return rc;
1184} 1255}
1185 1256
@@ -1188,6 +1259,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1188 int rc = 0, tmprc; 1259 int rc = 0, tmprc;
1189 int xid; 1260 int xid;
1190 struct cifs_sb_info *cifs_sb; 1261 struct cifs_sb_info *cifs_sb;
1262 struct tcon_link *tlink;
1191 struct cifsTconInfo *pTcon; 1263 struct cifsTconInfo *pTcon;
1192 char *full_path = NULL; 1264 char *full_path = NULL;
1193 struct inode *newinode = NULL; 1265 struct inode *newinode = NULL;
@@ -1195,16 +1267,18 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1195 1267
1196 cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode); 1268 cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
1197 1269
1198 xid = GetXid();
1199
1200 cifs_sb = CIFS_SB(inode->i_sb); 1270 cifs_sb = CIFS_SB(inode->i_sb);
1201 pTcon = cifs_sb->tcon; 1271 tlink = cifs_sb_tlink(cifs_sb);
1272 if (IS_ERR(tlink))
1273 return PTR_ERR(tlink);
1274 pTcon = tlink_tcon(tlink);
1275
1276 xid = GetXid();
1202 1277
1203 full_path = build_path_from_dentry(direntry); 1278 full_path = build_path_from_dentry(direntry);
1204 if (full_path == NULL) { 1279 if (full_path == NULL) {
1205 rc = -ENOMEM; 1280 rc = -ENOMEM;
1206 FreeXid(xid); 1281 goto mkdir_out;
1207 return rc;
1208 } 1282 }
1209 1283
1210 if ((pTcon->ses->capabilities & CAP_UNIX) && 1284 if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1362,6 +1436,7 @@ mkdir_get_info:
1362mkdir_out: 1436mkdir_out:
1363 kfree(full_path); 1437 kfree(full_path);
1364 FreeXid(xid); 1438 FreeXid(xid);
1439 cifs_put_tlink(tlink);
1365 return rc; 1440 return rc;
1366} 1441}
1367 1442
@@ -1370,6 +1445,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1370 int rc = 0; 1445 int rc = 0;
1371 int xid; 1446 int xid;
1372 struct cifs_sb_info *cifs_sb; 1447 struct cifs_sb_info *cifs_sb;
1448 struct tcon_link *tlink;
1373 struct cifsTconInfo *pTcon; 1449 struct cifsTconInfo *pTcon;
1374 char *full_path = NULL; 1450 char *full_path = NULL;
1375 struct cifsInodeInfo *cifsInode; 1451 struct cifsInodeInfo *cifsInode;
@@ -1378,18 +1454,23 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1378 1454
1379 xid = GetXid(); 1455 xid = GetXid();
1380 1456
1381 cifs_sb = CIFS_SB(inode->i_sb);
1382 pTcon = cifs_sb->tcon;
1383
1384 full_path = build_path_from_dentry(direntry); 1457 full_path = build_path_from_dentry(direntry);
1385 if (full_path == NULL) { 1458 if (full_path == NULL) {
1386 rc = -ENOMEM; 1459 rc = -ENOMEM;
1387 FreeXid(xid); 1460 goto rmdir_exit;
1388 return rc; 1461 }
1462
1463 cifs_sb = CIFS_SB(inode->i_sb);
1464 tlink = cifs_sb_tlink(cifs_sb);
1465 if (IS_ERR(tlink)) {
1466 rc = PTR_ERR(tlink);
1467 goto rmdir_exit;
1389 } 1468 }
1469 pTcon = tlink_tcon(tlink);
1390 1470
1391 rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls, 1471 rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
1392 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1472 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1473 cifs_put_tlink(tlink);
1393 1474
1394 if (!rc) { 1475 if (!rc) {
1395 drop_nlink(inode); 1476 drop_nlink(inode);
@@ -1410,6 +1491,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1410 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = 1491 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
1411 current_fs_time(inode->i_sb); 1492 current_fs_time(inode->i_sb);
1412 1493
1494rmdir_exit:
1413 kfree(full_path); 1495 kfree(full_path);
1414 FreeXid(xid); 1496 FreeXid(xid);
1415 return rc; 1497 return rc;
@@ -1420,10 +1502,16 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1420 struct dentry *to_dentry, const char *toPath) 1502 struct dentry *to_dentry, const char *toPath)
1421{ 1503{
1422 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); 1504 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
1423 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1505 struct tcon_link *tlink;
1506 struct cifsTconInfo *pTcon;
1424 __u16 srcfid; 1507 __u16 srcfid;
1425 int oplock, rc; 1508 int oplock, rc;
1426 1509
1510 tlink = cifs_sb_tlink(cifs_sb);
1511 if (IS_ERR(tlink))
1512 return PTR_ERR(tlink);
1513 pTcon = tlink_tcon(tlink);
1514
1427 /* try path-based rename first */ 1515 /* try path-based rename first */
1428 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, 1516 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
1429 cifs_sb->mnt_cifs_flags & 1517 cifs_sb->mnt_cifs_flags &
@@ -1435,11 +1523,11 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1435 * rename by filehandle to various Windows servers. 1523 * rename by filehandle to various Windows servers.
1436 */ 1524 */
1437 if (rc == 0 || rc != -ETXTBSY) 1525 if (rc == 0 || rc != -ETXTBSY)
1438 return rc; 1526 goto do_rename_exit;
1439 1527
1440 /* open-file renames don't work across directories */ 1528 /* open-file renames don't work across directories */
1441 if (to_dentry->d_parent != from_dentry->d_parent) 1529 if (to_dentry->d_parent != from_dentry->d_parent)
1442 return rc; 1530 goto do_rename_exit;
1443 1531
1444 /* open the file to be renamed -- we need DELETE perms */ 1532 /* open the file to be renamed -- we need DELETE perms */
1445 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, 1533 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
@@ -1455,7 +1543,8 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1455 1543
1456 CIFSSMBClose(xid, pTcon, srcfid); 1544 CIFSSMBClose(xid, pTcon, srcfid);
1457 } 1545 }
1458 1546do_rename_exit:
1547 cifs_put_tlink(tlink);
1459 return rc; 1548 return rc;
1460} 1549}
1461 1550
@@ -1465,13 +1554,17 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1465 char *fromName = NULL; 1554 char *fromName = NULL;
1466 char *toName = NULL; 1555 char *toName = NULL;
1467 struct cifs_sb_info *cifs_sb; 1556 struct cifs_sb_info *cifs_sb;
1557 struct tcon_link *tlink;
1468 struct cifsTconInfo *tcon; 1558 struct cifsTconInfo *tcon;
1469 FILE_UNIX_BASIC_INFO *info_buf_source = NULL; 1559 FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
1470 FILE_UNIX_BASIC_INFO *info_buf_target; 1560 FILE_UNIX_BASIC_INFO *info_buf_target;
1471 int xid, rc, tmprc; 1561 int xid, rc, tmprc;
1472 1562
1473 cifs_sb = CIFS_SB(source_dir->i_sb); 1563 cifs_sb = CIFS_SB(source_dir->i_sb);
1474 tcon = cifs_sb->tcon; 1564 tlink = cifs_sb_tlink(cifs_sb);
1565 if (IS_ERR(tlink))
1566 return PTR_ERR(tlink);
1567 tcon = tlink_tcon(tlink);
1475 1568
1476 xid = GetXid(); 1569 xid = GetXid();
1477 1570
@@ -1547,6 +1640,7 @@ cifs_rename_exit:
1547 kfree(fromName); 1640 kfree(fromName);
1548 kfree(toName); 1641 kfree(toName);
1549 FreeXid(xid); 1642 FreeXid(xid);
1643 cifs_put_tlink(tlink);
1550 return rc; 1644 return rc;
1551} 1645}
1552 1646
@@ -1588,8 +1682,7 @@ cifs_invalidate_mapping(struct inode *inode)
1588 /* write back any cached data */ 1682 /* write back any cached data */
1589 if (inode->i_mapping && inode->i_mapping->nrpages != 0) { 1683 if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
1590 rc = filemap_write_and_wait(inode->i_mapping); 1684 rc = filemap_write_and_wait(inode->i_mapping);
1591 if (rc) 1685 mapping_set_error(inode->i_mapping, rc);
1592 cifs_i->write_behind_rc = rc;
1593 } 1686 }
1594 invalidate_remote_inode(inode); 1687 invalidate_remote_inode(inode);
1595 cifs_fscache_reset_inode_cookie(inode); 1688 cifs_fscache_reset_inode_cookie(inode);
@@ -1599,11 +1692,12 @@ int cifs_revalidate_file(struct file *filp)
1599{ 1692{
1600 int rc = 0; 1693 int rc = 0;
1601 struct inode *inode = filp->f_path.dentry->d_inode; 1694 struct inode *inode = filp->f_path.dentry->d_inode;
1695 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
1602 1696
1603 if (!cifs_inode_needs_reval(inode)) 1697 if (!cifs_inode_needs_reval(inode))
1604 goto check_inval; 1698 goto check_inval;
1605 1699
1606 if (CIFS_SB(inode->i_sb)->tcon->unix_ext) 1700 if (tlink_tcon(cfile->tlink)->unix_ext)
1607 rc = cifs_get_file_info_unix(filp); 1701 rc = cifs_get_file_info_unix(filp);
1608 else 1702 else
1609 rc = cifs_get_file_info(filp); 1703 rc = cifs_get_file_info(filp);
@@ -1644,7 +1738,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
1644 "jiffies %ld", full_path, inode, inode->i_count.counter, 1738 "jiffies %ld", full_path, inode, inode->i_count.counter,
1645 dentry, dentry->d_time, jiffies); 1739 dentry, dentry->d_time, jiffies);
1646 1740
1647 if (CIFS_SB(sb)->tcon->unix_ext) 1741 if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
1648 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 1742 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1649 else 1743 else
1650 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 1744 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -1660,13 +1754,29 @@ check_inval:
1660} 1754}
1661 1755
1662int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1756int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1663 struct kstat *stat) 1757 struct kstat *stat)
1664{ 1758{
1759 struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
1760 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
1665 int err = cifs_revalidate_dentry(dentry); 1761 int err = cifs_revalidate_dentry(dentry);
1762
1666 if (!err) { 1763 if (!err) {
1667 generic_fillattr(dentry->d_inode, stat); 1764 generic_fillattr(dentry->d_inode, stat);
1668 stat->blksize = CIFS_MAX_MSGSIZE; 1765 stat->blksize = CIFS_MAX_MSGSIZE;
1669 stat->ino = CIFS_I(dentry->d_inode)->uniqueid; 1766 stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
1767
1768 /*
1769 * If on a multiuser mount without unix extensions, and the
1770 * admin hasn't overridden them, set the ownership to the
1771 * fsuid/fsgid of the current process.
1772 */
1773 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
1774 !tcon->unix_ext) {
1775 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
1776 stat->uid = current_fsuid();
1777 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
1778 stat->gid = current_fsgid();
1779 }
1670 } 1780 }
1671 return err; 1781 return err;
1672} 1782}
@@ -1708,7 +1818,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1708 struct cifsFileInfo *open_file; 1818 struct cifsFileInfo *open_file;
1709 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1819 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1710 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1820 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1711 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1821 struct tcon_link *tlink = NULL;
1822 struct cifsTconInfo *pTcon = NULL;
1712 1823
1713 /* 1824 /*
1714 * To avoid spurious oplock breaks from server, in the case of 1825 * To avoid spurious oplock breaks from server, in the case of
@@ -1719,10 +1830,11 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1719 * writebehind data than the SMB timeout for the SetPathInfo 1830 * writebehind data than the SMB timeout for the SetPathInfo
1720 * request would allow 1831 * request would allow
1721 */ 1832 */
1722 open_file = find_writable_file(cifsInode); 1833 open_file = find_writable_file(cifsInode, true);
1723 if (open_file) { 1834 if (open_file) {
1724 __u16 nfid = open_file->netfid; 1835 __u16 nfid = open_file->netfid;
1725 __u32 npid = open_file->pid; 1836 __u32 npid = open_file->pid;
1837 pTcon = tlink_tcon(open_file->tlink);
1726 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1838 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1727 npid, false); 1839 npid, false);
1728 cifsFileInfo_put(open_file); 1840 cifsFileInfo_put(open_file);
@@ -1737,6 +1849,13 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1737 rc = -EINVAL; 1849 rc = -EINVAL;
1738 1850
1739 if (rc != 0) { 1851 if (rc != 0) {
1852 if (pTcon == NULL) {
1853 tlink = cifs_sb_tlink(cifs_sb);
1854 if (IS_ERR(tlink))
1855 return PTR_ERR(tlink);
1856 pTcon = tlink_tcon(tlink);
1857 }
1858
1740 /* Set file size by pathname rather than by handle 1859 /* Set file size by pathname rather than by handle
1741 either because no valid, writeable file handle for 1860 either because no valid, writeable file handle for
1742 it was found or because there was an error setting 1861 it was found or because there was an error setting
@@ -1766,6 +1885,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1766 CIFSSMBClose(xid, pTcon, netfid); 1885 CIFSSMBClose(xid, pTcon, netfid);
1767 } 1886 }
1768 } 1887 }
1888 if (tlink)
1889 cifs_put_tlink(tlink);
1769 } 1890 }
1770 1891
1771 if (rc == 0) { 1892 if (rc == 0) {
@@ -1786,7 +1907,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1786 struct inode *inode = direntry->d_inode; 1907 struct inode *inode = direntry->d_inode;
1787 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1908 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1788 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1909 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1789 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1910 struct tcon_link *tlink;
1911 struct cifsTconInfo *pTcon;
1790 struct cifs_unix_set_info_args *args = NULL; 1912 struct cifs_unix_set_info_args *args = NULL;
1791 struct cifsFileInfo *open_file; 1913 struct cifsFileInfo *open_file;
1792 1914
@@ -1820,10 +1942,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1820 * the flush returns error? 1942 * the flush returns error?
1821 */ 1943 */
1822 rc = filemap_write_and_wait(inode->i_mapping); 1944 rc = filemap_write_and_wait(inode->i_mapping);
1823 if (rc != 0) { 1945 mapping_set_error(inode->i_mapping, rc);
1824 cifsInode->write_behind_rc = rc; 1946 rc = 0;
1825 rc = 0;
1826 }
1827 1947
1828 if (attrs->ia_valid & ATTR_SIZE) { 1948 if (attrs->ia_valid & ATTR_SIZE) {
1829 rc = cifs_set_file_size(inode, attrs, xid, full_path); 1949 rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -1873,17 +1993,25 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1873 args->ctime = NO_CHANGE_64; 1993 args->ctime = NO_CHANGE_64;
1874 1994
1875 args->device = 0; 1995 args->device = 0;
1876 open_file = find_writable_file(cifsInode); 1996 open_file = find_writable_file(cifsInode, true);
1877 if (open_file) { 1997 if (open_file) {
1878 u16 nfid = open_file->netfid; 1998 u16 nfid = open_file->netfid;
1879 u32 npid = open_file->pid; 1999 u32 npid = open_file->pid;
2000 pTcon = tlink_tcon(open_file->tlink);
1880 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); 2001 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1881 cifsFileInfo_put(open_file); 2002 cifsFileInfo_put(open_file);
1882 } else { 2003 } else {
2004 tlink = cifs_sb_tlink(cifs_sb);
2005 if (IS_ERR(tlink)) {
2006 rc = PTR_ERR(tlink);
2007 goto out;
2008 }
2009 pTcon = tlink_tcon(tlink);
1883 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, 2010 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1884 cifs_sb->local_nls, 2011 cifs_sb->local_nls,
1885 cifs_sb->mnt_cifs_flags & 2012 cifs_sb->mnt_cifs_flags &
1886 CIFS_MOUNT_MAP_SPECIAL_CHR); 2013 CIFS_MOUNT_MAP_SPECIAL_CHR);
2014 cifs_put_tlink(tlink);
1887 } 2015 }
1888 2016
1889 if (rc) 2017 if (rc)
@@ -1956,10 +2084,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1956 * the flush returns error? 2084 * the flush returns error?
1957 */ 2085 */
1958 rc = filemap_write_and_wait(inode->i_mapping); 2086 rc = filemap_write_and_wait(inode->i_mapping);
1959 if (rc != 0) { 2087 mapping_set_error(inode->i_mapping, rc);
1960 cifsInode->write_behind_rc = rc; 2088 rc = 0;
1961 rc = 0;
1962 }
1963 2089
1964 if (attrs->ia_valid & ATTR_SIZE) { 2090 if (attrs->ia_valid & ATTR_SIZE) {
1965 rc = cifs_set_file_size(inode, attrs, xid, full_path); 2091 rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -2051,7 +2177,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2051 2177
2052 setattr_copy(inode, attrs); 2178 setattr_copy(inode, attrs);
2053 mark_inode_dirty(inode); 2179 mark_inode_dirty(inode);
2054 return 0;
2055 2180
2056cifs_setattr_exit: 2181cifs_setattr_exit:
2057 kfree(full_path); 2182 kfree(full_path);
@@ -2064,7 +2189,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2064{ 2189{
2065 struct inode *inode = direntry->d_inode; 2190 struct inode *inode = direntry->d_inode;
2066 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 2191 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2067 struct cifsTconInfo *pTcon = cifs_sb->tcon; 2192 struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
2068 2193
2069 if (pTcon->unix_ext) 2194 if (pTcon->unix_ext)
2070 return cifs_setattr_unix(direntry, attrs); 2195 return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 9d38a71c8e14..0c98672d0122 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -37,11 +37,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
37 int xid; 37 int xid;
38 struct cifs_sb_info *cifs_sb; 38 struct cifs_sb_info *cifs_sb;
39#ifdef CONFIG_CIFS_POSIX 39#ifdef CONFIG_CIFS_POSIX
40 struct cifsFileInfo *pSMBFile = filep->private_data;
41 struct cifsTconInfo *tcon;
40 __u64 ExtAttrBits = 0; 42 __u64 ExtAttrBits = 0;
41 __u64 ExtAttrMask = 0; 43 __u64 ExtAttrMask = 0;
42 __u64 caps; 44 __u64 caps;
43 struct cifsTconInfo *tcon;
44 struct cifsFileInfo *pSMBFile = filep->private_data;
45#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
46 46
47 xid = GetXid(); 47 xid = GetXid();
@@ -50,17 +50,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
50 50
51 cifs_sb = CIFS_SB(inode->i_sb); 51 cifs_sb = CIFS_SB(inode->i_sb);
52 52
53#ifdef CONFIG_CIFS_POSIX
54 tcon = cifs_sb->tcon;
55 if (tcon)
56 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
57 else {
58 rc = -EIO;
59 FreeXid(xid);
60 return -EIO;
61 }
62#endif /* CONFIG_CIFS_POSIX */
63
64 switch (command) { 53 switch (command) {
65 case CIFS_IOC_CHECKUMOUNT: 54 case CIFS_IOC_CHECKUMOUNT:
66 cFYI(1, "User unmount attempted"); 55 cFYI(1, "User unmount attempted");
@@ -73,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
73 break; 62 break;
74#ifdef CONFIG_CIFS_POSIX 63#ifdef CONFIG_CIFS_POSIX
75 case FS_IOC_GETFLAGS: 64 case FS_IOC_GETFLAGS:
65 if (pSMBFile == NULL)
66 break;
67 tcon = tlink_tcon(pSMBFile->tlink);
68 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
76 if (CIFS_UNIX_EXTATTR_CAP & caps) { 69 if (CIFS_UNIX_EXTATTR_CAP & caps) {
77 if (pSMBFile == NULL)
78 break;
79 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, 70 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
80 &ExtAttrBits, &ExtAttrMask); 71 &ExtAttrBits, &ExtAttrMask);
81 if (rc == 0) 72 if (rc == 0)
@@ -86,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
86 break; 77 break;
87 78
88 case FS_IOC_SETFLAGS: 79 case FS_IOC_SETFLAGS:
80 if (pSMBFile == NULL)
81 break;
82 tcon = tlink_tcon(pSMBFile->tlink);
83 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
89 if (CIFS_UNIX_EXTATTR_CAP & caps) { 84 if (CIFS_UNIX_EXTATTR_CAP & caps) {
90 if (get_user(ExtAttrBits, (int __user *)arg)) { 85 if (get_user(ExtAttrBits, (int __user *)arg)) {
91 rc = -EFAULT; 86 rc = -EFAULT;
92 break; 87 break;
93 } 88 }
94 if (pSMBFile == NULL)
95 break;
96 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 89 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
97 extAttrBits, &ExtAttrMask);*/ 90 extAttrBits, &ExtAttrMask);*/
98 } 91 }
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 473ca8033656..85cdbf831e7b 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,6 +28,296 @@
28#include "cifsproto.h" 28#include "cifsproto.h"
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31#include "md5.h"
32
33#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
34#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
35#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
36#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024)
37#define CIFS_MF_SYMLINK_FILE_SIZE \
38 (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
39
40#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
41#define CIFS_MF_SYMLINK_MD5_FORMAT \
42 "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
43#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
44 md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \
45 md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \
46 md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\
47 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
48
49static int
50CIFSParseMFSymlink(const u8 *buf,
51 unsigned int buf_len,
52 unsigned int *_link_len,
53 char **_link_str)
54{
55 int rc;
56 unsigned int link_len;
57 const char *md5_str1;
58 const char *link_str;
59 struct MD5Context md5_ctx;
60 u8 md5_hash[16];
61 char md5_str2[34];
62
63 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
64 return -EINVAL;
65
66 md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET];
67 link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET];
68
69 rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len);
70 if (rc != 1)
71 return -EINVAL;
72
73 cifs_MD5_init(&md5_ctx);
74 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
75 cifs_MD5_final(md5_hash, &md5_ctx);
76
77 snprintf(md5_str2, sizeof(md5_str2),
78 CIFS_MF_SYMLINK_MD5_FORMAT,
79 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
80
81 if (strncmp(md5_str1, md5_str2, 17) != 0)
82 return -EINVAL;
83
84 if (_link_str) {
85 *_link_str = kstrndup(link_str, link_len, GFP_KERNEL);
86 if (!*_link_str)
87 return -ENOMEM;
88 }
89
90 *_link_len = link_len;
91 return 0;
92}
93
94static int
95CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
96{
97 unsigned int link_len;
98 unsigned int ofs;
99 struct MD5Context md5_ctx;
100 u8 md5_hash[16];
101
102 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
103 return -EINVAL;
104
105 link_len = strlen(link_str);
106
107 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
108 return -ENAMETOOLONG;
109
110 cifs_MD5_init(&md5_ctx);
111 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
112 cifs_MD5_final(md5_hash, &md5_ctx);
113
114 snprintf(buf, buf_len,
115 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
116 link_len,
117 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
118
119 ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
120 memcpy(buf + ofs, link_str, link_len);
121
122 ofs += link_len;
123 if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
124 buf[ofs] = '\n';
125 ofs++;
126 }
127
128 while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
129 buf[ofs] = ' ';
130 ofs++;
131 }
132
133 return 0;
134}
135
136static int
137CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
138 const char *fromName, const char *toName,
139 const struct nls_table *nls_codepage, int remap)
140{
141 int rc;
142 int oplock = 0;
143 __u16 netfid = 0;
144 u8 *buf;
145 unsigned int bytes_written = 0;
146
147 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
148 if (!buf)
149 return -ENOMEM;
150
151 rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
152 if (rc != 0) {
153 kfree(buf);
154 return rc;
155 }
156
157 rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
158 CREATE_NOT_DIR, &netfid, &oplock, NULL,
159 nls_codepage, remap);
160 if (rc != 0) {
161 kfree(buf);
162 return rc;
163 }
164
165 rc = CIFSSMBWrite(xid, tcon, netfid,
166 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
167 0 /* offset */,
168 &bytes_written, buf, NULL, 0);
169 CIFSSMBClose(xid, tcon, netfid);
170 kfree(buf);
171 if (rc != 0)
172 return rc;
173
174 if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
175 return -EIO;
176
177 return 0;
178}
179
180static int
181CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
182 const unsigned char *searchName, char **symlinkinfo,
183 const struct nls_table *nls_codepage, int remap)
184{
185 int rc;
186 int oplock = 0;
187 __u16 netfid = 0;
188 u8 *buf;
189 char *pbuf;
190 unsigned int bytes_read = 0;
191 int buf_type = CIFS_NO_BUFFER;
192 unsigned int link_len = 0;
193 FILE_ALL_INFO file_info;
194
195 rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
196 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
197 nls_codepage, remap);
198 if (rc != 0)
199 return rc;
200
201 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
202 CIFSSMBClose(xid, tcon, netfid);
203 /* it's not a symlink */
204 return -EINVAL;
205 }
206
207 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
208 if (!buf)
209 return -ENOMEM;
210 pbuf = buf;
211
212 rc = CIFSSMBRead(xid, tcon, netfid,
213 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
214 0 /* offset */,
215 &bytes_read, &pbuf, &buf_type);
216 CIFSSMBClose(xid, tcon, netfid);
217 if (rc != 0) {
218 kfree(buf);
219 return rc;
220 }
221
222 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
223 kfree(buf);
224 if (rc != 0)
225 return rc;
226
227 return 0;
228}
229
230bool
231CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
232{
233 if (!(fattr->cf_mode & S_IFREG))
234 /* it's not a symlink */
235 return false;
236
237 if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
238 /* it's not a symlink */
239 return false;
240
241 return true;
242}
243
244int
245CIFSCheckMFSymlink(struct cifs_fattr *fattr,
246 const unsigned char *path,
247 struct cifs_sb_info *cifs_sb, int xid)
248{
249 int rc;
250 int oplock = 0;
251 __u16 netfid = 0;
252 struct tcon_link *tlink;
253 struct cifsTconInfo *pTcon;
254 u8 *buf;
255 char *pbuf;
256 unsigned int bytes_read = 0;
257 int buf_type = CIFS_NO_BUFFER;
258 unsigned int link_len = 0;
259 FILE_ALL_INFO file_info;
260
261 if (!CIFSCouldBeMFSymlink(fattr))
262 /* it's not a symlink */
263 return 0;
264
265 tlink = cifs_sb_tlink(cifs_sb);
266 if (IS_ERR(tlink))
267 return PTR_ERR(tlink);
268 pTcon = tlink_tcon(tlink);
269
270 rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
271 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
272 cifs_sb->local_nls,
273 cifs_sb->mnt_cifs_flags &
274 CIFS_MOUNT_MAP_SPECIAL_CHR);
275 if (rc != 0)
276 goto out;
277
278 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
279 CIFSSMBClose(xid, pTcon, netfid);
280 /* it's not a symlink */
281 goto out;
282 }
283
284 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
285 if (!buf) {
286 rc = -ENOMEM;
287 goto out;
288 }
289 pbuf = buf;
290
291 rc = CIFSSMBRead(xid, pTcon, netfid,
292 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
293 0 /* offset */,
294 &bytes_read, &pbuf, &buf_type);
295 CIFSSMBClose(xid, pTcon, netfid);
296 if (rc != 0) {
297 kfree(buf);
298 goto out;
299 }
300
301 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
302 kfree(buf);
303 if (rc == -EINVAL) {
304 /* it's not a symlink */
305 rc = 0;
306 goto out;
307 }
308
309 if (rc != 0)
310 goto out;
311
312 /* it is a symlink */
313 fattr->cf_eof = link_len;
314 fattr->cf_mode &= ~S_IFMT;
315 fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
316 fattr->cf_dtype = DT_LNK;
317out:
318 cifs_put_tlink(tlink);
319 return rc;
320}
31 321
32int 322int
33cifs_hardlink(struct dentry *old_file, struct inode *inode, 323cifs_hardlink(struct dentry *old_file, struct inode *inode,
@@ -37,17 +327,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
37 int xid; 327 int xid;
38 char *fromName = NULL; 328 char *fromName = NULL;
39 char *toName = NULL; 329 char *toName = NULL;
40 struct cifs_sb_info *cifs_sb_target; 330 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
331 struct tcon_link *tlink;
41 struct cifsTconInfo *pTcon; 332 struct cifsTconInfo *pTcon;
42 struct cifsInodeInfo *cifsInode; 333 struct cifsInodeInfo *cifsInode;
43 334
44 xid = GetXid(); 335 tlink = cifs_sb_tlink(cifs_sb);
45 336 if (IS_ERR(tlink))
46 cifs_sb_target = CIFS_SB(inode->i_sb); 337 return PTR_ERR(tlink);
47 pTcon = cifs_sb_target->tcon; 338 pTcon = tlink_tcon(tlink);
48 339
49/* No need to check for cross device links since server will do that 340 xid = GetXid();
50 BB note DFS case in future though (when we may have to check) */
51 341
52 fromName = build_path_from_dentry(old_file); 342 fromName = build_path_from_dentry(old_file);
53 toName = build_path_from_dentry(direntry); 343 toName = build_path_from_dentry(direntry);
@@ -56,16 +346,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
56 goto cifs_hl_exit; 346 goto cifs_hl_exit;
57 } 347 }
58 348
59/* if (cifs_sb_target->tcon->ses->capabilities & CAP_UNIX)*/
60 if (pTcon->unix_ext) 349 if (pTcon->unix_ext)
61 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName, 350 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
62 cifs_sb_target->local_nls, 351 cifs_sb->local_nls,
63 cifs_sb_target->mnt_cifs_flags & 352 cifs_sb->mnt_cifs_flags &
64 CIFS_MOUNT_MAP_SPECIAL_CHR); 353 CIFS_MOUNT_MAP_SPECIAL_CHR);
65 else { 354 else {
66 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName, 355 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName,
67 cifs_sb_target->local_nls, 356 cifs_sb->local_nls,
68 cifs_sb_target->mnt_cifs_flags & 357 cifs_sb->mnt_cifs_flags &
69 CIFS_MOUNT_MAP_SPECIAL_CHR); 358 CIFS_MOUNT_MAP_SPECIAL_CHR);
70 if ((rc == -EIO) || (rc == -EINVAL)) 359 if ((rc == -EIO) || (rc == -EINVAL))
71 rc = -EOPNOTSUPP; 360 rc = -EOPNOTSUPP;
@@ -101,6 +390,7 @@ cifs_hl_exit:
101 kfree(fromName); 390 kfree(fromName);
102 kfree(toName); 391 kfree(toName);
103 FreeXid(xid); 392 FreeXid(xid);
393 cifs_put_tlink(tlink);
104 return rc; 394 return rc;
105} 395}
106 396
@@ -113,10 +403,19 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
113 char *full_path = NULL; 403 char *full_path = NULL;
114 char *target_path = NULL; 404 char *target_path = NULL;
115 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 405 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
116 struct cifsTconInfo *tcon = cifs_sb->tcon; 406 struct tcon_link *tlink = NULL;
407 struct cifsTconInfo *tcon;
117 408
118 xid = GetXid(); 409 xid = GetXid();
119 410
411 tlink = cifs_sb_tlink(cifs_sb);
412 if (IS_ERR(tlink)) {
413 rc = PTR_ERR(tlink);
414 tlink = NULL;
415 goto out;
416 }
417 tcon = tlink_tcon(tlink);
418
120 /* 419 /*
121 * For now, we just handle symlinks with unix extensions enabled. 420 * For now, we just handle symlinks with unix extensions enabled.
122 * Eventually we should handle NTFS reparse points, and MacOS 421 * Eventually we should handle NTFS reparse points, and MacOS
@@ -130,7 +429,8 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
130 * but there doesn't seem to be any harm in allowing the client to 429 * but there doesn't seem to be any harm in allowing the client to
131 * read them. 430 * read them.
132 */ 431 */
133 if (!(tcon->ses->capabilities & CAP_UNIX)) { 432 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
433 && !(tcon->ses->capabilities & CAP_UNIX)) {
134 rc = -EACCES; 434 rc = -EACCES;
135 goto out; 435 goto out;
136 } 436 }
@@ -141,8 +441,21 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
141 441
142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode); 442 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
143 443
144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 444 rc = -EACCES;
145 cifs_sb->local_nls); 445 /*
446 * First try Minshall+French Symlinks, if configured
447 * and fallback to UNIX Extensions Symlinks.
448 */
449 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
450 rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
451 cifs_sb->local_nls,
452 cifs_sb->mnt_cifs_flags &
453 CIFS_MOUNT_MAP_SPECIAL_CHR);
454
455 if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX))
456 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
457 cifs_sb->local_nls);
458
146 kfree(full_path); 459 kfree(full_path);
147out: 460out:
148 if (rc != 0) { 461 if (rc != 0) {
@@ -151,6 +464,8 @@ out:
151 } 464 }
152 465
153 FreeXid(xid); 466 FreeXid(xid);
467 if (tlink)
468 cifs_put_tlink(tlink);
154 nd_set_link(nd, target_path); 469 nd_set_link(nd, target_path);
155 return NULL; 470 return NULL;
156} 471}
@@ -160,29 +475,37 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
160{ 475{
161 int rc = -EOPNOTSUPP; 476 int rc = -EOPNOTSUPP;
162 int xid; 477 int xid;
163 struct cifs_sb_info *cifs_sb; 478 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
479 struct tcon_link *tlink;
164 struct cifsTconInfo *pTcon; 480 struct cifsTconInfo *pTcon;
165 char *full_path = NULL; 481 char *full_path = NULL;
166 struct inode *newinode = NULL; 482 struct inode *newinode = NULL;
167 483
168 xid = GetXid(); 484 xid = GetXid();
169 485
170 cifs_sb = CIFS_SB(inode->i_sb); 486 tlink = cifs_sb_tlink(cifs_sb);
171 pTcon = cifs_sb->tcon; 487 if (IS_ERR(tlink)) {
488 rc = PTR_ERR(tlink);
489 goto symlink_exit;
490 }
491 pTcon = tlink_tcon(tlink);
172 492
173 full_path = build_path_from_dentry(direntry); 493 full_path = build_path_from_dentry(direntry);
174
175 if (full_path == NULL) { 494 if (full_path == NULL) {
176 rc = -ENOMEM; 495 rc = -ENOMEM;
177 FreeXid(xid); 496 goto symlink_exit;
178 return rc;
179 } 497 }
180 498
181 cFYI(1, "Full path: %s", full_path); 499 cFYI(1, "Full path: %s", full_path);
182 cFYI(1, "symname is %s", symname); 500 cFYI(1, "symname is %s", symname);
183 501
184 /* BB what if DFS and this volume is on different share? BB */ 502 /* BB what if DFS and this volume is on different share? BB */
185 if (pTcon->unix_ext) 503 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
504 rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
505 cifs_sb->local_nls,
506 cifs_sb->mnt_cifs_flags &
507 CIFS_MOUNT_MAP_SPECIAL_CHR);
508 else if (pTcon->unix_ext)
186 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, 509 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
187 cifs_sb->local_nls); 510 cifs_sb->local_nls);
188 /* else 511 /* else
@@ -208,8 +531,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
208 d_instantiate(direntry, newinode); 531 d_instantiate(direntry, newinode);
209 } 532 }
210 } 533 }
211 534symlink_exit:
212 kfree(full_path); 535 kfree(full_path);
536 cifs_put_tlink(tlink);
213 FreeXid(xid); 537 FreeXid(xid);
214 return rc; 538 return rc;
215} 539}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3ccadc1326d6..43f10281bc19 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -347,7 +347,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
347 if (current_fsuid() != treeCon->ses->linux_uid) { 347 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, "Multiuser mode and UID " 348 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid"); 349 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 350 spin_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 353 if (ses->linux_uid == current_fsuid()) {
@@ -361,7 +361,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
361 } 361 }
362 } 362 }
363 } 363 }
364 read_unlock(&cifs_tcp_ses_lock); 364 spin_unlock(&cifs_tcp_ses_lock);
365 } 365 }
366 } 366 }
367 } 367 }
@@ -551,7 +551,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
551 return false; 551 return false;
552 552
553 /* look up tcon based on tid & uid */ 553 /* look up tcon based on tid & uid */
554 read_lock(&cifs_tcp_ses_lock); 554 spin_lock(&cifs_tcp_ses_lock);
555 list_for_each(tmp, &srv->smb_ses_list) { 555 list_for_each(tmp, &srv->smb_ses_list) {
556 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 556 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
557 list_for_each(tmp1, &ses->tcon_list) { 557 list_for_each(tmp1, &ses->tcon_list) {
@@ -560,51 +560,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
560 continue; 560 continue;
561 561
562 cifs_stats_inc(&tcon->num_oplock_brks); 562 cifs_stats_inc(&tcon->num_oplock_brks);
563 read_lock(&GlobalSMBSeslock); 563 spin_lock(&cifs_file_list_lock);
564 list_for_each(tmp2, &tcon->openFileList) { 564 list_for_each(tmp2, &tcon->openFileList) {
565 netfile = list_entry(tmp2, struct cifsFileInfo, 565 netfile = list_entry(tmp2, struct cifsFileInfo,
566 tlist); 566 tlist);
567 if (pSMB->Fid != netfile->netfid) 567 if (pSMB->Fid != netfile->netfid)
568 continue; 568 continue;
569 569
570 /*
571 * don't do anything if file is about to be
572 * closed anyway.
573 */
574 if (netfile->closePend) {
575 read_unlock(&GlobalSMBSeslock);
576 read_unlock(&cifs_tcp_ses_lock);
577 return true;
578 }
579
580 cFYI(1, "file id match, oplock break"); 570 cFYI(1, "file id match, oplock break");
581 pCifsInode = CIFS_I(netfile->pInode); 571 pCifsInode = CIFS_I(netfile->dentry->d_inode);
582 pCifsInode->clientCanCacheAll = false;
583 if (pSMB->OplockLevel == 0)
584 pCifsInode->clientCanCacheRead = false;
585 572
573 cifs_set_oplock_level(pCifsInode,
574 pSMB->OplockLevel);
586 /* 575 /*
587 * cifs_oplock_break_put() can't be called 576 * cifs_oplock_break_put() can't be called
588 * from here. Get reference after queueing 577 * from here. Get reference after queueing
589 * succeeded. cifs_oplock_break() will 578 * succeeded. cifs_oplock_break() will
590 * synchronize using GlobalSMSSeslock. 579 * synchronize using cifs_file_list_lock.
591 */ 580 */
592 if (queue_work(system_nrt_wq, 581 if (queue_work(system_nrt_wq,
593 &netfile->oplock_break)) 582 &netfile->oplock_break))
594 cifs_oplock_break_get(netfile); 583 cifs_oplock_break_get(netfile);
595 netfile->oplock_break_cancelled = false; 584 netfile->oplock_break_cancelled = false;
596 585
597 read_unlock(&GlobalSMBSeslock); 586 spin_unlock(&cifs_file_list_lock);
598 read_unlock(&cifs_tcp_ses_lock); 587 spin_unlock(&cifs_tcp_ses_lock);
599 return true; 588 return true;
600 } 589 }
601 read_unlock(&GlobalSMBSeslock); 590 spin_unlock(&cifs_file_list_lock);
602 read_unlock(&cifs_tcp_ses_lock); 591 spin_unlock(&cifs_tcp_ses_lock);
603 cFYI(1, "No matching file for oplock break"); 592 cFYI(1, "No matching file for oplock break");
604 return true; 593 return true;
605 } 594 }
606 } 595 }
607 read_unlock(&cifs_tcp_ses_lock); 596 spin_unlock(&cifs_tcp_ses_lock);
608 cFYI(1, "Can not process oplock break for non-existent connection"); 597 cFYI(1, "Can not process oplock break for non-existent connection");
609 return true; 598 return true;
610} 599}
@@ -729,6 +718,26 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
729 "properly. Hardlinks will not be recognized on this " 718 "properly. Hardlinks will not be recognized on this "
730 "mount. Consider mounting with the \"noserverino\" " 719 "mount. Consider mounting with the \"noserverino\" "
731 "option to silence this message.", 720 "option to silence this message.",
732 cifs_sb->tcon->treeName); 721 cifs_sb_master_tcon(cifs_sb)->treeName);
722 }
723}
724
725void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
726{
727 oplock &= 0xF;
728
729 if (oplock == OPLOCK_EXCLUSIVE) {
730 cinode->clientCanCacheAll = true;
731 cinode->clientCanCacheRead = true;
732 cFYI(1, "Exclusive Oplock granted on inode %p",
733 &cinode->vfs_inode);
734 } else if (oplock == OPLOCK_READ) {
735 cinode->clientCanCacheAll = false;
736 cinode->clientCanCacheRead = true;
737 cFYI(1, "Level II Oplock granted on inode %p",
738 &cinode->vfs_inode);
739 } else {
740 cinode->clientCanCacheAll = false;
741 cinode->clientCanCacheRead = false;
733 } 742 }
734} 743}
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e75319..5d52e4a3b1ed 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,21 @@
61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
62#define NTLMSSP_NEGOTIATE_56 0x80000000 62#define NTLMSSP_NEGOTIATE_56 0x80000000
63 63
64/* Define AV Pair Field IDs */
65enum av_field_type {
66 NTLMSSP_AV_EOL = 0,
67 NTLMSSP_AV_NB_COMPUTER_NAME,
68 NTLMSSP_AV_NB_DOMAIN_NAME,
69 NTLMSSP_AV_DNS_COMPUTER_NAME,
70 NTLMSSP_AV_DNS_DOMAIN_NAME,
71 NTLMSSP_AV_DNS_TREE_NAME,
72 NTLMSSP_AV_FLAGS,
73 NTLMSSP_AV_TIMESTAMP,
74 NTLMSSP_AV_RESTRICTION,
75 NTLMSSP_AV_TARGET_NAME,
76 NTLMSSP_AV_CHANNEL_BINDINGS
77};
78
64/* Although typedefs are not commonly used for structure definitions */ 79/* Although typedefs are not commonly used for structure definitions */
65/* in the Linux kernel, in this particular case they are useful */ 80/* in the Linux kernel, in this particular case they are useful */
66/* to more closely match the standards document for NTLMSSP from */ 81/* to more closely match the standards document for NTLMSSP from */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index d5e591fab475..ef7bb7b50f58 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,7 +102,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
102 return NULL; 102 return NULL;
103 } 103 }
104 104
105 if (CIFS_SB(sb)->tcon->nocase) 105 if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
106 dentry->d_op = &cifs_ci_dentry_ops; 106 dentry->d_op = &cifs_ci_dentry_ops;
107 else 107 else
108 dentry->d_op = &cifs_dentry_ops; 108 dentry->d_op = &cifs_dentry_ops;
@@ -171,7 +171,7 @@ static void
171cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, 171cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
172 struct cifs_sb_info *cifs_sb) 172 struct cifs_sb_info *cifs_sb)
173{ 173{
174 int offset = cifs_sb->tcon->ses->server->timeAdj; 174 int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj;
175 175
176 memset(fattr, 0, sizeof(*fattr)); 176 memset(fattr, 0, sizeof(*fattr));
177 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate, 177 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
@@ -199,7 +199,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
199 int len; 199 int len;
200 int oplock = 0; 200 int oplock = 0;
201 int rc; 201 int rc;
202 struct cifsTconInfo *ptcon = cifs_sb->tcon; 202 struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
203 char *tmpbuffer; 203 char *tmpbuffer;
204 204
205 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, 205 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -223,34 +223,35 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
223static int initiate_cifs_search(const int xid, struct file *file) 223static int initiate_cifs_search(const int xid, struct file *file)
224{ 224{
225 int rc = 0; 225 int rc = 0;
226 char *full_path; 226 char *full_path = NULL;
227 struct cifsFileInfo *cifsFile; 227 struct cifsFileInfo *cifsFile;
228 struct cifs_sb_info *cifs_sb; 228 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
229 struct tcon_link *tlink;
229 struct cifsTconInfo *pTcon; 230 struct cifsTconInfo *pTcon;
230 231
231 if (file->private_data == NULL) { 232 tlink = cifs_sb_tlink(cifs_sb);
233 if (IS_ERR(tlink))
234 return PTR_ERR(tlink);
235 pTcon = tlink_tcon(tlink);
236
237 if (file->private_data == NULL)
232 file->private_data = 238 file->private_data =
233 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 239 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
240 if (file->private_data == NULL) {
241 rc = -ENOMEM;
242 goto error_exit;
234 } 243 }
235 244
236 if (file->private_data == NULL)
237 return -ENOMEM;
238 cifsFile = file->private_data; 245 cifsFile = file->private_data;
239 cifsFile->invalidHandle = true; 246 cifsFile->invalidHandle = true;
240 cifsFile->srch_inf.endOfSearch = false; 247 cifsFile->srch_inf.endOfSearch = false;
241 248 cifsFile->tlink = cifs_get_tlink(tlink);
242 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
243 if (cifs_sb == NULL)
244 return -EINVAL;
245
246 pTcon = cifs_sb->tcon;
247 if (pTcon == NULL)
248 return -EINVAL;
249 249
250 full_path = build_path_from_dentry(file->f_path.dentry); 250 full_path = build_path_from_dentry(file->f_path.dentry);
251 251 if (full_path == NULL) {
252 if (full_path == NULL) 252 rc = -ENOMEM;
253 return -ENOMEM; 253 goto error_exit;
254 }
254 255
255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos); 256 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
256 257
@@ -283,7 +284,9 @@ ffirst_retry:
283 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 284 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
284 goto ffirst_retry; 285 goto ffirst_retry;
285 } 286 }
287error_exit:
286 kfree(full_path); 288 kfree(full_path);
289 cifs_put_tlink(tlink);
287 return rc; 290 return rc;
288} 291}
289 292
@@ -525,14 +528,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
525 (index_to_find < first_entry_in_buffer)) { 528 (index_to_find < first_entry_in_buffer)) {
526 /* close and restart search */ 529 /* close and restart search */
527 cFYI(1, "search backing up - close and restart search"); 530 cFYI(1, "search backing up - close and restart search");
528 write_lock(&GlobalSMBSeslock); 531 spin_lock(&cifs_file_list_lock);
529 if (!cifsFile->srch_inf.endOfSearch && 532 if (!cifsFile->srch_inf.endOfSearch &&
530 !cifsFile->invalidHandle) { 533 !cifsFile->invalidHandle) {
531 cifsFile->invalidHandle = true; 534 cifsFile->invalidHandle = true;
532 write_unlock(&GlobalSMBSeslock); 535 spin_unlock(&cifs_file_list_lock);
533 CIFSFindClose(xid, pTcon, cifsFile->netfid); 536 CIFSFindClose(xid, pTcon, cifsFile->netfid);
534 } else 537 } else
535 write_unlock(&GlobalSMBSeslock); 538 spin_unlock(&cifs_file_list_lock);
536 if (cifsFile->srch_inf.ntwrk_buf_start) { 539 if (cifsFile->srch_inf.ntwrk_buf_start) {
537 cFYI(1, "freeing SMB ff cache buf on search rewind"); 540 cFYI(1, "freeing SMB ff cache buf on search rewind");
538 if (cifsFile->srch_inf.smallBuf) 541 if (cifsFile->srch_inf.smallBuf)
@@ -738,6 +741,15 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
738 cifs_autodisable_serverino(cifs_sb); 741 cifs_autodisable_serverino(cifs_sb);
739 } 742 }
740 743
744 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
745 CIFSCouldBeMFSymlink(&fattr))
746 /*
747 * trying to get the type and mode can be slow,
748 * so just call those regular files for now, and mark
749 * for reval
750 */
751 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
752
741 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 753 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
742 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); 754 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
743 755
@@ -777,9 +789,17 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
777 xid = GetXid(); 789 xid = GetXid();
778 790
779 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 791 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
780 pTcon = cifs_sb->tcon; 792
781 if (pTcon == NULL) 793 /*
782 return -EINVAL; 794 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
795 * '..'. Otherwise we won't be able to notify VFS in case of failure.
796 */
797 if (file->private_data == NULL) {
798 rc = initiate_cifs_search(xid, file);
799 cFYI(1, "initiate cifs search rc %d", rc);
800 if (rc)
801 goto rddir2_exit;
802 }
783 803
784 switch ((int) file->f_pos) { 804 switch ((int) file->f_pos) {
785 case 0: 805 case 0:
@@ -805,14 +825,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
805 if after then keep searching till find it */ 825 if after then keep searching till find it */
806 826
807 if (file->private_data == NULL) { 827 if (file->private_data == NULL) {
808 rc = initiate_cifs_search(xid, file);
809 cFYI(1, "initiate cifs search rc %d", rc);
810 if (rc) {
811 FreeXid(xid);
812 return rc;
813 }
814 }
815 if (file->private_data == NULL) {
816 rc = -EINVAL; 828 rc = -EINVAL;
817 FreeXid(xid); 829 FreeXid(xid);
818 return rc; 830 return rc;
@@ -829,6 +841,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
829 CIFSFindClose(xid, pTcon, cifsFile->netfid); 841 CIFSFindClose(xid, pTcon, cifsFile->netfid);
830 } */ 842 } */
831 843
844 pTcon = tlink_tcon(cifsFile->tlink);
832 rc = find_cifs_entry(xid, pTcon, file, 845 rc = find_cifs_entry(xid, pTcon, file,
833 &current_entry, &num_to_fill); 846 &current_entry, &num_to_fill);
834 if (rc) { 847 if (rc) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5dd..7b01d3f6eed6 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,9 +32,6 @@
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include "cifs_spnego.h" 33#include "cifs_spnego.h"
34 34
35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
36 unsigned char *p24);
37
38/* 35/*
39 * Checks if this is the first smb session to be reconnected after 36 * Checks if this is the first smb session to be reconnected after
40 * the socket has been reestablished (so we know whether to use vc 0). 37 * the socket has been reestablished (so we know whether to use vc 0).
@@ -80,7 +77,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
80 if (max_vcs < 2) 77 if (max_vcs < 2)
81 max_vcs = 0xFFFF; 78 max_vcs = 0xFFFF;
82 79
83 write_lock(&cifs_tcp_ses_lock); 80 spin_lock(&cifs_tcp_ses_lock);
84 if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) 81 if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
85 goto get_vc_num_exit; /* vcnum will be zero */ 82 goto get_vc_num_exit; /* vcnum will be zero */
86 for (i = ses->server->srv_count - 1; i < max_vcs; i++) { 83 for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
@@ -112,7 +109,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
112 vcnum = i; 109 vcnum = i;
113 ses->vcnum = vcnum; 110 ses->vcnum = vcnum;
114get_vc_num_exit: 111get_vc_num_exit:
115 write_unlock(&cifs_tcp_ses_lock); 112 spin_unlock(&cifs_tcp_ses_lock);
116 113
117 return cpu_to_le16(vcnum); 114 return cpu_to_le16(vcnum);
118} 115}
@@ -383,6 +380,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
383static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, 380static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
384 struct cifsSesInfo *ses) 381 struct cifsSesInfo *ses)
385{ 382{
383 unsigned int tioffset; /* challenge message target info area */
384 unsigned int tilen; /* challenge message target info area length */
385
386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
387 387
388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -399,11 +399,23 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
399 return -EINVAL; 399 return -EINVAL;
400 } 400 }
401 401
402 memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); 402 memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
403 /* BB we could decode pblob->NegotiateFlags; some may be useful */ 403 /* BB we could decode pblob->NegotiateFlags; some may be useful */
404 /* In particular we can examine sign flags */ 404 /* In particular we can examine sign flags */
405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then 405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
406 we must set the MIC field of the AUTHENTICATE_MESSAGE */ 406 we must set the MIC field of the AUTHENTICATE_MESSAGE */
407 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
408 tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
409 tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
410 if (tilen) {
411 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
412 if (!ses->auth_key.response) {
413 cERROR(1, "Challenge target info allocation failure");
414 return -ENOMEM;
415 }
416 memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
417 ses->auth_key.len = tilen;
418 }
407 419
408 return 0; 420 return 0;
409} 421}
@@ -425,12 +437,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
425 /* BB is NTLMV2 session security format easier to use here? */ 437 /* BB is NTLMV2 session security format easier to use here? */
426 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 438 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
427 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 439 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
428 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 440 NTLMSSP_NEGOTIATE_NTLM;
429 if (ses->server->secMode & 441 if (ses->server->secMode &
430 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 442 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
431 flags |= NTLMSSP_NEGOTIATE_SIGN; 443 flags |= NTLMSSP_NEGOTIATE_SIGN;
432 if (ses->server->secMode & SECMODE_SIGN_REQUIRED) 444 if (!ses->server->session_estab)
433 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 445 flags |= NTLMSSP_NEGOTIATE_KEY_XCH |
446 NTLMSSP_NEGOTIATE_EXTENDED_SEC;
447 }
434 448
435 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 449 sec_blob->NegotiateFlags |= cpu_to_le32(flags);
436 450
@@ -448,13 +462,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
448 maximum possible size is fixed and small, making this approach cleaner. 462 maximum possible size is fixed and small, making this approach cleaner.
449 This function returns the length of the data in the blob */ 463 This function returns the length of the data in the blob */
450static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 464static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
465 u16 *buflen,
451 struct cifsSesInfo *ses, 466 struct cifsSesInfo *ses,
452 const struct nls_table *nls_cp, bool first) 467 const struct nls_table *nls_cp)
453{ 468{
469 int rc;
454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 470 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
455 __u32 flags; 471 __u32 flags;
456 unsigned char *tmp; 472 unsigned char *tmp;
457 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
458 473
459 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 474 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
460 sec_blob->MessageType = NtLmAuthenticate; 475 sec_blob->MessageType = NtLmAuthenticate;
@@ -462,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
462 flags = NTLMSSP_NEGOTIATE_56 | 477 flags = NTLMSSP_NEGOTIATE_56 |
463 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | 478 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
464 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 479 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
465 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 480 NTLMSSP_NEGOTIATE_NTLM;
466 if (ses->server->secMode & 481 if (ses->server->secMode &
467 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 482 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
468 flags |= NTLMSSP_NEGOTIATE_SIGN; 483 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -477,19 +492,20 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
477 sec_blob->LmChallengeResponse.Length = 0; 492 sec_blob->LmChallengeResponse.Length = 0;
478 sec_blob->LmChallengeResponse.MaximumLength = 0; 493 sec_blob->LmChallengeResponse.MaximumLength = 0;
479 494
480 /* calculate session key, BB what about adding similar ntlmv2 path? */
481 SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
482 if (first)
483 cifs_calculate_mac_key(&ses->server->mac_signing_key,
484 ntlm_session_key, ses->password);
485
486 memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
487 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); 495 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
488 sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE); 496 rc = setup_ntlmv2_rsp(ses, nls_cp);
489 sec_blob->NtChallengeResponse.MaximumLength = 497 if (rc) {
490 cpu_to_le16(CIFS_SESS_KEY_SIZE); 498 cERROR(1, "Error %d during NTLMSSP authentication", rc);
499 goto setup_ntlmv2_ret;
500 }
501 memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
502 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
503 tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
491 504
492 tmp += CIFS_SESS_KEY_SIZE; 505 sec_blob->NtChallengeResponse.Length =
506 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
507 sec_blob->NtChallengeResponse.MaximumLength =
508 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
493 509
494 if (ses->domainName == NULL) { 510 if (ses->domainName == NULL) {
495 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 511 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +517,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
501 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, 517 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
502 MAX_USERNAME_SIZE, nls_cp); 518 MAX_USERNAME_SIZE, nls_cp);
503 len *= 2; /* unicode is 2 bytes each */ 519 len *= 2; /* unicode is 2 bytes each */
504 len += 2; /* trailing null */
505 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 520 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
506 sec_blob->DomainName.Length = cpu_to_le16(len); 521 sec_blob->DomainName.Length = cpu_to_le16(len);
507 sec_blob->DomainName.MaximumLength = cpu_to_le16(len); 522 sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +533,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
518 len = cifs_strtoUCS((__le16 *)tmp, ses->userName, 533 len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
519 MAX_USERNAME_SIZE, nls_cp); 534 MAX_USERNAME_SIZE, nls_cp);
520 len *= 2; /* unicode is 2 bytes each */ 535 len *= 2; /* unicode is 2 bytes each */
521 len += 2; /* trailing null */
522 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 536 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
523 sec_blob->UserName.Length = cpu_to_le16(len); 537 sec_blob->UserName.Length = cpu_to_le16(len);
524 sec_blob->UserName.MaximumLength = cpu_to_le16(len); 538 sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,10 +544,23 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
530 sec_blob->WorkstationName.MaximumLength = 0; 544 sec_blob->WorkstationName.MaximumLength = 0;
531 tmp += 2; 545 tmp += 2;
532 546
533 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); 547 if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
534 sec_blob->SessionKey.Length = 0; 548 !calc_seckey(ses)) {
535 sec_blob->SessionKey.MaximumLength = 0; 549 memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
536 return tmp - pbuffer; 550 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
551 sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
552 sec_blob->SessionKey.MaximumLength =
553 cpu_to_le16(CIFS_CPHTXT_SIZE);
554 tmp += CIFS_CPHTXT_SIZE;
555 } else {
556 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
557 sec_blob->SessionKey.Length = 0;
558 sec_blob->SessionKey.MaximumLength = 0;
559 }
560
561setup_ntlmv2_ret:
562 *buflen = tmp - pbuffer;
563 return rc;
537} 564}
538 565
539 566
@@ -545,19 +572,6 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
545 572
546 return; 573 return;
547} 574}
548
549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
550 struct cifsSesInfo *ses,
551 const struct nls_table *nls, bool first_time)
552{
553 int bloblen;
554
555 bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
556 first_time);
557 pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
558
559 return bloblen;
560}
561#endif 575#endif
562 576
563int 577int
@@ -579,18 +593,23 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
579 int bytes_remaining; 593 int bytes_remaining;
580 struct key *spnego_key = NULL; 594 struct key *spnego_key = NULL;
581 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 595 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
582 bool first_time; 596 u16 blob_len;
597 char *ntlmsspblob = NULL;
583 598
584 if (ses == NULL) 599 if (ses == NULL)
585 return -EINVAL; 600 return -EINVAL;
586 601
587 read_lock(&cifs_tcp_ses_lock);
588 first_time = is_first_ses_reconnect(ses);
589 read_unlock(&cifs_tcp_ses_lock);
590
591 type = ses->server->secType; 602 type = ses->server->secType;
592
593 cFYI(1, "sess setup type %d", type); 603 cFYI(1, "sess setup type %d", type);
604 if (type == RawNTLMSSP) {
605 /* if memory allocation is successful, caller of this function
606 * frees it.
607 */
608 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
609 if (!ses->ntlmssp)
610 return -ENOMEM;
611 }
612
594ssetup_ntlmssp_authenticate: 613ssetup_ntlmssp_authenticate:
595 if (phase == NtLmChallenge) 614 if (phase == NtLmChallenge)
596 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 615 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -655,10 +674,14 @@ ssetup_ntlmssp_authenticate:
655 /* no capabilities flags in old lanman negotiation */ 674 /* no capabilities flags in old lanman negotiation */
656 675
657 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 676 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
658 /* BB calculate hash with password */
659 /* and copy into bcc */
660 677
661 calc_lanman_hash(ses->password, ses->server->cryptKey, 678 /* Calculate hash with password and copy into bcc_ptr.
679 * Encryption Key (stored as in cryptkey) gets used if the
680 * security mode bit in Negottiate Protocol response states
681 * to use challenge/response method (i.e. Password bit is 1).
682 */
683
684 calc_lanman_hash(ses->password, ses->server->cryptkey,
662 ses->server->secMode & SECMODE_PW_ENCRYPT ? 685 ses->server->secMode & SECMODE_PW_ENCRYPT ?
663 true : false, lnm_session_key); 686 true : false, lnm_session_key);
664 687
@@ -676,28 +699,27 @@ ssetup_ntlmssp_authenticate:
676 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 699 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
677#endif 700#endif
678 } else if (type == NTLM) { 701 } else if (type == NTLM) {
679 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
680
681 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 702 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
682 pSMB->req_no_secext.CaseInsensitivePasswordLength = 703 pSMB->req_no_secext.CaseInsensitivePasswordLength =
683 cpu_to_le16(CIFS_SESS_KEY_SIZE); 704 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
684 pSMB->req_no_secext.CaseSensitivePasswordLength = 705 pSMB->req_no_secext.CaseSensitivePasswordLength =
685 cpu_to_le16(CIFS_SESS_KEY_SIZE); 706 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
686 707
687 /* calculate session key */ 708 /* calculate ntlm response and session key */
688 SMBNTencrypt(ses->password, ses->server->cryptKey, 709 rc = setup_ntlm_response(ses);
689 ntlm_session_key); 710 if (rc) {
711 cERROR(1, "Error %d during NTLM authentication", rc);
712 goto ssetup_exit;
713 }
690 714
691 if (first_time) /* should this be moved into common code 715 /* copy ntlm response */
692 with similar ntlmv2 path? */ 716 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
693 cifs_calculate_mac_key(&ses->server->mac_signing_key, 717 CIFS_AUTH_RESP_SIZE);
694 ntlm_session_key, ses->password); 718 bcc_ptr += CIFS_AUTH_RESP_SIZE;
695 /* copy session key */ 719 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
720 CIFS_AUTH_RESP_SIZE);
721 bcc_ptr += CIFS_AUTH_RESP_SIZE;
696 722
697 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
698 bcc_ptr += CIFS_SESS_KEY_SIZE;
699 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
700 bcc_ptr += CIFS_SESS_KEY_SIZE;
701 if (ses->capabilities & CAP_UNICODE) { 723 if (ses->capabilities & CAP_UNICODE) {
702 /* unicode strings must be word aligned */ 724 /* unicode strings must be word aligned */
703 if (iov[0].iov_len % 2) { 725 if (iov[0].iov_len % 2) {
@@ -708,33 +730,27 @@ ssetup_ntlmssp_authenticate:
708 } else 730 } else
709 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 731 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
710 } else if (type == NTLMv2) { 732 } else if (type == NTLMv2) {
711 char *v2_sess_key =
712 kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
713
714 /* BB FIXME change all users of v2_sess_key to
715 struct ntlmv2_resp */
716
717 if (v2_sess_key == NULL) {
718 rc = -ENOMEM;
719 goto ssetup_exit;
720 }
721
722 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 733 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
723 734
724 /* LM2 password would be here if we supported it */ 735 /* LM2 password would be here if we supported it */
725 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; 736 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
726 /* cpu_to_le16(LM2_SESS_KEY_SIZE); */
727 737
738 /* calculate ntlmv2 response and session key */
739 rc = setup_ntlmv2_rsp(ses, nls_cp);
740 if (rc) {
741 cERROR(1, "Error %d during NTLMv2 authentication", rc);
742 goto ssetup_exit;
743 }
744 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
745 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
746 bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
747
748 /* set case sensitive password length after tilen may get
749 * assigned, tilen is 0 otherwise.
750 */
728 pSMB->req_no_secext.CaseSensitivePasswordLength = 751 pSMB->req_no_secext.CaseSensitivePasswordLength =
729 cpu_to_le16(sizeof(struct ntlmv2_resp)); 752 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
730 753
731 /* calculate session key */
732 setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
733 /* FIXME: calculate MAC key */
734 memcpy(bcc_ptr, (char *)v2_sess_key,
735 sizeof(struct ntlmv2_resp));
736 bcc_ptr += sizeof(struct ntlmv2_resp);
737 kfree(v2_sess_key);
738 if (ses->capabilities & CAP_UNICODE) { 754 if (ses->capabilities & CAP_UNICODE) {
739 if (iov[0].iov_len % 2) { 755 if (iov[0].iov_len % 2) {
740 *bcc_ptr = 0; 756 *bcc_ptr = 0;
@@ -746,6 +762,7 @@ ssetup_ntlmssp_authenticate:
746 } else if (type == Kerberos) { 762 } else if (type == Kerberos) {
747#ifdef CONFIG_CIFS_UPCALL 763#ifdef CONFIG_CIFS_UPCALL
748 struct cifs_spnego_msg *msg; 764 struct cifs_spnego_msg *msg;
765
749 spnego_key = cifs_get_spnego_key(ses); 766 spnego_key = cifs_get_spnego_key(ses);
750 if (IS_ERR(spnego_key)) { 767 if (IS_ERR(spnego_key)) {
751 rc = PTR_ERR(spnego_key); 768 rc = PTR_ERR(spnego_key);
@@ -763,19 +780,17 @@ ssetup_ntlmssp_authenticate:
763 rc = -EKEYREJECTED; 780 rc = -EKEYREJECTED;
764 goto ssetup_exit; 781 goto ssetup_exit;
765 } 782 }
766 /* bail out if key is too long */ 783
767 if (msg->sesskey_len > 784 ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 785 if (!ses->auth_key.response) {
769 cERROR(1, "Kerberos signing key too long (%u bytes)", 786 cERROR(1, "Kerberos can't allocate (%u bytes) memory",
770 msg->sesskey_len); 787 msg->sesskey_len);
771 rc = -EOVERFLOW; 788 rc = -ENOMEM;
772 goto ssetup_exit; 789 goto ssetup_exit;
773 } 790 }
774 if (first_time) { 791 memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
775 ses->server->mac_signing_key.len = msg->sesskey_len; 792 ses->auth_key.len = msg->sesskey_len;
776 memcpy(ses->server->mac_signing_key.data.krb5, 793
777 msg->data, msg->sesskey_len);
778 }
779 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 794 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
780 capabilities |= CAP_EXTENDED_SECURITY; 795 capabilities |= CAP_EXTENDED_SECURITY;
781 pSMB->req.Capabilities = cpu_to_le32(capabilities); 796 pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -815,12 +830,30 @@ ssetup_ntlmssp_authenticate:
815 if (phase == NtLmNegotiate) { 830 if (phase == NtLmNegotiate) {
816 setup_ntlmssp_neg_req(pSMB, ses); 831 setup_ntlmssp_neg_req(pSMB, ses);
817 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); 832 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
818 } else if (phase == NtLmAuthenticate) { 834 } else if (phase == NtLmAuthenticate) {
819 int blob_len; 835 /* 5 is an empirical value, large enough to
820 blob_len = setup_ntlmssp_auth_req(pSMB, ses, 836 * hold authenticate message, max 10 of
821 nls_cp, 837 * av pairs, domain, user, workstation names,
822 first_time); 838 * flags etc..
839 */
840 ntlmsspblob = kmalloc(
841 5*sizeof(struct _AUTHENTICATE_MESSAGE),
842 GFP_KERNEL);
843 if (!ntlmsspblob) {
844 cERROR(1, "Can't allocate NTLMSSP");
845 rc = -ENOMEM;
846 goto ssetup_exit;
847 }
848
849 rc = build_ntlmssp_auth_blob(ntlmsspblob,
850 &blob_len, ses, nls_cp);
851 if (rc)
852 goto ssetup_exit;
823 iov[1].iov_len = blob_len; 853 iov[1].iov_len = blob_len;
854 iov[1].iov_base = ntlmsspblob;
855 pSMB->req.SecurityBlobLength =
856 cpu_to_le16(blob_len);
824 /* Make sure that we tell the server that we 857 /* Make sure that we tell the server that we
825 are using the uid that it just gave us back 858 are using the uid that it just gave us back
826 on the response (challenge) */ 859 on the response (challenge) */
@@ -830,7 +863,6 @@ ssetup_ntlmssp_authenticate:
830 rc = -ENOSYS; 863 rc = -ENOSYS;
831 goto ssetup_exit; 864 goto ssetup_exit;
832 } 865 }
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
834 /* unicode strings must be word aligned */ 866 /* unicode strings must be word aligned */
835 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 867 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
836 *bcc_ptr = 0; 868 *bcc_ptr = 0;
@@ -861,8 +893,6 @@ ssetup_ntlmssp_authenticate:
861 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); 893 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
862 /* SMB request buf freed in SendReceive2 */ 894 /* SMB request buf freed in SendReceive2 */
863 895
864 cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
865
866 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 896 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
867 smb_buf = (struct smb_hdr *)iov[0].iov_base; 897 smb_buf = (struct smb_hdr *)iov[0].iov_base;
868 898
@@ -895,7 +925,6 @@ ssetup_ntlmssp_authenticate:
895 bcc_ptr = pByteArea(smb_buf); 925 bcc_ptr = pByteArea(smb_buf);
896 926
897 if (smb_buf->WordCount == 4) { 927 if (smb_buf->WordCount == 4) {
898 __u16 blob_len;
899 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 928 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
900 if (blob_len > bytes_remaining) { 929 if (blob_len > bytes_remaining) {
901 cERROR(1, "bad security blob length %d", blob_len); 930 cERROR(1, "bad security blob length %d", blob_len);
@@ -931,6 +960,8 @@ ssetup_exit:
931 key_put(spnego_key); 960 key_put(spnego_key);
932 } 961 }
933 kfree(str_area); 962 kfree(str_area);
963 kfree(ntlmsspblob);
964 ntlmsspblob = NULL;
934 if (resp_buf_type == CIFS_SMALL_BUFFER) { 965 if (resp_buf_type == CIFS_SMALL_BUFFER) {
935 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base); 966 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
936 cifs_small_buf_release(iov[0].iov_base); 967 cifs_small_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d6978..e0588cdf4cc5 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
544 SECMODE_SIGN_ENABLED))) { 544 SECMODE_SIGN_ENABLED))) {
545 rc = cifs_verify_signature(midQ->resp_buf, 545 rc = cifs_verify_signature(midQ->resp_buf,
546 &ses->server->mac_signing_key, 546 ses->server,
547 midQ->sequence_number+1); 547 midQ->sequence_number+1);
548 if (rc) { 548 if (rc) {
549 cERROR(1, "Unexpected SMB signature"); 549 cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
732 SECMODE_SIGN_ENABLED))) { 732 SECMODE_SIGN_ENABLED))) {
733 rc = cifs_verify_signature(out_buf, 733 rc = cifs_verify_signature(out_buf,
734 &ses->server->mac_signing_key, 734 ses->server,
735 midQ->sequence_number+1); 735 midQ->sequence_number+1);
736 if (rc) { 736 if (rc) {
737 cERROR(1, "Unexpected SMB signature"); 737 cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
982 SECMODE_SIGN_ENABLED))) { 982 SECMODE_SIGN_ENABLED))) {
983 rc = cifs_verify_signature(out_buf, 983 rc = cifs_verify_signature(out_buf,
984 &ses->server->mac_signing_key, 984 ses->server,
985 midQ->sequence_number+1); 985 midQ->sequence_number+1);
986 if (rc) { 986 if (rc) {
987 cERROR(1, "Unexpected SMB signature"); 987 cERROR(1, "Unexpected SMB signature");
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a1509207bfa6..a264b744bb41 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -47,9 +47,10 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
47#ifdef CONFIG_CIFS_XATTR 47#ifdef CONFIG_CIFS_XATTR
48 int xid; 48 int xid;
49 struct cifs_sb_info *cifs_sb; 49 struct cifs_sb_info *cifs_sb;
50 struct tcon_link *tlink;
50 struct cifsTconInfo *pTcon; 51 struct cifsTconInfo *pTcon;
51 struct super_block *sb; 52 struct super_block *sb;
52 char *full_path; 53 char *full_path = NULL;
53 54
54 if (direntry == NULL) 55 if (direntry == NULL)
55 return -EIO; 56 return -EIO;
@@ -58,16 +59,19 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
58 sb = direntry->d_inode->i_sb; 59 sb = direntry->d_inode->i_sb;
59 if (sb == NULL) 60 if (sb == NULL)
60 return -EIO; 61 return -EIO;
61 xid = GetXid();
62 62
63 cifs_sb = CIFS_SB(sb); 63 cifs_sb = CIFS_SB(sb);
64 pTcon = cifs_sb->tcon; 64 tlink = cifs_sb_tlink(cifs_sb);
65 if (IS_ERR(tlink))
66 return PTR_ERR(tlink);
67 pTcon = tlink_tcon(tlink);
68
69 xid = GetXid();
65 70
66 full_path = build_path_from_dentry(direntry); 71 full_path = build_path_from_dentry(direntry);
67 if (full_path == NULL) { 72 if (full_path == NULL) {
68 rc = -ENOMEM; 73 rc = -ENOMEM;
69 FreeXid(xid); 74 goto remove_ea_exit;
70 return rc;
71 } 75 }
72 if (ea_name == NULL) { 76 if (ea_name == NULL) {
73 cFYI(1, "Null xattr names not supported"); 77 cFYI(1, "Null xattr names not supported");
@@ -91,6 +95,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
91remove_ea_exit: 95remove_ea_exit:
92 kfree(full_path); 96 kfree(full_path);
93 FreeXid(xid); 97 FreeXid(xid);
98 cifs_put_tlink(tlink);
94#endif 99#endif
95 return rc; 100 return rc;
96} 101}
@@ -102,6 +107,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
102#ifdef CONFIG_CIFS_XATTR 107#ifdef CONFIG_CIFS_XATTR
103 int xid; 108 int xid;
104 struct cifs_sb_info *cifs_sb; 109 struct cifs_sb_info *cifs_sb;
110 struct tcon_link *tlink;
105 struct cifsTconInfo *pTcon; 111 struct cifsTconInfo *pTcon;
106 struct super_block *sb; 112 struct super_block *sb;
107 char *full_path; 113 char *full_path;
@@ -113,16 +119,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
113 sb = direntry->d_inode->i_sb; 119 sb = direntry->d_inode->i_sb;
114 if (sb == NULL) 120 if (sb == NULL)
115 return -EIO; 121 return -EIO;
116 xid = GetXid();
117 122
118 cifs_sb = CIFS_SB(sb); 123 cifs_sb = CIFS_SB(sb);
119 pTcon = cifs_sb->tcon; 124 tlink = cifs_sb_tlink(cifs_sb);
125 if (IS_ERR(tlink))
126 return PTR_ERR(tlink);
127 pTcon = tlink_tcon(tlink);
128
129 xid = GetXid();
120 130
121 full_path = build_path_from_dentry(direntry); 131 full_path = build_path_from_dentry(direntry);
122 if (full_path == NULL) { 132 if (full_path == NULL) {
123 rc = -ENOMEM; 133 rc = -ENOMEM;
124 FreeXid(xid); 134 goto set_ea_exit;
125 return rc;
126 } 135 }
127 /* return dos attributes as pseudo xattr */ 136 /* return dos attributes as pseudo xattr */
128 /* return alt name if available as pseudo attr */ 137 /* return alt name if available as pseudo attr */
@@ -132,9 +141,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
132 returns as xattrs */ 141 returns as xattrs */
133 if (value_size > MAX_EA_VALUE_SIZE) { 142 if (value_size > MAX_EA_VALUE_SIZE) {
134 cFYI(1, "size of EA value too large"); 143 cFYI(1, "size of EA value too large");
135 kfree(full_path); 144 rc = -EOPNOTSUPP;
136 FreeXid(xid); 145 goto set_ea_exit;
137 return -EOPNOTSUPP;
138 } 146 }
139 147
140 if (ea_name == NULL) { 148 if (ea_name == NULL) {
@@ -198,6 +206,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
198set_ea_exit: 206set_ea_exit:
199 kfree(full_path); 207 kfree(full_path);
200 FreeXid(xid); 208 FreeXid(xid);
209 cifs_put_tlink(tlink);
201#endif 210#endif
202 return rc; 211 return rc;
203} 212}
@@ -209,6 +218,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
209#ifdef CONFIG_CIFS_XATTR 218#ifdef CONFIG_CIFS_XATTR
210 int xid; 219 int xid;
211 struct cifs_sb_info *cifs_sb; 220 struct cifs_sb_info *cifs_sb;
221 struct tcon_link *tlink;
212 struct cifsTconInfo *pTcon; 222 struct cifsTconInfo *pTcon;
213 struct super_block *sb; 223 struct super_block *sb;
214 char *full_path; 224 char *full_path;
@@ -221,16 +231,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
221 if (sb == NULL) 231 if (sb == NULL)
222 return -EIO; 232 return -EIO;
223 233
224 xid = GetXid();
225
226 cifs_sb = CIFS_SB(sb); 234 cifs_sb = CIFS_SB(sb);
227 pTcon = cifs_sb->tcon; 235 tlink = cifs_sb_tlink(cifs_sb);
236 if (IS_ERR(tlink))
237 return PTR_ERR(tlink);
238 pTcon = tlink_tcon(tlink);
239
240 xid = GetXid();
228 241
229 full_path = build_path_from_dentry(direntry); 242 full_path = build_path_from_dentry(direntry);
230 if (full_path == NULL) { 243 if (full_path == NULL) {
231 rc = -ENOMEM; 244 rc = -ENOMEM;
232 FreeXid(xid); 245 goto get_ea_exit;
233 return rc;
234 } 246 }
235 /* return dos attributes as pseudo xattr */ 247 /* return dos attributes as pseudo xattr */
236 /* return alt name if available as pseudo attr */ 248 /* return alt name if available as pseudo attr */
@@ -323,6 +335,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
323get_ea_exit: 335get_ea_exit:
324 kfree(full_path); 336 kfree(full_path);
325 FreeXid(xid); 337 FreeXid(xid);
338 cifs_put_tlink(tlink);
326#endif 339#endif
327 return rc; 340 return rc;
328} 341}
@@ -333,6 +346,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
333#ifdef CONFIG_CIFS_XATTR 346#ifdef CONFIG_CIFS_XATTR
334 int xid; 347 int xid;
335 struct cifs_sb_info *cifs_sb; 348 struct cifs_sb_info *cifs_sb;
349 struct tcon_link *tlink;
336 struct cifsTconInfo *pTcon; 350 struct cifsTconInfo *pTcon;
337 struct super_block *sb; 351 struct super_block *sb;
338 char *full_path; 352 char *full_path;
@@ -346,18 +360,20 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
346 return -EIO; 360 return -EIO;
347 361
348 cifs_sb = CIFS_SB(sb); 362 cifs_sb = CIFS_SB(sb);
349 pTcon = cifs_sb->tcon;
350
351 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 363 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
352 return -EOPNOTSUPP; 364 return -EOPNOTSUPP;
353 365
366 tlink = cifs_sb_tlink(cifs_sb);
367 if (IS_ERR(tlink))
368 return PTR_ERR(tlink);
369 pTcon = tlink_tcon(tlink);
370
354 xid = GetXid(); 371 xid = GetXid();
355 372
356 full_path = build_path_from_dentry(direntry); 373 full_path = build_path_from_dentry(direntry);
357 if (full_path == NULL) { 374 if (full_path == NULL) {
358 rc = -ENOMEM; 375 rc = -ENOMEM;
359 FreeXid(xid); 376 goto list_ea_exit;
360 return rc;
361 } 377 }
362 /* return dos attributes as pseudo xattr */ 378 /* return dos attributes as pseudo xattr */
363 /* return alt name if available as pseudo attr */ 379 /* return alt name if available as pseudo attr */
@@ -370,8 +386,10 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
370 cifs_sb->mnt_cifs_flags & 386 cifs_sb->mnt_cifs_flags &
371 CIFS_MOUNT_MAP_SPECIAL_CHR); 387 CIFS_MOUNT_MAP_SPECIAL_CHR);
372 388
389list_ea_exit:
373 kfree(full_path); 390 kfree(full_path);
374 FreeXid(xid); 391 FreeXid(xid);
392 cifs_put_tlink(tlink);
375#endif 393#endif
376 return rc; 394 return rc;
377} 395}
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index a5bf5771a22a..9060f08e70cf 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -17,6 +17,7 @@
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h>
20 21
21#include <linux/coda.h> 22#include <linux/coda.h>
22#include <linux/coda_linux.h> 23#include <linux/coda_linux.h>
@@ -31,19 +32,23 @@ void coda_cache_enter(struct inode *inode, int mask)
31{ 32{
32 struct coda_inode_info *cii = ITOC(inode); 33 struct coda_inode_info *cii = ITOC(inode);
33 34
35 spin_lock(&cii->c_lock);
34 cii->c_cached_epoch = atomic_read(&permission_epoch); 36 cii->c_cached_epoch = atomic_read(&permission_epoch);
35 if (cii->c_uid != current_fsuid()) { 37 if (cii->c_uid != current_fsuid()) {
36 cii->c_uid = current_fsuid(); 38 cii->c_uid = current_fsuid();
37 cii->c_cached_perm = mask; 39 cii->c_cached_perm = mask;
38 } else 40 } else
39 cii->c_cached_perm |= mask; 41 cii->c_cached_perm |= mask;
42 spin_unlock(&cii->c_lock);
40} 43}
41 44
42/* remove cached acl from an inode */ 45/* remove cached acl from an inode */
43void coda_cache_clear_inode(struct inode *inode) 46void coda_cache_clear_inode(struct inode *inode)
44{ 47{
45 struct coda_inode_info *cii = ITOC(inode); 48 struct coda_inode_info *cii = ITOC(inode);
49 spin_lock(&cii->c_lock);
46 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1; 50 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
51 spin_unlock(&cii->c_lock);
47} 52}
48 53
49/* remove all acl caches */ 54/* remove all acl caches */
@@ -57,13 +62,15 @@ void coda_cache_clear_all(struct super_block *sb)
57int coda_cache_check(struct inode *inode, int mask) 62int coda_cache_check(struct inode *inode, int mask)
58{ 63{
59 struct coda_inode_info *cii = ITOC(inode); 64 struct coda_inode_info *cii = ITOC(inode);
60 int hit; 65 int hit;
61 66
62 hit = (mask & cii->c_cached_perm) == mask && 67 spin_lock(&cii->c_lock);
63 cii->c_uid == current_fsuid() && 68 hit = (mask & cii->c_cached_perm) == mask &&
64 cii->c_cached_epoch == atomic_read(&permission_epoch); 69 cii->c_uid == current_fsuid() &&
70 cii->c_cached_epoch == atomic_read(&permission_epoch);
71 spin_unlock(&cii->c_lock);
65 72
66 return hit; 73 return hit;
67} 74}
68 75
69 76
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index a7a780929eec..602240569c89 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -45,13 +45,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
45static int coda_test_inode(struct inode *inode, void *data) 45static int coda_test_inode(struct inode *inode, void *data)
46{ 46{
47 struct CodaFid *fid = (struct CodaFid *)data; 47 struct CodaFid *fid = (struct CodaFid *)data;
48 return coda_fideq(&(ITOC(inode)->c_fid), fid); 48 struct coda_inode_info *cii = ITOC(inode);
49 return coda_fideq(&cii->c_fid, fid);
49} 50}
50 51
51static int coda_set_inode(struct inode *inode, void *data) 52static int coda_set_inode(struct inode *inode, void *data)
52{ 53{
53 struct CodaFid *fid = (struct CodaFid *)data; 54 struct CodaFid *fid = (struct CodaFid *)data;
54 ITOC(inode)->c_fid = *fid; 55 struct coda_inode_info *cii = ITOC(inode);
56 cii->c_fid = *fid;
55 return 0; 57 return 0;
56} 58}
57 59
@@ -71,6 +73,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
71 cii = ITOC(inode); 73 cii = ITOC(inode);
72 /* we still need to set i_ino for things like stat(2) */ 74 /* we still need to set i_ino for things like stat(2) */
73 inode->i_ino = hash; 75 inode->i_ino = hash;
76 /* inode is locked and unique, no need to grab cii->c_lock */
74 cii->c_mapcount = 0; 77 cii->c_mapcount = 0;
75 unlock_new_inode(inode); 78 unlock_new_inode(inode);
76 } 79 }
@@ -107,14 +110,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc
107} 110}
108 111
109 112
113/* Although we treat Coda file identifiers as immutable, there is one
114 * special case for files created during a disconnection where they may
115 * not be globally unique. When an identifier collision is detected we
116 * first try to flush the cached inode from the kernel and finally
117 * resort to renaming/rehashing in-place. Userspace remembers both old
118 * and new values of the identifier to handle any in-flight upcalls.
119 * The real solution is to use globally unique UUIDs as identifiers, but
120 * retrofitting the existing userspace code for this is non-trivial. */
110void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 121void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid,
111 struct CodaFid *newfid) 122 struct CodaFid *newfid)
112{ 123{
113 struct coda_inode_info *cii; 124 struct coda_inode_info *cii = ITOC(inode);
114 unsigned long hash = coda_f2i(newfid); 125 unsigned long hash = coda_f2i(newfid);
115 126
116 cii = ITOC(inode);
117
118 BUG_ON(!coda_fideq(&cii->c_fid, oldfid)); 127 BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
119 128
120 /* replace fid and rehash inode */ 129 /* replace fid and rehash inode */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ccd98b0f2b0b..5d8b35539601 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -17,7 +17,7 @@
17#include <linux/stat.h> 17#include <linux/stat.h>
18#include <linux/errno.h> 18#include <linux/errno.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/smp_lock.h> 20#include <linux/spinlock.h>
21 21
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23 23
@@ -116,15 +116,11 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
116 goto exit; 116 goto exit;
117 } 117 }
118 118
119 lock_kernel();
120
121 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, 119 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
122 &type, &resfid); 120 &type, &resfid);
123 if (!error) 121 if (!error)
124 error = coda_cnode_make(&inode, &resfid, dir->i_sb); 122 error = coda_cnode_make(&inode, &resfid, dir->i_sb);
125 123
126 unlock_kernel();
127
128 if (error && error != -ENOENT) 124 if (error && error != -ENOENT)
129 return ERR_PTR(error); 125 return ERR_PTR(error);
130 126
@@ -140,28 +136,24 @@ exit:
140 136
141int coda_permission(struct inode *inode, int mask) 137int coda_permission(struct inode *inode, int mask)
142{ 138{
143 int error = 0; 139 int error;
144 140
145 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 141 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
146 142
147 if (!mask) 143 if (!mask)
148 return 0; 144 return 0;
149 145
150 if ((mask & MAY_EXEC) && !execute_ok(inode)) 146 if ((mask & MAY_EXEC) && !execute_ok(inode))
151 return -EACCES; 147 return -EACCES;
152 148
153 lock_kernel();
154
155 if (coda_cache_check(inode, mask)) 149 if (coda_cache_check(inode, mask))
156 goto out; 150 return 0;
157 151
158 error = venus_access(inode->i_sb, coda_i2f(inode), mask); 152 error = venus_access(inode->i_sb, coda_i2f(inode), mask);
159 153
160 if (!error) 154 if (!error)
161 coda_cache_enter(inode, mask); 155 coda_cache_enter(inode, mask);
162 156
163 out:
164 unlock_kernel();
165 return error; 157 return error;
166} 158}
167 159
@@ -200,41 +192,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
200/* creation routines: create, mknod, mkdir, link, symlink */ 192/* creation routines: create, mknod, mkdir, link, symlink */
201static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) 193static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
202{ 194{
203 int error=0; 195 int error;
204 const char *name=de->d_name.name; 196 const char *name=de->d_name.name;
205 int length=de->d_name.len; 197 int length=de->d_name.len;
206 struct inode *inode; 198 struct inode *inode;
207 struct CodaFid newfid; 199 struct CodaFid newfid;
208 struct coda_vattr attrs; 200 struct coda_vattr attrs;
209 201
210 lock_kernel(); 202 if (coda_isroot(dir) && coda_iscontrol(name, length))
211
212 if (coda_isroot(dir) && coda_iscontrol(name, length)) {
213 unlock_kernel();
214 return -EPERM; 203 return -EPERM;
215 }
216 204
217 error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 205 error = venus_create(dir->i_sb, coda_i2f(dir), name, length,
218 0, mode, &newfid, &attrs); 206 0, mode, &newfid, &attrs);
219 207 if (error)
220 if ( error ) { 208 goto err_out;
221 unlock_kernel();
222 d_drop(de);
223 return error;
224 }
225 209
226 inode = coda_iget(dir->i_sb, &newfid, &attrs); 210 inode = coda_iget(dir->i_sb, &newfid, &attrs);
227 if ( IS_ERR(inode) ) { 211 if (IS_ERR(inode)) {
228 unlock_kernel(); 212 error = PTR_ERR(inode);
229 d_drop(de); 213 goto err_out;
230 return PTR_ERR(inode);
231 } 214 }
232 215
233 /* invalidate the directory cnode's attributes */ 216 /* invalidate the directory cnode's attributes */
234 coda_dir_update_mtime(dir); 217 coda_dir_update_mtime(dir);
235 unlock_kernel();
236 d_instantiate(de, inode); 218 d_instantiate(de, inode);
237 return 0; 219 return 0;
220err_out:
221 d_drop(de);
222 return error;
238} 223}
239 224
240static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) 225static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
@@ -246,36 +231,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
246 int error; 231 int error;
247 struct CodaFid newfid; 232 struct CodaFid newfid;
248 233
249 lock_kernel(); 234 if (coda_isroot(dir) && coda_iscontrol(name, len))
250
251 if (coda_isroot(dir) && coda_iscontrol(name, len)) {
252 unlock_kernel();
253 return -EPERM; 235 return -EPERM;
254 }
255 236
256 attrs.va_mode = mode; 237 attrs.va_mode = mode;
257 error = venus_mkdir(dir->i_sb, coda_i2f(dir), 238 error = venus_mkdir(dir->i_sb, coda_i2f(dir),
258 name, len, &newfid, &attrs); 239 name, len, &newfid, &attrs);
259 240 if (error)
260 if ( error ) { 241 goto err_out;
261 unlock_kernel();
262 d_drop(de);
263 return error;
264 }
265 242
266 inode = coda_iget(dir->i_sb, &newfid, &attrs); 243 inode = coda_iget(dir->i_sb, &newfid, &attrs);
267 if ( IS_ERR(inode) ) { 244 if (IS_ERR(inode)) {
268 unlock_kernel(); 245 error = PTR_ERR(inode);
269 d_drop(de); 246 goto err_out;
270 return PTR_ERR(inode);
271 } 247 }
272 248
273 /* invalidate the directory cnode's attributes */ 249 /* invalidate the directory cnode's attributes */
274 coda_dir_inc_nlink(dir); 250 coda_dir_inc_nlink(dir);
275 coda_dir_update_mtime(dir); 251 coda_dir_update_mtime(dir);
276 unlock_kernel();
277 d_instantiate(de, inode); 252 d_instantiate(de, inode);
278 return 0; 253 return 0;
254err_out:
255 d_drop(de);
256 return error;
279} 257}
280 258
281/* try to make de an entry in dir_inodde linked to source_de */ 259/* try to make de an entry in dir_inodde linked to source_de */
@@ -287,52 +265,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
287 int len = de->d_name.len; 265 int len = de->d_name.len;
288 int error; 266 int error;
289 267
290 lock_kernel(); 268 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
291
292 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
293 unlock_kernel();
294 return -EPERM; 269 return -EPERM;
295 }
296 270
297 error = venus_link(dir_inode->i_sb, coda_i2f(inode), 271 error = venus_link(dir_inode->i_sb, coda_i2f(inode),
298 coda_i2f(dir_inode), (const char *)name, len); 272 coda_i2f(dir_inode), (const char *)name, len);
299
300 if (error) { 273 if (error) {
301 d_drop(de); 274 d_drop(de);
302 goto out; 275 return error;
303 } 276 }
304 277
305 coda_dir_update_mtime(dir_inode); 278 coda_dir_update_mtime(dir_inode);
306 atomic_inc(&inode->i_count); 279 ihold(inode);
307 d_instantiate(de, inode); 280 d_instantiate(de, inode);
308 inc_nlink(inode); 281 inc_nlink(inode);
309 282 return 0;
310out:
311 unlock_kernel();
312 return(error);
313} 283}
314 284
315 285
316static int coda_symlink(struct inode *dir_inode, struct dentry *de, 286static int coda_symlink(struct inode *dir_inode, struct dentry *de,
317 const char *symname) 287 const char *symname)
318{ 288{
319 const char *name = de->d_name.name; 289 const char *name = de->d_name.name;
320 int len = de->d_name.len; 290 int len = de->d_name.len;
321 int symlen; 291 int symlen;
322 int error = 0; 292 int error;
323
324 lock_kernel();
325 293
326 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) { 294 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
327 unlock_kernel();
328 return -EPERM; 295 return -EPERM;
329 }
330 296
331 symlen = strlen(symname); 297 symlen = strlen(symname);
332 if ( symlen > CODA_MAXPATHLEN ) { 298 if (symlen > CODA_MAXPATHLEN)
333 unlock_kernel();
334 return -ENAMETOOLONG; 299 return -ENAMETOOLONG;
335 }
336 300
337 /* 301 /*
338 * This entry is now negative. Since we do not create 302 * This entry is now negative. Since we do not create
@@ -343,10 +307,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
343 symname, symlen); 307 symname, symlen);
344 308
345 /* mtime is no good anymore */ 309 /* mtime is no good anymore */
346 if ( !error ) 310 if (!error)
347 coda_dir_update_mtime(dir_inode); 311 coda_dir_update_mtime(dir_inode);
348 312
349 unlock_kernel();
350 return error; 313 return error;
351} 314}
352 315
@@ -357,17 +320,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
357 const char *name = de->d_name.name; 320 const char *name = de->d_name.name;
358 int len = de->d_name.len; 321 int len = de->d_name.len;
359 322
360 lock_kernel();
361
362 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len); 323 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
363 if ( error ) { 324 if (error)
364 unlock_kernel();
365 return error; 325 return error;
366 }
367 326
368 coda_dir_update_mtime(dir); 327 coda_dir_update_mtime(dir);
369 drop_nlink(de->d_inode); 328 drop_nlink(de->d_inode);
370 unlock_kernel();
371 return 0; 329 return 0;
372} 330}
373 331
@@ -377,8 +335,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
377 int len = de->d_name.len; 335 int len = de->d_name.len;
378 int error; 336 int error;
379 337
380 lock_kernel();
381
382 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); 338 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
383 if (!error) { 339 if (!error) {
384 /* VFS may delete the child */ 340 /* VFS may delete the child */
@@ -389,7 +345,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
389 coda_dir_drop_nlink(dir); 345 coda_dir_drop_nlink(dir);
390 coda_dir_update_mtime(dir); 346 coda_dir_update_mtime(dir);
391 } 347 }
392 unlock_kernel();
393 return error; 348 return error;
394} 349}
395 350
@@ -403,15 +358,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
403 int new_length = new_dentry->d_name.len; 358 int new_length = new_dentry->d_name.len;
404 int error; 359 int error;
405 360
406 lock_kernel();
407
408 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), 361 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
409 coda_i2f(new_dir), old_length, new_length, 362 coda_i2f(new_dir), old_length, new_length,
410 (const char *) old_name, (const char *)new_name); 363 (const char *) old_name, (const char *)new_name);
411 364 if (!error) {
412 if ( !error ) { 365 if (new_dentry->d_inode) {
413 if ( new_dentry->d_inode ) { 366 if (S_ISDIR(new_dentry->d_inode->i_mode)) {
414 if ( S_ISDIR(new_dentry->d_inode->i_mode) ) {
415 coda_dir_drop_nlink(old_dir); 367 coda_dir_drop_nlink(old_dir);
416 coda_dir_inc_nlink(new_dir); 368 coda_dir_inc_nlink(new_dir);
417 } 369 }
@@ -423,8 +375,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
423 coda_flag_inode(new_dir, C_VATTR); 375 coda_flag_inode(new_dir, C_VATTR);
424 } 376 }
425 } 377 }
426 unlock_kernel();
427
428 return error; 378 return error;
429} 379}
430 380
@@ -594,10 +544,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
594 struct inode *inode = de->d_inode; 544 struct inode *inode = de->d_inode;
595 struct coda_inode_info *cii; 545 struct coda_inode_info *cii;
596 546
597 if (!inode) 547 if (!inode || coda_isroot(inode))
598 return 1;
599 lock_kernel();
600 if (coda_isroot(inode))
601 goto out; 548 goto out;
602 if (is_bad_inode(inode)) 549 if (is_bad_inode(inode))
603 goto bad; 550 goto bad;
@@ -617,13 +564,12 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
617 goto out; 564 goto out;
618 565
619 /* clear the flags. */ 566 /* clear the flags. */
567 spin_lock(&cii->c_lock);
620 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 568 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
621 569 spin_unlock(&cii->c_lock);
622bad: 570bad:
623 unlock_kernel();
624 return 0; 571 return 0;
625out: 572out:
626 unlock_kernel();
627 return 1; 573 return 1;
628} 574}
629 575
@@ -656,20 +602,19 @@ static int coda_dentry_delete(struct dentry * dentry)
656int coda_revalidate_inode(struct dentry *dentry) 602int coda_revalidate_inode(struct dentry *dentry)
657{ 603{
658 struct coda_vattr attr; 604 struct coda_vattr attr;
659 int error = 0; 605 int error;
660 int old_mode; 606 int old_mode;
661 ino_t old_ino; 607 ino_t old_ino;
662 struct inode *inode = dentry->d_inode; 608 struct inode *inode = dentry->d_inode;
663 struct coda_inode_info *cii = ITOC(inode); 609 struct coda_inode_info *cii = ITOC(inode);
664 610
665 lock_kernel(); 611 if (!cii->c_flags)
666 if ( !cii->c_flags ) 612 return 0;
667 goto ok;
668 613
669 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) { 614 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
670 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr); 615 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
671 if ( error ) 616 if (error)
672 goto return_bad; 617 return -EIO;
673 618
674 /* this inode may be lost if: 619 /* this inode may be lost if:
675 - it's ino changed 620 - it's ino changed
@@ -688,17 +633,13 @@ int coda_revalidate_inode(struct dentry *dentry)
688 /* the following can happen when a local fid is replaced 633 /* the following can happen when a local fid is replaced
689 with a global one, here we lose and declare the inode bad */ 634 with a global one, here we lose and declare the inode bad */
690 if (inode->i_ino != old_ino) 635 if (inode->i_ino != old_ino)
691 goto return_bad; 636 return -EIO;
692 637
693 coda_flag_inode_children(inode, C_FLUSH); 638 coda_flag_inode_children(inode, C_FLUSH);
639
640 spin_lock(&cii->c_lock);
694 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 641 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
642 spin_unlock(&cii->c_lock);
695 } 643 }
696
697ok:
698 unlock_kernel();
699 return 0; 644 return 0;
700
701return_bad:
702 unlock_kernel();
703 return -EIO;
704} 645}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ad3cd2abeeb4..c8b50ba4366a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -15,7 +15,7 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/cred.h> 16#include <linux/cred.h>
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/spinlock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
@@ -109,19 +109,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
109 109
110 coda_inode = coda_file->f_path.dentry->d_inode; 110 coda_inode = coda_file->f_path.dentry->d_inode;
111 host_inode = host_file->f_path.dentry->d_inode; 111 host_inode = host_file->f_path.dentry->d_inode;
112
113 cii = ITOC(coda_inode);
114 spin_lock(&cii->c_lock);
112 coda_file->f_mapping = host_file->f_mapping; 115 coda_file->f_mapping = host_file->f_mapping;
113 if (coda_inode->i_mapping == &coda_inode->i_data) 116 if (coda_inode->i_mapping == &coda_inode->i_data)
114 coda_inode->i_mapping = host_inode->i_mapping; 117 coda_inode->i_mapping = host_inode->i_mapping;
115 118
116 /* only allow additional mmaps as long as userspace isn't changing 119 /* only allow additional mmaps as long as userspace isn't changing
117 * the container file on us! */ 120 * the container file on us! */
118 else if (coda_inode->i_mapping != host_inode->i_mapping) 121 else if (coda_inode->i_mapping != host_inode->i_mapping) {
122 spin_unlock(&cii->c_lock);
119 return -EBUSY; 123 return -EBUSY;
124 }
120 125
121 /* keep track of how often the coda_inode/host_file has been mmapped */ 126 /* keep track of how often the coda_inode/host_file has been mmapped */
122 cii = ITOC(coda_inode);
123 cii->c_mapcount++; 127 cii->c_mapcount++;
124 cfi->cfi_mapcount++; 128 cfi->cfi_mapcount++;
129 spin_unlock(&cii->c_lock);
125 130
126 return host_file->f_op->mmap(host_file, vma); 131 return host_file->f_op->mmap(host_file, vma);
127} 132}
@@ -138,8 +143,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
138 if (!cfi) 143 if (!cfi)
139 return -ENOMEM; 144 return -ENOMEM;
140 145
141 lock_kernel();
142
143 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, 146 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
144 &host_file); 147 &host_file);
145 if (!host_file) 148 if (!host_file)
@@ -147,7 +150,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
147 150
148 if (error) { 151 if (error) {
149 kfree(cfi); 152 kfree(cfi);
150 unlock_kernel();
151 return error; 153 return error;
152 } 154 }
153 155
@@ -159,8 +161,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
159 161
160 BUG_ON(coda_file->private_data != NULL); 162 BUG_ON(coda_file->private_data != NULL);
161 coda_file->private_data = cfi; 163 coda_file->private_data = cfi;
162
163 unlock_kernel();
164 return 0; 164 return 0;
165} 165}
166 166
@@ -171,9 +171,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
171 struct coda_file_info *cfi; 171 struct coda_file_info *cfi;
172 struct coda_inode_info *cii; 172 struct coda_inode_info *cii;
173 struct inode *host_inode; 173 struct inode *host_inode;
174 int err = 0; 174 int err;
175
176 lock_kernel();
177 175
178 cfi = CODA_FTOC(coda_file); 176 cfi = CODA_FTOC(coda_file);
179 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 177 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -185,18 +183,18 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
185 cii = ITOC(coda_inode); 183 cii = ITOC(coda_inode);
186 184
187 /* did we mmap this file? */ 185 /* did we mmap this file? */
186 spin_lock(&cii->c_lock);
188 if (coda_inode->i_mapping == &host_inode->i_data) { 187 if (coda_inode->i_mapping == &host_inode->i_data) {
189 cii->c_mapcount -= cfi->cfi_mapcount; 188 cii->c_mapcount -= cfi->cfi_mapcount;
190 if (!cii->c_mapcount) 189 if (!cii->c_mapcount)
191 coda_inode->i_mapping = &coda_inode->i_data; 190 coda_inode->i_mapping = &coda_inode->i_data;
192 } 191 }
192 spin_unlock(&cii->c_lock);
193 193
194 fput(cfi->cfi_container); 194 fput(cfi->cfi_container);
195 kfree(coda_file->private_data); 195 kfree(coda_file->private_data);
196 coda_file->private_data = NULL; 196 coda_file->private_data = NULL;
197 197
198 unlock_kernel();
199
200 /* VFS fput ignores the return value from file_operations->release, so 198 /* VFS fput ignores the return value from file_operations->release, so
201 * there is no use returning an error here */ 199 * there is no use returning an error here */
202 return 0; 200 return 0;
@@ -207,7 +205,7 @@ int coda_fsync(struct file *coda_file, int datasync)
207 struct file *host_file; 205 struct file *host_file;
208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode; 206 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
209 struct coda_file_info *cfi; 207 struct coda_file_info *cfi;
210 int err = 0; 208 int err;
211 209
212 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) || 210 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
213 S_ISLNK(coda_inode->i_mode))) 211 S_ISLNK(coda_inode->i_mode)))
@@ -218,11 +216,8 @@ int coda_fsync(struct file *coda_file, int datasync)
218 host_file = cfi->cfi_container; 216 host_file = cfi->cfi_container;
219 217
220 err = vfs_fsync(host_file, datasync); 218 err = vfs_fsync(host_file, datasync);
221 if ( !err && !datasync ) { 219 if (!err && !datasync)
222 lock_kernel();
223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 220 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
224 unlock_kernel();
225 }
226 221
227 return err; 222 return err;
228} 223}
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6526e6f21ecf..5ea57c8c7f97 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -15,7 +15,8 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/unistd.h> 17#include <linux/unistd.h>
18#include <linux/smp_lock.h> 18#include <linux/mutex.h>
19#include <linux/spinlock.h>
19#include <linux/file.h> 20#include <linux/file.h>
20#include <linux/vfs.h> 21#include <linux/vfs.h>
21#include <linux/slab.h> 22#include <linux/slab.h>
@@ -51,6 +52,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
51 ei->c_flags = 0; 52 ei->c_flags = 0;
52 ei->c_uid = 0; 53 ei->c_uid = 0;
53 ei->c_cached_perm = 0; 54 ei->c_cached_perm = 0;
55 spin_lock_init(&ei->c_lock);
54 return &ei->vfs_inode; 56 return &ei->vfs_inode;
55} 57}
56 58
@@ -143,7 +145,7 @@ static int get_device_index(struct coda_mount_data *data)
143static int coda_fill_super(struct super_block *sb, void *data, int silent) 145static int coda_fill_super(struct super_block *sb, void *data, int silent)
144{ 146{
145 struct inode *root = NULL; 147 struct inode *root = NULL;
146 struct venus_comm *vc = NULL; 148 struct venus_comm *vc;
147 struct CodaFid fid; 149 struct CodaFid fid;
148 int error; 150 int error;
149 int idx; 151 int idx;
@@ -157,21 +159,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
157 printk(KERN_INFO "coda_read_super: device index: %i\n", idx); 159 printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
158 160
159 vc = &coda_comms[idx]; 161 vc = &coda_comms[idx];
162 mutex_lock(&vc->vc_mutex);
163
160 if (!vc->vc_inuse) { 164 if (!vc->vc_inuse) {
161 printk("coda_read_super: No pseudo device\n"); 165 printk("coda_read_super: No pseudo device\n");
162 return -EINVAL; 166 error = -EINVAL;
167 goto unlock_out;
163 } 168 }
164 169
165 if ( vc->vc_sb ) { 170 if (vc->vc_sb) {
166 printk("coda_read_super: Device already mounted\n"); 171 printk("coda_read_super: Device already mounted\n");
167 return -EBUSY; 172 error = -EBUSY;
173 goto unlock_out;
168 } 174 }
169 175
170 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); 176 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
171 if (error) 177 if (error)
172 goto bdi_err; 178 goto unlock_out;
173 179
174 vc->vc_sb = sb; 180 vc->vc_sb = sb;
181 mutex_unlock(&vc->vc_mutex);
175 182
176 sb->s_fs_info = vc; 183 sb->s_fs_info = vc;
177 sb->s_flags |= MS_NOATIME; 184 sb->s_flags |= MS_NOATIME;
@@ -200,26 +207,33 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
200 printk("coda_read_super: rootinode is %ld dev %s\n", 207 printk("coda_read_super: rootinode is %ld dev %s\n",
201 root->i_ino, root->i_sb->s_id); 208 root->i_ino, root->i_sb->s_id);
202 sb->s_root = d_alloc_root(root); 209 sb->s_root = d_alloc_root(root);
203 if (!sb->s_root) 210 if (!sb->s_root) {
211 error = -EINVAL;
204 goto error; 212 goto error;
205 return 0; 213 }
214 return 0;
206 215
207 error: 216error:
208 bdi_destroy(&vc->bdi);
209 bdi_err:
210 if (root) 217 if (root)
211 iput(root); 218 iput(root);
212 if (vc)
213 vc->vc_sb = NULL;
214 219
215 return -EINVAL; 220 mutex_lock(&vc->vc_mutex);
221 bdi_destroy(&vc->bdi);
222 vc->vc_sb = NULL;
223 sb->s_fs_info = NULL;
224unlock_out:
225 mutex_unlock(&vc->vc_mutex);
226 return error;
216} 227}
217 228
218static void coda_put_super(struct super_block *sb) 229static void coda_put_super(struct super_block *sb)
219{ 230{
220 bdi_destroy(&coda_vcp(sb)->bdi); 231 struct venus_comm *vcp = coda_vcp(sb);
221 coda_vcp(sb)->vc_sb = NULL; 232 mutex_lock(&vcp->vc_mutex);
233 bdi_destroy(&vcp->bdi);
234 vcp->vc_sb = NULL;
222 sb->s_fs_info = NULL; 235 sb->s_fs_info = NULL;
236 mutex_unlock(&vcp->vc_mutex);
223 237
224 printk("Coda: Bye bye.\n"); 238 printk("Coda: Bye bye.\n");
225} 239}
@@ -245,8 +259,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
245 struct coda_vattr vattr; 259 struct coda_vattr vattr;
246 int error; 260 int error;
247 261
248 lock_kernel();
249
250 memset(&vattr, 0, sizeof(vattr)); 262 memset(&vattr, 0, sizeof(vattr));
251 263
252 inode->i_ctime = CURRENT_TIME_SEC; 264 inode->i_ctime = CURRENT_TIME_SEC;
@@ -256,13 +268,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
256 /* Venus is responsible for truncating the container-file!!! */ 268 /* Venus is responsible for truncating the container-file!!! */
257 error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr); 269 error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
258 270
259 if ( !error ) { 271 if (!error) {
260 coda_vattr_to_iattr(inode, &vattr); 272 coda_vattr_to_iattr(inode, &vattr);
261 coda_cache_clear_inode(inode); 273 coda_cache_clear_inode(inode);
262 } 274 }
263
264 unlock_kernel();
265
266 return error; 275 return error;
267} 276}
268 277
@@ -276,12 +285,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
276{ 285{
277 int error; 286 int error;
278 287
279 lock_kernel();
280
281 error = venus_statfs(dentry, buf); 288 error = venus_statfs(dentry, buf);
282 289
283 unlock_kernel();
284
285 if (error) { 290 if (error) {
286 /* fake something like AFS does */ 291 /* fake something like AFS does */
287 buf->f_blocks = 9000000; 292 buf->f_blocks = 9000000;
@@ -301,16 +306,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
301 306
302/* init_coda: used by filesystems.c to register coda */ 307/* init_coda: used by filesystems.c to register coda */
303 308
304static int coda_get_sb(struct file_system_type *fs_type, 309static struct dentry *coda_mount(struct file_system_type *fs_type,
305 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 310 int flags, const char *dev_name, void *data)
306{ 311{
307 return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt); 312 return mount_nodev(fs_type, flags, data, coda_fill_super);
308} 313}
309 314
310struct file_system_type coda_fs_type = { 315struct file_system_type coda_fs_type = {
311 .owner = THIS_MODULE, 316 .owner = THIS_MODULE,
312 .name = "coda", 317 .name = "coda",
313 .get_sb = coda_get_sb, 318 .mount = coda_mount,
314 .kill_sb = kill_anon_super, 319 .kill_sb = kill_anon_super,
315 .fs_flags = FS_BINARY_MOUNTDATA, 320 .fs_flags = FS_BINARY_MOUNTDATA,
316}; 321};
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ca25d96d45c9..2fd89b5c5c7b 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -23,8 +23,6 @@
23#include <linux/coda_fs_i.h> 23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26#include <linux/smp_lock.h>
27
28/* pioctl ops */ 26/* pioctl ops */
29static int coda_ioctl_permission(struct inode *inode, int mask); 27static int coda_ioctl_permission(struct inode *inode, int mask);
30static long coda_pioctl(struct file *filp, unsigned int cmd, 28static long coda_pioctl(struct file *filp, unsigned int cmd,
@@ -39,6 +37,7 @@ const struct inode_operations coda_ioctl_inode_operations = {
39const struct file_operations coda_ioctl_operations = { 37const struct file_operations coda_ioctl_operations = {
40 .owner = THIS_MODULE, 38 .owner = THIS_MODULE,
41 .unlocked_ioctl = coda_pioctl, 39 .unlocked_ioctl = coda_pioctl,
40 .llseek = noop_llseek,
42}; 41};
43 42
44/* the coda pioctl inode ops */ 43/* the coda pioctl inode ops */
@@ -57,13 +56,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
57 struct inode *target_inode = NULL; 56 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp; 57 struct coda_inode_info *cnp;
59 58
60 lock_kernel();
61
62 /* get the Pioctl data arguments from user space */ 59 /* get the Pioctl data arguments from user space */
63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 60 if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
64 error = -EINVAL; 61 return -EINVAL;
65 goto out;
66 }
67 62
68 /* 63 /*
69 * Look up the pathname. Note that the pathname is in 64 * Look up the pathname. Note that the pathname is in
@@ -75,13 +70,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
75 error = user_lpath(data.path, &path); 70 error = user_lpath(data.path, &path);
76 71
77 if (error) 72 if (error)
78 goto out; 73 return error;
79 else 74
80 target_inode = path.dentry->d_inode; 75 target_inode = path.dentry->d_inode;
81 76
82 /* return if it is not a Coda inode */ 77 /* return if it is not a Coda inode */
83 if (target_inode->i_sb != inode->i_sb) { 78 if (target_inode->i_sb != inode->i_sb) {
84 path_put(&path);
85 error = -EINVAL; 79 error = -EINVAL;
86 goto out; 80 goto out;
87 } 81 }
@@ -90,10 +84,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
90 cnp = ITOC(target_inode); 84 cnp = ITOC(target_inode);
91 85
92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 86 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
93
94 path_put(&path);
95
96out: 87out:
97 unlock_kernel(); 88 path_put(&path);
98 return error; 89 return error;
99} 90}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 116af7546cf0..62647a8595e4 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -35,7 +35,7 @@
35#include <linux/poll.h> 35#include <linux/poll.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/smp_lock.h> 38#include <linux/mutex.h>
39#include <linux/device.h> 39#include <linux/device.h>
40#include <asm/io.h> 40#include <asm/io.h>
41#include <asm/system.h> 41#include <asm/system.h>
@@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
67 unsigned int mask = POLLOUT | POLLWRNORM; 67 unsigned int mask = POLLOUT | POLLWRNORM;
68 68
69 poll_wait(file, &vcp->vc_waitq, wait); 69 poll_wait(file, &vcp->vc_waitq, wait);
70 mutex_lock(&vcp->vc_mutex);
70 if (!list_empty(&vcp->vc_pending)) 71 if (!list_empty(&vcp->vc_pending))
71 mask |= POLLIN | POLLRDNORM; 72 mask |= POLLIN | POLLRDNORM;
73 mutex_unlock(&vcp->vc_mutex);
72 74
73 return mask; 75 return mask;
74} 76}
@@ -108,16 +110,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
108 return -EFAULT; 110 return -EFAULT;
109 111
110 if (DOWNCALL(hdr.opcode)) { 112 if (DOWNCALL(hdr.opcode)) {
111 struct super_block *sb = NULL; 113 union outputArgs *dcbuf;
112 union outputArgs *dcbuf;
113 int size = sizeof(*dcbuf); 114 int size = sizeof(*dcbuf);
114 115
115 sb = vcp->vc_sb;
116 if ( !sb ) {
117 count = nbytes;
118 goto out;
119 }
120
121 if ( nbytes < sizeof(struct coda_out_hdr) ) { 116 if ( nbytes < sizeof(struct coda_out_hdr) ) {
122 printk("coda_downcall opc %d uniq %d, not enough!\n", 117 printk("coda_downcall opc %d uniq %d, not enough!\n",
123 hdr.opcode, hdr.unique); 118 hdr.opcode, hdr.unique);
@@ -137,9 +132,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
137 } 132 }
138 133
139 /* what downcall errors does Venus handle ? */ 134 /* what downcall errors does Venus handle ? */
140 lock_kernel(); 135 error = coda_downcall(vcp, hdr.opcode, dcbuf);
141 error = coda_downcall(hdr.opcode, dcbuf, sb);
142 unlock_kernel();
143 136
144 CODA_FREE(dcbuf, nbytes); 137 CODA_FREE(dcbuf, nbytes);
145 if (error) { 138 if (error) {
@@ -152,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
152 } 145 }
153 146
154 /* Look for the message on the processing queue. */ 147 /* Look for the message on the processing queue. */
155 lock_kernel(); 148 mutex_lock(&vcp->vc_mutex);
156 list_for_each(lh, &vcp->vc_processing) { 149 list_for_each(lh, &vcp->vc_processing) {
157 tmp = list_entry(lh, struct upc_req , uc_chain); 150 tmp = list_entry(lh, struct upc_req , uc_chain);
158 if (tmp->uc_unique == hdr.unique) { 151 if (tmp->uc_unique == hdr.unique) {
@@ -161,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
161 break; 154 break;
162 } 155 }
163 } 156 }
164 unlock_kernel(); 157 mutex_unlock(&vcp->vc_mutex);
165 158
166 if (!req) { 159 if (!req) {
167 printk("psdev_write: msg (%d, %d) not found\n", 160 printk("psdev_write: msg (%d, %d) not found\n",
@@ -216,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
216 if (nbytes == 0) 209 if (nbytes == 0)
217 return 0; 210 return 0;
218 211
219 lock_kernel(); 212 mutex_lock(&vcp->vc_mutex);
220 213
221 add_wait_queue(&vcp->vc_waitq, &wait); 214 add_wait_queue(&vcp->vc_waitq, &wait);
222 set_current_state(TASK_INTERRUPTIBLE); 215 set_current_state(TASK_INTERRUPTIBLE);
@@ -230,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
230 retval = -ERESTARTSYS; 223 retval = -ERESTARTSYS;
231 break; 224 break;
232 } 225 }
226 mutex_unlock(&vcp->vc_mutex);
233 schedule(); 227 schedule();
228 mutex_lock(&vcp->vc_mutex);
234 } 229 }
235 230
236 set_current_state(TASK_RUNNING); 231 set_current_state(TASK_RUNNING);
@@ -263,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
263 CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); 258 CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
264 kfree(req); 259 kfree(req);
265out: 260out:
266 unlock_kernel(); 261 mutex_unlock(&vcp->vc_mutex);
267 return (count ? count : retval); 262 return (count ? count : retval);
268} 263}
269 264
@@ -276,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
276 if (idx < 0 || idx >= MAX_CODADEVS) 271 if (idx < 0 || idx >= MAX_CODADEVS)
277 return -ENODEV; 272 return -ENODEV;
278 273
279 lock_kernel();
280
281 err = -EBUSY; 274 err = -EBUSY;
282 vcp = &coda_comms[idx]; 275 vcp = &coda_comms[idx];
276 mutex_lock(&vcp->vc_mutex);
277
283 if (!vcp->vc_inuse) { 278 if (!vcp->vc_inuse) {
284 vcp->vc_inuse++; 279 vcp->vc_inuse++;
285 280
@@ -293,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
293 err = 0; 288 err = 0;
294 } 289 }
295 290
296 unlock_kernel(); 291 mutex_unlock(&vcp->vc_mutex);
297 return err; 292 return err;
298} 293}
299 294
@@ -308,7 +303,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
308 return -1; 303 return -1;
309 } 304 }
310 305
311 lock_kernel(); 306 mutex_lock(&vcp->vc_mutex);
312 307
313 /* Wakeup clients so they can return. */ 308 /* Wakeup clients so they can return. */
314 list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) { 309 list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
@@ -333,7 +328,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
333 328
334 file->private_data = NULL; 329 file->private_data = NULL;
335 vcp->vc_inuse--; 330 vcp->vc_inuse--;
336 unlock_kernel(); 331 mutex_unlock(&vcp->vc_mutex);
337 return 0; 332 return 0;
338} 333}
339 334
@@ -346,6 +341,7 @@ static const struct file_operations coda_psdev_fops = {
346 .unlocked_ioctl = coda_psdev_ioctl, 341 .unlocked_ioctl = coda_psdev_ioctl,
347 .open = coda_psdev_open, 342 .open = coda_psdev_open,
348 .release = coda_psdev_release, 343 .release = coda_psdev_release,
344 .llseek = noop_llseek,
349}; 345};
350 346
351static int init_coda_psdev(void) 347static int init_coda_psdev(void)
@@ -361,9 +357,11 @@ static int init_coda_psdev(void)
361 err = PTR_ERR(coda_psdev_class); 357 err = PTR_ERR(coda_psdev_class);
362 goto out_chrdev; 358 goto out_chrdev;
363 } 359 }
364 for (i = 0; i < MAX_CODADEVS; i++) 360 for (i = 0; i < MAX_CODADEVS; i++) {
361 mutex_init(&(&coda_comms[i])->vc_mutex);
365 device_create(coda_psdev_class, NULL, 362 device_create(coda_psdev_class, NULL,
366 MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); 363 MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
364 }
367 coda_sysctl_init(); 365 coda_sysctl_init();
368 goto out; 366 goto out;
369 367
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 4513b7258458..af78f007a2b0 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -14,7 +14,6 @@
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18 17
19#include <linux/coda.h> 18#include <linux/coda.h>
20#include <linux/coda_linux.h> 19#include <linux/coda_linux.h>
@@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page)
29 unsigned int len = PAGE_SIZE; 28 unsigned int len = PAGE_SIZE;
30 char *p = kmap(page); 29 char *p = kmap(page);
31 30
32 lock_kernel();
33 cii = ITOC(inode); 31 cii = ITOC(inode);
34 32
35 error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); 33 error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
36 unlock_kernel();
37 if (error) 34 if (error)
38 goto fail; 35 goto fail;
39 SetPageUptodate(page); 36 SetPageUptodate(page);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b8893ab6f9e6..c3563cab9758 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,6 +27,7 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/mutex.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
@@ -606,7 +607,8 @@ static void coda_unblock_signals(sigset_t *old)
606 (r)->uc_opcode != CODA_RELEASE) || \ 607 (r)->uc_opcode != CODA_RELEASE) || \
607 (r)->uc_flags & CODA_REQ_READ)) 608 (r)->uc_flags & CODA_REQ_READ))
608 609
609static inline void coda_waitfor_upcall(struct upc_req *req) 610static inline void coda_waitfor_upcall(struct venus_comm *vcp,
611 struct upc_req *req)
610{ 612{
611 DECLARE_WAITQUEUE(wait, current); 613 DECLARE_WAITQUEUE(wait, current);
612 unsigned long timeout = jiffies + coda_timeout * HZ; 614 unsigned long timeout = jiffies + coda_timeout * HZ;
@@ -639,10 +641,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
639 break; 641 break;
640 } 642 }
641 643
644 mutex_unlock(&vcp->vc_mutex);
642 if (blocked) 645 if (blocked)
643 schedule_timeout(HZ); 646 schedule_timeout(HZ);
644 else 647 else
645 schedule(); 648 schedule();
649 mutex_lock(&vcp->vc_mutex);
646 } 650 }
647 if (blocked) 651 if (blocked)
648 coda_unblock_signals(&old); 652 coda_unblock_signals(&old);
@@ -667,18 +671,23 @@ static int coda_upcall(struct venus_comm *vcp,
667{ 671{
668 union outputArgs *out; 672 union outputArgs *out;
669 union inputArgs *sig_inputArgs; 673 union inputArgs *sig_inputArgs;
670 struct upc_req *req, *sig_req; 674 struct upc_req *req = NULL, *sig_req;
671 int error = 0; 675 int error;
676
677 mutex_lock(&vcp->vc_mutex);
672 678
673 if (!vcp->vc_inuse) { 679 if (!vcp->vc_inuse) {
674 printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n"); 680 printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
675 return -ENXIO; 681 error = -ENXIO;
682 goto exit;
676 } 683 }
677 684
678 /* Format the request message. */ 685 /* Format the request message. */
679 req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); 686 req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
680 if (!req) 687 if (!req) {
681 return -ENOMEM; 688 error = -ENOMEM;
689 goto exit;
690 }
682 691
683 req->uc_data = (void *)buffer; 692 req->uc_data = (void *)buffer;
684 req->uc_flags = 0; 693 req->uc_flags = 0;
@@ -705,7 +714,7 @@ static int coda_upcall(struct venus_comm *vcp,
705 * ENODEV. */ 714 * ENODEV. */
706 715
707 /* Go to sleep. Wake up on signals only after the timeout. */ 716 /* Go to sleep. Wake up on signals only after the timeout. */
708 coda_waitfor_upcall(req); 717 coda_waitfor_upcall(vcp, req);
709 718
710 /* Op went through, interrupt or not... */ 719 /* Op went through, interrupt or not... */
711 if (req->uc_flags & CODA_REQ_WRITE) { 720 if (req->uc_flags & CODA_REQ_WRITE) {
@@ -759,6 +768,7 @@ static int coda_upcall(struct venus_comm *vcp,
759 768
760exit: 769exit:
761 kfree(req); 770 kfree(req);
771 mutex_unlock(&vcp->vc_mutex);
762 return error; 772 return error;
763} 773}
764 774
@@ -796,21 +806,24 @@ exit:
796 * 806 *
797 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ 807 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
798 808
799int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb) 809int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
800{ 810{
801 struct inode *inode = NULL; 811 struct inode *inode = NULL;
802 struct CodaFid *fid, *newfid; 812 struct CodaFid *fid = NULL, *newfid;
813 struct super_block *sb;
803 814
804 /* Handle invalidation requests. */ 815 /* Handle invalidation requests. */
805 if ( !sb || !sb->s_root) 816 mutex_lock(&vcp->vc_mutex);
806 return 0; 817 sb = vcp->vc_sb;
818 if (!sb || !sb->s_root)
819 goto unlock_out;
807 820
808 switch (opcode) { 821 switch (opcode) {
809 case CODA_FLUSH: 822 case CODA_FLUSH:
810 coda_cache_clear_all(sb); 823 coda_cache_clear_all(sb);
811 shrink_dcache_sb(sb); 824 shrink_dcache_sb(sb);
812 if (sb->s_root->d_inode) 825 if (sb->s_root->d_inode)
813 coda_flag_inode(sb->s_root->d_inode, C_FLUSH); 826 coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
814 break; 827 break;
815 828
816 case CODA_PURGEUSER: 829 case CODA_PURGEUSER:
@@ -819,45 +832,53 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
819 832
820 case CODA_ZAPDIR: 833 case CODA_ZAPDIR:
821 fid = &out->coda_zapdir.CodaFid; 834 fid = &out->coda_zapdir.CodaFid;
822 inode = coda_fid_to_inode(fid, sb);
823 if (inode) {
824 coda_flag_inode_children(inode, C_PURGE);
825 coda_flag_inode(inode, C_VATTR);
826 }
827 break; 835 break;
828 836
829 case CODA_ZAPFILE: 837 case CODA_ZAPFILE:
830 fid = &out->coda_zapfile.CodaFid; 838 fid = &out->coda_zapfile.CodaFid;
831 inode = coda_fid_to_inode(fid, sb);
832 if (inode)
833 coda_flag_inode(inode, C_VATTR);
834 break; 839 break;
835 840
836 case CODA_PURGEFID: 841 case CODA_PURGEFID:
837 fid = &out->coda_purgefid.CodaFid; 842 fid = &out->coda_purgefid.CodaFid;
843 break;
844
845 case CODA_REPLACE:
846 fid = &out->coda_replace.OldFid;
847 break;
848 }
849 if (fid)
838 inode = coda_fid_to_inode(fid, sb); 850 inode = coda_fid_to_inode(fid, sb);
839 if (inode) {
840 coda_flag_inode_children(inode, C_PURGE);
841 851
842 /* catch the dentries later if some are still busy */ 852unlock_out:
843 coda_flag_inode(inode, C_PURGE); 853 mutex_unlock(&vcp->vc_mutex);
844 d_prune_aliases(inode);
845 854
846 } 855 if (!inode)
856 return 0;
857
858 switch (opcode) {
859 case CODA_ZAPDIR:
860 coda_flag_inode_children(inode, C_PURGE);
861 coda_flag_inode(inode, C_VATTR);
862 break;
863
864 case CODA_ZAPFILE:
865 coda_flag_inode(inode, C_VATTR);
866 break;
867
868 case CODA_PURGEFID:
869 coda_flag_inode_children(inode, C_PURGE);
870
871 /* catch the dentries later if some are still busy */
872 coda_flag_inode(inode, C_PURGE);
873 d_prune_aliases(inode);
847 break; 874 break;
848 875
849 case CODA_REPLACE: 876 case CODA_REPLACE:
850 fid = &out->coda_replace.OldFid;
851 newfid = &out->coda_replace.NewFid; 877 newfid = &out->coda_replace.NewFid;
852 inode = coda_fid_to_inode(fid, sb); 878 coda_replace_fid(inode, fid, newfid);
853 if (inode)
854 coda_replace_fid(inode, fid, newfid);
855 break; 879 break;
856 } 880 }
857 881 iput(inode);
858 if (inode)
859 iput(inode);
860
861 return 0; 882 return 0;
862} 883}
863 884
diff --git a/fs/compat.c b/fs/compat.c
index 0644a154672b..c580c322fa6b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -29,8 +29,6 @@
29#include <linux/vfs.h> 29#include <linux/vfs.h>
30#include <linux/ioctl.h> 30#include <linux/ioctl.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/smb.h>
33#include <linux/smb_mount.h>
34#include <linux/ncp_mount.h> 32#include <linux/ncp_mount.h>
35#include <linux/nfs4_mount.h> 33#include <linux/nfs4_mount.h>
36#include <linux/syscalls.h> 34#include <linux/syscalls.h>
@@ -51,6 +49,7 @@
51#include <linux/eventpoll.h> 49#include <linux/eventpoll.h>
52#include <linux/fs_struct.h> 50#include <linux/fs_struct.h>
53#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/pagemap.h>
54 53
55#include <asm/uaccess.h> 54#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
@@ -608,14 +607,14 @@ ssize_t compat_rw_copy_check_uvector(int type,
608 /* 607 /*
609 * Single unix specification: 608 * Single unix specification:
610 * We should -EINVAL if an element length is not >= 0 and fitting an 609 * We should -EINVAL if an element length is not >= 0 and fitting an
611 * ssize_t. The total length is fitting an ssize_t 610 * ssize_t.
612 * 611 *
613 * Be careful here because iov_len is a size_t not an ssize_t 612 * In Linux, the total length is limited to MAX_RW_COUNT, there is
613 * no overflow possibility.
614 */ 614 */
615 tot_len = 0; 615 tot_len = 0;
616 ret = -EINVAL; 616 ret = -EINVAL;
617 for (seg = 0; seg < nr_segs; seg++) { 617 for (seg = 0; seg < nr_segs; seg++) {
618 compat_ssize_t tmp = tot_len;
619 compat_uptr_t buf; 618 compat_uptr_t buf;
620 compat_ssize_t len; 619 compat_ssize_t len;
621 620
@@ -626,13 +625,13 @@ ssize_t compat_rw_copy_check_uvector(int type,
626 } 625 }
627 if (len < 0) /* size_t not fitting in compat_ssize_t .. */ 626 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
628 goto out; 627 goto out;
629 tot_len += len;
630 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
631 goto out;
632 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { 628 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
633 ret = -EFAULT; 629 ret = -EFAULT;
634 goto out; 630 goto out;
635 } 631 }
632 if (len > MAX_RW_COUNT - tot_len)
633 len = MAX_RW_COUNT - tot_len;
634 tot_len += len;
636 iov->iov_base = compat_ptr(buf); 635 iov->iov_base = compat_ptr(buf);
637 iov->iov_len = (compat_size_t) len; 636 iov->iov_len = (compat_size_t) len;
638 uvector++; 637 uvector++;
@@ -745,30 +744,6 @@ static void *do_ncp_super_data_conv(void *raw_data)
745 return raw_data; 744 return raw_data;
746} 745}
747 746
748struct compat_smb_mount_data {
749 compat_int_t version;
750 __compat_uid_t mounted_uid;
751 __compat_uid_t uid;
752 __compat_gid_t gid;
753 compat_mode_t file_mode;
754 compat_mode_t dir_mode;
755};
756
757static void *do_smb_super_data_conv(void *raw_data)
758{
759 struct smb_mount_data *s = raw_data;
760 struct compat_smb_mount_data *c_s = raw_data;
761
762 if (c_s->version != SMB_MOUNT_OLDVERSION)
763 goto out;
764 s->dir_mode = c_s->dir_mode;
765 s->file_mode = c_s->file_mode;
766 s->gid = c_s->gid;
767 s->uid = c_s->uid;
768 s->mounted_uid = c_s->mounted_uid;
769 out:
770 return raw_data;
771}
772 747
773struct compat_nfs_string { 748struct compat_nfs_string {
774 compat_uint_t len; 749 compat_uint_t len;
@@ -835,7 +810,6 @@ static int do_nfs4_super_data_conv(void *raw_data)
835 return 0; 810 return 0;
836} 811}
837 812
838#define SMBFS_NAME "smbfs"
839#define NCPFS_NAME "ncpfs" 813#define NCPFS_NAME "ncpfs"
840#define NFS4_NAME "nfs4" 814#define NFS4_NAME "nfs4"
841 815
@@ -870,9 +844,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
870 retval = -EINVAL; 844 retval = -EINVAL;
871 845
872 if (kernel_type && data_page) { 846 if (kernel_type && data_page) {
873 if (!strcmp(kernel_type, SMBFS_NAME)) { 847 if (!strcmp(kernel_type, NCPFS_NAME)) {
874 do_smb_super_data_conv((void *)data_page);
875 } else if (!strcmp(kernel_type, NCPFS_NAME)) {
876 do_ncp_super_data_conv((void *)data_page); 848 do_ncp_super_data_conv((void *)data_page);
877 } else if (!strcmp(kernel_type, NFS4_NAME)) { 849 } else if (!strcmp(kernel_type, NFS4_NAME)) {
878 if (do_nfs4_super_data_conv((void *) data_page)) 850 if (do_nfs4_super_data_conv((void *) data_page))
@@ -1963,7 +1935,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
1963} 1935}
1964#endif /* HAVE_SET_RESTORE_SIGMASK */ 1936#endif /* HAVE_SET_RESTORE_SIGMASK */
1965 1937
1966#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) 1938#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
1967/* Stuff for NFS server syscalls... */ 1939/* Stuff for NFS server syscalls... */
1968struct compat_nfsctl_svc { 1940struct compat_nfsctl_svc {
1969 u16 svc32_port; 1941 u16 svc32_port;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 03e59aa318eb..410ed188faa1 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -46,7 +46,6 @@
46#include <linux/videodev.h> 46#include <linux/videodev.h>
47#include <linux/netdevice.h> 47#include <linux/netdevice.h>
48#include <linux/raw.h> 48#include <linux/raw.h>
49#include <linux/smb_fs.h>
50#include <linux/blkdev.h> 49#include <linux/blkdev.h>
51#include <linux/elevator.h> 50#include <linux/elevator.h>
52#include <linux/rtc.h> 51#include <linux/rtc.h>
@@ -558,25 +557,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
558 557
559#endif /* CONFIG_BLOCK */ 558#endif /* CONFIG_BLOCK */
560 559
561static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
562 compat_uid_t __user *argp)
563{
564 mm_segment_t old_fs = get_fs();
565 __kernel_uid_t kuid;
566 int err;
567
568 cmd = SMB_IOC_GETMOUNTUID;
569
570 set_fs(KERNEL_DS);
571 err = sys_ioctl(fd, cmd, (unsigned long)&kuid);
572 set_fs(old_fs);
573
574 if (err >= 0)
575 err = put_user(kuid, argp);
576
577 return err;
578}
579
580/* Bluetooth ioctls */ 560/* Bluetooth ioctls */
581#define HCIUARTSETPROTO _IOW('U', 200, int) 561#define HCIUARTSETPROTO _IOW('U', 200, int)
582#define HCIUARTGETPROTO _IOR('U', 201, int) 562#define HCIUARTGETPROTO _IOR('U', 201, int)
@@ -599,69 +579,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
599#define HIDPGETCONNLIST _IOR('H', 210, int) 579#define HIDPGETCONNLIST _IOR('H', 210, int)
600#define HIDPGETCONNINFO _IOR('H', 211, int) 580#define HIDPGETCONNINFO _IOR('H', 211, int)
601 581
602#ifdef CONFIG_BLOCK
603struct raw32_config_request
604{
605 compat_int_t raw_minor;
606 __u64 block_major;
607 __u64 block_minor;
608} __attribute__((packed));
609
610static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
611{
612 int ret;
613
614 if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
615 return -EFAULT;
616
617 ret = __get_user(req->raw_minor, &user_req->raw_minor);
618 ret |= __get_user(req->block_major, &user_req->block_major);
619 ret |= __get_user(req->block_minor, &user_req->block_minor);
620
621 return ret ? -EFAULT : 0;
622}
623
624static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
625{
626 int ret;
627
628 if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
629 return -EFAULT;
630
631 ret = __put_user(req->raw_minor, &user_req->raw_minor);
632 ret |= __put_user(req->block_major, &user_req->block_major);
633 ret |= __put_user(req->block_minor, &user_req->block_minor);
634
635 return ret ? -EFAULT : 0;
636}
637
638static int raw_ioctl(unsigned fd, unsigned cmd,
639 struct raw32_config_request __user *user_req)
640{
641 int ret;
642
643 switch (cmd) {
644 case RAW_SETBIND:
645 default: { /* RAW_GETBIND */
646 struct raw_config_request req;
647 mm_segment_t oldfs = get_fs();
648
649 if ((ret = get_raw32_request(&req, user_req)))
650 return ret;
651
652 set_fs(KERNEL_DS);
653 ret = sys_ioctl(fd,cmd,(unsigned long)&req);
654 set_fs(oldfs);
655
656 if ((!ret) && (cmd == RAW_GETBIND)) {
657 ret = set_raw32_request(&req, user_req);
658 }
659 break;
660 }
661 }
662 return ret;
663}
664#endif /* CONFIG_BLOCK */
665 582
666struct serial_struct32 { 583struct serial_struct32 {
667 compat_int_t type; 584 compat_int_t type;
@@ -1265,8 +1182,6 @@ COMPATIBLE_IOCTL(OSS_GETVERSION)
1265/* Raw devices */ 1182/* Raw devices */
1266COMPATIBLE_IOCTL(RAW_SETBIND) 1183COMPATIBLE_IOCTL(RAW_SETBIND)
1267COMPATIBLE_IOCTL(RAW_GETBIND) 1184COMPATIBLE_IOCTL(RAW_GETBIND)
1268/* SMB ioctls which do not need any translations */
1269COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
1270/* Watchdog */ 1185/* Watchdog */
1271COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) 1186COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
1272COMPATIBLE_IOCTL(WDIOC_GETSTATUS) 1187COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -1523,15 +1438,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1523 case MTIOCGET32: 1438 case MTIOCGET32:
1524 case MTIOCPOS32: 1439 case MTIOCPOS32:
1525 return mt_ioctl_trans(fd, cmd, argp); 1440 return mt_ioctl_trans(fd, cmd, argp);
1526 /* Raw devices */
1527 case RAW_SETBIND:
1528 case RAW_GETBIND:
1529 return raw_ioctl(fd, cmd, argp);
1530#endif 1441#endif
1531 /* One SMB ioctl needs translations. */
1532#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
1533 case SMB_IOC_GETMOUNTUID_32:
1534 return do_smb_getmountuid(fd, cmd, argp);
1535 /* Serial */ 1442 /* Serial */
1536 case TIOCGSERIAL: 1443 case TIOCGSERIAL:
1537 case TIOCSSERIAL: 1444 case TIOCSSERIAL:
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6a..253476d78ed8 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
135{ 135{
136 struct inode * inode = new_inode(configfs_sb); 136 struct inode * inode = new_inode(configfs_sb);
137 if (inode) { 137 if (inode) {
138 inode->i_ino = get_next_ino();
138 inode->i_mapping->a_ops = &configfs_aops; 139 inode->i_mapping->a_ops = &configfs_aops;
139 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 140 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
140 inode->i_op = &configfs_inode_operations; 141 inode->i_op = &configfs_inode_operations;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8c8d64230c2d..7d3607febe1c 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -104,16 +104,16 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
104 return 0; 104 return 0;
105} 105}
106 106
107static int configfs_get_sb(struct file_system_type *fs_type, 107static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
108 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 108 int flags, const char *dev_name, void *data)
109{ 109{
110 return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt); 110 return mount_single(fs_type, flags, data, configfs_fill_super);
111} 111}
112 112
113static struct file_system_type configfs_fs_type = { 113static struct file_system_type configfs_fs_type = {
114 .owner = THIS_MODULE, 114 .owner = THIS_MODULE,
115 .name = "configfs", 115 .name = "configfs",
116 .get_sb = configfs_get_sb, 116 .mount = configfs_do_mount,
117 .kill_sb = kill_litter_super, 117 .kill_sb = kill_litter_super,
118}; 118};
119 119
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 1e7a33028d33..32fd5fe9ca0e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -533,17 +533,16 @@ static const struct super_operations cramfs_ops = {
533 .statfs = cramfs_statfs, 533 .statfs = cramfs_statfs,
534}; 534};
535 535
536static int cramfs_get_sb(struct file_system_type *fs_type, 536static struct dentry *cramfs_mount(struct file_system_type *fs_type,
537 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 537 int flags, const char *dev_name, void *data)
538{ 538{
539 return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super, 539 return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
540 mnt);
541} 540}
542 541
543static struct file_system_type cramfs_fs_type = { 542static struct file_system_type cramfs_fs_type = {
544 .owner = THIS_MODULE, 543 .owner = THIS_MODULE,
545 .name = "cramfs", 544 .name = "cramfs",
546 .get_sb = cramfs_get_sb, 545 .mount = cramfs_mount,
547 .kill_sb = kill_block_super, 546 .kill_sb = kill_block_super,
548 .fs_flags = FS_REQUIRES_DEV, 547 .fs_flags = FS_REQUIRES_DEV,
549}; 548};
diff --git a/fs/dcache.c b/fs/dcache.c
index 83293be48149..23702a9d4e6d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,33 +67,43 @@ struct dentry_stat_t dentry_stat = {
67 .age_limit = 45, 67 .age_limit = 45,
68}; 68};
69 69
70static void __d_free(struct dentry *dentry) 70static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
71static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
72
73#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
74int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
75 size_t *lenp, loff_t *ppos)
76{
77 dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
78 dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
79 return proc_dointvec(table, write, buffer, lenp, ppos);
80}
81#endif
82
83static void __d_free(struct rcu_head *head)
71{ 84{
85 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
86
72 WARN_ON(!list_empty(&dentry->d_alias)); 87 WARN_ON(!list_empty(&dentry->d_alias));
73 if (dname_external(dentry)) 88 if (dname_external(dentry))
74 kfree(dentry->d_name.name); 89 kfree(dentry->d_name.name);
75 kmem_cache_free(dentry_cache, dentry); 90 kmem_cache_free(dentry_cache, dentry);
76} 91}
77 92
78static void d_callback(struct rcu_head *head)
79{
80 struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
81 __d_free(dentry);
82}
83
84/* 93/*
85 * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry 94 * no dcache_lock, please.
86 * inside dcache_lock.
87 */ 95 */
88static void d_free(struct dentry *dentry) 96static void d_free(struct dentry *dentry)
89{ 97{
98 percpu_counter_dec(&nr_dentry);
90 if (dentry->d_op && dentry->d_op->d_release) 99 if (dentry->d_op && dentry->d_op->d_release)
91 dentry->d_op->d_release(dentry); 100 dentry->d_op->d_release(dentry);
101
92 /* if dentry was never inserted into hash, immediate free is OK */ 102 /* if dentry was never inserted into hash, immediate free is OK */
93 if (hlist_unhashed(&dentry->d_hash)) 103 if (hlist_unhashed(&dentry->d_hash))
94 __d_free(dentry); 104 __d_free(&dentry->d_u.d_rcu);
95 else 105 else
96 call_rcu(&dentry->d_u.d_rcu, d_callback); 106 call_rcu(&dentry->d_u.d_rcu, __d_free);
97} 107}
98 108
99/* 109/*
@@ -123,37 +133,34 @@ static void dentry_iput(struct dentry * dentry)
123} 133}
124 134
125/* 135/*
126 * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. 136 * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
127 */ 137 */
128static void dentry_lru_add(struct dentry *dentry) 138static void dentry_lru_add(struct dentry *dentry)
129{ 139{
130 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 140 if (list_empty(&dentry->d_lru)) {
131 dentry->d_sb->s_nr_dentry_unused++; 141 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
132 dentry_stat.nr_unused++; 142 dentry->d_sb->s_nr_dentry_unused++;
133} 143 percpu_counter_inc(&nr_dentry_unused);
134 144 }
135static void dentry_lru_add_tail(struct dentry *dentry)
136{
137 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
138 dentry->d_sb->s_nr_dentry_unused++;
139 dentry_stat.nr_unused++;
140} 145}
141 146
142static void dentry_lru_del(struct dentry *dentry) 147static void dentry_lru_del(struct dentry *dentry)
143{ 148{
144 if (!list_empty(&dentry->d_lru)) { 149 if (!list_empty(&dentry->d_lru)) {
145 list_del(&dentry->d_lru); 150 list_del_init(&dentry->d_lru);
146 dentry->d_sb->s_nr_dentry_unused--; 151 dentry->d_sb->s_nr_dentry_unused--;
147 dentry_stat.nr_unused--; 152 percpu_counter_dec(&nr_dentry_unused);
148 } 153 }
149} 154}
150 155
151static void dentry_lru_del_init(struct dentry *dentry) 156static void dentry_lru_move_tail(struct dentry *dentry)
152{ 157{
153 if (likely(!list_empty(&dentry->d_lru))) { 158 if (list_empty(&dentry->d_lru)) {
154 list_del_init(&dentry->d_lru); 159 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
155 dentry->d_sb->s_nr_dentry_unused--; 160 dentry->d_sb->s_nr_dentry_unused++;
156 dentry_stat.nr_unused--; 161 percpu_counter_inc(&nr_dentry_unused);
162 } else {
163 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
157 } 164 }
158} 165}
159 166
@@ -172,7 +179,6 @@ static struct dentry *d_kill(struct dentry *dentry)
172 struct dentry *parent; 179 struct dentry *parent;
173 180
174 list_del(&dentry->d_u.d_child); 181 list_del(&dentry->d_u.d_child);
175 dentry_stat.nr_dentry--; /* For d_free, below */
176 /*drops the locks, at that point nobody can reach this dentry */ 182 /*drops the locks, at that point nobody can reach this dentry */
177 dentry_iput(dentry); 183 dentry_iput(dentry);
178 if (IS_ROOT(dentry)) 184 if (IS_ROOT(dentry))
@@ -237,13 +243,15 @@ repeat:
237 if (dentry->d_op->d_delete(dentry)) 243 if (dentry->d_op->d_delete(dentry))
238 goto unhash_it; 244 goto unhash_it;
239 } 245 }
246
240 /* Unreachable? Get rid of it */ 247 /* Unreachable? Get rid of it */
241 if (d_unhashed(dentry)) 248 if (d_unhashed(dentry))
242 goto kill_it; 249 goto kill_it;
243 if (list_empty(&dentry->d_lru)) { 250
244 dentry->d_flags |= DCACHE_REFERENCED; 251 /* Otherwise leave it cached and ensure it's on the LRU */
245 dentry_lru_add(dentry); 252 dentry->d_flags |= DCACHE_REFERENCED;
246 } 253 dentry_lru_add(dentry);
254
247 spin_unlock(&dentry->d_lock); 255 spin_unlock(&dentry->d_lock);
248 spin_unlock(&dcache_lock); 256 spin_unlock(&dcache_lock);
249 return; 257 return;
@@ -318,11 +326,10 @@ int d_invalidate(struct dentry * dentry)
318EXPORT_SYMBOL(d_invalidate); 326EXPORT_SYMBOL(d_invalidate);
319 327
320/* This should be called _only_ with dcache_lock held */ 328/* This should be called _only_ with dcache_lock held */
321
322static inline struct dentry * __dget_locked(struct dentry *dentry) 329static inline struct dentry * __dget_locked(struct dentry *dentry)
323{ 330{
324 atomic_inc(&dentry->d_count); 331 atomic_inc(&dentry->d_count);
325 dentry_lru_del_init(dentry); 332 dentry_lru_del(dentry);
326 return dentry; 333 return dentry;
327} 334}
328 335
@@ -441,73 +448,27 @@ static void prune_one_dentry(struct dentry * dentry)
441 448
442 if (dentry->d_op && dentry->d_op->d_delete) 449 if (dentry->d_op && dentry->d_op->d_delete)
443 dentry->d_op->d_delete(dentry); 450 dentry->d_op->d_delete(dentry);
444 dentry_lru_del_init(dentry); 451 dentry_lru_del(dentry);
445 __d_drop(dentry); 452 __d_drop(dentry);
446 dentry = d_kill(dentry); 453 dentry = d_kill(dentry);
447 spin_lock(&dcache_lock); 454 spin_lock(&dcache_lock);
448 } 455 }
449} 456}
450 457
451/* 458static void shrink_dentry_list(struct list_head *list)
452 * Shrink the dentry LRU on a given superblock.
453 * @sb : superblock to shrink dentry LRU.
454 * @count: If count is NULL, we prune all dentries on superblock.
455 * @flags: If flags is non-zero, we need to do special processing based on
456 * which flags are set. This means we don't need to maintain multiple
457 * similar copies of this loop.
458 */
459static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
460{ 459{
461 LIST_HEAD(referenced);
462 LIST_HEAD(tmp);
463 struct dentry *dentry; 460 struct dentry *dentry;
464 int cnt = 0;
465 461
466 BUG_ON(!sb); 462 while (!list_empty(list)) {
467 BUG_ON((flags & DCACHE_REFERENCED) && count == NULL); 463 dentry = list_entry(list->prev, struct dentry, d_lru);
468 spin_lock(&dcache_lock); 464 dentry_lru_del(dentry);
469 if (count != NULL)
470 /* called from prune_dcache() and shrink_dcache_parent() */
471 cnt = *count;
472restart:
473 if (count == NULL)
474 list_splice_init(&sb->s_dentry_lru, &tmp);
475 else {
476 while (!list_empty(&sb->s_dentry_lru)) {
477 dentry = list_entry(sb->s_dentry_lru.prev,
478 struct dentry, d_lru);
479 BUG_ON(dentry->d_sb != sb);
480 465
481 spin_lock(&dentry->d_lock);
482 /*
483 * If we are honouring the DCACHE_REFERENCED flag and
484 * the dentry has this flag set, don't free it. Clear
485 * the flag and put it back on the LRU.
486 */
487 if ((flags & DCACHE_REFERENCED)
488 && (dentry->d_flags & DCACHE_REFERENCED)) {
489 dentry->d_flags &= ~DCACHE_REFERENCED;
490 list_move(&dentry->d_lru, &referenced);
491 spin_unlock(&dentry->d_lock);
492 } else {
493 list_move_tail(&dentry->d_lru, &tmp);
494 spin_unlock(&dentry->d_lock);
495 cnt--;
496 if (!cnt)
497 break;
498 }
499 cond_resched_lock(&dcache_lock);
500 }
501 }
502 while (!list_empty(&tmp)) {
503 dentry = list_entry(tmp.prev, struct dentry, d_lru);
504 dentry_lru_del_init(dentry);
505 spin_lock(&dentry->d_lock);
506 /* 466 /*
507 * We found an inuse dentry which was not removed from 467 * We found an inuse dentry which was not removed from
508 * the LRU because of laziness during lookup. Do not free 468 * the LRU because of laziness during lookup. Do not free
509 * it - just keep it off the LRU list. 469 * it - just keep it off the LRU list.
510 */ 470 */
471 spin_lock(&dentry->d_lock);
511 if (atomic_read(&dentry->d_count)) { 472 if (atomic_read(&dentry->d_count)) {
512 spin_unlock(&dentry->d_lock); 473 spin_unlock(&dentry->d_lock);
513 continue; 474 continue;
@@ -516,13 +477,60 @@ restart:
516 /* dentry->d_lock was dropped in prune_one_dentry() */ 477 /* dentry->d_lock was dropped in prune_one_dentry() */
517 cond_resched_lock(&dcache_lock); 478 cond_resched_lock(&dcache_lock);
518 } 479 }
519 if (count == NULL && !list_empty(&sb->s_dentry_lru)) 480}
520 goto restart; 481
521 if (count != NULL) 482/**
522 *count = cnt; 483 * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
484 * @sb: superblock to shrink dentry LRU.
485 * @count: number of entries to prune
486 * @flags: flags to control the dentry processing
487 *
488 * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
489 */
490static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
491{
492 /* called from prune_dcache() and shrink_dcache_parent() */
493 struct dentry *dentry;
494 LIST_HEAD(referenced);
495 LIST_HEAD(tmp);
496 int cnt = *count;
497
498 spin_lock(&dcache_lock);
499 while (!list_empty(&sb->s_dentry_lru)) {
500 dentry = list_entry(sb->s_dentry_lru.prev,
501 struct dentry, d_lru);
502 BUG_ON(dentry->d_sb != sb);
503
504 /*
505 * If we are honouring the DCACHE_REFERENCED flag and the
506 * dentry has this flag set, don't free it. Clear the flag
507 * and put it back on the LRU.
508 */
509 if (flags & DCACHE_REFERENCED) {
510 spin_lock(&dentry->d_lock);
511 if (dentry->d_flags & DCACHE_REFERENCED) {
512 dentry->d_flags &= ~DCACHE_REFERENCED;
513 list_move(&dentry->d_lru, &referenced);
514 spin_unlock(&dentry->d_lock);
515 cond_resched_lock(&dcache_lock);
516 continue;
517 }
518 spin_unlock(&dentry->d_lock);
519 }
520
521 list_move_tail(&dentry->d_lru, &tmp);
522 if (!--cnt)
523 break;
524 cond_resched_lock(&dcache_lock);
525 }
526
527 *count = cnt;
528 shrink_dentry_list(&tmp);
529
523 if (!list_empty(&referenced)) 530 if (!list_empty(&referenced))
524 list_splice(&referenced, &sb->s_dentry_lru); 531 list_splice(&referenced, &sb->s_dentry_lru);
525 spin_unlock(&dcache_lock); 532 spin_unlock(&dcache_lock);
533
526} 534}
527 535
528/** 536/**
@@ -538,7 +546,7 @@ static void prune_dcache(int count)
538{ 546{
539 struct super_block *sb, *p = NULL; 547 struct super_block *sb, *p = NULL;
540 int w_count; 548 int w_count;
541 int unused = dentry_stat.nr_unused; 549 int unused = percpu_counter_sum_positive(&nr_dentry_unused);
542 int prune_ratio; 550 int prune_ratio;
543 int pruned; 551 int pruned;
544 552
@@ -608,13 +616,19 @@ static void prune_dcache(int count)
608 * shrink_dcache_sb - shrink dcache for a superblock 616 * shrink_dcache_sb - shrink dcache for a superblock
609 * @sb: superblock 617 * @sb: superblock
610 * 618 *
611 * Shrink the dcache for the specified super block. This 619 * Shrink the dcache for the specified super block. This is used to free
612 * is used to free the dcache before unmounting a file 620 * the dcache before unmounting a file system.
613 * system
614 */ 621 */
615void shrink_dcache_sb(struct super_block * sb) 622void shrink_dcache_sb(struct super_block *sb)
616{ 623{
617 __shrink_dcache_sb(sb, NULL, 0); 624 LIST_HEAD(tmp);
625
626 spin_lock(&dcache_lock);
627 while (!list_empty(&sb->s_dentry_lru)) {
628 list_splice_init(&sb->s_dentry_lru, &tmp);
629 shrink_dentry_list(&tmp);
630 }
631 spin_unlock(&dcache_lock);
618} 632}
619EXPORT_SYMBOL(shrink_dcache_sb); 633EXPORT_SYMBOL(shrink_dcache_sb);
620 634
@@ -632,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
632 646
633 /* detach this root from the system */ 647 /* detach this root from the system */
634 spin_lock(&dcache_lock); 648 spin_lock(&dcache_lock);
635 dentry_lru_del_init(dentry); 649 dentry_lru_del(dentry);
636 __d_drop(dentry); 650 __d_drop(dentry);
637 spin_unlock(&dcache_lock); 651 spin_unlock(&dcache_lock);
638 652
@@ -646,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
646 spin_lock(&dcache_lock); 660 spin_lock(&dcache_lock);
647 list_for_each_entry(loop, &dentry->d_subdirs, 661 list_for_each_entry(loop, &dentry->d_subdirs,
648 d_u.d_child) { 662 d_u.d_child) {
649 dentry_lru_del_init(loop); 663 dentry_lru_del(loop);
650 __d_drop(loop); 664 __d_drop(loop);
651 cond_resched_lock(&dcache_lock); 665 cond_resched_lock(&dcache_lock);
652 } 666 }
@@ -703,20 +717,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
703 * otherwise we ascend to the parent and move to the 717 * otherwise we ascend to the parent and move to the
704 * next sibling if there is one */ 718 * next sibling if there is one */
705 if (!parent) 719 if (!parent)
706 goto out; 720 return;
707
708 dentry = parent; 721 dentry = parent;
709
710 } while (list_empty(&dentry->d_subdirs)); 722 } while (list_empty(&dentry->d_subdirs));
711 723
712 dentry = list_entry(dentry->d_subdirs.next, 724 dentry = list_entry(dentry->d_subdirs.next,
713 struct dentry, d_u.d_child); 725 struct dentry, d_u.d_child);
714 } 726 }
715out:
716 /* several dentries were freed, need to correct nr_dentry */
717 spin_lock(&dcache_lock);
718 dentry_stat.nr_dentry -= detached;
719 spin_unlock(&dcache_lock);
720} 727}
721 728
722/* 729/*
@@ -830,14 +837,15 @@ resume:
830 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 837 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
831 next = tmp->next; 838 next = tmp->next;
832 839
833 dentry_lru_del_init(dentry);
834 /* 840 /*
835 * move only zero ref count dentries to the end 841 * move only zero ref count dentries to the end
836 * of the unused list for prune_dcache 842 * of the unused list for prune_dcache
837 */ 843 */
838 if (!atomic_read(&dentry->d_count)) { 844 if (!atomic_read(&dentry->d_count)) {
839 dentry_lru_add_tail(dentry); 845 dentry_lru_move_tail(dentry);
840 found++; 846 found++;
847 } else {
848 dentry_lru_del(dentry);
841 } 849 }
842 850
843 /* 851 /*
@@ -900,12 +908,16 @@ EXPORT_SYMBOL(shrink_dcache_parent);
900 */ 908 */
901static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 909static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
902{ 910{
911 int nr_unused;
912
903 if (nr) { 913 if (nr) {
904 if (!(gfp_mask & __GFP_FS)) 914 if (!(gfp_mask & __GFP_FS))
905 return -1; 915 return -1;
906 prune_dcache(nr); 916 prune_dcache(nr);
907 } 917 }
908 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 918
919 nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
920 return (nr_unused / 100) * sysctl_vfs_cache_pressure;
909} 921}
910 922
911static struct shrinker dcache_shrinker = { 923static struct shrinker dcache_shrinker = {
@@ -972,9 +984,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
972 spin_lock(&dcache_lock); 984 spin_lock(&dcache_lock);
973 if (parent) 985 if (parent)
974 list_add(&dentry->d_u.d_child, &parent->d_subdirs); 986 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
975 dentry_stat.nr_dentry++;
976 spin_unlock(&dcache_lock); 987 spin_unlock(&dcache_lock);
977 988
989 percpu_counter_inc(&nr_dentry);
990
978 return dentry; 991 return dentry;
979} 992}
980EXPORT_SYMBOL(d_alloc); 993EXPORT_SYMBOL(d_alloc);
@@ -1478,33 +1491,26 @@ out:
1478 * This is used by ncpfs in its readdir implementation. 1491 * This is used by ncpfs in its readdir implementation.
1479 * Zero is returned in the dentry is invalid. 1492 * Zero is returned in the dentry is invalid.
1480 */ 1493 */
1481 1494int d_validate(struct dentry *dentry, struct dentry *parent)
1482int d_validate(struct dentry *dentry, struct dentry *dparent)
1483{ 1495{
1484 struct hlist_head *base; 1496 struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
1485 struct hlist_node *lhp; 1497 struct hlist_node *node;
1498 struct dentry *d;
1486 1499
1487 /* Check whether the ptr might be valid at all.. */ 1500 /* Check whether the ptr might be valid at all.. */
1488 if (!kmem_ptr_validate(dentry_cache, dentry)) 1501 if (!kmem_ptr_validate(dentry_cache, dentry))
1489 goto out; 1502 return 0;
1490 1503 if (dentry->d_parent != parent)
1491 if (dentry->d_parent != dparent) 1504 return 0;
1492 goto out;
1493 1505
1494 spin_lock(&dcache_lock); 1506 rcu_read_lock();
1495 base = d_hash(dparent, dentry->d_name.hash); 1507 hlist_for_each_entry_rcu(d, node, head, d_hash) {
1496 hlist_for_each(lhp,base) { 1508 if (d == dentry) {
1497 /* hlist_for_each_entry_rcu() not required for d_hash list 1509 dget(dentry);
1498 * as it is parsed under dcache_lock
1499 */
1500 if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
1501 __dget_locked(dentry);
1502 spin_unlock(&dcache_lock);
1503 return 1; 1510 return 1;
1504 } 1511 }
1505 } 1512 }
1506 spin_unlock(&dcache_lock); 1513 rcu_read_unlock();
1507out:
1508 return 0; 1514 return 0;
1509} 1515}
1510EXPORT_SYMBOL(d_validate); 1516EXPORT_SYMBOL(d_validate);
@@ -1994,7 +2000,7 @@ global_root:
1994 * Returns a pointer into the buffer or an error code if the 2000 * Returns a pointer into the buffer or an error code if the
1995 * path was too long. 2001 * path was too long.
1996 * 2002 *
1997 * "buflen" should be positive. Caller holds the dcache_lock. 2003 * "buflen" should be positive.
1998 * 2004 *
1999 * If path is not reachable from the supplied root, then the value of 2005 * If path is not reachable from the supplied root, then the value of
2000 * root is changed (without modifying refcounts). 2006 * root is changed (without modifying refcounts).
@@ -2006,10 +2012,12 @@ char *__d_path(const struct path *path, struct path *root,
2006 int error; 2012 int error;
2007 2013
2008 prepend(&res, &buflen, "\0", 1); 2014 prepend(&res, &buflen, "\0", 1);
2015 spin_lock(&dcache_lock);
2009 error = prepend_path(path, root, &res, &buflen); 2016 error = prepend_path(path, root, &res, &buflen);
2017 spin_unlock(&dcache_lock);
2018
2010 if (error) 2019 if (error)
2011 return ERR_PTR(error); 2020 return ERR_PTR(error);
2012
2013 return res; 2021 return res;
2014} 2022}
2015 2023
@@ -2419,6 +2427,9 @@ static void __init dcache_init(void)
2419{ 2427{
2420 int loop; 2428 int loop;
2421 2429
2430 percpu_counter_init(&nr_dentry, 0);
2431 percpu_counter_init(&nr_dentry_unused, 0);
2432
2422 /* 2433 /*
2423 * A constructor could be added for stable state like the lists, 2434 * A constructor could be added for stable state like the lists,
2424 * but it is probably not worth it because of the cache nature 2435 * but it is probably not worth it because of the cache nature
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 0210898458b2..89d394d8fe24 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -43,6 +43,7 @@ const struct file_operations debugfs_file_operations = {
43 .read = default_read_file, 43 .read = default_read_file,
44 .write = default_write_file, 44 .write = default_write_file,
45 .open = default_open, 45 .open = default_open,
46 .llseek = noop_llseek,
46}; 47};
47 48
48static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) 49static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -454,6 +455,7 @@ static const struct file_operations fops_bool = {
454 .read = read_file_bool, 455 .read = read_file_bool,
455 .write = write_file_bool, 456 .write = write_file_bool,
456 .open = default_open, 457 .open = default_open,
458 .llseek = default_llseek,
457}; 459};
458 460
459/** 461/**
@@ -498,6 +500,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
498static const struct file_operations fops_blob = { 500static const struct file_operations fops_blob = {
499 .read = read_file_blob, 501 .read = read_file_blob,
500 .open = default_open, 502 .open = default_open,
503 .llseek = default_llseek,
501}; 504};
502 505
503/** 506/**
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbcac..37a8ca7c1222 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
40 struct inode *inode = new_inode(sb); 40 struct inode *inode = new_inode(sb);
41 41
42 if (inode) { 42 if (inode) {
43 inode->i_ino = get_next_ino();
43 inode->i_mode = mode; 44 inode->i_mode = mode;
44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
45 switch (mode & S_IFMT) { 46 switch (mode & S_IFMT) {
@@ -134,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
134 return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); 135 return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
135} 136}
136 137
137static int debug_get_sb(struct file_system_type *fs_type, 138static struct dentry *debug_mount(struct file_system_type *fs_type,
138 int flags, const char *dev_name, 139 int flags, const char *dev_name,
139 void *data, struct vfsmount *mnt) 140 void *data)
140{ 141{
141 return get_sb_single(fs_type, flags, data, debug_fill_super, mnt); 142 return mount_single(fs_type, flags, data, debug_fill_super);
142} 143}
143 144
144static struct file_system_type debug_fs_type = { 145static struct file_system_type debug_fs_type = {
145 .owner = THIS_MODULE, 146 .owner = THIS_MODULE,
146 .name = "debugfs", 147 .name = "debugfs",
147 .get_sb = debug_get_sb, 148 .mount = debug_mount,
148 .kill_sb = kill_litter_super, 149 .kill_sb = kill_litter_super,
149}; 150};
150 151
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8b3ffd5b5235..1bb547c9cad6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
331} 331}
332 332
333/* 333/*
334 * devpts_get_sb() 334 * devpts_mount()
335 * 335 *
336 * If the '-o newinstance' mount option was specified, mount a new 336 * If the '-o newinstance' mount option was specified, mount a new
337 * (private) instance of devpts. PTYs created in this instance are 337 * (private) instance of devpts. PTYs created in this instance are
@@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
345 * semantics in devpts while preserving backward compatibility of the 345 * semantics in devpts while preserving backward compatibility of the
346 * current 'single-namespace' semantics. i.e all mounts of devpts 346 * current 'single-namespace' semantics. i.e all mounts of devpts
347 * without the 'newinstance' mount option should bind to the initial 347 * without the 'newinstance' mount option should bind to the initial
348 * kernel mount, like get_sb_single(). 348 * kernel mount, like mount_single().
349 * 349 *
350 * Mounts with 'newinstance' option create a new, private namespace. 350 * Mounts with 'newinstance' option create a new, private namespace.
351 * 351 *
352 * NOTE: 352 * NOTE:
353 * 353 *
354 * For single-mount semantics, devpts cannot use get_sb_single(), 354 * For single-mount semantics, devpts cannot use mount_single(),
355 * because get_sb_single()/sget() find and use the super-block from 355 * because mount_single()/sget() find and use the super-block from
356 * the most recent mount of devpts. But that recent mount may be a 356 * the most recent mount of devpts. But that recent mount may be a
357 * 'newinstance' mount and get_sb_single() would pick the newinstance 357 * 'newinstance' mount and mount_single() would pick the newinstance
358 * super-block instead of the initial super-block. 358 * super-block instead of the initial super-block.
359 */ 359 */
360static int devpts_get_sb(struct file_system_type *fs_type, 360static struct dentry *devpts_mount(struct file_system_type *fs_type,
361 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 361 int flags, const char *dev_name, void *data)
362{ 362{
363 int error; 363 int error;
364 struct pts_mount_opts opts; 364 struct pts_mount_opts opts;
@@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
366 366
367 error = parse_mount_options(data, PARSE_MOUNT, &opts); 367 error = parse_mount_options(data, PARSE_MOUNT, &opts);
368 if (error) 368 if (error)
369 return error; 369 return ERR_PTR(error);
370 370
371 if (opts.newinstance) 371 if (opts.newinstance)
372 s = sget(fs_type, NULL, set_anon_super, NULL); 372 s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
374 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); 374 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
375 375
376 if (IS_ERR(s)) 376 if (IS_ERR(s))
377 return PTR_ERR(s); 377 return ERR_CAST(s);
378 378
379 if (!s->s_root) { 379 if (!s->s_root) {
380 s->s_flags = flags; 380 s->s_flags = flags;
@@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type,
390 if (error) 390 if (error)
391 goto out_undo_sget; 391 goto out_undo_sget;
392 392
393 simple_set_mnt(mnt, s); 393 return dget(s->s_root);
394
395 return 0;
396 394
397out_undo_sget: 395out_undo_sget:
398 deactivate_locked_super(s); 396 deactivate_locked_super(s);
399 return error; 397 return ERR_PTR(error);
400} 398}
401 399
402#else 400#else
@@ -404,10 +402,10 @@ out_undo_sget:
404 * This supports only the legacy single-instance semantics (no 402 * This supports only the legacy single-instance semantics (no
405 * multiple-instance semantics) 403 * multiple-instance semantics)
406 */ 404 */
407static int devpts_get_sb(struct file_system_type *fs_type, int flags, 405static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
408 const char *dev_name, void *data, struct vfsmount *mnt) 406 const char *dev_name, void *data)
409{ 407{
410 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); 408 return mount_single(fs_type, flags, data, devpts_fill_super);
411} 409}
412#endif 410#endif
413 411
@@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb)
421 419
422static struct file_system_type devpts_fs_type = { 420static struct file_system_type devpts_fs_type = {
423 .name = "devpts", 421 .name = "devpts",
424 .get_sb = devpts_get_sb, 422 .mount = devpts_mount,
425 .kill_sb = devpts_kill_sb, 423 .kill_sb = devpts_kill_sb,
426}; 424};
427 425
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 48d74c7391d1..85882f6ba5f7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
218 * filesystems can use it to hold additional state between get_block calls and 218 * filesystems can use it to hold additional state between get_block calls and
219 * dio_complete. 219 * dio_complete.
220 */ 220 */
221static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) 221static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
222{ 222{
223 ssize_t transferred = 0; 223 ssize_t transferred = 0;
224 224
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index c6cf25158746..6b42ba807dfd 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -643,7 +643,8 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
643static const struct file_operations waiters_fops = { 643static const struct file_operations waiters_fops = {
644 .owner = THIS_MODULE, 644 .owner = THIS_MODULE,
645 .open = waiters_open, 645 .open = waiters_open,
646 .read = waiters_read 646 .read = waiters_read,
647 .llseek = default_llseek,
647}; 648};
648 649
649void dlm_delete_debug_file(struct dlm_ls *ls) 650void dlm_delete_debug_file(struct dlm_ls *ls)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 031dbe3a15ca..64e5f3efdd81 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1846,6 +1846,9 @@ static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1846 struct dlm_lkb *gr; 1846 struct dlm_lkb *gr;
1847 1847
1848 list_for_each_entry(gr, head, lkb_statequeue) { 1848 list_for_each_entry(gr, head, lkb_statequeue) {
1849 /* skip self when sending basts to convertqueue */
1850 if (gr == lkb)
1851 continue;
1849 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) { 1852 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
1850 queue_bast(r, gr, lkb->lkb_rqmode); 1853 queue_bast(r, gr, lkb->lkb_rqmode);
1851 gr->lkb_highbast = lkb->lkb_rqmode; 1854 gr->lkb_highbast = lkb->lkb_rqmode;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d45c02db6943..30d8b85febbf 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -412,7 +412,8 @@ static const struct file_operations dev_fops = {
412 .read = dev_read, 412 .read = dev_read,
413 .write = dev_write, 413 .write = dev_write,
414 .poll = dev_poll, 414 .poll = dev_poll,
415 .owner = THIS_MODULE 415 .owner = THIS_MODULE,
416 .llseek = noop_llseek,
416}; 417};
417 418
418static struct miscdevice plock_dev_misc = { 419static struct miscdevice plock_dev_misc = {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b6272853130c..66d6c16bf440 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1009,6 +1009,7 @@ static const struct file_operations device_fops = {
1009 .write = device_write, 1009 .write = device_write,
1010 .poll = device_poll, 1010 .poll = device_poll,
1011 .owner = THIS_MODULE, 1011 .owner = THIS_MODULE,
1012 .llseek = noop_llseek,
1012}; 1013};
1013 1014
1014static const struct file_operations ctl_device_fops = { 1015static const struct file_operations ctl_device_fops = {
@@ -1017,6 +1018,7 @@ static const struct file_operations ctl_device_fops = {
1017 .read = device_read, 1018 .read = device_read,
1018 .write = device_write, 1019 .write = device_write,
1019 .owner = THIS_MODULE, 1020 .owner = THIS_MODULE,
1021 .llseek = noop_llseek,
1020}; 1022};
1021 1023
1022static struct miscdevice ctl_device = { 1024static struct miscdevice ctl_device = {
@@ -1029,6 +1031,7 @@ static const struct file_operations monitor_device_fops = {
1029 .open = monitor_device_open, 1031 .open = monitor_device_open,
1030 .release = monitor_device_close, 1032 .release = monitor_device_close,
1031 .owner = THIS_MODULE, 1033 .owner = THIS_MODULE,
1034 .llseek = noop_llseek,
1032}; 1035};
1033 1036
1034static struct miscdevice monitor_device = { 1037static struct miscdevice monitor_device = {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 0032a9f5a3a9..413a3c48f0bb 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -377,6 +377,7 @@ struct ecryptfs_mount_crypt_stat {
377#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 377#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010
378#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 378#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020
379#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 379#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040
380#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080
380 u32 flags; 381 u32 flags;
381 struct list_head global_auth_tok_list; 382 struct list_head global_auth_tok_list;
382 struct mutex global_auth_tok_list_mutex; 383 struct mutex global_auth_tok_list_mutex;
@@ -477,7 +478,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
477static inline struct ecryptfs_file_info * 478static inline struct ecryptfs_file_info *
478ecryptfs_file_to_private(struct file *file) 479ecryptfs_file_to_private(struct file *file)
479{ 480{
480 return (struct ecryptfs_file_info *)file->private_data; 481 return file->private_data;
481} 482}
482 483
483static inline void 484static inline void
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 622c95140802..91da02987bff 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/compat.h> 32#include <linux/compat.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/smp_lock.h>
35#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
36 35
37/** 36/**
@@ -284,11 +283,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
284 int rc = 0; 283 int rc = 0;
285 struct file *lower_file = NULL; 284 struct file *lower_file = NULL;
286 285
287 lock_kernel();
288 lower_file = ecryptfs_file_to_lower(file); 286 lower_file = ecryptfs_file_to_lower(file);
289 if (lower_file->f_op && lower_file->f_op->fasync) 287 if (lower_file->f_op && lower_file->f_op->fasync)
290 rc = lower_file->f_op->fasync(fd, lower_file, flag); 288 rc = lower_file->f_op->fasync(fd, lower_file, flag);
291 unlock_kernel();
292 return rc; 289 return rc;
293} 290}
294 291
@@ -332,6 +329,7 @@ const struct file_operations ecryptfs_dir_fops = {
332 .fsync = ecryptfs_fsync, 329 .fsync = ecryptfs_fsync,
333 .fasync = ecryptfs_fasync, 330 .fasync = ecryptfs_fasync,
334 .splice_read = generic_file_splice_read, 331 .splice_read = generic_file_splice_read,
332 .llseek = default_llseek,
335}; 333};
336 334
337const struct file_operations ecryptfs_main_fops = { 335const struct file_operations ecryptfs_main_fops = {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3fbc94203380..9d1a22d62765 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -32,6 +32,7 @@
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/xattr.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
@@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
70 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 71 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
71 struct dentry *dentry_save; 72 struct dentry *dentry_save;
72 struct vfsmount *vfsmount_save; 73 struct vfsmount *vfsmount_save;
74 unsigned int flags_save;
73 int rc; 75 int rc;
74 76
75 dentry_save = nd->path.dentry; 77 dentry_save = nd->path.dentry;
76 vfsmount_save = nd->path.mnt; 78 vfsmount_save = nd->path.mnt;
79 flags_save = nd->flags;
77 nd->path.dentry = lower_dentry; 80 nd->path.dentry = lower_dentry;
78 nd->path.mnt = lower_mnt; 81 nd->path.mnt = lower_mnt;
82 nd->flags &= ~LOOKUP_OPEN;
79 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); 83 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
80 nd->path.dentry = dentry_save; 84 nd->path.dentry = dentry_save;
81 nd->path.mnt = vfsmount_save; 85 nd->path.mnt = vfsmount_save;
86 nd->flags = flags_save;
82 return rc; 87 return rc;
83} 88}
84 89
@@ -1108,10 +1113,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1108 rc = -EOPNOTSUPP; 1113 rc = -EOPNOTSUPP;
1109 goto out; 1114 goto out;
1110 } 1115 }
1111 mutex_lock(&lower_dentry->d_inode->i_mutex); 1116
1112 rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value, 1117 rc = vfs_setxattr(lower_dentry, name, value, size, flags);
1113 size, flags);
1114 mutex_unlock(&lower_dentry->d_inode->i_mutex);
1115out: 1118out:
1116 return rc; 1119 return rc;
1117} 1120}
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 73811cfa2ea4..b1f6858a5223 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -446,6 +446,7 @@ out:
446 */ 446 */
447static int 447static int
448ecryptfs_find_auth_tok_for_sig( 448ecryptfs_find_auth_tok_for_sig(
449 struct key **auth_tok_key,
449 struct ecryptfs_auth_tok **auth_tok, 450 struct ecryptfs_auth_tok **auth_tok,
450 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 451 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
451 char *sig) 452 char *sig)
@@ -453,12 +454,21 @@ ecryptfs_find_auth_tok_for_sig(
453 struct ecryptfs_global_auth_tok *global_auth_tok; 454 struct ecryptfs_global_auth_tok *global_auth_tok;
454 int rc = 0; 455 int rc = 0;
455 456
457 (*auth_tok_key) = NULL;
456 (*auth_tok) = NULL; 458 (*auth_tok) = NULL;
457 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, 459 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
458 mount_crypt_stat, sig)) { 460 mount_crypt_stat, sig)) {
459 struct key *auth_tok_key;
460 461
461 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, 462 /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
463 * mount_crypt_stat structure, we prevent to use auth toks that
464 * are not inserted through the ecryptfs_add_global_auth_tok
465 * function.
466 */
467 if (mount_crypt_stat->flags
468 & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
469 return -EINVAL;
470
471 rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
462 sig); 472 sig);
463 } else 473 } else
464 (*auth_tok) = global_auth_tok->global_auth_tok; 474 (*auth_tok) = global_auth_tok->global_auth_tok;
@@ -509,6 +519,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
509 char *filename, size_t filename_size) 519 char *filename, size_t filename_size)
510{ 520{
511 struct ecryptfs_write_tag_70_packet_silly_stack *s; 521 struct ecryptfs_write_tag_70_packet_silly_stack *s;
522 struct key *auth_tok_key = NULL;
512 int rc = 0; 523 int rc = 0;
513 524
514 s = kmalloc(sizeof(*s), GFP_KERNEL); 525 s = kmalloc(sizeof(*s), GFP_KERNEL);
@@ -606,6 +617,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
606 } 617 }
607 dest[s->i++] = s->cipher_code; 618 dest[s->i++] = s->cipher_code;
608 rc = ecryptfs_find_auth_tok_for_sig( 619 rc = ecryptfs_find_auth_tok_for_sig(
620 &auth_tok_key,
609 &s->auth_tok, mount_crypt_stat, 621 &s->auth_tok, mount_crypt_stat,
610 mount_crypt_stat->global_default_fnek_sig); 622 mount_crypt_stat->global_default_fnek_sig);
611 if (rc) { 623 if (rc) {
@@ -753,6 +765,8 @@ out_free_unlock:
753out_unlock: 765out_unlock:
754 mutex_unlock(s->tfm_mutex); 766 mutex_unlock(s->tfm_mutex);
755out: 767out:
768 if (auth_tok_key)
769 key_put(auth_tok_key);
756 kfree(s); 770 kfree(s);
757 return rc; 771 return rc;
758} 772}
@@ -798,6 +812,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
798 char *data, size_t max_packet_size) 812 char *data, size_t max_packet_size)
799{ 813{
800 struct ecryptfs_parse_tag_70_packet_silly_stack *s; 814 struct ecryptfs_parse_tag_70_packet_silly_stack *s;
815 struct key *auth_tok_key = NULL;
801 int rc = 0; 816 int rc = 0;
802 817
803 (*packet_size) = 0; 818 (*packet_size) = 0;
@@ -910,7 +925,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
910 * >= ECRYPTFS_MAX_IV_BYTES. */ 925 * >= ECRYPTFS_MAX_IV_BYTES. */
911 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); 926 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
912 s->desc.info = s->iv; 927 s->desc.info = s->iv;
913 rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, 928 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
929 &s->auth_tok, mount_crypt_stat,
914 s->fnek_sig_hex); 930 s->fnek_sig_hex);
915 if (rc) { 931 if (rc) {
916 printk(KERN_ERR "%s: Error attempting to find auth tok for " 932 printk(KERN_ERR "%s: Error attempting to find auth tok for "
@@ -986,6 +1002,8 @@ out:
986 (*filename_size) = 0; 1002 (*filename_size) = 0;
987 (*filename) = NULL; 1003 (*filename) = NULL;
988 } 1004 }
1005 if (auth_tok_key)
1006 key_put(auth_tok_key);
989 kfree(s); 1007 kfree(s);
990 return rc; 1008 return rc;
991} 1009}
@@ -1557,14 +1575,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1557 ECRYPTFS_VERSION_MAJOR, 1575 ECRYPTFS_VERSION_MAJOR,
1558 ECRYPTFS_VERSION_MINOR); 1576 ECRYPTFS_VERSION_MINOR);
1559 rc = -EINVAL; 1577 rc = -EINVAL;
1560 goto out; 1578 goto out_release_key;
1561 } 1579 }
1562 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD 1580 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
1563 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { 1581 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
1564 printk(KERN_ERR "Invalid auth_tok structure " 1582 printk(KERN_ERR "Invalid auth_tok structure "
1565 "returned from key query\n"); 1583 "returned from key query\n");
1566 rc = -EINVAL; 1584 rc = -EINVAL;
1567 goto out; 1585 goto out_release_key;
1586 }
1587out_release_key:
1588 if (rc) {
1589 key_put(*auth_tok_key);
1590 (*auth_tok_key) = NULL;
1568 } 1591 }
1569out: 1592out:
1570 return rc; 1593 return rc;
@@ -1688,6 +1711,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1688 struct ecryptfs_auth_tok_list_item *auth_tok_list_item; 1711 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
1689 size_t tag_11_contents_size; 1712 size_t tag_11_contents_size;
1690 size_t tag_11_packet_size; 1713 size_t tag_11_packet_size;
1714 struct key *auth_tok_key = NULL;
1691 int rc = 0; 1715 int rc = 0;
1692 1716
1693 INIT_LIST_HEAD(&auth_tok_list); 1717 INIT_LIST_HEAD(&auth_tok_list);
@@ -1784,6 +1808,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1784 * just one will be sufficient to decrypt to get the FEK. */ 1808 * just one will be sufficient to decrypt to get the FEK. */
1785find_next_matching_auth_tok: 1809find_next_matching_auth_tok:
1786 found_auth_tok = 0; 1810 found_auth_tok = 0;
1811 if (auth_tok_key) {
1812 key_put(auth_tok_key);
1813 auth_tok_key = NULL;
1814 }
1787 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { 1815 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
1788 candidate_auth_tok = &auth_tok_list_item->auth_tok; 1816 candidate_auth_tok = &auth_tok_list_item->auth_tok;
1789 if (unlikely(ecryptfs_verbosity > 0)) { 1817 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1800,10 +1828,11 @@ find_next_matching_auth_tok:
1800 rc = -EINVAL; 1828 rc = -EINVAL;
1801 goto out_wipe_list; 1829 goto out_wipe_list;
1802 } 1830 }
1803 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, 1831 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
1832 &matching_auth_tok,
1804 crypt_stat->mount_crypt_stat, 1833 crypt_stat->mount_crypt_stat,
1805 candidate_auth_tok_sig); 1834 candidate_auth_tok_sig);
1806 if (matching_auth_tok) { 1835 if (!rc) {
1807 found_auth_tok = 1; 1836 found_auth_tok = 1;
1808 goto found_matching_auth_tok; 1837 goto found_matching_auth_tok;
1809 } 1838 }
@@ -1866,6 +1895,8 @@ found_matching_auth_tok:
1866out_wipe_list: 1895out_wipe_list:
1867 wipe_auth_tok_list(&auth_tok_list); 1896 wipe_auth_tok_list(&auth_tok_list);
1868out: 1897out:
1898 if (auth_tok_key)
1899 key_put(auth_tok_key);
1869 return rc; 1900 return rc;
1870} 1901}
1871 1902
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb20..a9dbd62518e6 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -208,7 +208,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_unlink_sigs, ecryptfs_opt_err }; 211 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
212 ecryptfs_opt_err };
212 213
213static const match_table_t tokens = { 214static const match_table_t tokens = {
214 {ecryptfs_opt_sig, "sig=%s"}, 215 {ecryptfs_opt_sig, "sig=%s"},
@@ -223,6 +224,7 @@ static const match_table_t tokens = {
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, 224 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 225 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
225 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, 226 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
227 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
226 {ecryptfs_opt_err, NULL} 228 {ecryptfs_opt_err, NULL}
227}; 229};
228 230
@@ -406,6 +408,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
406 case ecryptfs_opt_unlink_sigs: 408 case ecryptfs_opt_unlink_sigs:
407 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; 409 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
408 break; 410 break;
411 case ecryptfs_opt_mount_auth_tok_only:
412 mount_crypt_stat->flags |=
413 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
414 break;
409 case ecryptfs_opt_err: 415 case ecryptfs_opt_err:
410 default: 416 default:
411 printk(KERN_WARNING 417 printk(KERN_WARNING
@@ -540,9 +546,8 @@ out:
540 * ecryptfs_interpose to perform most of the linking 546 * ecryptfs_interpose to perform most of the linking
541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) 547 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
542 */ 548 */
543static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, 549static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
544 const char *dev_name, void *raw_data, 550 const char *dev_name, void *raw_data)
545 struct vfsmount *mnt)
546{ 551{
547 struct super_block *s; 552 struct super_block *s;
548 struct ecryptfs_sb_info *sbi; 553 struct ecryptfs_sb_info *sbi;
@@ -607,8 +612,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
607 err = "Reading sb failed"; 612 err = "Reading sb failed";
608 goto out; 613 goto out;
609 } 614 }
610 simple_set_mnt(mnt, s); 615 return dget(s->s_root);
611 return 0;
612 616
613out: 617out:
614 if (sbi) { 618 if (sbi) {
@@ -616,7 +620,7 @@ out:
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi); 620 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 } 621 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc); 622 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
619 return rc; 623 return ERR_PTR(rc);
620} 624}
621 625
622/** 626/**
@@ -639,7 +643,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
639static struct file_system_type ecryptfs_fs_type = { 643static struct file_system_type ecryptfs_fs_type = {
640 .owner = THIS_MODULE, 644 .owner = THIS_MODULE,
641 .name = "ecryptfs", 645 .name = "ecryptfs",
642 .get_sb = ecryptfs_get_sb, 646 .mount = ecryptfs_mount,
643 .kill_sb = ecryptfs_kill_block_super, 647 .kill_sb = ecryptfs_kill_block_super,
644 .fs_flags = 0 648 .fs_flags = 0
645}; 649};
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 00208c3d7e92..940a82e63dc3 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -482,6 +482,7 @@ static const struct file_operations ecryptfs_miscdev_fops = {
482 .read = ecryptfs_miscdev_read, 482 .read = ecryptfs_miscdev_read,
483 .write = ecryptfs_miscdev_write, 483 .write = ecryptfs_miscdev_write,
484 .release = ecryptfs_miscdev_release, 484 .release = ecryptfs_miscdev_release,
485 .llseek = noop_llseek,
485}; 486};
486 487
487static struct miscdevice ecryptfs_miscdev = { 488static struct miscdevice ecryptfs_miscdev = {
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index f7fc286a3aa9..253732382d37 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -180,6 +180,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
180 seq_printf(m, ",ecryptfs_encrypted_view"); 180 seq_printf(m, ",ecryptfs_encrypted_view");
181 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) 181 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
182 seq_printf(m, ",ecryptfs_unlink_sigs"); 182 seq_printf(m, ",ecryptfs_unlink_sigs");
183 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
184 seq_printf(m, ",ecryptfs_mount_auth_tok_only");
183 185
184 return 0; 186 return 0;
185} 187}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f04942810818..5073a07652cc 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -20,16 +20,16 @@
20static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); 20static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
21static int efs_fill_super(struct super_block *s, void *d, int silent); 21static int efs_fill_super(struct super_block *s, void *d, int silent);
22 22
23static int efs_get_sb(struct file_system_type *fs_type, 23static struct dentry *efs_mount(struct file_system_type *fs_type,
24 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 24 int flags, const char *dev_name, void *data)
25{ 25{
26 return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt); 26 return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
27} 27}
28 28
29static struct file_system_type efs_fs_type = { 29static struct file_system_type efs_fs_type = {
30 .owner = THIS_MODULE, 30 .owner = THIS_MODULE,
31 .name = "efs", 31 .name = "efs",
32 .get_sb = efs_get_sb, 32 .mount = efs_mount,
33 .kill_sb = kill_block_super, 33 .kill_sb = kill_block_super,
34 .fs_flags = FS_REQUIRES_DEV, 34 .fs_flags = FS_REQUIRES_DEV,
35}; 35};
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6bd3f76fdf88..e0194b3e14d6 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -293,6 +293,7 @@ static const struct file_operations eventfd_fops = {
293 .poll = eventfd_poll, 293 .poll = eventfd_poll,
294 .read = eventfd_read, 294 .read = eventfd_read,
295 .write = eventfd_write, 295 .write = eventfd_write,
296 .llseek = noop_llseek,
296}; 297};
297 298
298/** 299/**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3817149919cb..8cf07242067d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -77,9 +77,6 @@
77/* Maximum number of nesting allowed inside epoll sets */ 77/* Maximum number of nesting allowed inside epoll sets */
78#define EP_MAX_NESTS 4 78#define EP_MAX_NESTS 4
79 79
80/* Maximum msec timeout value storeable in a long int */
81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
82
83#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 80#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
84 81
85#define EP_UNACTIVE_PTR ((void *) -1L) 82#define EP_UNACTIVE_PTR ((void *) -1L)
@@ -674,7 +671,8 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
674/* File callbacks that implement the eventpoll file behaviour */ 671/* File callbacks that implement the eventpoll file behaviour */
675static const struct file_operations eventpoll_fops = { 672static const struct file_operations eventpoll_fops = {
676 .release = ep_eventpoll_release, 673 .release = ep_eventpoll_release,
677 .poll = ep_eventpoll_poll 674 .poll = ep_eventpoll_poll,
675 .llseek = noop_llseek,
678}; 676};
679 677
680/* Fast test to see if the file is an evenpoll file */ 678/* Fast test to see if the file is an evenpoll file */
@@ -1116,18 +1114,22 @@ static int ep_send_events(struct eventpoll *ep,
1116static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1114static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1117 int maxevents, long timeout) 1115 int maxevents, long timeout)
1118{ 1116{
1119 int res, eavail; 1117 int res, eavail, timed_out = 0;
1120 unsigned long flags; 1118 unsigned long flags;
1121 long jtimeout; 1119 long slack;
1122 wait_queue_t wait; 1120 wait_queue_t wait;
1123 1121 struct timespec end_time;
1124 /* 1122 ktime_t expires, *to = NULL;
1125 * Calculate the timeout by checking for the "infinite" value (-1) 1123
1126 * and the overflow condition. The passed timeout is in milliseconds, 1124 if (timeout > 0) {
1127 * that why (t * HZ) / 1000. 1125 ktime_get_ts(&end_time);
1128 */ 1126 timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
1129 jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ? 1127 slack = select_estimate_accuracy(&end_time);
1130 MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000; 1128 to = &expires;
1129 *to = timespec_to_ktime(end_time);
1130 } else if (timeout == 0) {
1131 timed_out = 1;
1132 }
1131 1133
1132retry: 1134retry:
1133 spin_lock_irqsave(&ep->lock, flags); 1135 spin_lock_irqsave(&ep->lock, flags);
@@ -1149,7 +1151,7 @@ retry:
1149 * to TASK_INTERRUPTIBLE before doing the checks. 1151 * to TASK_INTERRUPTIBLE before doing the checks.
1150 */ 1152 */
1151 set_current_state(TASK_INTERRUPTIBLE); 1153 set_current_state(TASK_INTERRUPTIBLE);
1152 if (!list_empty(&ep->rdllist) || !jtimeout) 1154 if (!list_empty(&ep->rdllist) || timed_out)
1153 break; 1155 break;
1154 if (signal_pending(current)) { 1156 if (signal_pending(current)) {
1155 res = -EINTR; 1157 res = -EINTR;
@@ -1157,7 +1159,9 @@ retry:
1157 } 1159 }
1158 1160
1159 spin_unlock_irqrestore(&ep->lock, flags); 1161 spin_unlock_irqrestore(&ep->lock, flags);
1160 jtimeout = schedule_timeout(jtimeout); 1162 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
1163 timed_out = 1;
1164
1161 spin_lock_irqsave(&ep->lock, flags); 1165 spin_lock_irqsave(&ep->lock, flags);
1162 } 1166 }
1163 __remove_wait_queue(&ep->wq, &wait); 1167 __remove_wait_queue(&ep->wq, &wait);
@@ -1175,7 +1179,7 @@ retry:
1175 * more luck. 1179 * more luck.
1176 */ 1180 */
1177 if (!res && eavail && 1181 if (!res && eavail &&
1178 !(res = ep_send_events(ep, events, maxevents)) && jtimeout) 1182 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1179 goto retry; 1183 goto retry;
1180 1184
1181 return res; 1185 return res;
diff --git a/fs/exec.c b/fs/exec.c
index 6d2b6f936858..99d33a1371e9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -54,6 +54,7 @@
54#include <linux/fsnotify.h> 54#include <linux/fsnotify.h>
55#include <linux/fs_struct.h> 55#include <linux/fs_struct.h>
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
57 58
58#include <asm/uaccess.h> 59#include <asm/uaccess.h>
59#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
@@ -65,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core";
65unsigned int core_pipe_limit; 66unsigned int core_pipe_limit;
66int suid_dumpable = 0; 67int suid_dumpable = 0;
67 68
69struct core_name {
70 char *corename;
71 int used, size;
72};
73static atomic_t call_count = ATOMIC_INIT(1);
74
68/* The maximal length of core_pattern is also specified in sysctl.c */ 75/* The maximal length of core_pattern is also specified in sysctl.c */
69 76
70static LIST_HEAD(formats); 77static LIST_HEAD(formats);
@@ -759,6 +766,10 @@ static int exec_mmap(struct mm_struct *mm)
759 tsk->mm = mm; 766 tsk->mm = mm;
760 tsk->active_mm = mm; 767 tsk->active_mm = mm;
761 activate_mm(active_mm, mm); 768 activate_mm(active_mm, mm);
769 if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
770 atomic_dec(&old_mm->oom_disable_count);
771 atomic_inc(&tsk->mm->oom_disable_count);
772 }
762 task_unlock(tsk); 773 task_unlock(tsk);
763 arch_pick_mmap_layout(mm); 774 arch_pick_mmap_layout(mm);
764 if (old_mm) { 775 if (old_mm) {
@@ -998,7 +1009,7 @@ int flush_old_exec(struct linux_binprm * bprm)
998 1009
999 bprm->mm = NULL; /* We're using it now */ 1010 bprm->mm = NULL; /* We're using it now */
1000 1011
1001 current->flags &= ~PF_RANDOMIZE; 1012 current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
1002 flush_thread(); 1013 flush_thread();
1003 current->personality &= ~bprm->per_clear; 1014 current->personality &= ~bprm->per_clear;
1004 1015
@@ -1078,14 +1089,14 @@ EXPORT_SYMBOL(setup_new_exec);
1078 */ 1089 */
1079int prepare_bprm_creds(struct linux_binprm *bprm) 1090int prepare_bprm_creds(struct linux_binprm *bprm)
1080{ 1091{
1081 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1092 if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1082 return -ERESTARTNOINTR; 1093 return -ERESTARTNOINTR;
1083 1094
1084 bprm->cred = prepare_exec_creds(); 1095 bprm->cred = prepare_exec_creds();
1085 if (likely(bprm->cred)) 1096 if (likely(bprm->cred))
1086 return 0; 1097 return 0;
1087 1098
1088 mutex_unlock(&current->cred_guard_mutex); 1099 mutex_unlock(&current->signal->cred_guard_mutex);
1089 return -ENOMEM; 1100 return -ENOMEM;
1090} 1101}
1091 1102
@@ -1093,7 +1104,7 @@ void free_bprm(struct linux_binprm *bprm)
1093{ 1104{
1094 free_arg_pages(bprm); 1105 free_arg_pages(bprm);
1095 if (bprm->cred) { 1106 if (bprm->cred) {
1096 mutex_unlock(&current->cred_guard_mutex); 1107 mutex_unlock(&current->signal->cred_guard_mutex);
1097 abort_creds(bprm->cred); 1108 abort_creds(bprm->cred);
1098 } 1109 }
1099 kfree(bprm); 1110 kfree(bprm);
@@ -1114,13 +1125,13 @@ void install_exec_creds(struct linux_binprm *bprm)
1114 * credentials; any time after this it may be unlocked. 1125 * credentials; any time after this it may be unlocked.
1115 */ 1126 */
1116 security_bprm_committed_creds(bprm); 1127 security_bprm_committed_creds(bprm);
1117 mutex_unlock(&current->cred_guard_mutex); 1128 mutex_unlock(&current->signal->cred_guard_mutex);
1118} 1129}
1119EXPORT_SYMBOL(install_exec_creds); 1130EXPORT_SYMBOL(install_exec_creds);
1120 1131
1121/* 1132/*
1122 * determine how safe it is to execute the proposed program 1133 * determine how safe it is to execute the proposed program
1123 * - the caller must hold current->cred_guard_mutex to protect against 1134 * - the caller must hold ->cred_guard_mutex to protect against
1124 * PTRACE_ATTACH 1135 * PTRACE_ATTACH
1125 */ 1136 */
1126int check_unsafe_exec(struct linux_binprm *bprm) 1137int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1401,7 +1412,6 @@ int do_execve(const char * filename,
1401 if (retval < 0) 1412 if (retval < 0)
1402 goto out; 1413 goto out;
1403 1414
1404 current->flags &= ~PF_KTHREAD;
1405 retval = search_binary_handler(bprm,regs); 1415 retval = search_binary_handler(bprm,regs);
1406 if (retval < 0) 1416 if (retval < 0)
1407 goto out; 1417 goto out;
@@ -1454,127 +1464,148 @@ void set_binfmt(struct linux_binfmt *new)
1454 1464
1455EXPORT_SYMBOL(set_binfmt); 1465EXPORT_SYMBOL(set_binfmt);
1456 1466
1467static int expand_corename(struct core_name *cn)
1468{
1469 char *old_corename = cn->corename;
1470
1471 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
1472 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
1473
1474 if (!cn->corename) {
1475 kfree(old_corename);
1476 return -ENOMEM;
1477 }
1478
1479 return 0;
1480}
1481
1482static int cn_printf(struct core_name *cn, const char *fmt, ...)
1483{
1484 char *cur;
1485 int need;
1486 int ret;
1487 va_list arg;
1488
1489 va_start(arg, fmt);
1490 need = vsnprintf(NULL, 0, fmt, arg);
1491 va_end(arg);
1492
1493 if (likely(need < cn->size - cn->used - 1))
1494 goto out_printf;
1495
1496 ret = expand_corename(cn);
1497 if (ret)
1498 goto expand_fail;
1499
1500out_printf:
1501 cur = cn->corename + cn->used;
1502 va_start(arg, fmt);
1503 vsnprintf(cur, need + 1, fmt, arg);
1504 va_end(arg);
1505 cn->used += need;
1506 return 0;
1507
1508expand_fail:
1509 return ret;
1510}
1511
1457/* format_corename will inspect the pattern parameter, and output a 1512/* format_corename will inspect the pattern parameter, and output a
1458 * name into corename, which must have space for at least 1513 * name into corename, which must have space for at least
1459 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 1514 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1460 */ 1515 */
1461static int format_corename(char *corename, long signr) 1516static int format_corename(struct core_name *cn, long signr)
1462{ 1517{
1463 const struct cred *cred = current_cred(); 1518 const struct cred *cred = current_cred();
1464 const char *pat_ptr = core_pattern; 1519 const char *pat_ptr = core_pattern;
1465 int ispipe = (*pat_ptr == '|'); 1520 int ispipe = (*pat_ptr == '|');
1466 char *out_ptr = corename;
1467 char *const out_end = corename + CORENAME_MAX_SIZE;
1468 int rc;
1469 int pid_in_pattern = 0; 1521 int pid_in_pattern = 0;
1522 int err = 0;
1523
1524 cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
1525 cn->corename = kmalloc(cn->size, GFP_KERNEL);
1526 cn->used = 0;
1527
1528 if (!cn->corename)
1529 return -ENOMEM;
1470 1530
1471 /* Repeat as long as we have more pattern to process and more output 1531 /* Repeat as long as we have more pattern to process and more output
1472 space */ 1532 space */
1473 while (*pat_ptr) { 1533 while (*pat_ptr) {
1474 if (*pat_ptr != '%') { 1534 if (*pat_ptr != '%') {
1475 if (out_ptr == out_end) 1535 if (*pat_ptr == 0)
1476 goto out; 1536 goto out;
1477 *out_ptr++ = *pat_ptr++; 1537 err = cn_printf(cn, "%c", *pat_ptr++);
1478 } else { 1538 } else {
1479 switch (*++pat_ptr) { 1539 switch (*++pat_ptr) {
1540 /* single % at the end, drop that */
1480 case 0: 1541 case 0:
1481 goto out; 1542 goto out;
1482 /* Double percent, output one percent */ 1543 /* Double percent, output one percent */
1483 case '%': 1544 case '%':
1484 if (out_ptr == out_end) 1545 err = cn_printf(cn, "%c", '%');
1485 goto out;
1486 *out_ptr++ = '%';
1487 break; 1546 break;
1488 /* pid */ 1547 /* pid */
1489 case 'p': 1548 case 'p':
1490 pid_in_pattern = 1; 1549 pid_in_pattern = 1;
1491 rc = snprintf(out_ptr, out_end - out_ptr, 1550 err = cn_printf(cn, "%d",
1492 "%d", task_tgid_vnr(current)); 1551 task_tgid_vnr(current));
1493 if (rc > out_end - out_ptr)
1494 goto out;
1495 out_ptr += rc;
1496 break; 1552 break;
1497 /* uid */ 1553 /* uid */
1498 case 'u': 1554 case 'u':
1499 rc = snprintf(out_ptr, out_end - out_ptr, 1555 err = cn_printf(cn, "%d", cred->uid);
1500 "%d", cred->uid);
1501 if (rc > out_end - out_ptr)
1502 goto out;
1503 out_ptr += rc;
1504 break; 1556 break;
1505 /* gid */ 1557 /* gid */
1506 case 'g': 1558 case 'g':
1507 rc = snprintf(out_ptr, out_end - out_ptr, 1559 err = cn_printf(cn, "%d", cred->gid);
1508 "%d", cred->gid);
1509 if (rc > out_end - out_ptr)
1510 goto out;
1511 out_ptr += rc;
1512 break; 1560 break;
1513 /* signal that caused the coredump */ 1561 /* signal that caused the coredump */
1514 case 's': 1562 case 's':
1515 rc = snprintf(out_ptr, out_end - out_ptr, 1563 err = cn_printf(cn, "%ld", signr);
1516 "%ld", signr);
1517 if (rc > out_end - out_ptr)
1518 goto out;
1519 out_ptr += rc;
1520 break; 1564 break;
1521 /* UNIX time of coredump */ 1565 /* UNIX time of coredump */
1522 case 't': { 1566 case 't': {
1523 struct timeval tv; 1567 struct timeval tv;
1524 do_gettimeofday(&tv); 1568 do_gettimeofday(&tv);
1525 rc = snprintf(out_ptr, out_end - out_ptr, 1569 err = cn_printf(cn, "%lu", tv.tv_sec);
1526 "%lu", tv.tv_sec);
1527 if (rc > out_end - out_ptr)
1528 goto out;
1529 out_ptr += rc;
1530 break; 1570 break;
1531 } 1571 }
1532 /* hostname */ 1572 /* hostname */
1533 case 'h': 1573 case 'h':
1534 down_read(&uts_sem); 1574 down_read(&uts_sem);
1535 rc = snprintf(out_ptr, out_end - out_ptr, 1575 err = cn_printf(cn, "%s",
1536 "%s", utsname()->nodename); 1576 utsname()->nodename);
1537 up_read(&uts_sem); 1577 up_read(&uts_sem);
1538 if (rc > out_end - out_ptr)
1539 goto out;
1540 out_ptr += rc;
1541 break; 1578 break;
1542 /* executable */ 1579 /* executable */
1543 case 'e': 1580 case 'e':
1544 rc = snprintf(out_ptr, out_end - out_ptr, 1581 err = cn_printf(cn, "%s", current->comm);
1545 "%s", current->comm);
1546 if (rc > out_end - out_ptr)
1547 goto out;
1548 out_ptr += rc;
1549 break; 1582 break;
1550 /* core limit size */ 1583 /* core limit size */
1551 case 'c': 1584 case 'c':
1552 rc = snprintf(out_ptr, out_end - out_ptr, 1585 err = cn_printf(cn, "%lu",
1553 "%lu", rlimit(RLIMIT_CORE)); 1586 rlimit(RLIMIT_CORE));
1554 if (rc > out_end - out_ptr)
1555 goto out;
1556 out_ptr += rc;
1557 break; 1587 break;
1558 default: 1588 default:
1559 break; 1589 break;
1560 } 1590 }
1561 ++pat_ptr; 1591 ++pat_ptr;
1562 } 1592 }
1593
1594 if (err)
1595 return err;
1563 } 1596 }
1597
1564 /* Backward compatibility with core_uses_pid: 1598 /* Backward compatibility with core_uses_pid:
1565 * 1599 *
1566 * If core_pattern does not include a %p (as is the default) 1600 * If core_pattern does not include a %p (as is the default)
1567 * and core_uses_pid is set, then .%pid will be appended to 1601 * and core_uses_pid is set, then .%pid will be appended to
1568 * the filename. Do not do this for piped commands. */ 1602 * the filename. Do not do this for piped commands. */
1569 if (!ispipe && !pid_in_pattern && core_uses_pid) { 1603 if (!ispipe && !pid_in_pattern && core_uses_pid) {
1570 rc = snprintf(out_ptr, out_end - out_ptr, 1604 err = cn_printf(cn, ".%d", task_tgid_vnr(current));
1571 ".%d", task_tgid_vnr(current)); 1605 if (err)
1572 if (rc > out_end - out_ptr) 1606 return err;
1573 goto out;
1574 out_ptr += rc;
1575 } 1607 }
1576out: 1608out:
1577 *out_ptr = 0;
1578 return ispipe; 1609 return ispipe;
1579} 1610}
1580 1611
@@ -1851,7 +1882,7 @@ static int umh_pipe_setup(struct subprocess_info *info)
1851void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1882void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1852{ 1883{
1853 struct core_state core_state; 1884 struct core_state core_state;
1854 char corename[CORENAME_MAX_SIZE + 1]; 1885 struct core_name cn;
1855 struct mm_struct *mm = current->mm; 1886 struct mm_struct *mm = current->mm;
1856 struct linux_binfmt * binfmt; 1887 struct linux_binfmt * binfmt;
1857 const struct cred *old_cred; 1888 const struct cred *old_cred;
@@ -1906,7 +1937,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1906 */ 1937 */
1907 clear_thread_flag(TIF_SIGPENDING); 1938 clear_thread_flag(TIF_SIGPENDING);
1908 1939
1909 ispipe = format_corename(corename, signr); 1940 ispipe = format_corename(&cn, signr);
1941
1942 if (ispipe == -ENOMEM) {
1943 printk(KERN_WARNING "format_corename failed\n");
1944 printk(KERN_WARNING "Aborting core\n");
1945 goto fail_corename;
1946 }
1910 1947
1911 if (ispipe) { 1948 if (ispipe) {
1912 int dump_count; 1949 int dump_count;
@@ -1943,7 +1980,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1943 goto fail_dropcount; 1980 goto fail_dropcount;
1944 } 1981 }
1945 1982
1946 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); 1983 helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
1947 if (!helper_argv) { 1984 if (!helper_argv) {
1948 printk(KERN_WARNING "%s failed to allocate memory\n", 1985 printk(KERN_WARNING "%s failed to allocate memory\n",
1949 __func__); 1986 __func__);
@@ -1956,7 +1993,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1956 argv_free(helper_argv); 1993 argv_free(helper_argv);
1957 if (retval) { 1994 if (retval) {
1958 printk(KERN_INFO "Core dump to %s pipe failed\n", 1995 printk(KERN_INFO "Core dump to %s pipe failed\n",
1959 corename); 1996 cn.corename);
1960 goto close_fail; 1997 goto close_fail;
1961 } 1998 }
1962 } else { 1999 } else {
@@ -1965,7 +2002,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1965 if (cprm.limit < binfmt->min_coredump) 2002 if (cprm.limit < binfmt->min_coredump)
1966 goto fail_unlock; 2003 goto fail_unlock;
1967 2004
1968 cprm.file = filp_open(corename, 2005 cprm.file = filp_open(cn.corename,
1969 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 2006 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1970 0600); 2007 0600);
1971 if (IS_ERR(cprm.file)) 2008 if (IS_ERR(cprm.file))
@@ -2007,6 +2044,8 @@ fail_dropcount:
2007 if (ispipe) 2044 if (ispipe)
2008 atomic_dec(&core_dump_count); 2045 atomic_dec(&core_dump_count);
2009fail_unlock: 2046fail_unlock:
2047 kfree(cn.corename);
2048fail_corename:
2010 coredump_finish(mm); 2049 coredump_finish(mm);
2011 revert_creds(old_cred); 2050 revert_creds(old_cred);
2012fail_creds: 2051fail_creds:
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d91e9d829bc1..dcc941d82d67 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
420 err = exofs_write_begin(NULL, page->mapping, pos, len, 420 err = exofs_write_begin(NULL, page->mapping, pos, len,
421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
422 if (err) 422 if (err)
423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n", 423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
424 err); 424 err);
425 425
426 de->inode_no = cpu_to_le64(inode->i_ino); 426 de->inode_no = cpu_to_le64(inode->i_ino);
@@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, 556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
557 &page, NULL); 557 &page, NULL);
558 if (err) 558 if (err)
559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n", 559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
560 err); 560 err);
561 if (pde) 561 if (pde)
562 pde->rec_len = cpu_to_le16(to - from); 562 pde->rec_len = cpu_to_le16(to - from);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 68cb23e3bb98..b905c79b4f0a 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync)
46{ 46{
47 int ret; 47 int ret;
48 struct inode *inode = filp->f_mapping->host; 48 struct inode *inode = filp->f_mapping->host;
49 struct writeback_control wbc = {
50 .sync_mode = WB_SYNC_ALL,
51 .nr_to_write = 0, /* metadata-only; caller takes care of data */
52 };
53 struct super_block *sb; 49 struct super_block *sb;
54 50
55 if (!(inode->i_state & I_DIRTY)) 51 if (!(inode->i_state & I_DIRTY))
@@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync)
57 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 53 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
58 return 0; 54 return 0;
59 55
60 ret = sync_inode(inode, &wbc); 56 ret = sync_inode_metadata(inode, 1);
61 57
62 /* This is a good place to write the sb */ 58 /* This is a good place to write the sb */
63 /* TODO: Sechedule an sb-sync on create */ 59 /* TODO: Sechedule an sb-sync on create */
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 3eadd97324b1..42685424817b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -185,7 +185,7 @@ static void update_write_page(struct page *page, int ret)
185/* Called at the end of reads, to optionally unlock pages and update their 185/* Called at the end of reads, to optionally unlock pages and update their
186 * status. 186 * status.
187 */ 187 */
188static int __readpages_done(struct page_collect *pcol, bool do_unlock) 188static int __readpages_done(struct page_collect *pcol)
189{ 189{
190 int i; 190 int i;
191 u64 resid; 191 u64 resid;
@@ -221,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
221 page_stat ? "bad_bytes" : "good_bytes"); 221 page_stat ? "bad_bytes" : "good_bytes");
222 222
223 ret = update_read_page(page, page_stat); 223 ret = update_read_page(page, page_stat);
224 if (do_unlock) 224 if (!pcol->read_4_write)
225 unlock_page(page); 225 unlock_page(page);
226 length += PAGE_SIZE; 226 length += PAGE_SIZE;
227 } 227 }
@@ -236,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
236{ 236{
237 struct page_collect *pcol = p; 237 struct page_collect *pcol = p;
238 238
239 __readpages_done(pcol, true); 239 __readpages_done(pcol);
240 atomic_dec(&pcol->sbi->s_curr_pending); 240 atomic_dec(&pcol->sbi->s_curr_pending);
241 kfree(pcol); 241 kfree(pcol);
242} 242}
@@ -257,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
257 } 257 }
258} 258}
259 259
260static int read_exec(struct page_collect *pcol, bool is_sync) 260static int read_exec(struct page_collect *pcol)
261{ 261{
262 struct exofs_i_info *oi = exofs_i(pcol->inode); 262 struct exofs_i_info *oi = exofs_i(pcol->inode);
263 struct exofs_io_state *ios = pcol->ios; 263 struct exofs_io_state *ios = pcol->ios;
@@ -267,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
267 if (!pcol->pages) 267 if (!pcol->pages)
268 return 0; 268 return 0;
269 269
270 /* see comment in _readpage() about sync reads */
271 WARN_ON(is_sync && (pcol->nr_pages != 1));
272
273 ios->pages = pcol->pages; 270 ios->pages = pcol->pages;
274 ios->nr_pages = pcol->nr_pages; 271 ios->nr_pages = pcol->nr_pages;
275 ios->length = pcol->length; 272 ios->length = pcol->length;
276 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; 273 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
277 274
278 if (is_sync) { 275 if (pcol->read_4_write) {
279 exofs_oi_read(oi, pcol->ios); 276 exofs_oi_read(oi, pcol->ios);
280 return __readpages_done(pcol, false); 277 return __readpages_done(pcol);
281 } 278 }
282 279
283 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 280 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -303,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
303 return 0; 300 return 0;
304 301
305err: 302err:
306 if (!is_sync) 303 if (!pcol->read_4_write)
307 _unlock_pcol_pages(pcol, ret, READ); 304 _unlock_pcol_pages(pcol, ret, READ);
308 305
309 pcol_free(pcol); 306 pcol_free(pcol);
@@ -356,7 +353,7 @@ static int readpage_strip(void *data, struct page *page)
356 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 353 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
357 " splitting\n", inode->i_ino, page->index); 354 " splitting\n", inode->i_ino, page->index);
358 355
359 return read_exec(pcol, false); 356 return read_exec(pcol);
360 } 357 }
361 358
362try_again: 359try_again:
@@ -366,7 +363,7 @@ try_again:
366 } else if (unlikely((pcol->pg_first + pcol->nr_pages) != 363 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
367 page->index)) { 364 page->index)) {
368 /* Discontinuity detected, split the request */ 365 /* Discontinuity detected, split the request */
369 ret = read_exec(pcol, false); 366 ret = read_exec(pcol);
370 if (unlikely(ret)) 367 if (unlikely(ret))
371 goto fail; 368 goto fail;
372 goto try_again; 369 goto try_again;
@@ -391,7 +388,7 @@ try_again:
391 page, len, pcol->nr_pages, pcol->length); 388 page, len, pcol->nr_pages, pcol->length);
392 389
393 /* split the request, and start again with current page */ 390 /* split the request, and start again with current page */
394 ret = read_exec(pcol, false); 391 ret = read_exec(pcol);
395 if (unlikely(ret)) 392 if (unlikely(ret))
396 goto fail; 393 goto fail;
397 394
@@ -420,27 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
420 return ret; 417 return ret;
421 } 418 }
422 419
423 return read_exec(&pcol, false); 420 return read_exec(&pcol);
424} 421}
425 422
426static int _readpage(struct page *page, bool is_sync) 423static int _readpage(struct page *page, bool read_4_write)
427{ 424{
428 struct page_collect pcol; 425 struct page_collect pcol;
429 int ret; 426 int ret;
430 427
431 _pcol_init(&pcol, 1, page->mapping->host); 428 _pcol_init(&pcol, 1, page->mapping->host);
432 429
433 /* readpage_strip might call read_exec(,is_sync==false) at several 430 pcol.read_4_write = read_4_write;
434 * places but not if we have a single page.
435 */
436 pcol.read_4_write = is_sync;
437 ret = readpage_strip(&pcol, page); 431 ret = readpage_strip(&pcol, page);
438 if (ret) { 432 if (ret) {
439 EXOFS_ERR("_readpage => %d\n", ret); 433 EXOFS_ERR("_readpage => %d\n", ret);
440 return ret; 434 return ret;
441 } 435 }
442 436
443 return read_exec(&pcol, is_sync); 437 return read_exec(&pcol);
444} 438}
445 439
446/* 440/*
@@ -511,7 +505,7 @@ static int write_exec(struct page_collect *pcol)
511 505
512 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
513 if (!pcol_copy) { 507 if (!pcol_copy) {
514 EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); 508 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
515 ret = -ENOMEM; 509 ret = -ENOMEM;
516 goto err; 510 goto err;
517 } 511 }
@@ -527,7 +521,7 @@ static int write_exec(struct page_collect *pcol)
527 521
528 ret = exofs_oi_write(oi, ios); 522 ret = exofs_oi_write(oi, ios);
529 if (unlikely(ret)) { 523 if (unlikely(ret)) {
530 EXOFS_ERR("write_exec: exofs_oi_write() Faild\n"); 524 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
531 goto err; 525 goto err;
532 } 526 }
533 527
@@ -628,7 +622,7 @@ try_again:
628 /* split the request, next loop will start again */ 622 /* split the request, next loop will start again */
629 ret = write_exec(pcol); 623 ret = write_exec(pcol);
630 if (unlikely(ret)) { 624 if (unlikely(ret)) {
631 EXOFS_DBGMSG("write_exec faild => %d", ret); 625 EXOFS_DBGMSG("write_exec failed => %d", ret);
632 goto fail; 626 goto fail;
633 } 627 }
634 628
@@ -719,7 +713,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
719 ret = simple_write_begin(file, mapping, pos, len, flags, pagep, 713 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
720 fsdata); 714 fsdata);
721 if (ret) { 715 if (ret) {
722 EXOFS_DBGMSG("simple_write_begin faild\n"); 716 EXOFS_DBGMSG("simple_write_begin failed\n");
723 goto out; 717 goto out;
724 } 718 }
725 719
@@ -732,7 +726,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
732 if (ret) { 726 if (ret) {
733 /*SetPageError was done by _readpage. Is it ok?*/ 727 /*SetPageError was done by _readpage. Is it ok?*/
734 unlock_page(page); 728 unlock_page(page);
735 EXOFS_DBGMSG("__readpage_filler faild\n"); 729 EXOFS_DBGMSG("__readpage_filler failed\n");
736 } 730 }
737 } 731 }
738out: 732out:
@@ -1036,6 +1030,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1036 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); 1030 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
1037 } 1031 }
1038 1032
1033 inode->i_mapping->backing_dev_info = sb->s_bdi;
1039 if (S_ISREG(inode->i_mode)) { 1034 if (S_ISREG(inode->i_mode)) {
1040 inode->i_op = &exofs_file_inode_operations; 1035 inode->i_op = &exofs_file_inode_operations;
1041 inode->i_fop = &exofs_file_operations; 1036 inode->i_fop = &exofs_file_operations;
@@ -1072,8 +1067,10 @@ bad_inode:
1072int __exofs_wait_obj_created(struct exofs_i_info *oi) 1067int __exofs_wait_obj_created(struct exofs_i_info *oi)
1073{ 1068{
1074 if (!obj_created(oi)) { 1069 if (!obj_created(oi)) {
1070 EXOFS_DBGMSG("!obj_created\n");
1075 BUG_ON(!obj_2bcreated(oi)); 1071 BUG_ON(!obj_2bcreated(oi));
1076 wait_event(oi->i_wq, obj_created(oi)); 1072 wait_event(oi->i_wq, obj_created(oi));
1073 EXOFS_DBGMSG("wait_event done\n");
1077 } 1074 }
1078 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1075 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1079} 1076}
@@ -1095,7 +1092,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
1095 atomic_dec(&sbi->s_curr_pending); 1092 atomic_dec(&sbi->s_curr_pending);
1096 1093
1097 if (unlikely(ret)) { 1094 if (unlikely(ret)) {
1098 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", 1095 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
1099 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1096 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
1100 /*TODO: When FS is corrupted creation can fail, object already 1097 /*TODO: When FS is corrupted creation can fail, object already
1101 * exist. Get rid of this asynchronous creation, if exist 1098 * exist. Get rid of this asynchronous creation, if exist
@@ -1107,7 +1104,6 @@ static void create_done(struct exofs_io_state *ios, void *p)
1107 1104
1108 set_obj_created(oi); 1105 set_obj_created(oi);
1109 1106
1110 atomic_dec(&inode->i_count);
1111 wake_up(&oi->i_wq); 1107 wake_up(&oi->i_wq);
1112} 1108}
1113 1109
@@ -1135,6 +1131,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1135 1131
1136 sbi = sb->s_fs_info; 1132 sbi = sb->s_fs_info;
1137 1133
1134 inode->i_mapping->backing_dev_info = sb->s_bdi;
1138 sb->s_dirt = 1; 1135 sb->s_dirt = 1;
1139 inode_init_owner(inode, dir, mode); 1136 inode_init_owner(inode, dir, mode);
1140 inode->i_ino = sbi->s_nextid++; 1137 inode->i_ino = sbi->s_nextid++;
@@ -1157,17 +1154,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1157 ios->obj.id = exofs_oi_objno(oi); 1154 ios->obj.id = exofs_oi_objno(oi);
1158 exofs_make_credential(oi->i_cred, &ios->obj); 1155 exofs_make_credential(oi->i_cred, &ios->obj);
1159 1156
1160 /* increment the refcount so that the inode will still be around when we
1161 * reach the callback
1162 */
1163 atomic_inc(&inode->i_count);
1164
1165 ios->done = create_done; 1157 ios->done = create_done;
1166 ios->private = inode; 1158 ios->private = inode;
1167 ios->cred = oi->i_cred; 1159 ios->cred = oi->i_cred;
1168 ret = exofs_sbi_create(ios); 1160 ret = exofs_sbi_create(ios);
1169 if (ret) { 1161 if (ret) {
1170 atomic_dec(&inode->i_count);
1171 exofs_put_io_state(ios); 1162 exofs_put_io_state(ios);
1172 return ERR_PTR(ret); 1163 return ERR_PTR(ret);
1173 } 1164 }
@@ -1215,7 +1206,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1215 1206
1216 args = kzalloc(sizeof(*args), GFP_KERNEL); 1207 args = kzalloc(sizeof(*args), GFP_KERNEL);
1217 if (!args) { 1208 if (!args) {
1218 EXOFS_DBGMSG("Faild kzalloc of args\n"); 1209 EXOFS_DBGMSG("Failed kzalloc of args\n");
1219 return -ENOMEM; 1210 return -ENOMEM;
1220 } 1211 }
1221 1212
@@ -1257,12 +1248,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1257 ios->out_attr_len = 1; 1248 ios->out_attr_len = 1;
1258 ios->out_attr = &attr; 1249 ios->out_attr = &attr;
1259 1250
1260 if (!obj_created(oi)) { 1251 wait_obj_created(oi);
1261 EXOFS_DBGMSG("!obj_created\n");
1262 BUG_ON(!obj_2bcreated(oi));
1263 wait_event(oi->i_wq, obj_created(oi));
1264 EXOFS_DBGMSG("wait_event done\n");
1265 }
1266 1252
1267 if (!do_sync) { 1253 if (!do_sync) {
1268 args->sbi = sbi; 1254 args->sbi = sbi;
@@ -1325,12 +1311,12 @@ void exofs_evict_inode(struct inode *inode)
1325 inode->i_size = 0; 1311 inode->i_size = 0;
1326 end_writeback(inode); 1312 end_writeback(inode);
1327 1313
1328 /* if we are deleting an obj that hasn't been created yet, wait */ 1314 /* if we are deleting an obj that hasn't been created yet, wait.
1329 if (!obj_created(oi)) { 1315 * This also makes sure that create_done cannot be called with an
1330 BUG_ON(!obj_2bcreated(oi)); 1316 * already evicted inode.
1331 wait_event(oi->i_wq, obj_created(oi)); 1317 */
1332 /* ignore the error attempt a remove anyway */ 1318 wait_obj_created(oi);
1333 } 1319 /* ignore the error, attempt a remove anyway */
1334 1320
1335 /* Now Remove the OSD objects */ 1321 /* Now Remove the OSD objects */
1336 ret = exofs_get_io_state(&sbi->layout, &ios); 1322 ret = exofs_get_io_state(&sbi->layout, &ios);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 6550bf70e41d..f74a2ec027a6 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -55,7 +55,7 @@ int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
55 55
56 ret = osd_finalize_request(or, 0, cred, NULL); 56 ret = osd_finalize_request(or, 0, cred, NULL);
57 if (unlikely(ret)) { 57 if (unlikely(ret)) {
58 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); 58 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
59 goto out; 59 goto out;
60 } 60 }
61 61
@@ -79,7 +79,7 @@ int exofs_get_io_state(struct exofs_layout *layout,
79 */ 79 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); 80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) { 81 if (unlikely(!ios)) {
82 EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", 82 EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs)); 83 exofs_io_state_size(layout->s_numdevs));
84 *pios = NULL; 84 *pios = NULL;
85 return -ENOMEM; 85 return -ENOMEM;
@@ -172,7 +172,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
172 172
173 ret = osd_finalize_request(or, 0, ios->cred, NULL); 173 ret = osd_finalize_request(or, 0, ios->cred, NULL);
174 if (unlikely(ret)) { 174 if (unlikely(ret)) {
175 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", 175 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret); 176 ret);
177 return ret; 177 return ret;
178 } 178 }
@@ -361,7 +361,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
361 361
362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
363 if (unlikely(!per_dev->bio)) { 363 if (unlikely(!per_dev->bio)) {
364 EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", 364 EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
365 bio_size); 365 bio_size);
366 return -ENOMEM; 366 return -ENOMEM;
367 } 367 }
@@ -564,7 +564,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
564 master_dev->bio->bi_max_vecs); 564 master_dev->bio->bi_max_vecs);
565 if (unlikely(!bio)) { 565 if (unlikely(!bio)) {
566 EXOFS_DBGMSG( 566 EXOFS_DBGMSG(
567 "Faild to allocate BIO size=%u\n", 567 "Failed to allocate BIO size=%u\n",
568 master_dev->bio->bi_max_vecs); 568 master_dev->bio->bi_max_vecs);
569 ret = -ENOMEM; 569 ret = -ENOMEM;
570 goto out; 570 goto out;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c236863..264e95d02830 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
153 153
154 inode->i_ctime = CURRENT_TIME; 154 inode->i_ctime = CURRENT_TIME;
155 inode_inc_link_count(inode); 155 inode_inc_link_count(inode);
156 atomic_inc(&inode->i_count); 156 ihold(inode);
157 157
158 return exofs_add_nondir(dentry, inode); 158 return exofs_add_nondir(dentry, inode);
159} 159}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 047e92fa3af8..79c3ae6e0456 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -659,19 +659,19 @@ free_bdi:
659/* 659/*
660 * Set up the superblock (calls exofs_fill_super eventually) 660 * Set up the superblock (calls exofs_fill_super eventually)
661 */ 661 */
662static int exofs_get_sb(struct file_system_type *type, 662static struct dentry *exofs_mount(struct file_system_type *type,
663 int flags, const char *dev_name, 663 int flags, const char *dev_name,
664 void *data, struct vfsmount *mnt) 664 void *data)
665{ 665{
666 struct exofs_mountopt opts; 666 struct exofs_mountopt opts;
667 int ret; 667 int ret;
668 668
669 ret = parse_options(data, &opts); 669 ret = parse_options(data, &opts);
670 if (ret) 670 if (ret)
671 return ret; 671 return ERR_PTR(ret);
672 672
673 opts.dev_name = dev_name; 673 opts.dev_name = dev_name;
674 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt); 674 return mount_nodev(type, flags, &opts, exofs_fill_super);
675} 675}
676 676
677/* 677/*
@@ -809,7 +809,7 @@ static const struct export_operations exofs_export_ops = {
809static struct file_system_type exofs_type = { 809static struct file_system_type exofs_type = {
810 .owner = THIS_MODULE, 810 .owner = THIS_MODULE,
811 .name = "exofs", 811 .name = "exofs",
812 .get_sb = exofs_get_sb, 812 .mount = exofs_mount,
813 .kill_sb = generic_shutdown_super, 813 .kill_sb = generic_shutdown_super,
814}; 814};
815 815
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a63..51b304056f10 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -74,21 +74,20 @@ static struct dentry *
74find_disconnected_root(struct dentry *dentry) 74find_disconnected_root(struct dentry *dentry)
75{ 75{
76 dget(dentry); 76 dget(dentry);
77 spin_lock(&dentry->d_lock); 77 while (!IS_ROOT(dentry)) {
78 while (!IS_ROOT(dentry) && 78 struct dentry *parent = dget_parent(dentry);
79 (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) { 79
80 struct dentry *parent = dentry->d_parent; 80 if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
81 dget(parent); 81 dput(parent);
82 spin_unlock(&dentry->d_lock); 82 break;
83 }
84
83 dput(dentry); 85 dput(dentry);
84 dentry = parent; 86 dentry = parent;
85 spin_lock(&dentry->d_lock);
86 } 87 }
87 spin_unlock(&dentry->d_lock);
88 return dentry; 88 return dentry;
89} 89}
90 90
91
92/* 91/*
93 * Make sure target_dir is fully connected to the dentry tree. 92 * Make sure target_dir is fully connected to the dentry tree.
94 * 93 *
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index c6c684b44ea1..0d06f4e75699 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -646,10 +646,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
646 return here; 646 return here;
647} 647}
648 648
649/* 649/**
650 * ext2_try_to_allocate() 650 * ext2_try_to_allocate()
651 * @sb: superblock 651 * @sb: superblock
652 * @handle: handle to this transaction
653 * @group: given allocation block group 652 * @group: given allocation block group
654 * @bitmap_bh: bufferhead holds the block bitmap 653 * @bitmap_bh: bufferhead holds the block bitmap
655 * @grp_goal: given target block within the group 654 * @grp_goal: given target block within the group
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 764109886ec0..2709b34206ab 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
98 if (IS_DIRSYNC(dir)) { 98 if (IS_DIRSYNC(dir)) {
99 err = write_one_page(page, 1); 99 err = write_one_page(page, 1);
100 if (!err) 100 if (!err)
101 err = ext2_sync_inode(dir); 101 err = sync_inode_metadata(dir, 1);
102 } else { 102 } else {
103 unlock_page(page); 103 unlock_page(page);
104 } 104 }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 416daa62242c..6346a2acf326 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
120extern struct inode *ext2_iget (struct super_block *, unsigned long); 120extern struct inode *ext2_iget (struct super_block *, unsigned long);
121extern int ext2_write_inode (struct inode *, struct writeback_control *); 121extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_evict_inode(struct inode *); 122extern void ext2_evict_inode(struct inode *);
123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 123extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern int ext2_setattr (struct dentry *, struct iattr *); 124extern int ext2_setattr (struct dentry *, struct iattr *);
126extern void ext2_set_inode_flags(struct inode *inode); 125extern void ext2_set_inode_flags(struct inode *inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 940c96168868..40ad210a5049 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -458,7 +458,7 @@ failed_out:
458 * the same format as ext2_get_branch() would do. We are calling it after 458 * the same format as ext2_get_branch() would do. We are calling it after
459 * we had read the existing part of chain and partial points to the last 459 * we had read the existing part of chain and partial points to the last
460 * triple of that (one with zero ->key). Upon the exit we have the same 460 * triple of that (one with zero ->key). Upon the exit we have the same
461 * picture as after the successful ext2_get_block(), excpet that in one 461 * picture as after the successful ext2_get_block(), except that in one
462 * place chain is disconnected - *branch->p is still zero (we did not 462 * place chain is disconnected - *branch->p is still zero (we did not
463 * set the last link), but branch->key contains the number that should 463 * set the last link), but branch->key contains the number that should
464 * be placed into *branch->p to fill that gap. 464 * be placed into *branch->p to fill that gap.
@@ -662,7 +662,7 @@ static int ext2_get_blocks(struct inode *inode,
662 mutex_lock(&ei->truncate_mutex); 662 mutex_lock(&ei->truncate_mutex);
663 /* 663 /*
664 * If the indirect block is missing while we are reading 664 * If the indirect block is missing while we are reading
665 * the chain(ext3_get_branch() returns -EAGAIN err), or 665 * the chain(ext2_get_branch() returns -EAGAIN err), or
666 * if the chain has been changed after we grab the semaphore, 666 * if the chain has been changed after we grab the semaphore,
667 * (either because another process truncated this branch, or 667 * (either because another process truncated this branch, or
668 * another get_block allocated this branch) re-grab the chain to see if 668 * another get_block allocated this branch) re-grab the chain to see if
@@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1204 if (inode_needs_sync(inode)) { 1204 if (inode_needs_sync(inode)) {
1205 sync_mapping_buffers(inode->i_mapping); 1205 sync_mapping_buffers(inode->i_mapping);
1206 ext2_sync_inode (inode); 1206 sync_inode_metadata(inode, 1);
1207 } else { 1207 } else {
1208 mark_inode_dirty(inode); 1208 mark_inode_dirty(inode);
1209 } 1209 }
@@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1524} 1524}
1525 1525
1526int ext2_sync_inode(struct inode *inode)
1527{
1528 struct writeback_control wbc = {
1529 .sync_mode = WB_SYNC_ALL,
1530 .nr_to_write = 0, /* sys_fsync did this */
1531 };
1532 return sync_inode(inode, &wbc);
1533}
1534
1535int ext2_setattr(struct dentry *dentry, struct iattr *iattr) 1526int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1536{ 1527{
1537 struct inode *inode = dentry->d_inode; 1528 struct inode *inode = dentry->d_inode;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f2..f8aecd2e3297 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
206 206
207 inode->i_ctime = CURRENT_TIME_SEC; 207 inode->i_ctime = CURRENT_TIME_SEC;
208 inode_inc_link_count(inode); 208 inode_inc_link_count(inode);
209 atomic_inc(&inode->i_count); 209 ihold(inode);
210 210
211 err = ext2_add_link(dentry, inode); 211 err = ext2_add_link(dentry, inode);
212 if (!err) { 212 if (!err) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1ec602673ea8..d89e0b6a2d78 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -747,15 +747,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
747 __le32 features; 747 __le32 features;
748 int err; 748 int err;
749 749
750 err = -ENOMEM;
750 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 751 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
751 if (!sbi) 752 if (!sbi)
752 return -ENOMEM; 753 goto failed_unlock;
753 754
754 sbi->s_blockgroup_lock = 755 sbi->s_blockgroup_lock =
755 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 756 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
756 if (!sbi->s_blockgroup_lock) { 757 if (!sbi->s_blockgroup_lock) {
757 kfree(sbi); 758 kfree(sbi);
758 return -ENOMEM; 759 goto failed_unlock;
759 } 760 }
760 sb->s_fs_info = sbi; 761 sb->s_fs_info = sbi;
761 sbi->s_sb_block = sb_block; 762 sbi->s_sb_block = sb_block;
@@ -1107,6 +1108,7 @@ failed_sbi:
1107 sb->s_fs_info = NULL; 1108 sb->s_fs_info = NULL;
1108 kfree(sbi->s_blockgroup_lock); 1109 kfree(sbi->s_blockgroup_lock);
1109 kfree(sbi); 1110 kfree(sbi);
1111failed_unlock:
1110 return ret; 1112 return ret;
1111} 1113}
1112 1114
@@ -1219,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1219 } 1221 }
1220 1222
1221 es = sbi->s_es; 1223 es = sbi->s_es;
1222 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1224 if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
1223 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1224 invalidate_inodes(sb)) {
1225 ext2_msg(sb, KERN_WARNING, "warning: refusing change of " 1225 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1226 "xip flag with busy inodes while remounting"); 1226 "xip flag with busy inodes while remounting");
1227 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1227 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
@@ -1356,10 +1356,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1356 return 0; 1356 return 0;
1357} 1357}
1358 1358
1359static int ext2_get_sb(struct file_system_type *fs_type, 1359static struct dentry *ext2_mount(struct file_system_type *fs_type,
1360 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1360 int flags, const char *dev_name, void *data)
1361{ 1361{
1362 return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt); 1362 return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
1363} 1363}
1364 1364
1365#ifdef CONFIG_QUOTA 1365#ifdef CONFIG_QUOTA
@@ -1473,7 +1473,7 @@ out:
1473static struct file_system_type ext2_fs_type = { 1473static struct file_system_type ext2_fs_type = {
1474 .owner = THIS_MODULE, 1474 .owner = THIS_MODULE,
1475 .name = "ext2", 1475 .name = "ext2",
1476 .get_sb = ext2_get_sb, 1476 .mount = ext2_mount,
1477 .kill_sb = kill_block_super, 1477 .kill_sb = kill_block_super,
1478 .fs_flags = FS_REQUIRES_DEV, 1478 .fs_flags = FS_REQUIRES_DEV,
1479}; 1479};
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8c29ae15129e..f84700be3274 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
699 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; 699 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
700 inode->i_ctime = CURRENT_TIME_SEC; 700 inode->i_ctime = CURRENT_TIME_SEC;
701 if (IS_SYNC(inode)) { 701 if (IS_SYNC(inode)) {
702 error = ext2_sync_inode (inode); 702 error = sync_inode_metadata(inode, 1);
703 /* In case sync failed due to ENOSPC the inode was actually 703 /* In case sync failed due to ENOSPC the inode was actually
704 * written (only some dirty data were not) so we just proceed 704 * written (only some dirty data were not) so we just proceed
705 * as if nothing happened and cleanup the unused block */ 705 * as if nothing happened and cleanup the unused block */
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 4a32511f4ded..b3db22649426 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -792,9 +792,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
792 if (here < 0) 792 if (here < 0)
793 here = 0; 793 here = 0;
794 794
795 p = ((char *)bh->b_data) + (here >> 3); 795 p = bh->b_data + (here >> 3);
796 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); 796 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
797 next = (r - ((char *)bh->b_data)) << 3; 797 next = (r - bh->b_data) << 3;
798 798
799 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) 799 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
800 return next; 800 return next;
@@ -810,8 +810,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
810 810
811/** 811/**
812 * claim_block() 812 * claim_block()
813 * @lock: the spin lock for this block group
813 * @block: the free block (group relative) to allocate 814 * @block: the free block (group relative) to allocate
814 * @bh: the bufferhead containts the block group bitmap 815 * @bh: the buffer_head contains the block group bitmap
815 * 816 *
816 * We think we can allocate this block in this bitmap. Try to set the bit. 817 * We think we can allocate this block in this bitmap. Try to set the bit.
817 * If that succeeds then check that nobody has allocated and then freed the 818 * If that succeeds then check that nobody has allocated and then freed the
@@ -956,9 +957,11 @@ fail_access:
956 * but we will shift to the place where start_block is, 957 * but we will shift to the place where start_block is,
957 * then start from there, when looking for a reservable space. 958 * then start from there, when looking for a reservable space.
958 * 959 *
959 * @size: the target new reservation window size 960 * @my_rsv: the reservation window
960 * 961 *
961 * @group_first_block: the first block we consider to start 962 * @sb: the super block
963 *
964 * @start_block: the first block we consider to start
962 * the real search from 965 * the real search from
963 * 966 *
964 * @last_block: 967 * @last_block:
@@ -1084,7 +1087,7 @@ static int find_next_reservable_window(
1084 * 1087 *
1085 * failed: we failed to find a reservation window in this group 1088 * failed: we failed to find a reservation window in this group
1086 * 1089 *
1087 * @rsv: the reservation 1090 * @my_rsv: the reservation window
1088 * 1091 *
1089 * @grp_goal: The goal (group-relative). It is where the search for a 1092 * @grp_goal: The goal (group-relative). It is where the search for a
1090 * free reservable space should start from. 1093 * free reservable space should start from.
@@ -1273,8 +1276,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1273 * @group: given allocation block group 1276 * @group: given allocation block group
1274 * @bitmap_bh: bufferhead holds the block bitmap 1277 * @bitmap_bh: bufferhead holds the block bitmap
1275 * @grp_goal: given target block within the group 1278 * @grp_goal: given target block within the group
1276 * @count: target number of blocks to allocate
1277 * @my_rsv: reservation window 1279 * @my_rsv: reservation window
1280 * @count: target number of blocks to allocate
1278 * @errp: pointer to store the error code 1281 * @errp: pointer to store the error code
1279 * 1282 *
1280 * This is the main function used to allocate a new block and its reservation 1283 * This is the main function used to allocate a new block and its reservation
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a6..09b13bb34c94 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
90 * storage 90 * storage
91 */ 91 */
92 if (needs_barrier) 92 if (needs_barrier)
93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
94 BLKDEV_IFL_WAIT);
95 return ret; 94 return ret;
96} 95}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4ab72db3559e..9724aef22460 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -570,9 +570,14 @@ got:
570 ei->i_state_flags = 0; 570 ei->i_state_flags = 0;
571 ext3_set_inode_state(inode, EXT3_STATE_NEW); 571 ext3_set_inode_state(inode, EXT3_STATE_NEW);
572 572
573 ei->i_extra_isize = 573 /* See comment in ext3_iget for explanation */
574 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 574 if (ino >= EXT3_FIRST_INO(sb) + 1 &&
575 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 575 EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
576 ei->i_extra_isize =
577 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
578 } else {
579 ei->i_extra_isize = 0;
580 }
576 581
577 ret = inode; 582 ret = inode;
578 dquot_initialize(inode); 583 dquot_initialize(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4cda79..a9580617edd2 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -498,7 +498,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
498} 498}
499 499
500/** 500/**
501 * ext3_blks_to_allocate: Look up the block map and count the number 501 * ext3_blks_to_allocate - Look up the block map and count the number
502 * of direct blocks need to be allocated for the given branch. 502 * of direct blocks need to be allocated for the given branch.
503 * 503 *
504 * @branch: chain of indirect blocks 504 * @branch: chain of indirect blocks
@@ -536,14 +536,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
536} 536}
537 537
538/** 538/**
539 * ext3_alloc_blocks: multiple allocate blocks needed for a branch 539 * ext3_alloc_blocks - multiple allocate blocks needed for a branch
540 * @handle: handle for this transaction
541 * @inode: owner
542 * @goal: preferred place for allocation
540 * @indirect_blks: the number of blocks need to allocate for indirect 543 * @indirect_blks: the number of blocks need to allocate for indirect
541 * blocks 544 * blocks
542 * 545 * @blks: number of blocks need to allocated for direct blocks
543 * @new_blocks: on return it will store the new block numbers for 546 * @new_blocks: on return it will store the new block numbers for
544 * the indirect blocks(if needed) and the first direct block, 547 * the indirect blocks(if needed) and the first direct block,
545 * @blks: on return it will store the total number of allocated 548 * @err: here we store the error value
546 * direct blocks 549 *
550 * return the number of direct blocks allocated
547 */ 551 */
548static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, 552static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
549 ext3_fsblk_t goal, int indirect_blks, int blks, 553 ext3_fsblk_t goal, int indirect_blks, int blks,
@@ -598,9 +602,11 @@ failed_out:
598 602
599/** 603/**
600 * ext3_alloc_branch - allocate and set up a chain of blocks. 604 * ext3_alloc_branch - allocate and set up a chain of blocks.
605 * @handle: handle for this transaction
601 * @inode: owner 606 * @inode: owner
602 * @indirect_blks: number of allocated indirect blocks 607 * @indirect_blks: number of allocated indirect blocks
603 * @blks: number of allocated direct blocks 608 * @blks: number of allocated direct blocks
609 * @goal: preferred place for allocation
604 * @offsets: offsets (in the blocks) to store the pointers to next. 610 * @offsets: offsets (in the blocks) to store the pointers to next.
605 * @branch: place to store the chain in. 611 * @branch: place to store the chain in.
606 * 612 *
@@ -700,10 +706,9 @@ failed:
700 706
701/** 707/**
702 * ext3_splice_branch - splice the allocated branch onto inode. 708 * ext3_splice_branch - splice the allocated branch onto inode.
709 * @handle: handle for this transaction
703 * @inode: owner 710 * @inode: owner
704 * @block: (logical) number of block we are adding 711 * @block: (logical) number of block we are adding
705 * @chain: chain of indirect blocks (with a missing link - see
706 * ext3_alloc_branch)
707 * @where: location of missing link 712 * @where: location of missing link
708 * @num: number of indirect blocks we are adding 713 * @num: number of indirect blocks we are adding
709 * @blks: number of direct blocks we are adding 714 * @blks: number of direct blocks we are adding
@@ -1696,8 +1701,8 @@ static int ext3_journalled_writepage(struct page *page,
1696 * doesn't seem much point in redirtying the page here. 1701 * doesn't seem much point in redirtying the page here.
1697 */ 1702 */
1698 ClearPageChecked(page); 1703 ClearPageChecked(page);
1699 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 1704 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
1700 ext3_get_block); 1705 ext3_get_block);
1701 if (ret != 0) { 1706 if (ret != 0) {
1702 ext3_journal_stop(handle); 1707 ext3_journal_stop(handle);
1703 goto out_unlock; 1708 goto out_unlock;
@@ -2530,7 +2535,6 @@ void ext3_truncate(struct inode *inode)
2530 */ 2535 */
2531 } else { 2536 } else {
2532 /* Shared branch grows from an indirect block */ 2537 /* Shared branch grows from an indirect block */
2533 BUFFER_TRACE(partial->bh, "get_write_access");
2534 ext3_free_branches(handle, inode, partial->bh, 2538 ext3_free_branches(handle, inode, partial->bh,
2535 partial->p, 2539 partial->p,
2536 partial->p+1, (chain+n-1) - partial); 2540 partial->p+1, (chain+n-1) - partial);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2b35ddb70d65..bce9dce639b8 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2260,7 +2260,7 @@ retry:
2260 2260
2261 inode->i_ctime = CURRENT_TIME_SEC; 2261 inode->i_ctime = CURRENT_TIME_SEC;
2262 inc_nlink(inode); 2262 inc_nlink(inode);
2263 atomic_inc(&inode->i_count); 2263 ihold(inode);
2264 2264
2265 err = ext3_add_entry(handle, dentry, inode); 2265 err = ext3_add_entry(handle, dentry, inode);
2266 if (!err) { 2266 if (!err) {
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0ccd7b12b73c..e746d30b1232 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -977,7 +977,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
977 o_blocks_count = le32_to_cpu(es->s_blocks_count); 977 o_blocks_count = le32_to_cpu(es->s_blocks_count);
978 978
979 if (test_opt(sb, DEBUG)) 979 if (test_opt(sb, DEBUG))
980 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", 980 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
981 " upto "E3FSBLK" blocks\n",
981 o_blocks_count, n_blocks_count); 982 o_blocks_count, n_blocks_count);
982 983
983 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 984 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -985,7 +986,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
985 986
986 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 987 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
987 printk(KERN_ERR "EXT3-fs: filesystem on %s:" 988 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
988 " too large to resize to %lu blocks safely\n", 989 " too large to resize to "E3FSBLK" blocks safely\n",
989 sb->s_id, n_blocks_count); 990 sb->s_id, n_blocks_count);
990 if (sizeof(sector_t) < 8) 991 if (sizeof(sector_t) < 8)
991 ext3_warning(sb, __func__, 992 ext3_warning(sb, __func__,
@@ -1065,11 +1066,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1065 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1066 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1066 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1067 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1067 mutex_unlock(&EXT3_SB(sb)->s_resize_lock); 1068 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1068 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, 1069 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
1069 o_blocks_count + add); 1070 o_blocks_count, o_blocks_count + add);
1070 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1071 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1071 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, 1072 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
1072 o_blocks_count + add); 1073 o_blocks_count, o_blocks_count + add);
1073 if ((err = ext3_journal_stop(handle))) 1074 if ((err = ext3_journal_stop(handle)))
1074 goto exit_put; 1075 goto exit_put;
1075 if (test_opt(sb, DEBUG)) 1076 if (test_opt(sb, DEBUG))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a367dd044280..2fedaf8b5012 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -411,9 +411,6 @@ static void ext3_put_super (struct super_block * sb)
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
415 lock_kernel();
416
417 ext3_xattr_put_super(sb); 414 ext3_xattr_put_super(sb);
418 err = journal_destroy(sbi->s_journal); 415 err = journal_destroy(sbi->s_journal);
419 sbi->s_journal = NULL; 416 sbi->s_journal = NULL;
@@ -462,8 +459,6 @@ static void ext3_put_super (struct super_block * sb)
462 sb->s_fs_info = NULL; 459 sb->s_fs_info = NULL;
463 kfree(sbi->s_blockgroup_lock); 460 kfree(sbi->s_blockgroup_lock);
464 kfree(sbi); 461 kfree(sbi);
465
466 unlock_kernel();
467} 462}
468 463
469static struct kmem_cache *ext3_inode_cachep; 464static struct kmem_cache *ext3_inode_cachep;
@@ -1306,9 +1301,9 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1306 ext3_msg(sb, KERN_WARNING, 1301 ext3_msg(sb, KERN_WARNING,
1307 "warning: mounting fs with errors, " 1302 "warning: mounting fs with errors, "
1308 "running e2fsck is recommended"); 1303 "running e2fsck is recommended");
1309 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1304 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1310 le16_to_cpu(es->s_mnt_count) >= 1305 le16_to_cpu(es->s_mnt_count) >=
1311 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1306 le16_to_cpu(es->s_max_mnt_count))
1312 ext3_msg(sb, KERN_WARNING, 1307 ext3_msg(sb, KERN_WARNING,
1313 "warning: maximal mount count reached, " 1308 "warning: maximal mount count reached, "
1314 "running e2fsck is recommended"); 1309 "running e2fsck is recommended");
@@ -1325,7 +1320,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1325 valid forever! :) */ 1320 valid forever! :) */
1326 es->s_state &= cpu_to_le16(~EXT3_VALID_FS); 1321 es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
1327#endif 1322#endif
1328 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1323 if (!le16_to_cpu(es->s_max_mnt_count))
1329 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); 1324 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
1330 le16_add_cpu(&es->s_mnt_count, 1); 1325 le16_add_cpu(&es->s_mnt_count, 1);
1331 es->s_mtime = cpu_to_le32(get_seconds()); 1326 es->s_mtime = cpu_to_le32(get_seconds());
@@ -1627,8 +1622,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1627 sbi->s_resgid = EXT3_DEF_RESGID; 1622 sbi->s_resgid = EXT3_DEF_RESGID;
1628 sbi->s_sb_block = sb_block; 1623 sbi->s_sb_block = sb_block;
1629 1624
1630 unlock_kernel();
1631
1632 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); 1625 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1633 if (!blocksize) { 1626 if (!blocksize) {
1634 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); 1627 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
@@ -1654,7 +1647,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1654 * Note: s_es must be initialized as soon as possible because 1647 * Note: s_es must be initialized as soon as possible because
1655 * some ext3 macro-instructions depend on its value 1648 * some ext3 macro-instructions depend on its value
1656 */ 1649 */
1657 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); 1650 es = (struct ext3_super_block *) (bh->b_data + offset);
1658 sbi->s_es = es; 1651 sbi->s_es = es;
1659 sb->s_magic = le16_to_cpu(es->s_magic); 1652 sb->s_magic = le16_to_cpu(es->s_magic);
1660 if (sb->s_magic != EXT3_SUPER_MAGIC) 1653 if (sb->s_magic != EXT3_SUPER_MAGIC)
@@ -1765,7 +1758,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1765 "error: can't read superblock on 2nd try"); 1758 "error: can't read superblock on 2nd try");
1766 goto failed_mount; 1759 goto failed_mount;
1767 } 1760 }
1768 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); 1761 es = (struct ext3_super_block *)(bh->b_data + offset);
1769 sbi->s_es = es; 1762 sbi->s_es = es;
1770 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { 1763 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1771 ext3_msg(sb, KERN_ERR, 1764 ext3_msg(sb, KERN_ERR,
@@ -1864,13 +1857,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1864 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - 1857 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
1865 le32_to_cpu(es->s_first_data_block) - 1) 1858 le32_to_cpu(es->s_first_data_block) - 1)
1866 / EXT3_BLOCKS_PER_GROUP(sb)) + 1; 1859 / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
1867 db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / 1860 db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
1868 EXT3_DESC_PER_BLOCK(sb);
1869 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), 1861 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1870 GFP_KERNEL); 1862 GFP_KERNEL);
1871 if (sbi->s_group_desc == NULL) { 1863 if (sbi->s_group_desc == NULL) {
1872 ext3_msg(sb, KERN_ERR, 1864 ext3_msg(sb, KERN_ERR,
1873 "error: not enough memory"); 1865 "error: not enough memory");
1866 ret = -ENOMEM;
1874 goto failed_mount; 1867 goto failed_mount;
1875 } 1868 }
1876 1869
@@ -1958,6 +1951,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1958 } 1951 }
1959 if (err) { 1952 if (err) {
1960 ext3_msg(sb, KERN_ERR, "error: insufficient memory"); 1953 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1954 ret = err;
1961 goto failed_mount3; 1955 goto failed_mount3;
1962 } 1956 }
1963 1957
@@ -2025,7 +2019,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2025 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2019 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
2026 "writeback"); 2020 "writeback");
2027 2021
2028 lock_kernel();
2029 return 0; 2022 return 0;
2030 2023
2031cantfind_ext3: 2024cantfind_ext3:
@@ -2055,7 +2048,6 @@ out_fail:
2055 sb->s_fs_info = NULL; 2048 sb->s_fs_info = NULL;
2056 kfree(sbi->s_blockgroup_lock); 2049 kfree(sbi->s_blockgroup_lock);
2057 kfree(sbi); 2050 kfree(sbi);
2058 lock_kernel();
2059 return ret; 2051 return ret;
2060} 2052}
2061 2053
@@ -2168,7 +2160,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2168 goto out_bdev; 2160 goto out_bdev;
2169 } 2161 }
2170 2162
2171 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); 2163 es = (struct ext3_super_block *) (bh->b_data + offset);
2172 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || 2164 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
2173 !(le32_to_cpu(es->s_feature_incompat) & 2165 !(le32_to_cpu(es->s_feature_incompat) &
2174 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { 2166 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -2361,6 +2353,21 @@ static int ext3_commit_super(struct super_block *sb,
2361 2353
2362 if (!sbh) 2354 if (!sbh)
2363 return error; 2355 return error;
2356
2357 if (buffer_write_io_error(sbh)) {
2358 /*
2359 * Oh, dear. A previous attempt to write the
2360 * superblock failed. This could happen because the
2361 * USB device was yanked out. Or it could happen to
2362 * be a transient write error and maybe the block will
2363 * be remapped. Nothing we can do but to retry the
2364 * write and hope for the best.
2365 */
2366 ext3_msg(sb, KERN_ERR, "previous I/O error to "
2367 "superblock detected");
2368 clear_buffer_write_io_error(sbh);
2369 set_buffer_uptodate(sbh);
2370 }
2364 /* 2371 /*
2365 * If the file system is mounted read-only, don't update the 2372 * If the file system is mounted read-only, don't update the
2366 * superblock write time. This avoids updating the superblock 2373 * superblock write time. This avoids updating the superblock
@@ -2377,8 +2384,15 @@ static int ext3_commit_super(struct super_block *sb,
2377 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); 2384 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2378 BUFFER_TRACE(sbh, "marking dirty"); 2385 BUFFER_TRACE(sbh, "marking dirty");
2379 mark_buffer_dirty(sbh); 2386 mark_buffer_dirty(sbh);
2380 if (sync) 2387 if (sync) {
2381 error = sync_dirty_buffer(sbh); 2388 error = sync_dirty_buffer(sbh);
2389 if (buffer_write_io_error(sbh)) {
2390 ext3_msg(sb, KERN_ERR, "I/O error while writing "
2391 "superblock");
2392 clear_buffer_write_io_error(sbh);
2393 set_buffer_uptodate(sbh);
2394 }
2395 }
2382 return error; 2396 return error;
2383} 2397}
2384 2398
@@ -2538,8 +2552,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2538 int i; 2552 int i;
2539#endif 2553#endif
2540 2554
2541 lock_kernel();
2542
2543 /* Store the original options */ 2555 /* Store the original options */
2544 lock_super(sb); 2556 lock_super(sb);
2545 old_sb_flags = sb->s_flags; 2557 old_sb_flags = sb->s_flags;
@@ -2648,7 +2660,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2648 kfree(old_opts.s_qf_names[i]); 2660 kfree(old_opts.s_qf_names[i]);
2649#endif 2661#endif
2650 unlock_super(sb); 2662 unlock_super(sb);
2651 unlock_kernel();
2652 2663
2653 if (enable_quota) 2664 if (enable_quota)
2654 dquot_resume(sb, -1); 2665 dquot_resume(sb, -1);
@@ -2669,7 +2680,6 @@ restore_opts:
2669 } 2680 }
2670#endif 2681#endif
2671 unlock_super(sb); 2682 unlock_super(sb);
2672 unlock_kernel();
2673 return err; 2683 return err;
2674} 2684}
2675 2685
@@ -3010,16 +3020,16 @@ out:
3010 3020
3011#endif 3021#endif
3012 3022
3013static int ext3_get_sb(struct file_system_type *fs_type, 3023static struct dentry *ext3_mount(struct file_system_type *fs_type,
3014 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3024 int flags, const char *dev_name, void *data)
3015{ 3025{
3016 return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt); 3026 return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
3017} 3027}
3018 3028
3019static struct file_system_type ext3_fs_type = { 3029static struct file_system_type ext3_fs_type = {
3020 .owner = THIS_MODULE, 3030 .owner = THIS_MODULE,
3021 .name = "ext3", 3031 .name = "ext3",
3022 .get_sb = ext3_get_sb, 3032 .mount = ext3_mount,
3023 .kill_sb = kill_block_super, 3033 .kill_sb = kill_block_super,
3024 .fs_flags = FS_REQUIRES_DEV, 3034 .fs_flags = FS_REQUIRES_DEV,
3025}; 3035};
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5fe..c947e36eda6c 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_EXT4_FS) += ext4.o 5obj-$(CONFIG_EXT4_FS) += ext4.o
6 6
7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
10 10
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd30799a43ed..14c3af26c671 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
171 * less than the blocksize * 8 ( which is the size 171 * less than the blocksize * 8 ( which is the size
172 * of bitmap ), set rest of the block bitmap to 1 172 * of bitmap ), set rest of the block bitmap to 1
173 */ 173 */
174 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); 174 ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
175 bh->b_data);
175 } 176 }
176 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); 177 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
177} 178}
@@ -489,7 +490,7 @@ error_return:
489 * Check if filesystem has nblocks free & available for allocation. 490 * Check if filesystem has nblocks free & available for allocation.
490 * On success return 1, return 0 on failure. 491 * On success return 1, return 0 on failure.
491 */ 492 */
492int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) 493static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
493{ 494{
494 s64 free_blocks, dirty_blocks, root_blocks; 495 s64 free_blocks, dirty_blocks, root_blocks;
495 struct percpu_counter *fbc = &sbi->s_freeblocks_counter; 496 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3db5084db9bd..fac90f3fba80 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,16 +29,15 @@ struct ext4_system_zone {
29 29
30static struct kmem_cache *ext4_system_zone_cachep; 30static struct kmem_cache *ext4_system_zone_cachep;
31 31
32int __init init_ext4_system_zone(void) 32int __init ext4_init_system_zone(void)
33{ 33{
34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
35 SLAB_RECLAIM_ACCOUNT);
36 if (ext4_system_zone_cachep == NULL) 35 if (ext4_system_zone_cachep == NULL)
37 return -ENOMEM; 36 return -ENOMEM;
38 return 0; 37 return 0;
39} 38}
40 39
41void exit_ext4_system_zone(void) 40void ext4_exit_system_zone(void)
42{ 41{
43 kmem_cache_destroy(ext4_system_zone_cachep); 42 kmem_cache_destroy(ext4_system_zone_cachep);
44} 43}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 374510f72baa..ece76fb6a40c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
39 struct file *filp); 39 struct file *filp);
40 40
41const struct file_operations ext4_dir_operations = { 41const struct file_operations ext4_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = ext4_llseek,
43 .read = generic_read_dir, 43 .read = generic_read_dir,
44 .readdir = ext4_readdir, /* we take BKL. needed?*/ 44 .readdir = ext4_readdir, /* we take BKL. needed?*/
45 .unlocked_ioctl = ext4_ioctl, 45 .unlocked_ioctl = ext4_ioctl,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 889ec9d5e6ad..6a5edea2d70b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -168,7 +168,20 @@ struct mpage_da_data {
168 int pages_written; 168 int pages_written;
169 int retval; 169 int retval;
170}; 170};
171#define EXT4_IO_UNWRITTEN 0x1 171
172/*
173 * Flags for ext4_io_end->flags
174 */
175#define EXT4_IO_END_UNWRITTEN 0x0001
176#define EXT4_IO_END_ERROR 0x0002
177
178struct ext4_io_page {
179 struct page *p_page;
180 atomic_t p_count;
181};
182
183#define MAX_IO_PAGES 128
184
172typedef struct ext4_io_end { 185typedef struct ext4_io_end {
173 struct list_head list; /* per-file finished IO list */ 186 struct list_head list; /* per-file finished IO list */
174 struct inode *inode; /* file being written to */ 187 struct inode *inode; /* file being written to */
@@ -179,8 +192,18 @@ typedef struct ext4_io_end {
179 struct work_struct work; /* data work queue */ 192 struct work_struct work; /* data work queue */
180 struct kiocb *iocb; /* iocb struct for AIO */ 193 struct kiocb *iocb; /* iocb struct for AIO */
181 int result; /* error value for AIO */ 194 int result; /* error value for AIO */
195 int num_io_pages;
196 struct ext4_io_page *pages[MAX_IO_PAGES];
182} ext4_io_end_t; 197} ext4_io_end_t;
183 198
199struct ext4_io_submit {
200 int io_op;
201 struct bio *io_bio;
202 ext4_io_end_t *io_end;
203 struct ext4_io_page *io_page;
204 sector_t io_next_block;
205};
206
184/* 207/*
185 * Special inodes numbers 208 * Special inodes numbers
186 */ 209 */
@@ -205,6 +228,7 @@ typedef struct ext4_io_end {
205#define EXT4_MIN_BLOCK_SIZE 1024 228#define EXT4_MIN_BLOCK_SIZE 1024
206#define EXT4_MAX_BLOCK_SIZE 65536 229#define EXT4_MAX_BLOCK_SIZE 65536
207#define EXT4_MIN_BLOCK_LOG_SIZE 10 230#define EXT4_MIN_BLOCK_LOG_SIZE 10
231#define EXT4_MAX_BLOCK_LOG_SIZE 16
208#ifdef __KERNEL__ 232#ifdef __KERNEL__
209# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) 233# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize)
210#else 234#else
@@ -834,6 +858,7 @@ struct ext4_inode_info {
834 spinlock_t i_completed_io_lock; 858 spinlock_t i_completed_io_lock;
835 /* current io_end structure for async DIO write*/ 859 /* current io_end structure for async DIO write*/
836 ext4_io_end_t *cur_aio_dio; 860 ext4_io_end_t *cur_aio_dio;
861 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
837 862
838 /* 863 /*
839 * Transactions that contain inode's metadata needed to complete 864 * Transactions that contain inode's metadata needed to complete
@@ -889,6 +914,7 @@ struct ext4_inode_info {
889#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 914#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
890#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 915#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
891#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 916#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
917#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
892 918
893#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 919#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
894#define set_opt(o, opt) o |= EXT4_MOUNT_##opt 920#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
@@ -1087,7 +1113,6 @@ struct ext4_sb_info {
1087 struct completion s_kobj_unregister; 1113 struct completion s_kobj_unregister;
1088 1114
1089 /* Journaling */ 1115 /* Journaling */
1090 struct inode *s_journal_inode;
1091 struct journal_s *s_journal; 1116 struct journal_s *s_journal;
1092 struct list_head s_orphan; 1117 struct list_head s_orphan;
1093 struct mutex s_orphan_lock; 1118 struct mutex s_orphan_lock;
@@ -1120,10 +1145,7 @@ struct ext4_sb_info {
1120 /* for buddy allocator */ 1145 /* for buddy allocator */
1121 struct ext4_group_info ***s_group_info; 1146 struct ext4_group_info ***s_group_info;
1122 struct inode *s_buddy_cache; 1147 struct inode *s_buddy_cache;
1123 long s_blocks_reserved;
1124 spinlock_t s_reserve_lock;
1125 spinlock_t s_md_lock; 1148 spinlock_t s_md_lock;
1126 tid_t s_last_transaction;
1127 unsigned short *s_mb_offsets; 1149 unsigned short *s_mb_offsets;
1128 unsigned int *s_mb_maxs; 1150 unsigned int *s_mb_maxs;
1129 1151
@@ -1141,7 +1163,6 @@ struct ext4_sb_info {
1141 unsigned long s_mb_last_start; 1163 unsigned long s_mb_last_start;
1142 1164
1143 /* stats for buddy allocator */ 1165 /* stats for buddy allocator */
1144 spinlock_t s_mb_pa_lock;
1145 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ 1166 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
1146 atomic_t s_bal_success; /* we found long enough chunks */ 1167 atomic_t s_bal_success; /* we found long enough chunks */
1147 atomic_t s_bal_allocated; /* in blocks */ 1168 atomic_t s_bal_allocated; /* in blocks */
@@ -1172,6 +1193,11 @@ struct ext4_sb_info {
1172 1193
1173 /* timer for periodic error stats printing */ 1194 /* timer for periodic error stats printing */
1174 struct timer_list s_err_report; 1195 struct timer_list s_err_report;
1196
1197 /* Lazy inode table initialization info */
1198 struct ext4_li_request *s_li_request;
1199 /* Wait multiplier for lazy initialization thread */
1200 unsigned int s_li_wait_mult;
1175}; 1201};
1176 1202
1177static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1203static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1533,7 +1559,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
1533void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 1559void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
1534 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); 1560 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
1535 1561
1536extern struct proc_dir_entry *ext4_proc_root; 1562/*
1563 * Timeout and state flag for lazy initialization inode thread.
1564 */
1565#define EXT4_DEF_LI_WAIT_MULT 10
1566#define EXT4_DEF_LI_MAX_START_DELAY 5
1567#define EXT4_LAZYINIT_QUIT 0x0001
1568#define EXT4_LAZYINIT_RUNNING 0x0002
1569
1570/*
1571 * Lazy inode table initialization info
1572 */
1573struct ext4_lazy_init {
1574 unsigned long li_state;
1575
1576 wait_queue_head_t li_wait_daemon;
1577 wait_queue_head_t li_wait_task;
1578 struct timer_list li_timer;
1579 struct task_struct *li_task;
1580
1581 struct list_head li_request_list;
1582 struct mutex li_list_mtx;
1583};
1584
1585struct ext4_li_request {
1586 struct super_block *lr_super;
1587 struct ext4_sb_info *lr_sbi;
1588 ext4_group_t lr_next_group;
1589 struct list_head lr_request;
1590 unsigned long lr_next_sched;
1591 unsigned long lr_timeout;
1592};
1593
1594struct ext4_features {
1595 struct kobject f_kobj;
1596 struct completion f_kobj_unregister;
1597};
1537 1598
1538/* 1599/*
1539 * Function prototypes 1600 * Function prototypes
@@ -1561,7 +1622,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
1561extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 1622extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1562 ext4_fsblk_t goal, unsigned long *count, int *errp); 1623 ext4_fsblk_t goal, unsigned long *count, int *errp);
1563extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1624extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1564extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1565extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 1625extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1566 ext4_fsblk_t block, unsigned long count); 1626 ext4_fsblk_t block, unsigned long count);
1567extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1627extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1605,11 +1665,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1605extern unsigned long ext4_count_free_inodes(struct super_block *); 1665extern unsigned long ext4_count_free_inodes(struct super_block *);
1606extern unsigned long ext4_count_dirs(struct super_block *); 1666extern unsigned long ext4_count_dirs(struct super_block *);
1607extern void ext4_check_inodes_bitmap(struct super_block *); 1667extern void ext4_check_inodes_bitmap(struct super_block *);
1608extern unsigned ext4_init_inode_bitmap(struct super_block *sb, 1668extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
1609 struct buffer_head *bh, 1669extern int ext4_init_inode_table(struct super_block *sb,
1610 ext4_group_t group, 1670 ext4_group_t group, int barrier);
1611 struct ext4_group_desc *desc);
1612extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
1613 1671
1614/* mballoc.c */ 1672/* mballoc.c */
1615extern long ext4_mb_stats; 1673extern long ext4_mb_stats;
@@ -1620,16 +1678,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
1620 struct ext4_allocation_request *, int *); 1678 struct ext4_allocation_request *, int *);
1621extern int ext4_mb_reserve_blocks(struct super_block *, int); 1679extern int ext4_mb_reserve_blocks(struct super_block *, int);
1622extern void ext4_discard_preallocations(struct inode *); 1680extern void ext4_discard_preallocations(struct inode *);
1623extern int __init init_ext4_mballoc(void); 1681extern int __init ext4_init_mballoc(void);
1624extern void exit_ext4_mballoc(void); 1682extern void ext4_exit_mballoc(void);
1625extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1683extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1626 struct buffer_head *bh, ext4_fsblk_t block, 1684 struct buffer_head *bh, ext4_fsblk_t block,
1627 unsigned long count, int flags); 1685 unsigned long count, int flags);
1628extern int ext4_mb_add_groupinfo(struct super_block *sb, 1686extern int ext4_mb_add_groupinfo(struct super_block *sb,
1629 ext4_group_t i, struct ext4_group_desc *desc); 1687 ext4_group_t i, struct ext4_group_desc *desc);
1630extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1688extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
1631extern void ext4_mb_put_buddy_cache_lock(struct super_block *, 1689
1632 ext4_group_t, int);
1633/* inode.c */ 1690/* inode.c */
1634struct buffer_head *ext4_getblk(handle_t *, struct inode *, 1691struct buffer_head *ext4_getblk(handle_t *, struct inode *,
1635 ext4_lblk_t, int, int *); 1692 ext4_lblk_t, int, int *);
@@ -1657,13 +1714,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *);
1657extern int ext4_alloc_da_blocks(struct inode *inode); 1714extern int ext4_alloc_da_blocks(struct inode *inode);
1658extern void ext4_set_aops(struct inode *inode); 1715extern void ext4_set_aops(struct inode *inode);
1659extern int ext4_writepage_trans_blocks(struct inode *); 1716extern int ext4_writepage_trans_blocks(struct inode *);
1660extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1661extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 1717extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1662extern int ext4_block_truncate_page(handle_t *handle, 1718extern int ext4_block_truncate_page(handle_t *handle,
1663 struct address_space *mapping, loff_t from); 1719 struct address_space *mapping, loff_t from);
1664extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1720extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1665extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1721extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1666extern int flush_completed_IO(struct inode *inode);
1667extern void ext4_da_update_reserve_space(struct inode *inode, 1722extern void ext4_da_update_reserve_space(struct inode *inode,
1668 int used, int quota_claim); 1723 int used, int quota_claim);
1669/* ioctl.c */ 1724/* ioctl.c */
@@ -1960,6 +2015,7 @@ extern const struct file_operations ext4_dir_operations;
1960/* file.c */ 2015/* file.c */
1961extern const struct inode_operations ext4_file_inode_operations; 2016extern const struct inode_operations ext4_file_inode_operations;
1962extern const struct file_operations ext4_file_operations; 2017extern const struct file_operations ext4_file_operations;
2018extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
1963 2019
1964/* namei.c */ 2020/* namei.c */
1965extern const struct inode_operations ext4_dir_inode_operations; 2021extern const struct inode_operations ext4_dir_inode_operations;
@@ -1973,8 +2029,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1973/* block_validity */ 2029/* block_validity */
1974extern void ext4_release_system_zone(struct super_block *sb); 2030extern void ext4_release_system_zone(struct super_block *sb);
1975extern int ext4_setup_system_zone(struct super_block *sb); 2031extern int ext4_setup_system_zone(struct super_block *sb);
1976extern int __init init_ext4_system_zone(void); 2032extern int __init ext4_init_system_zone(void);
1977extern void exit_ext4_system_zone(void); 2033extern void ext4_exit_system_zone(void);
1978extern int ext4_data_block_valid(struct ext4_sb_info *sbi, 2034extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
1979 ext4_fsblk_t start_blk, 2035 ext4_fsblk_t start_blk,
1980 unsigned int count); 2036 unsigned int count);
@@ -2002,6 +2058,18 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2002 __u64 start_orig, __u64 start_donor, 2058 __u64 start_orig, __u64 start_donor,
2003 __u64 len, __u64 *moved_len); 2059 __u64 len, __u64 *moved_len);
2004 2060
2061/* page-io.c */
2062extern int __init ext4_init_pageio(void);
2063extern void ext4_exit_pageio(void);
2064extern void ext4_ioend_wait(struct inode *);
2065extern void ext4_free_io_end(ext4_io_end_t *io);
2066extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2067extern int ext4_end_io_nolock(ext4_io_end_t *io);
2068extern void ext4_io_submit(struct ext4_io_submit *io);
2069extern int ext4_bio_write_page(struct ext4_io_submit *io,
2070 struct page *page,
2071 int len,
2072 struct writeback_control *wbc);
2005 2073
2006/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2074/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
2007enum ext4_state_bits { 2075enum ext4_state_bits {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb4..28ce70fd9cd0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,11 +225,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); 225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226} 226}
227 227
228/*
229 * ext4_ext_pblock:
230 * combine low and high parts of physical block number into ext4_fsblk_t
231 */
232static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
233{
234 ext4_fsblk_t block;
235
236 block = le32_to_cpu(ex->ee_start_lo);
237 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
238 return block;
239}
240
241/*
242 * ext4_idx_pblock:
243 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
244 */
245static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
246{
247 ext4_fsblk_t block;
248
249 block = le32_to_cpu(ix->ei_leaf_lo);
250 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
251 return block;
252}
253
254/*
255 * ext4_ext_store_pblock:
256 * stores a large physical block number into an extent struct,
257 * breaking it into parts
258 */
259static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
260 ext4_fsblk_t pb)
261{
262 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
263 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
264 0xffff);
265}
266
267/*
268 * ext4_idx_store_pblock:
269 * stores a large physical block number into an index struct,
270 * breaking it into parts
271 */
272static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
273 ext4_fsblk_t pb)
274{
275 ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
276 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
277 0xffff);
278}
279
228extern int ext4_ext_calc_metadata_amount(struct inode *inode, 280extern int ext4_ext_calc_metadata_amount(struct inode *inode,
229 sector_t lblocks); 281 sector_t lblocks);
230extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
231extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
232extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
233extern int ext4_extent_tree_init(handle_t *, struct inode *); 282extern int ext4_extent_tree_init(handle_t *, struct inode *);
234extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, 283extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
235 int num, 284 int num,
@@ -237,19 +286,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
237extern int ext4_can_extents_be_merged(struct inode *inode, 286extern int ext4_can_extents_be_merged(struct inode *inode,
238 struct ext4_extent *ex1, 287 struct ext4_extent *ex1,
239 struct ext4_extent *ex2); 288 struct ext4_extent *ex2);
240extern int ext4_ext_try_to_merge(struct inode *inode,
241 struct ext4_ext_path *path,
242 struct ext4_extent *);
243extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
244extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); 289extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
245extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
246 ext_prepare_callback, void *);
247extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 290extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
248 struct ext4_ext_path *); 291 struct ext4_ext_path *);
249extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
250 ext4_lblk_t *, ext4_fsblk_t *);
251extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
252 ext4_lblk_t *, ext4_fsblk_t *);
253extern void ext4_ext_drop_refs(struct ext4_ext_path *); 292extern void ext4_ext_drop_refs(struct ext4_ext_path *);
254extern int ext4_ext_check_inode(struct inode *inode); 293extern int ext4_ext_check_inode(struct inode *inode);
255#endif /* _EXT4_EXTENTS */ 294#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3e5717..0554c48cb1fd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,55 +44,6 @@
44#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
45#include "ext4_extents.h" 45#include "ext4_extents.h"
46 46
47
48/*
49 * ext_pblock:
50 * combine low and high parts of physical block number into ext4_fsblk_t
51 */
52ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
53{
54 ext4_fsblk_t block;
55
56 block = le32_to_cpu(ex->ee_start_lo);
57 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
58 return block;
59}
60
61/*
62 * idx_pblock:
63 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
64 */
65ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
66{
67 ext4_fsblk_t block;
68
69 block = le32_to_cpu(ix->ei_leaf_lo);
70 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
71 return block;
72}
73
74/*
75 * ext4_ext_store_pblock:
76 * stores a large physical block number into an extent struct,
77 * breaking it into parts
78 */
79void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
80{
81 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
82 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
83}
84
85/*
86 * ext4_idx_store_pblock:
87 * stores a large physical block number into an index struct,
88 * breaking it into parts
89 */
90static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91{
92 ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94}
95
96static int ext4_ext_truncate_extend_restart(handle_t *handle, 47static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode, 48 struct inode *inode,
98 int needed) 49 int needed)
@@ -169,7 +120,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
169 /* try to predict block placement */ 120 /* try to predict block placement */
170 ex = path[depth].p_ext; 121 ex = path[depth].p_ext;
171 if (ex) 122 if (ex)
172 return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); 123 return (ext4_ext_pblock(ex) +
124 (block - le32_to_cpu(ex->ee_block)));
173 125
174 /* it looks like index is empty; 126 /* it looks like index is empty;
175 * try to find starting block from index itself */ 127 * try to find starting block from index itself */
@@ -354,7 +306,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
354 306
355static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 307static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
356{ 308{
357 ext4_fsblk_t block = ext_pblock(ext); 309 ext4_fsblk_t block = ext4_ext_pblock(ext);
358 int len = ext4_ext_get_actual_len(ext); 310 int len = ext4_ext_get_actual_len(ext);
359 311
360 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); 312 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +315,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
363static int ext4_valid_extent_idx(struct inode *inode, 315static int ext4_valid_extent_idx(struct inode *inode,
364 struct ext4_extent_idx *ext_idx) 316 struct ext4_extent_idx *ext_idx)
365{ 317{
366 ext4_fsblk_t block = idx_pblock(ext_idx); 318 ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
367 319
368 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); 320 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
369} 321}
@@ -463,13 +415,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
463 for (k = 0; k <= l; k++, path++) { 415 for (k = 0; k <= l; k++, path++) {
464 if (path->p_idx) { 416 if (path->p_idx) {
465 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 417 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
466 idx_pblock(path->p_idx)); 418 ext4_idx_pblock(path->p_idx));
467 } else if (path->p_ext) { 419 } else if (path->p_ext) {
468 ext_debug(" %d:[%d]%d:%llu ", 420 ext_debug(" %d:[%d]%d:%llu ",
469 le32_to_cpu(path->p_ext->ee_block), 421 le32_to_cpu(path->p_ext->ee_block),
470 ext4_ext_is_uninitialized(path->p_ext), 422 ext4_ext_is_uninitialized(path->p_ext),
471 ext4_ext_get_actual_len(path->p_ext), 423 ext4_ext_get_actual_len(path->p_ext),
472 ext_pblock(path->p_ext)); 424 ext4_ext_pblock(path->p_ext));
473 } else 425 } else
474 ext_debug(" []"); 426 ext_debug(" []");
475 } 427 }
@@ -494,7 +446,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
494 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 446 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
495 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), 447 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
496 ext4_ext_is_uninitialized(ex), 448 ext4_ext_is_uninitialized(ex),
497 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 449 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
498 } 450 }
499 ext_debug("\n"); 451 ext_debug("\n");
500} 452}
@@ -545,7 +497,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
545 497
546 path->p_idx = l - 1; 498 path->p_idx = l - 1;
547 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), 499 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
548 idx_pblock(path->p_idx)); 500 ext4_idx_pblock(path->p_idx));
549 501
550#ifdef CHECK_BINSEARCH 502#ifdef CHECK_BINSEARCH
551 { 503 {
@@ -614,7 +566,7 @@ ext4_ext_binsearch(struct inode *inode,
614 path->p_ext = l - 1; 566 path->p_ext = l - 1;
615 ext_debug(" -> %d:%llu:[%d]%d ", 567 ext_debug(" -> %d:%llu:[%d]%d ",
616 le32_to_cpu(path->p_ext->ee_block), 568 le32_to_cpu(path->p_ext->ee_block),
617 ext_pblock(path->p_ext), 569 ext4_ext_pblock(path->p_ext),
618 ext4_ext_is_uninitialized(path->p_ext), 570 ext4_ext_is_uninitialized(path->p_ext),
619 ext4_ext_get_actual_len(path->p_ext)); 571 ext4_ext_get_actual_len(path->p_ext));
620 572
@@ -682,7 +634,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
682 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 634 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
683 635
684 ext4_ext_binsearch_idx(inode, path + ppos, block); 636 ext4_ext_binsearch_idx(inode, path + ppos, block);
685 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 637 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
686 path[ppos].p_depth = i; 638 path[ppos].p_depth = i;
687 path[ppos].p_ext = NULL; 639 path[ppos].p_ext = NULL;
688 640
@@ -721,7 +673,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
721 ext4_ext_binsearch(inode, path + ppos, block); 673 ext4_ext_binsearch(inode, path + ppos, block);
722 /* if not an empty leaf */ 674 /* if not an empty leaf */
723 if (path[ppos].p_ext) 675 if (path[ppos].p_ext)
724 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 676 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
725 677
726 ext4_ext_show_path(inode, path); 678 ext4_ext_show_path(inode, path);
727 679
@@ -739,9 +691,9 @@ err:
739 * insert new index [@logical;@ptr] into the block at @curp; 691 * insert new index [@logical;@ptr] into the block at @curp;
740 * check where to insert: before @curp or after @curp 692 * check where to insert: before @curp or after @curp
741 */ 693 */
742int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 694static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
743 struct ext4_ext_path *curp, 695 struct ext4_ext_path *curp,
744 int logical, ext4_fsblk_t ptr) 696 int logical, ext4_fsblk_t ptr)
745{ 697{
746 struct ext4_extent_idx *ix; 698 struct ext4_extent_idx *ix;
747 int len, err; 699 int len, err;
@@ -917,7 +869,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
917 EXT_MAX_EXTENT(path[depth].p_hdr)) { 869 EXT_MAX_EXTENT(path[depth].p_hdr)) {
918 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", 870 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
919 le32_to_cpu(path[depth].p_ext->ee_block), 871 le32_to_cpu(path[depth].p_ext->ee_block),
920 ext_pblock(path[depth].p_ext), 872 ext4_ext_pblock(path[depth].p_ext),
921 ext4_ext_is_uninitialized(path[depth].p_ext), 873 ext4_ext_is_uninitialized(path[depth].p_ext),
922 ext4_ext_get_actual_len(path[depth].p_ext), 874 ext4_ext_get_actual_len(path[depth].p_ext),
923 newblock); 875 newblock);
@@ -1007,7 +959,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
1007 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 959 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
1008 ext_debug("%d: move %d:%llu in new index %llu\n", i, 960 ext_debug("%d: move %d:%llu in new index %llu\n", i,
1009 le32_to_cpu(path[i].p_idx->ei_block), 961 le32_to_cpu(path[i].p_idx->ei_block),
1010 idx_pblock(path[i].p_idx), 962 ext4_idx_pblock(path[i].p_idx),
1011 newblock); 963 newblock);
1012 /*memmove(++fidx, path[i].p_idx++, 964 /*memmove(++fidx, path[i].p_idx++,
1013 sizeof(struct ext4_extent_idx)); 965 sizeof(struct ext4_extent_idx));
@@ -1146,7 +1098,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1146 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1098 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1147 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1099 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1148 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1100 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1149 idx_pblock(EXT_FIRST_INDEX(neh))); 1101 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1150 1102
1151 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1103 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1152 err = ext4_ext_dirty(handle, inode, curp); 1104 err = ext4_ext_dirty(handle, inode, curp);
@@ -1232,9 +1184,9 @@ out:
1232 * returns 0 at @phys 1184 * returns 0 at @phys
1233 * return value contains 0 (success) or error code 1185 * return value contains 0 (success) or error code
1234 */ 1186 */
1235int 1187static int ext4_ext_search_left(struct inode *inode,
1236ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, 1188 struct ext4_ext_path *path,
1237 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1189 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1238{ 1190{
1239 struct ext4_extent_idx *ix; 1191 struct ext4_extent_idx *ix;
1240 struct ext4_extent *ex; 1192 struct ext4_extent *ex;
@@ -1286,7 +1238,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1286 } 1238 }
1287 1239
1288 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1240 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1289 *phys = ext_pblock(ex) + ee_len - 1; 1241 *phys = ext4_ext_pblock(ex) + ee_len - 1;
1290 return 0; 1242 return 0;
1291} 1243}
1292 1244
@@ -1297,9 +1249,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1297 * returns 0 at @phys 1249 * returns 0 at @phys
1298 * return value contains 0 (success) or error code 1250 * return value contains 0 (success) or error code
1299 */ 1251 */
1300int 1252static int ext4_ext_search_right(struct inode *inode,
1301ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, 1253 struct ext4_ext_path *path,
1302 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1254 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1303{ 1255{
1304 struct buffer_head *bh = NULL; 1256 struct buffer_head *bh = NULL;
1305 struct ext4_extent_header *eh; 1257 struct ext4_extent_header *eh;
@@ -1342,7 +1294,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1342 } 1294 }
1343 } 1295 }
1344 *logical = le32_to_cpu(ex->ee_block); 1296 *logical = le32_to_cpu(ex->ee_block);
1345 *phys = ext_pblock(ex); 1297 *phys = ext4_ext_pblock(ex);
1346 return 0; 1298 return 0;
1347 } 1299 }
1348 1300
@@ -1357,7 +1309,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1357 /* next allocated block in this leaf */ 1309 /* next allocated block in this leaf */
1358 ex++; 1310 ex++;
1359 *logical = le32_to_cpu(ex->ee_block); 1311 *logical = le32_to_cpu(ex->ee_block);
1360 *phys = ext_pblock(ex); 1312 *phys = ext4_ext_pblock(ex);
1361 return 0; 1313 return 0;
1362 } 1314 }
1363 1315
@@ -1376,7 +1328,7 @@ got_index:
1376 * follow it and find the closest allocated 1328 * follow it and find the closest allocated
1377 * block to the right */ 1329 * block to the right */
1378 ix++; 1330 ix++;
1379 block = idx_pblock(ix); 1331 block = ext4_idx_pblock(ix);
1380 while (++depth < path->p_depth) { 1332 while (++depth < path->p_depth) {
1381 bh = sb_bread(inode->i_sb, block); 1333 bh = sb_bread(inode->i_sb, block);
1382 if (bh == NULL) 1334 if (bh == NULL)
@@ -1388,7 +1340,7 @@ got_index:
1388 return -EIO; 1340 return -EIO;
1389 } 1341 }
1390 ix = EXT_FIRST_INDEX(eh); 1342 ix = EXT_FIRST_INDEX(eh);
1391 block = idx_pblock(ix); 1343 block = ext4_idx_pblock(ix);
1392 put_bh(bh); 1344 put_bh(bh);
1393 } 1345 }
1394 1346
@@ -1402,7 +1354,7 @@ got_index:
1402 } 1354 }
1403 ex = EXT_FIRST_EXTENT(eh); 1355 ex = EXT_FIRST_EXTENT(eh);
1404 *logical = le32_to_cpu(ex->ee_block); 1356 *logical = le32_to_cpu(ex->ee_block);
1405 *phys = ext_pblock(ex); 1357 *phys = ext4_ext_pblock(ex);
1406 put_bh(bh); 1358 put_bh(bh);
1407 return 0; 1359 return 0;
1408} 1360}
@@ -1573,7 +1525,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1573 return 0; 1525 return 0;
1574#endif 1526#endif
1575 1527
1576 if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) 1528 if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1577 return 1; 1529 return 1;
1578 return 0; 1530 return 0;
1579} 1531}
@@ -1585,9 +1537,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1585 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1537 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1586 * 1 if they got merged. 1538 * 1 if they got merged.
1587 */ 1539 */
1588int ext4_ext_try_to_merge(struct inode *inode, 1540static int ext4_ext_try_to_merge(struct inode *inode,
1589 struct ext4_ext_path *path, 1541 struct ext4_ext_path *path,
1590 struct ext4_extent *ex) 1542 struct ext4_extent *ex)
1591{ 1543{
1592 struct ext4_extent_header *eh; 1544 struct ext4_extent_header *eh;
1593 unsigned int depth, len; 1545 unsigned int depth, len;
@@ -1632,9 +1584,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
1632 * such that there will be no overlap, and then returns 1. 1584 * such that there will be no overlap, and then returns 1.
1633 * If there is no overlap found, it returns 0. 1585 * If there is no overlap found, it returns 0.
1634 */ 1586 */
1635unsigned int ext4_ext_check_overlap(struct inode *inode, 1587static unsigned int ext4_ext_check_overlap(struct inode *inode,
1636 struct ext4_extent *newext, 1588 struct ext4_extent *newext,
1637 struct ext4_ext_path *path) 1589 struct ext4_ext_path *path)
1638{ 1590{
1639 ext4_lblk_t b1, b2; 1591 ext4_lblk_t b1, b2;
1640 unsigned int depth, len1; 1592 unsigned int depth, len1;
@@ -1706,11 +1658,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1706 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) 1658 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1707 && ext4_can_extents_be_merged(inode, ex, newext)) { 1659 && ext4_can_extents_be_merged(inode, ex, newext)) {
1708 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1660 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1709 ext4_ext_is_uninitialized(newext), 1661 ext4_ext_is_uninitialized(newext),
1710 ext4_ext_get_actual_len(newext), 1662 ext4_ext_get_actual_len(newext),
1711 le32_to_cpu(ex->ee_block), 1663 le32_to_cpu(ex->ee_block),
1712 ext4_ext_is_uninitialized(ex), 1664 ext4_ext_is_uninitialized(ex),
1713 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 1665 ext4_ext_get_actual_len(ex),
1666 ext4_ext_pblock(ex));
1714 err = ext4_ext_get_access(handle, inode, path + depth); 1667 err = ext4_ext_get_access(handle, inode, path + depth);
1715 if (err) 1668 if (err)
1716 return err; 1669 return err;
@@ -1780,7 +1733,7 @@ has_space:
1780 /* there is no extent in this leaf, create first one */ 1733 /* there is no extent in this leaf, create first one */
1781 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", 1734 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1782 le32_to_cpu(newext->ee_block), 1735 le32_to_cpu(newext->ee_block),
1783 ext_pblock(newext), 1736 ext4_ext_pblock(newext),
1784 ext4_ext_is_uninitialized(newext), 1737 ext4_ext_is_uninitialized(newext),
1785 ext4_ext_get_actual_len(newext)); 1738 ext4_ext_get_actual_len(newext));
1786 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1739 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1794,7 +1747,7 @@ has_space:
1794 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " 1747 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1795 "move %d from 0x%p to 0x%p\n", 1748 "move %d from 0x%p to 0x%p\n",
1796 le32_to_cpu(newext->ee_block), 1749 le32_to_cpu(newext->ee_block),
1797 ext_pblock(newext), 1750 ext4_ext_pblock(newext),
1798 ext4_ext_is_uninitialized(newext), 1751 ext4_ext_is_uninitialized(newext),
1799 ext4_ext_get_actual_len(newext), 1752 ext4_ext_get_actual_len(newext),
1800 nearex, len, nearex + 1, nearex + 2); 1753 nearex, len, nearex + 1, nearex + 2);
@@ -1808,7 +1761,7 @@ has_space:
1808 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " 1761 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1809 "move %d from 0x%p to 0x%p\n", 1762 "move %d from 0x%p to 0x%p\n",
1810 le32_to_cpu(newext->ee_block), 1763 le32_to_cpu(newext->ee_block),
1811 ext_pblock(newext), 1764 ext4_ext_pblock(newext),
1812 ext4_ext_is_uninitialized(newext), 1765 ext4_ext_is_uninitialized(newext),
1813 ext4_ext_get_actual_len(newext), 1766 ext4_ext_get_actual_len(newext),
1814 nearex, len, nearex + 1, nearex + 2); 1767 nearex, len, nearex + 1, nearex + 2);
@@ -1819,7 +1772,7 @@ has_space:
1819 le16_add_cpu(&eh->eh_entries, 1); 1772 le16_add_cpu(&eh->eh_entries, 1);
1820 nearex = path[depth].p_ext; 1773 nearex = path[depth].p_ext;
1821 nearex->ee_block = newext->ee_block; 1774 nearex->ee_block = newext->ee_block;
1822 ext4_ext_store_pblock(nearex, ext_pblock(newext)); 1775 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1823 nearex->ee_len = newext->ee_len; 1776 nearex->ee_len = newext->ee_len;
1824 1777
1825merge: 1778merge:
@@ -1845,9 +1798,9 @@ cleanup:
1845 return err; 1798 return err;
1846} 1799}
1847 1800
1848int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, 1801static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1849 ext4_lblk_t num, ext_prepare_callback func, 1802 ext4_lblk_t num, ext_prepare_callback func,
1850 void *cbdata) 1803 void *cbdata)
1851{ 1804{
1852 struct ext4_ext_path *path = NULL; 1805 struct ext4_ext_path *path = NULL;
1853 struct ext4_ext_cache cbex; 1806 struct ext4_ext_cache cbex;
@@ -1923,7 +1876,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1923 } else { 1876 } else {
1924 cbex.ec_block = le32_to_cpu(ex->ee_block); 1877 cbex.ec_block = le32_to_cpu(ex->ee_block);
1925 cbex.ec_len = ext4_ext_get_actual_len(ex); 1878 cbex.ec_len = ext4_ext_get_actual_len(ex);
1926 cbex.ec_start = ext_pblock(ex); 1879 cbex.ec_start = ext4_ext_pblock(ex);
1927 cbex.ec_type = EXT4_EXT_CACHE_EXTENT; 1880 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1928 } 1881 }
1929 1882
@@ -2073,7 +2026,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2073 2026
2074 /* free index block */ 2027 /* free index block */
2075 path--; 2028 path--;
2076 leaf = idx_pblock(path->p_idx); 2029 leaf = ext4_idx_pblock(path->p_idx);
2077 if (unlikely(path->p_hdr->eh_entries == 0)) { 2030 if (unlikely(path->p_hdr->eh_entries == 0)) {
2078 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2031 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2079 return -EIO; 2032 return -EIO;
@@ -2181,7 +2134,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2181 ext4_fsblk_t start; 2134 ext4_fsblk_t start;
2182 2135
2183 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2136 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2184 start = ext_pblock(ex) + ee_len - num; 2137 start = ext4_ext_pblock(ex) + ee_len - num;
2185 ext_debug("free last %u blocks starting %llu\n", num, start); 2138 ext_debug("free last %u blocks starting %llu\n", num, start);
2186 ext4_free_blocks(handle, inode, 0, start, num, flags); 2139 ext4_free_blocks(handle, inode, 0, start, num, flags);
2187 } else if (from == le32_to_cpu(ex->ee_block) 2140 } else if (from == le32_to_cpu(ex->ee_block)
@@ -2310,7 +2263,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2310 goto out; 2263 goto out;
2311 2264
2312 ext_debug("new extent: %u:%u:%llu\n", block, num, 2265 ext_debug("new extent: %u:%u:%llu\n", block, num,
2313 ext_pblock(ex)); 2266 ext4_ext_pblock(ex));
2314 ex--; 2267 ex--;
2315 ex_ee_block = le32_to_cpu(ex->ee_block); 2268 ex_ee_block = le32_to_cpu(ex->ee_block);
2316 ex_ee_len = ext4_ext_get_actual_len(ex); 2269 ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2421,9 +2374,9 @@ again:
2421 struct buffer_head *bh; 2374 struct buffer_head *bh;
2422 /* go to the next level */ 2375 /* go to the next level */
2423 ext_debug("move to level %d (block %llu)\n", 2376 ext_debug("move to level %d (block %llu)\n",
2424 i + 1, idx_pblock(path[i].p_idx)); 2377 i + 1, ext4_idx_pblock(path[i].p_idx));
2425 memset(path + i + 1, 0, sizeof(*path)); 2378 memset(path + i + 1, 0, sizeof(*path));
2426 bh = sb_bread(sb, idx_pblock(path[i].p_idx)); 2379 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2427 if (!bh) { 2380 if (!bh) {
2428 /* should we reset i_size? */ 2381 /* should we reset i_size? */
2429 err = -EIO; 2382 err = -EIO;
@@ -2535,77 +2488,21 @@ void ext4_ext_release(struct super_block *sb)
2535#endif 2488#endif
2536} 2489}
2537 2490
2538static void bi_complete(struct bio *bio, int error)
2539{
2540 complete((struct completion *)bio->bi_private);
2541}
2542
2543/* FIXME!! we need to try to merge to left or right after zero-out */ 2491/* FIXME!! we need to try to merge to left or right after zero-out */
2544static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2492static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2545{ 2493{
2494 ext4_fsblk_t ee_pblock;
2495 unsigned int ee_len;
2546 int ret; 2496 int ret;
2547 struct bio *bio;
2548 int blkbits, blocksize;
2549 sector_t ee_pblock;
2550 struct completion event;
2551 unsigned int ee_len, len, done, offset;
2552 2497
2553
2554 blkbits = inode->i_blkbits;
2555 blocksize = inode->i_sb->s_blocksize;
2556 ee_len = ext4_ext_get_actual_len(ex); 2498 ee_len = ext4_ext_get_actual_len(ex);
2557 ee_pblock = ext_pblock(ex); 2499 ee_pblock = ext4_ext_pblock(ex);
2558
2559 /* convert ee_pblock to 512 byte sectors */
2560 ee_pblock = ee_pblock << (blkbits - 9);
2561
2562 while (ee_len > 0) {
2563
2564 if (ee_len > BIO_MAX_PAGES)
2565 len = BIO_MAX_PAGES;
2566 else
2567 len = ee_len;
2568
2569 bio = bio_alloc(GFP_NOIO, len);
2570 if (!bio)
2571 return -ENOMEM;
2572
2573 bio->bi_sector = ee_pblock;
2574 bio->bi_bdev = inode->i_sb->s_bdev;
2575
2576 done = 0;
2577 offset = 0;
2578 while (done < len) {
2579 ret = bio_add_page(bio, ZERO_PAGE(0),
2580 blocksize, offset);
2581 if (ret != blocksize) {
2582 /*
2583 * We can't add any more pages because of
2584 * hardware limitations. Start a new bio.
2585 */
2586 break;
2587 }
2588 done++;
2589 offset += blocksize;
2590 if (offset >= PAGE_CACHE_SIZE)
2591 offset = 0;
2592 }
2593 2500
2594 init_completion(&event); 2501 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2595 bio->bi_private = &event; 2502 if (ret > 0)
2596 bio->bi_end_io = bi_complete; 2503 ret = 0;
2597 submit_bio(WRITE, bio);
2598 wait_for_completion(&event);
2599 2504
2600 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2505 return ret;
2601 bio_put(bio);
2602 return -EIO;
2603 }
2604 bio_put(bio);
2605 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9);
2607 }
2608 return 0;
2609} 2506}
2610 2507
2611#define EXT4_EXT_ZERO_LEN 7 2508#define EXT4_EXT_ZERO_LEN 7
@@ -2651,12 +2548,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2651 ee_block = le32_to_cpu(ex->ee_block); 2548 ee_block = le32_to_cpu(ex->ee_block);
2652 ee_len = ext4_ext_get_actual_len(ex); 2549 ee_len = ext4_ext_get_actual_len(ex);
2653 allocated = ee_len - (map->m_lblk - ee_block); 2550 allocated = ee_len - (map->m_lblk - ee_block);
2654 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2551 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2655 2552
2656 ex2 = ex; 2553 ex2 = ex;
2657 orig_ex.ee_block = ex->ee_block; 2554 orig_ex.ee_block = ex->ee_block;
2658 orig_ex.ee_len = cpu_to_le16(ee_len); 2555 orig_ex.ee_len = cpu_to_le16(ee_len);
2659 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2556 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2660 2557
2661 /* 2558 /*
2662 * It is safe to convert extent to initialized via explicit 2559 * It is safe to convert extent to initialized via explicit
@@ -2675,7 +2572,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2675 /* update the extent length and mark as initialized */ 2572 /* update the extent length and mark as initialized */
2676 ex->ee_block = orig_ex.ee_block; 2573 ex->ee_block = orig_ex.ee_block;
2677 ex->ee_len = orig_ex.ee_len; 2574 ex->ee_len = orig_ex.ee_len;
2678 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2575 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2679 ext4_ext_dirty(handle, inode, path + depth); 2576 ext4_ext_dirty(handle, inode, path + depth);
2680 /* zeroed the full extent */ 2577 /* zeroed the full extent */
2681 return allocated; 2578 return allocated;
@@ -2710,7 +2607,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2710 ex->ee_block = orig_ex.ee_block; 2607 ex->ee_block = orig_ex.ee_block;
2711 ex->ee_len = cpu_to_le16(ee_len - allocated); 2608 ex->ee_len = cpu_to_le16(ee_len - allocated);
2712 ext4_ext_mark_uninitialized(ex); 2609 ext4_ext_mark_uninitialized(ex);
2713 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2610 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2714 ext4_ext_dirty(handle, inode, path + depth); 2611 ext4_ext_dirty(handle, inode, path + depth);
2715 2612
2716 ex3 = &newex; 2613 ex3 = &newex;
@@ -2725,7 +2622,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2725 goto fix_extent_len; 2622 goto fix_extent_len;
2726 ex->ee_block = orig_ex.ee_block; 2623 ex->ee_block = orig_ex.ee_block;
2727 ex->ee_len = orig_ex.ee_len; 2624 ex->ee_len = orig_ex.ee_len;
2728 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2625 ext4_ext_store_pblock(ex,
2626 ext4_ext_pblock(&orig_ex));
2729 ext4_ext_dirty(handle, inode, path + depth); 2627 ext4_ext_dirty(handle, inode, path + depth);
2730 /* blocks available from map->m_lblk */ 2628 /* blocks available from map->m_lblk */
2731 return allocated; 2629 return allocated;
@@ -2782,7 +2680,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2782 /* update the extent length and mark as initialized */ 2680 /* update the extent length and mark as initialized */
2783 ex->ee_block = orig_ex.ee_block; 2681 ex->ee_block = orig_ex.ee_block;
2784 ex->ee_len = orig_ex.ee_len; 2682 ex->ee_len = orig_ex.ee_len;
2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2683 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2786 ext4_ext_dirty(handle, inode, path + depth); 2684 ext4_ext_dirty(handle, inode, path + depth);
2787 /* zeroed the full extent */ 2685 /* zeroed the full extent */
2788 /* blocks available from map->m_lblk */ 2686 /* blocks available from map->m_lblk */
@@ -2833,7 +2731,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2833 /* update the extent length and mark as initialized */ 2731 /* update the extent length and mark as initialized */
2834 ex->ee_block = orig_ex.ee_block; 2732 ex->ee_block = orig_ex.ee_block;
2835 ex->ee_len = orig_ex.ee_len; 2733 ex->ee_len = orig_ex.ee_len;
2836 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2734 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2837 ext4_ext_dirty(handle, inode, path + depth); 2735 ext4_ext_dirty(handle, inode, path + depth);
2838 /* zero out the first half */ 2736 /* zero out the first half */
2839 /* blocks available from map->m_lblk */ 2737 /* blocks available from map->m_lblk */
@@ -2902,7 +2800,7 @@ insert:
2902 /* update the extent length and mark as initialized */ 2800 /* update the extent length and mark as initialized */
2903 ex->ee_block = orig_ex.ee_block; 2801 ex->ee_block = orig_ex.ee_block;
2904 ex->ee_len = orig_ex.ee_len; 2802 ex->ee_len = orig_ex.ee_len;
2905 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2803 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2906 ext4_ext_dirty(handle, inode, path + depth); 2804 ext4_ext_dirty(handle, inode, path + depth);
2907 /* zero out the first half */ 2805 /* zero out the first half */
2908 return allocated; 2806 return allocated;
@@ -2915,7 +2813,7 @@ out:
2915fix_extent_len: 2813fix_extent_len:
2916 ex->ee_block = orig_ex.ee_block; 2814 ex->ee_block = orig_ex.ee_block;
2917 ex->ee_len = orig_ex.ee_len; 2815 ex->ee_len = orig_ex.ee_len;
2918 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2816 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2919 ext4_ext_mark_uninitialized(ex); 2817 ext4_ext_mark_uninitialized(ex);
2920 ext4_ext_dirty(handle, inode, path + depth); 2818 ext4_ext_dirty(handle, inode, path + depth);
2921 return err; 2819 return err;
@@ -2973,12 +2871,12 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2973 ee_block = le32_to_cpu(ex->ee_block); 2871 ee_block = le32_to_cpu(ex->ee_block);
2974 ee_len = ext4_ext_get_actual_len(ex); 2872 ee_len = ext4_ext_get_actual_len(ex);
2975 allocated = ee_len - (map->m_lblk - ee_block); 2873 allocated = ee_len - (map->m_lblk - ee_block);
2976 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2874 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2977 2875
2978 ex2 = ex; 2876 ex2 = ex;
2979 orig_ex.ee_block = ex->ee_block; 2877 orig_ex.ee_block = ex->ee_block;
2980 orig_ex.ee_len = cpu_to_le16(ee_len); 2878 orig_ex.ee_len = cpu_to_le16(ee_len);
2981 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2879 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2982 2880
2983 /* 2881 /*
2984 * It is safe to convert extent to initialized via explicit 2882 * It is safe to convert extent to initialized via explicit
@@ -3027,7 +2925,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3027 /* update the extent length and mark as initialized */ 2925 /* update the extent length and mark as initialized */
3028 ex->ee_block = orig_ex.ee_block; 2926 ex->ee_block = orig_ex.ee_block;
3029 ex->ee_len = orig_ex.ee_len; 2927 ex->ee_len = orig_ex.ee_len;
3030 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2928 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3031 ext4_ext_dirty(handle, inode, path + depth); 2929 ext4_ext_dirty(handle, inode, path + depth);
3032 /* zeroed the full extent */ 2930 /* zeroed the full extent */
3033 /* blocks available from map->m_lblk */ 2931 /* blocks available from map->m_lblk */
@@ -3099,7 +2997,7 @@ insert:
3099 /* update the extent length and mark as initialized */ 2997 /* update the extent length and mark as initialized */
3100 ex->ee_block = orig_ex.ee_block; 2998 ex->ee_block = orig_ex.ee_block;
3101 ex->ee_len = orig_ex.ee_len; 2999 ex->ee_len = orig_ex.ee_len;
3102 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3000 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3103 ext4_ext_dirty(handle, inode, path + depth); 3001 ext4_ext_dirty(handle, inode, path + depth);
3104 /* zero out the first half */ 3002 /* zero out the first half */
3105 return allocated; 3003 return allocated;
@@ -3112,7 +3010,7 @@ out:
3112fix_extent_len: 3010fix_extent_len:
3113 ex->ee_block = orig_ex.ee_block; 3011 ex->ee_block = orig_ex.ee_block;
3114 ex->ee_len = orig_ex.ee_len; 3012 ex->ee_len = orig_ex.ee_len;
3115 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3013 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3116 ext4_ext_mark_uninitialized(ex); 3014 ext4_ext_mark_uninitialized(ex);
3117 ext4_ext_dirty(handle, inode, path + depth); 3015 ext4_ext_dirty(handle, inode, path + depth);
3118 return err; 3016 return err;
@@ -3180,6 +3078,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3180 unmap_underlying_metadata(bdev, block + i); 3078 unmap_underlying_metadata(bdev, block + i);
3181} 3079}
3182 3080
3081/*
3082 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3083 */
3084static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3085 struct ext4_map_blocks *map,
3086 struct ext4_ext_path *path,
3087 unsigned int len)
3088{
3089 int i, depth;
3090 struct ext4_extent_header *eh;
3091 struct ext4_extent *ex, *last_ex;
3092
3093 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3094 return 0;
3095
3096 depth = ext_depth(inode);
3097 eh = path[depth].p_hdr;
3098 ex = path[depth].p_ext;
3099
3100 if (unlikely(!eh->eh_entries)) {
3101 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3102 "EOFBLOCKS_FL set");
3103 return -EIO;
3104 }
3105 last_ex = EXT_LAST_EXTENT(eh);
3106 /*
3107 * We should clear the EOFBLOCKS_FL flag if we are writing the
3108 * last block in the last extent in the file. We test this by
3109 * first checking to see if the caller to
3110 * ext4_ext_get_blocks() was interested in the last block (or
3111 * a block beyond the last block) in the current extent. If
3112 * this turns out to be false, we can bail out from this
3113 * function immediately.
3114 */
3115 if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
3116 ext4_ext_get_actual_len(last_ex))
3117 return 0;
3118 /*
3119 * If the caller does appear to be planning to write at or
3120 * beyond the end of the current extent, we then test to see
3121 * if the current extent is the last extent in the file, by
3122 * checking to make sure it was reached via the rightmost node
3123 * at each level of the tree.
3124 */
3125 for (i = depth-1; i >= 0; i--)
3126 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3127 return 0;
3128 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3129 return ext4_mark_inode_dirty(handle, inode);
3130}
3131
3183static int 3132static int
3184ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3133ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3185 struct ext4_map_blocks *map, 3134 struct ext4_map_blocks *map,
@@ -3206,7 +3155,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3206 * completed 3155 * completed
3207 */ 3156 */
3208 if (io) 3157 if (io)
3209 io->flag = EXT4_IO_UNWRITTEN; 3158 io->flag = EXT4_IO_END_UNWRITTEN;
3210 else 3159 else
3211 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3160 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3212 if (ext4_should_dioread_nolock(inode)) 3161 if (ext4_should_dioread_nolock(inode))
@@ -3217,8 +3166,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3217 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3166 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3218 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3167 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3219 path); 3168 path);
3220 if (ret >= 0) 3169 if (ret >= 0) {
3221 ext4_update_inode_fsync_trans(handle, inode, 1); 3170 ext4_update_inode_fsync_trans(handle, inode, 1);
3171 err = check_eofblocks_fl(handle, inode, map, path,
3172 map->m_len);
3173 } else
3174 err = ret;
3222 goto out2; 3175 goto out2;
3223 } 3176 }
3224 /* buffered IO case */ 3177 /* buffered IO case */
@@ -3244,8 +3197,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3244 3197
3245 /* buffered write, writepage time, convert*/ 3198 /* buffered write, writepage time, convert*/
3246 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3199 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3247 if (ret >= 0) 3200 if (ret >= 0) {
3248 ext4_update_inode_fsync_trans(handle, inode, 1); 3201 ext4_update_inode_fsync_trans(handle, inode, 1);
3202 err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
3203 if (err < 0)
3204 goto out2;
3205 }
3206
3249out: 3207out:
3250 if (ret <= 0) { 3208 if (ret <= 0) {
3251 err = ret; 3209 err = ret;
@@ -3292,6 +3250,7 @@ out2:
3292 } 3250 }
3293 return err ? err : allocated; 3251 return err ? err : allocated;
3294} 3252}
3253
3295/* 3254/*
3296 * Block allocation/map/preallocation routine for extents based files 3255 * Block allocation/map/preallocation routine for extents based files
3297 * 3256 *
@@ -3315,9 +3274,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3315{ 3274{
3316 struct ext4_ext_path *path = NULL; 3275 struct ext4_ext_path *path = NULL;
3317 struct ext4_extent_header *eh; 3276 struct ext4_extent_header *eh;
3318 struct ext4_extent newex, *ex, *last_ex; 3277 struct ext4_extent newex, *ex;
3319 ext4_fsblk_t newblock; 3278 ext4_fsblk_t newblock;
3320 int i, err = 0, depth, ret, cache_type; 3279 int err = 0, depth, ret, cache_type;
3321 unsigned int allocated = 0; 3280 unsigned int allocated = 0;
3322 struct ext4_allocation_request ar; 3281 struct ext4_allocation_request ar;
3323 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3282 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3341,7 +3300,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3341 /* block is already allocated */ 3300 /* block is already allocated */
3342 newblock = map->m_lblk 3301 newblock = map->m_lblk
3343 - le32_to_cpu(newex.ee_block) 3302 - le32_to_cpu(newex.ee_block)
3344 + ext_pblock(&newex); 3303 + ext4_ext_pblock(&newex);
3345 /* number of remaining blocks in the extent */ 3304 /* number of remaining blocks in the extent */
3346 allocated = ext4_ext_get_actual_len(&newex) - 3305 allocated = ext4_ext_get_actual_len(&newex) -
3347 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3306 (map->m_lblk - le32_to_cpu(newex.ee_block));
@@ -3379,7 +3338,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3379 ex = path[depth].p_ext; 3338 ex = path[depth].p_ext;
3380 if (ex) { 3339 if (ex) {
3381 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3340 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3382 ext4_fsblk_t ee_start = ext_pblock(ex); 3341 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3383 unsigned short ee_len; 3342 unsigned short ee_len;
3384 3343
3385 /* 3344 /*
@@ -3488,7 +3447,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3488 */ 3447 */
3489 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3448 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3490 if (io) 3449 if (io)
3491 io->flag = EXT4_IO_UNWRITTEN; 3450 io->flag = EXT4_IO_END_UNWRITTEN;
3492 else 3451 else
3493 ext4_set_inode_state(inode, 3452 ext4_set_inode_state(inode,
3494 EXT4_STATE_DIO_UNWRITTEN); 3453 EXT4_STATE_DIO_UNWRITTEN);
@@ -3497,44 +3456,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3497 map->m_flags |= EXT4_MAP_UNINIT; 3456 map->m_flags |= EXT4_MAP_UNINIT;
3498 } 3457 }
3499 3458
3500 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { 3459 err = check_eofblocks_fl(handle, inode, map, path, ar.len);
3501 if (unlikely(!eh->eh_entries)) { 3460 if (err)
3502 EXT4_ERROR_INODE(inode, 3461 goto out2;
3503 "eh->eh_entries == 0 and " 3462
3504 "EOFBLOCKS_FL set");
3505 err = -EIO;
3506 goto out2;
3507 }
3508 last_ex = EXT_LAST_EXTENT(eh);
3509 /*
3510 * If the current leaf block was reached by looking at
3511 * the last index block all the way down the tree, and
3512 * we are extending the inode beyond the last extent
3513 * in the current leaf block, then clear the
3514 * EOFBLOCKS_FL flag.
3515 */
3516 for (i = depth-1; i >= 0; i--) {
3517 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3518 break;
3519 }
3520 if ((i < 0) &&
3521 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3522 ext4_ext_get_actual_len(last_ex)))
3523 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3524 }
3525 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3463 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3526 if (err) { 3464 if (err) {
3527 /* free data blocks we just allocated */ 3465 /* free data blocks we just allocated */
3528 /* not a good idea to call discard here directly, 3466 /* not a good idea to call discard here directly,
3529 * but otherwise we'd need to call it every free() */ 3467 * but otherwise we'd need to call it every free() */
3530 ext4_discard_preallocations(inode); 3468 ext4_discard_preallocations(inode);
3531 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3469 ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
3532 ext4_ext_get_actual_len(&newex), 0); 3470 ext4_ext_get_actual_len(&newex), 0);
3533 goto out2; 3471 goto out2;
3534 } 3472 }
3535 3473
3536 /* previous routine could use block we allocated */ 3474 /* previous routine could use block we allocated */
3537 newblock = ext_pblock(&newex); 3475 newblock = ext4_ext_pblock(&newex);
3538 allocated = ext4_ext_get_actual_len(&newex); 3476 allocated = ext4_ext_get_actual_len(&newex);
3539 if (allocated > map->m_len) 3477 if (allocated > map->m_len)
3540 allocated = map->m_len; 3478 allocated = map->m_len;
@@ -3729,7 +3667,7 @@ retry:
3729 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3667 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3730 "returned error inode#%lu, block=%u, " 3668 "returned error inode#%lu, block=%u, "
3731 "max_blocks=%u", __func__, 3669 "max_blocks=%u", __func__,
3732 inode->i_ino, block, max_blocks); 3670 inode->i_ino, map.m_lblk, max_blocks);
3733#endif 3671#endif
3734 ext4_mark_inode_dirty(handle, inode); 3672 ext4_mark_inode_dirty(handle, inode);
3735 ret2 = ext4_journal_stop(handle); 3673 ret2 = ext4_journal_stop(handle);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ee92b66d4558..5a5c55ddceef 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -130,8 +130,50 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
130 return dquot_file_open(inode, filp); 130 return dquot_file_open(inode, filp);
131} 131}
132 132
133/*
134 * ext4_llseek() copied from generic_file_llseek() to handle both
135 * block-mapped and extent-mapped maxbytes values. This should
136 * otherwise be identical with generic_file_llseek().
137 */
138loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
139{
140 struct inode *inode = file->f_mapping->host;
141 loff_t maxbytes;
142
143 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
144 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
145 else
146 maxbytes = inode->i_sb->s_maxbytes;
147 mutex_lock(&inode->i_mutex);
148 switch (origin) {
149 case SEEK_END:
150 offset += inode->i_size;
151 break;
152 case SEEK_CUR:
153 if (offset == 0) {
154 mutex_unlock(&inode->i_mutex);
155 return file->f_pos;
156 }
157 offset += file->f_pos;
158 break;
159 }
160
161 if (offset < 0 || offset > maxbytes) {
162 mutex_unlock(&inode->i_mutex);
163 return -EINVAL;
164 }
165
166 if (offset != file->f_pos) {
167 file->f_pos = offset;
168 file->f_version = 0;
169 }
170 mutex_unlock(&inode->i_mutex);
171
172 return offset;
173}
174
133const struct file_operations ext4_file_operations = { 175const struct file_operations ext4_file_operations = {
134 .llseek = generic_file_llseek, 176 .llseek = ext4_llseek,
135 .read = do_sync_read, 177 .read = do_sync_read,
136 .write = do_sync_write, 178 .write = do_sync_write,
137 .aio_read = generic_file_aio_read, 179 .aio_read = generic_file_aio_read,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..c1a7bc923cf6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
34 34
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37static void dump_completed_IO(struct inode * inode)
38{
39#ifdef EXT4_DEBUG
40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags;
43
44 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
45 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
46 return;
47 }
48
49 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
50 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
51 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
52 cur = &io->list;
53 before = cur->prev;
54 io0 = container_of(before, ext4_io_end_t, list);
55 after = cur->next;
56 io1 = container_of(after, ext4_io_end_t, list);
57
58 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
59 io, inode->i_ino, io0, io1);
60 }
61 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
62#endif
63}
64
65/*
66 * This function is called from ext4_sync_file().
67 *
68 * When IO is completed, the work to convert unwritten extents to
69 * written is queued on workqueue but may not get immediately
70 * scheduled. When fsync is called, we need to ensure the
71 * conversion is complete before fsync returns.
72 * The inode keeps track of a list of pending/completed IO that
73 * might needs to do the conversion. This function walks through
74 * the list and convert the related unwritten extents for completed IO
75 * to written.
76 * The function return the number of pending IOs on success.
77 */
78static int flush_completed_IO(struct inode *inode)
79{
80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode);
82 unsigned long flags;
83 int ret = 0;
84 int ret2 = 0;
85
86 if (list_empty(&ei->i_completed_io_list))
87 return ret;
88
89 dump_completed_IO(inode);
90 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
91 while (!list_empty(&ei->i_completed_io_list)){
92 io = list_entry(ei->i_completed_io_list.next,
93 ext4_io_end_t, list);
94 /*
95 * Calling ext4_end_io_nolock() to convert completed
96 * IO to written.
97 *
98 * When ext4_sync_file() is called, run_queue() may already
99 * about to flush the work corresponding to this io structure.
100 * It will be upset if it founds the io structure related
101 * to the work-to-be schedule is freed.
102 *
103 * Thus we need to keep the io structure still valid here after
104 * convertion finished. The io structure has a flag to
105 * avoid double converting from both fsync and background work
106 * queue work.
107 */
108 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
109 ret = ext4_end_io_nolock(io);
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
111 if (ret < 0)
112 ret2 = ret;
113 else
114 list_del_init(&io->list);
115 }
116 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
117 return (ret2 < 0) ? ret2 : 0;
118}
119
37/* 120/*
38 * If we're not journaling and this is a just-created file, we have to 121 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since 122 * sync our parent directory (if it was freshly created) since
@@ -128,10 +211,9 @@ int ext4_sync_file(struct file *file, int datasync)
128 (journal->j_fs_dev != journal->j_dev) && 211 (journal->j_fs_dev != journal->j_dev) &&
129 (journal->j_flags & JBD2_BARRIER)) 212 (journal->j_flags & JBD2_BARRIER))
130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 213 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
131 NULL, BLKDEV_IFL_WAIT); 214 NULL);
132 ret = jbd2_log_wait_commit(journal, commit_tid); 215 ret = jbd2_log_wait_commit(journal, commit_tid);
133 } else if (journal->j_flags & JBD2_BARRIER) 216 } else if (journal->j_flags & JBD2_BARRIER)
134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 217 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
135 BLKDEV_IFL_WAIT);
136 return ret; 218 return ret;
137} 219}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..1ce240a23ebb 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
50 * need to use it within a single byte (to ensure we get endianness right). 50 * need to use it within a single byte (to ensure we get endianness right).
51 * We can use memset for the rest of the bitmap as there are no other users. 51 * We can use memset for the rest of the bitmap as there are no other users.
52 */ 52 */
53void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) 53void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
54{ 54{
55 int i; 55 int i;
56 56
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
65} 65}
66 66
67/* Initializes an uninitialized inode bitmap */ 67/* Initializes an uninitialized inode bitmap */
68unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, 68static unsigned ext4_init_inode_bitmap(struct super_block *sb,
69 ext4_group_t block_group, 69 struct buffer_head *bh,
70 struct ext4_group_desc *gdp) 70 ext4_group_t block_group,
71 struct ext4_group_desc *gdp)
71{ 72{
72 struct ext4_sb_info *sbi = EXT4_SB(sb); 73 struct ext4_sb_info *sbi = EXT4_SB(sb);
73 74
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
85 } 86 }
86 87
87 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 88 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
88 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 89 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
89 bh->b_data); 90 bh->b_data);
90 91
91 return EXT4_INODES_PER_GROUP(sb); 92 return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
107 desc = ext4_get_group_desc(sb, block_group, NULL); 108 desc = ext4_get_group_desc(sb, block_group, NULL);
108 if (!desc) 109 if (!desc)
109 return NULL; 110 return NULL;
111
110 bitmap_blk = ext4_inode_bitmap(sb, desc); 112 bitmap_blk = ext4_inode_bitmap(sb, desc);
111 bh = sb_getblk(sb, bitmap_blk); 113 bh = sb_getblk(sb, bitmap_blk);
112 if (unlikely(!bh)) { 114 if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 125 unlock_buffer(bh);
124 return bh; 126 return bh;
125 } 127 }
128
126 ext4_lock_group(sb, block_group); 129 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 130 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 131 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
133 return bh; 136 return bh;
134 } 137 }
135 ext4_unlock_group(sb, block_group); 138 ext4_unlock_group(sb, block_group);
139
136 if (buffer_uptodate(bh)) { 140 if (buffer_uptodate(bh)) {
137 /* 141 /*
138 * if not uninit if bh is uptodate, 142 * if not uninit if bh is uptodate,
@@ -411,8 +415,8 @@ struct orlov_stats {
411 * for a particular block group or flex_bg. If flex_size is 1, then g 415 * for a particular block group or flex_bg. If flex_size is 1, then g
412 * is a block group number; otherwise it is flex_bg number. 416 * is a block group number; otherwise it is flex_bg number.
413 */ 417 */
414void get_orlov_stats(struct super_block *sb, ext4_group_t g, 418static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
415 int flex_size, struct orlov_stats *stats) 419 int flex_size, struct orlov_stats *stats)
416{ 420{
417 struct ext4_group_desc *desc; 421 struct ext4_group_desc *desc;
418 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; 422 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb,
712{ 716{
713 int free = 0, retval = 0, count; 717 int free = 0, retval = 0, count;
714 struct ext4_sb_info *sbi = EXT4_SB(sb); 718 struct ext4_sb_info *sbi = EXT4_SB(sb);
719 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
715 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 720 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
716 721
722 /*
723 * We have to be sure that new inode allocation does not race with
724 * inode table initialization, because otherwise we may end up
725 * allocating and writing new inode right before sb_issue_zeroout
726 * takes place and overwriting our new inode with zeroes. So we
727 * take alloc_sem to prevent it.
728 */
729 down_read(&grp->alloc_sem);
717 ext4_lock_group(sb, group); 730 ext4_lock_group(sb, group);
718 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 731 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
719 /* not a free inode */ 732 /* not a free inode */
@@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb,
724 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 737 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
725 ino > EXT4_INODES_PER_GROUP(sb)) { 738 ino > EXT4_INODES_PER_GROUP(sb)) {
726 ext4_unlock_group(sb, group); 739 ext4_unlock_group(sb, group);
740 up_read(&grp->alloc_sem);
727 ext4_error(sb, "reserved inode or inode > inodes count - " 741 ext4_error(sb, "reserved inode or inode > inodes count - "
728 "block_group = %u, inode=%lu", group, 742 "block_group = %u, inode=%lu", group,
729 ino + group * EXT4_INODES_PER_GROUP(sb)); 743 ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb,
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 786 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773err_ret: 787err_ret:
774 ext4_unlock_group(sb, group); 788 ext4_unlock_group(sb, group);
789 up_read(&grp->alloc_sem);
775 return retval; 790 return retval;
776} 791}
777 792
@@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1205 } 1220 }
1206 return count; 1221 return count;
1207} 1222}
1223
1224/*
1225 * Zeroes not yet zeroed inode table - just write zeroes through the whole
1226 * inode table. Must be called without any spinlock held. The only place
1227 * where it is called from on active part of filesystem is ext4lazyinit
1228 * thread, so we do not need any special locks, however we have to prevent
1229 * inode allocation from the current group, so we take alloc_sem lock, to
1230 * block ext4_claim_inode until we are finished.
1231 */
1232extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1233 int barrier)
1234{
1235 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1236 struct ext4_sb_info *sbi = EXT4_SB(sb);
1237 struct ext4_group_desc *gdp = NULL;
1238 struct buffer_head *group_desc_bh;
1239 handle_t *handle;
1240 ext4_fsblk_t blk;
1241 int num, ret = 0, used_blks = 0;
1242
1243 /* This should not happen, but just to be sure check this */
1244 if (sb->s_flags & MS_RDONLY) {
1245 ret = 1;
1246 goto out;
1247 }
1248
1249 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
1250 if (!gdp)
1251 goto out;
1252
1253 /*
1254 * We do not need to lock this, because we are the only one
1255 * handling this flag.
1256 */
1257 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
1258 goto out;
1259
1260 handle = ext4_journal_start_sb(sb, 1);
1261 if (IS_ERR(handle)) {
1262 ret = PTR_ERR(handle);
1263 goto out;
1264 }
1265
1266 down_write(&grp->alloc_sem);
1267 /*
1268 * If inode bitmap was already initialized there may be some
1269 * used inodes so we need to skip blocks with used inodes in
1270 * inode table.
1271 */
1272 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
1273 used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
1274 ext4_itable_unused_count(sb, gdp)),
1275 sbi->s_inodes_per_block);
1276
1277 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1278 ext4_error(sb, "Something is wrong with group %u\n"
1279 "Used itable blocks: %d"
1280 "itable unused count: %u\n",
1281 group, used_blks,
1282 ext4_itable_unused_count(sb, gdp));
1283 ret = 1;
1284 goto out;
1285 }
1286
1287 blk = ext4_inode_table(sb, gdp) + used_blks;
1288 num = sbi->s_itb_per_group - used_blks;
1289
1290 BUFFER_TRACE(group_desc_bh, "get_write_access");
1291 ret = ext4_journal_get_write_access(handle,
1292 group_desc_bh);
1293 if (ret)
1294 goto err_out;
1295
1296 /*
1297 * Skip zeroout if the inode table is full. But we set the ZEROED
1298 * flag anyway, because obviously, when it is full it does not need
1299 * further zeroing.
1300 */
1301 if (unlikely(num == 0))
1302 goto skip_zeroout;
1303
1304 ext4_debug("going to zero out inode table in group %d\n",
1305 group);
1306 ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
1307 if (ret < 0)
1308 goto err_out;
1309 if (barrier)
1310 blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
1311
1312skip_zeroout:
1313 ext4_lock_group(sb, group);
1314 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
1315 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
1316 ext4_unlock_group(sb, group);
1317
1318 BUFFER_TRACE(group_desc_bh,
1319 "call ext4_handle_dirty_metadata");
1320 ret = ext4_handle_dirty_metadata(handle, NULL,
1321 group_desc_bh);
1322
1323err_out:
1324 up_write(&grp->alloc_sem);
1325 ext4_journal_stop(handle);
1326out:
1327 return ret;
1328}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..bdbe69902207 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -53,6 +53,7 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 53static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 54 loff_t new_size)
55{ 55{
56 trace_ext4_begin_ordered_truncate(inode, new_size);
56 return jbd2_journal_begin_ordered_truncate( 57 return jbd2_journal_begin_ordered_truncate(
57 EXT4_SB(inode->i_sb)->s_journal, 58 EXT4_SB(inode->i_sb)->s_journal,
58 &EXT4_I(inode)->jinode, 59 &EXT4_I(inode)->jinode,
@@ -60,6 +61,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
60} 61}
61 62
62static void ext4_invalidatepage(struct page *page, unsigned long offset); 63static void ext4_invalidatepage(struct page *page, unsigned long offset);
64static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
65 struct buffer_head *bh_result, int create);
66static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
67static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
68static int __ext4_journalled_writepage(struct page *page, unsigned int len);
69static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
63 70
64/* 71/*
65 * Test whether an inode is a fast symlink. 72 * Test whether an inode is a fast symlink.
@@ -172,6 +179,7 @@ void ext4_evict_inode(struct inode *inode)
172 handle_t *handle; 179 handle_t *handle;
173 int err; 180 int err;
174 181
182 trace_ext4_evict_inode(inode);
175 if (inode->i_nlink) { 183 if (inode->i_nlink) {
176 truncate_inode_pages(&inode->i_data, 0); 184 truncate_inode_pages(&inode->i_data, 0);
177 goto no_delete; 185 goto no_delete;
@@ -755,6 +763,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 * parent to disk. 763 * parent to disk.
756 */ 764 */
757 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 765 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
766 if (unlikely(!bh)) {
767 err = -EIO;
768 goto failed;
769 }
770
758 branch[n].bh = bh; 771 branch[n].bh = bh;
759 lock_buffer(bh); 772 lock_buffer(bh);
760 BUFFER_TRACE(bh, "call get_create_access"); 773 BUFFER_TRACE(bh, "call get_create_access");
@@ -1207,8 +1220,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1207 break; 1220 break;
1208 idx++; 1221 idx++;
1209 num++; 1222 num++;
1210 if (num >= max_pages) 1223 if (num >= max_pages) {
1224 done = 1;
1211 break; 1225 break;
1226 }
1212 } 1227 }
1213 pagevec_release(&pvec); 1228 pagevec_release(&pvec);
1214 } 1229 }
@@ -1538,10 +1553,10 @@ static int do_journal_get_write_access(handle_t *handle,
1538 if (!buffer_mapped(bh) || buffer_freed(bh)) 1553 if (!buffer_mapped(bh) || buffer_freed(bh))
1539 return 0; 1554 return 0;
1540 /* 1555 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean 1556 * __block_write_begin() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain 1557 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit 1558 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear 1559 * by __block_write_begin() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot 1560 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer. 1561 * ever write the buffer.
1547 */ 1562 */
@@ -1995,16 +2010,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1995 * 2010 *
1996 * As pages are already locked by write_cache_pages(), we can't use it 2011 * As pages are already locked by write_cache_pages(), we can't use it
1997 */ 2012 */
1998static int mpage_da_submit_io(struct mpage_da_data *mpd) 2013static int mpage_da_submit_io(struct mpage_da_data *mpd,
2014 struct ext4_map_blocks *map)
1999{ 2015{
2000 long pages_skipped;
2001 struct pagevec pvec; 2016 struct pagevec pvec;
2002 unsigned long index, end; 2017 unsigned long index, end;
2003 int ret = 0, err, nr_pages, i; 2018 int ret = 0, err, nr_pages, i;
2004 struct inode *inode = mpd->inode; 2019 struct inode *inode = mpd->inode;
2005 struct address_space *mapping = inode->i_mapping; 2020 struct address_space *mapping = inode->i_mapping;
2021 loff_t size = i_size_read(inode);
2022 unsigned int len, block_start;
2023 struct buffer_head *bh, *page_bufs = NULL;
2024 int journal_data = ext4_should_journal_data(inode);
2025 sector_t pblock = 0, cur_logical = 0;
2026 struct ext4_io_submit io_submit;
2006 2027
2007 BUG_ON(mpd->next_page <= mpd->first_page); 2028 BUG_ON(mpd->next_page <= mpd->first_page);
2029 memset(&io_submit, 0, sizeof(io_submit));
2008 /* 2030 /*
2009 * We need to start from the first_page to the next_page - 1 2031 * We need to start from the first_page to the next_page - 1
2010 * to make sure we also write the mapped dirty buffer_heads. 2032 * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,122 +2042,108 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2020 if (nr_pages == 0) 2042 if (nr_pages == 0)
2021 break; 2043 break;
2022 for (i = 0; i < nr_pages; i++) { 2044 for (i = 0; i < nr_pages; i++) {
2045 int commit_write = 0, redirty_page = 0;
2023 struct page *page = pvec.pages[i]; 2046 struct page *page = pvec.pages[i];
2024 2047
2025 index = page->index; 2048 index = page->index;
2026 if (index > end) 2049 if (index > end)
2027 break; 2050 break;
2051
2052 if (index == size >> PAGE_CACHE_SHIFT)
2053 len = size & ~PAGE_CACHE_MASK;
2054 else
2055 len = PAGE_CACHE_SIZE;
2056 if (map) {
2057 cur_logical = index << (PAGE_CACHE_SHIFT -
2058 inode->i_blkbits);
2059 pblock = map->m_pblk + (cur_logical -
2060 map->m_lblk);
2061 }
2028 index++; 2062 index++;
2029 2063
2030 BUG_ON(!PageLocked(page)); 2064 BUG_ON(!PageLocked(page));
2031 BUG_ON(PageWriteback(page)); 2065 BUG_ON(PageWriteback(page));
2032 2066
2033 pages_skipped = mpd->wbc->pages_skipped;
2034 err = mapping->a_ops->writepage(page, mpd->wbc);
2035 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
2036 /*
2037 * have successfully written the page
2038 * without skipping the same
2039 */
2040 mpd->pages_written++;
2041 /* 2067 /*
2042 * In error case, we have to continue because 2068 * If the page does not have buffers (for
2043 * remaining pages are still locked 2069 * whatever reason), try to create them using
2044 * XXX: unlock and re-dirty them? 2070 * __block_write_begin. If this fails,
2071 * redirty the page and move on.
2045 */ 2072 */
2046 if (ret == 0) 2073 if (!page_has_buffers(page)) {
2047 ret = err; 2074 if (__block_write_begin(page, 0, len,
2048 } 2075 noalloc_get_block_write)) {
2049 pagevec_release(&pvec); 2076 redirty_page:
2050 } 2077 redirty_page_for_writepage(mpd->wbc,
2051 return ret; 2078 page);
2052} 2079 unlock_page(page);
2053 2080 continue;
2054/* 2081 }
2055 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2082 commit_write = 1;
2056 * 2083 }
2057 * the function goes through all passed space and put actual disk
2058 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2059 */
2060static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2061 struct ext4_map_blocks *map)
2062{
2063 struct inode *inode = mpd->inode;
2064 struct address_space *mapping = inode->i_mapping;
2065 int blocks = map->m_len;
2066 sector_t pblock = map->m_pblk, cur_logical;
2067 struct buffer_head *head, *bh;
2068 pgoff_t index, end;
2069 struct pagevec pvec;
2070 int nr_pages, i;
2071
2072 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2073 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2074 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2075
2076 pagevec_init(&pvec, 0);
2077
2078 while (index <= end) {
2079 /* XXX: optimize tail */
2080 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2081 if (nr_pages == 0)
2082 break;
2083 for (i = 0; i < nr_pages; i++) {
2084 struct page *page = pvec.pages[i];
2085
2086 index = page->index;
2087 if (index > end)
2088 break;
2089 index++;
2090
2091 BUG_ON(!PageLocked(page));
2092 BUG_ON(PageWriteback(page));
2093 BUG_ON(!page_has_buffers(page));
2094
2095 bh = page_buffers(page);
2096 head = bh;
2097
2098 /* skip blocks out of the range */
2099 do {
2100 if (cur_logical >= map->m_lblk)
2101 break;
2102 cur_logical++;
2103 } while ((bh = bh->b_this_page) != head);
2104 2084
2085 bh = page_bufs = page_buffers(page);
2086 block_start = 0;
2105 do { 2087 do {
2106 if (cur_logical >= map->m_lblk + blocks) 2088 if (!bh)
2107 break; 2089 goto redirty_page;
2108 2090 if (map && (cur_logical >= map->m_lblk) &&
2109 if (buffer_delay(bh) || buffer_unwritten(bh)) { 2091 (cur_logical <= (map->m_lblk +
2110 2092 (map->m_len - 1)))) {
2111 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2112
2113 if (buffer_delay(bh)) { 2093 if (buffer_delay(bh)) {
2114 clear_buffer_delay(bh); 2094 clear_buffer_delay(bh);
2115 bh->b_blocknr = pblock; 2095 bh->b_blocknr = pblock;
2116 } else {
2117 /*
2118 * unwritten already should have
2119 * blocknr assigned. Verify that
2120 */
2121 clear_buffer_unwritten(bh);
2122 BUG_ON(bh->b_blocknr != pblock);
2123 } 2096 }
2097 if (buffer_unwritten(bh) ||
2098 buffer_mapped(bh))
2099 BUG_ON(bh->b_blocknr != pblock);
2100 if (map->m_flags & EXT4_MAP_UNINIT)
2101 set_buffer_uninit(bh);
2102 clear_buffer_unwritten(bh);
2103 }
2124 2104
2125 } else if (buffer_mapped(bh)) 2105 /* redirty page if block allocation undone */
2126 BUG_ON(bh->b_blocknr != pblock); 2106 if (buffer_delay(bh) || buffer_unwritten(bh))
2127 2107 redirty_page = 1;
2128 if (map->m_flags & EXT4_MAP_UNINIT) 2108 bh = bh->b_this_page;
2129 set_buffer_uninit(bh); 2109 block_start += bh->b_size;
2130 cur_logical++; 2110 cur_logical++;
2131 pblock++; 2111 pblock++;
2132 } while ((bh = bh->b_this_page) != head); 2112 } while (bh != page_bufs);
2113
2114 if (redirty_page)
2115 goto redirty_page;
2116
2117 if (commit_write)
2118 /* mark the buffer_heads as dirty & uptodate */
2119 block_commit_write(page, 0, len);
2120
2121 /*
2122 * Delalloc doesn't support data journalling,
2123 * but eventually maybe we'll lift this
2124 * restriction.
2125 */
2126 if (unlikely(journal_data && PageChecked(page)))
2127 err = __ext4_journalled_writepage(page, len);
2128 else
2129 err = ext4_bio_write_page(&io_submit, page,
2130 len, mpd->wbc);
2131
2132 if (!err)
2133 mpd->pages_written++;
2134 /*
2135 * In error case, we have to continue because
2136 * remaining pages are still locked
2137 */
2138 if (ret == 0)
2139 ret = err;
2133 } 2140 }
2134 pagevec_release(&pvec); 2141 pagevec_release(&pvec);
2135 } 2142 }
2143 ext4_io_submit(&io_submit);
2144 return ret;
2136} 2145}
2137 2146
2138
2139static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2147static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2140 sector_t logical, long blk_cnt) 2148 sector_t logical, long blk_cnt)
2141{ 2149{
@@ -2187,35 +2195,32 @@ static void ext4_print_free_blocks(struct inode *inode)
2187} 2195}
2188 2196
2189/* 2197/*
2190 * mpage_da_map_blocks - go through given space 2198 * mpage_da_map_and_submit - go through given space, map them
2199 * if necessary, and then submit them for I/O
2191 * 2200 *
2192 * @mpd - bh describing space 2201 * @mpd - bh describing space
2193 * 2202 *
2194 * The function skips space we know is already mapped to disk blocks. 2203 * The function skips space we know is already mapped to disk blocks.
2195 * 2204 *
2196 */ 2205 */
2197static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2206static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2198{ 2207{
2199 int err, blks, get_blocks_flags; 2208 int err, blks, get_blocks_flags;
2200 struct ext4_map_blocks map; 2209 struct ext4_map_blocks map, *mapp = NULL;
2201 sector_t next = mpd->b_blocknr; 2210 sector_t next = mpd->b_blocknr;
2202 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2211 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2203 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2212 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2204 handle_t *handle = NULL; 2213 handle_t *handle = NULL;
2205 2214
2206 /* 2215 /*
2207 * We consider only non-mapped and non-allocated blocks 2216 * If the blocks are mapped already, or we couldn't accumulate
2217 * any blocks, then proceed immediately to the submission stage.
2208 */ 2218 */
2209 if ((mpd->b_state & (1 << BH_Mapped)) && 2219 if ((mpd->b_size == 0) ||
2210 !(mpd->b_state & (1 << BH_Delay)) && 2220 ((mpd->b_state & (1 << BH_Mapped)) &&
2211 !(mpd->b_state & (1 << BH_Unwritten))) 2221 !(mpd->b_state & (1 << BH_Delay)) &&
2212 return 0; 2222 !(mpd->b_state & (1 << BH_Unwritten))))
2213 2223 goto submit_io;
2214 /*
2215 * If we didn't accumulate anything to write simply return
2216 */
2217 if (!mpd->b_size)
2218 return 0;
2219 2224
2220 handle = ext4_journal_current_handle(); 2225 handle = ext4_journal_current_handle();
2221 BUG_ON(!handle); 2226 BUG_ON(!handle);
@@ -2252,17 +2257,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2252 2257
2253 err = blks; 2258 err = blks;
2254 /* 2259 /*
2255 * If get block returns with error we simply 2260 * If get block returns EAGAIN or ENOSPC and there
2256 * return. Later writepage will redirty the page and 2261 * appears to be free blocks we will call
2257 * writepages will find the dirty page again 2262 * ext4_writepage() for all of the pages which will
2263 * just redirty the pages.
2258 */ 2264 */
2259 if (err == -EAGAIN) 2265 if (err == -EAGAIN)
2260 return 0; 2266 goto submit_io;
2261 2267
2262 if (err == -ENOSPC && 2268 if (err == -ENOSPC &&
2263 ext4_count_free_blocks(sb)) { 2269 ext4_count_free_blocks(sb)) {
2264 mpd->retval = err; 2270 mpd->retval = err;
2265 return 0; 2271 goto submit_io;
2266 } 2272 }
2267 2273
2268 /* 2274 /*
@@ -2287,10 +2293,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2287 /* invalidate all the pages */ 2293 /* invalidate all the pages */
2288 ext4_da_block_invalidatepages(mpd, next, 2294 ext4_da_block_invalidatepages(mpd, next,
2289 mpd->b_size >> mpd->inode->i_blkbits); 2295 mpd->b_size >> mpd->inode->i_blkbits);
2290 return err; 2296 return;
2291 } 2297 }
2292 BUG_ON(blks == 0); 2298 BUG_ON(blks == 0);
2293 2299
2300 mapp = &map;
2294 if (map.m_flags & EXT4_MAP_NEW) { 2301 if (map.m_flags & EXT4_MAP_NEW) {
2295 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2302 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2296 int i; 2303 int i;
@@ -2299,18 +2306,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2299 unmap_underlying_metadata(bdev, map.m_pblk + i); 2306 unmap_underlying_metadata(bdev, map.m_pblk + i);
2300 } 2307 }
2301 2308
2302 /*
2303 * If blocks are delayed marked, we need to
2304 * put actual blocknr and drop delayed bit
2305 */
2306 if ((mpd->b_state & (1 << BH_Delay)) ||
2307 (mpd->b_state & (1 << BH_Unwritten)))
2308 mpage_put_bnr_to_bhs(mpd, &map);
2309
2310 if (ext4_should_order_data(mpd->inode)) { 2309 if (ext4_should_order_data(mpd->inode)) {
2311 err = ext4_jbd2_file_inode(handle, mpd->inode); 2310 err = ext4_jbd2_file_inode(handle, mpd->inode);
2312 if (err) 2311 if (err)
2313 return err; 2312 /* This only happens if the journal is aborted */
2313 return;
2314 } 2314 }
2315 2315
2316 /* 2316 /*
@@ -2321,10 +2321,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2321 disksize = i_size_read(mpd->inode); 2321 disksize = i_size_read(mpd->inode);
2322 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2322 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2323 ext4_update_i_disksize(mpd->inode, disksize); 2323 ext4_update_i_disksize(mpd->inode, disksize);
2324 return ext4_mark_inode_dirty(handle, mpd->inode); 2324 err = ext4_mark_inode_dirty(handle, mpd->inode);
2325 if (err)
2326 ext4_error(mpd->inode->i_sb,
2327 "Failed to mark inode %lu dirty",
2328 mpd->inode->i_ino);
2325 } 2329 }
2326 2330
2327 return 0; 2331submit_io:
2332 mpage_da_submit_io(mpd, mapp);
2333 mpd->io_done = 1;
2328} 2334}
2329 2335
2330#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2336#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2407,7 @@ flush_it:
2401 * We couldn't merge the block to our extent, so we 2407 * We couldn't merge the block to our extent, so we
2402 * need to flush current extent and start new one 2408 * need to flush current extent and start new one
2403 */ 2409 */
2404 if (mpage_da_map_blocks(mpd) == 0) 2410 mpage_da_map_and_submit(mpd);
2405 mpage_da_submit_io(mpd);
2406 mpd->io_done = 1;
2407 return; 2411 return;
2408} 2412}
2409 2413
@@ -2422,9 +2426,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2422 * The function finds extents of pages and scan them for all blocks. 2426 * The function finds extents of pages and scan them for all blocks.
2423 */ 2427 */
2424static int __mpage_da_writepage(struct page *page, 2428static int __mpage_da_writepage(struct page *page,
2425 struct writeback_control *wbc, void *data) 2429 struct writeback_control *wbc,
2430 struct mpage_da_data *mpd)
2426{ 2431{
2427 struct mpage_da_data *mpd = data;
2428 struct inode *inode = mpd->inode; 2432 struct inode *inode = mpd->inode;
2429 struct buffer_head *bh, *head; 2433 struct buffer_head *bh, *head;
2430 sector_t logical; 2434 sector_t logical;
@@ -2435,15 +2439,13 @@ static int __mpage_da_writepage(struct page *page,
2435 if (mpd->next_page != page->index) { 2439 if (mpd->next_page != page->index) {
2436 /* 2440 /*
2437 * Nope, we can't. So, we map non-allocated blocks 2441 * Nope, we can't. So, we map non-allocated blocks
2438 * and start IO on them using writepage() 2442 * and start IO on them
2439 */ 2443 */
2440 if (mpd->next_page != mpd->first_page) { 2444 if (mpd->next_page != mpd->first_page) {
2441 if (mpage_da_map_blocks(mpd) == 0) 2445 mpage_da_map_and_submit(mpd);
2442 mpage_da_submit_io(mpd);
2443 /* 2446 /*
2444 * skip rest of the page in the page_vec 2447 * skip rest of the page in the page_vec
2445 */ 2448 */
2446 mpd->io_done = 1;
2447 redirty_page_for_writepage(wbc, page); 2449 redirty_page_for_writepage(wbc, page);
2448 unlock_page(page); 2450 unlock_page(page);
2449 return MPAGE_DA_EXTENT_TAIL; 2451 return MPAGE_DA_EXTENT_TAIL;
@@ -2550,8 +2552,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2550 if (buffer_delay(bh)) 2552 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */ 2553 return 0; /* Not sure this could or should happen */
2552 /* 2554 /*
2553 * XXX: __block_prepare_write() unmaps passed block, 2555 * XXX: __block_write_begin() unmaps passed block, is it OK?
2554 * is it OK?
2555 */ 2556 */
2556 ret = ext4_da_reserve_space(inode, iblock); 2557 ret = ext4_da_reserve_space(inode, iblock);
2557 if (ret) 2558 if (ret)
@@ -2583,7 +2584,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2583/* 2584/*
2584 * This function is used as a standard get_block_t calback function 2585 * This function is used as a standard get_block_t calback function
2585 * when there is no desire to allocate any blocks. It is used as a 2586 * when there is no desire to allocate any blocks. It is used as a
2586 * callback function for block_prepare_write() and block_write_full_page(). 2587 * callback function for block_write_begin() and block_write_full_page().
2587 * These functions should only try to map a single block at a time. 2588 * These functions should only try to map a single block at a time.
2588 * 2589 *
2589 * Since this function doesn't do block allocations even if the caller 2590 * Since this function doesn't do block allocations even if the caller
@@ -2623,6 +2624,7 @@ static int __ext4_journalled_writepage(struct page *page,
2623 int ret = 0; 2624 int ret = 0;
2624 int err; 2625 int err;
2625 2626
2627 ClearPageChecked(page);
2626 page_bufs = page_buffers(page); 2628 page_bufs = page_buffers(page);
2627 BUG_ON(!page_bufs); 2629 BUG_ON(!page_bufs);
2628 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2630 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2700,7 +2702,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2700static int ext4_writepage(struct page *page, 2702static int ext4_writepage(struct page *page,
2701 struct writeback_control *wbc) 2703 struct writeback_control *wbc)
2702{ 2704{
2703 int ret = 0; 2705 int ret = 0, commit_write = 0;
2704 loff_t size; 2706 loff_t size;
2705 unsigned int len; 2707 unsigned int len;
2706 struct buffer_head *page_bufs = NULL; 2708 struct buffer_head *page_bufs = NULL;
@@ -2713,71 +2715,44 @@ static int ext4_writepage(struct page *page,
2713 else 2715 else
2714 len = PAGE_CACHE_SIZE; 2716 len = PAGE_CACHE_SIZE;
2715 2717
2716 if (page_has_buffers(page)) { 2718 /*
2717 page_bufs = page_buffers(page); 2719 * If the page does not have buffers (for whatever reason),
2718 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2720 * try to create them using __block_write_begin. If this
2719 ext4_bh_delay_or_unwritten)) { 2721 * fails, redirty the page and move on.
2720 /* 2722 */
2721 * We don't want to do block allocation 2723 if (!page_has_buffers(page)) {
2722 * So redirty the page and return 2724 if (__block_write_begin(page, 0, len,
2723 * We may reach here when we do a journal commit 2725 noalloc_get_block_write)) {
2724 * via journal_submit_inode_data_buffers. 2726 redirty_page:
2725 * If we don't have mapping block we just ignore
2726 * them. We can also reach here via shrink_page_list
2727 */
2728 redirty_page_for_writepage(wbc, page); 2727 redirty_page_for_writepage(wbc, page);
2729 unlock_page(page); 2728 unlock_page(page);
2730 return 0; 2729 return 0;
2731 } 2730 }
2732 } else { 2731 commit_write = 1;
2732 }
2733 page_bufs = page_buffers(page);
2734 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2735 ext4_bh_delay_or_unwritten)) {
2733 /* 2736 /*
2734 * The test for page_has_buffers() is subtle: 2737 * We don't want to do block allocation, so redirty
2735 * We know the page is dirty but it lost buffers. That means 2738 * the page and return. We may reach here when we do
2736 * that at some moment in time after write_begin()/write_end() 2739 * a journal commit via journal_submit_inode_data_buffers.
2737 * has been called all buffers have been clean and thus they 2740 * We can also reach here via shrink_page_list
2738 * must have been written at least once. So they are all
2739 * mapped and we can happily proceed with mapping them
2740 * and writing the page.
2741 *
2742 * Try to initialize the buffer_heads and check whether
2743 * all are mapped and non delay. We don't want to
2744 * do block allocation here.
2745 */ 2741 */
2746 ret = block_prepare_write(page, 0, len, 2742 goto redirty_page;
2747 noalloc_get_block_write); 2743 }
2748 if (!ret) { 2744 if (commit_write)
2749 page_bufs = page_buffers(page);
2750 /* check whether all are mapped and non delay */
2751 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2752 ext4_bh_delay_or_unwritten)) {
2753 redirty_page_for_writepage(wbc, page);
2754 unlock_page(page);
2755 return 0;
2756 }
2757 } else {
2758 /*
2759 * We can't do block allocation here
2760 * so just redity the page and unlock
2761 * and return
2762 */
2763 redirty_page_for_writepage(wbc, page);
2764 unlock_page(page);
2765 return 0;
2766 }
2767 /* now mark the buffer_heads as dirty and uptodate */ 2745 /* now mark the buffer_heads as dirty and uptodate */
2768 block_commit_write(page, 0, len); 2746 block_commit_write(page, 0, len);
2769 }
2770 2747
2771 if (PageChecked(page) && ext4_should_journal_data(inode)) { 2748 if (PageChecked(page) && ext4_should_journal_data(inode))
2772 /* 2749 /*
2773 * It's mmapped pagecache. Add buffers and journal it. There 2750 * It's mmapped pagecache. Add buffers and journal it. There
2774 * doesn't seem much point in redirtying the page here. 2751 * doesn't seem much point in redirtying the page here.
2775 */ 2752 */
2776 ClearPageChecked(page);
2777 return __ext4_journalled_writepage(page, len); 2753 return __ext4_journalled_writepage(page, len);
2778 }
2779 2754
2780 if (page_bufs && buffer_uninit(page_bufs)) { 2755 if (buffer_uninit(page_bufs)) {
2781 ext4_set_bh_endio(page_bufs, inode); 2756 ext4_set_bh_endio(page_bufs, inode);
2782 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2757 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2783 wbc, ext4_end_io_buffer_write); 2758 wbc, ext4_end_io_buffer_write);
@@ -2824,25 +2799,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2824 */ 2799 */
2825static int write_cache_pages_da(struct address_space *mapping, 2800static int write_cache_pages_da(struct address_space *mapping,
2826 struct writeback_control *wbc, 2801 struct writeback_control *wbc,
2827 struct mpage_da_data *mpd) 2802 struct mpage_da_data *mpd,
2803 pgoff_t *done_index)
2828{ 2804{
2829 int ret = 0; 2805 int ret = 0;
2830 int done = 0; 2806 int done = 0;
2831 struct pagevec pvec; 2807 struct pagevec pvec;
2832 int nr_pages; 2808 unsigned nr_pages;
2833 pgoff_t index; 2809 pgoff_t index;
2834 pgoff_t end; /* Inclusive */ 2810 pgoff_t end; /* Inclusive */
2835 long nr_to_write = wbc->nr_to_write; 2811 long nr_to_write = wbc->nr_to_write;
2812 int tag;
2836 2813
2837 pagevec_init(&pvec, 0); 2814 pagevec_init(&pvec, 0);
2838 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2815 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2839 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2816 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2840 2817
2818 if (wbc->sync_mode == WB_SYNC_ALL)
2819 tag = PAGECACHE_TAG_TOWRITE;
2820 else
2821 tag = PAGECACHE_TAG_DIRTY;
2822
2823 *done_index = index;
2841 while (!done && (index <= end)) { 2824 while (!done && (index <= end)) {
2842 int i; 2825 int i;
2843 2826
2844 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2827 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2845 PAGECACHE_TAG_DIRTY,
2846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2828 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2847 if (nr_pages == 0) 2829 if (nr_pages == 0)
2848 break; 2830 break;
@@ -2862,6 +2844,8 @@ static int write_cache_pages_da(struct address_space *mapping,
2862 break; 2844 break;
2863 } 2845 }
2864 2846
2847 *done_index = page->index + 1;
2848
2865 lock_page(page); 2849 lock_page(page);
2866 2850
2867 /* 2851 /*
@@ -2947,6 +2931,8 @@ static int ext4_da_writepages(struct address_space *mapping,
2947 long desired_nr_to_write, nr_to_writebump = 0; 2931 long desired_nr_to_write, nr_to_writebump = 0;
2948 loff_t range_start = wbc->range_start; 2932 loff_t range_start = wbc->range_start;
2949 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2933 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2934 pgoff_t done_index = 0;
2935 pgoff_t end;
2950 2936
2951 trace_ext4_da_writepages(inode, wbc); 2937 trace_ext4_da_writepages(inode, wbc);
2952 2938
@@ -2982,8 +2968,11 @@ static int ext4_da_writepages(struct address_space *mapping,
2982 wbc->range_start = index << PAGE_CACHE_SHIFT; 2968 wbc->range_start = index << PAGE_CACHE_SHIFT;
2983 wbc->range_end = LLONG_MAX; 2969 wbc->range_end = LLONG_MAX;
2984 wbc->range_cyclic = 0; 2970 wbc->range_cyclic = 0;
2985 } else 2971 end = -1;
2972 } else {
2986 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2973 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2974 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2975 }
2987 2976
2988 /* 2977 /*
2989 * This works around two forms of stupidity. The first is in 2978 * This works around two forms of stupidity. The first is in
@@ -3002,9 +2991,12 @@ static int ext4_da_writepages(struct address_space *mapping,
3002 * sbi->max_writeback_mb_bump whichever is smaller. 2991 * sbi->max_writeback_mb_bump whichever is smaller.
3003 */ 2992 */
3004 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 2993 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
3005 if (!range_cyclic && range_whole) 2994 if (!range_cyclic && range_whole) {
3006 desired_nr_to_write = wbc->nr_to_write * 8; 2995 if (wbc->nr_to_write == LONG_MAX)
3007 else 2996 desired_nr_to_write = wbc->nr_to_write;
2997 else
2998 desired_nr_to_write = wbc->nr_to_write * 8;
2999 } else
3008 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 3000 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
3009 max_pages); 3001 max_pages);
3010 if (desired_nr_to_write > max_pages) 3002 if (desired_nr_to_write > max_pages)
@@ -3021,6 +3013,9 @@ static int ext4_da_writepages(struct address_space *mapping,
3021 pages_skipped = wbc->pages_skipped; 3013 pages_skipped = wbc->pages_skipped;
3022 3014
3023retry: 3015retry:
3016 if (wbc->sync_mode == WB_SYNC_ALL)
3017 tag_pages_for_writeback(mapping, index, end);
3018
3024 while (!ret && wbc->nr_to_write > 0) { 3019 while (!ret && wbc->nr_to_write > 0) {
3025 3020
3026 /* 3021 /*
@@ -3059,16 +3054,14 @@ retry:
3059 mpd.io_done = 0; 3054 mpd.io_done = 0;
3060 mpd.pages_written = 0; 3055 mpd.pages_written = 0;
3061 mpd.retval = 0; 3056 mpd.retval = 0;
3062 ret = write_cache_pages_da(mapping, wbc, &mpd); 3057 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3063 /* 3058 /*
3064 * If we have a contiguous extent of pages and we 3059 * If we have a contiguous extent of pages and we
3065 * haven't done the I/O yet, map the blocks and submit 3060 * haven't done the I/O yet, map the blocks and submit
3066 * them for I/O. 3061 * them for I/O.
3067 */ 3062 */
3068 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3063 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3069 if (mpage_da_map_blocks(&mpd) == 0) 3064 mpage_da_map_and_submit(&mpd);
3070 mpage_da_submit_io(&mpd);
3071 mpd.io_done = 1;
3072 ret = MPAGE_DA_EXTENT_TAIL; 3065 ret = MPAGE_DA_EXTENT_TAIL;
3073 } 3066 }
3074 trace_ext4_da_write_pages(inode, &mpd); 3067 trace_ext4_da_write_pages(inode, &mpd);
@@ -3115,14 +3108,13 @@ retry:
3115 __func__, wbc->nr_to_write, ret); 3108 __func__, wbc->nr_to_write, ret);
3116 3109
3117 /* Update index */ 3110 /* Update index */
3118 index += pages_written;
3119 wbc->range_cyclic = range_cyclic; 3111 wbc->range_cyclic = range_cyclic;
3120 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3112 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3121 /* 3113 /*
3122 * set the writeback_index so that range_cyclic 3114 * set the writeback_index so that range_cyclic
3123 * mode will write it back later 3115 * mode will write it back later
3124 */ 3116 */
3125 mapping->writeback_index = index; 3117 mapping->writeback_index = done_index;
3126 3118
3127out_writepages: 3119out_writepages:
3128 wbc->nr_to_write -= nr_to_writebump; 3120 wbc->nr_to_write -= nr_to_writebump;
@@ -3457,15 +3449,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3457 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3449 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3458} 3450}
3459 3451
3460static void ext4_free_io_end(ext4_io_end_t *io)
3461{
3462 BUG_ON(!io);
3463 if (io->page)
3464 put_page(io->page);
3465 iput(io->inode);
3466 kfree(io);
3467}
3468
3469static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3452static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3470{ 3453{
3471 struct buffer_head *head, *bh; 3454 struct buffer_head *head, *bh;
@@ -3642,173 +3625,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3642 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3625 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3643} 3626}
3644 3627
3645static void dump_completed_IO(struct inode * inode)
3646{
3647#ifdef EXT4_DEBUG
3648 struct list_head *cur, *before, *after;
3649 ext4_io_end_t *io, *io0, *io1;
3650 unsigned long flags;
3651
3652 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3653 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3654 return;
3655 }
3656
3657 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3658 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3659 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3660 cur = &io->list;
3661 before = cur->prev;
3662 io0 = container_of(before, ext4_io_end_t, list);
3663 after = cur->next;
3664 io1 = container_of(after, ext4_io_end_t, list);
3665
3666 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3667 io, inode->i_ino, io0, io1);
3668 }
3669 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3670#endif
3671}
3672
3673/*
3674 * check a range of space and convert unwritten extents to written.
3675 */
3676static int ext4_end_io_nolock(ext4_io_end_t *io)
3677{
3678 struct inode *inode = io->inode;
3679 loff_t offset = io->offset;
3680 ssize_t size = io->size;
3681 int ret = 0;
3682
3683 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3684 "list->prev 0x%p\n",
3685 io, inode->i_ino, io->list.next, io->list.prev);
3686
3687 if (list_empty(&io->list))
3688 return ret;
3689
3690 if (io->flag != EXT4_IO_UNWRITTEN)
3691 return ret;
3692
3693 ret = ext4_convert_unwritten_extents(inode, offset, size);
3694 if (ret < 0) {
3695 printk(KERN_EMERG "%s: failed to convert unwritten"
3696 "extents to written extents, error is %d"
3697 " io is still on inode %lu aio dio list\n",
3698 __func__, ret, inode->i_ino);
3699 return ret;
3700 }
3701
3702 if (io->iocb)
3703 aio_complete(io->iocb, io->result, 0);
3704 /* clear the DIO AIO unwritten flag */
3705 io->flag = 0;
3706 return ret;
3707}
3708
3709/*
3710 * work on completed aio dio IO, to convert unwritten extents to extents
3711 */
3712static void ext4_end_io_work(struct work_struct *work)
3713{
3714 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3715 struct inode *inode = io->inode;
3716 struct ext4_inode_info *ei = EXT4_I(inode);
3717 unsigned long flags;
3718 int ret;
3719
3720 mutex_lock(&inode->i_mutex);
3721 ret = ext4_end_io_nolock(io);
3722 if (ret < 0) {
3723 mutex_unlock(&inode->i_mutex);
3724 return;
3725 }
3726
3727 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3728 if (!list_empty(&io->list))
3729 list_del_init(&io->list);
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3731 mutex_unlock(&inode->i_mutex);
3732 ext4_free_io_end(io);
3733}
3734
3735/*
3736 * This function is called from ext4_sync_file().
3737 *
3738 * When IO is completed, the work to convert unwritten extents to
3739 * written is queued on workqueue but may not get immediately
3740 * scheduled. When fsync is called, we need to ensure the
3741 * conversion is complete before fsync returns.
3742 * The inode keeps track of a list of pending/completed IO that
3743 * might needs to do the conversion. This function walks through
3744 * the list and convert the related unwritten extents for completed IO
3745 * to written.
3746 * The function return the number of pending IOs on success.
3747 */
3748int flush_completed_IO(struct inode *inode)
3749{
3750 ext4_io_end_t *io;
3751 struct ext4_inode_info *ei = EXT4_I(inode);
3752 unsigned long flags;
3753 int ret = 0;
3754 int ret2 = 0;
3755
3756 if (list_empty(&ei->i_completed_io_list))
3757 return ret;
3758
3759 dump_completed_IO(inode);
3760 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3761 while (!list_empty(&ei->i_completed_io_list)){
3762 io = list_entry(ei->i_completed_io_list.next,
3763 ext4_io_end_t, list);
3764 /*
3765 * Calling ext4_end_io_nolock() to convert completed
3766 * IO to written.
3767 *
3768 * When ext4_sync_file() is called, run_queue() may already
3769 * about to flush the work corresponding to this io structure.
3770 * It will be upset if it founds the io structure related
3771 * to the work-to-be schedule is freed.
3772 *
3773 * Thus we need to keep the io structure still valid here after
3774 * convertion finished. The io structure has a flag to
3775 * avoid double converting from both fsync and background work
3776 * queue work.
3777 */
3778 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3779 ret = ext4_end_io_nolock(io);
3780 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3781 if (ret < 0)
3782 ret2 = ret;
3783 else
3784 list_del_init(&io->list);
3785 }
3786 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3787 return (ret2 < 0) ? ret2 : 0;
3788}
3789
3790static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3791{
3792 ext4_io_end_t *io = NULL;
3793
3794 io = kmalloc(sizeof(*io), flags);
3795
3796 if (io) {
3797 igrab(inode);
3798 io->inode = inode;
3799 io->flag = 0;
3800 io->offset = 0;
3801 io->size = 0;
3802 io->page = NULL;
3803 io->iocb = NULL;
3804 io->result = 0;
3805 INIT_WORK(&io->work, ext4_end_io_work);
3806 INIT_LIST_HEAD(&io->list);
3807 }
3808
3809 return io;
3810}
3811
3812static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3628static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3813 ssize_t size, void *private, int ret, 3629 ssize_t size, void *private, int ret,
3814 bool is_async) 3630 bool is_async)
@@ -3828,7 +3644,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3828 size); 3644 size);
3829 3645
3830 /* if not aio dio with unwritten extents, just free io and return */ 3646 /* if not aio dio with unwritten extents, just free io and return */
3831 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3647 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3832 ext4_free_io_end(io_end); 3648 ext4_free_io_end(io_end);
3833 iocb->private = NULL; 3649 iocb->private = NULL;
3834out: 3650out:
@@ -3845,14 +3661,14 @@ out:
3845 } 3661 }
3846 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3662 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3847 3663
3848 /* queue the work to convert unwritten extents to written */
3849 queue_work(wq, &io_end->work);
3850
3851 /* Add the io_end to per-inode completed aio dio list*/ 3664 /* Add the io_end to per-inode completed aio dio list*/
3852 ei = EXT4_I(io_end->inode); 3665 ei = EXT4_I(io_end->inode);
3853 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3666 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3854 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3667 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3855 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3668 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3669
3670 /* queue the work to convert unwritten extents to written */
3671 queue_work(wq, &io_end->work);
3856 iocb->private = NULL; 3672 iocb->private = NULL;
3857} 3673}
3858 3674
@@ -3873,7 +3689,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3873 goto out; 3689 goto out;
3874 } 3690 }
3875 3691
3876 io_end->flag = EXT4_IO_UNWRITTEN; 3692 io_end->flag = EXT4_IO_END_UNWRITTEN;
3877 inode = io_end->inode; 3693 inode = io_end->inode;
3878 3694
3879 /* Add the io_end to per-inode completed io list*/ 3695 /* Add the io_end to per-inode completed io list*/
@@ -5464,6 +5280,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5464{ 5280{
5465 struct inode *inode = dentry->d_inode; 5281 struct inode *inode = dentry->d_inode;
5466 int error, rc = 0; 5282 int error, rc = 0;
5283 int orphan = 0;
5467 const unsigned int ia_valid = attr->ia_valid; 5284 const unsigned int ia_valid = attr->ia_valid;
5468 5285
5469 error = inode_change_ok(inode, attr); 5286 error = inode_change_ok(inode, attr);
@@ -5519,8 +5336,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5519 error = PTR_ERR(handle); 5336 error = PTR_ERR(handle);
5520 goto err_out; 5337 goto err_out;
5521 } 5338 }
5522 5339 if (ext4_handle_valid(handle)) {
5523 error = ext4_orphan_add(handle, inode); 5340 error = ext4_orphan_add(handle, inode);
5341 orphan = 1;
5342 }
5524 EXT4_I(inode)->i_disksize = attr->ia_size; 5343 EXT4_I(inode)->i_disksize = attr->ia_size;
5525 rc = ext4_mark_inode_dirty(handle, inode); 5344 rc = ext4_mark_inode_dirty(handle, inode);
5526 if (!error) 5345 if (!error)
@@ -5538,6 +5357,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5538 goto err_out; 5357 goto err_out;
5539 } 5358 }
5540 ext4_orphan_del(handle, inode); 5359 ext4_orphan_del(handle, inode);
5360 orphan = 0;
5541 ext4_journal_stop(handle); 5361 ext4_journal_stop(handle);
5542 goto err_out; 5362 goto err_out;
5543 } 5363 }
@@ -5560,7 +5380,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5560 * If the call to ext4_truncate failed to get a transaction handle at 5380 * If the call to ext4_truncate failed to get a transaction handle at
5561 * all, we need to clean up the in-core orphan list manually. 5381 * all, we need to clean up the in-core orphan list manually.
5562 */ 5382 */
5563 if (inode->i_nlink) 5383 if (orphan && inode->i_nlink)
5564 ext4_orphan_del(NULL, inode); 5384 ext4_orphan_del(NULL, inode);
5565 5385
5566 if (!rc && (ia_valid & ATTR_MODE)) 5386 if (!rc && (ia_valid & ATTR_MODE))
@@ -5592,9 +5412,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5592 * will return the blocks that include the delayed allocation 5412 * will return the blocks that include the delayed allocation
5593 * blocks for this file. 5413 * blocks for this file.
5594 */ 5414 */
5595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5596 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5415 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5598 5416
5599 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5417 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5600 return 0; 5418 return 0;
@@ -5643,7 +5461,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5643 * 5461 *
5644 * Also account for superblock, inode, quota and xattr blocks 5462 * Also account for superblock, inode, quota and xattr blocks
5645 */ 5463 */
5646int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5464static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5647{ 5465{
5648 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5466 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5649 int gdpblocks; 5467 int gdpblocks;
@@ -5831,6 +5649,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5831 int err, ret; 5649 int err, ret;
5832 5650
5833 might_sleep(); 5651 might_sleep();
5652 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5834 err = ext4_reserve_inode_write(handle, inode, &iloc); 5653 err = ext4_reserve_inode_write(handle, inode, &iloc);
5835 if (ext4_handle_valid(handle) && 5654 if (ext4_handle_valid(handle) &&
5836 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5655 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..5b4d4e3a4d58 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -338,6 +338,14 @@
338static struct kmem_cache *ext4_pspace_cachep; 338static struct kmem_cache *ext4_pspace_cachep;
339static struct kmem_cache *ext4_ac_cachep; 339static struct kmem_cache *ext4_ac_cachep;
340static struct kmem_cache *ext4_free_ext_cachep; 340static struct kmem_cache *ext4_free_ext_cachep;
341
342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES \
346 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
347static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
348
341static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 349static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
342 ext4_group_t group); 350 ext4_group_t group);
343static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 351static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -939,6 +947,85 @@ out:
939} 947}
940 948
941/* 949/*
950 * lock the group_info alloc_sem of all the groups
951 * belonging to the same buddy cache page. This
952 * make sure other parallel operation on the buddy
953 * cache doesn't happen whild holding the buddy cache
954 * lock
955 */
956static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
957 ext4_group_t group)
958{
959 int i;
960 int block, pnum;
961 int blocks_per_page;
962 int groups_per_page;
963 ext4_group_t ngroups = ext4_get_groups_count(sb);
964 ext4_group_t first_group;
965 struct ext4_group_info *grp;
966
967 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
968 /*
969 * the buddy cache inode stores the block bitmap
970 * and buddy information in consecutive blocks.
971 * So for each group we need two blocks.
972 */
973 block = group * 2;
974 pnum = block / blocks_per_page;
975 first_group = pnum * blocks_per_page / 2;
976
977 groups_per_page = blocks_per_page >> 1;
978 if (groups_per_page == 0)
979 groups_per_page = 1;
980 /* read all groups the page covers into the cache */
981 for (i = 0; i < groups_per_page; i++) {
982
983 if ((first_group + i) >= ngroups)
984 break;
985 grp = ext4_get_group_info(sb, first_group + i);
986 /* take all groups write allocation
987 * semaphore. This make sure there is
988 * no block allocation going on in any
989 * of that groups
990 */
991 down_write_nested(&grp->alloc_sem, i);
992 }
993 return i;
994}
995
996static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
997 ext4_group_t group, int locked_group)
998{
999 int i;
1000 int block, pnum;
1001 int blocks_per_page;
1002 ext4_group_t first_group;
1003 struct ext4_group_info *grp;
1004
1005 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1006 /*
1007 * the buddy cache inode stores the block bitmap
1008 * and buddy information in consecutive blocks.
1009 * So for each group we need two blocks.
1010 */
1011 block = group * 2;
1012 pnum = block / blocks_per_page;
1013 first_group = pnum * blocks_per_page / 2;
1014 /* release locks on all the groups */
1015 for (i = 0; i < locked_group; i++) {
1016
1017 grp = ext4_get_group_info(sb, first_group + i);
1018 /* take all groups write allocation
1019 * semaphore. This make sure there is
1020 * no block allocation going on in any
1021 * of that groups
1022 */
1023 up_write(&grp->alloc_sem);
1024 }
1025
1026}
1027
1028/*
942 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1029 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
943 * block group lock of all groups for this page; do not hold the BG lock when 1030 * block group lock of all groups for this page; do not hold the BG lock when
944 * calling this routine! 1031 * calling this routine!
@@ -1915,84 +2002,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1915 return 0; 2002 return 0;
1916} 2003}
1917 2004
1918/*
1919 * lock the group_info alloc_sem of all the groups
1920 * belonging to the same buddy cache page. This
1921 * make sure other parallel operation on the buddy
1922 * cache doesn't happen whild holding the buddy cache
1923 * lock
1924 */
1925int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1926{
1927 int i;
1928 int block, pnum;
1929 int blocks_per_page;
1930 int groups_per_page;
1931 ext4_group_t ngroups = ext4_get_groups_count(sb);
1932 ext4_group_t first_group;
1933 struct ext4_group_info *grp;
1934
1935 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1936 /*
1937 * the buddy cache inode stores the block bitmap
1938 * and buddy information in consecutive blocks.
1939 * So for each group we need two blocks.
1940 */
1941 block = group * 2;
1942 pnum = block / blocks_per_page;
1943 first_group = pnum * blocks_per_page / 2;
1944
1945 groups_per_page = blocks_per_page >> 1;
1946 if (groups_per_page == 0)
1947 groups_per_page = 1;
1948 /* read all groups the page covers into the cache */
1949 for (i = 0; i < groups_per_page; i++) {
1950
1951 if ((first_group + i) >= ngroups)
1952 break;
1953 grp = ext4_get_group_info(sb, first_group + i);
1954 /* take all groups write allocation
1955 * semaphore. This make sure there is
1956 * no block allocation going on in any
1957 * of that groups
1958 */
1959 down_write_nested(&grp->alloc_sem, i);
1960 }
1961 return i;
1962}
1963
1964void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1965 ext4_group_t group, int locked_group)
1966{
1967 int i;
1968 int block, pnum;
1969 int blocks_per_page;
1970 ext4_group_t first_group;
1971 struct ext4_group_info *grp;
1972
1973 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1974 /*
1975 * the buddy cache inode stores the block bitmap
1976 * and buddy information in consecutive blocks.
1977 * So for each group we need two blocks.
1978 */
1979 block = group * 2;
1980 pnum = block / blocks_per_page;
1981 first_group = pnum * blocks_per_page / 2;
1982 /* release locks on all the groups */
1983 for (i = 0; i < locked_group; i++) {
1984
1985 grp = ext4_get_group_info(sb, first_group + i);
1986 /* take all groups write allocation
1987 * semaphore. This make sure there is
1988 * no block allocation going on in any
1989 * of that groups
1990 */
1991 up_write(&grp->alloc_sem);
1992 }
1993
1994}
1995
1996static noinline_for_stack int 2005static noinline_for_stack int
1997ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2006ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1998{ 2007{
@@ -2233,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2233 .release = seq_release, 2242 .release = seq_release,
2234}; 2243};
2235 2244
2245static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2246{
2247 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2248 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2249
2250 BUG_ON(!cachep);
2251 return cachep;
2252}
2236 2253
2237/* Create and initialize ext4_group_info data for the given group. */ 2254/* Create and initialize ext4_group_info data for the given group. */
2238int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2255int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2239 struct ext4_group_desc *desc) 2256 struct ext4_group_desc *desc)
2240{ 2257{
2241 int i, len; 2258 int i;
2242 int metalen = 0; 2259 int metalen = 0;
2243 struct ext4_sb_info *sbi = EXT4_SB(sb); 2260 struct ext4_sb_info *sbi = EXT4_SB(sb);
2244 struct ext4_group_info **meta_group_info; 2261 struct ext4_group_info **meta_group_info;
2262 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2245 2263
2246 /* 2264 /*
2247 * First check if this group is the first of a reserved block. 2265 * First check if this group is the first of a reserved block.
@@ -2261,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2261 meta_group_info; 2279 meta_group_info;
2262 } 2280 }
2263 2281
2264 /*
2265 * calculate needed size. if change bb_counters size,
2266 * don't forget about ext4_mb_generate_buddy()
2267 */
2268 len = offsetof(typeof(**meta_group_info),
2269 bb_counters[sb->s_blocksize_bits + 2]);
2270
2271 meta_group_info = 2282 meta_group_info =
2272 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2283 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2273 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2284 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2274 2285
2275 meta_group_info[i] = kzalloc(len, GFP_KERNEL); 2286 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2276 if (meta_group_info[i] == NULL) { 2287 if (meta_group_info[i] == NULL) {
2277 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2288 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2278 goto exit_group_info; 2289 goto exit_group_info;
2279 } 2290 }
2291 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
2280 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2292 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2281 &(meta_group_info[i]->bb_state)); 2293 &(meta_group_info[i]->bb_state));
2282 2294
@@ -2331,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2331 int num_meta_group_infos_max; 2343 int num_meta_group_infos_max;
2332 int array_size; 2344 int array_size;
2333 struct ext4_group_desc *desc; 2345 struct ext4_group_desc *desc;
2346 struct kmem_cache *cachep;
2334 2347
2335 /* This is the number of blocks used by GDT */ 2348 /* This is the number of blocks used by GDT */
2336 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2349 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2373,6 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2373 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2386 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2374 goto err_freesgi; 2387 goto err_freesgi;
2375 } 2388 }
2389 sbi->s_buddy_cache->i_ino = get_next_ino();
2376 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2390 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2377 for (i = 0; i < ngroups; i++) { 2391 for (i = 0; i < ngroups; i++) {
2378 desc = ext4_get_group_desc(sb, i, NULL); 2392 desc = ext4_get_group_desc(sb, i, NULL);
@@ -2388,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
2388 return 0; 2402 return 0;
2389 2403
2390err_freebuddy: 2404err_freebuddy:
2405 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2391 while (i-- > 0) 2406 while (i-- > 0)
2392 kfree(ext4_get_group_info(sb, i)); 2407 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2393 i = num_meta_group_infos; 2408 i = num_meta_group_infos;
2394 while (i-- > 0) 2409 while (i-- > 0)
2395 kfree(sbi->s_group_info[i]); 2410 kfree(sbi->s_group_info[i]);
@@ -2406,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2406 unsigned offset; 2421 unsigned offset;
2407 unsigned max; 2422 unsigned max;
2408 int ret; 2423 int ret;
2424 int cache_index;
2425 struct kmem_cache *cachep;
2426 char *namep = NULL;
2409 2427
2410 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2428 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2411 2429
2412 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2430 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2413 if (sbi->s_mb_offsets == NULL) { 2431 if (sbi->s_mb_offsets == NULL) {
2414 return -ENOMEM; 2432 ret = -ENOMEM;
2433 goto out;
2415 } 2434 }
2416 2435
2417 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2436 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2418 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2437 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2419 if (sbi->s_mb_maxs == NULL) { 2438 if (sbi->s_mb_maxs == NULL) {
2420 kfree(sbi->s_mb_offsets); 2439 ret = -ENOMEM;
2421 return -ENOMEM; 2440 goto out;
2441 }
2442
2443 cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2444 cachep = ext4_groupinfo_caches[cache_index];
2445 if (!cachep) {
2446 char name[32];
2447 int len = offsetof(struct ext4_group_info,
2448 bb_counters[sb->s_blocksize_bits + 2]);
2449
2450 sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
2451 namep = kstrdup(name, GFP_KERNEL);
2452 if (!namep) {
2453 ret = -ENOMEM;
2454 goto out;
2455 }
2456
2457 /* Need to free the kmem_cache_name() when we
2458 * destroy the slab */
2459 cachep = kmem_cache_create(namep, len, 0,
2460 SLAB_RECLAIM_ACCOUNT, NULL);
2461 if (!cachep) {
2462 ret = -ENOMEM;
2463 goto out;
2464 }
2465 ext4_groupinfo_caches[cache_index] = cachep;
2422 } 2466 }
2423 2467
2424 /* order 0 is regular bitmap */ 2468 /* order 0 is regular bitmap */
@@ -2439,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2439 /* init file for buddy data */ 2483 /* init file for buddy data */
2440 ret = ext4_mb_init_backend(sb); 2484 ret = ext4_mb_init_backend(sb);
2441 if (ret != 0) { 2485 if (ret != 0) {
2442 kfree(sbi->s_mb_offsets); 2486 goto out;
2443 kfree(sbi->s_mb_maxs);
2444 return ret;
2445 } 2487 }
2446 2488
2447 spin_lock_init(&sbi->s_md_lock); 2489 spin_lock_init(&sbi->s_md_lock);
@@ -2456,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2456 2498
2457 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2499 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2458 if (sbi->s_locality_groups == NULL) { 2500 if (sbi->s_locality_groups == NULL) {
2459 kfree(sbi->s_mb_offsets); 2501 ret = -ENOMEM;
2460 kfree(sbi->s_mb_maxs); 2502 goto out;
2461 return -ENOMEM;
2462 } 2503 }
2463 for_each_possible_cpu(i) { 2504 for_each_possible_cpu(i) {
2464 struct ext4_locality_group *lg; 2505 struct ext4_locality_group *lg;
@@ -2475,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2475 2516
2476 if (sbi->s_journal) 2517 if (sbi->s_journal)
2477 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2518 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2478 return 0; 2519out:
2520 if (ret) {
2521 kfree(sbi->s_mb_offsets);
2522 kfree(sbi->s_mb_maxs);
2523 kfree(namep);
2524 }
2525 return ret;
2479} 2526}
2480 2527
2481/* need to called with the ext4 group lock held */ 2528/* need to called with the ext4 group lock held */
@@ -2503,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb)
2503 int num_meta_group_infos; 2550 int num_meta_group_infos;
2504 struct ext4_group_info *grinfo; 2551 struct ext4_group_info *grinfo;
2505 struct ext4_sb_info *sbi = EXT4_SB(sb); 2552 struct ext4_sb_info *sbi = EXT4_SB(sb);
2553 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2506 2554
2507 if (sbi->s_group_info) { 2555 if (sbi->s_group_info) {
2508 for (i = 0; i < ngroups; i++) { 2556 for (i = 0; i < ngroups; i++) {
@@ -2513,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb)
2513 ext4_lock_group(sb, i); 2561 ext4_lock_group(sb, i);
2514 ext4_mb_cleanup_pa(grinfo); 2562 ext4_mb_cleanup_pa(grinfo);
2515 ext4_unlock_group(sb, i); 2563 ext4_unlock_group(sb, i);
2516 kfree(grinfo); 2564 kmem_cache_free(cachep, grinfo);
2517 } 2565 }
2518 num_meta_group_infos = (ngroups + 2566 num_meta_group_infos = (ngroups +
2519 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2567 EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2557,7 +2605,7 @@ int ext4_mb_release(struct super_block *sb)
2557 return 0; 2605 return 0;
2558} 2606}
2559 2607
2560static inline void ext4_issue_discard(struct super_block *sb, 2608static inline int ext4_issue_discard(struct super_block *sb,
2561 ext4_group_t block_group, ext4_grpblk_t block, int count) 2609 ext4_group_t block_group, ext4_grpblk_t block, int count)
2562{ 2610{
2563 int ret; 2611 int ret;
@@ -2566,11 +2614,12 @@ static inline void ext4_issue_discard(struct super_block *sb,
2566 discard_block = block + ext4_group_first_block_no(sb, block_group); 2614 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb, 2615 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count); 2616 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count); 2617 ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2570 if (ret == EOPNOTSUPP) { 2618 if (ret == -EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling"); 2619 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); 2620 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2573 } 2621 }
2622 return ret;
2574} 2623}
2575 2624
2576/* 2625/*
@@ -2658,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void)
2658 2707
2659#endif 2708#endif
2660 2709
2661int __init init_ext4_mballoc(void) 2710int __init ext4_init_mballoc(void)
2662{ 2711{
2663 ext4_pspace_cachep = 2712 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2664 kmem_cache_create("ext4_prealloc_space", 2713 SLAB_RECLAIM_ACCOUNT);
2665 sizeof(struct ext4_prealloc_space),
2666 0, SLAB_RECLAIM_ACCOUNT, NULL);
2667 if (ext4_pspace_cachep == NULL) 2714 if (ext4_pspace_cachep == NULL)
2668 return -ENOMEM; 2715 return -ENOMEM;
2669 2716
2670 ext4_ac_cachep = 2717 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2671 kmem_cache_create("ext4_alloc_context", 2718 SLAB_RECLAIM_ACCOUNT);
2672 sizeof(struct ext4_allocation_context),
2673 0, SLAB_RECLAIM_ACCOUNT, NULL);
2674 if (ext4_ac_cachep == NULL) { 2719 if (ext4_ac_cachep == NULL) {
2675 kmem_cache_destroy(ext4_pspace_cachep); 2720 kmem_cache_destroy(ext4_pspace_cachep);
2676 return -ENOMEM; 2721 return -ENOMEM;
2677 } 2722 }
2678 2723
2679 ext4_free_ext_cachep = 2724 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
2680 kmem_cache_create("ext4_free_block_extents", 2725 SLAB_RECLAIM_ACCOUNT);
2681 sizeof(struct ext4_free_data),
2682 0, SLAB_RECLAIM_ACCOUNT, NULL);
2683 if (ext4_free_ext_cachep == NULL) { 2726 if (ext4_free_ext_cachep == NULL) {
2684 kmem_cache_destroy(ext4_pspace_cachep); 2727 kmem_cache_destroy(ext4_pspace_cachep);
2685 kmem_cache_destroy(ext4_ac_cachep); 2728 kmem_cache_destroy(ext4_ac_cachep);
@@ -2689,8 +2732,9 @@ int __init init_ext4_mballoc(void)
2689 return 0; 2732 return 0;
2690} 2733}
2691 2734
2692void exit_ext4_mballoc(void) 2735void ext4_exit_mballoc(void)
2693{ 2736{
2737 int i;
2694 /* 2738 /*
2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2739 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2696 * before destroying the slab cache. 2740 * before destroying the slab cache.
@@ -2699,6 +2743,15 @@ void exit_ext4_mballoc(void)
2699 kmem_cache_destroy(ext4_pspace_cachep); 2743 kmem_cache_destroy(ext4_pspace_cachep);
2700 kmem_cache_destroy(ext4_ac_cachep); 2744 kmem_cache_destroy(ext4_ac_cachep);
2701 kmem_cache_destroy(ext4_free_ext_cachep); 2745 kmem_cache_destroy(ext4_free_ext_cachep);
2746
2747 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2748 struct kmem_cache *cachep = ext4_groupinfo_caches[i];
2749 if (cachep) {
2750 char *name = (char *)kmem_cache_name(cachep);
2751 kmem_cache_destroy(cachep);
2752 kfree(name);
2753 }
2754 }
2702 ext4_remove_debugfs_entry(); 2755 ext4_remove_debugfs_entry();
2703} 2756}
2704 2757
@@ -3535,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3535 */ 3588 */
3536static noinline_for_stack int 3589static noinline_for_stack int
3537ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3590ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3538 struct ext4_prealloc_space *pa, 3591 struct ext4_prealloc_space *pa)
3539 struct ext4_allocation_context *ac)
3540{ 3592{
3541 struct super_block *sb = e4b->bd_sb; 3593 struct super_block *sb = e4b->bd_sb;
3542 struct ext4_sb_info *sbi = EXT4_SB(sb); 3594 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3554,11 +3606,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3554 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3606 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3555 end = bit + pa->pa_len; 3607 end = bit + pa->pa_len;
3556 3608
3557 if (ac) {
3558 ac->ac_sb = sb;
3559 ac->ac_inode = pa->pa_inode;
3560 }
3561
3562 while (bit < end) { 3609 while (bit < end) {
3563 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3610 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3564 if (bit >= end) 3611 if (bit >= end)
@@ -3569,16 +3616,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3569 (unsigned) next - bit, (unsigned) group); 3616 (unsigned) next - bit, (unsigned) group);
3570 free += next - bit; 3617 free += next - bit;
3571 3618
3572 if (ac) { 3619 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3573 ac->ac_b_ex.fe_group = group; 3620 trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
3574 ac->ac_b_ex.fe_start = bit; 3621 grp_blk_start + bit, next - bit);
3575 ac->ac_b_ex.fe_len = next - bit;
3576 ac->ac_b_ex.fe_logical = 0;
3577 trace_ext4_mballoc_discard(ac);
3578 }
3579
3580 trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
3581 next - bit);
3582 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3622 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3583 bit = next + 1; 3623 bit = next + 1;
3584 } 3624 }
@@ -3601,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3601 3641
3602static noinline_for_stack int 3642static noinline_for_stack int
3603ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3643ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3604 struct ext4_prealloc_space *pa, 3644 struct ext4_prealloc_space *pa)
3605 struct ext4_allocation_context *ac)
3606{ 3645{
3607 struct super_block *sb = e4b->bd_sb; 3646 struct super_block *sb = e4b->bd_sb;
3608 ext4_group_t group; 3647 ext4_group_t group;
3609 ext4_grpblk_t bit; 3648 ext4_grpblk_t bit;
3610 3649
3611 trace_ext4_mb_release_group_pa(sb, ac, pa); 3650 trace_ext4_mb_release_group_pa(sb, pa);
3612 BUG_ON(pa->pa_deleted == 0); 3651 BUG_ON(pa->pa_deleted == 0);
3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3652 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3653 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3615 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3654 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3616 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3655 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3617 3656 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3618 if (ac) {
3619 ac->ac_sb = sb;
3620 ac->ac_inode = NULL;
3621 ac->ac_b_ex.fe_group = group;
3622 ac->ac_b_ex.fe_start = bit;
3623 ac->ac_b_ex.fe_len = pa->pa_len;
3624 ac->ac_b_ex.fe_logical = 0;
3625 trace_ext4_mballoc_discard(ac);
3626 }
3627 3657
3628 return 0; 3658 return 0;
3629} 3659}
@@ -3644,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3644 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3674 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3645 struct buffer_head *bitmap_bh = NULL; 3675 struct buffer_head *bitmap_bh = NULL;
3646 struct ext4_prealloc_space *pa, *tmp; 3676 struct ext4_prealloc_space *pa, *tmp;
3647 struct ext4_allocation_context *ac;
3648 struct list_head list; 3677 struct list_head list;
3649 struct ext4_buddy e4b; 3678 struct ext4_buddy e4b;
3650 int err; 3679 int err;
@@ -3673,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3673 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3702 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3674 3703
3675 INIT_LIST_HEAD(&list); 3704 INIT_LIST_HEAD(&list);
3676 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3677 if (ac)
3678 ac->ac_sb = sb;
3679repeat: 3705repeat:
3680 ext4_lock_group(sb, group); 3706 ext4_lock_group(sb, group);
3681 list_for_each_entry_safe(pa, tmp, 3707 list_for_each_entry_safe(pa, tmp,
@@ -3730,9 +3756,9 @@ repeat:
3730 spin_unlock(pa->pa_obj_lock); 3756 spin_unlock(pa->pa_obj_lock);
3731 3757
3732 if (pa->pa_type == MB_GROUP_PA) 3758 if (pa->pa_type == MB_GROUP_PA)
3733 ext4_mb_release_group_pa(&e4b, pa, ac); 3759 ext4_mb_release_group_pa(&e4b, pa);
3734 else 3760 else
3735 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3761 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3736 3762
3737 list_del(&pa->u.pa_tmp_list); 3763 list_del(&pa->u.pa_tmp_list);
3738 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3764 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3740,8 +3766,6 @@ repeat:
3740 3766
3741out: 3767out:
3742 ext4_unlock_group(sb, group); 3768 ext4_unlock_group(sb, group);
3743 if (ac)
3744 kmem_cache_free(ext4_ac_cachep, ac);
3745 ext4_mb_unload_buddy(&e4b); 3769 ext4_mb_unload_buddy(&e4b);
3746 put_bh(bitmap_bh); 3770 put_bh(bitmap_bh);
3747 return free; 3771 return free;
@@ -3762,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode)
3762 struct super_block *sb = inode->i_sb; 3786 struct super_block *sb = inode->i_sb;
3763 struct buffer_head *bitmap_bh = NULL; 3787 struct buffer_head *bitmap_bh = NULL;
3764 struct ext4_prealloc_space *pa, *tmp; 3788 struct ext4_prealloc_space *pa, *tmp;
3765 struct ext4_allocation_context *ac;
3766 ext4_group_t group = 0; 3789 ext4_group_t group = 0;
3767 struct list_head list; 3790 struct list_head list;
3768 struct ext4_buddy e4b; 3791 struct ext4_buddy e4b;
@@ -3778,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode)
3778 3801
3779 INIT_LIST_HEAD(&list); 3802 INIT_LIST_HEAD(&list);
3780 3803
3781 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3782 if (ac) {
3783 ac->ac_sb = sb;
3784 ac->ac_inode = inode;
3785 }
3786repeat: 3804repeat:
3787 /* first, collect all pa's in the inode */ 3805 /* first, collect all pa's in the inode */
3788 spin_lock(&ei->i_prealloc_lock); 3806 spin_lock(&ei->i_prealloc_lock);
@@ -3852,7 +3870,7 @@ repeat:
3852 3870
3853 ext4_lock_group(sb, group); 3871 ext4_lock_group(sb, group);
3854 list_del(&pa->pa_group_list); 3872 list_del(&pa->pa_group_list);
3855 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3873 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3856 ext4_unlock_group(sb, group); 3874 ext4_unlock_group(sb, group);
3857 3875
3858 ext4_mb_unload_buddy(&e4b); 3876 ext4_mb_unload_buddy(&e4b);
@@ -3861,8 +3879,6 @@ repeat:
3861 list_del(&pa->u.pa_tmp_list); 3879 list_del(&pa->u.pa_tmp_list);
3862 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3880 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3863 } 3881 }
3864 if (ac)
3865 kmem_cache_free(ext4_ac_cachep, ac);
3866} 3882}
3867 3883
3868/* 3884/*
@@ -4060,14 +4076,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4060 struct ext4_buddy e4b; 4076 struct ext4_buddy e4b;
4061 struct list_head discard_list; 4077 struct list_head discard_list;
4062 struct ext4_prealloc_space *pa, *tmp; 4078 struct ext4_prealloc_space *pa, *tmp;
4063 struct ext4_allocation_context *ac;
4064 4079
4065 mb_debug(1, "discard locality group preallocation\n"); 4080 mb_debug(1, "discard locality group preallocation\n");
4066 4081
4067 INIT_LIST_HEAD(&discard_list); 4082 INIT_LIST_HEAD(&discard_list);
4068 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4069 if (ac)
4070 ac->ac_sb = sb;
4071 4083
4072 spin_lock(&lg->lg_prealloc_lock); 4084 spin_lock(&lg->lg_prealloc_lock);
4073 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4085 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4119,15 +4131,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4119 } 4131 }
4120 ext4_lock_group(sb, group); 4132 ext4_lock_group(sb, group);
4121 list_del(&pa->pa_group_list); 4133 list_del(&pa->pa_group_list);
4122 ext4_mb_release_group_pa(&e4b, pa, ac); 4134 ext4_mb_release_group_pa(&e4b, pa);
4123 ext4_unlock_group(sb, group); 4135 ext4_unlock_group(sb, group);
4124 4136
4125 ext4_mb_unload_buddy(&e4b); 4137 ext4_mb_unload_buddy(&e4b);
4126 list_del(&pa->u.pa_tmp_list); 4138 list_del(&pa->u.pa_tmp_list);
4127 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4139 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4128 } 4140 }
4129 if (ac)
4130 kmem_cache_free(ext4_ac_cachep, ac);
4131} 4141}
4132 4142
4133/* 4143/*
@@ -4491,7 +4501,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4491{ 4501{
4492 struct buffer_head *bitmap_bh = NULL; 4502 struct buffer_head *bitmap_bh = NULL;
4493 struct super_block *sb = inode->i_sb; 4503 struct super_block *sb = inode->i_sb;
4494 struct ext4_allocation_context *ac = NULL;
4495 struct ext4_group_desc *gdp; 4504 struct ext4_group_desc *gdp;
4496 unsigned long freed = 0; 4505 unsigned long freed = 0;
4497 unsigned int overflow; 4506 unsigned int overflow;
@@ -4531,6 +4540,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4531 if (!bh) 4540 if (!bh)
4532 tbh = sb_find_get_block(inode->i_sb, 4541 tbh = sb_find_get_block(inode->i_sb,
4533 block + i); 4542 block + i);
4543 if (unlikely(!tbh))
4544 continue;
4534 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4545 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4535 inode, tbh, block + i); 4546 inode, tbh, block + i);
4536 } 4547 }
@@ -4546,12 +4557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4546 if (!ext4_should_writeback_data(inode)) 4557 if (!ext4_should_writeback_data(inode))
4547 flags |= EXT4_FREE_BLOCKS_METADATA; 4558 flags |= EXT4_FREE_BLOCKS_METADATA;
4548 4559
4549 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4550 if (ac) {
4551 ac->ac_inode = inode;
4552 ac->ac_sb = sb;
4553 }
4554
4555do_more: 4560do_more:
4556 overflow = 0; 4561 overflow = 0;
4557 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4562 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4609,12 +4614,7 @@ do_more:
4609 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4614 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4610 } 4615 }
4611#endif 4616#endif
4612 if (ac) { 4617 trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
4613 ac->ac_b_ex.fe_group = block_group;
4614 ac->ac_b_ex.fe_start = bit;
4615 ac->ac_b_ex.fe_len = count;
4616 trace_ext4_mballoc_free(ac);
4617 }
4618 4618
4619 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4619 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4620 if (err) 4620 if (err)
@@ -4644,8 +4644,6 @@ do_more:
4644 mb_clear_bits(bitmap_bh->b_data, bit, count); 4644 mb_clear_bits(bitmap_bh->b_data, bit, count);
4645 mb_free_blocks(inode, &e4b, bit, count); 4645 mb_free_blocks(inode, &e4b, bit, count);
4646 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4646 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4647 if (test_opt(sb, DISCARD))
4648 ext4_issue_discard(sb, block_group, bit, count);
4649 } 4647 }
4650 4648
4651 ret = ext4_free_blks_count(sb, gdp) + count; 4649 ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4685,7 +4683,190 @@ error_return:
4685 dquot_free_block(inode, freed); 4683 dquot_free_block(inode, freed);
4686 brelse(bitmap_bh); 4684 brelse(bitmap_bh);
4687 ext4_std_error(sb, err); 4685 ext4_std_error(sb, err);
4688 if (ac)
4689 kmem_cache_free(ext4_ac_cachep, ac);
4690 return; 4686 return;
4691} 4687}
4688
4689/**
4690 * ext4_trim_extent -- function to TRIM one single free extent in the group
4691 * @sb: super block for the file system
4692 * @start: starting block of the free extent in the alloc. group
4693 * @count: number of blocks to TRIM
4694 * @group: alloc. group we are working with
4695 * @e4b: ext4 buddy for the group
4696 *
4697 * Trim "count" blocks starting at "start" in the "group". To assure that no
4698 * one will allocate those blocks, mark it as used in buddy bitmap. This must
4699 * be called with under the group lock.
4700 */
4701static int ext4_trim_extent(struct super_block *sb, int start, int count,
4702 ext4_group_t group, struct ext4_buddy *e4b)
4703{
4704 struct ext4_free_extent ex;
4705 int ret = 0;
4706
4707 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4708
4709 ex.fe_start = start;
4710 ex.fe_group = group;
4711 ex.fe_len = count;
4712
4713 /*
4714 * Mark blocks used, so no one can reuse them while
4715 * being trimmed.
4716 */
4717 mb_mark_used(e4b, &ex);
4718 ext4_unlock_group(sb, group);
4719
4720 ret = ext4_issue_discard(sb, group, start, count);
4721 if (ret)
4722 ext4_std_error(sb, ret);
4723
4724 ext4_lock_group(sb, group);
4725 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4726 return ret;
4727}
4728
4729/**
4730 * ext4_trim_all_free -- function to trim all free space in alloc. group
4731 * @sb: super block for file system
4732 * @e4b: ext4 buddy
4733 * @start: first group block to examine
4734 * @max: last group block to examine
4735 * @minblocks: minimum extent block count
4736 *
4737 * ext4_trim_all_free walks through group's buddy bitmap searching for free
4738 * extents. When the free block is found, ext4_trim_extent is called to TRIM
4739 * the extent.
4740 *
4741 *
4742 * ext4_trim_all_free walks through group's block bitmap searching for free
4743 * extents. When the free extent is found, mark it as used in group buddy
4744 * bitmap. Then issue a TRIM command on this extent and free the extent in
4745 * the group buddy bitmap. This is done until whole group is scanned.
4746 */
4747ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4748 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4749{
4750 void *bitmap;
4751 ext4_grpblk_t next, count = 0;
4752 ext4_group_t group;
4753 int ret = 0;
4754
4755 BUG_ON(e4b == NULL);
4756
4757 bitmap = e4b->bd_bitmap;
4758 group = e4b->bd_group;
4759 start = (e4b->bd_info->bb_first_free > start) ?
4760 e4b->bd_info->bb_first_free : start;
4761 ext4_lock_group(sb, group);
4762
4763 while (start < max) {
4764 start = mb_find_next_zero_bit(bitmap, max, start);
4765 if (start >= max)
4766 break;
4767 next = mb_find_next_bit(bitmap, max, start);
4768
4769 if ((next - start) >= minblocks) {
4770 ret = ext4_trim_extent(sb, start,
4771 next - start, group, e4b);
4772 if (ret < 0)
4773 break;
4774 count += next - start;
4775 }
4776 start = next + 1;
4777
4778 if (fatal_signal_pending(current)) {
4779 count = -ERESTARTSYS;
4780 break;
4781 }
4782
4783 if (need_resched()) {
4784 ext4_unlock_group(sb, group);
4785 cond_resched();
4786 ext4_lock_group(sb, group);
4787 }
4788
4789 if ((e4b->bd_info->bb_free - count) < minblocks)
4790 break;
4791 }
4792 ext4_unlock_group(sb, group);
4793
4794 ext4_debug("trimmed %d blocks in the group %d\n",
4795 count, group);
4796
4797 if (ret < 0)
4798 count = ret;
4799
4800 return count;
4801}
4802
4803/**
4804 * ext4_trim_fs() -- trim ioctl handle function
4805 * @sb: superblock for filesystem
4806 * @range: fstrim_range structure
4807 *
4808 * start: First Byte to trim
4809 * len: number of Bytes to trim from start
4810 * minlen: minimum extent length in Bytes
4811 * ext4_trim_fs goes through all allocation groups containing Bytes from
4812 * start to start+len. For each such a group ext4_trim_all_free function
4813 * is invoked to trim all free space.
4814 */
4815int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4816{
4817 struct ext4_buddy e4b;
4818 ext4_group_t first_group, last_group;
4819 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4820 ext4_grpblk_t cnt = 0, first_block, last_block;
4821 uint64_t start, len, minlen, trimmed;
4822 int ret = 0;
4823
4824 start = range->start >> sb->s_blocksize_bits;
4825 len = range->len >> sb->s_blocksize_bits;
4826 minlen = range->minlen >> sb->s_blocksize_bits;
4827 trimmed = 0;
4828
4829 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4830 return -EINVAL;
4831
4832 /* Determine first and last group to examine based on start and len */
4833 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4834 &first_group, &first_block);
4835 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4836 &last_group, &last_block);
4837 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4838 last_block = EXT4_BLOCKS_PER_GROUP(sb);
4839
4840 if (first_group > last_group)
4841 return -EINVAL;
4842
4843 for (group = first_group; group <= last_group; group++) {
4844 ret = ext4_mb_load_buddy(sb, group, &e4b);
4845 if (ret) {
4846 ext4_error(sb, "Error in loading buddy "
4847 "information for %u", group);
4848 break;
4849 }
4850
4851 if (len >= EXT4_BLOCKS_PER_GROUP(sb))
4852 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
4853 else
4854 last_block = len;
4855
4856 if (e4b.bd_info->bb_free >= minlen) {
4857 cnt = ext4_trim_all_free(sb, &e4b, first_block,
4858 last_block, minlen);
4859 if (cnt < 0) {
4860 ret = cnt;
4861 ext4_mb_unload_buddy(&e4b);
4862 break;
4863 }
4864 }
4865 ext4_mb_unload_buddy(&e4b);
4866 trimmed += cnt;
4867 first_block = 0;
4868 }
4869 range->len = trimmed * sb->s_blocksize;
4870
4871 return ret;
4872}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 1765c2c50a9b..25f3a974b725 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
412 struct buffer_head *bh; 412 struct buffer_head *bh;
413 struct ext4_extent_header *eh; 413 struct ext4_extent_header *eh;
414 414
415 block = idx_pblock(ix); 415 block = ext4_idx_pblock(ix);
416 bh = sb_bread(inode->i_sb, block); 416 bh = sb_bread(inode->i_sb, block);
417 if (!bh) 417 if (!bh)
418 return -EIO; 418 return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5f1ed9fc913c..b9f3e7862f13 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
86 /* leaf block */ 86 /* leaf block */
87 *extent = ++path[ppos].p_ext; 87 *extent = ++path[ppos].p_ext;
88 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 88 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
89 return 0; 89 return 0;
90 } 90 }
91 91
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
96 96
97 /* index block */ 97 /* index block */
98 path[ppos].p_idx++; 98 path[ppos].p_idx++;
99 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 99 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
100 if (path[ppos+1].p_bh) 100 if (path[ppos+1].p_bh)
101 brelse(path[ppos+1].p_bh); 101 brelse(path[ppos+1].p_bh);
102 path[ppos+1].p_bh = 102 path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
111 path[cur_ppos].p_idx = 111 path[cur_ppos].p_idx =
112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr); 112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
113 path[cur_ppos].p_block = 113 path[cur_ppos].p_block =
114 idx_pblock(path[cur_ppos].p_idx); 114 ext4_idx_pblock(path[cur_ppos].p_idx);
115 if (path[cur_ppos+1].p_bh) 115 if (path[cur_ppos+1].p_bh)
116 brelse(path[cur_ppos+1].p_bh); 116 brelse(path[cur_ppos+1].p_bh);
117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, 117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
133 path[leaf_ppos].p_ext = *extent = 133 path[leaf_ppos].p_ext = *extent =
134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
135 path[leaf_ppos].p_block = 135 path[leaf_ppos].p_block =
136 ext_pblock(path[leaf_ppos].p_ext); 136 ext4_ext_pblock(path[leaf_ppos].p_ext);
137 return 0; 137 return 0;
138 } 138 }
139 } 139 }
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
249 */ 249 */
250 o_end->ee_block = end_ext->ee_block; 250 o_end->ee_block = end_ext->ee_block;
251 o_end->ee_len = end_ext->ee_len; 251 o_end->ee_len = end_ext->ee_len;
252 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 252 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
253 } 253 }
254 254
255 o_start->ee_len = start_ext->ee_len; 255 o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
276 */ 276 */
277 o_end->ee_block = end_ext->ee_block; 277 o_end->ee_block = end_ext->ee_block;
278 o_end->ee_len = end_ext->ee_len; 278 o_end->ee_len = end_ext->ee_len;
279 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 279 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
280 280
281 /* 281 /*
282 * Set 0 to the extent block if new_ext was 282 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
361 /* Insert new entry */ 361 /* Insert new entry */
362 if (new_ext->ee_len) { 362 if (new_ext->ee_len) {
363 o_start[i] = *new_ext; 363 o_start[i] = *new_ext;
364 ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); 364 ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
365 } 365 }
366 366
367 /* Insert end entry */ 367 /* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
488 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
489 489
490 new_ext.ee_block = cpu_to_le32(*from); 490 new_ext.ee_block = cpu_to_le32(*from);
491 ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); 491 ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
492 new_ext.ee_len = dext->ee_len; 492 new_ext.ee_len = dext->ee_len;
493 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 493 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
553 copy_extent_status(oext, &end_ext); 553 copy_extent_status(oext, &end_ext);
554 end_ext_alen = ext4_ext_get_actual_len(&end_ext); 554 end_ext_alen = ext4_ext_get_actual_len(&end_ext);
555 ext4_ext_store_pblock(&end_ext, 555 ext4_ext_store_pblock(&end_ext,
556 (ext_pblock(o_end) + oext_alen - end_ext_alen)); 556 (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
557 end_ext.ee_block = 557 end_ext.ee_block =
558 cpu_to_le32(le32_to_cpu(o_end->ee_block) + 558 cpu_to_le32(le32_to_cpu(o_end->ee_block) +
559 oext_alen - end_ext_alen); 559 oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
604 /* When tmp_dext is too large, pick up the target range. */ 604 /* When tmp_dext is too large, pick up the target range. */
605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
606 606
607 ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); 607 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
608 tmp_dext->ee_block = 608 tmp_dext->ee_block =
609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); 609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); 610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
613 tmp_dext->ee_len = cpu_to_le16(max_count); 613 tmp_dext->ee_len = cpu_to_le16(max_count);
614 614
615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); 615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
616 ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); 616 ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
617 617
618 /* Adjust extent length if donor extent is larger than orig */ 618 /* Adjust extent length if donor extent is larger than orig */
619 if (ext4_ext_get_actual_len(tmp_dext) > 619 if (ext4_ext_get_actual_len(tmp_dext) >
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..92203b8a099f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -856,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
856 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 856 struct buffer_head *bh_use[NAMEI_RA_SIZE];
857 struct buffer_head *bh, *ret = NULL; 857 struct buffer_head *bh, *ret = NULL;
858 ext4_lblk_t start, block, b; 858 ext4_lblk_t start, block, b;
859 const u8 *name = d_name->name;
859 int ra_max = 0; /* Number of bh's in the readahead 860 int ra_max = 0; /* Number of bh's in the readahead
860 buffer, bh_use[] */ 861 buffer, bh_use[] */
861 int ra_ptr = 0; /* Current index into readahead 862 int ra_ptr = 0; /* Current index into readahead
@@ -870,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
870 namelen = d_name->len; 871 namelen = d_name->len;
871 if (namelen > EXT4_NAME_LEN) 872 if (namelen > EXT4_NAME_LEN)
872 return NULL; 873 return NULL;
874 if ((namelen <= 2) && (name[0] == '.') &&
875 (name[1] == '.' || name[1] == '0')) {
876 /*
877 * "." or ".." will only be in the first block
878 * NFS may look up ".."; "." should be handled by the VFS
879 */
880 block = start = 0;
881 nblocks = 1;
882 goto restart;
883 }
873 if (is_dx(dir)) { 884 if (is_dx(dir)) {
874 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); 885 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
875 /* 886 /*
@@ -960,55 +971,35 @@ cleanup_and_exit:
960static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, 971static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
961 struct ext4_dir_entry_2 **res_dir, int *err) 972 struct ext4_dir_entry_2 **res_dir, int *err)
962{ 973{
963 struct super_block * sb; 974 struct super_block * sb = dir->i_sb;
964 struct dx_hash_info hinfo; 975 struct dx_hash_info hinfo;
965 u32 hash;
966 struct dx_frame frames[2], *frame; 976 struct dx_frame frames[2], *frame;
967 struct ext4_dir_entry_2 *de, *top;
968 struct buffer_head *bh; 977 struct buffer_head *bh;
969 ext4_lblk_t block; 978 ext4_lblk_t block;
970 int retval; 979 int retval;
971 int namelen = d_name->len;
972 const u8 *name = d_name->name;
973 980
974 sb = dir->i_sb; 981 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
975 /* NFS may look up ".." - look at dx_root directory block */ 982 return NULL;
976 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
977 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
978 return NULL;
979 } else {
980 frame = frames;
981 frame->bh = NULL; /* for dx_release() */
982 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
983 dx_set_block(frame->at, 0); /* dx_root block is 0 */
984 }
985 hash = hinfo.hash;
986 do { 983 do {
987 block = dx_get_block(frame->at); 984 block = dx_get_block(frame->at);
988 if (!(bh = ext4_bread (NULL,dir, block, 0, err))) 985 if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
989 goto errout; 986 goto errout;
990 de = (struct ext4_dir_entry_2 *) bh->b_data;
991 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
992 EXT4_DIR_REC_LEN(0));
993 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
994 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
995 + ((char *) de - bh->b_data);
996
997 if (!ext4_check_dir_entry(dir, de, bh, off)) {
998 brelse(bh);
999 *err = ERR_BAD_DX_DIR;
1000 goto errout;
1001 }
1002 987
1003 if (ext4_match(namelen, name, de)) { 988 retval = search_dirblock(bh, dir, d_name,
1004 *res_dir = de; 989 block << EXT4_BLOCK_SIZE_BITS(sb),
1005 dx_release(frames); 990 res_dir);
1006 return bh; 991 if (retval == 1) { /* Success! */
1007 } 992 dx_release(frames);
993 return bh;
1008 } 994 }
1009 brelse(bh); 995 brelse(bh);
996 if (retval == -1) {
997 *err = ERR_BAD_DX_DIR;
998 goto errout;
999 }
1000
1010 /* Check to see if we should continue to search */ 1001 /* Check to see if we should continue to search */
1011 retval = ext4_htree_next_block(dir, hash, frame, 1002 retval = ext4_htree_next_block(dir, hinfo.hash, frame,
1012 frames, NULL); 1003 frames, NULL);
1013 if (retval < 0) { 1004 if (retval < 0) {
1014 ext4_warning(sb, 1005 ext4_warning(sb,
@@ -2312,7 +2303,7 @@ retry:
2312 2303
2313 inode->i_ctime = ext4_current_time(inode); 2304 inode->i_ctime = ext4_current_time(inode);
2314 ext4_inc_count(handle, inode); 2305 ext4_inc_count(handle, inode);
2315 atomic_inc(&inode->i_count); 2306 ihold(inode);
2316 2307
2317 err = ext4_add_entry(handle, dentry, inode); 2308 err = ext4_add_entry(handle, dentry, inode);
2318 if (!err) { 2309 if (!err) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 000000000000..7f5451cd1d38
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,431 @@
1/*
2 * linux/fs/ext4/page-io.c
3 *
4 * This contains the new page_io functions for ext4
5 *
6 * Written by Theodore Ts'o, 2010.
7 */
8
9#include <linux/module.h>
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/jbd2.h>
13#include <linux/highuid.h>
14#include <linux/pagemap.h>
15#include <linux/quotaops.h>
16#include <linux/string.h>
17#include <linux/buffer_head.h>
18#include <linux/writeback.h>
19#include <linux/pagevec.h>
20#include <linux/mpage.h>
21#include <linux/namei.h>
22#include <linux/uio.h>
23#include <linux/bio.h>
24#include <linux/workqueue.h>
25#include <linux/kernel.h>
26#include <linux/slab.h>
27
28#include "ext4_jbd2.h"
29#include "xattr.h"
30#include "acl.h"
31#include "ext4_extents.h"
32
33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
39int __init ext4_init_pageio(void)
40{
41 int i;
42
43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
44 if (io_page_cachep == NULL)
45 return -ENOMEM;
46 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
47 if (io_page_cachep == NULL) {
48 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM;
50 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
53
54 return 0;
55}
56
57void ext4_exit_pageio(void)
58{
59 kmem_cache_destroy(io_end_cachep);
60 kmem_cache_destroy(io_page_cachep);
61}
62
63void ext4_ioend_wait(struct inode *inode)
64{
65 wait_queue_head_t *wq = to_ioend_wq(inode);
66
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68}
69
70static void put_io_page(struct ext4_io_page *io_page)
71{
72 if (atomic_dec_and_test(&io_page->p_count)) {
73 end_page_writeback(io_page->p_page);
74 put_page(io_page->p_page);
75 kmem_cache_free(io_page_cachep, io_page);
76 }
77}
78
79void ext4_free_io_end(ext4_io_end_t *io)
80{
81 int i;
82 wait_queue_head_t *wq;
83
84 BUG_ON(!io);
85 if (io->page)
86 put_page(io->page);
87 for (i = 0; i < io->num_io_pages; i++)
88 put_io_page(io->pages[i]);
89 io->num_io_pages = 0;
90 wq = to_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq))
93 wake_up_all(wq);
94 kmem_cache_free(io_end_cachep, io);
95}
96
97/*
98 * check a range of space and convert unwritten extents to written.
99 */
100int ext4_end_io_nolock(ext4_io_end_t *io)
101{
102 struct inode *inode = io->inode;
103 loff_t offset = io->offset;
104 ssize_t size = io->size;
105 int ret = 0;
106
107 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
108 "list->prev 0x%p\n",
109 io, inode->i_ino, io->list.next, io->list.prev);
110
111 if (list_empty(&io->list))
112 return ret;
113
114 if (!(io->flag & EXT4_IO_END_UNWRITTEN))
115 return ret;
116
117 ret = ext4_convert_unwritten_extents(inode, offset, size);
118 if (ret < 0) {
119 printk(KERN_EMERG "%s: failed to convert unwritten "
120 "extents to written extents, error is %d "
121 "io is still on inode %lu aio dio list\n",
122 __func__, ret, inode->i_ino);
123 return ret;
124 }
125
126 if (io->iocb)
127 aio_complete(io->iocb, io->result, 0);
128 /* clear the DIO AIO unwritten flag */
129 io->flag &= ~EXT4_IO_END_UNWRITTEN;
130 return ret;
131}
132
133/*
134 * work on completed aio dio IO, to convert unwritten extents to extents
135 */
136static void ext4_end_io_work(struct work_struct *work)
137{
138 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
139 struct inode *inode = io->inode;
140 struct ext4_inode_info *ei = EXT4_I(inode);
141 unsigned long flags;
142 int ret;
143
144 mutex_lock(&inode->i_mutex);
145 ret = ext4_end_io_nolock(io);
146 if (ret < 0) {
147 mutex_unlock(&inode->i_mutex);
148 return;
149 }
150
151 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
152 if (!list_empty(&io->list))
153 list_del_init(&io->list);
154 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
155 mutex_unlock(&inode->i_mutex);
156 ext4_free_io_end(io);
157}
158
159ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
160{
161 ext4_io_end_t *io = NULL;
162
163 io = kmem_cache_alloc(io_end_cachep, flags);
164 if (io) {
165 memset(io, 0, sizeof(*io));
166 atomic_inc(&EXT4_I(inode)->i_ioend_count);
167 io->inode = inode;
168 INIT_WORK(&io->work, ext4_end_io_work);
169 INIT_LIST_HEAD(&io->list);
170 }
171 return io;
172}
173
174/*
175 * Print an buffer I/O error compatible with the fs/buffer.c. This
176 * provides compatibility with dmesg scrapers that look for a specific
177 * buffer I/O error message. We really need a unified error reporting
178 * structure to userspace ala Digital Unix's uerf system, but it's
179 * probably not going to happen in my lifetime, due to LKML politics...
180 */
181static void buffer_io_error(struct buffer_head *bh)
182{
183 char b[BDEVNAME_SIZE];
184 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
185 bdevname(bh->b_bdev, b),
186 (unsigned long long)bh->b_blocknr);
187}
188
189static void ext4_end_bio(struct bio *bio, int error)
190{
191 ext4_io_end_t *io_end = bio->bi_private;
192 struct workqueue_struct *wq;
193 struct inode *inode;
194 unsigned long flags;
195 int i;
196
197 BUG_ON(!io_end);
198 bio->bi_private = NULL;
199 bio->bi_end_io = NULL;
200 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
201 error = 0;
202 bio_put(bio);
203
204 for (i = 0; i < io_end->num_io_pages; i++) {
205 struct page *page = io_end->pages[i]->p_page;
206 struct buffer_head *bh, *head;
207 int partial_write = 0;
208
209 head = page_buffers(page);
210 if (error)
211 SetPageError(page);
212 BUG_ON(!head);
213 if (head->b_size == PAGE_CACHE_SIZE)
214 clear_buffer_dirty(head);
215 else {
216 loff_t offset;
217 loff_t io_end_offset = io_end->offset + io_end->size;
218
219 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
220 bh = head;
221 do {
222 if ((offset >= io_end->offset) &&
223 (offset+bh->b_size <= io_end_offset)) {
224 if (error)
225 buffer_io_error(bh);
226
227 clear_buffer_dirty(bh);
228 }
229 if (buffer_delay(bh))
230 partial_write = 1;
231 else if (!buffer_mapped(bh))
232 clear_buffer_dirty(bh);
233 else if (buffer_dirty(bh))
234 partial_write = 1;
235 offset += bh->b_size;
236 bh = bh->b_this_page;
237 } while (bh != head);
238 }
239
240 put_io_page(io_end->pages[i]);
241
242 /*
243 * If this is a partial write which happened to make
244 * all buffers uptodate then we can optimize away a
245 * bogus readpage() for the next read(). Here we
246 * 'discover' whether the page went uptodate as a
247 * result of this (potentially partial) write.
248 */
249 if (!partial_write)
250 SetPageUptodate(page);
251 }
252 io_end->num_io_pages = 0;
253 inode = io_end->inode;
254
255 if (error) {
256 io_end->flag |= EXT4_IO_END_ERROR;
257 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
258 "(offset %llu size %ld starting block %llu)",
259 inode->i_ino,
260 (unsigned long long) io_end->offset,
261 (long) io_end->size,
262 (unsigned long long)
263 bio->bi_sector >> (inode->i_blkbits - 9));
264 }
265
266 /* Add the io_end to per-inode completed io list*/
267 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
268 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
269 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
270
271 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
272 /* queue the work to convert unwritten extents to written */
273 queue_work(wq, &io_end->work);
274}
275
276void ext4_io_submit(struct ext4_io_submit *io)
277{
278 struct bio *bio = io->io_bio;
279
280 if (bio) {
281 bio_get(io->io_bio);
282 submit_bio(io->io_op, io->io_bio);
283 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
284 bio_put(io->io_bio);
285 }
286 io->io_bio = 0;
287 io->io_op = 0;
288 io->io_end = 0;
289}
290
291static int io_submit_init(struct ext4_io_submit *io,
292 struct inode *inode,
293 struct writeback_control *wbc,
294 struct buffer_head *bh)
295{
296 ext4_io_end_t *io_end;
297 struct page *page = bh->b_page;
298 int nvecs = bio_get_nr_vecs(bh->b_bdev);
299 struct bio *bio;
300
301 io_end = ext4_init_io_end(inode, GFP_NOFS);
302 if (!io_end)
303 return -ENOMEM;
304 do {
305 bio = bio_alloc(GFP_NOIO, nvecs);
306 nvecs >>= 1;
307 } while (bio == NULL);
308
309 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
310 bio->bi_bdev = bh->b_bdev;
311 bio->bi_private = io->io_end = io_end;
312 bio->bi_end_io = ext4_end_bio;
313
314 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
315
316 io->io_bio = bio;
317 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
318 WRITE_SYNC_PLUG : WRITE);
319 io->io_next_block = bh->b_blocknr;
320 return 0;
321}
322
323static int io_submit_add_bh(struct ext4_io_submit *io,
324 struct ext4_io_page *io_page,
325 struct inode *inode,
326 struct writeback_control *wbc,
327 struct buffer_head *bh)
328{
329 ext4_io_end_t *io_end;
330 int ret;
331
332 if (buffer_new(bh)) {
333 clear_buffer_new(bh);
334 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
335 }
336
337 if (!buffer_mapped(bh) || buffer_delay(bh)) {
338 if (!buffer_mapped(bh))
339 clear_buffer_dirty(bh);
340 if (io->io_bio)
341 ext4_io_submit(io);
342 return 0;
343 }
344
345 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
346submit_and_retry:
347 ext4_io_submit(io);
348 }
349 if (io->io_bio == NULL) {
350 ret = io_submit_init(io, inode, wbc, bh);
351 if (ret)
352 return ret;
353 }
354 io_end = io->io_end;
355 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
356 (io_end->pages[io_end->num_io_pages-1] != io_page))
357 goto submit_and_retry;
358 if (buffer_uninit(bh))
359 io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
360 io->io_end->size += bh->b_size;
361 io->io_next_block++;
362 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
363 if (ret != bh->b_size)
364 goto submit_and_retry;
365 if ((io_end->num_io_pages == 0) ||
366 (io_end->pages[io_end->num_io_pages-1] != io_page)) {
367 io_end->pages[io_end->num_io_pages++] = io_page;
368 atomic_inc(&io_page->p_count);
369 }
370 return 0;
371}
372
373int ext4_bio_write_page(struct ext4_io_submit *io,
374 struct page *page,
375 int len,
376 struct writeback_control *wbc)
377{
378 struct inode *inode = page->mapping->host;
379 unsigned block_start, block_end, blocksize;
380 struct ext4_io_page *io_page;
381 struct buffer_head *bh, *head;
382 int ret = 0;
383
384 blocksize = 1 << inode->i_blkbits;
385
386 BUG_ON(PageWriteback(page));
387 set_page_writeback(page);
388 ClearPageError(page);
389
390 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
391 if (!io_page) {
392 set_page_dirty(page);
393 unlock_page(page);
394 return -ENOMEM;
395 }
396 io_page->p_page = page;
397 atomic_set(&io_page->p_count, 1);
398 get_page(page);
399
400 for (bh = head = page_buffers(page), block_start = 0;
401 bh != head || !block_start;
402 block_start = block_end, bh = bh->b_this_page) {
403 block_end = block_start + blocksize;
404 if (block_start >= len) {
405 clear_buffer_dirty(bh);
406 set_buffer_uptodate(bh);
407 continue;
408 }
409 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
410 if (ret) {
411 /*
412 * We only get here on ENOMEM. Not much else
413 * we can do but mark the page as dirty, and
414 * better luck next time.
415 */
416 set_page_dirty(page);
417 break;
418 }
419 }
420 unlock_page(page);
421 /*
422 * If the page was truncated before we could do the writeback,
423 * or we had a memory allocation error while trying to write
424 * the first buffer head, we won't have submitted any pages for
425 * I/O. In that case we need to make sure we've cleared the
426 * PageWriteback bit from the page to prevent the system from
427 * wedging later on.
428 */
429 put_io_page(io_page);
430 return ret;
431}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca5c8aa00a2f..dc963929de65 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -226,23 +226,13 @@ static int setup_new_group_blocks(struct super_block *sb,
226 } 226 }
227 227
228 /* Zero out all of the reserved backup group descriptor table blocks */ 228 /* Zero out all of the reserved backup group descriptor table blocks */
229 for (i = 0, bit = gdblocks + 1, block = start + bit; 229 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
230 i < reserved_gdb; i++, block++, bit++) { 230 block, sbi->s_itb_per_group);
231 struct buffer_head *gdb; 231 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
232 232 GFP_NOFS);
233 ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); 233 if (err)
234 234 goto exit_bh;
235 if ((err = extend_or_restart_transaction(handle, 1, bh)))
236 goto exit_bh;
237 235
238 if (IS_ERR(gdb = bclean(handle, sb, block))) {
239 err = PTR_ERR(gdb);
240 goto exit_bh;
241 }
242 ext4_handle_dirty_metadata(handle, NULL, gdb);
243 ext4_set_bit(bit, bh->b_data);
244 brelse(gdb);
245 }
246 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 236 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
247 input->block_bitmap - start); 237 input->block_bitmap - start);
248 ext4_set_bit(input->block_bitmap - start, bh->b_data); 238 ext4_set_bit(input->block_bitmap - start, bh->b_data);
@@ -251,28 +241,18 @@ static int setup_new_group_blocks(struct super_block *sb,
251 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 241 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
252 242
253 /* Zero out all of the inode table blocks */ 243 /* Zero out all of the inode table blocks */
254 for (i = 0, block = input->inode_table, bit = block - start; 244 block = input->inode_table;
255 i < sbi->s_itb_per_group; i++, bit++, block++) { 245 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
256 struct buffer_head *it; 246 block, sbi->s_itb_per_group);
257 247 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
258 ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); 248 if (err)
259 249 goto exit_bh;
260 if ((err = extend_or_restart_transaction(handle, 1, bh)))
261 goto exit_bh;
262
263 if (IS_ERR(it = bclean(handle, sb, block))) {
264 err = PTR_ERR(it);
265 goto exit_bh;
266 }
267 ext4_handle_dirty_metadata(handle, NULL, it);
268 brelse(it);
269 ext4_set_bit(bit, bh->b_data);
270 }
271 250
272 if ((err = extend_or_restart_transaction(handle, 2, bh))) 251 if ((err = extend_or_restart_transaction(handle, 2, bh)))
273 goto exit_bh; 252 goto exit_bh;
274 253
275 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); 254 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
255 bh->b_data);
276 ext4_handle_dirty_metadata(handle, NULL, bh); 256 ext4_handle_dirty_metadata(handle, NULL, bh);
277 brelse(bh); 257 brelse(bh);
278 /* Mark unused entries in inode bitmap used */ 258 /* Mark unused entries in inode bitmap used */
@@ -283,8 +263,8 @@ static int setup_new_group_blocks(struct super_block *sb,
283 goto exit_journal; 263 goto exit_journal;
284 } 264 }
285 265
286 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 266 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
287 bh->b_data); 267 bh->b_data);
288 ext4_handle_dirty_metadata(handle, NULL, bh); 268 ext4_handle_dirty_metadata(handle, NULL, bh);
289exit_bh: 269exit_bh:
290 brelse(bh); 270 brelse(bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7f47c366bf15..61182fe6254e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/blkdev.h> 27#include <linux/blkdev.h>
28#include <linux/parser.h> 28#include <linux/parser.h>
29#include <linux/smp_lock.h>
30#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 30#include <linux/exportfs.h>
32#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -41,6 +40,9 @@
41#include <linux/crc16.h> 40#include <linux/crc16.h>
42#include <asm/uaccess.h> 41#include <asm/uaccess.h>
43 42
43#include <linux/kthread.h>
44#include <linux/freezer.h>
45
44#include "ext4.h" 46#include "ext4.h"
45#include "ext4_jbd2.h" 47#include "ext4_jbd2.h"
46#include "xattr.h" 48#include "xattr.h"
@@ -50,8 +52,11 @@
50#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 53#include <trace/events/ext4.h>
52 54
53struct proc_dir_entry *ext4_proc_root; 55static struct proc_dir_entry *ext4_proc_root;
54static struct kset *ext4_kset; 56static struct kset *ext4_kset;
57struct ext4_lazy_init *ext4_li_info;
58struct mutex ext4_li_mtx;
59struct ext4_features *ext4_feat;
55 60
56static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 61static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
57 unsigned long journal_devnum); 62 unsigned long journal_devnum);
@@ -68,14 +73,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 73static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 74static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt); 77 const char *dev_name, void *data);
78static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb);
73 80
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = { 82static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE, 83 .owner = THIS_MODULE,
77 .name = "ext3", 84 .name = "ext3",
78 .get_sb = ext4_get_sb, 85 .mount = ext4_mount,
79 .kill_sb = kill_block_super, 86 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV, 87 .fs_flags = FS_REQUIRES_DEV,
81}; 88};
@@ -702,13 +709,13 @@ static void ext4_put_super(struct super_block *sb)
702 struct ext4_super_block *es = sbi->s_es; 709 struct ext4_super_block *es = sbi->s_es;
703 int i, err; 710 int i, err;
704 711
712 ext4_unregister_li_request(sb);
705 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 713 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
706 714
707 flush_workqueue(sbi->dio_unwritten_wq); 715 flush_workqueue(sbi->dio_unwritten_wq);
708 destroy_workqueue(sbi->dio_unwritten_wq); 716 destroy_workqueue(sbi->dio_unwritten_wq);
709 717
710 lock_super(sb); 718 lock_super(sb);
711 lock_kernel();
712 if (sb->s_dirt) 719 if (sb->s_dirt)
713 ext4_commit_super(sb, 1); 720 ext4_commit_super(sb, 1);
714 721
@@ -719,6 +726,7 @@ static void ext4_put_super(struct super_block *sb)
719 ext4_abort(sb, "Couldn't clean up the journal"); 726 ext4_abort(sb, "Couldn't clean up the journal");
720 } 727 }
721 728
729 del_timer(&sbi->s_err_report);
722 ext4_release_system_zone(sb); 730 ext4_release_system_zone(sb);
723 ext4_mb_release(sb); 731 ext4_mb_release(sb);
724 ext4_ext_release(sb); 732 ext4_ext_release(sb);
@@ -775,7 +783,6 @@ static void ext4_put_super(struct super_block *sb)
775 * Now that we are completely done shutting down the 783 * Now that we are completely done shutting down the
776 * superblock, we need to actually destroy the kobject. 784 * superblock, we need to actually destroy the kobject.
777 */ 785 */
778 unlock_kernel();
779 unlock_super(sb); 786 unlock_super(sb);
780 kobject_put(&sbi->s_kobj); 787 kobject_put(&sbi->s_kobj);
781 wait_for_completion(&sbi->s_kobj_unregister); 788 wait_for_completion(&sbi->s_kobj_unregister);
@@ -821,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
821 ei->cur_aio_dio = NULL; 828 ei->cur_aio_dio = NULL;
822 ei->i_sync_tid = 0; 829 ei->i_sync_tid = 0;
823 ei->i_datasync_tid = 0; 830 ei->i_datasync_tid = 0;
831 atomic_set(&ei->i_ioend_count, 0);
824 832
825 return &ei->vfs_inode; 833 return &ei->vfs_inode;
826} 834}
827 835
/*
 * ->drop_inode hook: defer the decision to generic_drop_inode() and report
 * the outcome through the ext4_drop_inode tracepoint.
 */
static int ext4_drop_inode(struct inode *inode)
{
	int should_drop = generic_drop_inode(inode);

	trace_ext4_drop_inode(inode, should_drop);
	return should_drop;
}
843
828static void ext4_destroy_inode(struct inode *inode) 844static void ext4_destroy_inode(struct inode *inode)
829{ 845{
846 ext4_ioend_wait(inode);
830 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 847 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
831 ext4_msg(inode->i_sb, KERN_ERR, 848 ext4_msg(inode->i_sb, KERN_ERR,
832 "Inode %lu (%p): orphan list check failed!", 849 "Inode %lu (%p): orphan list check failed!",
@@ -1045,6 +1062,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1045 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) 1062 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1046 seq_puts(seq, ",block_validity"); 1063 seq_puts(seq, ",block_validity");
1047 1064
1065 if (!test_opt(sb, INIT_INODE_TABLE))
1066 seq_puts(seq, ",noinit_inode_table");
1067 else if (sbi->s_li_wait_mult)
1068 seq_printf(seq, ",init_inode_table=%u",
1069 (unsigned) sbi->s_li_wait_mult);
1070
1048 ext4_show_quota_options(seq, sb); 1071 ext4_show_quota_options(seq, sb);
1049 1072
1050 return 0; 1073 return 0;
@@ -1160,6 +1183,7 @@ static const struct super_operations ext4_sops = {
1160 .destroy_inode = ext4_destroy_inode, 1183 .destroy_inode = ext4_destroy_inode,
1161 .write_inode = ext4_write_inode, 1184 .write_inode = ext4_write_inode,
1162 .dirty_inode = ext4_dirty_inode, 1185 .dirty_inode = ext4_dirty_inode,
1186 .drop_inode = ext4_drop_inode,
1163 .evict_inode = ext4_evict_inode, 1187 .evict_inode = ext4_evict_inode,
1164 .put_super = ext4_put_super, 1188 .put_super = ext4_put_super,
1165 .sync_fs = ext4_sync_fs, 1189 .sync_fs = ext4_sync_fs,
@@ -1173,6 +1197,7 @@ static const struct super_operations ext4_sops = {
1173 .quota_write = ext4_quota_write, 1197 .quota_write = ext4_quota_write,
1174#endif 1198#endif
1175 .bdev_try_to_free_page = bdev_try_to_free_page, 1199 .bdev_try_to_free_page = bdev_try_to_free_page,
1200 .trim_fs = ext4_trim_fs
1176}; 1201};
1177 1202
1178static const struct super_operations ext4_nojournal_sops = { 1203static const struct super_operations ext4_nojournal_sops = {
@@ -1180,6 +1205,7 @@ static const struct super_operations ext4_nojournal_sops = {
1180 .destroy_inode = ext4_destroy_inode, 1205 .destroy_inode = ext4_destroy_inode,
1181 .write_inode = ext4_write_inode, 1206 .write_inode = ext4_write_inode,
1182 .dirty_inode = ext4_dirty_inode, 1207 .dirty_inode = ext4_dirty_inode,
1208 .drop_inode = ext4_drop_inode,
1183 .evict_inode = ext4_evict_inode, 1209 .evict_inode = ext4_evict_inode,
1184 .write_super = ext4_write_super, 1210 .write_super = ext4_write_super,
1185 .put_super = ext4_put_super, 1211 .put_super = ext4_put_super,
@@ -1219,6 +1245,7 @@ enum {
1219 Opt_inode_readahead_blks, Opt_journal_ioprio, 1245 Opt_inode_readahead_blks, Opt_journal_ioprio,
1220 Opt_dioread_nolock, Opt_dioread_lock, 1246 Opt_dioread_nolock, Opt_dioread_lock,
1221 Opt_discard, Opt_nodiscard, 1247 Opt_discard, Opt_nodiscard,
1248 Opt_init_inode_table, Opt_noinit_inode_table,
1222}; 1249};
1223 1250
1224static const match_table_t tokens = { 1251static const match_table_t tokens = {
@@ -1289,6 +1316,9 @@ static const match_table_t tokens = {
1289 {Opt_dioread_lock, "dioread_lock"}, 1316 {Opt_dioread_lock, "dioread_lock"},
1290 {Opt_discard, "discard"}, 1317 {Opt_discard, "discard"},
1291 {Opt_nodiscard, "nodiscard"}, 1318 {Opt_nodiscard, "nodiscard"},
1319 {Opt_init_inode_table, "init_itable=%u"},
1320 {Opt_init_inode_table, "init_itable"},
1321 {Opt_noinit_inode_table, "noinit_itable"},
1292 {Opt_err, NULL}, 1322 {Opt_err, NULL},
1293}; 1323};
1294 1324
@@ -1759,6 +1789,20 @@ set_qf_format:
1759 case Opt_dioread_lock: 1789 case Opt_dioread_lock:
1760 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1790 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1761 break; 1791 break;
1792 case Opt_init_inode_table:
1793 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
1794 if (args[0].from) {
1795 if (match_int(&args[0], &option))
1796 return 0;
1797 } else
1798 option = EXT4_DEF_LI_WAIT_MULT;
1799 if (option < 0)
1800 return 0;
1801 sbi->s_li_wait_mult = option;
1802 break;
1803 case Opt_noinit_inode_table:
1804 clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
1805 break;
1762 default: 1806 default:
1763 ext4_msg(sb, KERN_ERR, 1807 ext4_msg(sb, KERN_ERR,
1764 "Unrecognized mount option \"%s\" " 1808 "Unrecognized mount option \"%s\" "
@@ -1942,7 +1986,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
1942} 1986}
1943 1987
1944/* Called at mount-time, super-block is locked */ 1988/* Called at mount-time, super-block is locked */
1945static int ext4_check_descriptors(struct super_block *sb) 1989static int ext4_check_descriptors(struct super_block *sb,
1990 ext4_group_t *first_not_zeroed)
1946{ 1991{
1947 struct ext4_sb_info *sbi = EXT4_SB(sb); 1992 struct ext4_sb_info *sbi = EXT4_SB(sb);
1948 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); 1993 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1951,7 +1996,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1951 ext4_fsblk_t inode_bitmap; 1996 ext4_fsblk_t inode_bitmap;
1952 ext4_fsblk_t inode_table; 1997 ext4_fsblk_t inode_table;
1953 int flexbg_flag = 0; 1998 int flexbg_flag = 0;
1954 ext4_group_t i; 1999 ext4_group_t i, grp = sbi->s_groups_count;
1955 2000
1956 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 2001 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1957 flexbg_flag = 1; 2002 flexbg_flag = 1;
@@ -1967,6 +2012,10 @@ static int ext4_check_descriptors(struct super_block *sb)
1967 last_block = first_block + 2012 last_block = first_block +
1968 (EXT4_BLOCKS_PER_GROUP(sb) - 1); 2013 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1969 2014
2015 if ((grp == sbi->s_groups_count) &&
2016 !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2017 grp = i;
2018
1970 block_bitmap = ext4_block_bitmap(sb, gdp); 2019 block_bitmap = ext4_block_bitmap(sb, gdp);
1971 if (block_bitmap < first_block || block_bitmap > last_block) { 2020 if (block_bitmap < first_block || block_bitmap > last_block) {
1972 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2021 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2004,6 +2053,8 @@ static int ext4_check_descriptors(struct super_block *sb)
2004 if (!flexbg_flag) 2053 if (!flexbg_flag)
2005 first_block += EXT4_BLOCKS_PER_GROUP(sb); 2054 first_block += EXT4_BLOCKS_PER_GROUP(sb);
2006 } 2055 }
2056 if (NULL != first_not_zeroed)
2057 *first_not_zeroed = grp;
2007 2058
2008 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 2059 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
2009 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); 2060 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2376,6 +2427,7 @@ static struct ext4_attr ext4_attr_##_name = { \
2376#define EXT4_ATTR(name, mode, show, store) \ 2427#define EXT4_ATTR(name, mode, show, store) \
2377static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2428static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2378 2429
2430#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
2379#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) 2431#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2380#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) 2432#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2381#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2433#define EXT4_RW_ATTR_SBI_UI(name, elname) \
@@ -2412,6 +2464,16 @@ static struct attribute *ext4_attrs[] = {
2412 NULL, 2464 NULL,
2413}; 2465};
2414 2466
2467/* Features this copy of ext4 supports */
2468EXT4_INFO_ATTR(lazy_itable_init);
2469EXT4_INFO_ATTR(batched_discard);
2470
2471static struct attribute *ext4_feat_attrs[] = {
2472 ATTR_LIST(lazy_itable_init),
2473 ATTR_LIST(batched_discard),
2474 NULL,
2475};
2476
2415static ssize_t ext4_attr_show(struct kobject *kobj, 2477static ssize_t ext4_attr_show(struct kobject *kobj,
2416 struct attribute *attr, char *buf) 2478 struct attribute *attr, char *buf)
2417{ 2479{
@@ -2440,7 +2502,6 @@ static void ext4_sb_release(struct kobject *kobj)
2440 complete(&sbi->s_kobj_unregister); 2502 complete(&sbi->s_kobj_unregister);
2441} 2503}
2442 2504
2443
2444static const struct sysfs_ops ext4_attr_ops = { 2505static const struct sysfs_ops ext4_attr_ops = {
2445 .show = ext4_attr_show, 2506 .show = ext4_attr_show,
2446 .store = ext4_attr_store, 2507 .store = ext4_attr_store,
@@ -2452,6 +2513,17 @@ static struct kobj_type ext4_ktype = {
2452 .release = ext4_sb_release, 2513 .release = ext4_sb_release,
2453}; 2514};
2454 2515
2516static void ext4_feat_release(struct kobject *kobj)
2517{
2518 complete(&ext4_feat->f_kobj_unregister);
2519}
2520
2521static struct kobj_type ext4_feat_ktype = {
2522 .default_attrs = ext4_feat_attrs,
2523 .sysfs_ops = &ext4_attr_ops,
2524 .release = ext4_feat_release,
2525};
2526
2455/* 2527/*
2456 * Check whether this filesystem can be mounted based on 2528 * Check whether this filesystem can be mounted based on
2457 * the features present and the RDONLY/RDWR mount requested. 2529 * the features present and the RDONLY/RDWR mount requested.
@@ -2542,6 +2614,371 @@ static void print_daily_error_info(unsigned long arg)
2542 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2614 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2543} 2615}
2544 2616
/*
 * Timer callback: @data is a task_struct pointer cast to unsigned long;
 * wake that task up.
 */
static void ext4_lazyinode_timeout(unsigned long data)
{
	wake_up_process((struct task_struct *)data);
}
2622
2623/* Find next suitable group and run ext4_init_inode_table */
2624static int ext4_run_li_request(struct ext4_li_request *elr)
2625{
2626 struct ext4_group_desc *gdp = NULL;
2627 ext4_group_t group, ngroups;
2628 struct super_block *sb;
2629 unsigned long timeout = 0;
2630 int ret = 0;
2631
2632 sb = elr->lr_super;
2633 ngroups = EXT4_SB(sb)->s_groups_count;
2634
2635 for (group = elr->lr_next_group; group < ngroups; group++) {
2636 gdp = ext4_get_group_desc(sb, group, NULL);
2637 if (!gdp) {
2638 ret = 1;
2639 break;
2640 }
2641
2642 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2643 break;
2644 }
2645
2646 if (group == ngroups)
2647 ret = 1;
2648
2649 if (!ret) {
2650 timeout = jiffies;
2651 ret = ext4_init_inode_table(sb, group,
2652 elr->lr_timeout ? 0 : 1);
2653 if (elr->lr_timeout == 0) {
2654 timeout = jiffies - timeout;
2655 if (elr->lr_sbi->s_li_wait_mult)
2656 timeout *= elr->lr_sbi->s_li_wait_mult;
2657 else
2658 timeout *= 20;
2659 elr->lr_timeout = timeout;
2660 }
2661 elr->lr_next_sched = jiffies + elr->lr_timeout;
2662 elr->lr_next_group = group + 1;
2663 }
2664
2665 return ret;
2666}
2667
2668/*
2669 * Remove lr_request from the list_request and free the
2670 * request tructure. Should be called with li_list_mtx held
2671 */
2672static void ext4_remove_li_request(struct ext4_li_request *elr)
2673{
2674 struct ext4_sb_info *sbi;
2675
2676 if (!elr)
2677 return;
2678
2679 sbi = elr->lr_sbi;
2680
2681 list_del(&elr->lr_request);
2682 sbi->s_li_request = NULL;
2683 kfree(elr);
2684}
2685
2686static void ext4_unregister_li_request(struct super_block *sb)
2687{
2688 struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
2689
2690 if (!ext4_li_info)
2691 return;
2692
2693 mutex_lock(&ext4_li_info->li_list_mtx);
2694 ext4_remove_li_request(elr);
2695 mutex_unlock(&ext4_li_info->li_list_mtx);
2696}
2697
2698/*
2699 * This is the function where ext4lazyinit thread lives. It walks
2700 * through the request list searching for next scheduled filesystem.
2701 * When such a fs is found, run the lazy initialization request
2702 * (ext4_rn_li_request) and keep track of the time spend in this
2703 * function. Based on that time we compute next schedule time of
2704 * the request. When walking through the list is complete, compute
2705 * next waking time and put itself into sleep.
2706 */
2707static int ext4_lazyinit_thread(void *arg)
2708{
2709 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2710 struct list_head *pos, *n;
2711 struct ext4_li_request *elr;
2712 unsigned long next_wakeup;
2713 DEFINE_WAIT(wait);
2714
2715 BUG_ON(NULL == eli);
2716
2717 eli->li_timer.data = (unsigned long)current;
2718 eli->li_timer.function = ext4_lazyinode_timeout;
2719
2720 eli->li_task = current;
2721 wake_up(&eli->li_wait_task);
2722
2723cont_thread:
2724 while (true) {
2725 next_wakeup = MAX_JIFFY_OFFSET;
2726
2727 mutex_lock(&eli->li_list_mtx);
2728 if (list_empty(&eli->li_request_list)) {
2729 mutex_unlock(&eli->li_list_mtx);
2730 goto exit_thread;
2731 }
2732
2733 list_for_each_safe(pos, n, &eli->li_request_list) {
2734 elr = list_entry(pos, struct ext4_li_request,
2735 lr_request);
2736
2737 if (time_after_eq(jiffies, elr->lr_next_sched)) {
2738 if (ext4_run_li_request(elr) != 0) {
2739 /* error, remove the lazy_init job */
2740 ext4_remove_li_request(elr);
2741 continue;
2742 }
2743 }
2744
2745 if (time_before(elr->lr_next_sched, next_wakeup))
2746 next_wakeup = elr->lr_next_sched;
2747 }
2748 mutex_unlock(&eli->li_list_mtx);
2749
2750 if (freezing(current))
2751 refrigerator();
2752
2753 if ((time_after_eq(jiffies, next_wakeup)) ||
2754 (MAX_JIFFY_OFFSET == next_wakeup)) {
2755 cond_resched();
2756 continue;
2757 }
2758
2759 eli->li_timer.expires = next_wakeup;
2760 add_timer(&eli->li_timer);
2761 prepare_to_wait(&eli->li_wait_daemon, &wait,
2762 TASK_INTERRUPTIBLE);
2763 if (time_before(jiffies, next_wakeup))
2764 schedule();
2765 finish_wait(&eli->li_wait_daemon, &wait);
2766 }
2767
2768exit_thread:
2769 /*
2770 * It looks like the request list is empty, but we need
2771 * to check it under the li_list_mtx lock, to prevent any
2772 * additions into it, and of course we should lock ext4_li_mtx
2773 * to atomically free the list and ext4_li_info, because at
2774 * this point another ext4 filesystem could be registering
2775 * new one.
2776 */
2777 mutex_lock(&ext4_li_mtx);
2778 mutex_lock(&eli->li_list_mtx);
2779 if (!list_empty(&eli->li_request_list)) {
2780 mutex_unlock(&eli->li_list_mtx);
2781 mutex_unlock(&ext4_li_mtx);
2782 goto cont_thread;
2783 }
2784 mutex_unlock(&eli->li_list_mtx);
2785 del_timer_sync(&ext4_li_info->li_timer);
2786 eli->li_task = NULL;
2787 wake_up(&eli->li_wait_task);
2788
2789 kfree(ext4_li_info);
2790 ext4_li_info = NULL;
2791 mutex_unlock(&ext4_li_mtx);
2792
2793 return 0;
2794}
2795
2796static void ext4_clear_request_list(void)
2797{
2798 struct list_head *pos, *n;
2799 struct ext4_li_request *elr;
2800
2801 mutex_lock(&ext4_li_info->li_list_mtx);
2802 if (list_empty(&ext4_li_info->li_request_list))
2803 return;
2804
2805 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2806 elr = list_entry(pos, struct ext4_li_request,
2807 lr_request);
2808 ext4_remove_li_request(elr);
2809 }
2810 mutex_unlock(&ext4_li_info->li_list_mtx);
2811}
2812
2813static int ext4_run_lazyinit_thread(void)
2814{
2815 struct task_struct *t;
2816
2817 t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
2818 if (IS_ERR(t)) {
2819 int err = PTR_ERR(t);
2820 ext4_clear_request_list();
2821 del_timer_sync(&ext4_li_info->li_timer);
2822 kfree(ext4_li_info);
2823 ext4_li_info = NULL;
2824 printk(KERN_CRIT "EXT4: error %d creating inode table "
2825 "initialization thread\n",
2826 err);
2827 return err;
2828 }
2829 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2830
2831 wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
2832 return 0;
2833}
2834
2835/*
2836 * Check whether it make sense to run itable init. thread or not.
2837 * If there is at least one uninitialized inode table, return
2838 * corresponding group number, else the loop goes through all
2839 * groups and return total number of groups.
2840 */
2841static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
2842{
2843 ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
2844 struct ext4_group_desc *gdp = NULL;
2845
2846 for (group = 0; group < ngroups; group++) {
2847 gdp = ext4_get_group_desc(sb, group, NULL);
2848 if (!gdp)
2849 continue;
2850
2851 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2852 break;
2853 }
2854
2855 return group;
2856}
2857
2858static int ext4_li_info_new(void)
2859{
2860 struct ext4_lazy_init *eli = NULL;
2861
2862 eli = kzalloc(sizeof(*eli), GFP_KERNEL);
2863 if (!eli)
2864 return -ENOMEM;
2865
2866 eli->li_task = NULL;
2867 INIT_LIST_HEAD(&eli->li_request_list);
2868 mutex_init(&eli->li_list_mtx);
2869
2870 init_waitqueue_head(&eli->li_wait_daemon);
2871 init_waitqueue_head(&eli->li_wait_task);
2872 init_timer(&eli->li_timer);
2873 eli->li_state |= EXT4_LAZYINIT_QUIT;
2874
2875 ext4_li_info = eli;
2876
2877 return 0;
2878}
2879
2880static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
2881 ext4_group_t start)
2882{
2883 struct ext4_sb_info *sbi = EXT4_SB(sb);
2884 struct ext4_li_request *elr;
2885 unsigned long rnd;
2886
2887 elr = kzalloc(sizeof(*elr), GFP_KERNEL);
2888 if (!elr)
2889 return NULL;
2890
2891 elr->lr_super = sb;
2892 elr->lr_sbi = sbi;
2893 elr->lr_next_group = start;
2894
2895 /*
2896 * Randomize first schedule time of the request to
2897 * spread the inode table initialization requests
2898 * better.
2899 */
2900 get_random_bytes(&rnd, sizeof(rnd));
2901 elr->lr_next_sched = jiffies + (unsigned long)rnd %
2902 (EXT4_DEF_LI_MAX_START_DELAY * HZ);
2903
2904 return elr;
2905}
2906
2907static int ext4_register_li_request(struct super_block *sb,
2908 ext4_group_t first_not_zeroed)
2909{
2910 struct ext4_sb_info *sbi = EXT4_SB(sb);
2911 struct ext4_li_request *elr;
2912 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2913 int ret;
2914
2915 if (sbi->s_li_request != NULL)
2916 return 0;
2917
2918 if (first_not_zeroed == ngroups ||
2919 (sb->s_flags & MS_RDONLY) ||
2920 !test_opt(sb, INIT_INODE_TABLE)) {
2921 sbi->s_li_request = NULL;
2922 return 0;
2923 }
2924
2925 if (first_not_zeroed == ngroups) {
2926 sbi->s_li_request = NULL;
2927 return 0;
2928 }
2929
2930 elr = ext4_li_request_new(sb, first_not_zeroed);
2931 if (!elr)
2932 return -ENOMEM;
2933
2934 mutex_lock(&ext4_li_mtx);
2935
2936 if (NULL == ext4_li_info) {
2937 ret = ext4_li_info_new();
2938 if (ret)
2939 goto out;
2940 }
2941
2942 mutex_lock(&ext4_li_info->li_list_mtx);
2943 list_add(&elr->lr_request, &ext4_li_info->li_request_list);
2944 mutex_unlock(&ext4_li_info->li_list_mtx);
2945
2946 sbi->s_li_request = elr;
2947
2948 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
2949 ret = ext4_run_lazyinit_thread();
2950 if (ret)
2951 goto out;
2952 }
2953out:
2954 mutex_unlock(&ext4_li_mtx);
2955 if (ret)
2956 kfree(elr);
2957 return ret;
2958}
2959
2960/*
2961 * We do not need to lock anything since this is called on
2962 * module unload.
2963 */
2964static void ext4_destroy_lazyinit_thread(void)
2965{
2966 /*
2967 * If thread exited earlier
2968 * there's nothing to be done.
2969 */
2970 if (!ext4_li_info)
2971 return;
2972
2973 ext4_clear_request_list();
2974
2975 while (ext4_li_info->li_task) {
2976 wake_up(&ext4_li_info->li_wait_daemon);
2977 wait_event(ext4_li_info->li_wait_task,
2978 ext4_li_info->li_task == NULL);
2979 }
2980}
2981
2545static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2982static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2546 __releases(kernel_lock) 2983 __releases(kernel_lock)
2547 __acquires(kernel_lock) 2984 __acquires(kernel_lock)
@@ -2567,6 +3004,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2567 __u64 blocks_count; 3004 __u64 blocks_count;
2568 int err; 3005 int err;
2569 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3006 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3007 ext4_group_t first_not_zeroed;
2570 3008
2571 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 3009 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2572 if (!sbi) 3010 if (!sbi)
@@ -2588,8 +3026,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2588 sbi->s_sectors_written_start = 3026 sbi->s_sectors_written_start =
2589 part_stat_read(sb->s_bdev->bd_part, sectors[1]); 3027 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
2590 3028
2591 unlock_kernel();
2592
2593 /* Cleanup superblock name */ 3029 /* Cleanup superblock name */
2594 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3030 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
2595 *cp = '!'; 3031 *cp = '!';
@@ -2629,6 +3065,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2629 3065
2630 /* Set defaults before we parse the mount options */ 3066 /* Set defaults before we parse the mount options */
2631 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3067 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3068 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
2632 if (def_mount_opts & EXT4_DEFM_DEBUG) 3069 if (def_mount_opts & EXT4_DEFM_DEBUG)
2633 set_opt(sbi->s_mount_opt, DEBUG); 3070 set_opt(sbi->s_mount_opt, DEBUG);
2634 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3071 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
@@ -2906,7 +3343,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2906 goto failed_mount2; 3343 goto failed_mount2;
2907 } 3344 }
2908 } 3345 }
2909 if (!ext4_check_descriptors(sb)) { 3346 if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
2910 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3347 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
2911 goto failed_mount2; 3348 goto failed_mount2;
2912 } 3349 }
@@ -2922,6 +3359,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2922 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3359 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2923 spin_lock_init(&sbi->s_next_gen_lock); 3360 spin_lock_init(&sbi->s_next_gen_lock);
2924 3361
3362 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3363 ext4_count_free_blocks(sb));
3364 if (!err) {
3365 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3366 ext4_count_free_inodes(sb));
3367 }
3368 if (!err) {
3369 err = percpu_counter_init(&sbi->s_dirs_counter,
3370 ext4_count_dirs(sb));
3371 }
3372 if (!err) {
3373 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
3374 }
3375 if (err) {
3376 ext4_msg(sb, KERN_ERR, "insufficient memory");
3377 goto failed_mount3;
3378 }
3379
2925 sbi->s_stripe = ext4_get_stripe_size(sbi); 3380 sbi->s_stripe = ext4_get_stripe_size(sbi);
2926 sbi->s_max_writeback_mb_bump = 128; 3381 sbi->s_max_writeback_mb_bump = 128;
2927 3382
@@ -3020,22 +3475,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3020 } 3475 }
3021 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3476 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3022 3477
3023no_journal: 3478 /*
3024 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3479 * The journal may have updated the bg summary counts, so we
3025 ext4_count_free_blocks(sb)); 3480 * need to update the global counters.
3026 if (!err) 3481 */
3027 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3482 percpu_counter_set(&sbi->s_freeblocks_counter,
3028 ext4_count_free_inodes(sb)); 3483 ext4_count_free_blocks(sb));
3029 if (!err) 3484 percpu_counter_set(&sbi->s_freeinodes_counter,
3030 err = percpu_counter_init(&sbi->s_dirs_counter, 3485 ext4_count_free_inodes(sb));
3031 ext4_count_dirs(sb)); 3486 percpu_counter_set(&sbi->s_dirs_counter,
3032 if (!err) 3487 ext4_count_dirs(sb));
3033 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3488 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3034 if (err) {
3035 ext4_msg(sb, KERN_ERR, "insufficient memory");
3036 goto failed_mount_wq;
3037 }
3038 3489
3490no_journal:
3039 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3491 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
3040 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3492 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3041 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3493 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3127,6 +3579,10 @@ no_journal:
3127 goto failed_mount4; 3579 goto failed_mount4;
3128 } 3580 }
3129 3581
3582 err = ext4_register_li_request(sb, first_not_zeroed);
3583 if (err)
3584 goto failed_mount4;
3585
3130 sbi->s_kobj.kset = ext4_kset; 3586 sbi->s_kobj.kset = ext4_kset;
3131 init_completion(&sbi->s_kobj_unregister); 3587 init_completion(&sbi->s_kobj_unregister);
3132 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, 3588 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3164,7 +3620,6 @@ no_journal:
3164 if (es->s_error_count) 3620 if (es->s_error_count)
3165 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3621 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3166 3622
3167 lock_kernel();
3168 kfree(orig_data); 3623 kfree(orig_data);
3169 return 0; 3624 return 0;
3170 3625
@@ -3182,10 +3637,6 @@ failed_mount_wq:
3182 jbd2_journal_destroy(sbi->s_journal); 3637 jbd2_journal_destroy(sbi->s_journal);
3183 sbi->s_journal = NULL; 3638 sbi->s_journal = NULL;
3184 } 3639 }
3185 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3186 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3187 percpu_counter_destroy(&sbi->s_dirs_counter);
3188 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3189failed_mount3: 3640failed_mount3:
3190 if (sbi->s_flex_groups) { 3641 if (sbi->s_flex_groups) {
3191 if (is_vmalloc_addr(sbi->s_flex_groups)) 3642 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3193,6 +3644,10 @@ failed_mount3:
3193 else 3644 else
3194 kfree(sbi->s_flex_groups); 3645 kfree(sbi->s_flex_groups);
3195 } 3646 }
3647 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3648 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3649 percpu_counter_destroy(&sbi->s_dirs_counter);
3650 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3196failed_mount2: 3651failed_mount2:
3197 for (i = 0; i < db_count; i++) 3652 for (i = 0; i < db_count; i++)
3198 brelse(sbi->s_group_desc[i]); 3653 brelse(sbi->s_group_desc[i]);
@@ -3211,7 +3666,6 @@ out_fail:
3211 sb->s_fs_info = NULL; 3666 sb->s_fs_info = NULL;
3212 kfree(sbi->s_blockgroup_lock); 3667 kfree(sbi->s_blockgroup_lock);
3213 kfree(sbi); 3668 kfree(sbi);
3214 lock_kernel();
3215out_free_orig: 3669out_free_orig:
3216 kfree(orig_data); 3670 kfree(orig_data);
3217 return ret; 3671 return ret;
@@ -3468,7 +3922,7 @@ static int ext4_load_journal(struct super_block *sb,
3468 EXT4_SB(sb)->s_journal = journal; 3922 EXT4_SB(sb)->s_journal = journal;
3469 ext4_clear_journal_err(sb, es); 3923 ext4_clear_journal_err(sb, es);
3470 3924
3471 if (journal_devnum && 3925 if (!really_read_only && journal_devnum &&
3472 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 3926 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3473 es->s_journal_dev = cpu_to_le32(journal_devnum); 3927 es->s_journal_dev = cpu_to_le32(journal_devnum);
3474 3928
@@ -3522,9 +3976,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3522 es->s_kbytes_written = 3976 es->s_kbytes_written =
3523 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 3977 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3524 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3978 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3525 &EXT4_SB(sb)->s_freeblocks_counter)); 3979 &EXT4_SB(sb)->s_freeblocks_counter));
3526 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3980 es->s_free_inodes_count =
3527 &EXT4_SB(sb)->s_freeinodes_counter)); 3981 cpu_to_le32(percpu_counter_sum_positive(
3982 &EXT4_SB(sb)->s_freeinodes_counter));
3528 sb->s_dirt = 0; 3983 sb->s_dirt = 0;
3529 BUFFER_TRACE(sbh, "marking dirty"); 3984 BUFFER_TRACE(sbh, "marking dirty");
3530 mark_buffer_dirty(sbh); 3985 mark_buffer_dirty(sbh);
@@ -3720,8 +4175,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3720#endif 4175#endif
3721 char *orig_data = kstrdup(data, GFP_KERNEL); 4176 char *orig_data = kstrdup(data, GFP_KERNEL);
3722 4177
3723 lock_kernel();
3724
3725 /* Store the original options */ 4178 /* Store the original options */
3726 lock_super(sb); 4179 lock_super(sb);
3727 old_sb_flags = sb->s_flags; 4180 old_sb_flags = sb->s_flags;
@@ -3844,6 +4297,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3844 enable_quota = 1; 4297 enable_quota = 1;
3845 } 4298 }
3846 } 4299 }
4300
4301 /*
4302 * Reinitialize lazy itable initialization thread based on
4303 * current settings
4304 */
4305 if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
4306 ext4_unregister_li_request(sb);
4307 else {
4308 ext4_group_t first_not_zeroed;
4309 first_not_zeroed = ext4_has_uninit_itable(sb);
4310 ext4_register_li_request(sb, first_not_zeroed);
4311 }
4312
3847 ext4_setup_system_zone(sb); 4313 ext4_setup_system_zone(sb);
3848 if (sbi->s_journal == NULL) 4314 if (sbi->s_journal == NULL)
3849 ext4_commit_super(sb, 1); 4315 ext4_commit_super(sb, 1);
@@ -3856,7 +4322,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3856 kfree(old_opts.s_qf_names[i]); 4322 kfree(old_opts.s_qf_names[i]);
3857#endif 4323#endif
3858 unlock_super(sb); 4324 unlock_super(sb);
3859 unlock_kernel();
3860 if (enable_quota) 4325 if (enable_quota)
3861 dquot_resume(sb, -1); 4326 dquot_resume(sb, -1);
3862 4327
@@ -3882,7 +4347,6 @@ restore_opts:
3882 } 4347 }
3883#endif 4348#endif
3884 unlock_super(sb); 4349 unlock_super(sb);
3885 unlock_kernel();
3886 kfree(orig_data); 4350 kfree(orig_data);
3887 return err; 4351 return err;
3888} 4352}
@@ -4116,12 +4580,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4116 4580
4117static int ext4_quota_off(struct super_block *sb, int type) 4581static int ext4_quota_off(struct super_block *sb, int type)
4118{ 4582{
4119 /* Force all delayed allocation blocks to be allocated */ 4583 /* Force all delayed allocation blocks to be allocated.
4120 if (test_opt(sb, DELALLOC)) { 4584 * Caller already holds s_umount sem */
4121 down_read(&sb->s_umount); 4585 if (test_opt(sb, DELALLOC))
4122 sync_filesystem(sb); 4586 sync_filesystem(sb);
4123 up_read(&sb->s_umount);
4124 }
4125 4587
4126 return dquot_quota_off(sb, type); 4588 return dquot_quota_off(sb, type);
4127} 4589}
@@ -4227,17 +4689,17 @@ out:
4227 4689
4228#endif 4690#endif
4229 4691
4230static int ext4_get_sb(struct file_system_type *fs_type, int flags, 4692static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4231 const char *dev_name, void *data, struct vfsmount *mnt) 4693 const char *dev_name, void *data)
4232{ 4694{
4233 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4695 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
4234} 4696}
4235 4697
4236#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4698#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4237static struct file_system_type ext2_fs_type = { 4699static struct file_system_type ext2_fs_type = {
4238 .owner = THIS_MODULE, 4700 .owner = THIS_MODULE,
4239 .name = "ext2", 4701 .name = "ext2",
4240 .get_sb = ext4_get_sb, 4702 .mount = ext4_mount,
4241 .kill_sb = kill_block_super, 4703 .kill_sb = kill_block_super,
4242 .fs_flags = FS_REQUIRES_DEV, 4704 .fs_flags = FS_REQUIRES_DEV,
4243}; 4705};
@@ -4282,28 +4744,58 @@ static inline void unregister_as_ext3(void) { }
4282static struct file_system_type ext4_fs_type = { 4744static struct file_system_type ext4_fs_type = {
4283 .owner = THIS_MODULE, 4745 .owner = THIS_MODULE,
4284 .name = "ext4", 4746 .name = "ext4",
4285 .get_sb = ext4_get_sb, 4747 .mount = ext4_mount,
4286 .kill_sb = kill_block_super, 4748 .kill_sb = kill_block_super,
4287 .fs_flags = FS_REQUIRES_DEV, 4749 .fs_flags = FS_REQUIRES_DEV,
4288}; 4750};
4289 4751
4290static int __init init_ext4_fs(void) 4752int __init ext4_init_feat_adverts(void)
4753{
4754 struct ext4_features *ef;
4755 int ret = -ENOMEM;
4756
4757 ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
4758 if (!ef)
4759 goto out;
4760
4761 ef->f_kobj.kset = ext4_kset;
4762 init_completion(&ef->f_kobj_unregister);
4763 ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
4764 "features");
4765 if (ret) {
4766 kfree(ef);
4767 goto out;
4768 }
4769
4770 ext4_feat = ef;
4771 ret = 0;
4772out:
4773 return ret;
4774}
4775
4776static int __init ext4_init_fs(void)
4291{ 4777{
4292 int err; 4778 int err;
4293 4779
4294 ext4_check_flag_values(); 4780 ext4_check_flag_values();
4295 err = init_ext4_system_zone(); 4781 err = ext4_init_pageio();
4296 if (err) 4782 if (err)
4297 return err; 4783 return err;
4784 err = ext4_init_system_zone();
4785 if (err)
4786 goto out5;
4298 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4787 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4299 if (!ext4_kset) 4788 if (!ext4_kset)
4300 goto out4; 4789 goto out4;
4301 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4790 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4302 err = init_ext4_mballoc(); 4791
4792 err = ext4_init_feat_adverts();
4793
4794 err = ext4_init_mballoc();
4303 if (err) 4795 if (err)
4304 goto out3; 4796 goto out3;
4305 4797
4306 err = init_ext4_xattr(); 4798 err = ext4_init_xattr();
4307 if (err) 4799 if (err)
4308 goto out2; 4800 goto out2;
4309 err = init_inodecache(); 4801 err = init_inodecache();
@@ -4314,38 +4806,46 @@ static int __init init_ext4_fs(void)
4314 err = register_filesystem(&ext4_fs_type); 4806 err = register_filesystem(&ext4_fs_type);
4315 if (err) 4807 if (err)
4316 goto out; 4808 goto out;
4809
4810 ext4_li_info = NULL;
4811 mutex_init(&ext4_li_mtx);
4317 return 0; 4812 return 0;
4318out: 4813out:
4319 unregister_as_ext2(); 4814 unregister_as_ext2();
4320 unregister_as_ext3(); 4815 unregister_as_ext3();
4321 destroy_inodecache(); 4816 destroy_inodecache();
4322out1: 4817out1:
4323 exit_ext4_xattr(); 4818 ext4_exit_xattr();
4324out2: 4819out2:
4325 exit_ext4_mballoc(); 4820 ext4_exit_mballoc();
4326out3: 4821out3:
4822 kfree(ext4_feat);
4327 remove_proc_entry("fs/ext4", NULL); 4823 remove_proc_entry("fs/ext4", NULL);
4328 kset_unregister(ext4_kset); 4824 kset_unregister(ext4_kset);
4329out4: 4825out4:
4330 exit_ext4_system_zone(); 4826 ext4_exit_system_zone();
4827out5:
4828 ext4_exit_pageio();
4331 return err; 4829 return err;
4332} 4830}
4333 4831
4334static void __exit exit_ext4_fs(void) 4832static void __exit ext4_exit_fs(void)
4335{ 4833{
4834 ext4_destroy_lazyinit_thread();
4336 unregister_as_ext2(); 4835 unregister_as_ext2();
4337 unregister_as_ext3(); 4836 unregister_as_ext3();
4338 unregister_filesystem(&ext4_fs_type); 4837 unregister_filesystem(&ext4_fs_type);
4339 destroy_inodecache(); 4838 destroy_inodecache();
4340 exit_ext4_xattr(); 4839 ext4_exit_xattr();
4341 exit_ext4_mballoc(); 4840 ext4_exit_mballoc();
4342 remove_proc_entry("fs/ext4", NULL); 4841 remove_proc_entry("fs/ext4", NULL);
4343 kset_unregister(ext4_kset); 4842 kset_unregister(ext4_kset);
4344 exit_ext4_system_zone(); 4843 ext4_exit_system_zone();
4844 ext4_exit_pageio();
4345} 4845}
4346 4846
4347MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 4847MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
4348MODULE_DESCRIPTION("Fourth Extended Filesystem"); 4848MODULE_DESCRIPTION("Fourth Extended Filesystem");
4349MODULE_LICENSE("GPL"); 4849MODULE_LICENSE("GPL");
4350module_init(init_ext4_fs) 4850module_init(ext4_init_fs)
4351module_exit(exit_ext4_fs) 4851module_exit(ext4_exit_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a8cd8dff1ad..fa4b899da4b3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1588#undef BLOCK_HASH_SHIFT 1588#undef BLOCK_HASH_SHIFT
1589 1589
1590int __init 1590int __init
1591init_ext4_xattr(void) 1591ext4_init_xattr(void)
1592{ 1592{
1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); 1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
1594 if (!ext4_xattr_cache) 1594 if (!ext4_xattr_cache)
@@ -1597,7 +1597,7 @@ init_ext4_xattr(void)
1597} 1597}
1598 1598
1599void 1599void
1600exit_ext4_xattr(void) 1600ext4_exit_xattr(void)
1601{ 1601{
1602 if (ext4_xattr_cache) 1602 if (ext4_xattr_cache)
1603 mb_cache_destroy(ext4_xattr_cache); 1603 mb_cache_destroy(ext4_xattr_cache);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e43905..1ef16520b950 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
84 struct ext4_inode *raw_inode, handle_t *handle); 84 struct ext4_inode *raw_inode, handle_t *handle);
85 85
86extern int init_ext4_xattr(void); 86extern int __init ext4_init_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void ext4_exit_xattr(void);
88 88
89extern const struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
121{ 121{
122} 122}
123 123
124static inline int 124static __init inline int
125init_ext4_xattr(void) 125ext4_init_xattr(void)
126{ 126{
127 return 0; 127 return 0;
128} 128}
129 129
130static inline void 130static inline void
131exit_ext4_xattr(void) 131ext4_exit_xattr(void)
132{ 132{
133} 133}
134 134
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a3..b47d2c9f4fa1 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,8 @@ int fat_free_clusters(struct inode *inode, int cluster)
577 577
578 sb_issue_discard(sb, 578 sb_issue_discard(sb,
579 fat_clus_to_blknr(sbi, first_cl), 579 fat_clus_to_blknr(sbi, first_cl),
580 nr_clus * sbi->sec_per_clus); 580 nr_clus * sbi->sec_per_clus,
581 GFP_NOFS, 0);
581 582
582 first_cl = cluster; 583 first_cl = cluster;
583 } 584 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 830058057d33..ad6998a92c30 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -14,7 +14,6 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/seq_file.h> 17#include <linux/seq_file.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include <linux/mpage.h> 19#include <linux/mpage.h>
@@ -489,8 +488,6 @@ static void fat_put_super(struct super_block *sb)
489{ 488{
490 struct msdos_sb_info *sbi = MSDOS_SB(sb); 489 struct msdos_sb_info *sbi = MSDOS_SB(sb);
491 490
492 lock_kernel();
493
494 if (sb->s_dirt) 491 if (sb->s_dirt)
495 fat_write_super(sb); 492 fat_write_super(sb);
496 493
@@ -504,8 +501,6 @@ static void fat_put_super(struct super_block *sb)
504 501
505 sb->s_fs_info = NULL; 502 sb->s_fs_info = NULL;
506 kfree(sbi); 503 kfree(sbi);
507
508 unlock_kernel();
509} 504}
510 505
511static struct kmem_cache *fat_inode_cachep; 506static struct kmem_cache *fat_inode_cachep;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1736f2356388..970e682ea754 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -255,10 +255,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
255 255
256 for (i = 0; i < nr_bhs; i++) { 256 for (i = 0; i < nr_bhs; i++) {
257 wait_on_buffer(bhs[i]); 257 wait_on_buffer(bhs[i]);
258 if (buffer_eopnotsupp(bhs[i])) { 258 if (!err && !buffer_uptodate(bhs[i]))
259 clear_buffer_eopnotsupp(bhs[i]);
260 err = -EOPNOTSUPP;
261 } else if (!err && !buffer_uptodate(bhs[i]))
262 err = -EIO; 259 err = -EIO;
263 } 260 }
264 return err; 261 return err;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbc94ae4fd77..3345aabd1dd7 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -662,27 +662,30 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
662{ 662{
663 int res; 663 int res;
664 664
665 lock_super(sb);
665 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0); 666 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
666 if (res) 667 if (res) {
668 unlock_super(sb);
667 return res; 669 return res;
670 }
668 671
669 sb->s_flags |= MS_NOATIME; 672 sb->s_flags |= MS_NOATIME;
670 sb->s_root->d_op = &msdos_dentry_operations; 673 sb->s_root->d_op = &msdos_dentry_operations;
674 unlock_super(sb);
671 return 0; 675 return 0;
672} 676}
673 677
674static int msdos_get_sb(struct file_system_type *fs_type, 678static struct dentry *msdos_mount(struct file_system_type *fs_type,
675 int flags, const char *dev_name, 679 int flags, const char *dev_name,
676 void *data, struct vfsmount *mnt) 680 void *data)
677{ 681{
678 return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super, 682 return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
679 mnt);
680} 683}
681 684
682static struct file_system_type msdos_fs_type = { 685static struct file_system_type msdos_fs_type = {
683 .owner = THIS_MODULE, 686 .owner = THIS_MODULE,
684 .name = "msdos", 687 .name = "msdos",
685 .get_sb = msdos_get_sb, 688 .mount = msdos_mount,
686 .kill_sb = kill_block_super, 689 .kill_sb = kill_block_super,
687 .fs_flags = FS_REQUIRES_DEV, 690 .fs_flags = FS_REQUIRES_DEV,
688}; 691};
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6fcc7e71fbaa..b936703b8924 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1055,30 +1055,33 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1055{ 1055{
1056 int res; 1056 int res;
1057 1057
1058 lock_super(sb);
1058 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1); 1059 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
1059 if (res) 1060 if (res) {
1061 unlock_super(sb);
1060 return res; 1062 return res;
1063 }
1061 1064
1062 if (MSDOS_SB(sb)->options.name_check != 's') 1065 if (MSDOS_SB(sb)->options.name_check != 's')
1063 sb->s_root->d_op = &vfat_ci_dentry_ops; 1066 sb->s_root->d_op = &vfat_ci_dentry_ops;
1064 else 1067 else
1065 sb->s_root->d_op = &vfat_dentry_ops; 1068 sb->s_root->d_op = &vfat_dentry_ops;
1066 1069
1070 unlock_super(sb);
1067 return 0; 1071 return 0;
1068} 1072}
1069 1073
1070static int vfat_get_sb(struct file_system_type *fs_type, 1074static struct dentry *vfat_mount(struct file_system_type *fs_type,
1071 int flags, const char *dev_name, 1075 int flags, const char *dev_name,
1072 void *data, struct vfsmount *mnt) 1076 void *data)
1073{ 1077{
1074 return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super, 1078 return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
1075 mnt);
1076} 1079}
1077 1080
1078static struct file_system_type vfat_fs_type = { 1081static struct file_system_type vfat_fs_type = {
1079 .owner = THIS_MODULE, 1082 .owner = THIS_MODULE,
1080 .name = "vfat", 1083 .name = "vfat",
1081 .get_sb = vfat_get_sb, 1084 .mount = vfat_mount,
1082 .kill_sb = kill_block_super, 1085 .kill_sb = kill_block_super,
1083 .fs_flags = FS_REQUIRES_DEV, 1086 .fs_flags = FS_REQUIRES_DEV,
1084}; 1087};
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f8cc34f542c3..ecc8b3954ed6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head)
640 * match the state "is the filp on a fasync list". 640 * match the state "is the filp on a fasync list".
641 * 641 *
642 */ 642 */
643static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 643int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
644{ 644{
645 struct fasync_struct *fa, **fp; 645 struct fasync_struct *fa, **fp;
646 int result = 0; 646 int result = 0;
@@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
666 return result; 666 return result;
667} 667}
668 668
669struct fasync_struct *fasync_alloc(void)
670{
671 return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
672}
673
669/* 674/*
670 * Add a fasync entry. Return negative on error, positive if 675 * NOTE! This can be used only for unused fasync entries:
671 * added, and zero if did nothing but change an existing one. 676 * entries that actually got inserted on the fasync list
677 * need to be released by rcu - see fasync_remove_entry.
678 */
679void fasync_free(struct fasync_struct *new)
680{
681 kmem_cache_free(fasync_cache, new);
682}
683
684/*
685 * Insert a new entry into the fasync list. Return the pointer to the
686 * old one if we didn't use the new one.
672 * 687 *
673 * NOTE! It is very important that the FASYNC flag always 688 * NOTE! It is very important that the FASYNC flag always
674 * match the state "is the filp on a fasync list". 689 * match the state "is the filp on a fasync list".
675 */ 690 */
676static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) 691struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
677{ 692{
678 struct fasync_struct *new, *fa, **fp; 693 struct fasync_struct *fa, **fp;
679 int result = 0;
680
681 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
682 if (!new)
683 return -ENOMEM;
684 694
685 spin_lock(&filp->f_lock); 695 spin_lock(&filp->f_lock);
686 spin_lock(&fasync_lock); 696 spin_lock(&fasync_lock);
@@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
691 spin_lock_irq(&fa->fa_lock); 701 spin_lock_irq(&fa->fa_lock);
692 fa->fa_fd = fd; 702 fa->fa_fd = fd;
693 spin_unlock_irq(&fa->fa_lock); 703 spin_unlock_irq(&fa->fa_lock);
694
695 kmem_cache_free(fasync_cache, new);
696 goto out; 704 goto out;
697 } 705 }
698 706
@@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
702 new->fa_fd = fd; 710 new->fa_fd = fd;
703 new->fa_next = *fapp; 711 new->fa_next = *fapp;
704 rcu_assign_pointer(*fapp, new); 712 rcu_assign_pointer(*fapp, new);
705 result = 1;
706 filp->f_flags |= FASYNC; 713 filp->f_flags |= FASYNC;
707 714
708out: 715out:
709 spin_unlock(&fasync_lock); 716 spin_unlock(&fasync_lock);
710 spin_unlock(&filp->f_lock); 717 spin_unlock(&filp->f_lock);
711 return result; 718 return fa;
719}
720
721/*
722 * Add a fasync entry. Return negative on error, positive if
723 * added, and zero if did nothing but change an existing one.
724 */
725static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
726{
727 struct fasync_struct *new;
728
729 new = fasync_alloc();
730 if (!new)
731 return -ENOMEM;
732
733 /*
734 * fasync_insert_entry() returns the old (update) entry if
735 * it existed.
736 *
737 * So free the (unused) new entry and return 0 to let the
738 * caller know that we didn't add any new fasync entries.
739 */
740 if (fasync_insert_entry(fd, filp, fapp, new)) {
741 fasync_free(new);
742 return 0;
743 }
744
745 return 1;
712} 746}
713 747
714/* 748/*
diff --git a/fs/fifo.c b/fs/fifo.c
index 5d6606ffc2d2..4e303c22d5ee 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -151,4 +151,5 @@ err_nocleanup:
151 */ 151 */
152const struct file_operations def_fifo_fops = { 152const struct file_operations def_fifo_fops = {
153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */ 153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */
154 .llseek = noop_llseek,
154}; 155};
diff --git a/fs/file_table.c b/fs/file_table.c
index a04bdd81c11c..c3dee381f1b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
60/* 60/*
61 * Return the total number of open files in the system 61 * Return the total number of open files in the system
62 */ 62 */
63static int get_nr_files(void) 63static long get_nr_files(void)
64{ 64{
65 return percpu_counter_read_positive(&nr_files); 65 return percpu_counter_read_positive(&nr_files);
66} 66}
@@ -68,7 +68,7 @@ static int get_nr_files(void)
68/* 68/*
69 * Return the maximum number of open files in the system 69 * Return the maximum number of open files in the system
70 */ 70 */
71int get_max_files(void) 71unsigned long get_max_files(void)
72{ 72{
73 return files_stat.max_files; 73 return files_stat.max_files;
74} 74}
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
82 void __user *buffer, size_t *lenp, loff_t *ppos) 82 void __user *buffer, size_t *lenp, loff_t *ppos)
83{ 83{
84 files_stat.nr_files = get_nr_files(); 84 files_stat.nr_files = get_nr_files();
85 return proc_dointvec(table, write, buffer, lenp, ppos); 85 return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
86} 86}
87#else 87#else
88int proc_nr_files(ctl_table *table, int write, 88int proc_nr_files(ctl_table *table, int write,
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
105struct file *get_empty_filp(void) 105struct file *get_empty_filp(void)
106{ 106{
107 const struct cred *cred = current_cred(); 107 const struct cred *cred = current_cred();
108 static int old_max; 108 static long old_max;
109 struct file * f; 109 struct file * f;
110 110
111 /* 111 /*
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
140over: 140over:
141 /* Ran out of filps - report that */ 141 /* Ran out of filps - report that */
142 if (get_nr_files() > old_max) { 142 if (get_nr_files() > old_max) {
143 printk(KERN_INFO "VFS: file-max limit %d reached\n", 143 pr_info("VFS: file-max limit %lu reached\n", get_max_files());
144 get_max_files());
145 old_max = get_nr_files(); 144 old_max = get_nr_files();
146 } 145 }
147 goto fail; 146 goto fail;
@@ -487,7 +486,7 @@ retry:
487 486
488void __init files_init(unsigned long mempages) 487void __init files_init(unsigned long mempages)
489{ 488{
490 int n; 489 unsigned long n;
491 490
492 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, 491 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
493 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 492 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
498 */ 497 */
499 498
500 n = (mempages * (PAGE_SIZE / 1024)) / 10; 499 n = (mempages * (PAGE_SIZE / 1024)) / 10;
501 files_stat.max_files = n; 500 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
502 if (files_stat.max_files < NR_FILE)
503 files_stat.max_files = NR_FILE;
504 files_defer_init(); 501 files_defer_init();
505 lg_lock_init(files_lglock); 502 lg_lock_init(files_lglock);
506 percpu_counter_init(&nr_files, 0); 503 percpu_counter_init(&nr_files, 0);
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 79d1b4ea13e7..8c04eac5079d 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
260 struct inode *ip = NULL; 260 struct inode *ip = NULL;
261 261
262 if ((ip = new_inode(sbp))) { 262 if ((ip = new_inode(sbp))) {
263 ip->i_ino = get_next_ino();
263 vxfs_iinit(ip, vip); 264 vxfs_iinit(ip, vip);
264 ip->i_mapping->a_ops = &vxfs_aops; 265 ip->i_mapping->a_ops = &vxfs_aops;
265 } 266 }
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 0ec7bb2c95c6..6c5131d592f0 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -36,7 +36,6 @@
36#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/kernel.h> 37#include <linux/kernel.h>
38#include <linux/pagemap.h> 38#include <linux/pagemap.h>
39#include <linux/smp_lock.h>
40 39
41#include "vxfs.h" 40#include "vxfs.h"
42#include "vxfs_dir.h" 41#include "vxfs_dir.h"
@@ -212,16 +211,12 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd)
212 if (dp->d_name.len > VXFS_NAMELEN) 211 if (dp->d_name.len > VXFS_NAMELEN)
213 return ERR_PTR(-ENAMETOOLONG); 212 return ERR_PTR(-ENAMETOOLONG);
214 213
215 lock_kernel();
216 ino = vxfs_inode_by_name(dip, dp); 214 ino = vxfs_inode_by_name(dip, dp);
217 if (ino) { 215 if (ino) {
218 ip = vxfs_iget(dip->i_sb, ino); 216 ip = vxfs_iget(dip->i_sb, ino);
219 if (IS_ERR(ip)) { 217 if (IS_ERR(ip))
220 unlock_kernel();
221 return ERR_CAST(ip); 218 return ERR_CAST(ip);
222 }
223 } 219 }
224 unlock_kernel();
225 d_add(dp, ip); 220 d_add(dp, ip);
226 return NULL; 221 return NULL;
227} 222}
@@ -248,8 +243,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
248 u_long page, npages, block, pblocks, nblocks, offset; 243 u_long page, npages, block, pblocks, nblocks, offset;
249 loff_t pos; 244 loff_t pos;
250 245
251 lock_kernel();
252
253 switch ((long)fp->f_pos) { 246 switch ((long)fp->f_pos) {
254 case 0: 247 case 0:
255 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) 248 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
@@ -265,10 +258,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
265 258
266 pos = fp->f_pos - 2; 259 pos = fp->f_pos - 2;
267 260
268 if (pos > VXFS_DIRROUND(ip->i_size)) { 261 if (pos > VXFS_DIRROUND(ip->i_size))
269 unlock_kernel();
270 return 0; 262 return 0;
271 }
272 263
273 npages = dir_pages(ip); 264 npages = dir_pages(ip);
274 nblocks = dir_blocks(ip); 265 nblocks = dir_blocks(ip);
@@ -327,6 +318,5 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
327done: 318done:
328 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; 319 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
329out: 320out:
330 unlock_kernel();
331 return 0; 321 return 0;
332} 322}
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index dc0c041e85cb..9d1c99558389 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,7 +38,6 @@
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/smp_lock.h>
42#include <linux/stat.h> 41#include <linux/stat.h>
43#include <linux/vfs.h> 42#include <linux/vfs.h>
44#include <linux/mount.h> 43#include <linux/mount.h>
@@ -81,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
81{ 80{
82 struct vxfs_sb_info *infp = VXFS_SBI(sbp); 81 struct vxfs_sb_info *infp = VXFS_SBI(sbp);
83 82
84 lock_kernel();
85
86 vxfs_put_fake_inode(infp->vsi_fship); 83 vxfs_put_fake_inode(infp->vsi_fship);
87 vxfs_put_fake_inode(infp->vsi_ilist); 84 vxfs_put_fake_inode(infp->vsi_ilist);
88 vxfs_put_fake_inode(infp->vsi_stilist); 85 vxfs_put_fake_inode(infp->vsi_stilist);
89 86
90 brelse(infp->vsi_bp); 87 brelse(infp->vsi_bp);
91 kfree(infp); 88 kfree(infp);
92
93 unlock_kernel();
94} 89}
95 90
96/** 91/**
@@ -148,7 +143,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
148 * The superblock on success, else %NULL. 143 * The superblock on success, else %NULL.
149 * 144 *
150 * Locking: 145 * Locking:
151 * We are under the bkl and @sbp->s_lock. 146 * We are under @sbp->s_lock.
152 */ 147 */
153static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) 148static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
154{ 149{
@@ -251,17 +246,16 @@ out:
251/* 246/*
252 * The usual module blurb. 247 * The usual module blurb.
253 */ 248 */
254static int vxfs_get_sb(struct file_system_type *fs_type, 249static struct dentry *vxfs_mount(struct file_system_type *fs_type,
255 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 250 int flags, const char *dev_name, void *data)
256{ 251{
257 return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super, 252 return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
258 mnt);
259} 253}
260 254
261static struct file_system_type vxfs_fs_type = { 255static struct file_system_type vxfs_fs_type = {
262 .owner = THIS_MODULE, 256 .owner = THIS_MODULE,
263 .name = "vxfs", 257 .name = "vxfs",
264 .get_sb = vxfs_get_sb, 258 .mount = vxfs_mount,
265 .kill_sb = kill_block_super, 259 .kill_sb = kill_block_super,
266 .fs_flags = FS_REQUIRES_DEV, 260 .fs_flags = FS_REQUIRES_DEV,
267}; 261};
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ab38fef1c9a1..3d06ccc953aa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,6 +79,11 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
79 return sb->s_bdi; 79 return sb->s_bdi;
80} 80}
81 81
82static inline struct inode *wb_inode(struct list_head *head)
83{
84 return list_entry(head, struct inode, i_wb_list);
85}
86
82static void bdi_queue_work(struct backing_dev_info *bdi, 87static void bdi_queue_work(struct backing_dev_info *bdi,
83 struct wb_writeback_work *work) 88 struct wb_writeback_work *work)
84{ 89{
@@ -172,11 +177,11 @@ static void redirty_tail(struct inode *inode)
172 if (!list_empty(&wb->b_dirty)) { 177 if (!list_empty(&wb->b_dirty)) {
173 struct inode *tail; 178 struct inode *tail;
174 179
175 tail = list_entry(wb->b_dirty.next, struct inode, i_list); 180 tail = wb_inode(wb->b_dirty.next);
176 if (time_before(inode->dirtied_when, tail->dirtied_when)) 181 if (time_before(inode->dirtied_when, tail->dirtied_when))
177 inode->dirtied_when = jiffies; 182 inode->dirtied_when = jiffies;
178 } 183 }
179 list_move(&inode->i_list, &wb->b_dirty); 184 list_move(&inode->i_wb_list, &wb->b_dirty);
180} 185}
181 186
182/* 187/*
@@ -186,7 +191,7 @@ static void requeue_io(struct inode *inode)
186{ 191{
187 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 192 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
188 193
189 list_move(&inode->i_list, &wb->b_more_io); 194 list_move(&inode->i_wb_list, &wb->b_more_io);
190} 195}
191 196
192static void inode_sync_complete(struct inode *inode) 197static void inode_sync_complete(struct inode *inode)
@@ -227,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
227 int do_sb_sort = 0; 232 int do_sb_sort = 0;
228 233
229 while (!list_empty(delaying_queue)) { 234 while (!list_empty(delaying_queue)) {
230 inode = list_entry(delaying_queue->prev, struct inode, i_list); 235 inode = wb_inode(delaying_queue->prev);
231 if (older_than_this && 236 if (older_than_this &&
232 inode_dirtied_after(inode, *older_than_this)) 237 inode_dirtied_after(inode, *older_than_this))
233 break; 238 break;
234 if (sb && sb != inode->i_sb) 239 if (sb && sb != inode->i_sb)
235 do_sb_sort = 1; 240 do_sb_sort = 1;
236 sb = inode->i_sb; 241 sb = inode->i_sb;
237 list_move(&inode->i_list, &tmp); 242 list_move(&inode->i_wb_list, &tmp);
238 } 243 }
239 244
240 /* just one sb in list, splice to dispatch_queue and we're done */ 245 /* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +250,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
245 250
246 /* Move inodes from one superblock together */ 251 /* Move inodes from one superblock together */
247 while (!list_empty(&tmp)) { 252 while (!list_empty(&tmp)) {
248 inode = list_entry(tmp.prev, struct inode, i_list); 253 sb = wb_inode(tmp.prev)->i_sb;
249 sb = inode->i_sb;
250 list_for_each_prev_safe(pos, node, &tmp) { 254 list_for_each_prev_safe(pos, node, &tmp) {
251 inode = list_entry(pos, struct inode, i_list); 255 inode = wb_inode(pos);
252 if (inode->i_sb == sb) 256 if (inode->i_sb == sb)
253 list_move(&inode->i_list, dispatch_queue); 257 list_move(&inode->i_wb_list, dispatch_queue);
254 } 258 }
255 } 259 }
256} 260}
@@ -408,16 +412,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
408 * completion. 412 * completion.
409 */ 413 */
410 redirty_tail(inode); 414 redirty_tail(inode);
411 } else if (atomic_read(&inode->i_count)) {
412 /*
413 * The inode is clean, inuse
414 */
415 list_move(&inode->i_list, &inode_in_use);
416 } else { 415 } else {
417 /* 416 /*
418 * The inode is clean, unused 417 * The inode is clean. At this point we either have
418 * a reference to the inode or it's on it's way out.
419 * No need to add it back to the LRU.
419 */ 420 */
420 list_move(&inode->i_list, &inode_unused); 421 list_del_init(&inode->i_wb_list);
421 } 422 }
422 } 423 }
423 inode_sync_complete(inode); 424 inode_sync_complete(inode);
@@ -465,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
465{ 466{
466 while (!list_empty(&wb->b_io)) { 467 while (!list_empty(&wb->b_io)) {
467 long pages_skipped; 468 long pages_skipped;
468 struct inode *inode = list_entry(wb->b_io.prev, 469 struct inode *inode = wb_inode(wb->b_io.prev);
469 struct inode, i_list);
470 470
471 if (inode->i_sb != sb) { 471 if (inode->i_sb != sb) {
472 if (only_this_sb) { 472 if (only_this_sb) {
@@ -487,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
487 return 0; 487 return 0;
488 } 488 }
489 489
490 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 490 /*
491 * Don't bother with new inodes or inodes beeing freed, first
492 * kind does not need peridic writeout yet, and for the latter
493 * kind writeout is handled by the freer.
494 */
495 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
491 requeue_io(inode); 496 requeue_io(inode);
492 continue; 497 continue;
493 } 498 }
499
494 /* 500 /*
495 * Was this inode dirtied after sync_sb_inodes was called? 501 * Was this inode dirtied after sync_sb_inodes was called?
496 * This keeps sync from extra jobs and livelock. 502 * This keeps sync from extra jobs and livelock.
@@ -498,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
498 if (inode_dirtied_after(inode, wbc->wb_start)) 504 if (inode_dirtied_after(inode, wbc->wb_start))
499 return 1; 505 return 1;
500 506
501 BUG_ON(inode->i_state & I_FREEING);
502 __iget(inode); 507 __iget(inode);
503 pages_skipped = wbc->pages_skipped; 508 pages_skipped = wbc->pages_skipped;
504 writeback_single_inode(inode, wbc); 509 writeback_single_inode(inode, wbc);
@@ -536,8 +541,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
536 queue_io(wb, wbc->older_than_this); 541 queue_io(wb, wbc->older_than_this);
537 542
538 while (!list_empty(&wb->b_io)) { 543 while (!list_empty(&wb->b_io)) {
539 struct inode *inode = list_entry(wb->b_io.prev, 544 struct inode *inode = wb_inode(wb->b_io.prev);
540 struct inode, i_list);
541 struct super_block *sb = inode->i_sb; 545 struct super_block *sb = inode->i_sb;
542 546
543 if (!pin_sb_for_writeback(sb)) { 547 if (!pin_sb_for_writeback(sb)) {
@@ -582,7 +586,7 @@ static inline bool over_bground_thresh(void)
582 global_dirty_limits(&background_thresh, &dirty_thresh); 586 global_dirty_limits(&background_thresh, &dirty_thresh);
583 587
584 return (global_page_state(NR_FILE_DIRTY) + 588 return (global_page_state(NR_FILE_DIRTY) +
585 global_page_state(NR_UNSTABLE_NFS) >= background_thresh); 589 global_page_state(NR_UNSTABLE_NFS) > background_thresh);
586} 590}
587 591
588/* 592/*
@@ -675,8 +679,7 @@ static long wb_writeback(struct bdi_writeback *wb,
675 */ 679 */
676 spin_lock(&inode_lock); 680 spin_lock(&inode_lock);
677 if (!list_empty(&wb->b_more_io)) { 681 if (!list_empty(&wb->b_more_io)) {
678 inode = list_entry(wb->b_more_io.prev, 682 inode = wb_inode(wb->b_more_io.prev);
679 struct inode, i_list);
680 trace_wbc_writeback_wait(&wbc, wb->bdi); 683 trace_wbc_writeback_wait(&wbc, wb->bdi);
681 inode_wait_for_writeback(inode); 684 inode_wait_for_writeback(inode);
682 } 685 }
@@ -704,6 +707,17 @@ get_next_work_item(struct backing_dev_info *bdi)
704 return work; 707 return work;
705} 708}
706 709
710/*
711 * Add in the number of potentially dirty inodes, because each inode
712 * write can dirty pagecache in the underlying blockdev.
713 */
714static unsigned long get_nr_dirty_pages(void)
715{
716 return global_page_state(NR_FILE_DIRTY) +
717 global_page_state(NR_UNSTABLE_NFS) +
718 get_nr_dirty_inodes();
719}
720
707static long wb_check_old_data_flush(struct bdi_writeback *wb) 721static long wb_check_old_data_flush(struct bdi_writeback *wb)
708{ 722{
709 unsigned long expired; 723 unsigned long expired;
@@ -721,9 +735,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
721 return 0; 735 return 0;
722 736
723 wb->last_old_flush = jiffies; 737 wb->last_old_flush = jiffies;
724 nr_pages = global_page_state(NR_FILE_DIRTY) + 738 nr_pages = get_nr_dirty_pages();
725 global_page_state(NR_UNSTABLE_NFS) +
726 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
727 739
728 if (nr_pages) { 740 if (nr_pages) {
729 struct wb_writeback_work work = { 741 struct wb_writeback_work work = {
@@ -790,7 +802,7 @@ int bdi_writeback_thread(void *data)
790 struct backing_dev_info *bdi = wb->bdi; 802 struct backing_dev_info *bdi = wb->bdi;
791 long pages_written; 803 long pages_written;
792 804
793 current->flags |= PF_FLUSHER | PF_SWAPWRITE; 805 current->flags |= PF_SWAPWRITE;
794 set_freezable(); 806 set_freezable();
795 wb->last_active = jiffies; 807 wb->last_active = jiffies;
796 808
@@ -962,7 +974,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
962 * dirty list. Add blockdev inodes as well. 974 * dirty list. Add blockdev inodes as well.
963 */ 975 */
964 if (!S_ISBLK(inode->i_mode)) { 976 if (!S_ISBLK(inode->i_mode)) {
965 if (hlist_unhashed(&inode->i_hash)) 977 if (inode_unhashed(inode))
966 goto out; 978 goto out;
967 } 979 }
968 if (inode->i_state & I_FREEING) 980 if (inode->i_state & I_FREEING)
@@ -990,7 +1002,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
990 } 1002 }
991 1003
992 inode->dirtied_when = jiffies; 1004 inode->dirtied_when = jiffies;
993 list_move(&inode->i_list, &bdi->wb.b_dirty); 1005 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
994 } 1006 }
995 } 1007 }
996out: 1008out:
@@ -1069,33 +1081,42 @@ static void wait_sb_inodes(struct super_block *sb)
1069} 1081}
1070 1082
1071/** 1083/**
1072 * writeback_inodes_sb - writeback dirty inodes from given super_block 1084 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
1073 * @sb: the superblock 1085 * @sb: the superblock
1086 * @nr: the number of pages to write
1074 * 1087 *
1075 * Start writeback on some inodes on this super_block. No guarantees are made 1088 * Start writeback on some inodes on this super_block. No guarantees are made
1076 * on how many (if any) will be written, and this function does not wait 1089 * on how many (if any) will be written, and this function does not wait
1077 * for IO completion of submitted IO. The number of pages submitted is 1090 * for IO completion of submitted IO.
1078 * returned.
1079 */ 1091 */
1080void writeback_inodes_sb(struct super_block *sb) 1092void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1081{ 1093{
1082 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1083 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1084 DECLARE_COMPLETION_ONSTACK(done); 1094 DECLARE_COMPLETION_ONSTACK(done);
1085 struct wb_writeback_work work = { 1095 struct wb_writeback_work work = {
1086 .sb = sb, 1096 .sb = sb,
1087 .sync_mode = WB_SYNC_NONE, 1097 .sync_mode = WB_SYNC_NONE,
1088 .done = &done, 1098 .done = &done,
1099 .nr_pages = nr,
1089 }; 1100 };
1090 1101
1091 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1102 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1092
1093 work.nr_pages = nr_dirty + nr_unstable +
1094 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1095
1096 bdi_queue_work(sb->s_bdi, &work); 1103 bdi_queue_work(sb->s_bdi, &work);
1097 wait_for_completion(&done); 1104 wait_for_completion(&done);
1098} 1105}
1106EXPORT_SYMBOL(writeback_inodes_sb_nr);
1107
1108/**
1109 * writeback_inodes_sb - writeback dirty inodes from given super_block
1110 * @sb: the superblock
1111 *
1112 * Start writeback on some inodes on this super_block. No guarantees are made
1113 * on how many (if any) will be written, and this function does not wait
1114 * for IO completion of submitted IO.
1115 */
1116void writeback_inodes_sb(struct super_block *sb)
1117{
1118 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
1119}
1099EXPORT_SYMBOL(writeback_inodes_sb); 1120EXPORT_SYMBOL(writeback_inodes_sb);
1100 1121
1101/** 1122/**
@@ -1118,6 +1139,27 @@ int writeback_inodes_sb_if_idle(struct super_block *sb)
1118EXPORT_SYMBOL(writeback_inodes_sb_if_idle); 1139EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1119 1140
1120/** 1141/**
1142 * writeback_inodes_sb_if_idle - start writeback if none underway
1143 * @sb: the superblock
1144 * @nr: the number of pages to write
1145 *
1146 * Invoke writeback_inodes_sb if no writeback is currently underway.
1147 * Returns 1 if writeback was started, 0 if not.
1148 */
1149int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1150 unsigned long nr)
1151{
1152 if (!writeback_in_progress(sb->s_bdi)) {
1153 down_read(&sb->s_umount);
1154 writeback_inodes_sb_nr(sb, nr);
1155 up_read(&sb->s_umount);
1156 return 1;
1157 } else
1158 return 0;
1159}
1160EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1161
1162/**
1121 * sync_inodes_sb - sync sb inode pages 1163 * sync_inodes_sb - sync sb inode pages
1122 * @sb: the superblock 1164 * @sb: the superblock
1123 * 1165 *
@@ -1198,3 +1240,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1198 return ret; 1240 return ret;
1199} 1241}
1200EXPORT_SYMBOL(sync_inode); 1242EXPORT_SYMBOL(sync_inode);
1243
1244/**
1245 * sync_inode - write an inode to disk
1246 * @inode: the inode to sync
1247 * @wait: wait for I/O to complete.
1248 *
1249 * Write an inode to disk and adjust it's dirty state after completion.
1250 *
1251 * Note: only writes the actual inode, no associated data or other metadata.
1252 */
1253int sync_inode_metadata(struct inode *inode, int wait)
1254{
1255 struct writeback_control wbc = {
1256 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
1257 .nr_to_write = 0, /* metadata-only */
1258 };
1259
1260 return sync_inode(inode, &wbc);
1261}
1262EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 3773fd63d2f9..85542a7daf40 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -179,23 +179,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
179static const struct file_operations fuse_ctl_abort_ops = { 179static const struct file_operations fuse_ctl_abort_ops = {
180 .open = nonseekable_open, 180 .open = nonseekable_open,
181 .write = fuse_conn_abort_write, 181 .write = fuse_conn_abort_write,
182 .llseek = no_llseek,
182}; 183};
183 184
184static const struct file_operations fuse_ctl_waiting_ops = { 185static const struct file_operations fuse_ctl_waiting_ops = {
185 .open = nonseekable_open, 186 .open = nonseekable_open,
186 .read = fuse_conn_waiting_read, 187 .read = fuse_conn_waiting_read,
188 .llseek = no_llseek,
187}; 189};
188 190
189static const struct file_operations fuse_conn_max_background_ops = { 191static const struct file_operations fuse_conn_max_background_ops = {
190 .open = nonseekable_open, 192 .open = nonseekable_open,
191 .read = fuse_conn_max_background_read, 193 .read = fuse_conn_max_background_read,
192 .write = fuse_conn_max_background_write, 194 .write = fuse_conn_max_background_write,
195 .llseek = no_llseek,
193}; 196};
194 197
195static const struct file_operations fuse_conn_congestion_threshold_ops = { 198static const struct file_operations fuse_conn_congestion_threshold_ops = {
196 .open = nonseekable_open, 199 .open = nonseekable_open,
197 .read = fuse_conn_congestion_threshold_read, 200 .read = fuse_conn_congestion_threshold_read,
198 .write = fuse_conn_congestion_threshold_write, 201 .write = fuse_conn_congestion_threshold_write,
202 .llseek = no_llseek,
199}; 203};
200 204
201static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, 205static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
@@ -218,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
218 if (!inode) 222 if (!inode)
219 return NULL; 223 return NULL;
220 224
225 inode->i_ino = get_next_ino();
221 inode->i_mode = mode; 226 inode->i_mode = mode;
222 inode->i_uid = fc->user_id; 227 inode->i_uid = fc->user_id;
223 inode->i_gid = fc->group_id; 228 inode->i_gid = fc->group_id;
@@ -317,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
317 return 0; 322 return 0;
318} 323}
319 324
320static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags, 325static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
321 const char *dev_name, void *raw_data, 326 int flags, const char *dev_name, void *raw_data)
322 struct vfsmount *mnt)
323{ 327{
324 return get_sb_single(fs_type, flags, raw_data, 328 return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
325 fuse_ctl_fill_super, mnt);
326} 329}
327 330
328static void fuse_ctl_kill_sb(struct super_block *sb) 331static void fuse_ctl_kill_sb(struct super_block *sb)
@@ -341,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
341static struct file_system_type fuse_ctl_fs_type = { 344static struct file_system_type fuse_ctl_fs_type = {
342 .owner = THIS_MODULE, 345 .owner = THIS_MODULE,
343 .name = "fusectl", 346 .name = "fusectl",
344 .get_sb = fuse_ctl_get_sb, 347 .mount = fuse_ctl_mount,
345 .kill_sb = fuse_ctl_kill_sb, 348 .kill_sb = fuse_ctl_kill_sb,
346}; 349};
347 350
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e1f8171278bd..3e87cce5837d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -182,6 +182,7 @@ static const struct file_operations cuse_frontend_fops = {
182 .unlocked_ioctl = cuse_file_ioctl, 182 .unlocked_ioctl = cuse_file_ioctl,
183 .compat_ioctl = cuse_file_compat_ioctl, 183 .compat_ioctl = cuse_file_compat_ioctl,
184 .poll = fuse_file_poll, 184 .poll = fuse_file_poll,
185 .llseek = noop_llseek,
185}; 186};
186 187
187 188
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cde755cca564..6e07696308dc 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -809,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
809 int err; 809 int err;
810 struct page *page = *pagep; 810 struct page *page = *pagep;
811 811
812 if (page && zeroing && count < PAGE_SIZE) { 812 if (page && zeroing && count < PAGE_SIZE)
813 void *mapaddr = kmap_atomic(page, KM_USER1); 813 clear_highpage(page);
814 memset(mapaddr, 0, PAGE_SIZE); 814
815 kunmap_atomic(mapaddr, KM_USER1);
816 }
817 while (count) { 815 while (count) {
818 if (cs->write && cs->pipebufs && page) { 816 if (cs->write && cs->pipebufs && page) {
819 return fuse_ref_page(cs, page, offset, count); 817 return fuse_ref_page(cs, page, offset, count);
@@ -830,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
830 } 828 }
831 } 829 }
832 if (page) { 830 if (page) {
833 void *mapaddr = kmap_atomic(page, KM_USER1); 831 void *mapaddr = kmap_atomic(page, KM_USER0);
834 void *buf = mapaddr + offset; 832 void *buf = mapaddr + offset;
835 offset += fuse_copy_do(cs, &buf, &count); 833 offset += fuse_copy_do(cs, &buf, &count);
836 kunmap_atomic(mapaddr, KM_USER1); 834 kunmap_atomic(mapaddr, KM_USER0);
837 } else 835 } else
838 offset += fuse_copy_do(cs, NULL, &count); 836 offset += fuse_copy_do(cs, NULL, &count);
839 } 837 }
@@ -1336,12 +1334,7 @@ out_finish:
1336 1334
1337static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) 1335static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
1338{ 1336{
1339 int i; 1337 release_pages(req->pages, req->num_pages, 0);
1340
1341 for (i = 0; i < req->num_pages; i++) {
1342 struct page *page = req->pages[i];
1343 page_cache_release(page);
1344 }
1345} 1338}
1346 1339
1347static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, 1340static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index da9e6e11374c..cfce3ad86a92 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1041,11 +1041,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1041 return err; 1041 return err;
1042} 1042}
1043 1043
1044static int fuse_get_sb(struct file_system_type *fs_type, 1044static struct dentry *fuse_mount(struct file_system_type *fs_type,
1045 int flags, const char *dev_name, 1045 int flags, const char *dev_name,
1046 void *raw_data, struct vfsmount *mnt) 1046 void *raw_data)
1047{ 1047{
1048 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); 1048 return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
1049} 1049}
1050 1050
1051static void fuse_kill_sb_anon(struct super_block *sb) 1051static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1065,17 +1065,16 @@ static struct file_system_type fuse_fs_type = {
1065 .owner = THIS_MODULE, 1065 .owner = THIS_MODULE,
1066 .name = "fuse", 1066 .name = "fuse",
1067 .fs_flags = FS_HAS_SUBTYPE, 1067 .fs_flags = FS_HAS_SUBTYPE,
1068 .get_sb = fuse_get_sb, 1068 .mount = fuse_mount,
1069 .kill_sb = fuse_kill_sb_anon, 1069 .kill_sb = fuse_kill_sb_anon,
1070}; 1070};
1071 1071
1072#ifdef CONFIG_BLOCK 1072#ifdef CONFIG_BLOCK
1073static int fuse_get_sb_blk(struct file_system_type *fs_type, 1073static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
1074 int flags, const char *dev_name, 1074 int flags, const char *dev_name,
1075 void *raw_data, struct vfsmount *mnt) 1075 void *raw_data)
1076{ 1076{
1077 return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super, 1077 return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
1078 mnt);
1079} 1078}
1080 1079
1081static void fuse_kill_sb_blk(struct super_block *sb) 1080static void fuse_kill_sb_blk(struct super_block *sb)
@@ -1094,7 +1093,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
1094static struct file_system_type fuseblk_fs_type = { 1093static struct file_system_type fuseblk_fs_type = {
1095 .owner = THIS_MODULE, 1094 .owner = THIS_MODULE,
1096 .name = "fuseblk", 1095 .name = "fuseblk",
1097 .get_sb = fuse_get_sb_blk, 1096 .mount = fuse_mount_blk,
1098 .kill_sb = fuse_kill_sb_blk, 1097 .kill_sb = fuse_kill_sb_blk,
1099 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, 1098 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
1100}; 1099};
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 6b24afb96aae..4f36f8832b9b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -618,7 +618,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
618 struct gfs2_alloc *al = NULL; 618 struct gfs2_alloc *al = NULL;
619 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 619 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
620 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 620 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
621 unsigned to = from + len;
622 struct page *page; 621 struct page *page;
623 622
624 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 623 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -691,7 +690,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
691 } 690 }
692 691
693prepare_write: 692prepare_write:
694 error = block_prepare_write(page, from, to, gfs2_block_map); 693 error = __block_write_begin(page, from, len, gfs2_block_map);
695out: 694out:
696 if (error == 0) 695 if (error == 0)
697 return 0; 696 return 0;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 06d582732d34..5ab3839dfcb9 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
138 struct gfs2_inum_host *inum) 138 struct gfs2_inum_host *inum)
139{ 139{
140 struct gfs2_sbd *sdp = sb->s_fs_info; 140 struct gfs2_sbd *sdp = sb->s_fs_info;
141 struct gfs2_holder i_gh;
142 struct inode *inode; 141 struct inode *inode;
143 struct dentry *dentry; 142 struct dentry *dentry;
144 int error;
145 143
146 inode = gfs2_ilookup(sb, inum->no_addr); 144 inode = gfs2_ilookup(sb, inum->no_addr);
147 if (inode) { 145 if (inode) {
@@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
152 goto out_inode; 150 goto out_inode;
153 } 151 }
154 152
155 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, 153 inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
156 LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 154 GFS2_BLKST_DINODE);
157 if (error) 155 if (IS_ERR(inode))
158 return ERR_PTR(error); 156 return ERR_CAST(inode);
159
160 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
161 if (error)
162 goto fail;
163
164 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
165 if (IS_ERR(inode)) {
166 error = PTR_ERR(inode);
167 goto fail;
168 }
169
170 error = gfs2_inode_refresh(GFS2_I(inode));
171 if (error) {
172 iput(inode);
173 goto fail;
174 }
175
176 /* Pick up the works we bypass in gfs2_inode_lookup */
177 if (inode->i_state & I_NEW)
178 gfs2_set_iop(inode);
179
180 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
181 iput(inode);
182 goto fail;
183 }
184
185 error = -EIO;
186 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
187 iput(inode);
188 goto fail;
189 }
190
191 gfs2_glock_dq_uninit(&i_gh);
192 157
193out_inode: 158out_inode:
194 dentry = d_obtain_alias(inode); 159 dentry = d_obtain_alias(inode);
195 if (!IS_ERR(dentry)) 160 if (!IS_ERR(dentry))
196 dentry->d_op = &gfs2_dops; 161 dentry->d_op = &gfs2_dops;
197 return dentry; 162 return dentry;
198fail:
199 gfs2_glock_dq_uninit(&i_gh);
200 return ERR_PTR(error);
201} 163}
202 164
203static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, 165static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 237ee6a940df..aa996471ec5c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -622,6 +622,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
622 * cluster; until we do, disable leases (by just returning -EINVAL), 622 * cluster; until we do, disable leases (by just returning -EINVAL),
623 * unless the administrator has requested purely local locking. 623 * unless the administrator has requested purely local locking.
624 * 624 *
625 * Locking: called under lock_flocks
626 *
625 * Returns: errno 627 * Returns: errno
626 */ 628 */
627 629
@@ -773,6 +775,7 @@ const struct file_operations gfs2_dir_fops = {
773 .fsync = gfs2_fsync, 775 .fsync = gfs2_fsync,
774 .lock = gfs2_lock, 776 .lock = gfs2_lock,
775 .flock = gfs2_flock, 777 .flock = gfs2_flock,
778 .llseek = default_llseek,
776}; 779};
777 780
778#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ 781#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
@@ -799,5 +802,6 @@ const struct file_operations gfs2_dir_fops_nolock = {
799 .open = gfs2_open, 802 .open = gfs2_open,
800 .release = gfs2_close, 803 .release = gfs2_close,
801 .fsync = gfs2_fsync, 804 .fsync = gfs2_fsync,
805 .llseek = default_llseek,
802}; 806};
803 807
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 87778857f099..f92c17704169 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -686,21 +686,20 @@ static void delete_work_func(struct work_struct *work)
686{ 686{
687 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); 687 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
688 struct gfs2_sbd *sdp = gl->gl_sbd; 688 struct gfs2_sbd *sdp = gl->gl_sbd;
689 struct gfs2_inode *ip = NULL; 689 struct gfs2_inode *ip;
690 struct inode *inode; 690 struct inode *inode;
691 u64 no_addr = 0; 691 u64 no_addr = gl->gl_name.ln_number;
692
693 ip = gl->gl_object;
694 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
692 695
693 spin_lock(&gl->gl_spin);
694 ip = (struct gfs2_inode *)gl->gl_object;
695 if (ip) 696 if (ip)
696 no_addr = ip->i_no_addr;
697 spin_unlock(&gl->gl_spin);
698 if (ip) {
699 inode = gfs2_ilookup(sdp->sd_vfs, no_addr); 697 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
700 if (inode) { 698 else
701 d_prune_aliases(inode); 699 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
702 iput(inode); 700 if (inode && !IS_ERR(inode)) {
703 } 701 d_prune_aliases(inode);
702 iput(inode);
704 } 703 }
705 gfs2_glock_put(gl); 704 gfs2_glock_put(gl);
706} 705}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06370f8bd8cf..e1213f7f9217 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); 73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
74} 74}
75 75
76struct gfs2_skip_data {
77 u64 no_addr;
78 int skipped;
79};
80
81static int iget_skip_test(struct inode *inode, void *opaque)
82{
83 struct gfs2_inode *ip = GFS2_I(inode);
84 struct gfs2_skip_data *data = opaque;
85
86 if (ip->i_no_addr == data->no_addr) {
87 if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88 data->skipped = 1;
89 return 0;
90 }
91 return 1;
92 }
93 return 0;
94}
95
96static int iget_skip_set(struct inode *inode, void *opaque)
97{
98 struct gfs2_inode *ip = GFS2_I(inode);
99 struct gfs2_skip_data *data = opaque;
100
101 if (data->skipped)
102 return 1;
103 inode->i_ino = (unsigned long)(data->no_addr);
104 ip->i_no_addr = data->no_addr;
105 return 0;
106}
107
108static struct inode *gfs2_iget_skip(struct super_block *sb,
109 u64 no_addr)
110{
111 struct gfs2_skip_data data;
112 unsigned long hash = (unsigned long)no_addr;
113
114 data.no_addr = no_addr;
115 data.skipped = 0;
116 return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
117}
118
119/** 76/**
120 * GFS2 lookup code fills in vfs inode contents based on info obtained 77 * GFS2 lookup code fills in vfs inode contents based on info obtained
121 * from directory entry inside gfs2_inode_lookup(). This has caused issues 78 * from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -243,93 +200,54 @@ fail:
243 return ERR_PTR(error); 200 return ERR_PTR(error);
244} 201}
245 202
246/** 203struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
247 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation 204 u64 *no_formal_ino, unsigned int blktype)
248 * and try to reclaim it by doing iput.
249 *
250 * This function assumes no rgrp locks are currently held.
251 *
252 * @sb: The super block
253 * no_addr: The inode number
254 *
255 */
256
257void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
258{ 205{
259 struct gfs2_sbd *sdp; 206 struct super_block *sb = sdp->sd_vfs;
260 struct gfs2_inode *ip; 207 struct gfs2_holder i_gh;
261 struct gfs2_glock *io_gl = NULL;
262 int error;
263 struct gfs2_holder gh;
264 struct inode *inode; 208 struct inode *inode;
209 int error;
265 210
266 inode = gfs2_iget_skip(sb, no_addr); 211 error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
267 212 LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
268 if (!inode) 213 if (error)
269 return; 214 return ERR_PTR(error);
270
271 /* If it's not a new inode, someone's using it, so leave it alone. */
272 if (!(inode->i_state & I_NEW)) {
273 iput(inode);
274 return;
275 }
276
277 ip = GFS2_I(inode);
278 sdp = GFS2_SB(inode);
279 ip->i_no_formal_ino = -1;
280 215
281 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 216 error = gfs2_check_blk_type(sdp, no_addr, blktype);
282 if (unlikely(error)) 217 if (error)
283 goto fail; 218 goto fail;
284 ip->i_gl->gl_object = ip;
285 219
286 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); 220 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
287 if (unlikely(error)) 221 if (IS_ERR(inode))
288 goto fail_put; 222 goto fail;
289
290 set_bit(GIF_INVALID, &ip->i_flags);
291 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
292 &ip->i_iopen_gh);
293 if (unlikely(error))
294 goto fail_iopen;
295 223
296 ip->i_iopen_gh.gh_gl->gl_object = ip; 224 error = gfs2_inode_refresh(GFS2_I(inode));
297 gfs2_glock_put(io_gl); 225 if (error)
298 io_gl = NULL; 226 goto fail_iput;
299 227
300 inode->i_mode = DT2IF(DT_UNKNOWN); 228 /* Pick up the works we bypass in gfs2_inode_lookup */
229 if (inode->i_state & I_NEW)
230 gfs2_set_iop(inode);
301 231
302 /* 232 /* Two extra checks for NFS only */
303 * We must read the inode in order to work out its type in 233 if (no_formal_ino) {
304 * this case. Note that this doesn't happen often as we normally 234 error = -ESTALE;
305 * know the type beforehand. This code path only occurs during 235 if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
306 * unlinked inode recovery (where it is safe to do this glock, 236 goto fail_iput;
307 * which is not true in the general case).
308 */
309 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
310 &gh);
311 if (unlikely(error))
312 goto fail_glock;
313 237
314 /* Inode is now uptodate */ 238 error = -EIO;
315 gfs2_glock_dq_uninit(&gh); 239 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
316 gfs2_set_iop(inode); 240 goto fail_iput;
317 241
318 /* The iput will cause it to be deleted. */ 242 error = 0;
319 iput(inode); 243 }
320 return;
321 244
322fail_glock:
323 gfs2_glock_dq(&ip->i_iopen_gh);
324fail_iopen:
325 if (io_gl)
326 gfs2_glock_put(io_gl);
327fail_put:
328 ip->i_gl->gl_object = NULL;
329 gfs2_glock_put(ip->i_gl);
330fail: 245fail:
331 iget_failed(inode); 246 gfs2_glock_dq_uninit(&i_gh);
332 return; 247 return error ? ERR_PTR(error) : inode;
248fail_iput:
249 iput(inode);
250 goto fail;
333} 251}
334 252
335static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 253static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6720d7d5fbc6..d8499fadcc53 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,9 @@ err:
99extern void gfs2_set_iop(struct inode *inode); 99extern void gfs2_set_iop(struct inode *inode);
100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
101 u64 no_addr, u64 no_formal_ino); 101 u64 no_addr, u64 no_formal_ino);
102extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); 102extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
103 u64 *no_formal_ino,
104 unsigned int blktype);
103extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 105extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
104 106
105extern int gfs2_inode_refresh(struct gfs2_inode *ip); 107extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ac750bd31a6f..eb01f3575e10 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
592 lh->lh_hash = cpu_to_be32(hash); 592 lh->lh_hash = cpu_to_be32(hash);
593 593
594 bh->b_end_io = end_buffer_write_sync; 594 bh->b_end_io = end_buffer_write_sync;
595 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
596 goto skip_barrier;
597 get_bh(bh); 595 get_bh(bh);
598 submit_bh(WRITE_BARRIER | REQ_META, bh); 596 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
599 wait_on_buffer(bh);
600 if (buffer_eopnotsupp(bh)) {
601 clear_buffer_eopnotsupp(bh);
602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
605 lock_buffer(bh);
606skip_barrier:
607 get_bh(bh);
608 submit_bh(WRITE_SYNC | REQ_META, bh); 597 submit_bh(WRITE_SYNC | REQ_META, bh);
609 wait_on_buffer(bh); 598 else
610 } 599 submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
600 wait_on_buffer(bh);
601
611 if (!buffer_uptodate(bh)) 602 if (!buffer_uptodate(bh))
612 gfs2_io_error_bh(sdp, bh); 603 gfs2_io_error_bh(sdp, bh);
613 brelse(bh); 604 brelse(bh);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index d7eb1e209aa8..ebef7ab6e17e 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -144,7 +144,7 @@ static int __init init_gfs2_fs(void)
144 144
145 error = -ENOMEM; 145 error = -ENOMEM;
146 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 146 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
147 WQ_RESCUER | WQ_FREEZEABLE, 0); 147 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
148 if (!gfs_recovery_wq) 148 if (!gfs_recovery_wq)
149 goto fail_wq; 149 goto fail_wq;
150 150
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index f3b071f921aa..939739c7b3f9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
55 * activity, but those code paths have their own higher-level 55 * activity, but those code paths have their own higher-level
56 * throttling. 56 * throttling.
57 */ 57 */
58 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 58 if (wbc->sync_mode != WB_SYNC_NONE) {
59 lock_buffer(bh); 59 lock_buffer(bh);
60 } else if (!trylock_buffer(bh)) { 60 } else if (!trylock_buffer(bh)) {
61 redirty_page_for_writepage(wbc, page); 61 redirty_page_for_writepage(wbc, page);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index aeafc233dc89..3eb1393f7b81 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1219,7 +1219,6 @@ fail_sb:
1219fail_locking: 1219fail_locking:
1220 init_locking(sdp, &mount_gh, UNDO); 1220 init_locking(sdp, &mount_gh, UNDO);
1221fail_lm: 1221fail_lm:
1222 invalidate_inodes(sb);
1223 gfs2_gl_hash_clear(sdp); 1222 gfs2_gl_hash_clear(sdp);
1224 gfs2_lm_unmount(sdp); 1223 gfs2_lm_unmount(sdp);
1225fail_sys: 1224fail_sys:
@@ -1251,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
1251} 1250}
1252 1251
1253/** 1252/**
1254 * gfs2_get_sb - Get the GFS2 superblock 1253 * gfs2_mount - Get the GFS2 superblock
1255 * @fs_type: The GFS2 filesystem type 1254 * @fs_type: The GFS2 filesystem type
1256 * @flags: Mount flags 1255 * @flags: Mount flags
1257 * @dev_name: The name of the device 1256 * @dev_name: The name of the device
1258 * @data: The mount arguments 1257 * @data: The mount arguments
1259 * @mnt: The vfsmnt for this mount
1260 * 1258 *
1261 * Q. Why not use get_sb_bdev() ? 1259 * Q. Why not use get_sb_bdev() ?
1262 * A. We need to select one of two root directories to mount, independent 1260 * A. We need to select one of two root directories to mount, independent
@@ -1265,8 +1263,8 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
1265 * Returns: 0 or -ve on error 1263 * Returns: 0 or -ve on error
1266 */ 1264 */
1267 1265
1268static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1266static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1269 const char *dev_name, void *data, struct vfsmount *mnt) 1267 const char *dev_name, void *data)
1270{ 1268{
1271 struct block_device *bdev; 1269 struct block_device *bdev;
1272 struct super_block *s; 1270 struct super_block *s;
@@ -1280,7 +1278,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1280 1278
1281 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1279 bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1282 if (IS_ERR(bdev)) 1280 if (IS_ERR(bdev))
1283 return PTR_ERR(bdev); 1281 return ERR_CAST(bdev);
1284 1282
1285 /* 1283 /*
1286 * once the super is inserted into the list by sget, s_umount 1284 * once the super is inserted into the list by sget, s_umount
@@ -1299,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1299 if (IS_ERR(s)) 1297 if (IS_ERR(s))
1300 goto error_bdev; 1298 goto error_bdev;
1301 1299
1300 if (s->s_root)
1301 close_bdev_exclusive(bdev, mode);
1302
1302 memset(&args, 0, sizeof(args)); 1303 memset(&args, 0, sizeof(args));
1303 args.ar_quota = GFS2_QUOTA_DEFAULT; 1304 args.ar_quota = GFS2_QUOTA_DEFAULT;
1304 args.ar_data = GFS2_DATA_DEFAULT; 1305 args.ar_data = GFS2_DATA_DEFAULT;
@@ -1310,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1310 error = gfs2_mount_args(&args, data); 1311 error = gfs2_mount_args(&args, data);
1311 if (error) { 1312 if (error) {
1312 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1313 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1313 if (s->s_root) 1314 goto error_super;
1314 goto error_super;
1315 deactivate_locked_super(s);
1316 return error;
1317 } 1315 }
1318 1316
1319 if (s->s_root) { 1317 if (s->s_root) {
1320 error = -EBUSY; 1318 error = -EBUSY;
1321 if ((flags ^ s->s_flags) & MS_RDONLY) 1319 if ((flags ^ s->s_flags) & MS_RDONLY)
1322 goto error_super; 1320 goto error_super;
1323 close_bdev_exclusive(bdev, mode);
1324 } else { 1321 } else {
1325 char b[BDEVNAME_SIZE]; 1322 char b[BDEVNAME_SIZE];
1326 1323
@@ -1329,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1329 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 1326 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
1330 sb_set_blocksize(s, block_size(bdev)); 1327 sb_set_blocksize(s, block_size(bdev));
1331 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); 1328 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
1332 if (error) { 1329 if (error)
1333 deactivate_locked_super(s); 1330 goto error_super;
1334 return error;
1335 }
1336 s->s_flags |= MS_ACTIVE; 1331 s->s_flags |= MS_ACTIVE;
1337 bdev->bd_super = s; 1332 bdev->bd_super = s;
1338 } 1333 }
1339 1334
1340 sdp = s->s_fs_info; 1335 sdp = s->s_fs_info;
1341 mnt->mnt_sb = s;
1342 if (args.ar_meta) 1336 if (args.ar_meta)
1343 mnt->mnt_root = dget(sdp->sd_master_dir); 1337 return dget(sdp->sd_master_dir);
1344 else 1338 else
1345 mnt->mnt_root = dget(sdp->sd_root_dir); 1339 return dget(sdp->sd_root_dir);
1346 return 0;
1347 1340
1348error_super: 1341error_super:
1349 deactivate_locked_super(s); 1342 deactivate_locked_super(s);
1343 return ERR_PTR(error);
1350error_bdev: 1344error_bdev:
1351 close_bdev_exclusive(bdev, mode); 1345 close_bdev_exclusive(bdev, mode);
1352 return error; 1346 return ERR_PTR(error);
1353} 1347}
1354 1348
1355static int set_meta_super(struct super_block *s, void *ptr) 1349static int set_meta_super(struct super_block *s, void *ptr)
@@ -1357,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr)
1357 return -EINVAL; 1351 return -EINVAL;
1358} 1352}
1359 1353
1360static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, 1354static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
1361 const char *dev_name, void *data, struct vfsmount *mnt) 1355 int flags, const char *dev_name, void *data)
1362{ 1356{
1363 struct super_block *s; 1357 struct super_block *s;
1364 struct gfs2_sbd *sdp; 1358 struct gfs2_sbd *sdp;
@@ -1369,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1369 if (error) { 1363 if (error) {
1370 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1364 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
1371 dev_name, error); 1365 dev_name, error);
1372 return error; 1366 return ERR_PTR(error);
1373 } 1367 }
1374 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, 1368 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
1375 path.dentry->d_inode->i_sb->s_bdev); 1369 path.dentry->d_inode->i_sb->s_bdev);
1376 path_put(&path); 1370 path_put(&path);
1377 if (IS_ERR(s)) { 1371 if (IS_ERR(s)) {
1378 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1372 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
1379 return PTR_ERR(s); 1373 return ERR_CAST(s);
1380 } 1374 }
1381 if ((flags ^ s->s_flags) & MS_RDONLY) { 1375 if ((flags ^ s->s_flags) & MS_RDONLY) {
1382 deactivate_locked_super(s); 1376 deactivate_locked_super(s);
1383 return -EBUSY; 1377 return ERR_PTR(-EBUSY);
1384 } 1378 }
1385 sdp = s->s_fs_info; 1379 sdp = s->s_fs_info;
1386 mnt->mnt_sb = s; 1380 return dget(sdp->sd_master_dir);
1387 mnt->mnt_root = dget(sdp->sd_master_dir);
1388 return 0;
1389} 1381}
1390 1382
1391static void gfs2_kill_sb(struct super_block *sb) 1383static void gfs2_kill_sb(struct super_block *sb)
@@ -1411,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb)
1411struct file_system_type gfs2_fs_type = { 1403struct file_system_type gfs2_fs_type = {
1412 .name = "gfs2", 1404 .name = "gfs2",
1413 .fs_flags = FS_REQUIRES_DEV, 1405 .fs_flags = FS_REQUIRES_DEV,
1414 .get_sb = gfs2_get_sb, 1406 .mount = gfs2_mount,
1415 .kill_sb = gfs2_kill_sb, 1407 .kill_sb = gfs2_kill_sb,
1416 .owner = THIS_MODULE, 1408 .owner = THIS_MODULE,
1417}; 1409};
@@ -1419,7 +1411,7 @@ struct file_system_type gfs2_fs_type = {
1419struct file_system_type gfs2meta_fs_type = { 1411struct file_system_type gfs2meta_fs_type = {
1420 .name = "gfs2meta", 1412 .name = "gfs2meta",
1421 .fs_flags = FS_REQUIRES_DEV, 1413 .fs_flags = FS_REQUIRES_DEV,
1422 .get_sb = gfs2_get_sb_meta, 1414 .mount = gfs2_mount_meta,
1423 .owner = THIS_MODULE, 1415 .owner = THIS_MODULE,
1424}; 1416};
1425 1417
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 0534510200d5..12cbea7502c2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -255,7 +255,7 @@ out_parent:
255 gfs2_holder_uninit(ghs); 255 gfs2_holder_uninit(ghs);
256 gfs2_holder_uninit(ghs + 1); 256 gfs2_holder_uninit(ghs + 1);
257 if (!error) { 257 if (!error) {
258 atomic_inc(&inode->i_count); 258 ihold(inode);
259 d_instantiate(dentry, inode); 259 d_instantiate(dentry, inode);
260 mark_inode_dirty(inode); 260 mark_inode_dirty(inode);
261 } 261 }
@@ -1294,7 +1294,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1294 int error; 1294 int error;
1295 1295
1296 if (!page_has_buffers(page)) { 1296 if (!page_has_buffers(page)) {
1297 error = block_prepare_write(page, from, to, gfs2_block_map); 1297 error = __block_write_begin(page, from, to - from, gfs2_block_map);
1298 if (unlikely(error)) 1298 if (unlikely(error))
1299 return error; 1299 return error;
1300 1300
@@ -1313,7 +1313,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1313 next += bh->b_size; 1313 next += bh->b_size;
1314 if (buffer_mapped(bh)) { 1314 if (buffer_mapped(bh)) {
1315 if (end) { 1315 if (end) {
1316 error = block_prepare_write(page, start, end, 1316 error = __block_write_begin(page, start, end - start,
1317 gfs2_block_map); 1317 gfs2_block_map);
1318 if (unlikely(error)) 1318 if (unlikely(error))
1319 return error; 1319 return error;
@@ -1328,7 +1328,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1328 } while (next < to); 1328 } while (next < to);
1329 1329
1330 if (end) { 1330 if (end) {
1331 error = block_prepare_write(page, start, end, gfs2_block_map); 1331 error = __block_write_begin(page, start, end - start, gfs2_block_map);
1332 if (unlikely(error)) 1332 if (unlikely(error))
1333 return error; 1333 return error;
1334 empty_write_end(page, start, end); 1334 empty_write_end(page, start, end);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fb67f593f408..33c8407b876f 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -866,8 +866,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
866 if ((start + nr_sects) != blk) { 866 if ((start + nr_sects) != blk) {
867 rv = blkdev_issue_discard(bdev, start, 867 rv = blkdev_issue_discard(bdev, start,
868 nr_sects, GFP_NOFS, 868 nr_sects, GFP_NOFS,
869 BLKDEV_IFL_WAIT | 869 0);
870 BLKDEV_IFL_BARRIER);
871 if (rv) 870 if (rv)
872 goto fail; 871 goto fail;
873 nr_sects = 0; 872 nr_sects = 0;
@@ -881,8 +880,7 @@ start_new_extent:
881 } 880 }
882 } 881 }
883 if (nr_sects) { 882 if (nr_sects) {
884 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 883 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
885 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
886 if (rv) 884 if (rv)
887 goto fail; 885 goto fail;
888 } 886 }
@@ -965,17 +963,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
965 * The inode, if one has been found, in inode. 963 * The inode, if one has been found, in inode.
966 */ 964 */
967 965
968static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 966static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
969 u64 skip)
970{ 967{
971 u32 goal = 0, block; 968 u32 goal = 0, block;
972 u64 no_addr; 969 u64 no_addr;
973 struct gfs2_sbd *sdp = rgd->rd_sbd; 970 struct gfs2_sbd *sdp = rgd->rd_sbd;
974 unsigned int n; 971 unsigned int n;
972 struct gfs2_glock *gl;
973 struct gfs2_inode *ip;
974 int error;
975 int found = 0;
975 976
976 for(;;) { 977 while (goal < rgd->rd_data) {
977 if (goal >= rgd->rd_data)
978 break;
979 down_write(&sdp->sd_log_flush_lock); 978 down_write(&sdp->sd_log_flush_lock);
980 n = 1; 979 n = 1;
981 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, 980 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -992,11 +991,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
992 if (no_addr == skip) 991 if (no_addr == skip)
993 continue; 992 continue;
994 *last_unlinked = no_addr; 993 *last_unlinked = no_addr;
995 return no_addr; 994
995 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
996 if (error)
997 continue;
998
999 /* If the inode is already in cache, we can ignore it here
1000 * because the existing inode disposal code will deal with
1001 * it when all refs have gone away. Accessing gl_object like
1002 * this is not safe in general. Here it is ok because we do
1003 * not dereference the pointer, and we only need an approx
1004 * answer to whether it is NULL or not.
1005 */
1006 ip = gl->gl_object;
1007
1008 if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1009 gfs2_glock_put(gl);
1010 else
1011 found++;
1012
1013 /* Limit reclaim to sensible number of tasks */
1014 if (found > 2*NR_CPUS)
1015 return;
996 } 1016 }
997 1017
998 rgd->rd_flags &= ~GFS2_RDF_CHECK; 1018 rgd->rd_flags &= ~GFS2_RDF_CHECK;
999 return 0; 1019 return;
1000} 1020}
1001 1021
1002/** 1022/**
@@ -1077,11 +1097,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1077 * Try to acquire rgrp in way which avoids contending with others. 1097 * Try to acquire rgrp in way which avoids contending with others.
1078 * 1098 *
1079 * Returns: errno 1099 * Returns: errno
1080 * unlinked: the block address of an unlinked block to be reclaimed
1081 */ 1100 */
1082 1101
1083static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, 1102static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1084 u64 *last_unlinked)
1085{ 1103{
1086 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1104 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1087 struct gfs2_rgrpd *rgd, *begin = NULL; 1105 struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1091,7 +1109,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1091 int loops = 0; 1109 int loops = 0;
1092 int error, rg_locked; 1110 int error, rg_locked;
1093 1111
1094 *unlinked = 0;
1095 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1112 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1096 1113
1097 while (rgd) { 1114 while (rgd) {
@@ -1108,17 +1125,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1108 case 0: 1125 case 0:
1109 if (try_rgrp_fit(rgd, al)) 1126 if (try_rgrp_fit(rgd, al))
1110 goto out; 1127 goto out;
1111 /* If the rg came in already locked, there's no 1128 if (rgd->rd_flags & GFS2_RDF_CHECK)
1112 way we can recover from a failed try_rgrp_unlink 1129 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1113 because that would require an iput which can only
1114 happen after the rgrp is unlocked. */
1115 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1116 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1117 ip->i_no_addr);
1118 if (!rg_locked) 1130 if (!rg_locked)
1119 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1131 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1120 if (*unlinked)
1121 return -EAGAIN;
1122 /* fall through */ 1132 /* fall through */
1123 case GLR_TRYFAILED: 1133 case GLR_TRYFAILED:
1124 rgd = recent_rgrp_next(rgd); 1134 rgd = recent_rgrp_next(rgd);
@@ -1147,13 +1157,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1147 case 0: 1157 case 0:
1148 if (try_rgrp_fit(rgd, al)) 1158 if (try_rgrp_fit(rgd, al))
1149 goto out; 1159 goto out;
1150 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) 1160 if (rgd->rd_flags & GFS2_RDF_CHECK)
1151 *unlinked = try_rgrp_unlink(rgd, last_unlinked, 1161 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1152 ip->i_no_addr);
1153 if (!rg_locked) 1162 if (!rg_locked)
1154 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1163 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1155 if (*unlinked)
1156 return -EAGAIN;
1157 break; 1164 break;
1158 1165
1159 case GLR_TRYFAILED: 1166 case GLR_TRYFAILED:
@@ -1206,12 +1213,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1206 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1213 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1207 struct gfs2_alloc *al = ip->i_alloc; 1214 struct gfs2_alloc *al = ip->i_alloc;
1208 int error = 0; 1215 int error = 0;
1209 u64 last_unlinked = NO_BLOCK, unlinked; 1216 u64 last_unlinked = NO_BLOCK;
1217 int tries = 0;
1210 1218
1211 if (gfs2_assert_warn(sdp, al->al_requested)) 1219 if (gfs2_assert_warn(sdp, al->al_requested))
1212 return -EINVAL; 1220 return -EINVAL;
1213 1221
1214try_again:
1215 if (hold_rindex) { 1222 if (hold_rindex) {
1216 /* We need to hold the rindex unless the inode we're using is 1223 /* We need to hold the rindex unless the inode we're using is
1217 the rindex itself, in which case it's already held. */ 1224 the rindex itself, in which case it's already held. */
@@ -1220,31 +1227,23 @@ try_again:
1220 else if (!sdp->sd_rgrps) /* We may not have the rindex read 1227 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1221 in, so: */ 1228 in, so: */
1222 error = gfs2_ri_update_special(ip); 1229 error = gfs2_ri_update_special(ip);
1230 if (error)
1231 return error;
1223 } 1232 }
1224 1233
1225 if (error) 1234 do {
1226 return error; 1235 error = get_local_rgrp(ip, &last_unlinked);
1236 /* If there is no space, flushing the log may release some */
1237 if (error)
1238 gfs2_log_flush(sdp, NULL);
1239 } while (error && tries++ < 3);
1227 1240
1228 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1229 dinodes along the way, error will equal -EAGAIN and unlinked will
1230 contains it block address. We then need to look up that inode and
1231 try to free it, and try the allocation again. */
1232 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1233 if (error) { 1241 if (error) {
1234 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) 1242 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1235 gfs2_glock_dq_uninit(&al->al_ri_gh); 1243 gfs2_glock_dq_uninit(&al->al_ri_gh);
1236 if (error != -EAGAIN) 1244 return error;
1237 return error;
1238
1239 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1240 /* regardless of whether or not gfs2_process_unlinked_inode
1241 was successful, we don't want to repeat it again. */
1242 last_unlinked = unlinked;
1243 gfs2_log_flush(sdp, NULL);
1244 error = 0;
1245
1246 goto try_again;
1247 } 1245 }
1246
1248 /* no error, so we have the rgrp set in the inode's allocation. */ 1247 /* no error, so we have the rgrp set in the inode's allocation. */
1249 al->al_file = file; 1248 al->al_file = file;
1250 al->al_line = line; 1249 al->al_line = line;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 047d1176096c..2b2c4997430b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -857,7 +857,6 @@ restart:
857 gfs2_clear_rgrpd(sdp); 857 gfs2_clear_rgrpd(sdp);
858 gfs2_jindex_free(sdp); 858 gfs2_jindex_free(sdp);
859 /* Take apart glock structures and buffer lists */ 859 /* Take apart glock structures and buffer lists */
860 invalidate_inodes(sdp->sd_vfs);
861 gfs2_gl_hash_clear(sdp); 860 gfs2_gl_hash_clear(sdp);
862 /* Unmount the locking protocol */ 861 /* Unmount the locking protocol */
863 gfs2_lm_unmount(sdp); 862 gfs2_lm_unmount(sdp);
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4f55651aaa51..c8cffb81e849 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
147 u16 blockoffset; 147 u16 blockoffset;
148 148
149 int fs_div; 149 int fs_div;
150
151 struct hlist_head rsrc_inodes;
152}; 150};
153 151
154#define HFS_FLG_BITMAP_DIRTY 0 152#define HFS_FLG_BITMAP_DIRTY 0
@@ -254,17 +252,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
254 sb->s_dirt = 1; 252 sb->s_dirt = 1;
255} 253}
256 254
257static inline void hfs_buffer_sync(struct buffer_head *bh)
258{
259 while (buffer_locked(bh)) {
260 wait_on_buffer(bh);
261 }
262 if (buffer_dirty(bh)) {
263 ll_rw_block(WRITE, 1, &bh);
264 wait_on_buffer(bh);
265 }
266}
267
268#define sb_bread512(sb, sec, data) ({ \ 255#define sb_bread512(sb, sec, data) ({ \
269 struct buffer_head *__bh; \ 256 struct buffer_head *__bh; \
270 sector_t __block; \ 257 sector_t __block; \
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 397b7adc7ce6..dffb4e996643 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
524 HFS_I(inode)->rsrc_inode = dir; 524 HFS_I(inode)->rsrc_inode = dir;
525 HFS_I(dir)->rsrc_inode = inode; 525 HFS_I(dir)->rsrc_inode = inode;
526 igrab(dir); 526 igrab(dir);
527 hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes); 527 hlist_add_fake(&inode->i_hash);
528 mark_inode_dirty(inode); 528 mark_inode_dirty(inode);
529out: 529out:
530 d_add(dentry, inode); 530 d_add(dentry, inode);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac991..1563d5ce5764 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
220 mdb->drLsMod = hfs_mtime(); 220 mdb->drLsMod = hfs_mtime();
221 221
222 mark_buffer_dirty(HFS_SB(sb)->mdb_bh); 222 mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
223 hfs_buffer_sync(HFS_SB(sb)->mdb_bh); 223 sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
224 } 224 }
225 225
226 return 0; 226 return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
287 HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); 287 HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
288 HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); 288 HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
289 mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); 289 mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
290 hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh); 290 sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
291 } 291 }
292 292
293 if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { 293 if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 34235d4bf08b..4824c27cebb8 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -20,7 +20,6 @@
20#include <linux/parser.h> 20#include <linux/parser.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/smp_lock.h>
24#include <linux/vfs.h> 23#include <linux/vfs.h>
25 24
26#include "hfs_fs.h" 25#include "hfs_fs.h"
@@ -79,15 +78,11 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
79 */ 78 */
80static void hfs_put_super(struct super_block *sb) 79static void hfs_put_super(struct super_block *sb)
81{ 80{
82 lock_kernel();
83
84 if (sb->s_dirt) 81 if (sb->s_dirt)
85 hfs_write_super(sb); 82 hfs_write_super(sb);
86 hfs_mdb_close(sb); 83 hfs_mdb_close(sb);
87 /* release the MDB's resources */ 84 /* release the MDB's resources */
88 hfs_mdb_put(sb); 85 hfs_mdb_put(sb);
89
90 unlock_kernel();
91} 86}
92 87
93/* 88/*
@@ -385,8 +380,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
385 sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); 380 sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
386 if (!sbi) 381 if (!sbi)
387 return -ENOMEM; 382 return -ENOMEM;
383
388 sb->s_fs_info = sbi; 384 sb->s_fs_info = sbi;
389 INIT_HLIST_HEAD(&sbi->rsrc_inodes);
390 385
391 res = -EINVAL; 386 res = -EINVAL;
392 if (!parse_options((char *)data, sbi)) { 387 if (!parse_options((char *)data, sbi)) {
@@ -446,17 +441,16 @@ bail:
446 return res; 441 return res;
447} 442}
448 443
449static int hfs_get_sb(struct file_system_type *fs_type, 444static struct dentry *hfs_mount(struct file_system_type *fs_type,
450 int flags, const char *dev_name, void *data, 445 int flags, const char *dev_name, void *data)
451 struct vfsmount *mnt)
452{ 446{
453 return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt); 447 return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
454} 448}
455 449
456static struct file_system_type hfs_fs_type = { 450static struct file_system_type hfs_fs_type = {
457 .owner = THIS_MODULE, 451 .owner = THIS_MODULE,
458 .name = "hfs", 452 .name = "hfs",
459 .get_sb = hfs_get_sb, 453 .mount = hfs_mount,
460 .kill_sb = kill_block_super, 454 .kill_sb = kill_block_super,
461 .fs_flags = FS_REQUIRES_DEV, 455 .fs_flags = FS_REQUIRES_DEV,
462}; 456};
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d236d85ec9d7..9d59c0571f59 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -286,7 +286,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
286 286
287 inc_nlink(inode); 287 inc_nlink(inode);
288 hfsplus_instantiate(dst_dentry, inode, cnid); 288 hfsplus_instantiate(dst_dentry, inode, cnid);
289 atomic_inc(&inode->i_count); 289 ihold(inode);
290 inode->i_ctime = CURRENT_TIME_SEC; 290 inode->i_ctime = CURRENT_TIME_SEC;
291 mark_inode_dirty(inode); 291 mark_inode_dirty(inode);
292 sbi->file_count++; 292 sbi->file_count++;
@@ -317,8 +317,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
317 res = hfsplus_rename_cat(inode->i_ino, 317 res = hfsplus_rename_cat(inode->i_ino,
318 dir, &dentry->d_name, 318 dir, &dentry->d_name,
319 sbi->hidden_dir, &str); 319 sbi->hidden_dir, &str);
320 if (!res) 320 if (!res) {
321 inode->i_flags |= S_DEAD; 321 inode->i_flags |= S_DEAD;
322 drop_nlink(inode);
323 }
322 goto out; 324 goto out;
323 } 325 }
324 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); 326 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 78449280dae0..8afd7e84f98d 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -211,7 +211,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
211 * appear hashed, but do not put on any lists. hlist_del() 211 * appear hashed, but do not put on any lists. hlist_del()
212 * will work fine and require no locking. 212 * will work fine and require no locking.
213 */ 213 */
214 inode->i_hash.pprev = &inode->i_hash.next; 214 hlist_add_fake(&inode->i_hash);
215 215
216 mark_inode_dirty(inode); 216 mark_inode_dirty(inode);
217out: 217out:
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 5b4667e08ef7..40a85a3ded6e 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -92,7 +92,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
92 mark_inode_dirty(inode); 92 mark_inode_dirty(inode);
93 93
94out_unlock_inode: 94out_unlock_inode:
95 mutex_lock(&inode->i_mutex); 95 mutex_unlock(&inode->i_mutex);
96out_drop_write: 96out_drop_write:
97 mnt_drop_write(file->f_path.mnt); 97 mnt_drop_write(file->f_path.mnt);
98out: 98out:
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a88d7536103..52cc746d3ba3 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -495,18 +495,16 @@ static void hfsplus_destroy_inode(struct inode *inode)
495 495
496#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) 496#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
497 497
498static int hfsplus_get_sb(struct file_system_type *fs_type, 498static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
499 int flags, const char *dev_name, void *data, 499 int flags, const char *dev_name, void *data)
500 struct vfsmount *mnt)
501{ 500{
502 return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super, 501 return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
503 mnt);
504} 502}
505 503
506static struct file_system_type hfsplus_fs_type = { 504static struct file_system_type hfsplus_fs_type = {
507 .owner = THIS_MODULE, 505 .owner = THIS_MODULE,
508 .name = "hfsplus", 506 .name = "hfsplus",
509 .get_sb = hfsplus_get_sb, 507 .mount = hfsplus_mount,
510 .kill_sb = kill_block_super, 508 .kill_sb = kill_block_super,
511 .fs_flags = FS_REQUIRES_DEV, 509 .fs_flags = FS_REQUIRES_DEV,
512}; 510};
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 6bbd75c5589b..bf15a43016b9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -28,12 +28,7 @@
28 * #define ATTR_KILL_SUID 2048 28 * #define ATTR_KILL_SUID 2048
29 * #define ATTR_KILL_SGID 4096 29 * #define ATTR_KILL_SGID 4096
30 * 30 *
31 * and this is because they were added in 2.5 development in this patch: 31 * and this is because they were added in 2.5 development.
32 *
33 * http://linux.bkbits.net:8080/linux-2.5/
34 * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html
35 * |src/.|src/include|src/include/linux|related/include/linux/fs.h
36 *
37 * Actually, they are not needed by most ->setattr() methods - they are set by 32 * Actually, they are not needed by most ->setattr() methods - they are set by
38 * callers of notify_change() to notify that the setuid/setgid bits must be 33 * callers of notify_change() to notify that the setuid/setgid bits must be
39 * dropped. 34 * dropped.
@@ -96,7 +91,6 @@ extern int rename_file(char *from, char *to);
96extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 91extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
97 long long *bfree_out, long long *bavail_out, 92 long long *bfree_out, long long *bavail_out,
98 long long *files_out, long long *ffree_out, 93 long long *files_out, long long *ffree_out,
99 void *fsid_out, int fsid_size, long *namelen_out, 94 void *fsid_out, int fsid_size, long *namelen_out);
100 long *spare_out);
101 95
102#endif 96#endif
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f7dc9b5f9ef8..2c0f148a49e6 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
217 err = do_statfs(dentry->d_sb->s_fs_info, 217 err = do_statfs(dentry->d_sb->s_fs_info,
218 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, 218 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
219 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), 219 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
220 &sf->f_namelen, sf->f_spare); 220 &sf->f_namelen);
221 if (err) 221 if (err)
222 return err; 222 return err;
223 sf->f_blocks = f_blocks; 223 sf->f_blocks = f_blocks;
@@ -962,11 +962,11 @@ out:
962 return err; 962 return err;
963} 963}
964 964
965static int hostfs_read_sb(struct file_system_type *type, 965static struct dentry *hostfs_read_sb(struct file_system_type *type,
966 int flags, const char *dev_name, 966 int flags, const char *dev_name,
967 void *data, struct vfsmount *mnt) 967 void *data)
968{ 968{
969 return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); 969 return mount_nodev(type, flags, data, hostfs_fill_sb_common);
970} 970}
971 971
972static void hostfs_kill_sb(struct super_block *s) 972static void hostfs_kill_sb(struct super_block *s)
@@ -978,7 +978,7 @@ static void hostfs_kill_sb(struct super_block *s)
978static struct file_system_type hostfs_type = { 978static struct file_system_type hostfs_type = {
979 .owner = THIS_MODULE, 979 .owner = THIS_MODULE,
980 .name = "hostfs", 980 .name = "hostfs",
981 .get_sb = hostfs_read_sb, 981 .mount = hostfs_read_sb,
982 .kill_sb = hostfs_kill_sb, 982 .kill_sb = hostfs_kill_sb,
983 .fs_flags = 0, 983 .fs_flags = 0,
984}; 984};
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 6777aa06ce2c..d51a98384bc0 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out)
94 94
95 dir = opendir(path); 95 dir = opendir(path);
96 *err_out = errno; 96 *err_out = errno;
97 if (dir == NULL) 97
98 return NULL;
99 return dir; 98 return dir;
100} 99}
101 100
@@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
205 if (attrs->ia_valid & HOSTFS_ATTR_MODE) { 204 if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
206 if (fd >= 0) { 205 if (fd >= 0) {
207 if (fchmod(fd, attrs->ia_mode) != 0) 206 if (fchmod(fd, attrs->ia_mode) != 0)
208 return (-errno); 207 return -errno;
209 } else if (chmod(file, attrs->ia_mode) != 0) { 208 } else if (chmod(file, attrs->ia_mode) != 0) {
210 return -errno; 209 return -errno;
211 } 210 }
@@ -364,8 +363,7 @@ int rename_file(char *from, char *to)
364int do_statfs(char *root, long *bsize_out, long long *blocks_out, 363int do_statfs(char *root, long *bsize_out, long long *blocks_out,
365 long long *bfree_out, long long *bavail_out, 364 long long *bfree_out, long long *bavail_out,
366 long long *files_out, long long *ffree_out, 365 long long *files_out, long long *ffree_out,
367 void *fsid_out, int fsid_size, long *namelen_out, 366 void *fsid_out, int fsid_size, long *namelen_out)
368 long *spare_out)
369{ 367{
370 struct statfs64 buf; 368 struct statfs64 buf;
371 int err; 369 int err;
@@ -384,10 +382,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
384 sizeof(buf.f_fsid) > fsid_size ? fsid_size : 382 sizeof(buf.f_fsid) > fsid_size ? fsid_size :
385 sizeof(buf.f_fsid)); 383 sizeof(buf.f_fsid));
386 *namelen_out = buf.f_namelen; 384 *namelen_out = buf.f_namelen;
387 spare_out[0] = buf.f_spare[0]; 385
388 spare_out[1] = buf.f_spare[1];
389 spare_out[2] = buf.f_spare[2];
390 spare_out[3] = buf.f_spare[3];
391 spare_out[4] = buf.f_spare[4];
392 return 0; 386 return 0;
393} 387}
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6c..63b6f5632318 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix
4 help 5 help
5 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
6 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index eac5f96323e3..793cb9d943d2 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s)
14#ifdef DEBUG_LOCKS 14#ifdef DEBUG_LOCKS
15 printk("lock creation\n"); 15 printk("lock creation\n");
16#endif 16#endif
17 down(&hpfs_sb(s)->hpfs_creation_de); 17 mutex_lock(&hpfs_sb(s)->hpfs_creation_de);
18} 18}
19 19
20void hpfs_unlock_creation(struct super_block *s) 20void hpfs_unlock_creation(struct super_block *s)
@@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s)
22#ifdef DEBUG_LOCKS 22#ifdef DEBUG_LOCKS
23 printk("unlock creation\n"); 23 printk("unlock creation\n");
24#endif 24#endif
25 up(&hpfs_sb(s)->hpfs_creation_de); 25 mutex_unlock(&hpfs_sb(s)->hpfs_creation_de);
26} 26}
27 27
28/* Map a sector into a buffer and return pointers to it and to the buffer. */ 28/* Map a sector into a buffer and return pointers to it and to the buffer. */
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b59eac0232a0..2fee17d0d9ab 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -87,7 +87,7 @@ struct hpfs_sb_info {
87 unsigned *sb_bmp_dir; /* main bitmap directory */ 87 unsigned *sb_bmp_dir; /* main bitmap directory */
88 unsigned sb_c_bitmap; /* current bitmap */ 88 unsigned sb_c_bitmap; /* current bitmap */
89 unsigned sb_max_fwd_alloc; /* max forwad allocation */ 89 unsigned sb_max_fwd_alloc; /* max forwad allocation */
90 struct semaphore hpfs_creation_de; /* when creating dirents, nobody else 90 struct mutex hpfs_creation_de; /* when creating dirents, nobody else
91 can alloc blocks */ 91 can alloc blocks */
92 /*unsigned sb_mounting : 1;*/ 92 /*unsigned sb_mounting : 1;*/
93 int sb_timeshift; 93 int sb_timeshift;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 2607010be2fe..6c5f01597c3a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -477,17 +477,21 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
477 477
478 int o; 478 int o;
479 479
480 lock_kernel();
481
480 save_mount_options(s, options); 482 save_mount_options(s, options);
481 483
482 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 484 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
483 if (!sbi) 485 if (!sbi) {
486 unlock_kernel();
484 return -ENOMEM; 487 return -ENOMEM;
488 }
485 s->s_fs_info = sbi; 489 s->s_fs_info = sbi;
486 490
487 sbi->sb_bmp_dir = NULL; 491 sbi->sb_bmp_dir = NULL;
488 sbi->sb_cp_table = NULL; 492 sbi->sb_cp_table = NULL;
489 493
490 init_MUTEX(&sbi->hpfs_creation_de); 494 mutex_init(&sbi->hpfs_creation_de);
491 495
492 uid = current_uid(); 496 uid = current_uid();
493 gid = current_gid(); 497 gid = current_gid();
@@ -666,6 +670,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
666 root->i_blocks = 5; 670 root->i_blocks = 5;
667 hpfs_brelse4(&qbh); 671 hpfs_brelse4(&qbh);
668 } 672 }
673 unlock_kernel();
669 return 0; 674 return 0;
670 675
671bail4: brelse(bh2); 676bail4: brelse(bh2);
@@ -677,20 +682,20 @@ bail0:
677 kfree(sbi->sb_cp_table); 682 kfree(sbi->sb_cp_table);
678 s->s_fs_info = NULL; 683 s->s_fs_info = NULL;
679 kfree(sbi); 684 kfree(sbi);
685 unlock_kernel();
680 return -EINVAL; 686 return -EINVAL;
681} 687}
682 688
683static int hpfs_get_sb(struct file_system_type *fs_type, 689static struct dentry *hpfs_mount(struct file_system_type *fs_type,
684 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 690 int flags, const char *dev_name, void *data)
685{ 691{
686 return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super, 692 return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
687 mnt);
688} 693}
689 694
690static struct file_system_type hpfs_fs_type = { 695static struct file_system_type hpfs_fs_type = {
691 .owner = THIS_MODULE, 696 .owner = THIS_MODULE,
692 .name = "hpfs", 697 .name = "hpfs",
693 .get_sb = hpfs_get_sb, 698 .mount = hpfs_mount,
694 .kill_sb = kill_block_super, 699 .kill_sb = kill_block_super,
695 .fs_flags = FS_REQUIRES_DEV, 700 .fs_flags = FS_REQUIRES_DEV,
696}; 701};
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7b027720d820..f702b5f713fc 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -598,6 +598,7 @@ static const struct file_operations hppfs_dir_fops = {
598 .readdir = hppfs_readdir, 598 .readdir = hppfs_readdir,
599 .open = hppfs_dir_open, 599 .open = hppfs_dir_open,
600 .fsync = hppfs_fsync, 600 .fsync = hppfs_fsync,
601 .llseek = default_llseek,
601}; 602};
602 603
603static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) 604static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
@@ -747,17 +748,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
747 return(err); 748 return(err);
748} 749}
749 750
750static int hppfs_read_super(struct file_system_type *type, 751static struct dentry *hppfs_read_super(struct file_system_type *type,
751 int flags, const char *dev_name, 752 int flags, const char *dev_name,
752 void *data, struct vfsmount *mnt) 753 void *data)
753{ 754{
754 return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); 755 return mount_nodev(type, flags, data, hppfs_fill_super);
755} 756}
756 757
757static struct file_system_type hppfs_type = { 758static struct file_system_type hppfs_type = {
758 .owner = THIS_MODULE, 759 .owner = THIS_MODULE,
759 .name = "hppfs", 760 .name = "hppfs",
760 .get_sb = hppfs_read_super, 761 .mount = hppfs_read_super,
761 .kill_sb = kill_anon_super, 762 .kill_sb = kill_anon_super,
762 .fs_flags = 0, 763 .fs_flags = 0,
763}; 764};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6e5bd42f3860..a5fe68189eed 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/statfs.h> 31#include <linux/statfs.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/magic.h> 33#include <linux/magic.h>
34#include <linux/migrate.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
@@ -455,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
455 inode = new_inode(sb); 456 inode = new_inode(sb);
456 if (inode) { 457 if (inode) {
457 struct hugetlbfs_inode_info *info; 458 struct hugetlbfs_inode_info *info;
459 inode->i_ino = get_next_ino();
458 inode->i_mode = mode; 460 inode->i_mode = mode;
459 inode->i_uid = uid; 461 inode->i_uid = uid;
460 inode->i_gid = gid; 462 inode->i_gid = gid;
@@ -573,6 +575,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
573 return 0; 575 return 0;
574} 576}
575 577
578static int hugetlbfs_migrate_page(struct address_space *mapping,
579 struct page *newpage, struct page *page)
580{
581 int rc;
582
583 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
584 if (rc)
585 return rc;
586 migrate_page_copy(newpage, page);
587
588 return 0;
589}
590
576static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 591static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
577{ 592{
578 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 593 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +674,7 @@ static const struct address_space_operations hugetlbfs_aops = {
659 .write_begin = hugetlbfs_write_begin, 674 .write_begin = hugetlbfs_write_begin,
660 .write_end = hugetlbfs_write_end, 675 .write_end = hugetlbfs_write_end,
661 .set_page_dirty = hugetlbfs_set_page_dirty, 676 .set_page_dirty = hugetlbfs_set_page_dirty,
677 .migratepage = hugetlbfs_migrate_page,
662}; 678};
663 679
664 680
@@ -674,6 +690,7 @@ const struct file_operations hugetlbfs_file_operations = {
674 .mmap = hugetlbfs_file_mmap, 690 .mmap = hugetlbfs_file_mmap,
675 .fsync = noop_fsync, 691 .fsync = noop_fsync,
676 .get_unmapped_area = hugetlb_get_unmapped_area, 692 .get_unmapped_area = hugetlb_get_unmapped_area,
693 .llseek = default_llseek,
677}; 694};
678 695
679static const struct inode_operations hugetlbfs_dir_inode_operations = { 696static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -879,15 +896,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta)
879 } 896 }
880} 897}
881 898
882static int hugetlbfs_get_sb(struct file_system_type *fs_type, 899static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
883 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 900 int flags, const char *dev_name, void *data)
884{ 901{
885 return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); 902 return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
886} 903}
887 904
888static struct file_system_type hugetlbfs_fs_type = { 905static struct file_system_type hugetlbfs_fs_type = {
889 .name = "hugetlbfs", 906 .name = "hugetlbfs",
890 .get_sb = hugetlbfs_get_sb, 907 .mount = hugetlbfs_mount,
891 .kill_sb = kill_litter_super, 908 .kill_sb = kill_litter_super,
892}; 909};
893 910
@@ -915,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
915 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 932 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
916 *user = current_user(); 933 *user = current_user();
917 if (user_shm_lock(size, *user)) { 934 if (user_shm_lock(size, *user)) {
918 WARN_ONCE(1, 935 printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
919 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
920 } else { 936 } else {
921 *user = NULL; 937 *user = NULL;
922 return ERR_PTR(-EPERM); 938 return ERR_PTR(-EPERM);
diff --git a/fs/inode.c b/fs/inode.c
index 86464332e590..ae2727ab0c3a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,11 +24,11 @@
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/async.h> 25#include <linux/async.h>
26#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/ima.h>
27 28
28/* 29/*
29 * This is needed for the following functions: 30 * This is needed for the following functions:
30 * - inode_has_buffers 31 * - inode_has_buffers
31 * - invalidate_inode_buffers
32 * - invalidate_bdev 32 * - invalidate_bdev
33 * 33 *
34 * FIXME: remove all knowledge of the buffer layer from this file 34 * FIXME: remove all knowledge of the buffer layer from this file
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
72 * allowing for low-overhead inode sync() operations. 72 * allowing for low-overhead inode sync() operations.
73 */ 73 */
74 74
75LIST_HEAD(inode_in_use); 75static LIST_HEAD(inode_lru);
76LIST_HEAD(inode_unused);
77static struct hlist_head *inode_hashtable __read_mostly; 76static struct hlist_head *inode_hashtable __read_mostly;
78 77
79/* 78/*
@@ -103,8 +102,41 @@ static DECLARE_RWSEM(iprune_sem);
103 */ 102 */
104struct inodes_stat_t inodes_stat; 103struct inodes_stat_t inodes_stat;
105 104
105static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
106static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
107
106static struct kmem_cache *inode_cachep __read_mostly; 108static struct kmem_cache *inode_cachep __read_mostly;
107 109
110static inline int get_nr_inodes(void)
111{
112 return percpu_counter_sum_positive(&nr_inodes);
113}
114
115static inline int get_nr_inodes_unused(void)
116{
117 return percpu_counter_sum_positive(&nr_inodes_unused);
118}
119
120int get_nr_dirty_inodes(void)
121{
122 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
123 return nr_dirty > 0 ? nr_dirty : 0;
124
125}
126
127/*
128 * Handle nr_inode sysctl
129 */
130#ifdef CONFIG_SYSCTL
131int proc_nr_inodes(ctl_table *table, int write,
132 void __user *buffer, size_t *lenp, loff_t *ppos)
133{
134 inodes_stat.nr_inodes = get_nr_inodes();
135 inodes_stat.nr_unused = get_nr_inodes_unused();
136 return proc_dointvec(table, write, buffer, lenp, ppos);
137}
138#endif
139
108static void wake_up_inode(struct inode *inode) 140static void wake_up_inode(struct inode *inode)
109{ 141{
110 /* 142 /*
@@ -192,6 +224,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
192 inode->i_fsnotify_mask = 0; 224 inode->i_fsnotify_mask = 0;
193#endif 225#endif
194 226
227 percpu_counter_inc(&nr_inodes);
228
195 return 0; 229 return 0;
196out: 230out:
197 return -ENOMEM; 231 return -ENOMEM;
@@ -232,11 +266,13 @@ void __destroy_inode(struct inode *inode)
232 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 266 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
233 posix_acl_release(inode->i_default_acl); 267 posix_acl_release(inode->i_default_acl);
234#endif 268#endif
269 percpu_counter_dec(&nr_inodes);
235} 270}
236EXPORT_SYMBOL(__destroy_inode); 271EXPORT_SYMBOL(__destroy_inode);
237 272
238void destroy_inode(struct inode *inode) 273static void destroy_inode(struct inode *inode)
239{ 274{
275 BUG_ON(!list_empty(&inode->i_lru));
240 __destroy_inode(inode); 276 __destroy_inode(inode);
241 if (inode->i_sb->s_op->destroy_inode) 277 if (inode->i_sb->s_op->destroy_inode)
242 inode->i_sb->s_op->destroy_inode(inode); 278 inode->i_sb->s_op->destroy_inode(inode);
@@ -255,6 +291,8 @@ void inode_init_once(struct inode *inode)
255 INIT_HLIST_NODE(&inode->i_hash); 291 INIT_HLIST_NODE(&inode->i_hash);
256 INIT_LIST_HEAD(&inode->i_dentry); 292 INIT_LIST_HEAD(&inode->i_dentry);
257 INIT_LIST_HEAD(&inode->i_devices); 293 INIT_LIST_HEAD(&inode->i_devices);
294 INIT_LIST_HEAD(&inode->i_wb_list);
295 INIT_LIST_HEAD(&inode->i_lru);
258 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 296 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
259 spin_lock_init(&inode->i_data.tree_lock); 297 spin_lock_init(&inode->i_data.tree_lock);
260 spin_lock_init(&inode->i_data.i_mmap_lock); 298 spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -281,14 +319,109 @@ static void init_once(void *foo)
281 */ 319 */
282void __iget(struct inode *inode) 320void __iget(struct inode *inode)
283{ 321{
284 if (atomic_inc_return(&inode->i_count) != 1) 322 atomic_inc(&inode->i_count);
285 return; 323}
324
325/*
326 * get additional reference to inode; caller must already hold one.
327 */
328void ihold(struct inode *inode)
329{
330 WARN_ON(atomic_inc_return(&inode->i_count) < 2);
331}
332EXPORT_SYMBOL(ihold);
333
334static void inode_lru_list_add(struct inode *inode)
335{
336 if (list_empty(&inode->i_lru)) {
337 list_add(&inode->i_lru, &inode_lru);
338 percpu_counter_inc(&nr_inodes_unused);
339 }
340}
286 341
287 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 342static void inode_lru_list_del(struct inode *inode)
288 list_move(&inode->i_list, &inode_in_use); 343{
289 inodes_stat.nr_unused--; 344 if (!list_empty(&inode->i_lru)) {
345 list_del_init(&inode->i_lru);
346 percpu_counter_dec(&nr_inodes_unused);
347 }
348}
349
350static inline void __inode_sb_list_add(struct inode *inode)
351{
352 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
290} 353}
291 354
355/**
356 * inode_sb_list_add - add inode to the superblock list of inodes
357 * @inode: inode to add
358 */
359void inode_sb_list_add(struct inode *inode)
360{
361 spin_lock(&inode_lock);
362 __inode_sb_list_add(inode);
363 spin_unlock(&inode_lock);
364}
365EXPORT_SYMBOL_GPL(inode_sb_list_add);
366
367static inline void __inode_sb_list_del(struct inode *inode)
368{
369 list_del_init(&inode->i_sb_list);
370}
371
372static unsigned long hash(struct super_block *sb, unsigned long hashval)
373{
374 unsigned long tmp;
375
376 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
377 L1_CACHE_BYTES;
378 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
379 return tmp & I_HASHMASK;
380}
381
382/**
383 * __insert_inode_hash - hash an inode
384 * @inode: unhashed inode
385 * @hashval: unsigned long value used to locate this object in the
386 * inode_hashtable.
387 *
388 * Add an inode to the inode hash for this superblock.
389 */
390void __insert_inode_hash(struct inode *inode, unsigned long hashval)
391{
392 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
393
394 spin_lock(&inode_lock);
395 hlist_add_head(&inode->i_hash, b);
396 spin_unlock(&inode_lock);
397}
398EXPORT_SYMBOL(__insert_inode_hash);
399
400/**
401 * __remove_inode_hash - remove an inode from the hash
402 * @inode: inode to unhash
403 *
404 * Remove an inode from the superblock.
405 */
406static void __remove_inode_hash(struct inode *inode)
407{
408 hlist_del_init(&inode->i_hash);
409}
410
411/**
412 * remove_inode_hash - remove an inode from the hash
413 * @inode: inode to unhash
414 *
415 * Remove an inode from the superblock.
416 */
417void remove_inode_hash(struct inode *inode)
418{
419 spin_lock(&inode_lock);
420 hlist_del_init(&inode->i_hash);
421 spin_unlock(&inode_lock);
422}
423EXPORT_SYMBOL(remove_inode_hash);
424
292void end_writeback(struct inode *inode) 425void end_writeback(struct inode *inode)
293{ 426{
294 might_sleep(); 427 might_sleep();
@@ -327,101 +460,113 @@ static void evict(struct inode *inode)
327 */ 460 */
328static void dispose_list(struct list_head *head) 461static void dispose_list(struct list_head *head)
329{ 462{
330 int nr_disposed = 0;
331
332 while (!list_empty(head)) { 463 while (!list_empty(head)) {
333 struct inode *inode; 464 struct inode *inode;
334 465
335 inode = list_first_entry(head, struct inode, i_list); 466 inode = list_first_entry(head, struct inode, i_lru);
336 list_del(&inode->i_list); 467 list_del_init(&inode->i_lru);
337 468
338 evict(inode); 469 evict(inode);
339 470
340 spin_lock(&inode_lock); 471 spin_lock(&inode_lock);
341 hlist_del_init(&inode->i_hash); 472 __remove_inode_hash(inode);
342 list_del_init(&inode->i_sb_list); 473 __inode_sb_list_del(inode);
343 spin_unlock(&inode_lock); 474 spin_unlock(&inode_lock);
344 475
345 wake_up_inode(inode); 476 wake_up_inode(inode);
346 destroy_inode(inode); 477 destroy_inode(inode);
347 nr_disposed++;
348 } 478 }
349 spin_lock(&inode_lock);
350 inodes_stat.nr_inodes -= nr_disposed;
351 spin_unlock(&inode_lock);
352} 479}
353 480
354/* 481/**
355 * Invalidate all inodes for a device. 482 * evict_inodes - evict all evictable inodes for a superblock
483 * @sb: superblock to operate on
484 *
485 * Make sure that no inodes with zero refcount are retained. This is
486 * called by superblock shutdown after having MS_ACTIVE flag removed,
487 * so any inode reaching zero refcount during or after that call will
488 * be immediately evicted.
356 */ 489 */
357static int invalidate_list(struct list_head *head, struct list_head *dispose) 490void evict_inodes(struct super_block *sb)
358{ 491{
359 struct list_head *next; 492 struct inode *inode, *next;
360 int busy = 0, count = 0; 493 LIST_HEAD(dispose);
361
362 next = head->next;
363 for (;;) {
364 struct list_head *tmp = next;
365 struct inode *inode;
366 494
367 /* 495 down_write(&iprune_sem);
368 * We can reschedule here without worrying about the list's
369 * consistency because the per-sb list of inodes must not
370 * change during umount anymore, and because iprune_sem keeps
371 * shrink_icache_memory() away.
372 */
373 cond_resched_lock(&inode_lock);
374 496
375 next = next->next; 497 spin_lock(&inode_lock);
376 if (tmp == head) 498 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
377 break; 499 if (atomic_read(&inode->i_count))
378 inode = list_entry(tmp, struct inode, i_sb_list);
379 if (inode->i_state & I_NEW)
380 continue; 500 continue;
381 invalidate_inode_buffers(inode); 501
382 if (!atomic_read(&inode->i_count)) { 502 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
383 list_move(&inode->i_list, dispose); 503 WARN_ON(1);
384 WARN_ON(inode->i_state & I_NEW);
385 inode->i_state |= I_FREEING;
386 count++;
387 continue; 504 continue;
388 } 505 }
389 busy = 1; 506
507 inode->i_state |= I_FREEING;
508
509 /*
510 * Move the inode off the IO lists and LRU once I_FREEING is
511 * set so that it won't get moved back on there if it is dirty.
512 */
513 list_move(&inode->i_lru, &dispose);
514 list_del_init(&inode->i_wb_list);
515 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
516 percpu_counter_dec(&nr_inodes_unused);
390 } 517 }
391 /* only unused inodes may be cached with i_count zero */ 518 spin_unlock(&inode_lock);
392 inodes_stat.nr_unused -= count; 519
393 return busy; 520 dispose_list(&dispose);
521 up_write(&iprune_sem);
394} 522}
395 523
396/** 524/**
397 * invalidate_inodes - discard the inodes on a device 525 * invalidate_inodes - attempt to free all inodes on a superblock
398 * @sb: superblock 526 * @sb: superblock to operate on
399 * 527 *
400 * Discard all of the inodes for a given superblock. If the discard 528 * Attempts to free all inodes for a given superblock. If there were any
401 * fails because there are busy inodes then a non zero value is returned. 529 * busy inodes return a non-zero value, else zero.
402 * If the discard is successful all the inodes have been discarded.
403 */ 530 */
404int invalidate_inodes(struct super_block *sb) 531int invalidate_inodes(struct super_block *sb)
405{ 532{
406 int busy; 533 int busy = 0;
407 LIST_HEAD(throw_away); 534 struct inode *inode, *next;
535 LIST_HEAD(dispose);
408 536
409 down_write(&iprune_sem); 537 down_write(&iprune_sem);
538
410 spin_lock(&inode_lock); 539 spin_lock(&inode_lock);
411 fsnotify_unmount_inodes(&sb->s_inodes); 540 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
412 busy = invalidate_list(&sb->s_inodes, &throw_away); 541 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
542 continue;
543 if (atomic_read(&inode->i_count)) {
544 busy = 1;
545 continue;
546 }
547
548 inode->i_state |= I_FREEING;
549
550 /*
551 * Move the inode off the IO lists and LRU once I_FREEING is
552 * set so that it won't get moved back on there if it is dirty.
553 */
554 list_move(&inode->i_lru, &dispose);
555 list_del_init(&inode->i_wb_list);
556 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
557 percpu_counter_dec(&nr_inodes_unused);
558 }
413 spin_unlock(&inode_lock); 559 spin_unlock(&inode_lock);
414 560
415 dispose_list(&throw_away); 561 dispose_list(&dispose);
416 up_write(&iprune_sem); 562 up_write(&iprune_sem);
417 563
418 return busy; 564 return busy;
419} 565}
420EXPORT_SYMBOL(invalidate_inodes);
421 566
422static int can_unuse(struct inode *inode) 567static int can_unuse(struct inode *inode)
423{ 568{
424 if (inode->i_state) 569 if (inode->i_state & ~I_REFERENCED)
425 return 0; 570 return 0;
426 if (inode_has_buffers(inode)) 571 if (inode_has_buffers(inode))
427 return 0; 572 return 0;
@@ -433,22 +578,24 @@ static int can_unuse(struct inode *inode)
433} 578}
434 579
435/* 580/*
436 * Scan `goal' inodes on the unused list for freeable ones. They are moved to 581 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
437 * a temporary list and then are freed outside inode_lock by dispose_list(). 582 * temporary list and then are freed outside inode_lock by dispose_list().
438 * 583 *
439 * Any inodes which are pinned purely because of attached pagecache have their 584 * Any inodes which are pinned purely because of attached pagecache have their
440 * pagecache removed. We expect the final iput() on that inode to add it to 585 * pagecache removed. If the inode has metadata buffers attached to
441 * the front of the inode_unused list. So look for it there and if the 586 * mapping->private_list then try to remove them.
442 * inode is still freeable, proceed. The right inode is found 99.9% of the
443 * time in testing on a 4-way.
444 * 587 *
445 * If the inode has metadata buffers attached to mapping->private_list then 588 * If the inode has the I_REFERENCED flag set, then it means that it has been
446 * try to remove them. 589 * used recently - the flag is set in iput_final(). When we encounter such an
590 * inode, clear the flag and move it to the back of the LRU so it gets another
591 * pass through the LRU before it gets reclaimed. This is necessary because of
592 * the fact we are doing lazy LRU updates to minimise lock contention so the
593 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
594 * with this flag set because they are the inodes that are out of order.
447 */ 595 */
448static void prune_icache(int nr_to_scan) 596static void prune_icache(int nr_to_scan)
449{ 597{
450 LIST_HEAD(freeable); 598 LIST_HEAD(freeable);
451 int nr_pruned = 0;
452 int nr_scanned; 599 int nr_scanned;
453 unsigned long reap = 0; 600 unsigned long reap = 0;
454 601
@@ -457,13 +604,26 @@ static void prune_icache(int nr_to_scan)
457 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 604 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
458 struct inode *inode; 605 struct inode *inode;
459 606
460 if (list_empty(&inode_unused)) 607 if (list_empty(&inode_lru))
461 break; 608 break;
462 609
463 inode = list_entry(inode_unused.prev, struct inode, i_list); 610 inode = list_entry(inode_lru.prev, struct inode, i_lru);
464 611
465 if (inode->i_state || atomic_read(&inode->i_count)) { 612 /*
466 list_move(&inode->i_list, &inode_unused); 613 * Referenced or dirty inodes are still in use. Give them
614 * another pass through the LRU as we cannot reclaim them now.
615 */
616 if (atomic_read(&inode->i_count) ||
617 (inode->i_state & ~I_REFERENCED)) {
618 list_del_init(&inode->i_lru);
619 percpu_counter_dec(&nr_inodes_unused);
620 continue;
621 }
622
623 /* recently referenced inodes get one more pass */
624 if (inode->i_state & I_REFERENCED) {
625 list_move(&inode->i_lru, &inode_lru);
626 inode->i_state &= ~I_REFERENCED;
467 continue; 627 continue;
468 } 628 }
469 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 629 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -475,18 +635,23 @@ static void prune_icache(int nr_to_scan)
475 iput(inode); 635 iput(inode);
476 spin_lock(&inode_lock); 636 spin_lock(&inode_lock);
477 637
478 if (inode != list_entry(inode_unused.next, 638 if (inode != list_entry(inode_lru.next,
479 struct inode, i_list)) 639 struct inode, i_lru))
480 continue; /* wrong inode or list_empty */ 640 continue; /* wrong inode or list_empty */
481 if (!can_unuse(inode)) 641 if (!can_unuse(inode))
482 continue; 642 continue;
483 } 643 }
484 list_move(&inode->i_list, &freeable);
485 WARN_ON(inode->i_state & I_NEW); 644 WARN_ON(inode->i_state & I_NEW);
486 inode->i_state |= I_FREEING; 645 inode->i_state |= I_FREEING;
487 nr_pruned++; 646
647 /*
648 * Move the inode off the IO lists and LRU once I_FREEING is
649 * set so that it won't get moved back on there if it is dirty.
650 */
651 list_move(&inode->i_lru, &freeable);
652 list_del_init(&inode->i_wb_list);
653 percpu_counter_dec(&nr_inodes_unused);
488 } 654 }
489 inodes_stat.nr_unused -= nr_pruned;
490 if (current_is_kswapd()) 655 if (current_is_kswapd())
491 __count_vm_events(KSWAPD_INODESTEAL, reap); 656 __count_vm_events(KSWAPD_INODESTEAL, reap);
492 else 657 else
@@ -518,7 +683,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
518 return -1; 683 return -1;
519 prune_icache(nr); 684 prune_icache(nr);
520 } 685 }
521 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 686 return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
522} 687}
523 688
524static struct shrinker icache_shrinker = { 689static struct shrinker icache_shrinker = {
@@ -529,9 +694,6 @@ static struct shrinker icache_shrinker = {
529static void __wait_on_freeing_inode(struct inode *inode); 694static void __wait_on_freeing_inode(struct inode *inode);
530/* 695/*
531 * Called with the inode lock held. 696 * Called with the inode lock held.
532 * NOTE: we are not increasing the inode-refcount, you must call __iget()
533 * by hand after calling find_inode now! This simplifies iunique and won't
534 * add any additional branch in the common code.
535 */ 697 */
536static struct inode *find_inode(struct super_block *sb, 698static struct inode *find_inode(struct super_block *sb,
537 struct hlist_head *head, 699 struct hlist_head *head,
@@ -551,9 +713,10 @@ repeat:
551 __wait_on_freeing_inode(inode); 713 __wait_on_freeing_inode(inode);
552 goto repeat; 714 goto repeat;
553 } 715 }
554 break; 716 __iget(inode);
717 return inode;
555 } 718 }
556 return node ? inode : NULL; 719 return NULL;
557} 720}
558 721
559/* 722/*
@@ -576,53 +739,49 @@ repeat:
576 __wait_on_freeing_inode(inode); 739 __wait_on_freeing_inode(inode);
577 goto repeat; 740 goto repeat;
578 } 741 }
579 break; 742 __iget(inode);
743 return inode;
580 } 744 }
581 return node ? inode : NULL; 745 return NULL;
582}
583
584static unsigned long hash(struct super_block *sb, unsigned long hashval)
585{
586 unsigned long tmp;
587
588 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
589 L1_CACHE_BYTES;
590 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
591 return tmp & I_HASHMASK;
592}
593
594static inline void
595__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
596 struct inode *inode)
597{
598 inodes_stat.nr_inodes++;
599 list_add(&inode->i_list, &inode_in_use);
600 list_add(&inode->i_sb_list, &sb->s_inodes);
601 if (head)
602 hlist_add_head(&inode->i_hash, head);
603} 746}
604 747
605/** 748/*
606 * inode_add_to_lists - add a new inode to relevant lists 749 * Each cpu owns a range of LAST_INO_BATCH numbers.
607 * @sb: superblock inode belongs to 750 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
608 * @inode: inode to mark in use 751 * to renew the exhausted range.
609 * 752 *
610 * When an inode is allocated it needs to be accounted for, added to the in use 753 * This does not significantly increase overflow rate because every CPU can
611 * list, the owning superblock and the inode hash. This needs to be done under 754 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
612 * the inode_lock, so export a function to do this rather than the inode lock 755 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
613 * itself. We calculate the hash list to add to here so it is all internal 756 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
614 * which requires the caller to have already set up the inode number in the 757 * overflow rate by 2x, which does not seem too significant.
615 * inode to add. 758 *
759 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
760 * error if st_ino won't fit in target struct field. Use 32bit counter
761 * here to attempt to avoid that.
616 */ 762 */
617void inode_add_to_lists(struct super_block *sb, struct inode *inode) 763#define LAST_INO_BATCH 1024
764static DEFINE_PER_CPU(unsigned int, last_ino);
765
766unsigned int get_next_ino(void)
618{ 767{
619 struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino); 768 unsigned int *p = &get_cpu_var(last_ino);
769 unsigned int res = *p;
620 770
621 spin_lock(&inode_lock); 771#ifdef CONFIG_SMP
622 __inode_add_to_lists(sb, head, inode); 772 if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
623 spin_unlock(&inode_lock); 773 static atomic_t shared_last_ino;
774 int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
775
776 res = next - LAST_INO_BATCH;
777 }
778#endif
779
780 *p = ++res;
781 put_cpu_var(last_ino);
782 return res;
624} 783}
625EXPORT_SYMBOL_GPL(inode_add_to_lists); 784EXPORT_SYMBOL(get_next_ino);
626 785
627/** 786/**
628 * new_inode - obtain an inode 787 * new_inode - obtain an inode
@@ -638,12 +797,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
638 */ 797 */
639struct inode *new_inode(struct super_block *sb) 798struct inode *new_inode(struct super_block *sb)
640{ 799{
641 /*
642 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
643 * error if st_ino won't fit in target struct field. Use 32bit counter
644 * here to attempt to avoid that.
645 */
646 static unsigned int last_ino;
647 struct inode *inode; 800 struct inode *inode;
648 801
649 spin_lock_prefetch(&inode_lock); 802 spin_lock_prefetch(&inode_lock);
@@ -651,8 +804,7 @@ struct inode *new_inode(struct super_block *sb)
651 inode = alloc_inode(sb); 804 inode = alloc_inode(sb);
652 if (inode) { 805 if (inode) {
653 spin_lock(&inode_lock); 806 spin_lock(&inode_lock);
654 __inode_add_to_lists(sb, NULL, inode); 807 __inode_sb_list_add(inode);
655 inode->i_ino = ++last_ino;
656 inode->i_state = 0; 808 inode->i_state = 0;
657 spin_unlock(&inode_lock); 809 spin_unlock(&inode_lock);
658 } 810 }
@@ -663,7 +815,7 @@ EXPORT_SYMBOL(new_inode);
663void unlock_new_inode(struct inode *inode) 815void unlock_new_inode(struct inode *inode)
664{ 816{
665#ifdef CONFIG_DEBUG_LOCK_ALLOC 817#ifdef CONFIG_DEBUG_LOCK_ALLOC
666 if (inode->i_mode & S_IFDIR) { 818 if (S_ISDIR(inode->i_mode)) {
667 struct file_system_type *type = inode->i_sb->s_type; 819 struct file_system_type *type = inode->i_sb->s_type;
668 820
669 /* Set new key only if filesystem hasn't already changed it */ 821 /* Set new key only if filesystem hasn't already changed it */
@@ -720,7 +872,8 @@ static struct inode *get_new_inode(struct super_block *sb,
720 if (set(inode, data)) 872 if (set(inode, data))
721 goto set_failed; 873 goto set_failed;
722 874
723 __inode_add_to_lists(sb, head, inode); 875 hlist_add_head(&inode->i_hash, head);
876 __inode_sb_list_add(inode);
724 inode->i_state = I_NEW; 877 inode->i_state = I_NEW;
725 spin_unlock(&inode_lock); 878 spin_unlock(&inode_lock);
726 879
@@ -735,7 +888,6 @@ static struct inode *get_new_inode(struct super_block *sb,
735 * us. Use the old inode instead of the one we just 888 * us. Use the old inode instead of the one we just
736 * allocated. 889 * allocated.
737 */ 890 */
738 __iget(old);
739 spin_unlock(&inode_lock); 891 spin_unlock(&inode_lock);
740 destroy_inode(inode); 892 destroy_inode(inode);
741 inode = old; 893 inode = old;
@@ -767,7 +919,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
767 old = find_inode_fast(sb, head, ino); 919 old = find_inode_fast(sb, head, ino);
768 if (!old) { 920 if (!old) {
769 inode->i_ino = ino; 921 inode->i_ino = ino;
770 __inode_add_to_lists(sb, head, inode); 922 hlist_add_head(&inode->i_hash, head);
923 __inode_sb_list_add(inode);
771 inode->i_state = I_NEW; 924 inode->i_state = I_NEW;
772 spin_unlock(&inode_lock); 925 spin_unlock(&inode_lock);
773 926
@@ -782,7 +935,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
782 * us. Use the old inode instead of the one we just 935 * us. Use the old inode instead of the one we just
783 * allocated. 936 * allocated.
784 */ 937 */
785 __iget(old);
786 spin_unlock(&inode_lock); 938 spin_unlock(&inode_lock);
787 destroy_inode(inode); 939 destroy_inode(inode);
788 inode = old; 940 inode = old;
@@ -791,6 +943,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
791 return inode; 943 return inode;
792} 944}
793 945
946/*
947 * search the inode cache for a matching inode number.
948 * If we find one, then the inode number we are trying to
949 * allocate is not unique and so we should not use it.
950 *
951 * Returns 1 if the inode number is unique, 0 if it is not.
952 */
953static int test_inode_iunique(struct super_block *sb, unsigned long ino)
954{
955 struct hlist_head *b = inode_hashtable + hash(sb, ino);
956 struct hlist_node *node;
957 struct inode *inode;
958
959 hlist_for_each_entry(inode, node, b, i_hash) {
960 if (inode->i_ino == ino && inode->i_sb == sb)
961 return 0;
962 }
963
964 return 1;
965}
966
794/** 967/**
795 * iunique - get a unique inode number 968 * iunique - get a unique inode number
796 * @sb: superblock 969 * @sb: superblock
@@ -812,19 +985,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
812 * error if st_ino won't fit in target struct field. Use 32bit counter 985 * error if st_ino won't fit in target struct field. Use 32bit counter
813 * here to attempt to avoid that. 986 * here to attempt to avoid that.
814 */ 987 */
988 static DEFINE_SPINLOCK(iunique_lock);
815 static unsigned int counter; 989 static unsigned int counter;
816 struct inode *inode;
817 struct hlist_head *head;
818 ino_t res; 990 ino_t res;
819 991
820 spin_lock(&inode_lock); 992 spin_lock(&inode_lock);
993 spin_lock(&iunique_lock);
821 do { 994 do {
822 if (counter <= max_reserved) 995 if (counter <= max_reserved)
823 counter = max_reserved + 1; 996 counter = max_reserved + 1;
824 res = counter++; 997 res = counter++;
825 head = inode_hashtable + hash(sb, res); 998 } while (!test_inode_iunique(sb, res));
826 inode = find_inode_fast(sb, head, res); 999 spin_unlock(&iunique_lock);
827 } while (inode != NULL);
828 spin_unlock(&inode_lock); 1000 spin_unlock(&inode_lock);
829 1001
830 return res; 1002 return res;
@@ -876,7 +1048,6 @@ static struct inode *ifind(struct super_block *sb,
876 spin_lock(&inode_lock); 1048 spin_lock(&inode_lock);
877 inode = find_inode(sb, head, test, data); 1049 inode = find_inode(sb, head, test, data);
878 if (inode) { 1050 if (inode) {
879 __iget(inode);
880 spin_unlock(&inode_lock); 1051 spin_unlock(&inode_lock);
881 if (likely(wait)) 1052 if (likely(wait))
882 wait_on_inode(inode); 1053 wait_on_inode(inode);
@@ -909,7 +1080,6 @@ static struct inode *ifind_fast(struct super_block *sb,
909 spin_lock(&inode_lock); 1080 spin_lock(&inode_lock);
910 inode = find_inode_fast(sb, head, ino); 1081 inode = find_inode_fast(sb, head, ino);
911 if (inode) { 1082 if (inode) {
912 __iget(inode);
913 spin_unlock(&inode_lock); 1083 spin_unlock(&inode_lock);
914 wait_on_inode(inode); 1084 wait_on_inode(inode);
915 return inode; 1085 return inode;
@@ -1095,7 +1265,7 @@ int insert_inode_locked(struct inode *inode)
1095 __iget(old); 1265 __iget(old);
1096 spin_unlock(&inode_lock); 1266 spin_unlock(&inode_lock);
1097 wait_on_inode(old); 1267 wait_on_inode(old);
1098 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1268 if (unlikely(!inode_unhashed(old))) {
1099 iput(old); 1269 iput(old);
1100 return -EBUSY; 1270 return -EBUSY;
1101 } 1271 }
@@ -1134,7 +1304,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1134 __iget(old); 1304 __iget(old);
1135 spin_unlock(&inode_lock); 1305 spin_unlock(&inode_lock);
1136 wait_on_inode(old); 1306 wait_on_inode(old);
1137 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1307 if (unlikely(!inode_unhashed(old))) {
1138 iput(old); 1308 iput(old);
1139 return -EBUSY; 1309 return -EBUSY;
1140 } 1310 }
@@ -1143,36 +1313,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1143} 1313}
1144EXPORT_SYMBOL(insert_inode_locked4); 1314EXPORT_SYMBOL(insert_inode_locked4);
1145 1315
1146/**
1147 * __insert_inode_hash - hash an inode
1148 * @inode: unhashed inode
1149 * @hashval: unsigned long value used to locate this object in the
1150 * inode_hashtable.
1151 *
1152 * Add an inode to the inode hash for this superblock.
1153 */
1154void __insert_inode_hash(struct inode *inode, unsigned long hashval)
1155{
1156 struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
1157 spin_lock(&inode_lock);
1158 hlist_add_head(&inode->i_hash, head);
1159 spin_unlock(&inode_lock);
1160}
1161EXPORT_SYMBOL(__insert_inode_hash);
1162
1163/**
1164 * remove_inode_hash - remove an inode from the hash
1165 * @inode: inode to unhash
1166 *
1167 * Remove an inode from the superblock.
1168 */
1169void remove_inode_hash(struct inode *inode)
1170{
1171 spin_lock(&inode_lock);
1172 hlist_del_init(&inode->i_hash);
1173 spin_unlock(&inode_lock);
1174}
1175EXPORT_SYMBOL(remove_inode_hash);
1176 1316
1177int generic_delete_inode(struct inode *inode) 1317int generic_delete_inode(struct inode *inode)
1178{ 1318{
@@ -1187,7 +1327,7 @@ EXPORT_SYMBOL(generic_delete_inode);
1187 */ 1327 */
1188int generic_drop_inode(struct inode *inode) 1328int generic_drop_inode(struct inode *inode)
1189{ 1329{
1190 return !inode->i_nlink || hlist_unhashed(&inode->i_hash); 1330 return !inode->i_nlink || inode_unhashed(inode);
1191} 1331}
1192EXPORT_SYMBOL_GPL(generic_drop_inode); 1332EXPORT_SYMBOL_GPL(generic_drop_inode);
1193 1333
@@ -1213,10 +1353,11 @@ static void iput_final(struct inode *inode)
1213 drop = generic_drop_inode(inode); 1353 drop = generic_drop_inode(inode);
1214 1354
1215 if (!drop) { 1355 if (!drop) {
1216 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1217 list_move(&inode->i_list, &inode_unused);
1218 inodes_stat.nr_unused++;
1219 if (sb->s_flags & MS_ACTIVE) { 1356 if (sb->s_flags & MS_ACTIVE) {
1357 inode->i_state |= I_REFERENCED;
1358 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1359 inode_lru_list_add(inode);
1360 }
1220 spin_unlock(&inode_lock); 1361 spin_unlock(&inode_lock);
1221 return; 1362 return;
1222 } 1363 }
@@ -1227,19 +1368,23 @@ static void iput_final(struct inode *inode)
1227 spin_lock(&inode_lock); 1368 spin_lock(&inode_lock);
1228 WARN_ON(inode->i_state & I_NEW); 1369 WARN_ON(inode->i_state & I_NEW);
1229 inode->i_state &= ~I_WILL_FREE; 1370 inode->i_state &= ~I_WILL_FREE;
1230 inodes_stat.nr_unused--; 1371 __remove_inode_hash(inode);
1231 hlist_del_init(&inode->i_hash);
1232 } 1372 }
1233 list_del_init(&inode->i_list); 1373
1234 list_del_init(&inode->i_sb_list);
1235 WARN_ON(inode->i_state & I_NEW); 1374 WARN_ON(inode->i_state & I_NEW);
1236 inode->i_state |= I_FREEING; 1375 inode->i_state |= I_FREEING;
1237 inodes_stat.nr_inodes--; 1376
1377 /*
1378 * Move the inode off the IO lists and LRU once I_FREEING is
1379 * set so that it won't get moved back on there if it is dirty.
1380 */
1381 inode_lru_list_del(inode);
1382 list_del_init(&inode->i_wb_list);
1383
1384 __inode_sb_list_del(inode);
1238 spin_unlock(&inode_lock); 1385 spin_unlock(&inode_lock);
1239 evict(inode); 1386 evict(inode);
1240 spin_lock(&inode_lock); 1387 remove_inode_hash(inode);
1241 hlist_del_init(&inode->i_hash);
1242 spin_unlock(&inode_lock);
1243 wake_up_inode(inode); 1388 wake_up_inode(inode);
1244 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 1389 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1245 destroy_inode(inode); 1390 destroy_inode(inode);
@@ -1503,6 +1648,8 @@ void __init inode_init(void)
1503 SLAB_MEM_SPREAD), 1648 SLAB_MEM_SPREAD),
1504 init_once); 1649 init_once);
1505 register_shrinker(&icache_shrinker); 1650 register_shrinker(&icache_shrinker);
1651 percpu_counter_init(&nr_inodes, 0);
1652 percpu_counter_init(&nr_inodes_unused, 0);
1506 1653
1507 /* Hash may have been set up in inode_init_early */ 1654 /* Hash may have been set up in inode_init_early */
1508 if (!hashdist) 1655 if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index a6910e91cee8..e43b9a4dbf4e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,3 +101,10 @@ extern void put_super(struct super_block *sb);
101struct nameidata; 101struct nameidata;
102extern struct file *nameidata_to_filp(struct nameidata *); 102extern struct file *nameidata_to_filp(struct nameidata *);
103extern void release_open_intent(struct nameidata *); 103extern void release_open_intent(struct nameidata *);
104
105/*
106 * inode.c
107 */
108extern int get_nr_dirty_inodes(void);
109extern void evict_inodes(struct super_block *);
110extern int invalidate_inodes(struct super_block *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f855ea4fc888..e92fdbb3bc3a 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -530,6 +530,41 @@ static int ioctl_fsthaw(struct file *filp)
530 return thaw_super(sb); 530 return thaw_super(sb);
531} 531}
532 532
533static int ioctl_fstrim(struct file *filp, void __user *argp)
534{
535 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
536 struct fstrim_range range;
537 int ret = 0;
538
539 if (!capable(CAP_SYS_ADMIN))
540 return -EPERM;
541
542 /* If filesystem doesn't support trim feature, return. */
543 if (sb->s_op->trim_fs == NULL)
544 return -EOPNOTSUPP;
545
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 if (argp == NULL) {
551 range.start = 0;
552 range.len = ULLONG_MAX;
553 range.minlen = 0;
554 } else if (copy_from_user(&range, argp, sizeof(range)))
555 return -EFAULT;
556
557 ret = sb->s_op->trim_fs(sb, &range);
558 if (ret < 0)
559 return ret;
560
561 if ((argp != NULL) &&
562 (copy_to_user(argp, &range, sizeof(range))))
563 return -EFAULT;
564
565 return 0;
566}
567
533/* 568/*
534 * When you add any new common ioctls to the switches above and below 569 * When you add any new common ioctls to the switches above and below
535 * please update compat_sys_ioctl() too. 570 * please update compat_sys_ioctl() too.
@@ -580,6 +615,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
580 error = ioctl_fsthaw(filp); 615 error = ioctl_fsthaw(filp);
581 break; 616 break;
582 617
618 case FITRIM:
619 error = ioctl_fstrim(filp, argp);
620 break;
621
583 case FS_IOC_FIEMAP: 622 case FS_IOC_FIEMAP:
584 return ioctl_fiemap(filp, arg); 623 return ioctl_fiemap(filp, arg);
585 624
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc6..2f7d05c89922 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -111,12 +111,14 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
111 read_lock(&tasklist_lock); 111 read_lock(&tasklist_lock);
112 switch (which) { 112 switch (which) {
113 case IOPRIO_WHO_PROCESS: 113 case IOPRIO_WHO_PROCESS:
114 rcu_read_lock();
114 if (!who) 115 if (!who)
115 p = current; 116 p = current;
116 else 117 else
117 p = find_task_by_vpid(who); 118 p = find_task_by_vpid(who);
118 if (p) 119 if (p)
119 ret = set_task_ioprio(p, ioprio); 120 ret = set_task_ioprio(p, ioprio);
121 rcu_read_unlock();
120 break; 122 break;
121 case IOPRIO_WHO_PGRP: 123 case IOPRIO_WHO_PGRP:
122 if (!who) 124 if (!who)
@@ -139,7 +141,12 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
139 break; 141 break;
140 142
141 do_each_thread(g, p) { 143 do_each_thread(g, p) {
142 if (__task_cred(p)->uid != who) 144 int match;
145
146 rcu_read_lock();
147 match = __task_cred(p)->uid == who;
148 rcu_read_unlock();
149 if (!match)
143 continue; 150 continue;
144 ret = set_task_ioprio(p, ioprio); 151 ret = set_task_ioprio(p, ioprio);
145 if (ret) 152 if (ret)
@@ -200,12 +207,14 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
200 read_lock(&tasklist_lock); 207 read_lock(&tasklist_lock);
201 switch (which) { 208 switch (which) {
202 case IOPRIO_WHO_PROCESS: 209 case IOPRIO_WHO_PROCESS:
210 rcu_read_lock();
203 if (!who) 211 if (!who)
204 p = current; 212 p = current;
205 else 213 else
206 p = find_task_by_vpid(who); 214 p = find_task_by_vpid(who);
207 if (p) 215 if (p)
208 ret = get_task_ioprio(p); 216 ret = get_task_ioprio(p);
217 rcu_read_unlock();
209 break; 218 break;
210 case IOPRIO_WHO_PGRP: 219 case IOPRIO_WHO_PGRP:
211 if (!who) 220 if (!who)
@@ -232,7 +241,12 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
232 break; 241 break;
233 242
234 do_each_thread(g, p) { 243 do_each_thread(g, p) {
235 if (__task_cred(p)->uid != user->uid) 244 int match;
245
246 rcu_read_lock();
247 match = __task_cred(p)->uid == user->uid;
248 rcu_read_unlock();
249 if (!match)
236 continue; 250 continue;
237 tmpio = get_task_ioprio(p); 251 tmpio = get_task_ioprio(p);
238 if (tmpio < 0) 252 if (tmpio < 0)
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index e0aca9a0ac68..0542b6eedf80 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
10 * 10 *
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h>
14#include <linux/gfp.h> 13#include <linux/gfp.h>
15#include "isofs.h" 14#include "isofs.h"
16 15
@@ -255,18 +254,19 @@ static int isofs_readdir(struct file *filp,
255 char *tmpname; 254 char *tmpname;
256 struct iso_directory_record *tmpde; 255 struct iso_directory_record *tmpde;
257 struct inode *inode = filp->f_path.dentry->d_inode; 256 struct inode *inode = filp->f_path.dentry->d_inode;
257 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
258 258
259 tmpname = (char *)__get_free_page(GFP_KERNEL); 259 tmpname = (char *)__get_free_page(GFP_KERNEL);
260 if (tmpname == NULL) 260 if (tmpname == NULL)
261 return -ENOMEM; 261 return -ENOMEM;
262 262
263 lock_kernel(); 263 mutex_lock(&sbi->s_mutex);
264 tmpde = (struct iso_directory_record *) (tmpname+1024); 264 tmpde = (struct iso_directory_record *) (tmpname+1024);
265 265
266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); 266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
267 267
268 free_page((unsigned long) tmpname); 268 free_page((unsigned long) tmpname);
269 unlock_kernel(); 269 mutex_unlock(&sbi->s_mutex);
270 return result; 270 return result;
271} 271}
272 272
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 5a44811b5027..bfdeb82a53be 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -17,7 +17,6 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/nls.h> 18#include <linux/nls.h>
19#include <linux/ctype.h> 19#include <linux/ctype.h>
20#include <linux/smp_lock.h>
21#include <linux/statfs.h> 20#include <linux/statfs.h>
22#include <linux/cdrom.h> 21#include <linux/cdrom.h>
23#include <linux/parser.h> 22#include <linux/parser.h>
@@ -44,11 +43,7 @@ static void isofs_put_super(struct super_block *sb)
44 struct isofs_sb_info *sbi = ISOFS_SB(sb); 43 struct isofs_sb_info *sbi = ISOFS_SB(sb);
45 44
46#ifdef CONFIG_JOLIET 45#ifdef CONFIG_JOLIET
47 lock_kernel();
48
49 unload_nls(sbi->s_nls_iocharset); 46 unload_nls(sbi->s_nls_iocharset);
50
51 unlock_kernel();
52#endif 47#endif
53 48
54 kfree(sbi); 49 kfree(sbi);
@@ -549,6 +544,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
549} 544}
550 545
551/* 546/*
547 * Check if root directory is empty (has less than 3 files).
548 *
549 * Used to detect broken CDs where ISO root directory is empty but Joliet root
550 * directory is OK. If such CD has Rock Ridge extensions, they will be disabled
551 * (and Joliet used instead) or else no files would be visible.
552 */
553static bool rootdir_empty(struct super_block *sb, unsigned long block)
554{
555 int offset = 0, files = 0, de_len;
556 struct iso_directory_record *de;
557 struct buffer_head *bh;
558
559 bh = sb_bread(sb, block);
560 if (!bh)
561 return true;
562 while (files < 3) {
563 de = (struct iso_directory_record *) (bh->b_data + offset);
564 de_len = *(unsigned char *) de;
565 if (de_len == 0)
566 break;
567 files++;
568 offset += de_len;
569 }
570 brelse(bh);
571 return files < 3;
572}
573
574/*
552 * Initialize the superblock and read the root inode. 575 * Initialize the superblock and read the root inode.
553 * 576 *
554 * Note: a check_disk_change() has been done immediately prior 577 * Note: a check_disk_change() has been done immediately prior
@@ -823,6 +846,7 @@ root_found:
823 sbi->s_utf8 = opt.utf8; 846 sbi->s_utf8 = opt.utf8;
824 sbi->s_nocompress = opt.nocompress; 847 sbi->s_nocompress = opt.nocompress;
825 sbi->s_overriderockperm = opt.overriderockperm; 848 sbi->s_overriderockperm = opt.overriderockperm;
849 mutex_init(&sbi->s_mutex);
826 /* 850 /*
827 * It would be incredibly stupid to allow people to mark every file 851 * It would be incredibly stupid to allow people to mark every file
828 * on the disk as suid, so we merely allow them to set the default 852 * on the disk as suid, so we merely allow them to set the default
@@ -847,6 +871,18 @@ root_found:
847 goto out_no_root; 871 goto out_no_root;
848 872
849 /* 873 /*
874 * Fix for broken CDs with Rock Ridge and empty ISO root directory but
875 * correct Joliet root directory.
876 */
877 if (sbi->s_rock == 1 && joliet_level &&
878 rootdir_empty(s, sbi->s_firstdatazone)) {
879 printk(KERN_NOTICE
880 "ISOFS: primary root directory is empty. "
881 "Disabling Rock Ridge and switching to Joliet.");
882 sbi->s_rock = 0;
883 }
884
885 /*
850 * If this disk has both Rock Ridge and Joliet on it, then we 886 * If this disk has both Rock Ridge and Joliet on it, then we
851 * want to use Rock Ridge by default. This can be overridden 887 * want to use Rock Ridge by default. This can be overridden
852 * by using the norock mount option. There is still one other 888 * by using the norock mount option. There is still one other
@@ -966,27 +1002,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
966 * or getblk() if they are not. Returns the number of blocks inserted 1002 * or getblk() if they are not. Returns the number of blocks inserted
967 * (-ve == error.) 1003 * (-ve == error.)
968 */ 1004 */
969int isofs_get_blocks(struct inode *inode, sector_t iblock_s, 1005int isofs_get_blocks(struct inode *inode, sector_t iblock,
970 struct buffer_head **bh, unsigned long nblocks) 1006 struct buffer_head **bh, unsigned long nblocks)
971{ 1007{
972 unsigned long b_off; 1008 unsigned long b_off = iblock;
973 unsigned offset, sect_size; 1009 unsigned offset, sect_size;
974 unsigned int firstext; 1010 unsigned int firstext;
975 unsigned long nextblk, nextoff; 1011 unsigned long nextblk, nextoff;
976 long iblock = (long)iblock_s;
977 int section, rv, error; 1012 int section, rv, error;
978 struct iso_inode_info *ei = ISOFS_I(inode); 1013 struct iso_inode_info *ei = ISOFS_I(inode);
979 1014
980 lock_kernel();
981
982 error = -EIO; 1015 error = -EIO;
983 rv = 0; 1016 rv = 0;
984 if (iblock < 0 || iblock != iblock_s) { 1017 if (iblock != b_off) {
985 printk(KERN_DEBUG "%s: block number too large\n", __func__); 1018 printk(KERN_DEBUG "%s: block number too large\n", __func__);
986 goto abort; 1019 goto abort;
987 } 1020 }
988 1021
989 b_off = iblock;
990 1022
991 offset = 0; 1023 offset = 0;
992 firstext = ei->i_first_extent; 1024 firstext = ei->i_first_extent;
@@ -1004,8 +1036,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1004 * I/O errors. 1036 * I/O errors.
1005 */ 1037 */
1006 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { 1038 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
1007 printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n", 1039 printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
1008 __func__, iblock, (unsigned long) inode->i_size); 1040 __func__, b_off,
1041 (unsigned long long)inode->i_size);
1009 goto abort; 1042 goto abort;
1010 } 1043 }
1011 1044
@@ -1031,9 +1064,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1031 if (++section > 100) { 1064 if (++section > 100) {
1032 printk(KERN_DEBUG "%s: More than 100 file sections ?!?" 1065 printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
1033 " aborting...\n", __func__); 1066 " aborting...\n", __func__);
1034 printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u " 1067 printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
1035 "nextblk=%lu nextoff=%lu\n", __func__, 1068 "nextblk=%lu nextoff=%lu\n", __func__,
1036 iblock, firstext, (unsigned) sect_size, 1069 b_off, firstext, (unsigned) sect_size,
1037 nextblk, nextoff); 1070 nextblk, nextoff);
1038 goto abort; 1071 goto abort;
1039 } 1072 }
@@ -1054,7 +1087,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1054 1087
1055 error = 0; 1088 error = 0;
1056abort: 1089abort:
1057 unlock_kernel();
1058 return rv != 0 ? rv : error; 1090 return rv != 0 ? rv : error;
1059} 1091}
1060 1092
@@ -1475,17 +1507,16 @@ struct inode *isofs_iget(struct super_block *sb,
1475 return inode; 1507 return inode;
1476} 1508}
1477 1509
1478static int isofs_get_sb(struct file_system_type *fs_type, 1510static struct dentry *isofs_mount(struct file_system_type *fs_type,
1479 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1511 int flags, const char *dev_name, void *data)
1480{ 1512{
1481 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, 1513 return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
1482 mnt);
1483} 1514}
1484 1515
1485static struct file_system_type iso9660_fs_type = { 1516static struct file_system_type iso9660_fs_type = {
1486 .owner = THIS_MODULE, 1517 .owner = THIS_MODULE,
1487 .name = "iso9660", 1518 .name = "iso9660",
1488 .get_sb = isofs_get_sb, 1519 .mount = isofs_mount,
1489 .kill_sb = kill_block_super, 1520 .kill_sb = kill_block_super,
1490 .fs_flags = FS_REQUIRES_DEV, 1521 .fs_flags = FS_REQUIRES_DEV,
1491}; 1522};
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 7d33de84f52a..2882dc089f87 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -55,6 +55,7 @@ struct isofs_sb_info {
55 gid_t s_gid; 55 gid_t s_gid;
56 uid_t s_uid; 56 uid_t s_uid;
57 struct nls_table *s_nls_iocharset; /* Native language support table */ 57 struct nls_table *s_nls_iocharset; /* Native language support table */
58 struct mutex s_mutex; /* replaces BKL, please remove if possible */
58}; 59};
59 60
60#define ISOFS_INVALID_MODE ((mode_t) -1) 61#define ISOFS_INVALID_MODE ((mode_t) -1)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index ab438beb867c..0d23abfd4280 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
6 * (C) 1991 Linus Torvalds - minix filesystem 6 * (C) 1991 Linus Torvalds - minix filesystem
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/gfp.h> 9#include <linux/gfp.h>
11#include "isofs.h" 10#include "isofs.h"
12 11
@@ -168,6 +167,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
168 int found; 167 int found;
169 unsigned long uninitialized_var(block); 168 unsigned long uninitialized_var(block);
170 unsigned long uninitialized_var(offset); 169 unsigned long uninitialized_var(offset);
170 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
171 struct inode *inode; 171 struct inode *inode;
172 struct page *page; 172 struct page *page;
173 173
@@ -177,7 +177,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
177 if (!page) 177 if (!page)
178 return ERR_PTR(-ENOMEM); 178 return ERR_PTR(-ENOMEM);
179 179
180 lock_kernel(); 180 mutex_lock(&sbi->s_mutex);
181 found = isofs_find_entry(dir, dentry, 181 found = isofs_find_entry(dir, dentry,
182 &block, &offset, 182 &block, &offset,
183 page_address(page), 183 page_address(page),
@@ -188,10 +188,10 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
188 if (found) { 188 if (found) {
189 inode = isofs_iget(dir->i_sb, block, offset); 189 inode = isofs_iget(dir->i_sb, block, offset);
190 if (IS_ERR(inode)) { 190 if (IS_ERR(inode)) {
191 unlock_kernel(); 191 mutex_unlock(&sbi->s_mutex);
192 return ERR_CAST(inode); 192 return ERR_CAST(inode);
193 } 193 }
194 } 194 }
195 unlock_kernel(); 195 mutex_unlock(&sbi->s_mutex);
196 return d_splice_alias(inode, dentry); 196 return d_splice_alias(inode, dentry);
197} 197}
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 96a685c550fd..f9cd04db6eab 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -8,7 +8,6 @@
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/smp_lock.h>
12 11
13#include "isofs.h" 12#include "isofs.h"
14#include "rock.h" 13#include "rock.h"
@@ -661,6 +660,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
661{ 660{
662 struct inode *inode = page->mapping->host; 661 struct inode *inode = page->mapping->host;
663 struct iso_inode_info *ei = ISOFS_I(inode); 662 struct iso_inode_info *ei = ISOFS_I(inode);
663 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
664 char *link = kmap(page); 664 char *link = kmap(page);
665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
666 struct buffer_head *bh; 666 struct buffer_head *bh;
@@ -673,12 +673,12 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
673 struct rock_state rs; 673 struct rock_state rs;
674 int ret; 674 int ret;
675 675
676 if (!ISOFS_SB(inode->i_sb)->s_rock) 676 if (!sbi->s_rock)
677 goto error; 677 goto error;
678 678
679 init_rock_state(&rs, inode); 679 init_rock_state(&rs, inode);
680 block = ei->i_iget5_block; 680 block = ei->i_iget5_block;
681 lock_kernel(); 681 mutex_lock(&sbi->s_mutex);
682 bh = sb_bread(inode->i_sb, block); 682 bh = sb_bread(inode->i_sb, block);
683 if (!bh) 683 if (!bh)
684 goto out_noread; 684 goto out_noread;
@@ -748,7 +748,7 @@ repeat:
748 goto fail; 748 goto fail;
749 brelse(bh); 749 brelse(bh);
750 *rpnt = '\0'; 750 *rpnt = '\0';
751 unlock_kernel(); 751 mutex_unlock(&sbi->s_mutex);
752 SetPageUptodate(page); 752 SetPageUptodate(page);
753 kunmap(page); 753 kunmap(page);
754 unlock_page(page); 754 unlock_page(page);
@@ -765,7 +765,7 @@ out_bad_span:
765 printk("symlink spans iso9660 blocks\n"); 765 printk("symlink spans iso9660 blocks\n");
766fail: 766fail:
767 brelse(bh); 767 brelse(bh);
768 unlock_kernel(); 768 mutex_unlock(&sbi->s_mutex);
769error: 769error:
770 SetPageError(page); 770 SetPageError(page);
771 kunmap(page); 771 kunmap(page);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 05a38b9c4c0e..e4b87bc1fa56 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -221,7 +221,7 @@ restart:
221 goto restart; 221 goto restart;
222 } 222 }
223 if (buffer_locked(bh)) { 223 if (buffer_locked(bh)) {
224 atomic_inc(&bh->b_count); 224 get_bh(bh);
225 spin_unlock(&journal->j_list_lock); 225 spin_unlock(&journal->j_list_lock);
226 jbd_unlock_bh_state(bh); 226 jbd_unlock_bh_state(bh);
227 wait_on_buffer(bh); 227 wait_on_buffer(bh);
@@ -283,7 +283,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
283 int ret = 0; 283 int ret = 0;
284 284
285 if (buffer_locked(bh)) { 285 if (buffer_locked(bh)) {
286 atomic_inc(&bh->b_count); 286 get_bh(bh);
287 spin_unlock(&journal->j_list_lock); 287 spin_unlock(&journal->j_list_lock);
288 jbd_unlock_bh_state(bh); 288 jbd_unlock_bh_state(bh);
289 wait_on_buffer(bh); 289 wait_on_buffer(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..34a4861c14b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -137,34 +137,10 @@ static int journal_write_commit_record(journal_t *journal,
137 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "write commit block");
138 set_buffer_dirty(bh); 138 set_buffer_dirty(bh);
139 139
140 if (journal->j_flags & JFS_BARRIER) { 140 if (journal->j_flags & JFS_BARRIER)
141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER); 141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
142 142 else
143 /*
144 * Is it possible for another commit to fail at roughly
145 * the same time as this one? If so, we don't want to
146 * trust the barrier flag in the super, but instead want
147 * to remember if we sent a barrier request
148 */
149 if (ret == -EOPNOTSUPP) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n",
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JFS_BARRIER;
158 spin_unlock(&journal->j_state_lock);
159
160 /* And try again, without the barrier */
161 set_buffer_uptodate(bh);
162 set_buffer_dirty(bh);
163 ret = sync_dirty_buffer(bh);
164 }
165 } else {
166 ret = sync_dirty_buffer(bh); 143 ret = sync_dirty_buffer(bh);
167 }
168 144
169 put_bh(bh); /* One for getblk() */ 145 put_bh(bh); /* One for getblk() */
170 journal_put_journal_head(descriptor); 146 journal_put_journal_head(descriptor);
@@ -318,7 +294,7 @@ void journal_commit_transaction(journal_t *journal)
318 int first_tag = 0; 294 int first_tag = 0;
319 int tag_flag; 295 int tag_flag;
320 int i; 296 int i;
321 int write_op = WRITE; 297 int write_op = WRITE_SYNC;
322 298
323 /* 299 /*
324 * First job: lock down the current transaction and wait for 300 * First job: lock down the current transaction and wait for
@@ -611,13 +587,13 @@ void journal_commit_transaction(journal_t *journal)
611 /* Bump b_count to prevent truncate from stumbling over 587 /* Bump b_count to prevent truncate from stumbling over
612 the shadowed buffer! @@@ This can go if we ever get 588 the shadowed buffer! @@@ This can go if we ever get
613 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 589 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
614 atomic_inc(&jh2bh(jh)->b_count); 590 get_bh(jh2bh(jh));
615 591
616 /* Make a temporary IO buffer with which to write it out 592 /* Make a temporary IO buffer with which to write it out
617 (this will requeue both the metadata buffer and the 593 (this will requeue both the metadata buffer and the
618 temporary IO buffer). new_bh goes on BJ_IO*/ 594 temporary IO buffer). new_bh goes on BJ_IO*/
619 595
620 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 596 set_buffer_jwrite(jh2bh(jh));
621 /* 597 /*
622 * akpm: journal_write_metadata_buffer() sets 598 * akpm: journal_write_metadata_buffer() sets
623 * new_bh->b_transaction to commit_transaction. 599 * new_bh->b_transaction to commit_transaction.
@@ -627,7 +603,7 @@ void journal_commit_transaction(journal_t *journal)
627 JBUFFER_TRACE(jh, "ph3: write metadata"); 603 JBUFFER_TRACE(jh, "ph3: write metadata");
628 flags = journal_write_metadata_buffer(commit_transaction, 604 flags = journal_write_metadata_buffer(commit_transaction,
629 jh, &new_jh, blocknr); 605 jh, &new_jh, blocknr);
630 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 606 set_buffer_jwrite(jh2bh(new_jh));
631 wbuf[bufs++] = jh2bh(new_jh); 607 wbuf[bufs++] = jh2bh(new_jh);
632 608
633 /* Record the new block's tag in the current descriptor 609 /* Record the new block's tag in the current descriptor
@@ -737,7 +713,7 @@ wait_for_iobuf:
737 shadowed buffer */ 713 shadowed buffer */
738 jh = commit_transaction->t_shadow_list->b_tprev; 714 jh = commit_transaction->t_shadow_list->b_tprev;
739 bh = jh2bh(jh); 715 bh = jh2bh(jh);
740 clear_bit(BH_JWrite, &bh->b_state); 716 clear_buffer_jwrite(bh);
741 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 717 J_ASSERT_BH(bh, buffer_jbddirty(bh));
742 718
743 /* The metadata is now released for reuse, but we need 719 /* The metadata is now released for reuse, but we need
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2c4b1f109da9..da1b5e4ffce1 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -36,6 +36,7 @@
36#include <linux/poison.h> 36#include <linux/poison.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/ratelimit.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/page.h> 42#include <asm/page.h>
@@ -84,6 +85,7 @@ EXPORT_SYMBOL(journal_force_commit);
84 85
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 86static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno); 87static void __journal_abort_soft (journal_t *journal, int errno);
88static const char *journal_dev_name(journal_t *journal, char *buffer);
87 89
88/* 90/*
89 * Helper function used to manage commit timeouts 91 * Helper function used to manage commit timeouts
@@ -439,7 +441,7 @@ int __log_start_commit(journal_t *journal, tid_t target)
439 */ 441 */
440 if (!tid_geq(journal->j_commit_request, target)) { 442 if (!tid_geq(journal->j_commit_request, target)) {
441 /* 443 /*
442 * We want a new commit: OK, mark the request and wakup the 444 * We want a new commit: OK, mark the request and wakeup the
443 * commit thread. We do _not_ do the commit ourselves. 445 * commit thread. We do _not_ do the commit ourselves.
444 */ 446 */
445 447
@@ -950,6 +952,8 @@ int journal_create(journal_t *journal)
950 if (err) 952 if (err)
951 return err; 953 return err;
952 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 954 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
955 if (unlikely(!bh))
956 return -ENOMEM;
953 lock_buffer(bh); 957 lock_buffer(bh);
954 memset (bh->b_data, 0, journal->j_blocksize); 958 memset (bh->b_data, 0, journal->j_blocksize);
955 BUFFER_TRACE(bh, "marking dirty"); 959 BUFFER_TRACE(bh, "marking dirty");
@@ -1010,6 +1014,23 @@ void journal_update_superblock(journal_t *journal, int wait)
1010 goto out; 1014 goto out;
1011 } 1015 }
1012 1016
1017 if (buffer_write_io_error(bh)) {
1018 char b[BDEVNAME_SIZE];
1019 /*
1020 * Oh, dear. A previous attempt to write the journal
1021 * superblock failed. This could happen because the
1022 * USB device was yanked out. Or it could happen to
1023 * be a transient write error and maybe the block will
1024 * be remapped. Nothing we can do but to retry the
1025 * write and hope for the best.
1026 */
1027 printk(KERN_ERR "JBD: previous I/O error detected "
1028 "for journal superblock update for %s.\n",
1029 journal_dev_name(journal, b));
1030 clear_buffer_write_io_error(bh);
1031 set_buffer_uptodate(bh);
1032 }
1033
1013 spin_lock(&journal->j_state_lock); 1034 spin_lock(&journal->j_state_lock);
1014 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", 1035 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
1015 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1036 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1021,9 +1042,17 @@ void journal_update_superblock(journal_t *journal, int wait)
1021 1042
1022 BUFFER_TRACE(bh, "marking dirty"); 1043 BUFFER_TRACE(bh, "marking dirty");
1023 mark_buffer_dirty(bh); 1044 mark_buffer_dirty(bh);
1024 if (wait) 1045 if (wait) {
1025 sync_dirty_buffer(bh); 1046 sync_dirty_buffer(bh);
1026 else 1047 if (buffer_write_io_error(bh)) {
1048 char b[BDEVNAME_SIZE];
1049 printk(KERN_ERR "JBD: I/O error detected "
1050 "when updating journal superblock for %s.\n",
1051 journal_dev_name(journal, b));
1052 clear_buffer_write_io_error(bh);
1053 set_buffer_uptodate(bh);
1054 }
1055 } else
1027 write_dirty_buffer(bh, WRITE); 1056 write_dirty_buffer(bh, WRITE);
1028 1057
1029out: 1058out:
@@ -1719,7 +1748,6 @@ static void journal_destroy_journal_head_cache(void)
1719static struct journal_head *journal_alloc_journal_head(void) 1748static struct journal_head *journal_alloc_journal_head(void)
1720{ 1749{
1721 struct journal_head *ret; 1750 struct journal_head *ret;
1722 static unsigned long last_warning;
1723 1751
1724#ifdef CONFIG_JBD_DEBUG 1752#ifdef CONFIG_JBD_DEBUG
1725 atomic_inc(&nr_journal_heads); 1753 atomic_inc(&nr_journal_heads);
@@ -1727,11 +1755,9 @@ static struct journal_head *journal_alloc_journal_head(void)
1727 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1755 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1728 if (ret == NULL) { 1756 if (ret == NULL) {
1729 jbd_debug(1, "out of memory for journal_head\n"); 1757 jbd_debug(1, "out of memory for journal_head\n");
1730 if (time_after(jiffies, last_warning + 5*HZ)) { 1758 printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1731 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", 1759 __func__);
1732 __func__); 1760
1733 last_warning = jiffies;
1734 }
1735 while (ret == NULL) { 1761 while (ret == NULL) {
1736 yield(); 1762 yield();
1737 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1763 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 81051dafebf5..5b43e96788e6 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -296,10 +296,10 @@ int journal_skip_recovery(journal_t *journal)
296#ifdef CONFIG_JBD_DEBUG 296#ifdef CONFIG_JBD_DEBUG
297 int dropped = info.end_transaction - 297 int dropped = info.end_transaction -
298 be32_to_cpu(journal->j_superblock->s_sequence); 298 be32_to_cpu(journal->j_superblock->s_sequence);
299#endif
300 jbd_debug(1, 299 jbd_debug(1,
301 "JBD: ignoring %d transaction%s from the journal.\n", 300 "JBD: ignoring %d transaction%s from the journal.\n",
302 dropped, (dropped == 1) ? "" : "s"); 301 dropped, (dropped == 1) ? "" : "s");
302#endif
303 journal->j_transaction_sequence = ++info.end_transaction; 303 journal->j_transaction_sequence = ++info.end_transaction;
304 } 304 }
305 305
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5ae71e75a491..846a3f314111 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -293,9 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
293 jbd_free_handle(handle); 293 jbd_free_handle(handle);
294 current->journal_info = NULL; 294 current->journal_info = NULL;
295 handle = ERR_PTR(err); 295 handle = ERR_PTR(err);
296 goto out;
297 } 296 }
298out:
299 return handle; 297 return handle;
300} 298}
301 299
@@ -528,7 +526,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
528 transaction = handle->h_transaction; 526 transaction = handle->h_transaction;
529 journal = transaction->t_journal; 527 journal = transaction->t_journal;
530 528
531 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 529 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
532 530
533 JBUFFER_TRACE(jh, "entry"); 531 JBUFFER_TRACE(jh, "entry");
534repeat: 532repeat:
@@ -713,7 +711,7 @@ done:
713 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 711 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
714 "Possible IO failure.\n"); 712 "Possible IO failure.\n");
715 page = jh2bh(jh)->b_page; 713 page = jh2bh(jh)->b_page;
716 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 714 offset = offset_in_page(jh2bh(jh)->b_data);
717 source = kmap_atomic(page, KM_USER0); 715 source = kmap_atomic(page, KM_USER0);
718 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 716 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
719 kunmap_atomic(source, KM_USER0); 717 kunmap_atomic(source, KM_USER0);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb4..6a79fd0a1a32 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -299,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
299 transaction->t_chp_stats.cs_forced_to_close++; 299 transaction->t_chp_stats.cs_forced_to_close++;
300 spin_unlock(&journal->j_list_lock); 300 spin_unlock(&journal->j_list_lock);
301 jbd_unlock_bh_state(bh); 301 jbd_unlock_bh_state(bh);
302 if (unlikely(journal->j_flags & JBD2_UNMOUNT))
303 /*
304 * The journal thread is dead; so starting and
305 * waiting for a commit to finish will cause
306 * us to wait for a _very_ long time.
307 */
308 printk(KERN_ERR "JBD2: %s: "
309 "Waiting for Godot: block %llu\n",
310 journal->j_devname,
311 (unsigned long long) bh->b_blocknr);
302 jbd2_log_start_commit(journal, tid); 312 jbd2_log_start_commit(journal, tid);
303 jbd2_log_wait_commit(journal, tid); 313 jbd2_log_wait_commit(journal, tid);
304 ret = 1; 314 ret = 1;
@@ -532,8 +542,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
532 */ 542 */
533 if ((journal->j_fs_dev != journal->j_dev) && 543 if ((journal->j_fs_dev != journal->j_dev) &&
534 (journal->j_flags & JBD2_BARRIER)) 544 (journal->j_flags & JBD2_BARRIER))
535 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 545 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
536 BLKDEV_IFL_WAIT);
537 if (!(journal->j_flags & JBD2_ABORT)) 546 if (!(journal->j_flags & JBD2_ABORT))
538 jbd2_journal_update_superblock(journal, 1); 547 jbd2_journal_update_superblock(journal, 1);
539 return 0; 548 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..f3ad1598b201 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -26,7 +26,9 @@
26#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
27#include <linux/bio.h> 27#include <linux/bio.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/bitops.h>
29#include <trace/events/jbd2.h> 30#include <trace/events/jbd2.h>
31#include <asm/system.h>
30 32
31/* 33/*
32 * Default IO end handler for temporary BJ_IO buffer_heads. 34 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -134,25 +136,11 @@ static int journal_submit_commit_record(journal_t *journal,
134 136
135 if (journal->j_flags & JBD2_BARRIER && 137 if (journal->j_flags & JBD2_BARRIER &&
136 !JBD2_HAS_INCOMPAT_FEATURE(journal, 138 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh); 140 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
139 if (ret == -EOPNOTSUPP) { 141 else
140 printk(KERN_WARNING
141 "JBD2: Disabling barriers on %s, "
142 "not supported by device\n", journal->j_devname);
143 write_lock(&journal->j_state_lock);
144 journal->j_flags &= ~JBD2_BARRIER;
145 write_unlock(&journal->j_state_lock);
146
147 /* And try again, without the barrier */
148 lock_buffer(bh);
149 set_buffer_uptodate(bh);
150 clear_buffer_dirty(bh);
151 ret = submit_bh(WRITE_SYNC_PLUG, bh);
152 }
153 } else {
154 ret = submit_bh(WRITE_SYNC_PLUG, bh); 142 ret = submit_bh(WRITE_SYNC_PLUG, bh);
155 } 143
156 *cbh = bh; 144 *cbh = bh;
157 return ret; 145 return ret;
158} 146}
@@ -166,29 +154,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
166{ 154{
167 int ret = 0; 155 int ret = 0;
168 156
169retry:
170 clear_buffer_dirty(bh); 157 clear_buffer_dirty(bh);
171 wait_on_buffer(bh); 158 wait_on_buffer(bh);
172 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
173 printk(KERN_WARNING
174 "JBD2: %s: disabling barries on %s - not supported "
175 "by device\n", __func__, journal->j_devname);
176 write_lock(&journal->j_state_lock);
177 journal->j_flags &= ~JBD2_BARRIER;
178 write_unlock(&journal->j_state_lock);
179
180 lock_buffer(bh);
181 clear_buffer_dirty(bh);
182 set_buffer_uptodate(bh);
183 bh->b_end_io = journal_end_buffer_io_sync;
184
185 ret = submit_bh(WRITE_SYNC_PLUG, bh);
186 if (ret) {
187 unlock_buffer(bh);
188 return ret;
189 }
190 goto retry;
191 }
192 159
193 if (unlikely(!buffer_uptodate(bh))) 160 if (unlikely(!buffer_uptodate(bh)))
194 ret = -EIO; 161 ret = -EIO;
@@ -236,7 +203,7 @@ static int journal_submit_data_buffers(journal_t *journal,
236 spin_lock(&journal->j_list_lock); 203 spin_lock(&journal->j_list_lock);
237 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 204 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
238 mapping = jinode->i_vfs_inode->i_mapping; 205 mapping = jinode->i_vfs_inode->i_mapping;
239 jinode->i_flags |= JI_COMMIT_RUNNING; 206 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
240 spin_unlock(&journal->j_list_lock); 207 spin_unlock(&journal->j_list_lock);
241 /* 208 /*
242 * submit the inode data buffers. We use writepage 209 * submit the inode data buffers. We use writepage
@@ -251,7 +218,8 @@ static int journal_submit_data_buffers(journal_t *journal,
251 spin_lock(&journal->j_list_lock); 218 spin_lock(&journal->j_list_lock);
252 J_ASSERT(jinode->i_transaction == commit_transaction); 219 J_ASSERT(jinode->i_transaction == commit_transaction);
253 commit_transaction->t_flushed_data_blocks = 1; 220 commit_transaction->t_flushed_data_blocks = 1;
254 jinode->i_flags &= ~JI_COMMIT_RUNNING; 221 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
222 smp_mb__after_clear_bit();
255 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 223 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
256 } 224 }
257 spin_unlock(&journal->j_list_lock); 225 spin_unlock(&journal->j_list_lock);
@@ -272,7 +240,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
272 /* For locking, see the comment in journal_submit_data_buffers() */ 240 /* For locking, see the comment in journal_submit_data_buffers() */
273 spin_lock(&journal->j_list_lock); 241 spin_lock(&journal->j_list_lock);
274 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 242 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
275 jinode->i_flags |= JI_COMMIT_RUNNING; 243 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
276 spin_unlock(&journal->j_list_lock); 244 spin_unlock(&journal->j_list_lock);
277 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); 245 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
278 if (err) { 246 if (err) {
@@ -288,7 +256,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
288 ret = err; 256 ret = err;
289 } 257 }
290 spin_lock(&journal->j_list_lock); 258 spin_lock(&journal->j_list_lock);
291 jinode->i_flags &= ~JI_COMMIT_RUNNING; 259 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
260 smp_mb__after_clear_bit();
292 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 261 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
293 } 262 }
294 263
@@ -360,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
360 int tag_bytes = journal_tag_bytes(journal); 329 int tag_bytes = journal_tag_bytes(journal);
361 struct buffer_head *cbh = NULL; /* For transactional checksums */ 330 struct buffer_head *cbh = NULL; /* For transactional checksums */
362 __u32 crc32_sum = ~0; 331 __u32 crc32_sum = ~0;
363 int write_op = WRITE; 332 int write_op = WRITE_SYNC;
364 333
365 /* 334 /*
366 * First job: lock down the current transaction and wait for 335 * First job: lock down the current transaction and wait for
@@ -701,6 +670,16 @@ start_journal_io:
701 } 670 }
702 } 671 }
703 672
673 err = journal_finish_inode_data_buffers(journal, commit_transaction);
674 if (err) {
675 printk(KERN_WARNING
676 "JBD2: Detected IO errors while flushing file data "
677 "on %s\n", journal->j_devname);
678 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
679 jbd2_journal_abort(journal, err);
680 err = 0;
681 }
682
704 /* 683 /*
705 * If the journal is not located on the file system device, 684 * If the journal is not located on the file system device,
706 * then we must flush the file system device before we issue 685 * then we must flush the file system device before we issue
@@ -709,8 +688,7 @@ start_journal_io:
709 if (commit_transaction->t_flushed_data_blocks && 688 if (commit_transaction->t_flushed_data_blocks &&
710 (journal->j_fs_dev != journal->j_dev) && 689 (journal->j_fs_dev != journal->j_dev) &&
711 (journal->j_flags & JBD2_BARRIER)) 690 (journal->j_flags & JBD2_BARRIER))
712 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 691 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
713 BLKDEV_IFL_WAIT);
714 692
715 /* Done it all: now write the commit record asynchronously. */ 693 /* Done it all: now write the commit record asynchronously. */
716 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 694 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -719,19 +697,6 @@ start_journal_io:
719 &cbh, crc32_sum); 697 &cbh, crc32_sum);
720 if (err) 698 if (err)
721 __jbd2_journal_abort_hard(journal); 699 __jbd2_journal_abort_hard(journal);
722 if (journal->j_flags & JBD2_BARRIER)
723 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
724 BLKDEV_IFL_WAIT);
725 }
726
727 err = journal_finish_inode_data_buffers(journal, commit_transaction);
728 if (err) {
729 printk(KERN_WARNING
730 "JBD2: Detected IO errors while flushing file data "
731 "on %s\n", journal->j_devname);
732 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
733 jbd2_journal_abort(journal, err);
734 err = 0;
735 } 700 }
736 701
737 /* Lo and behold: we have just managed to send a transaction to 702 /* Lo and behold: we have just managed to send a transaction to
@@ -845,6 +810,11 @@ wait_for_iobuf:
845 } 810 }
846 if (!err && !is_journal_aborted(journal)) 811 if (!err && !is_journal_aborted(journal))
847 err = journal_wait_on_commit_record(journal, cbh); 812 err = journal_wait_on_commit_record(journal, cbh);
813 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
815 journal->j_flags & JBD2_BARRIER) {
816 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
817 }
848 818
849 if (err) 819 if (err)
850 jbd2_journal_abort(journal, err); 820 jbd2_journal_abort(journal, err);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 262419f83d80..c590d155c095 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -42,12 +42,14 @@
42#include <linux/log2.h> 42#include <linux/log2.h>
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/bitops.h>
45 46
46#define CREATE_TRACE_POINTS 47#define CREATE_TRACE_POINTS
47#include <trace/events/jbd2.h> 48#include <trace/events/jbd2.h>
48 49
49#include <asm/uaccess.h> 50#include <asm/uaccess.h>
50#include <asm/page.h> 51#include <asm/page.h>
52#include <asm/system.h>
51 53
52EXPORT_SYMBOL(jbd2_journal_extend); 54EXPORT_SYMBOL(jbd2_journal_extend);
53EXPORT_SYMBOL(jbd2_journal_stop); 55EXPORT_SYMBOL(jbd2_journal_stop);
@@ -478,7 +480,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
478 */ 480 */
479 if (!tid_geq(journal->j_commit_request, target)) { 481 if (!tid_geq(journal->j_commit_request, target)) {
480 /* 482 /*
481 * We want a new commit: OK, mark the request and wakup the 483 * We want a new commit: OK, mark the request and wakeup the
482 * commit thread. We do _not_ do the commit ourselves. 484 * commit thread. We do _not_ do the commit ourselves.
483 */ 485 */
484 486
@@ -1836,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal)
1836 */ 1838 */
1837#define JBD2_MAX_SLABS 8 1839#define JBD2_MAX_SLABS 8
1838static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; 1840static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1839static DECLARE_MUTEX(jbd2_slab_create_sem);
1840 1841
1841static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { 1842static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1842 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", 1843 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
@@ -1857,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void)
1857 1858
1858static int jbd2_journal_create_slab(size_t size) 1859static int jbd2_journal_create_slab(size_t size)
1859{ 1860{
1861 static DEFINE_MUTEX(jbd2_slab_create_mutex);
1860 int i = order_base_2(size) - 10; 1862 int i = order_base_2(size) - 10;
1861 size_t slab_size; 1863 size_t slab_size;
1862 1864
@@ -1868,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size)
1868 1870
1869 if (unlikely(i < 0)) 1871 if (unlikely(i < 0))
1870 i = 0; 1872 i = 0;
1871 down(&jbd2_slab_create_sem); 1873 mutex_lock(&jbd2_slab_create_mutex);
1872 if (jbd2_slab[i]) { 1874 if (jbd2_slab[i]) {
1873 up(&jbd2_slab_create_sem); 1875 mutex_unlock(&jbd2_slab_create_mutex);
1874 return 0; /* Already created */ 1876 return 0; /* Already created */
1875 } 1877 }
1876 1878
1877 slab_size = 1 << (i+10); 1879 slab_size = 1 << (i+10);
1878 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, 1880 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1879 slab_size, 0, NULL); 1881 slab_size, 0, NULL);
1880 up(&jbd2_slab_create_sem); 1882 mutex_unlock(&jbd2_slab_create_mutex);
1881 if (!jbd2_slab[i]) { 1883 if (!jbd2_slab[i]) {
1882 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); 1884 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1883 return -ENOMEM; 1885 return -ENOMEM;
@@ -2210,7 +2212,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
2210restart: 2212restart:
2211 spin_lock(&journal->j_list_lock); 2213 spin_lock(&journal->j_list_lock);
2212 /* Is commit writing out inode - we have to wait */ 2214 /* Is commit writing out inode - we have to wait */
2213 if (jinode->i_flags & JI_COMMIT_RUNNING) { 2215 if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
2214 wait_queue_head_t *wq; 2216 wait_queue_head_t *wq;
2215 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); 2217 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2216 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); 2218 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index f3479d6e0a83..6bf0a242613e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -156,6 +156,7 @@ alloc_transaction:
156 */ 156 */
157repeat: 157repeat:
158 read_lock(&journal->j_state_lock); 158 read_lock(&journal->j_state_lock);
159 BUG_ON(journal->j_flags & JBD2_UNMOUNT);
159 if (is_journal_aborted(journal) || 160 if (is_journal_aborted(journal) ||
160 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 161 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
161 read_unlock(&journal->j_state_lock); 162 read_unlock(&journal->j_state_lock);
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a906f538d11c..85c6be2db02f 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -23,7 +23,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
23static inline struct jffs2_inode_cache * 23static inline struct jffs2_inode_cache *
24first_inode_chain(int *i, struct jffs2_sb_info *c) 24first_inode_chain(int *i, struct jffs2_sb_info *c)
25{ 25{
26 for (; *i < INOCACHE_HASHSIZE; (*i)++) { 26 for (; *i < c->inocache_hashsize; (*i)++) {
27 if (c->inocache_list[*i]) 27 if (c->inocache_list[*i])
28 return c->inocache_list[*i]; 28 return c->inocache_list[*i];
29 } 29 }
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 617a1e5694c1..de4247021d25 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -103,7 +103,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
103 spin_unlock(&jffs2_compressor_list_lock); 103 spin_unlock(&jffs2_compressor_list_lock);
104 *datalen = orig_slen; 104 *datalen = orig_slen;
105 *cdatalen = orig_dlen; 105 *cdatalen = orig_dlen;
106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen, NULL); 106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
107 spin_lock(&jffs2_compressor_list_lock); 107 spin_lock(&jffs2_compressor_list_lock);
108 this->usecount--; 108 this->usecount--;
109 if (!compr_ret) { 109 if (!compr_ret) {
@@ -152,7 +152,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
152 spin_unlock(&jffs2_compressor_list_lock); 152 spin_unlock(&jffs2_compressor_list_lock);
153 *datalen = orig_slen; 153 *datalen = orig_slen;
154 *cdatalen = orig_dlen; 154 *cdatalen = orig_dlen;
155 compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen, NULL); 155 compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen);
156 spin_lock(&jffs2_compressor_list_lock); 156 spin_lock(&jffs2_compressor_list_lock);
157 this->usecount--; 157 this->usecount--;
158 if (!compr_ret) { 158 if (!compr_ret) {
@@ -220,7 +220,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
220 if (comprtype == this->compr) { 220 if (comprtype == this->compr) {
221 this->usecount++; 221 this->usecount++;
222 spin_unlock(&jffs2_compressor_list_lock); 222 spin_unlock(&jffs2_compressor_list_lock);
223 ret = this->decompress(cdata_in, data_out, cdatalen, datalen, NULL); 223 ret = this->decompress(cdata_in, data_out, cdatalen, datalen);
224 spin_lock(&jffs2_compressor_list_lock); 224 spin_lock(&jffs2_compressor_list_lock);
225 if (ret) { 225 if (ret) {
226 printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret); 226 printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret);
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index e471a9106fd9..13bb7597ab39 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -49,9 +49,9 @@ struct jffs2_compressor {
49 char *name; 49 char *name;
50 char compr; /* JFFS2_COMPR_XXX */ 50 char compr; /* JFFS2_COMPR_XXX */
51 int (*compress)(unsigned char *data_in, unsigned char *cpage_out, 51 int (*compress)(unsigned char *data_in, unsigned char *cpage_out,
52 uint32_t *srclen, uint32_t *destlen, void *model); 52 uint32_t *srclen, uint32_t *destlen);
53 int (*decompress)(unsigned char *cdata_in, unsigned char *data_out, 53 int (*decompress)(unsigned char *cdata_in, unsigned char *data_out,
54 uint32_t cdatalen, uint32_t datalen, void *model); 54 uint32_t cdatalen, uint32_t datalen);
55 int usecount; 55 int usecount;
56 int disabled; /* if set the compressor won't compress */ 56 int disabled; /* if set the compressor won't compress */
57 unsigned char *compr_buf; /* used by size compr. mode */ 57 unsigned char *compr_buf; /* used by size compr. mode */
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index ed25ae7c98eb..af186ee674d8 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -42,7 +42,7 @@ static int __init alloc_workspace(void)
42} 42}
43 43
44static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, 44static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
45 uint32_t *sourcelen, uint32_t *dstlen, void *model) 45 uint32_t *sourcelen, uint32_t *dstlen)
46{ 46{
47 size_t compress_size; 47 size_t compress_size;
48 int ret; 48 int ret;
@@ -67,7 +67,7 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
67} 67}
68 68
69static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out, 69static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
70 uint32_t srclen, uint32_t destlen, void *model) 70 uint32_t srclen, uint32_t destlen)
71{ 71{
72 size_t dl = destlen; 72 size_t dl = destlen;
73 int ret; 73 int ret;
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 9696ad9ef5f7..16a5047903a6 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -31,8 +31,7 @@
31/* _compress returns the compressed size, -1 if bigger */ 31/* _compress returns the compressed size, -1 if bigger */
32static int jffs2_rtime_compress(unsigned char *data_in, 32static int jffs2_rtime_compress(unsigned char *data_in,
33 unsigned char *cpage_out, 33 unsigned char *cpage_out,
34 uint32_t *sourcelen, uint32_t *dstlen, 34 uint32_t *sourcelen, uint32_t *dstlen)
35 void *model)
36{ 35{
37 short positions[256]; 36 short positions[256];
38 int outpos = 0; 37 int outpos = 0;
@@ -73,8 +72,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
73 72
74static int jffs2_rtime_decompress(unsigned char *data_in, 73static int jffs2_rtime_decompress(unsigned char *data_in,
75 unsigned char *cpage_out, 74 unsigned char *cpage_out,
76 uint32_t srclen, uint32_t destlen, 75 uint32_t srclen, uint32_t destlen)
77 void *model)
78{ 76{
79 short positions[256]; 77 short positions[256];
80 int outpos = 0; 78 int outpos = 0;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index a12b4f763373..9e7cec808c4c 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -298,7 +298,7 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
298#if 0 298#if 0
299/* _compress returns the compressed size, -1 if bigger */ 299/* _compress returns the compressed size, -1 if bigger */
300int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, 300int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
301 uint32_t *sourcelen, uint32_t *dstlen, void *model) 301 uint32_t *sourcelen, uint32_t *dstlen)
302{ 302{
303 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, 303 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
304 cpage_out, sourcelen, dstlen); 304 cpage_out, sourcelen, dstlen);
@@ -306,8 +306,7 @@ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
306#endif 306#endif
307static int jffs2_dynrubin_compress(unsigned char *data_in, 307static int jffs2_dynrubin_compress(unsigned char *data_in,
308 unsigned char *cpage_out, 308 unsigned char *cpage_out,
309 uint32_t *sourcelen, uint32_t *dstlen, 309 uint32_t *sourcelen, uint32_t *dstlen)
310 void *model)
311{ 310{
312 int bits[8]; 311 int bits[8];
313 unsigned char histo[256]; 312 unsigned char histo[256];
@@ -387,8 +386,7 @@ static void rubin_do_decompress(int bit_divider, int *bits,
387 386
388static int jffs2_rubinmips_decompress(unsigned char *data_in, 387static int jffs2_rubinmips_decompress(unsigned char *data_in,
389 unsigned char *cpage_out, 388 unsigned char *cpage_out,
390 uint32_t sourcelen, uint32_t dstlen, 389 uint32_t sourcelen, uint32_t dstlen)
391 void *model)
392{ 390{
393 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, 391 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
394 cpage_out, sourcelen, dstlen); 392 cpage_out, sourcelen, dstlen);
@@ -397,8 +395,7 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
397 395
398static int jffs2_dynrubin_decompress(unsigned char *data_in, 396static int jffs2_dynrubin_decompress(unsigned char *data_in,
399 unsigned char *cpage_out, 397 unsigned char *cpage_out,
400 uint32_t sourcelen, uint32_t dstlen, 398 uint32_t sourcelen, uint32_t dstlen)
401 void *model)
402{ 399{
403 int bits[8]; 400 int bits[8];
404 int c; 401 int c;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 97fc45de6f81..fd05a0b9431d 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -68,8 +68,7 @@ static void free_workspaces(void)
68 68
69static int jffs2_zlib_compress(unsigned char *data_in, 69static int jffs2_zlib_compress(unsigned char *data_in,
70 unsigned char *cpage_out, 70 unsigned char *cpage_out,
71 uint32_t *sourcelen, uint32_t *dstlen, 71 uint32_t *sourcelen, uint32_t *dstlen)
72 void *model)
73{ 72{
74 int ret; 73 int ret;
75 74
@@ -136,8 +135,7 @@ static int jffs2_zlib_compress(unsigned char *data_in,
136 135
137static int jffs2_zlib_decompress(unsigned char *data_in, 136static int jffs2_zlib_decompress(unsigned char *data_in,
138 unsigned char *cpage_out, 137 unsigned char *cpage_out,
139 uint32_t srclen, uint32_t destlen, 138 uint32_t srclen, uint32_t destlen)
140 void *model)
141{ 139{
142 int ret; 140 int ret;
143 int wbits = MAX_WBITS; 141 int wbits = MAX_WBITS;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3cb0..92978658ed18 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
289 mutex_unlock(&f->sem); 289 mutex_unlock(&f->sem);
290 d_instantiate(dentry, old_dentry->d_inode); 290 d_instantiate(dentry, old_dentry->d_inode);
291 dir_i->i_mtime = dir_i->i_ctime = ITIME(now); 291 dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
292 atomic_inc(&old_dentry->d_inode->i_count); 292 ihold(old_dentry->d_inode);
293 } 293 }
294 return ret; 294 return ret;
295} 295}
@@ -367,7 +367,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
367 } 367 }
368 368
369 /* We use f->target field to store the target path. */ 369 /* We use f->target field to store the target path. */
370 f->target = kmalloc(targetlen + 1, GFP_KERNEL); 370 f->target = kmemdup(target, targetlen + 1, GFP_KERNEL);
371 if (!f->target) { 371 if (!f->target) {
372 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 372 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
373 mutex_unlock(&f->sem); 373 mutex_unlock(&f->sem);
@@ -376,7 +376,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
376 goto fail; 376 goto fail;
377 } 377 }
378 378
379 memcpy(f->target, target, targetlen + 1);
380 D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target)); 379 D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target));
381 380
382 /* No data here. Only a metadata node, which will be 381 /* No data here. Only a metadata node, which will be
@@ -864,7 +863,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
864 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); 863 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
865 /* Might as well let the VFS know */ 864 /* Might as well let the VFS know */
866 d_instantiate(new_dentry, old_dentry->d_inode); 865 d_instantiate(new_dentry, old_dentry->d_inode);
867 atomic_inc(&old_dentry->d_inode->i_count); 866 ihold(old_dentry->d_inode);
868 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); 867 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
869 return ret; 868 return ret;
870 } 869 }
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index abac961f617b..e513f1913c15 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -151,7 +151,7 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
151 } 151 }
152 152
153 /* Be nice */ 153 /* Be nice */
154 yield(); 154 cond_resched();
155 mutex_lock(&c->erase_free_sem); 155 mutex_lock(&c->erase_free_sem);
156 spin_lock(&c->erase_completion_lock); 156 spin_lock(&c->erase_completion_lock);
157 } 157 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 6b2964a19850..e896e67767eb 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -21,7 +21,6 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/vfs.h> 22#include <linux/vfs.h>
23#include <linux/crc32.h> 23#include <linux/crc32.h>
24#include <linux/smp_lock.h>
25#include "nodelist.h" 24#include "nodelist.h"
26 25
27static int jffs2_flash_setup(struct jffs2_sb_info *c); 26static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -391,7 +390,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
391 This also catches the case where it was stopped and this 390 This also catches the case where it was stopped and this
392 is just a remount to restart it. 391 is just a remount to restart it.
393 Flush the writebuffer, if neccecary, else we loose it */ 392 Flush the writebuffer, if neccecary, else we loose it */
394 lock_kernel();
395 if (!(sb->s_flags & MS_RDONLY)) { 393 if (!(sb->s_flags & MS_RDONLY)) {
396 jffs2_stop_garbage_collect_thread(c); 394 jffs2_stop_garbage_collect_thread(c);
397 mutex_lock(&c->alloc_sem); 395 mutex_lock(&c->alloc_sem);
@@ -403,8 +401,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
403 jffs2_start_garbage_collect_thread(c); 401 jffs2_start_garbage_collect_thread(c);
404 402
405 *flags |= MS_NOATIME; 403 *flags |= MS_NOATIME;
406
407 unlock_kernel();
408 return 0; 404 return 0;
409} 405}
410 406
@@ -478,6 +474,25 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
478 return inode; 474 return inode;
479} 475}
480 476
477static int calculate_inocache_hashsize(uint32_t flash_size)
478{
479 /*
480 * Pick a inocache hash size based on the size of the medium.
481 * Count how many megabytes we're dealing with, apply a hashsize twice
482 * that size, but rounding down to the usual big powers of 2. And keep
483 * to sensible bounds.
484 */
485
486 int size_mb = flash_size / 1024 / 1024;
487 int hashsize = (size_mb * 2) & ~0x3f;
488
489 if (hashsize < INOCACHE_HASHSIZE_MIN)
490 return INOCACHE_HASHSIZE_MIN;
491 if (hashsize > INOCACHE_HASHSIZE_MAX)
492 return INOCACHE_HASHSIZE_MAX;
493
494 return hashsize;
495}
481 496
482int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) 497int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
483{ 498{
@@ -524,7 +539,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
524 if (ret) 539 if (ret)
525 return ret; 540 return ret;
526 541
527 c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); 542 c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size);
543 c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
528 if (!c->inocache_list) { 544 if (!c->inocache_list) {
529 ret = -ENOMEM; 545 ret = -ENOMEM;
530 goto out_wbuf; 546 goto out_wbuf;
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 846a79452497..31dce611337c 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -219,13 +219,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
219 if (!list_empty(&c->erase_complete_list) || 219 if (!list_empty(&c->erase_complete_list) ||
220 !list_empty(&c->erase_pending_list)) { 220 !list_empty(&c->erase_pending_list)) {
221 spin_unlock(&c->erase_completion_lock); 221 spin_unlock(&c->erase_completion_lock);
222 mutex_unlock(&c->alloc_sem);
222 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); 223 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
223 if (jffs2_erase_pending_blocks(c, 1)) { 224 if (jffs2_erase_pending_blocks(c, 1))
224 mutex_unlock(&c->alloc_sem);
225 return 0; 225 return 0;
226 } 226
227 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); 227 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
228 spin_lock(&c->erase_completion_lock); 228 spin_lock(&c->erase_completion_lock);
229 mutex_lock(&c->alloc_sem);
229 } 230 }
230 231
231 /* First, work out which block we're garbage-collecting */ 232 /* First, work out which block we're garbage-collecting */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 6784bc89add1..f864005de64c 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -100,6 +100,7 @@ struct jffs2_sb_info {
100 wait_queue_head_t erase_wait; /* For waiting for erases to complete */ 100 wait_queue_head_t erase_wait; /* For waiting for erases to complete */
101 101
102 wait_queue_head_t inocache_wq; 102 wait_queue_head_t inocache_wq;
103 int inocache_hashsize;
103 struct jffs2_inode_cache **inocache_list; 104 struct jffs2_inode_cache **inocache_list;
104 spinlock_t inocache_lock; 105 spinlock_t inocache_lock;
105 106
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index af02bd138469..5e03233c2363 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -420,7 +420,7 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t
420{ 420{
421 struct jffs2_inode_cache *ret; 421 struct jffs2_inode_cache *ret;
422 422
423 ret = c->inocache_list[ino % INOCACHE_HASHSIZE]; 423 ret = c->inocache_list[ino % c->inocache_hashsize];
424 while (ret && ret->ino < ino) { 424 while (ret && ret->ino < ino) {
425 ret = ret->next; 425 ret = ret->next;
426 } 426 }
@@ -441,7 +441,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new
441 441
442 dbg_inocache("add %p (ino #%u)\n", new, new->ino); 442 dbg_inocache("add %p (ino #%u)\n", new, new->ino);
443 443
444 prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE]; 444 prev = &c->inocache_list[new->ino % c->inocache_hashsize];
445 445
446 while ((*prev) && (*prev)->ino < new->ino) { 446 while ((*prev) && (*prev)->ino < new->ino) {
447 prev = &(*prev)->next; 447 prev = &(*prev)->next;
@@ -462,7 +462,7 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
462 dbg_inocache("del %p (ino #%u)\n", old, old->ino); 462 dbg_inocache("del %p (ino #%u)\n", old, old->ino);
463 spin_lock(&c->inocache_lock); 463 spin_lock(&c->inocache_lock);
464 464
465 prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE]; 465 prev = &c->inocache_list[old->ino % c->inocache_hashsize];
466 466
467 while ((*prev) && (*prev)->ino < old->ino) { 467 while ((*prev) && (*prev)->ino < old->ino) {
468 prev = &(*prev)->next; 468 prev = &(*prev)->next;
@@ -487,7 +487,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
487 int i; 487 int i;
488 struct jffs2_inode_cache *this, *next; 488 struct jffs2_inode_cache *this, *next;
489 489
490 for (i=0; i<INOCACHE_HASHSIZE; i++) { 490 for (i=0; i < c->inocache_hashsize; i++) {
491 this = c->inocache_list[i]; 491 this = c->inocache_list[i];
492 while (this) { 492 while (this) {
493 next = this->next; 493 next = this->next;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 523a91691052..5a53d9bdb2b5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -199,7 +199,8 @@ struct jffs2_inode_cache {
199#define RAWNODE_CLASS_XATTR_DATUM 1 199#define RAWNODE_CLASS_XATTR_DATUM 1
200#define RAWNODE_CLASS_XATTR_REF 2 200#define RAWNODE_CLASS_XATTR_REF 2
201 201
202#define INOCACHE_HASHSIZE 128 202#define INOCACHE_HASHSIZE_MIN 128
203#define INOCACHE_HASHSIZE_MAX 1024
203 204
204#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size) 205#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
205 206
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 46f870d1cc36..b632dddcb482 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -20,7 +20,7 @@
20#include "summary.h" 20#include "summary.h"
21#include "debug.h" 21#include "debug.h"
22 22
23#define DEFAULT_EMPTY_SCAN_SIZE 1024 23#define DEFAULT_EMPTY_SCAN_SIZE 256
24 24
25#define noisy_printk(noise, args...) do { \ 25#define noisy_printk(noise, args...) do { \
26 if (*(noise)) { \ 26 if (*(noise)) { \
@@ -435,7 +435,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
435 unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) { 435 unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
436 struct jffs2_unknown_node *node; 436 struct jffs2_unknown_node *node;
437 struct jffs2_unknown_node crcnode; 437 struct jffs2_unknown_node crcnode;
438 uint32_t ofs, prevofs; 438 uint32_t ofs, prevofs, max_ofs;
439 uint32_t hdr_crc, buf_ofs, buf_len; 439 uint32_t hdr_crc, buf_ofs, buf_len;
440 int err; 440 int err;
441 int noise = 0; 441 int noise = 0;
@@ -550,12 +550,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
550 550
551 /* We temporarily use 'ofs' as a pointer into the buffer/jeb */ 551 /* We temporarily use 'ofs' as a pointer into the buffer/jeb */
552 ofs = 0; 552 ofs = 0;
553 553 max_ofs = EMPTY_SCAN_SIZE(c->sector_size);
554 /* Scan only 4KiB of 0xFF before declaring it's empty */ 554 /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */
555 while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) 555 while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
556 ofs += 4; 556 ofs += 4;
557 557
558 if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) { 558 if (ofs == max_ofs) {
559#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 559#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
560 if (jffs2_cleanmarker_oob(c)) { 560 if (jffs2_cleanmarker_oob(c)) {
561 /* scan oob, take care of cleanmarker */ 561 /* scan oob, take care of cleanmarker */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 662bba099501..c86041b866a4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/list.h> 16#include <linux/list.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
@@ -146,6 +145,7 @@ static const struct super_operations jffs2_super_operations =
146static int jffs2_fill_super(struct super_block *sb, void *data, int silent) 145static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
147{ 146{
148 struct jffs2_sb_info *c; 147 struct jffs2_sb_info *c;
148 int ret;
149 149
150 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():" 150 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
151 " New superblock for device %d (\"%s\")\n", 151 " New superblock for device %d (\"%s\")\n",
@@ -175,15 +175,15 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
175#ifdef CONFIG_JFFS2_FS_POSIX_ACL 175#ifdef CONFIG_JFFS2_FS_POSIX_ACL
176 sb->s_flags |= MS_POSIXACL; 176 sb->s_flags |= MS_POSIXACL;
177#endif 177#endif
178 return jffs2_do_fill_super(sb, data, silent); 178 ret = jffs2_do_fill_super(sb, data, silent);
179 return ret;
179} 180}
180 181
181static int jffs2_get_sb(struct file_system_type *fs_type, 182static struct dentry *jffs2_mount(struct file_system_type *fs_type,
182 int flags, const char *dev_name, 183 int flags, const char *dev_name,
183 void *data, struct vfsmount *mnt) 184 void *data)
184{ 185{
185 return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super, 186 return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
186 mnt);
187} 187}
188 188
189static void jffs2_put_super (struct super_block *sb) 189static void jffs2_put_super (struct super_block *sb)
@@ -192,8 +192,6 @@ static void jffs2_put_super (struct super_block *sb)
192 192
193 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); 193 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
194 194
195 lock_kernel();
196
197 if (sb->s_dirt) 195 if (sb->s_dirt)
198 jffs2_write_super(sb); 196 jffs2_write_super(sb);
199 197
@@ -215,8 +213,6 @@ static void jffs2_put_super (struct super_block *sb)
215 if (c->mtd->sync) 213 if (c->mtd->sync)
216 c->mtd->sync(c->mtd); 214 c->mtd->sync(c->mtd);
217 215
218 unlock_kernel();
219
220 D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); 216 D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
221} 217}
222 218
@@ -232,7 +228,7 @@ static void jffs2_kill_sb(struct super_block *sb)
232static struct file_system_type jffs2_fs_type = { 228static struct file_system_type jffs2_fs_type = {
233 .owner = THIS_MODULE, 229 .owner = THIS_MODULE,
234 .name = "jffs2", 230 .name = "jffs2",
235 .get_sb = jffs2_get_sb, 231 .mount = jffs2_mount,
236 .kill_sb = jffs2_kill_sb, 232 .kill_sb = jffs2_kill_sb,
237}; 233};
238 234
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb2..3a09423b6c22 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
497 * appear hashed, but do not put on any lists. hlist_del() 497 * appear hashed, but do not put on any lists. hlist_del()
498 * will work fine and require no locking. 498 * will work fine and require no locking.
499 */ 499 */
500 ip->i_hash.pprev = &ip->i_hash.next; 500 hlist_add_fake(&ip->i_hash);
501 501
502 return (ip); 502 return (ip);
503} 503}
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c51af2a14516..e1b8493b9aaa 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1010,15 +1010,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
1010 * option 2 - shutdown file systems 1010 * option 2 - shutdown file systems
1011 * associated with log ? 1011 * associated with log ?
1012 * option 3 - extend log ? 1012 * option 3 - extend log ?
1013 */
1014 /*
1015 * option 4 - second chance 1013 * option 4 - second chance
1016 * 1014 *
1017 * mark log wrapped, and continue. 1015 * mark log wrapped, and continue.
1018 * when all active transactions are completed, 1016 * when all active transactions are completed,
1019 * mark log vaild for recovery. 1017 * mark log valid for recovery.
1020 * if crashed during invalid state, log state 1018 * if crashed during invalid state, log state
1021 * implies invald log, forcing fsck(). 1019 * implies invalid log, forcing fsck().
1022 */ 1020 */
1023 /* mark log state log wrap in log superblock */ 1021 /* mark log state log wrap in log superblock */
1024 /* log->state = LOGWRAP; */ 1022 /* log->state = LOGWRAP; */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 7b698f2ec45a..9895595fd2f2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -97,7 +97,7 @@ int jfs_mount(struct super_block *sb)
97 97
98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); 98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
99 if (ipaimap == NULL) { 99 if (ipaimap == NULL) {
100 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 100 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
101 rc = -EIO; 101 rc = -EIO;
102 goto errout20; 102 goto errout20;
103 } 103 }
@@ -148,7 +148,7 @@ int jfs_mount(struct super_block *sb)
148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { 148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); 149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
150 if (!ipaimap2) { 150 if (!ipaimap2) {
151 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 151 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
152 rc = -EIO; 152 rc = -EIO;
153 goto errout35; 153 goto errout35;
154 } 154 }
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b445..9466957ec841 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1279 * lazy commit thread finishes processing 1279 * lazy commit thread finishes processing
1280 */ 1280 */
1281 if (tblk->xflag & COMMIT_DELETE) { 1281 if (tblk->xflag & COMMIT_DELETE) {
1282 atomic_inc(&tblk->u.ip->i_count); 1282 ihold(tblk->u.ip);
1283 /* 1283 /*
1284 * Avoid a rare deadlock 1284 * Avoid a rare deadlock
1285 * 1285 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675be..231ca4af9bce 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry,
839 ip->i_ctime = CURRENT_TIME; 839 ip->i_ctime = CURRENT_TIME;
840 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 840 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
841 mark_inode_dirty(dir); 841 mark_inode_dirty(dir);
842 atomic_inc(&ip->i_count); 842 ihold(ip);
843 843
844 iplist[0] = ip; 844 iplist[0] = ip;
845 iplist[1] = dir; 845 iplist[1] = dir;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index ec8c3e4baca3..0669fc1cc3bf 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -33,7 +33,6 @@
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <asm/uaccess.h> 34#include <asm/uaccess.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/smp_lock.h>
37 36
38#include "jfs_incore.h" 37#include "jfs_incore.h"
39#include "jfs_filsys.h" 38#include "jfs_filsys.h"
@@ -176,8 +175,6 @@ static void jfs_put_super(struct super_block *sb)
176 175
177 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 176 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
178 177
179 lock_kernel();
180
181 rc = jfs_umount(sb); 178 rc = jfs_umount(sb);
182 if (rc) 179 if (rc)
183 jfs_err("jfs_umount failed with return code %d", rc); 180 jfs_err("jfs_umount failed with return code %d", rc);
@@ -188,8 +185,6 @@ static void jfs_put_super(struct super_block *sb)
188 iput(sbi->direct_inode); 185 iput(sbi->direct_inode);
189 186
190 kfree(sbi); 187 kfree(sbi);
191
192 unlock_kernel();
193} 188}
194 189
195enum { 190enum {
@@ -369,19 +364,16 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
369 if (!parse_options(data, sb, &newLVSize, &flag)) { 364 if (!parse_options(data, sb, &newLVSize, &flag)) {
370 return -EINVAL; 365 return -EINVAL;
371 } 366 }
372 lock_kernel(); 367
373 if (newLVSize) { 368 if (newLVSize) {
374 if (sb->s_flags & MS_RDONLY) { 369 if (sb->s_flags & MS_RDONLY) {
375 printk(KERN_ERR 370 printk(KERN_ERR
376 "JFS: resize requires volume to be mounted read-write\n"); 371 "JFS: resize requires volume to be mounted read-write\n");
377 unlock_kernel();
378 return -EROFS; 372 return -EROFS;
379 } 373 }
380 rc = jfs_extendfs(sb, newLVSize, 0); 374 rc = jfs_extendfs(sb, newLVSize, 0);
381 if (rc) { 375 if (rc)
382 unlock_kernel();
383 return rc; 376 return rc;
384 }
385 } 377 }
386 378
387 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 379 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -397,36 +389,30 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
397 /* mark the fs r/w for quota activity */ 389 /* mark the fs r/w for quota activity */
398 sb->s_flags &= ~MS_RDONLY; 390 sb->s_flags &= ~MS_RDONLY;
399 391
400 unlock_kernel();
401 dquot_resume(sb, -1); 392 dquot_resume(sb, -1);
402 return ret; 393 return ret;
403 } 394 }
404 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 395 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
405 rc = dquot_suspend(sb, -1); 396 rc = dquot_suspend(sb, -1);
406 if (rc < 0) { 397 if (rc < 0) {
407 unlock_kernel();
408 return rc; 398 return rc;
409 } 399 }
410 rc = jfs_umount_rw(sb); 400 rc = jfs_umount_rw(sb);
411 JFS_SBI(sb)->flag = flag; 401 JFS_SBI(sb)->flag = flag;
412 unlock_kernel();
413 return rc; 402 return rc;
414 } 403 }
415 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) 404 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
416 if (!(sb->s_flags & MS_RDONLY)) { 405 if (!(sb->s_flags & MS_RDONLY)) {
417 rc = jfs_umount_rw(sb); 406 rc = jfs_umount_rw(sb);
418 if (rc) { 407 if (rc)
419 unlock_kernel();
420 return rc; 408 return rc;
421 } 409
422 JFS_SBI(sb)->flag = flag; 410 JFS_SBI(sb)->flag = flag;
423 ret = jfs_mount_rw(sb, 1); 411 ret = jfs_mount_rw(sb, 1);
424 unlock_kernel();
425 return ret; 412 return ret;
426 } 413 }
427 JFS_SBI(sb)->flag = flag; 414 JFS_SBI(sb)->flag = flag;
428 415
429 unlock_kernel();
430 return 0; 416 return 0;
431} 417}
432 418
@@ -446,6 +432,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
446 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); 432 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
447 if (!sbi) 433 if (!sbi)
448 return -ENOMEM; 434 return -ENOMEM;
435
449 sb->s_fs_info = sbi; 436 sb->s_fs_info = sbi;
450 sbi->sb = sb; 437 sbi->sb = sb;
451 sbi->uid = sbi->gid = sbi->umask = -1; 438 sbi->uid = sbi->gid = sbi->umask = -1;
@@ -596,11 +583,10 @@ static int jfs_unfreeze(struct super_block *sb)
596 return 0; 583 return 0;
597} 584}
598 585
599static int jfs_get_sb(struct file_system_type *fs_type, 586static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
600 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 587 int flags, const char *dev_name, void *data)
601{ 588{
602 return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super, 589 return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
603 mnt);
604} 590}
605 591
606static int jfs_sync_fs(struct super_block *sb, int wait) 592static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -783,7 +769,7 @@ static const struct export_operations jfs_export_operations = {
783static struct file_system_type jfs_fs_type = { 769static struct file_system_type jfs_fs_type = {
784 .owner = THIS_MODULE, 770 .owner = THIS_MODULE,
785 .name = "jfs", 771 .name = "jfs",
786 .get_sb = jfs_get_sb, 772 .mount = jfs_do_mount,
787 .kill_sb = kill_block_super, 773 .kill_sb = kill_block_super,
788 .fs_flags = FS_REQUIRES_DEV, 774 .fs_flags = FS_REQUIRES_DEV,
789}; 775};
diff --git a/fs/libfs.c b/fs/libfs.c
index 62baa0387d6e..a3accdf528ad 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -201,9 +201,8 @@ static const struct super_operations simple_super_operations = {
201 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that 201 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
202 * will never be mountable) 202 * will never be mountable)
203 */ 203 */
204int get_sb_pseudo(struct file_system_type *fs_type, char *name, 204struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
205 const struct super_operations *ops, unsigned long magic, 205 const struct super_operations *ops, unsigned long magic)
206 struct vfsmount *mnt)
207{ 206{
208 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 207 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
209 struct dentry *dentry; 208 struct dentry *dentry;
@@ -211,7 +210,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
211 struct qstr d_name = {.name = name, .len = strlen(name)}; 210 struct qstr d_name = {.name = name, .len = strlen(name)};
212 211
213 if (IS_ERR(s)) 212 if (IS_ERR(s))
214 return PTR_ERR(s); 213 return ERR_CAST(s);
215 214
216 s->s_flags = MS_NOUSER; 215 s->s_flags = MS_NOUSER;
217 s->s_maxbytes = MAX_LFS_FILESIZE; 216 s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -241,12 +240,11 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
241 d_instantiate(dentry, root); 240 d_instantiate(dentry, root);
242 s->s_root = dentry; 241 s->s_root = dentry;
243 s->s_flags |= MS_ACTIVE; 242 s->s_flags |= MS_ACTIVE;
244 simple_set_mnt(mnt, s); 243 return dget(s->s_root);
245 return 0;
246 244
247Enomem: 245Enomem:
248 deactivate_locked_super(s); 246 deactivate_locked_super(s);
249 return -ENOMEM; 247 return ERR_PTR(-ENOMEM);
250} 248}
251 249
252int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 250int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
@@ -255,7 +253,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
255 253
256 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 254 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
257 inc_nlink(inode); 255 inc_nlink(inode);
258 atomic_inc(&inode->i_count); 256 ihold(inode);
259 dget(dentry); 257 dget(dentry);
260 d_instantiate(dentry, inode); 258 d_instantiate(dentry, inode);
261 return 0; 259 return 0;
@@ -892,10 +890,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
892 */ 890 */
893int generic_file_fsync(struct file *file, int datasync) 891int generic_file_fsync(struct file *file, int datasync)
894{ 892{
895 struct writeback_control wbc = {
896 .sync_mode = WB_SYNC_ALL,
897 .nr_to_write = 0, /* metadata-only; caller takes care of data */
898 };
899 struct inode *inode = file->f_mapping->host; 893 struct inode *inode = file->f_mapping->host;
900 int err; 894 int err;
901 int ret; 895 int ret;
@@ -906,7 +900,7 @@ int generic_file_fsync(struct file *file, int datasync)
906 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 900 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
907 return ret; 901 return ret;
908 902
909 err = sync_inode(inode, &wbc); 903 err = sync_inode_metadata(inode, 1);
910 if (ret == 0) 904 if (ret == 0)
911 ret = err; 905 ret = err;
912 return ret; 906 return ret;
@@ -955,7 +949,7 @@ EXPORT_SYMBOL(dcache_dir_lseek);
955EXPORT_SYMBOL(dcache_dir_open); 949EXPORT_SYMBOL(dcache_dir_open);
956EXPORT_SYMBOL(dcache_readdir); 950EXPORT_SYMBOL(dcache_readdir);
957EXPORT_SYMBOL(generic_read_dir); 951EXPORT_SYMBOL(generic_read_dir);
958EXPORT_SYMBOL(get_sb_pseudo); 952EXPORT_SYMBOL(mount_pseudo);
959EXPORT_SYMBOL(simple_write_begin); 953EXPORT_SYMBOL(simple_write_begin);
960EXPORT_SYMBOL(simple_write_end); 954EXPORT_SYMBOL(simple_write_end);
961EXPORT_SYMBOL(simple_dir_inode_operations); 955EXPORT_SYMBOL(simple_dir_inode_operations);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 64fd427c993c..d5bb86866e6c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -42,6 +42,7 @@ struct nlm_wait {
42}; 42};
43 43
44static LIST_HEAD(nlm_blocked); 44static LIST_HEAD(nlm_blocked);
45static DEFINE_SPINLOCK(nlm_blocked_lock);
45 46
46/** 47/**
47 * nlmclnt_init - Set up per-NFS mount point lockd data structures 48 * nlmclnt_init - Set up per-NFS mount point lockd data structures
@@ -97,7 +98,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *
97 block->b_lock = fl; 98 block->b_lock = fl;
98 init_waitqueue_head(&block->b_wait); 99 init_waitqueue_head(&block->b_wait);
99 block->b_status = nlm_lck_blocked; 100 block->b_status = nlm_lck_blocked;
101
102 spin_lock(&nlm_blocked_lock);
100 list_add(&block->b_list, &nlm_blocked); 103 list_add(&block->b_list, &nlm_blocked);
104 spin_unlock(&nlm_blocked_lock);
101 } 105 }
102 return block; 106 return block;
103} 107}
@@ -106,7 +110,9 @@ void nlmclnt_finish_block(struct nlm_wait *block)
106{ 110{
107 if (block == NULL) 111 if (block == NULL)
108 return; 112 return;
113 spin_lock(&nlm_blocked_lock);
109 list_del(&block->b_list); 114 list_del(&block->b_list);
115 spin_unlock(&nlm_blocked_lock);
110 kfree(block); 116 kfree(block);
111} 117}
112 118
@@ -154,6 +160,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
154 * Look up blocked request based on arguments. 160 * Look up blocked request based on arguments.
155 * Warning: must not use cookie to match it! 161 * Warning: must not use cookie to match it!
156 */ 162 */
163 spin_lock(&nlm_blocked_lock);
157 list_for_each_entry(block, &nlm_blocked, b_list) { 164 list_for_each_entry(block, &nlm_blocked, b_list) {
158 struct file_lock *fl_blocked = block->b_lock; 165 struct file_lock *fl_blocked = block->b_lock;
159 166
@@ -178,6 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
178 wake_up(&block->b_wait); 185 wake_up(&block->b_wait);
179 res = nlm_granted; 186 res = nlm_granted;
180 } 187 }
188 spin_unlock(&nlm_blocked_lock);
181 return res; 189 return res;
182} 190}
183 191
@@ -216,10 +224,6 @@ reclaimer(void *ptr)
216 allow_signal(SIGKILL); 224 allow_signal(SIGKILL);
217 225
218 down_write(&host->h_rwsem); 226 down_write(&host->h_rwsem);
219
220 /* This one ensures that our parent doesn't terminate while the
221 * reclaim is in progress */
222 lock_kernel();
223 lockd_up(); /* note: this cannot fail as lockd is already running */ 227 lockd_up(); /* note: this cannot fail as lockd is already running */
224 228
225 dprintk("lockd: reclaiming locks for host %s\n", host->h_name); 229 dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
@@ -260,16 +264,17 @@ restart:
260 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); 264 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
261 265
262 /* Now, wake up all processes that sleep on a blocked lock */ 266 /* Now, wake up all processes that sleep on a blocked lock */
267 spin_lock(&nlm_blocked_lock);
263 list_for_each_entry(block, &nlm_blocked, b_list) { 268 list_for_each_entry(block, &nlm_blocked, b_list) {
264 if (block->b_host == host) { 269 if (block->b_host == host) {
265 block->b_status = nlm_lck_denied_grace_period; 270 block->b_status = nlm_lck_denied_grace_period;
266 wake_up(&block->b_wait); 271 wake_up(&block->b_wait);
267 } 272 }
268 } 273 }
274 spin_unlock(&nlm_blocked_lock);
269 275
270 /* Release host handle after use */ 276 /* Release host handle after use */
271 nlm_release_host(host); 277 nlm_release_host(host);
272 lockd_down(); 278 lockd_down();
273 unlock_kernel();
274 return 0; 279 return 0;
275} 280}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7932c399fab4..47ea1e1925b8 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -166,7 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
166 /* Set up the argument struct */ 166 /* Set up the argument struct */
167 nlmclnt_setlockargs(call, fl); 167 nlmclnt_setlockargs(call, fl);
168 168
169 lock_kernel();
170 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { 169 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
171 if (fl->fl_type != F_UNLCK) { 170 if (fl->fl_type != F_UNLCK) {
172 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; 171 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -177,10 +176,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
177 status = nlmclnt_test(call, fl); 176 status = nlmclnt_test(call, fl);
178 else 177 else
179 status = -EINVAL; 178 status = -EINVAL;
180
181 fl->fl_ops->fl_release_private(fl); 179 fl->fl_ops->fl_release_private(fl);
182 fl->fl_ops = NULL; 180 fl->fl_ops = NULL;
183 unlock_kernel();
184 181
185 dprintk("lockd: clnt proc returns %d\n", status); 182 dprintk("lockd: clnt proc returns %d\n", status);
186 return status; 183 return status;
@@ -226,9 +223,7 @@ void nlm_release_call(struct nlm_rqst *call)
226 223
227static void nlmclnt_rpc_release(void *data) 224static void nlmclnt_rpc_release(void *data)
228{ 225{
229 lock_kernel();
230 nlm_release_call(data); 226 nlm_release_call(data);
231 unlock_kernel();
232} 227}
233 228
234static int nlm_wait_on_grace(wait_queue_head_t *queue) 229static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -448,14 +443,18 @@ out:
448 443
449static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) 444static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
450{ 445{
446 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
451 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; 447 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
452 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); 448 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
453 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); 449 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
450 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
454} 451}
455 452
456static void nlmclnt_locks_release_private(struct file_lock *fl) 453static void nlmclnt_locks_release_private(struct file_lock *fl)
457{ 454{
455 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
458 list_del(&fl->fl_u.nfs_fl.list); 456 list_del(&fl->fl_u.nfs_fl.list);
457 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
459 nlm_put_lockowner(fl->fl_u.nfs_fl.owner); 458 nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
460} 459}
461 460
@@ -721,9 +720,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
721die: 720die:
722 return; 721 return;
723 retry_rebind: 722 retry_rebind:
724 lock_kernel();
725 nlm_rebind_host(req->a_host); 723 nlm_rebind_host(req->a_host);
726 unlock_kernel();
727 retry_unlock: 724 retry_unlock:
728 rpc_restart_call(task); 725 rpc_restart_call(task);
729} 726}
@@ -801,9 +798,7 @@ retry_cancel:
801 /* Don't ever retry more than 3 times */ 798 /* Don't ever retry more than 3 times */
802 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) 799 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
803 goto die; 800 goto die;
804 lock_kernel();
805 nlm_rebind_host(req->a_host); 801 nlm_rebind_host(req->a_host);
806 unlock_kernel();
807 rpc_restart_call(task); 802 rpc_restart_call(task);
808 rpc_delay(task, 30 * HZ); 803 rpc_delay(task, 30 * HZ);
809} 804}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index bb464d12104c..25e21e4023b2 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -353,6 +353,7 @@ nlm_bind_host(struct nlm_host *host)
353 .to_retries = 5U, 353 .to_retries = 5U,
354 }; 354 };
355 struct rpc_create_args args = { 355 struct rpc_create_args args = {
356 .net = &init_net,
356 .protocol = host->h_proto, 357 .protocol = host->h_proto,
357 .address = nlm_addr(host), 358 .address = nlm_addr(host),
358 .addrsize = host->h_addrlen, 359 .addrsize = host->h_addrlen,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e3015464fbab..e0c918949644 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void)
69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK), 69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
70 }; 70 };
71 struct rpc_create_args args = { 71 struct rpc_create_args args = {
72 .net = &init_net,
72 .protocol = XPRT_TRANSPORT_UDP, 73 .protocol = XPRT_TRANSPORT_UDP,
73 .address = (struct sockaddr *)&sin, 74 .address = (struct sockaddr *)&sin,
74 .addrsize = sizeof(sin), 75 .addrsize = sizeof(sin),
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index f1bacf1a0391..abfff9d7979d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -22,7 +22,6 @@
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/smp.h> 24#include <linux/smp.h>
25#include <linux/smp_lock.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/kthread.h> 26#include <linux/kthread.h>
28#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -130,15 +129,6 @@ lockd(void *vrqstp)
130 129
131 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); 130 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
132 131
133 /*
134 * FIXME: it would be nice if lockd didn't spend its entire life
135 * running under the BKL. At the very least, it would be good to
136 * have someone clarify what it's intended to protect here. I've
137 * seen some handwavy posts about posix locking needing to be
138 * done under the BKL, but it's far from clear.
139 */
140 lock_kernel();
141
142 if (!nlm_timeout) 132 if (!nlm_timeout)
143 nlm_timeout = LOCKD_DFLT_TIMEO; 133 nlm_timeout = LOCKD_DFLT_TIMEO;
144 nlmsvc_timeout = nlm_timeout * HZ; 134 nlmsvc_timeout = nlm_timeout * HZ;
@@ -195,7 +185,6 @@ lockd(void *vrqstp)
195 if (nlmsvc_ops) 185 if (nlmsvc_ops)
196 nlmsvc_invalidate_all(); 186 nlmsvc_invalidate_all();
197 nlm_shutdown_hosts(); 187 nlm_shutdown_hosts();
198 unlock_kernel();
199 return 0; 188 return 0;
200} 189}
201 190
@@ -206,7 +195,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
206 195
207 xprt = svc_find_xprt(serv, name, family, 0); 196 xprt = svc_find_xprt(serv, name, family, 0);
208 if (xprt == NULL) 197 if (xprt == NULL)
209 return svc_create_xprt(serv, name, family, port, 198 return svc_create_xprt(serv, name, &init_net, family, port,
210 SVC_SOCK_DEFAULTS); 199 SVC_SOCK_DEFAULTS);
211 svc_xprt_put(xprt); 200 svc_xprt_put(xprt);
212 return 0; 201 return 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 031c6569a134..a336e832475d 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -230,9 +230,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
230 230
231static void nlm4svc_callback_release(void *data) 231static void nlm4svc_callback_release(void *data)
232{ 232{
233 lock_kernel();
234 nlm_release_call(data); 233 nlm_release_call(data);
235 unlock_kernel();
236} 234}
237 235
238static const struct rpc_call_ops nlm4svc_callback_ops = { 236static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84055d31bfc5..c462d346acbd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -52,12 +52,13 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
52 * The list of blocked locks to retry 52 * The list of blocked locks to retry
53 */ 53 */
54static LIST_HEAD(nlm_blocked); 54static LIST_HEAD(nlm_blocked);
55static DEFINE_SPINLOCK(nlm_blocked_lock);
55 56
56/* 57/*
57 * Insert a blocked lock into the global list 58 * Insert a blocked lock into the global list
58 */ 59 */
59static void 60static void
60nlmsvc_insert_block(struct nlm_block *block, unsigned long when) 61nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
61{ 62{
62 struct nlm_block *b; 63 struct nlm_block *b;
63 struct list_head *pos; 64 struct list_head *pos;
@@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
87 block->b_when = when; 88 block->b_when = when;
88} 89}
89 90
91static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
92{
93 spin_lock(&nlm_blocked_lock);
94 nlmsvc_insert_block_locked(block, when);
95 spin_unlock(&nlm_blocked_lock);
96}
97
90/* 98/*
91 * Remove a block from the global list 99 * Remove a block from the global list
92 */ 100 */
@@ -94,7 +102,9 @@ static inline void
94nlmsvc_remove_block(struct nlm_block *block) 102nlmsvc_remove_block(struct nlm_block *block)
95{ 103{
96 if (!list_empty(&block->b_list)) { 104 if (!list_empty(&block->b_list)) {
105 spin_lock(&nlm_blocked_lock);
97 list_del_init(&block->b_list); 106 list_del_init(&block->b_list);
107 spin_unlock(&nlm_blocked_lock);
98 nlmsvc_release_block(block); 108 nlmsvc_release_block(block);
99 } 109 }
100} 110}
@@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
651 struct nlm_block *block; 661 struct nlm_block *block;
652 int rc = -ENOENT; 662 int rc = -ENOENT;
653 663
654 lock_kernel(); 664 spin_lock(&nlm_blocked_lock);
655 list_for_each_entry(block, &nlm_blocked, b_list) { 665 list_for_each_entry(block, &nlm_blocked, b_list) {
656 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 666 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
657 dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", 667 dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
@@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
665 } else if (result == 0) 675 } else if (result == 0)
666 block->b_granted = 1; 676 block->b_granted = 1;
667 677
668 nlmsvc_insert_block(block, 0); 678 nlmsvc_insert_block_locked(block, 0);
669 svc_wake_up(block->b_daemon); 679 svc_wake_up(block->b_daemon);
670 rc = 0; 680 rc = 0;
671 break; 681 break;
672 } 682 }
673 } 683 }
674 unlock_kernel(); 684 spin_unlock(&nlm_blocked_lock);
675 if (rc == -ENOENT) 685 if (rc == -ENOENT)
676 printk(KERN_WARNING "lockd: grant for unknown block\n"); 686 printk(KERN_WARNING "lockd: grant for unknown block\n");
677 return rc; 687 return rc;
@@ -690,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl)
690 struct nlm_block *block; 700 struct nlm_block *block;
691 701
692 dprintk("lockd: VFS unblock notification for block %p\n", fl); 702 dprintk("lockd: VFS unblock notification for block %p\n", fl);
703 spin_lock(&nlm_blocked_lock);
693 list_for_each_entry(block, &nlm_blocked, b_list) { 704 list_for_each_entry(block, &nlm_blocked, b_list) {
694 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 705 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
695 nlmsvc_insert_block(block, 0); 706 nlmsvc_insert_block_locked(block, 0);
707 spin_unlock(&nlm_blocked_lock);
696 svc_wake_up(block->b_daemon); 708 svc_wake_up(block->b_daemon);
697 return; 709 return;
698 } 710 }
699 } 711 }
700 712 spin_unlock(&nlm_blocked_lock);
701 printk(KERN_WARNING "lockd: notification for unknown block!\n"); 713 printk(KERN_WARNING "lockd: notification for unknown block!\n");
702} 714}
703 715
@@ -803,7 +815,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
803 815
804 dprintk("lockd: GRANT_MSG RPC callback\n"); 816 dprintk("lockd: GRANT_MSG RPC callback\n");
805 817
806 lock_kernel(); 818 spin_lock(&nlm_blocked_lock);
807 /* if the block is not on a list at this point then it has 819 /* if the block is not on a list at this point then it has
808 * been invalidated. Don't try to requeue it. 820 * been invalidated. Don't try to requeue it.
809 * 821 *
@@ -825,19 +837,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
825 /* Call was successful, now wait for client callback */ 837 /* Call was successful, now wait for client callback */
826 timeout = 60 * HZ; 838 timeout = 60 * HZ;
827 } 839 }
828 nlmsvc_insert_block(block, timeout); 840 nlmsvc_insert_block_locked(block, timeout);
829 svc_wake_up(block->b_daemon); 841 svc_wake_up(block->b_daemon);
830out: 842out:
831 unlock_kernel(); 843 spin_unlock(&nlm_blocked_lock);
832} 844}
833 845
846/*
847 * FIXME: nlmsvc_release_block() grabs a mutex. This is not allowed for an
848 * .rpc_release rpc_call_op
849 */
834static void nlmsvc_grant_release(void *data) 850static void nlmsvc_grant_release(void *data)
835{ 851{
836 struct nlm_rqst *call = data; 852 struct nlm_rqst *call = data;
837
838 lock_kernel();
839 nlmsvc_release_block(call->a_block); 853 nlmsvc_release_block(call->a_block);
840 unlock_kernel();
841} 854}
842 855
843static const struct rpc_call_ops nlmsvc_grant_ops = { 856static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0f2ab741ae7c..c3069f38d602 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -260,9 +260,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
260 260
261static void nlmsvc_callback_release(void *data) 261static void nlmsvc_callback_release(void *data)
262{ 262{
263 lock_kernel();
264 nlm_release_call(data); 263 nlm_release_call(data);
265 unlock_kernel();
266} 264}
267 265
268static const struct rpc_call_ops nlmsvc_callback_ops = { 266static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d0ef94cfb3da..1ca0679c80bf 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
170 170
171again: 171again:
172 file->f_locks = 0; 172 file->f_locks = 0;
173 lock_flocks(); /* protects i_flock list */
173 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 174 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
174 if (fl->fl_lmops != &nlmsvc_lock_operations) 175 if (fl->fl_lmops != &nlmsvc_lock_operations)
175 continue; 176 continue;
@@ -181,6 +182,7 @@ again:
181 if (match(lockhost, host)) { 182 if (match(lockhost, host)) {
182 struct file_lock lock = *fl; 183 struct file_lock lock = *fl;
183 184
185 unlock_flocks();
184 lock.fl_type = F_UNLCK; 186 lock.fl_type = F_UNLCK;
185 lock.fl_start = 0; 187 lock.fl_start = 0;
186 lock.fl_end = OFFSET_MAX; 188 lock.fl_end = OFFSET_MAX;
@@ -192,6 +194,7 @@ again:
192 goto again; 194 goto again;
193 } 195 }
194 } 196 }
197 unlock_flocks();
195 198
196 return 0; 199 return 0;
197} 200}
@@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file)
226 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) 229 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
227 return 1; 230 return 1;
228 231
232 lock_flocks();
229 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 233 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
230 if (fl->fl_lmops == &nlmsvc_lock_operations) 234 if (fl->fl_lmops == &nlmsvc_lock_operations) {
235 unlock_flocks();
231 return 1; 236 return 1;
237 }
232 } 238 }
239 unlock_flocks();
233 file->f_locks = 0; 240 file->f_locks = 0;
234 return 0; 241 return 0;
235} 242}
diff --git a/fs/locks.c b/fs/locks.c
index ab24d49fc048..0e62dd35d088 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -142,14 +142,32 @@ int lease_break_time = 45;
142 142
143static LIST_HEAD(file_lock_list); 143static LIST_HEAD(file_lock_list);
144static LIST_HEAD(blocked_list); 144static LIST_HEAD(blocked_list);
145static DEFINE_SPINLOCK(file_lock_lock);
146
147/*
148 * Protects the two list heads above, plus the inode->i_flock list
149 * FIXME: should use a spinlock, once lockd and ceph are ready.
150 */
151void lock_flocks(void)
152{
153 spin_lock(&file_lock_lock);
154}
155EXPORT_SYMBOL_GPL(lock_flocks);
156
157void unlock_flocks(void)
158{
159 spin_unlock(&file_lock_lock);
160}
161EXPORT_SYMBOL_GPL(unlock_flocks);
145 162
146static struct kmem_cache *filelock_cache __read_mostly; 163static struct kmem_cache *filelock_cache __read_mostly;
147 164
148/* Allocate an empty lock structure. */ 165/* Allocate an empty lock structure. */
149static struct file_lock *locks_alloc_lock(void) 166struct file_lock *locks_alloc_lock(void)
150{ 167{
151 return kmem_cache_alloc(filelock_cache, GFP_KERNEL); 168 return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
152} 169}
170EXPORT_SYMBOL_GPL(locks_alloc_lock);
153 171
154void locks_release_private(struct file_lock *fl) 172void locks_release_private(struct file_lock *fl)
155{ 173{
@@ -168,7 +186,7 @@ void locks_release_private(struct file_lock *fl)
168EXPORT_SYMBOL_GPL(locks_release_private); 186EXPORT_SYMBOL_GPL(locks_release_private);
169 187
170/* Free a lock which is not in use. */ 188/* Free a lock which is not in use. */
171static void locks_free_lock(struct file_lock *fl) 189void locks_free_lock(struct file_lock *fl)
172{ 190{
173 BUG_ON(waitqueue_active(&fl->fl_wait)); 191 BUG_ON(waitqueue_active(&fl->fl_wait));
174 BUG_ON(!list_empty(&fl->fl_block)); 192 BUG_ON(!list_empty(&fl->fl_block));
@@ -177,6 +195,7 @@ static void locks_free_lock(struct file_lock *fl)
177 locks_release_private(fl); 195 locks_release_private(fl);
178 kmem_cache_free(filelock_cache, fl); 196 kmem_cache_free(filelock_cache, fl);
179} 197}
198EXPORT_SYMBOL(locks_free_lock);
180 199
181void locks_init_lock(struct file_lock *fl) 200void locks_init_lock(struct file_lock *fl)
182{ 201{
@@ -216,11 +235,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
216 fl->fl_ops->fl_copy_lock(new, fl); 235 fl->fl_ops->fl_copy_lock(new, fl);
217 new->fl_ops = fl->fl_ops; 236 new->fl_ops = fl->fl_ops;
218 } 237 }
219 if (fl->fl_lmops) { 238 if (fl->fl_lmops)
220 if (fl->fl_lmops->fl_copy_lock)
221 fl->fl_lmops->fl_copy_lock(new, fl);
222 new->fl_lmops = fl->fl_lmops; 239 new->fl_lmops = fl->fl_lmops;
223 }
224} 240}
225 241
226/* 242/*
@@ -511,9 +527,9 @@ static void __locks_delete_block(struct file_lock *waiter)
511 */ 527 */
512static void locks_delete_block(struct file_lock *waiter) 528static void locks_delete_block(struct file_lock *waiter)
513{ 529{
514 lock_kernel(); 530 lock_flocks();
515 __locks_delete_block(waiter); 531 __locks_delete_block(waiter);
516 unlock_kernel(); 532 unlock_flocks();
517} 533}
518 534
519/* Insert waiter into blocker's block list. 535/* Insert waiter into blocker's block list.
@@ -644,7 +660,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
644{ 660{
645 struct file_lock *cfl; 661 struct file_lock *cfl;
646 662
647 lock_kernel(); 663 lock_flocks();
648 for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { 664 for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
649 if (!IS_POSIX(cfl)) 665 if (!IS_POSIX(cfl))
650 continue; 666 continue;
@@ -657,7 +673,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
657 fl->fl_pid = pid_vnr(cfl->fl_nspid); 673 fl->fl_pid = pid_vnr(cfl->fl_nspid);
658 } else 674 } else
659 fl->fl_type = F_UNLCK; 675 fl->fl_type = F_UNLCK;
660 unlock_kernel(); 676 unlock_flocks();
661 return; 677 return;
662} 678}
663EXPORT_SYMBOL(posix_test_lock); 679EXPORT_SYMBOL(posix_test_lock);
@@ -730,18 +746,16 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
730 int error = 0; 746 int error = 0;
731 int found = 0; 747 int found = 0;
732 748
733 lock_kernel(); 749 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
734 if (request->fl_flags & FL_ACCESS)
735 goto find_conflict;
736
737 if (request->fl_type != F_UNLCK) {
738 error = -ENOMEM;
739 new_fl = locks_alloc_lock(); 750 new_fl = locks_alloc_lock();
740 if (new_fl == NULL) 751 if (!new_fl)
741 goto out; 752 return -ENOMEM;
742 error = 0;
743 } 753 }
744 754
755 lock_flocks();
756 if (request->fl_flags & FL_ACCESS)
757 goto find_conflict;
758
745 for_each_lock(inode, before) { 759 for_each_lock(inode, before) {
746 struct file_lock *fl = *before; 760 struct file_lock *fl = *before;
747 if (IS_POSIX(fl)) 761 if (IS_POSIX(fl))
@@ -767,8 +781,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
767 * If a higher-priority process was blocked on the old file lock, 781 * If a higher-priority process was blocked on the old file lock,
768 * give it the opportunity to lock the file. 782 * give it the opportunity to lock the file.
769 */ 783 */
770 if (found) 784 if (found) {
785 unlock_flocks();
771 cond_resched(); 786 cond_resched();
787 lock_flocks();
788 }
772 789
773find_conflict: 790find_conflict:
774 for_each_lock(inode, before) { 791 for_each_lock(inode, before) {
@@ -794,7 +811,7 @@ find_conflict:
794 error = 0; 811 error = 0;
795 812
796out: 813out:
797 unlock_kernel(); 814 unlock_flocks();
798 if (new_fl) 815 if (new_fl)
799 locks_free_lock(new_fl); 816 locks_free_lock(new_fl);
800 return error; 817 return error;
@@ -823,7 +840,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
823 new_fl2 = locks_alloc_lock(); 840 new_fl2 = locks_alloc_lock();
824 } 841 }
825 842
826 lock_kernel(); 843 lock_flocks();
827 if (request->fl_type != F_UNLCK) { 844 if (request->fl_type != F_UNLCK) {
828 for_each_lock(inode, before) { 845 for_each_lock(inode, before) {
829 fl = *before; 846 fl = *before;
@@ -991,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
991 locks_wake_up_blocks(left); 1008 locks_wake_up_blocks(left);
992 } 1009 }
993 out: 1010 out:
994 unlock_kernel(); 1011 unlock_flocks();
995 /* 1012 /*
996 * Free any unused locks. 1013 * Free any unused locks.
997 */ 1014 */
@@ -1066,14 +1083,14 @@ int locks_mandatory_locked(struct inode *inode)
1066 /* 1083 /*
1067 * Search the lock list for this inode for any POSIX locks. 1084 * Search the lock list for this inode for any POSIX locks.
1068 */ 1085 */
1069 lock_kernel(); 1086 lock_flocks();
1070 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1087 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1071 if (!IS_POSIX(fl)) 1088 if (!IS_POSIX(fl))
1072 continue; 1089 continue;
1073 if (fl->fl_owner != owner) 1090 if (fl->fl_owner != owner)
1074 break; 1091 break;
1075 } 1092 }
1076 unlock_kernel(); 1093 unlock_flocks();
1077 return fl ? -EAGAIN : 0; 1094 return fl ? -EAGAIN : 0;
1078} 1095}
1079 1096
@@ -1186,7 +1203,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
1186 1203
1187 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); 1204 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
1188 1205
1189 lock_kernel(); 1206 lock_flocks();
1190 1207
1191 time_out_leases(inode); 1208 time_out_leases(inode);
1192 1209
@@ -1247,8 +1264,10 @@ restart:
1247 break_time++; 1264 break_time++;
1248 } 1265 }
1249 locks_insert_block(flock, new_fl); 1266 locks_insert_block(flock, new_fl);
1267 unlock_flocks();
1250 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1268 error = wait_event_interruptible_timeout(new_fl->fl_wait,
1251 !new_fl->fl_next, break_time); 1269 !new_fl->fl_next, break_time);
1270 lock_flocks();
1252 __locks_delete_block(new_fl); 1271 __locks_delete_block(new_fl);
1253 if (error >= 0) { 1272 if (error >= 0) {
1254 if (error == 0) 1273 if (error == 0)
@@ -1263,7 +1282,7 @@ restart:
1263 } 1282 }
1264 1283
1265out: 1284out:
1266 unlock_kernel(); 1285 unlock_flocks();
1267 if (!IS_ERR(new_fl)) 1286 if (!IS_ERR(new_fl))
1268 locks_free_lock(new_fl); 1287 locks_free_lock(new_fl);
1269 return error; 1288 return error;
@@ -1319,7 +1338,7 @@ int fcntl_getlease(struct file *filp)
1319 struct file_lock *fl; 1338 struct file_lock *fl;
1320 int type = F_UNLCK; 1339 int type = F_UNLCK;
1321 1340
1322 lock_kernel(); 1341 lock_flocks();
1323 time_out_leases(filp->f_path.dentry->d_inode); 1342 time_out_leases(filp->f_path.dentry->d_inode);
1324 for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); 1343 for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
1325 fl = fl->fl_next) { 1344 fl = fl->fl_next) {
@@ -1328,7 +1347,7 @@ int fcntl_getlease(struct file *filp)
1328 break; 1347 break;
1329 } 1348 }
1330 } 1349 }
1331 unlock_kernel(); 1350 unlock_flocks();
1332 return type; 1351 return type;
1333} 1352}
1334 1353
@@ -1341,36 +1360,32 @@ int fcntl_getlease(struct file *filp)
1341 * The (input) flp->fl_lmops->fl_break function is required 1360 * The (input) flp->fl_lmops->fl_break function is required
1342 * by break_lease(). 1361 * by break_lease().
1343 * 1362 *
1344 * Called with kernel lock held. 1363 * Called with file_lock_lock held.
1345 */ 1364 */
1346int generic_setlease(struct file *filp, long arg, struct file_lock **flp) 1365int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1347{ 1366{
1348 struct file_lock *fl, **before, **my_before = NULL, *lease; 1367 struct file_lock *fl, **before, **my_before = NULL, *lease;
1349 struct file_lock *new_fl = NULL;
1350 struct dentry *dentry = filp->f_path.dentry; 1368 struct dentry *dentry = filp->f_path.dentry;
1351 struct inode *inode = dentry->d_inode; 1369 struct inode *inode = dentry->d_inode;
1352 int error, rdlease_count = 0, wrlease_count = 0; 1370 int error, rdlease_count = 0, wrlease_count = 0;
1353 1371
1372 lease = *flp;
1373
1374 error = -EACCES;
1354 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) 1375 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
1355 return -EACCES; 1376 goto out;
1377 error = -EINVAL;
1356 if (!S_ISREG(inode->i_mode)) 1378 if (!S_ISREG(inode->i_mode))
1357 return -EINVAL; 1379 goto out;
1358 error = security_file_lock(filp, arg); 1380 error = security_file_lock(filp, arg);
1359 if (error) 1381 if (error)
1360 return error; 1382 goto out;
1361 1383
1362 time_out_leases(inode); 1384 time_out_leases(inode);
1363 1385
1364 BUG_ON(!(*flp)->fl_lmops->fl_break); 1386 BUG_ON(!(*flp)->fl_lmops->fl_break);
1365 1387
1366 lease = *flp;
1367
1368 if (arg != F_UNLCK) { 1388 if (arg != F_UNLCK) {
1369 error = -ENOMEM;
1370 new_fl = locks_alloc_lock();
1371 if (new_fl == NULL)
1372 goto out;
1373
1374 error = -EAGAIN; 1389 error = -EAGAIN;
1375 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1390 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1376 goto out; 1391 goto out;
@@ -1410,12 +1425,12 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1410 goto out; 1425 goto out;
1411 1426
1412 if (my_before != NULL) { 1427 if (my_before != NULL) {
1413 *flp = *my_before;
1414 error = lease->fl_lmops->fl_change(my_before, arg); 1428 error = lease->fl_lmops->fl_change(my_before, arg);
1429 if (!error)
1430 *flp = *my_before;
1415 goto out; 1431 goto out;
1416 } 1432 }
1417 1433
1418 error = 0;
1419 if (arg == F_UNLCK) 1434 if (arg == F_UNLCK)
1420 goto out; 1435 goto out;
1421 1436
@@ -1423,20 +1438,23 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1423 if (!leases_enable) 1438 if (!leases_enable)
1424 goto out; 1439 goto out;
1425 1440
1426 locks_copy_lock(new_fl, lease); 1441 locks_insert_lock(before, lease);
1427 locks_insert_lock(before, new_fl);
1428
1429 *flp = new_fl;
1430 return 0; 1442 return 0;
1431 1443
1432out: 1444out:
1433 if (new_fl != NULL)
1434 locks_free_lock(new_fl);
1435 return error; 1445 return error;
1436} 1446}
1437EXPORT_SYMBOL(generic_setlease); 1447EXPORT_SYMBOL(generic_setlease);
1438 1448
1439 /** 1449static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
1450{
1451 if (filp->f_op && filp->f_op->setlease)
1452 return filp->f_op->setlease(filp, arg, lease);
1453 else
1454 return generic_setlease(filp, arg, lease);
1455}
1456
1457/**
1440 * vfs_setlease - sets a lease on an open file 1458 * vfs_setlease - sets a lease on an open file
1441 * @filp: file pointer 1459 * @filp: file pointer
1442 * @arg: type of lease to obtain 1460 * @arg: type of lease to obtain
@@ -1467,17 +1485,67 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
1467{ 1485{
1468 int error; 1486 int error;
1469 1487
1470 lock_kernel(); 1488 lock_flocks();
1471 if (filp->f_op && filp->f_op->setlease) 1489 error = __vfs_setlease(filp, arg, lease);
1472 error = filp->f_op->setlease(filp, arg, lease); 1490 unlock_flocks();
1473 else
1474 error = generic_setlease(filp, arg, lease);
1475 unlock_kernel();
1476 1491
1477 return error; 1492 return error;
1478} 1493}
1479EXPORT_SYMBOL_GPL(vfs_setlease); 1494EXPORT_SYMBOL_GPL(vfs_setlease);
1480 1495
1496static int do_fcntl_delete_lease(struct file *filp)
1497{
1498 struct file_lock fl, *flp = &fl;
1499
1500 lease_init(filp, F_UNLCK, flp);
1501
1502 return vfs_setlease(filp, F_UNLCK, &flp);
1503}
1504
1505static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1506{
1507 struct file_lock *fl, *ret;
1508 struct fasync_struct *new;
1509 int error;
1510
1511 fl = lease_alloc(filp, arg);
1512 if (IS_ERR(fl))
1513 return PTR_ERR(fl);
1514
1515 new = fasync_alloc();
1516 if (!new) {
1517 locks_free_lock(fl);
1518 return -ENOMEM;
1519 }
1520 ret = fl;
1521 lock_flocks();
1522 error = __vfs_setlease(filp, arg, &ret);
1523 if (error) {
1524 unlock_flocks();
1525 locks_free_lock(fl);
1526 goto out_free_fasync;
1527 }
1528 if (ret != fl)
1529 locks_free_lock(fl);
1530
1531 /*
1532 * fasync_insert_entry() returns the old entry if any.
1533 * If there was no old entry, then it used 'new' and
1534 * inserted it into the fasync list. Clear new so that
1535 * we don't release it here.
1536 */
1537 if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
1538 new = NULL;
1539
1540 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1541 unlock_flocks();
1542
1543out_free_fasync:
1544 if (new)
1545 fasync_free(new);
1546 return error;
1547}
1548
1481/** 1549/**
1482 * fcntl_setlease - sets a lease on an open file 1550 * fcntl_setlease - sets a lease on an open file
1483 * @fd: open file descriptor 1551 * @fd: open file descriptor
@@ -1490,34 +1558,9 @@ EXPORT_SYMBOL_GPL(vfs_setlease);
1490 */ 1558 */
1491int fcntl_setlease(unsigned int fd, struct file *filp, long arg) 1559int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1492{ 1560{
1493 struct file_lock fl, *flp = &fl; 1561 if (arg == F_UNLCK)
1494 struct inode *inode = filp->f_path.dentry->d_inode; 1562 return do_fcntl_delete_lease(filp);
1495 int error; 1563 return do_fcntl_add_lease(fd, filp, arg);
1496
1497 locks_init_lock(&fl);
1498 error = lease_init(filp, arg, &fl);
1499 if (error)
1500 return error;
1501
1502 lock_kernel();
1503
1504 error = vfs_setlease(filp, arg, &flp);
1505 if (error || arg == F_UNLCK)
1506 goto out_unlock;
1507
1508 error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
1509 if (error < 0) {
1510 /* remove lease just inserted by setlease */
1511 flp->fl_type = F_UNLCK | F_INPROGRESS;
1512 flp->fl_break_time = jiffies - 10;
1513 time_out_leases(inode);
1514 goto out_unlock;
1515 }
1516
1517 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1518out_unlock:
1519 unlock_kernel();
1520 return error;
1521} 1564}
1522 1565
1523/** 1566/**
@@ -2020,7 +2063,7 @@ void locks_remove_flock(struct file *filp)
2020 fl.fl_ops->fl_release_private(&fl); 2063 fl.fl_ops->fl_release_private(&fl);
2021 } 2064 }
2022 2065
2023 lock_kernel(); 2066 lock_flocks();
2024 before = &inode->i_flock; 2067 before = &inode->i_flock;
2025 2068
2026 while ((fl = *before) != NULL) { 2069 while ((fl = *before) != NULL) {
@@ -2038,7 +2081,7 @@ void locks_remove_flock(struct file *filp)
2038 } 2081 }
2039 before = &fl->fl_next; 2082 before = &fl->fl_next;
2040 } 2083 }
2041 unlock_kernel(); 2084 unlock_flocks();
2042} 2085}
2043 2086
2044/** 2087/**
@@ -2053,12 +2096,12 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
2053{ 2096{
2054 int status = 0; 2097 int status = 0;
2055 2098
2056 lock_kernel(); 2099 lock_flocks();
2057 if (waiter->fl_next) 2100 if (waiter->fl_next)
2058 __locks_delete_block(waiter); 2101 __locks_delete_block(waiter);
2059 else 2102 else
2060 status = -ENOENT; 2103 status = -ENOENT;
2061 unlock_kernel(); 2104 unlock_flocks();
2062 return status; 2105 return status;
2063} 2106}
2064 2107
@@ -2085,7 +2128,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
2085#include <linux/seq_file.h> 2128#include <linux/seq_file.h>
2086 2129
2087static void lock_get_status(struct seq_file *f, struct file_lock *fl, 2130static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2088 int id, char *pfx) 2131 loff_t id, char *pfx)
2089{ 2132{
2090 struct inode *inode = NULL; 2133 struct inode *inode = NULL;
2091 unsigned int fl_pid; 2134 unsigned int fl_pid;
@@ -2098,7 +2141,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2098 if (fl->fl_file != NULL) 2141 if (fl->fl_file != NULL)
2099 inode = fl->fl_file->f_path.dentry->d_inode; 2142 inode = fl->fl_file->f_path.dentry->d_inode;
2100 2143
2101 seq_printf(f, "%d:%s ", id, pfx); 2144 seq_printf(f, "%lld:%s ", id, pfx);
2102 if (IS_POSIX(fl)) { 2145 if (IS_POSIX(fl)) {
2103 seq_printf(f, "%6s %s ", 2146 seq_printf(f, "%6s %s ",
2104 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", 2147 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
@@ -2161,30 +2204,33 @@ static int locks_show(struct seq_file *f, void *v)
2161 2204
2162 fl = list_entry(v, struct file_lock, fl_link); 2205 fl = list_entry(v, struct file_lock, fl_link);
2163 2206
2164 lock_get_status(f, fl, (long)f->private, ""); 2207 lock_get_status(f, fl, *((loff_t *)f->private), "");
2165 2208
2166 list_for_each_entry(bfl, &fl->fl_block, fl_block) 2209 list_for_each_entry(bfl, &fl->fl_block, fl_block)
2167 lock_get_status(f, bfl, (long)f->private, " ->"); 2210 lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
2168 2211
2169 f->private++;
2170 return 0; 2212 return 0;
2171} 2213}
2172 2214
2173static void *locks_start(struct seq_file *f, loff_t *pos) 2215static void *locks_start(struct seq_file *f, loff_t *pos)
2174{ 2216{
2175 lock_kernel(); 2217 loff_t *p = f->private;
2176 f->private = (void *)1; 2218
2219 lock_flocks();
2220 *p = (*pos + 1);
2177 return seq_list_start(&file_lock_list, *pos); 2221 return seq_list_start(&file_lock_list, *pos);
2178} 2222}
2179 2223
2180static void *locks_next(struct seq_file *f, void *v, loff_t *pos) 2224static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
2181{ 2225{
2226 loff_t *p = f->private;
2227 ++*p;
2182 return seq_list_next(v, &file_lock_list, pos); 2228 return seq_list_next(v, &file_lock_list, pos);
2183} 2229}
2184 2230
2185static void locks_stop(struct seq_file *f, void *v) 2231static void locks_stop(struct seq_file *f, void *v)
2186{ 2232{
2187 unlock_kernel(); 2233 unlock_flocks();
2188} 2234}
2189 2235
2190static const struct seq_operations locks_seq_operations = { 2236static const struct seq_operations locks_seq_operations = {
@@ -2196,14 +2242,14 @@ static const struct seq_operations locks_seq_operations = {
2196 2242
2197static int locks_open(struct inode *inode, struct file *filp) 2243static int locks_open(struct inode *inode, struct file *filp)
2198{ 2244{
2199 return seq_open(filp, &locks_seq_operations); 2245 return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
2200} 2246}
2201 2247
2202static const struct file_operations proc_locks_operations = { 2248static const struct file_operations proc_locks_operations = {
2203 .open = locks_open, 2249 .open = locks_open,
2204 .read = seq_read, 2250 .read = seq_read,
2205 .llseek = seq_lseek, 2251 .llseek = seq_lseek,
2206 .release = seq_release, 2252 .release = seq_release_private,
2207}; 2253};
2208 2254
2209static int __init proc_locks_init(void) 2255static int __init proc_locks_init(void)
@@ -2231,7 +2277,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2231{ 2277{
2232 struct file_lock *fl; 2278 struct file_lock *fl;
2233 int result = 1; 2279 int result = 1;
2234 lock_kernel(); 2280 lock_flocks();
2235 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2281 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2236 if (IS_POSIX(fl)) { 2282 if (IS_POSIX(fl)) {
2237 if (fl->fl_type == F_RDLCK) 2283 if (fl->fl_type == F_RDLCK)
@@ -2248,7 +2294,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2248 result = 0; 2294 result = 0;
2249 break; 2295 break;
2250 } 2296 }
2251 unlock_kernel(); 2297 unlock_flocks();
2252 return result; 2298 return result;
2253} 2299}
2254 2300
@@ -2271,7 +2317,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2271{ 2317{
2272 struct file_lock *fl; 2318 struct file_lock *fl;
2273 int result = 1; 2319 int result = 1;
2274 lock_kernel(); 2320 lock_flocks();
2275 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2321 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2276 if (IS_POSIX(fl)) { 2322 if (IS_POSIX(fl)) {
2277 if ((fl->fl_end < start) || (fl->fl_start > (start + len))) 2323 if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2286,7 +2332,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2286 result = 0; 2332 result = 0;
2287 break; 2333 break;
2288 } 2334 }
2289 unlock_kernel(); 2335 unlock_flocks();
2290 return result; 2336 return result;
2291} 2337}
2292 2338
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9bd2ce2a3040..92ca6fbe09bd 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
298 return sync_request(page, bdev, WRITE); 298 return sync_request(page, bdev, WRITE);
299} 299}
300 300
301static void bdev_put_device(struct super_block *sb) 301static void bdev_put_device(struct logfs_super *s)
302{ 302{
303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); 303 close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
304} 304}
305 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs) 306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -320,8 +320,8 @@ static const struct logfs_device_ops bd_devops = {
320 .put_device = bdev_put_device, 320 .put_device = bdev_put_device,
321}; 321};
322 322
323int logfs_get_sb_bdev(struct file_system_type *type, int flags, 323int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
324 const char *devname, struct vfsmount *mnt) 324 const char *devname)
325{ 325{
326 struct block_device *bdev; 326 struct block_device *bdev;
327 327
@@ -332,8 +332,11 @@ int logfs_get_sb_bdev(struct file_system_type *type, int flags,
332 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { 332 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
333 int mtdnr = MINOR(bdev->bd_dev); 333 int mtdnr = MINOR(bdev->bd_dev);
334 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 334 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
335 return logfs_get_sb_mtd(type, flags, mtdnr, mnt); 335 return logfs_get_sb_mtd(p, mtdnr);
336 } 336 }
337 337
338 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); 338 p->s_bdev = bdev;
339 p->s_mtd = NULL;
340 p->s_devops = &bd_devops;
341 return 0;
339} 342}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index a85d47d13e4b..7466e9dcc8c5 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
230 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 230 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
231} 231}
232 232
233static void mtd_put_device(struct super_block *sb) 233static void mtd_put_device(struct logfs_super *s)
234{ 234{
235 put_mtd_device(logfs_super(sb)->s_mtd); 235 put_mtd_device(s->s_mtd);
236} 236}
237 237
238static int mtd_can_write_buf(struct super_block *sb, u64 ofs) 238static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
@@ -265,14 +265,14 @@ static const struct logfs_device_ops mtd_devops = {
265 .put_device = mtd_put_device, 265 .put_device = mtd_put_device,
266}; 266};
267 267
268int logfs_get_sb_mtd(struct file_system_type *type, int flags, 268int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
269 int mtdnr, struct vfsmount *mnt)
270{ 269{
271 struct mtd_info *mtd; 270 struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
272 const struct logfs_device_ops *devops = &mtd_devops;
273
274 mtd = get_mtd_device(NULL, mtdnr);
275 if (IS_ERR(mtd)) 271 if (IS_ERR(mtd))
276 return PTR_ERR(mtd); 272 return PTR_ERR(mtd);
277 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); 273
274 s->s_bdev = NULL;
275 s->s_mtd = mtd;
276 s->s_devops = &mtd_devops;
277 return 0;
278} 278}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9777eb5b5522..409dfd65e9a1 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -569,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
569 return -EMLINK; 569 return -EMLINK;
570 570
571 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 571 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
572 atomic_inc(&inode->i_count); 572 ihold(inode);
573 inode->i_nlink++; 573 inode->i_nlink++;
574 mark_inode_dirty_sync(inode); 574 mark_inode_dirty_sync(inode);
575 575
@@ -827,4 +827,5 @@ const struct file_operations logfs_dir_fops = {
827 .unlocked_ioctl = logfs_ioctl, 827 .unlocked_ioctl = logfs_ioctl,
828 .readdir = logfs_readdir, 828 .readdir = logfs_readdir,
829 .read = generic_read_dir, 829 .read = generic_read_dir,
830 .llseek = default_llseek,
830}; 831};
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index b8786264d243..57afd4a6fabb 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -136,6 +136,7 @@ struct logfs_area_ops {
136 int (*erase_segment)(struct logfs_area *area); 136 int (*erase_segment)(struct logfs_area *area);
137}; 137};
138 138
139struct logfs_super; /* forward */
139/** 140/**
140 * struct logfs_device_ops - device access operations 141 * struct logfs_device_ops - device access operations
141 * 142 *
@@ -156,7 +157,7 @@ struct logfs_device_ops {
156 int ensure_write); 157 int ensure_write);
157 int (*can_write_buf)(struct super_block *sb, u64 ofs); 158 int (*can_write_buf)(struct super_block *sb, u64 ofs);
158 void (*sync)(struct super_block *sb); 159 void (*sync)(struct super_block *sb);
159 void (*put_device)(struct super_block *sb); 160 void (*put_device)(struct logfs_super *s);
160}; 161};
161 162
162/** 163/**
@@ -471,11 +472,13 @@ void logfs_compr_exit(void);
471 472
472/* dev_bdev.c */ 473/* dev_bdev.c */
473#ifdef CONFIG_BLOCK 474#ifdef CONFIG_BLOCK
474int logfs_get_sb_bdev(struct file_system_type *type, int flags, 475int logfs_get_sb_bdev(struct logfs_super *s,
475 const char *devname, struct vfsmount *mnt); 476 struct file_system_type *type,
477 const char *devname);
476#else 478#else
477static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, 479static inline int logfs_get_sb_bdev(struct logfs_super *s,
478 const char *devname, struct vfsmount *mnt) 480 struct file_system_type *type,
481 const char *devname)
479{ 482{
480 return -ENODEV; 483 return -ENODEV;
481} 484}
@@ -483,11 +486,9 @@ static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
483 486
484/* dev_mtd.c */ 487/* dev_mtd.c */
485#ifdef CONFIG_MTD 488#ifdef CONFIG_MTD
486int logfs_get_sb_mtd(struct file_system_type *type, int flags, 489int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
487 int mtdnr, struct vfsmount *mnt);
488#else 490#else
489static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags, 491static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
490 int mtdnr, struct vfsmount *mnt)
491{ 492{
492 return -ENODEV; 493 return -ENODEV;
493} 494}
@@ -619,9 +620,6 @@ void emergency_read_end(struct page *page);
619void logfs_crash_dump(struct super_block *sb); 620void logfs_crash_dump(struct super_block *sb);
620void *memchr_inv(const void *s, int c, size_t n); 621void *memchr_inv(const void *s, int c, size_t n);
621int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); 622int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
622int logfs_get_sb_device(struct file_system_type *type, int flags,
623 struct mtd_info *mtd, struct block_device *bdev,
624 const struct logfs_device_ops *devops, struct vfsmount *mnt);
625int logfs_check_ds(struct logfs_disk_super *ds); 623int logfs_check_ds(struct logfs_disk_super *ds);
626int logfs_write_sb(struct super_block *sb); 624int logfs_write_sb(struct super_block *sb);
627 625
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 5336155c5d81..33435e4b14d2 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb)
325 return 0; 325 return 0;
326} 326}
327 327
328static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) 328static int logfs_get_sb_final(struct super_block *sb)
329{ 329{
330 struct logfs_super *super = logfs_super(sb); 330 struct logfs_super *super = logfs_super(sb);
331 struct inode *rootdir; 331 struct inode *rootdir;
@@ -356,7 +356,6 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
356 } 356 }
357 357
358 log_super("LogFS: Finished mounting\n"); 358 log_super("LogFS: Finished mounting\n");
359 simple_set_mnt(mnt, sb);
360 return 0; 359 return 0;
361 360
362fail: 361fail:
@@ -529,43 +528,37 @@ static void logfs_kill_sb(struct super_block *sb)
529 logfs_cleanup_rw(sb); 528 logfs_cleanup_rw(sb);
530 if (super->s_erase_page) 529 if (super->s_erase_page)
531 __free_page(super->s_erase_page); 530 __free_page(super->s_erase_page);
532 super->s_devops->put_device(sb); 531 super->s_devops->put_device(super);
533 logfs_mempool_destroy(super->s_btree_pool); 532 logfs_mempool_destroy(super->s_btree_pool);
534 logfs_mempool_destroy(super->s_alias_pool); 533 logfs_mempool_destroy(super->s_alias_pool);
535 kfree(super); 534 kfree(super);
536 log_super("LogFS: Finished unmounting\n"); 535 log_super("LogFS: Finished unmounting\n");
537} 536}
538 537
539int logfs_get_sb_device(struct file_system_type *type, int flags, 538static struct dentry *logfs_get_sb_device(struct logfs_super *super,
540 struct mtd_info *mtd, struct block_device *bdev, 539 struct file_system_type *type, int flags)
541 const struct logfs_device_ops *devops, struct vfsmount *mnt)
542{ 540{
543 struct logfs_super *super;
544 struct super_block *sb; 541 struct super_block *sb;
545 int err = -ENOMEM; 542 int err = -ENOMEM;
546 static int mount_count; 543 static int mount_count;
547 544
548 log_super("LogFS: Start mount %x\n", mount_count++); 545 log_super("LogFS: Start mount %x\n", mount_count++);
549 super = kzalloc(sizeof(*super), GFP_KERNEL);
550 if (!super)
551 goto err0;
552 546
553 super->s_mtd = mtd;
554 super->s_bdev = bdev;
555 err = -EINVAL; 547 err = -EINVAL;
556 sb = sget(type, logfs_sb_test, logfs_sb_set, super); 548 sb = sget(type, logfs_sb_test, logfs_sb_set, super);
557 if (IS_ERR(sb)) 549 if (IS_ERR(sb)) {
558 goto err0; 550 super->s_devops->put_device(super);
551 kfree(super);
552 return ERR_CAST(sb);
553 }
559 554
560 if (sb->s_root) { 555 if (sb->s_root) {
561 /* Device is already in use */ 556 /* Device is already in use */
562 err = 0; 557 super->s_devops->put_device(super);
563 simple_set_mnt(mnt, sb); 558 kfree(super);
564 goto err0; 559 return dget(sb->s_root);
565 } 560 }
566 561
567 super->s_devops = devops;
568
569 /* 562 /*
570 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache 563 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
571 * only covers 16TB and the upper 8TB are used for indirect blocks. 564 * only covers 16TB and the upper 8TB are used for indirect blocks.
@@ -581,10 +574,12 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
581 goto err1; 574 goto err1;
582 575
583 sb->s_flags |= MS_ACTIVE; 576 sb->s_flags |= MS_ACTIVE;
584 err = logfs_get_sb_final(sb, mnt); 577 err = logfs_get_sb_final(sb);
585 if (err) 578 if (err) {
586 deactivate_locked_super(sb); 579 deactivate_locked_super(sb);
587 return err; 580 return ERR_PTR(err);
581 }
582 return dget(sb->s_root);
588 583
589err1: 584err1:
590 /* no ->s_root, no ->put_super() */ 585 /* no ->s_root, no ->put_super() */
@@ -592,37 +587,45 @@ err1:
592 iput(super->s_segfile_inode); 587 iput(super->s_segfile_inode);
593 iput(super->s_mapping_inode); 588 iput(super->s_mapping_inode);
594 deactivate_locked_super(sb); 589 deactivate_locked_super(sb);
595 return err; 590 return ERR_PTR(err);
596err0:
597 kfree(super);
598 //devops->put_device(sb);
599 return err;
600} 591}
601 592
602static int logfs_get_sb(struct file_system_type *type, int flags, 593static struct dentry *logfs_mount(struct file_system_type *type, int flags,
603 const char *devname, void *data, struct vfsmount *mnt) 594 const char *devname, void *data)
604{ 595{
605 ulong mtdnr; 596 ulong mtdnr;
597 struct logfs_super *super;
598 int err;
606 599
607 if (!devname) 600 super = kzalloc(sizeof(*super), GFP_KERNEL);
608 return logfs_get_sb_bdev(type, flags, devname, mnt); 601 if (!super)
609 if (strncmp(devname, "mtd", 3)) 602 return ERR_PTR(-ENOMEM);
610 return logfs_get_sb_bdev(type, flags, devname, mnt);
611 603
612 { 604 if (!devname)
605 err = logfs_get_sb_bdev(super, type, devname);
606 else if (strncmp(devname, "mtd", 3))
607 err = logfs_get_sb_bdev(super, type, devname);
608 else {
613 char *garbage; 609 char *garbage;
614 mtdnr = simple_strtoul(devname+3, &garbage, 0); 610 mtdnr = simple_strtoul(devname+3, &garbage, 0);
615 if (*garbage) 611 if (*garbage)
616 return -EINVAL; 612 err = -EINVAL;
613 else
614 err = logfs_get_sb_mtd(super, mtdnr);
615 }
616
617 if (err) {
618 kfree(super);
619 return ERR_PTR(err);
617 } 620 }
618 621
619 return logfs_get_sb_mtd(type, flags, mtdnr, mnt); 622 return logfs_get_sb_device(super, type, flags);
620} 623}
621 624
622static struct file_system_type logfs_fs_type = { 625static struct file_system_type logfs_fs_type = {
623 .owner = THIS_MODULE, 626 .owner = THIS_MODULE,
624 .name = "logfs", 627 .name = "logfs",
625 .get_sb = logfs_get_sb, 628 .mount = logfs_mount,
626 .kill_sb = logfs_kill_sb, 629 .kill_sb = logfs_kill_sb,
627 .fs_flags = FS_REQUIRES_DEV, 630 .fs_flags = FS_REQUIRES_DEV,
628 631
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e39d6bf2e8fb..fb2020858a34 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -614,17 +614,16 @@ void minix_truncate(struct inode * inode)
614 V2_minix_truncate(inode); 614 V2_minix_truncate(inode);
615} 615}
616 616
617static int minix_get_sb(struct file_system_type *fs_type, 617static struct dentry *minix_mount(struct file_system_type *fs_type,
618 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 618 int flags, const char *dev_name, void *data)
619{ 619{
620 return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super, 620 return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
621 mnt);
622} 621}
623 622
624static struct file_system_type minix_fs_type = { 623static struct file_system_type minix_fs_type = {
625 .owner = THIS_MODULE, 624 .owner = THIS_MODULE,
626 .name = "minix", 625 .name = "minix",
627 .get_sb = minix_get_sb, 626 .mount = minix_mount,
628 .kill_sb = kill_block_super, 627 .kill_sb = kill_block_super,
629 .fs_flags = FS_REQUIRES_DEV, 628 .fs_flags = FS_REQUIRES_DEV,
630}; 629};
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f3f3578393a4..c0d35a3accef 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
101 101
102 inode->i_ctime = CURRENT_TIME_SEC; 102 inode->i_ctime = CURRENT_TIME_SEC;
103 inode_inc_link_count(inode); 103 inode_inc_link_count(inode);
104 atomic_inc(&inode->i_count); 104 ihold(inode);
105 return add_nondir(dentry, inode); 105 return add_nondir(dentry, inode);
106} 106}
107 107
diff --git a/fs/namei.c b/fs/namei.c
index 24896e833565..5362af9b7372 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1121,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1121static struct dentry *__lookup_hash(struct qstr *name, 1121static struct dentry *__lookup_hash(struct qstr *name,
1122 struct dentry *base, struct nameidata *nd) 1122 struct dentry *base, struct nameidata *nd)
1123{ 1123{
1124 struct inode *inode = base->d_inode;
1124 struct dentry *dentry; 1125 struct dentry *dentry;
1125 struct inode *inode;
1126 int err; 1126 int err;
1127 1127
1128 inode = base->d_inode; 1128 err = exec_permission(inode);
1129 if (err)
1130 return ERR_PTR(err);
1129 1131
1130 /* 1132 /*
1131 * See if the low-level filesystem might want 1133 * See if the low-level filesystem might want
@@ -1161,11 +1163,6 @@ out:
1161 */ 1163 */
1162static struct dentry *lookup_hash(struct nameidata *nd) 1164static struct dentry *lookup_hash(struct nameidata *nd)
1163{ 1165{
1164 int err;
1165
1166 err = exec_permission(nd->path.dentry->d_inode);
1167 if (err)
1168 return ERR_PTR(err);
1169 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1166 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1170} 1167}
1171 1168
@@ -1213,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1213 if (err) 1210 if (err)
1214 return ERR_PTR(err); 1211 return ERR_PTR(err);
1215 1212
1216 err = exec_permission(base->d_inode);
1217 if (err)
1218 return ERR_PTR(err);
1219 return __lookup_hash(&this, base, NULL); 1213 return __lookup_hash(&this, base, NULL);
1220} 1214}
1221 1215
@@ -1580,6 +1574,7 @@ static struct file *finish_open(struct nameidata *nd,
1580 */ 1574 */
1581 if (will_truncate) 1575 if (will_truncate)
1582 mnt_drop_write(nd->path.mnt); 1576 mnt_drop_write(nd->path.mnt);
1577 path_put(&nd->path);
1583 return filp; 1578 return filp;
1584 1579
1585exit: 1580exit:
@@ -1681,6 +1676,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1681 } 1676 }
1682 filp = nameidata_to_filp(nd); 1677 filp = nameidata_to_filp(nd);
1683 mnt_drop_write(nd->path.mnt); 1678 mnt_drop_write(nd->path.mnt);
1679 path_put(&nd->path);
1684 if (!IS_ERR(filp)) { 1680 if (!IS_ERR(filp)) {
1685 error = ima_file_check(filp, acc_mode); 1681 error = ima_file_check(filp, acc_mode);
1686 if (error) { 1682 if (error) {
@@ -2291,7 +2287,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2291 goto slashes; 2287 goto slashes;
2292 inode = dentry->d_inode; 2288 inode = dentry->d_inode;
2293 if (inode) 2289 if (inode)
2294 atomic_inc(&inode->i_count); 2290 ihold(inode);
2295 error = mnt_want_write(nd.path.mnt); 2291 error = mnt_want_write(nd.path.mnt);
2296 if (error) 2292 if (error)
2297 goto exit2; 2293 goto exit2;
diff --git a/fs/namespace.c b/fs/namespace.c
index a72eaabfe8f2..8a415c9c5e55 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -595,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
595 goto out_free; 595 goto out_free;
596 } 596 }
597 597
598 mnt->mnt_flags = old->mnt_flags; 598 mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
599 atomic_inc(&sb->s_active); 599 atomic_inc(&sb->s_active);
600 mnt->mnt_sb = sb; 600 mnt->mnt_sb = sb;
601 mnt->mnt_root = dget(root); 601 mnt->mnt_root = dget(root);
@@ -1744,9 +1744,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
1744 if (!capable(CAP_SYS_ADMIN)) 1744 if (!capable(CAP_SYS_ADMIN))
1745 return -EPERM; 1745 return -EPERM;
1746 1746
1747 lock_kernel();
1748 mnt = do_kern_mount(type, flags, name, data); 1747 mnt = do_kern_mount(type, flags, name, data);
1749 unlock_kernel();
1750 if (IS_ERR(mnt)) 1748 if (IS_ERR(mnt))
1751 return PTR_ERR(mnt); 1749 return PTR_ERR(mnt);
1752 1750
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9578cbe0cd58..aac8832e919e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -95,6 +95,34 @@ const struct dentry_operations ncp_root_dentry_operations =
95}; 95};
96 96
97 97
98#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
99
100static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
101{
102#ifdef CONFIG_NCPFS_SMALLDOS
103 int ns = ncp_namespace(i);
104
105 if ((ns == NW_NS_DOS)
106#ifdef CONFIG_NCPFS_OS2_NS
107 || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
108#endif /* CONFIG_NCPFS_OS2_NS */
109 )
110 return 0;
111#endif /* CONFIG_NCPFS_SMALLDOS */
112 return 1;
113}
114
115#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS)
116
117static inline int ncp_case_sensitive(struct dentry *dentry)
118{
119#ifdef CONFIG_NCPFS_NFS_NS
120 return ncp_namespace(dentry->d_inode) == NW_NS_NFS;
121#else
122 return 0;
123#endif /* CONFIG_NCPFS_NFS_NS */
124}
125
98/* 126/*
99 * Note: leave the hash unchanged if the directory 127 * Note: leave the hash unchanged if the directory
100 * is case-sensitive. 128 * is case-sensitive.
@@ -102,13 +130,12 @@ const struct dentry_operations ncp_root_dentry_operations =
102static int 130static int
103ncp_hash_dentry(struct dentry *dentry, struct qstr *this) 131ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
104{ 132{
105 struct nls_table *t; 133 if (!ncp_case_sensitive(dentry)) {
106 unsigned long hash; 134 struct nls_table *t;
107 int i; 135 unsigned long hash;
108 136 int i;
109 t = NCP_IO_TABLE(dentry);
110 137
111 if (!ncp_case_sensitive(dentry->d_inode)) { 138 t = NCP_IO_TABLE(dentry);
112 hash = init_name_hash(); 139 hash = init_name_hash();
113 for (i=0; i<this->len ; i++) 140 for (i=0; i<this->len ; i++)
114 hash = partial_name_hash(ncp_tolower(t, this->name[i]), 141 hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -124,7 +151,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
124 if (a->len != b->len) 151 if (a->len != b->len)
125 return 1; 152 return 1;
126 153
127 if (ncp_case_sensitive(dentry->d_inode)) 154 if (ncp_case_sensitive(dentry))
128 return strncmp(a->name, b->name, a->len); 155 return strncmp(a->name, b->name, a->len);
129 156
130 return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len); 157 return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len);
@@ -266,7 +293,7 @@ leave_me:;
266 293
267 294
268static int 295static int
269__ncp_lookup_validate(struct dentry *dentry) 296ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
270{ 297{
271 struct ncp_server *server; 298 struct ncp_server *server;
272 struct dentry *parent; 299 struct dentry *parent;
@@ -283,9 +310,6 @@ __ncp_lookup_validate(struct dentry *dentry)
283 310
284 server = NCP_SERVER(dir); 311 server = NCP_SERVER(dir);
285 312
286 if (!ncp_conn_valid(server))
287 goto finished;
288
289 /* 313 /*
290 * Inspired by smbfs: 314 * Inspired by smbfs:
291 * The default validation is based on dentry age: 315 * The default validation is based on dentry age:
@@ -304,8 +328,11 @@ __ncp_lookup_validate(struct dentry *dentry)
304 if (ncp_is_server_root(dir)) { 328 if (ncp_is_server_root(dir)) {
305 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 329 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
306 dentry->d_name.len, 1); 330 dentry->d_name.len, 1);
307 if (!res) 331 if (!res) {
308 res = ncp_lookup_volume(server, __name, &(finfo.i)); 332 res = ncp_lookup_volume(server, __name, &(finfo.i));
333 if (!res)
334 ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
335 }
309 } else { 336 } else {
310 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 337 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
311 dentry->d_name.len, !ncp_preserve_case(dir)); 338 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -320,13 +347,17 @@ __ncp_lookup_validate(struct dentry *dentry)
320 * what we remember, it's not valid any more. 347 * what we remember, it's not valid any more.
321 */ 348 */
322 if (!res) { 349 if (!res) {
323 if (finfo.i.dirEntNum == NCP_FINFO(dentry->d_inode)->dirEntNum) { 350 struct inode *inode = dentry->d_inode;
351
352 mutex_lock(&inode->i_mutex);
353 if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
324 ncp_new_dentry(dentry); 354 ncp_new_dentry(dentry);
325 val=1; 355 val=1;
326 } else 356 } else
327 DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n"); 357 DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
328 358
329 ncp_update_inode2(dentry->d_inode, &finfo); 359 ncp_update_inode2(inode, &finfo);
360 mutex_unlock(&inode->i_mutex);
330 } 361 }
331 362
332finished: 363finished:
@@ -335,16 +366,6 @@ finished:
335 return val; 366 return val;
336} 367}
337 368
338static int
339ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
340{
341 int res;
342 lock_kernel();
343 res = __ncp_lookup_validate(dentry);
344 unlock_kernel();
345 return res;
346}
347
348static struct dentry * 369static struct dentry *
349ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) 370ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
350{ 371{
@@ -411,8 +432,6 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
411 int result, mtime_valid = 0; 432 int result, mtime_valid = 0;
412 time_t mtime = 0; 433 time_t mtime = 0;
413 434
414 lock_kernel();
415
416 ctl.page = NULL; 435 ctl.page = NULL;
417 ctl.cache = NULL; 436 ctl.cache = NULL;
418 437
@@ -421,6 +440,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
421 (int) filp->f_pos); 440 (int) filp->f_pos);
422 441
423 result = -EIO; 442 result = -EIO;
443 /* Do not generate '.' and '..' when server is dead. */
424 if (!ncp_conn_valid(server)) 444 if (!ncp_conn_valid(server))
425 goto out; 445 goto out;
426 446
@@ -532,6 +552,12 @@ read_really:
532 ctl.head.end = ctl.fpos - 1; 552 ctl.head.end = ctl.fpos - 1;
533 ctl.head.eof = ctl.valid; 553 ctl.head.eof = ctl.valid;
534finished: 554finished:
555 if (ctl.page) {
556 kunmap(ctl.page);
557 SetPageUptodate(ctl.page);
558 unlock_page(ctl.page);
559 page_cache_release(ctl.page);
560 }
535 if (page) { 561 if (page) {
536 cache->head = ctl.head; 562 cache->head = ctl.head;
537 kunmap(page); 563 kunmap(page);
@@ -539,23 +565,17 @@ finished:
539 unlock_page(page); 565 unlock_page(page);
540 page_cache_release(page); 566 page_cache_release(page);
541 } 567 }
542 if (ctl.page) {
543 kunmap(ctl.page);
544 SetPageUptodate(ctl.page);
545 unlock_page(ctl.page);
546 page_cache_release(ctl.page);
547 }
548out: 568out:
549 unlock_kernel();
550 return result; 569 return result;
551} 570}
552 571
553static int 572static int
554ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 573ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
555 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry) 574 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
575 int inval_childs)
556{ 576{
557 struct dentry *newdent, *dentry = filp->f_path.dentry; 577 struct dentry *newdent, *dentry = filp->f_path.dentry;
558 struct inode *newino, *inode = dentry->d_inode; 578 struct inode *dir = dentry->d_inode;
559 struct ncp_cache_control ctl = *ctrl; 579 struct ncp_cache_control ctl = *ctrl;
560 struct qstr qname; 580 struct qstr qname;
561 int valid = 0; 581 int valid = 0;
@@ -564,9 +584,9 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
564 __u8 __name[NCP_MAXPATHLEN + 1]; 584 __u8 __name[NCP_MAXPATHLEN + 1];
565 585
566 qname.len = sizeof(__name); 586 qname.len = sizeof(__name);
567 if (ncp_vol2io(NCP_SERVER(inode), __name, &qname.len, 587 if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len,
568 entry->i.entryName, entry->i.nameLen, 588 entry->i.entryName, entry->i.nameLen,
569 !ncp_preserve_entry_case(inode, entry->i.NSCreator))) 589 !ncp_preserve_entry_case(dir, entry->i.NSCreator)))
570 return 1; /* I'm not sure */ 590 return 1; /* I'm not sure */
571 591
572 qname.name = __name; 592 qname.name = __name;
@@ -584,22 +604,64 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
584 goto end_advance; 604 goto end_advance;
585 } else { 605 } else {
586 hashed = 1; 606 hashed = 1;
587 memcpy((char *) newdent->d_name.name, qname.name, 607
588 newdent->d_name.len); 608 /* If case sensitivity changed for this volume, all entries below this one
609 should be thrown away. This entry itself is not affected, as its case
610 sensitivity is controlled by its own parent. */
611 if (inval_childs)
612 shrink_dcache_parent(newdent);
613
614 /*
615 * It is not as dangerous as it looks. NetWare's OS2 namespace is
616 * case preserving yet case insensitive. So we update dentry's name
617 * as received from server. We found dentry via d_lookup with our
618 * hash, so we know that hash does not change, and so replacing name
619 * should be reasonably safe.
620 */
621 if (qname.len == newdent->d_name.len &&
622 memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
623 struct inode *inode = newdent->d_inode;
624
625 /*
626 * Inside ncpfs all uses of d_name are either for debugging,
627 * or on functions which acquire inode mutex (mknod, creat,
628 * lookup). So grab i_mutex here, to be sure. d_path
629 * uses dcache_lock when generating path, so we should too.
630 * And finally d_compare is protected by dentry's d_lock, so
631 * here we go.
632 */
633 if (inode)
634 mutex_lock(&inode->i_mutex);
635 spin_lock(&dcache_lock);
636 spin_lock(&newdent->d_lock);
637 memcpy((char *) newdent->d_name.name, qname.name,
638 newdent->d_name.len);
639 spin_unlock(&newdent->d_lock);
640 spin_unlock(&dcache_lock);
641 if (inode)
642 mutex_unlock(&inode->i_mutex);
643 }
589 } 644 }
590 645
591 if (!newdent->d_inode) { 646 if (!newdent->d_inode) {
647 struct inode *inode;
648
592 entry->opened = 0; 649 entry->opened = 0;
593 entry->ino = iunique(inode->i_sb, 2); 650 entry->ino = iunique(dir->i_sb, 2);
594 newino = ncp_iget(inode->i_sb, entry); 651 inode = ncp_iget(dir->i_sb, entry);
595 if (newino) { 652 if (inode) {
596 newdent->d_op = &ncp_dentry_operations; 653 newdent->d_op = &ncp_dentry_operations;
597 d_instantiate(newdent, newino); 654 d_instantiate(newdent, inode);
598 if (!hashed) 655 if (!hashed)
599 d_rehash(newdent); 656 d_rehash(newdent);
600 } 657 }
601 } else 658 } else {
602 ncp_update_inode2(newdent->d_inode, entry); 659 struct inode *inode = newdent->d_inode;
660
661 mutex_lock(&inode->i_mutex);
662 ncp_update_inode2(inode, entry);
663 mutex_unlock(&inode->i_mutex);
664 }
603 665
604 if (newdent->d_inode) { 666 if (newdent->d_inode) {
605 ino = newdent->d_inode->i_ino; 667 ino = newdent->d_inode->i_ino;
@@ -617,7 +679,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
617 ctl.cache = NULL; 679 ctl.cache = NULL;
618 ctl.idx -= NCP_DIRCACHE_SIZE; 680 ctl.idx -= NCP_DIRCACHE_SIZE;
619 ctl.ofs += 1; 681 ctl.ofs += 1;
620 ctl.page = grab_cache_page(&inode->i_data, ctl.ofs); 682 ctl.page = grab_cache_page(&dir->i_data, ctl.ofs);
621 if (ctl.page) 683 if (ctl.page)
622 ctl.cache = kmap(ctl.page); 684 ctl.cache = kmap(ctl.page);
623 } 685 }
@@ -633,7 +695,7 @@ end_advance:
633 if (!ino) 695 if (!ino)
634 ino = find_inode_number(dentry, &qname); 696 ino = find_inode_number(dentry, &qname);
635 if (!ino) 697 if (!ino)
636 ino = iunique(inode->i_sb, 2); 698 ino = iunique(dir->i_sb, 2);
637 ctl.filled = filldir(dirent, qname.name, qname.len, 699 ctl.filled = filldir(dirent, qname.name, qname.len,
638 filp->f_pos, ino, DT_UNKNOWN); 700 filp->f_pos, ino, DT_UNKNOWN);
639 if (!ctl.filled) 701 if (!ctl.filled)
@@ -660,6 +722,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
660 (unsigned long) filp->f_pos); 722 (unsigned long) filp->f_pos);
661 723
662 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { 724 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
725 int inval_dentry;
663 726
664 if (ncp_get_volume_info_with_number(server, i, &info) != 0) 727 if (ncp_get_volume_info_with_number(server, i, &info) != 0)
665 return; 728 return;
@@ -675,8 +738,9 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
675 info.volume_name); 738 info.volume_name);
676 continue; 739 continue;
677 } 740 }
741 inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
678 entry.volume = entry.i.volNumber; 742 entry.volume = entry.i.volNumber;
679 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry)) 743 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
680 return; 744 return;
681 } 745 }
682} 746}
@@ -739,7 +803,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
739 rpl += onerpl; 803 rpl += onerpl;
740 rpls -= onerpl; 804 rpls -= onerpl;
741 entry.volume = entry.i.volNumber; 805 entry.volume = entry.i.volNumber;
742 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry)) 806 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
743 break; 807 break;
744 } 808 }
745 } while (more); 809 } while (more);
@@ -775,17 +839,19 @@ int ncp_conn_logged_in(struct super_block *sb)
775 if (dent) { 839 if (dent) {
776 struct inode* ino = dent->d_inode; 840 struct inode* ino = dent->d_inode;
777 if (ino) { 841 if (ino) {
842 ncp_update_known_namespace(server, volNumber, NULL);
778 NCP_FINFO(ino)->volNumber = volNumber; 843 NCP_FINFO(ino)->volNumber = volNumber;
779 NCP_FINFO(ino)->dirEntNum = dirEntNum; 844 NCP_FINFO(ino)->dirEntNum = dirEntNum;
780 NCP_FINFO(ino)->DosDirNum = DosDirNum; 845 NCP_FINFO(ino)->DosDirNum = DosDirNum;
846 result = 0;
781 } else { 847 } else {
782 DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n"); 848 DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
783 } 849 }
784 } else { 850 } else {
785 DPRINTK("ncpfs: sb->s_root == NULL!\n"); 851 DPRINTK("ncpfs: sb->s_root == NULL!\n");
786 } 852 }
787 } 853 } else
788 result = 0; 854 result = 0;
789 855
790out: 856out:
791 return result; 857 return result;
@@ -799,7 +865,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
799 int error, res, len; 865 int error, res, len;
800 __u8 __name[NCP_MAXPATHLEN + 1]; 866 __u8 __name[NCP_MAXPATHLEN + 1];
801 867
802 lock_kernel();
803 error = -EIO; 868 error = -EIO;
804 if (!ncp_conn_valid(server)) 869 if (!ncp_conn_valid(server))
805 goto finished; 870 goto finished;
@@ -813,6 +878,8 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
813 dentry->d_name.len, 1); 878 dentry->d_name.len, 1);
814 if (!res) 879 if (!res)
815 res = ncp_lookup_volume(server, __name, &(finfo.i)); 880 res = ncp_lookup_volume(server, __name, &(finfo.i));
881 if (!res)
882 ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
816 } else { 883 } else {
817 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 884 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
818 dentry->d_name.len, !ncp_preserve_case(dir)); 885 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -846,7 +913,6 @@ add_entry:
846 913
847finished: 914finished:
848 PPRINTK("ncp_lookup: result=%d\n", error); 915 PPRINTK("ncp_lookup: result=%d\n", error);
849 unlock_kernel();
850 return ERR_PTR(error); 916 return ERR_PTR(error);
851} 917}
852 918
@@ -887,11 +953,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
887 PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n", 953 PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n",
888 dentry->d_parent->d_name.name, dentry->d_name.name, mode); 954 dentry->d_parent->d_name.name, dentry->d_name.name, mode);
889 955
890 error = -EIO;
891 lock_kernel();
892 if (!ncp_conn_valid(server))
893 goto out;
894
895 ncp_age_dentry(server, dentry); 956 ncp_age_dentry(server, dentry);
896 len = sizeof(__name); 957 len = sizeof(__name);
897 error = ncp_io2vol(server, __name, &len, dentry->d_name.name, 958 error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -917,6 +978,8 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
917 if (result) { 978 if (result) {
918 if (result == 0x87) 979 if (result == 0x87)
919 error = -ENAMETOOLONG; 980 error = -ENAMETOOLONG;
981 else if (result < 0)
982 error = result;
920 DPRINTK("ncp_create: %s/%s failed\n", 983 DPRINTK("ncp_create: %s/%s failed\n",
921 dentry->d_parent->d_name.name, dentry->d_name.name); 984 dentry->d_parent->d_name.name, dentry->d_name.name);
922 goto out; 985 goto out;
@@ -935,7 +998,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
935 998
936 error = ncp_instantiate(dir, dentry, &finfo); 999 error = ncp_instantiate(dir, dentry, &finfo);
937out: 1000out:
938 unlock_kernel();
939 return error; 1001 return error;
940} 1002}
941 1003
@@ -955,11 +1017,6 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
955 DPRINTK("ncp_mkdir: making %s/%s\n", 1017 DPRINTK("ncp_mkdir: making %s/%s\n",
956 dentry->d_parent->d_name.name, dentry->d_name.name); 1018 dentry->d_parent->d_name.name, dentry->d_name.name);
957 1019
958 error = -EIO;
959 lock_kernel();
960 if (!ncp_conn_valid(server))
961 goto out;
962
963 ncp_age_dentry(server, dentry); 1020 ncp_age_dentry(server, dentry);
964 len = sizeof(__name); 1021 len = sizeof(__name);
965 error = ncp_io2vol(server, __name, &len, dentry->d_name.name, 1022 error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -967,12 +1024,11 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
967 if (error) 1024 if (error)
968 goto out; 1025 goto out;
969 1026
970 error = -EACCES; 1027 error = ncp_open_create_file_or_subdir(server, dir, __name,
971 if (ncp_open_create_file_or_subdir(server, dir, __name,
972 OC_MODE_CREATE, aDIR, 1028 OC_MODE_CREATE, aDIR,
973 cpu_to_le16(0xffff), 1029 cpu_to_le16(0xffff),
974 &finfo) == 0) 1030 &finfo);
975 { 1031 if (error == 0) {
976 if (ncp_is_nfs_extras(server, finfo.volume)) { 1032 if (ncp_is_nfs_extras(server, finfo.volume)) {
977 mode |= S_IFDIR; 1033 mode |= S_IFDIR;
978 finfo.i.nfs.mode = mode; 1034 finfo.i.nfs.mode = mode;
@@ -983,9 +1039,10 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
983 goto out; 1039 goto out;
984 } 1040 }
985 error = ncp_instantiate(dir, dentry, &finfo); 1041 error = ncp_instantiate(dir, dentry, &finfo);
1042 } else if (error > 0) {
1043 error = -EACCES;
986 } 1044 }
987out: 1045out:
988 unlock_kernel();
989 return error; 1046 return error;
990} 1047}
991 1048
@@ -998,11 +1055,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
998 DPRINTK("ncp_rmdir: removing %s/%s\n", 1055 DPRINTK("ncp_rmdir: removing %s/%s\n",
999 dentry->d_parent->d_name.name, dentry->d_name.name); 1056 dentry->d_parent->d_name.name, dentry->d_name.name);
1000 1057
1001 error = -EIO;
1002 lock_kernel();
1003 if (!ncp_conn_valid(server))
1004 goto out;
1005
1006 error = -EBUSY; 1058 error = -EBUSY;
1007 if (!d_unhashed(dentry)) 1059 if (!d_unhashed(dentry))
1008 goto out; 1060 goto out;
@@ -1036,11 +1088,10 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
1036 error = -ENOENT; 1088 error = -ENOENT;
1037 break; 1089 break;
1038 default: 1090 default:
1039 error = -EACCES; 1091 error = result < 0 ? result : -EACCES;
1040 break; 1092 break;
1041 } 1093 }
1042out: 1094out:
1043 unlock_kernel();
1044 return error; 1095 return error;
1045} 1096}
1046 1097
@@ -1050,15 +1101,10 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
1050 struct ncp_server *server; 1101 struct ncp_server *server;
1051 int error; 1102 int error;
1052 1103
1053 lock_kernel();
1054 server = NCP_SERVER(dir); 1104 server = NCP_SERVER(dir);
1055 DPRINTK("ncp_unlink: unlinking %s/%s\n", 1105 DPRINTK("ncp_unlink: unlinking %s/%s\n",
1056 dentry->d_parent->d_name.name, dentry->d_name.name); 1106 dentry->d_parent->d_name.name, dentry->d_name.name);
1057 1107
1058 error = -EIO;
1059 if (!ncp_conn_valid(server))
1060 goto out;
1061
1062 /* 1108 /*
1063 * Check whether to close the file ... 1109 * Check whether to close the file ...
1064 */ 1110 */
@@ -1097,12 +1143,9 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
1097 error = -ENOENT; 1143 error = -ENOENT;
1098 break; 1144 break;
1099 default: 1145 default:
1100 error = -EACCES; 1146 error = error < 0 ? error : -EACCES;
1101 break; 1147 break;
1102 } 1148 }
1103
1104out:
1105 unlock_kernel();
1106 return error; 1149 return error;
1107} 1150}
1108 1151
@@ -1118,11 +1161,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1118 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1161 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1119 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1162 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1120 1163
1121 error = -EIO;
1122 lock_kernel();
1123 if (!ncp_conn_valid(server))
1124 goto out;
1125
1126 ncp_age_dentry(server, old_dentry); 1164 ncp_age_dentry(server, old_dentry);
1127 ncp_age_dentry(server, new_dentry); 1165 ncp_age_dentry(server, new_dentry);
1128 1166
@@ -1161,11 +1199,10 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1161 error = -ENOENT; 1199 error = -ENOENT;
1162 break; 1200 break;
1163 default: 1201 default:
1164 error = -EACCES; 1202 error = error < 0 ? error : -EACCES;
1165 break; 1203 break;
1166 } 1204 }
1167out: 1205out:
1168 unlock_kernel();
1169 return error; 1206 return error;
1170} 1207}
1171 1208
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3639cc5cbdae..6c754f70c529 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -113,9 +113,6 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
113 DPRINTK("ncp_file_read: enter %s/%s\n", 113 DPRINTK("ncp_file_read: enter %s/%s\n",
114 dentry->d_parent->d_name.name, dentry->d_name.name); 114 dentry->d_parent->d_name.name, dentry->d_name.name);
115 115
116 if (!ncp_conn_valid(NCP_SERVER(inode)))
117 return -EIO;
118
119 pos = *ppos; 116 pos = *ppos;
120 117
121 if ((ssize_t) count < 0) { 118 if ((ssize_t) count < 0) {
@@ -192,13 +189,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
192 189
193 DPRINTK("ncp_file_write: enter %s/%s\n", 190 DPRINTK("ncp_file_write: enter %s/%s\n",
194 dentry->d_parent->d_name.name, dentry->d_name.name); 191 dentry->d_parent->d_name.name, dentry->d_name.name);
195 if (!ncp_conn_valid(NCP_SERVER(inode)))
196 return -EIO;
197 if ((ssize_t) count < 0) 192 if ((ssize_t) count < 0)
198 return -EINVAL; 193 return -EINVAL;
199 pos = *ppos; 194 pos = *ppos;
200 if (file->f_flags & O_APPEND) { 195 if (file->f_flags & O_APPEND) {
201 pos = inode->i_size; 196 pos = i_size_read(inode);
202 } 197 }
203 198
204 if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { 199 if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
@@ -264,8 +259,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
264 259
265 *ppos = pos; 260 *ppos = pos;
266 261
267 if (pos > inode->i_size) { 262 if (pos > i_size_read(inode)) {
268 inode->i_size = pos; 263 mutex_lock(&inode->i_mutex);
264 if (pos > i_size_read(inode))
265 i_size_write(inode, pos);
266 mutex_unlock(&inode->i_mutex);
269 } 267 }
270 DPRINTK("ncp_file_write: exit %s/%s\n", 268 DPRINTK("ncp_file_write: exit %s/%s\n",
271 dentry->d_parent->d_name.name, dentry->d_name.name); 269 dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -281,18 +279,9 @@ static int ncp_release(struct inode *inode, struct file *file) {
281 return 0; 279 return 0;
282} 280}
283 281
284static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
285{
286 loff_t ret;
287 lock_kernel();
288 ret = generic_file_llseek_unlocked(file, offset, origin);
289 unlock_kernel();
290 return ret;
291}
292
293const struct file_operations ncp_file_operations = 282const struct file_operations ncp_file_operations =
294{ 283{
295 .llseek = ncp_remote_llseek, 284 .llseek = generic_file_llseek,
296 .read = ncp_file_read, 285 .read = ncp_file_read,
297 .write = ncp_file_write, 286 .write = ncp_file_write,
298 .unlocked_ioctl = ncp_ioctl, 287 .unlocked_ioctl = ncp_ioctl,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b4de38cf49f5..d290545aa0c4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -139,7 +139,7 @@ static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
139 inode->i_mode = nwi->nfs.mode; 139 inode->i_mode = nwi->nfs.mode;
140 } 140 }
141 141
142 inode->i_blocks = (inode->i_size + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT; 142 inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
143 143
144 inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate); 144 inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate);
145 inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate); 145 inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate);
@@ -158,18 +158,21 @@ static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo)
158 inode->i_mode = server->m.dir_mode; 158 inode->i_mode = server->m.dir_mode;
159 /* for directories dataStreamSize seems to be some 159 /* for directories dataStreamSize seems to be some
160 Object ID ??? */ 160 Object ID ??? */
161 inode->i_size = NCP_BLOCK_SIZE; 161 i_size_write(inode, NCP_BLOCK_SIZE);
162 } else { 162 } else {
163 u32 size;
164
163 inode->i_mode = server->m.file_mode; 165 inode->i_mode = server->m.file_mode;
164 inode->i_size = le32_to_cpu(nwi->dataStreamSize); 166 size = le32_to_cpu(nwi->dataStreamSize);
167 i_size_write(inode, size);
165#ifdef CONFIG_NCPFS_EXTRAS 168#ifdef CONFIG_NCPFS_EXTRAS
166 if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) 169 if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS))
167 && (nwi->attributes & aSHARED)) { 170 && (nwi->attributes & aSHARED)) {
168 switch (nwi->attributes & (aHIDDEN|aSYSTEM)) { 171 switch (nwi->attributes & (aHIDDEN|aSYSTEM)) {
169 case aHIDDEN: 172 case aHIDDEN:
170 if (server->m.flags & NCP_MOUNT_SYMLINKS) { 173 if (server->m.flags & NCP_MOUNT_SYMLINKS) {
171 if (/* (inode->i_size >= NCP_MIN_SYMLINK_SIZE) 174 if (/* (size >= NCP_MIN_SYMLINK_SIZE)
172 && */ (inode->i_size <= NCP_MAX_SYMLINK_SIZE)) { 175 && */ (size <= NCP_MAX_SYMLINK_SIZE)) {
173 inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK; 176 inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
174 NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK; 177 NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK;
175 break; 178 break;
@@ -208,7 +211,7 @@ void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo)
208} 211}
209 212
210/* 213/*
211 * Fill in the inode based on the ncp_entry_info structure. 214 * Fill in the inode based on the ncp_entry_info structure. Used only for brand new inodes.
212 */ 215 */
213static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) 216static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
214{ 217{
@@ -254,6 +257,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
254 if (inode) { 257 if (inode) {
255 atomic_set(&NCP_FINFO(inode)->opened, info->opened); 258 atomic_set(&NCP_FINFO(inode)->opened, info->opened);
256 259
260 inode->i_mapping->backing_dev_info = sb->s_bdi;
257 inode->i_ino = info->ino; 261 inode->i_ino = info->ino;
258 ncp_set_attr(inode, info); 262 ncp_set_attr(inode, info);
259 if (S_ISREG(inode->i_mode)) { 263 if (S_ISREG(inode->i_mode)) {
@@ -299,10 +303,12 @@ ncp_evict_inode(struct inode *inode)
299 303
300static void ncp_stop_tasks(struct ncp_server *server) { 304static void ncp_stop_tasks(struct ncp_server *server) {
301 struct sock* sk = server->ncp_sock->sk; 305 struct sock* sk = server->ncp_sock->sk;
302 306
307 lock_sock(sk);
303 sk->sk_error_report = server->error_report; 308 sk->sk_error_report = server->error_report;
304 sk->sk_data_ready = server->data_ready; 309 sk->sk_data_ready = server->data_ready;
305 sk->sk_write_space = server->write_space; 310 sk->sk_write_space = server->write_space;
311 release_sock(sk);
306 del_timer_sync(&server->timeout_tm); 312 del_timer_sync(&server->timeout_tm);
307 flush_scheduled_work(); 313 flush_scheduled_work();
308} 314}
@@ -565,10 +571,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
565/* server->conn_status = 0; */ 571/* server->conn_status = 0; */
566/* server->root_dentry = NULL; */ 572/* server->root_dentry = NULL; */
567/* server->root_setuped = 0; */ 573/* server->root_setuped = 0; */
574 mutex_init(&server->root_setup_lock);
568#ifdef CONFIG_NCPFS_PACKET_SIGNING 575#ifdef CONFIG_NCPFS_PACKET_SIGNING
569/* server->sign_wanted = 0; */ 576/* server->sign_wanted = 0; */
570/* server->sign_active = 0; */ 577/* server->sign_active = 0; */
571#endif 578#endif
579 init_rwsem(&server->auth_rwsem);
572 server->auth.auth_type = NCP_AUTH_NONE; 580 server->auth.auth_type = NCP_AUTH_NONE;
573/* server->auth.object_name_len = 0; */ 581/* server->auth.object_name_len = 0; */
574/* server->auth.object_name = NULL; */ 582/* server->auth.object_name = NULL; */
@@ -593,16 +601,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
593 server->nls_io = load_nls_default(); 601 server->nls_io = load_nls_default();
594#endif /* CONFIG_NCPFS_NLS */ 602#endif /* CONFIG_NCPFS_NLS */
595 603
596 server->dentry_ttl = 0; /* no caching */ 604 atomic_set(&server->dentry_ttl, 0); /* no caching */
597 605
598 INIT_LIST_HEAD(&server->tx.requests); 606 INIT_LIST_HEAD(&server->tx.requests);
599 mutex_init(&server->rcv.creq_mutex); 607 mutex_init(&server->rcv.creq_mutex);
600 server->tx.creq = NULL; 608 server->tx.creq = NULL;
601 server->rcv.creq = NULL; 609 server->rcv.creq = NULL;
602 server->data_ready = sock->sk->sk_data_ready;
603 server->write_space = sock->sk->sk_write_space;
604 server->error_report = sock->sk->sk_error_report;
605 sock->sk->sk_user_data = server;
606 610
607 init_timer(&server->timeout_tm); 611 init_timer(&server->timeout_tm);
608#undef NCP_PACKET_SIZE 612#undef NCP_PACKET_SIZE
@@ -619,6 +623,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
619 if (server->rxbuf == NULL) 623 if (server->rxbuf == NULL)
620 goto out_txbuf; 624 goto out_txbuf;
621 625
626 lock_sock(sock->sk);
627 server->data_ready = sock->sk->sk_data_ready;
628 server->write_space = sock->sk->sk_write_space;
629 server->error_report = sock->sk->sk_error_report;
630 sock->sk->sk_user_data = server;
622 sock->sk->sk_data_ready = ncp_tcp_data_ready; 631 sock->sk->sk_data_ready = ncp_tcp_data_ready;
623 sock->sk->sk_error_report = ncp_tcp_error_report; 632 sock->sk->sk_error_report = ncp_tcp_error_report;
624 if (sock->type == SOCK_STREAM) { 633 if (sock->type == SOCK_STREAM) {
@@ -634,6 +643,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
634 server->timeout_tm.data = (unsigned long)server; 643 server->timeout_tm.data = (unsigned long)server;
635 server->timeout_tm.function = ncpdgram_timeout_call; 644 server->timeout_tm.function = ncpdgram_timeout_call;
636 } 645 }
646 release_sock(sock->sk);
637 647
638 ncp_lock_server(server); 648 ncp_lock_server(server);
639 error = ncp_connect(server); 649 error = ncp_connect(server);
@@ -658,8 +668,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
658 goto out_disconnect; 668 goto out_disconnect;
659 } 669 }
660 } 670 }
671 ncp_lock_server(server);
661 if (options & 2) 672 if (options & 2)
662 server->sign_wanted = 1; 673 server->sign_wanted = 1;
674 ncp_unlock_server(server);
663 } 675 }
664 else 676 else
665#endif /* CONFIG_NCPFS_PACKET_SIGNING */ 677#endif /* CONFIG_NCPFS_PACKET_SIGNING */
@@ -720,6 +732,9 @@ out_nls:
720 unload_nls(server->nls_io); 732 unload_nls(server->nls_io);
721 unload_nls(server->nls_vol); 733 unload_nls(server->nls_vol);
722#endif 734#endif
735 mutex_destroy(&server->rcv.creq_mutex);
736 mutex_destroy(&server->root_setup_lock);
737 mutex_destroy(&server->mutex);
723out_fput2: 738out_fput2:
724 if (server->info_filp) 739 if (server->info_filp)
725 fput(server->info_filp); 740 fput(server->info_filp);
@@ -743,8 +758,6 @@ static void ncp_put_super(struct super_block *sb)
743{ 758{
744 struct ncp_server *server = NCP_SBP(sb); 759 struct ncp_server *server = NCP_SBP(sb);
745 760
746 lock_kernel();
747
748 ncp_lock_server(server); 761 ncp_lock_server(server);
749 ncp_disconnect(server); 762 ncp_disconnect(server);
750 ncp_unlock_server(server); 763 ncp_unlock_server(server);
@@ -756,6 +769,9 @@ static void ncp_put_super(struct super_block *sb)
756 unload_nls(server->nls_vol); 769 unload_nls(server->nls_vol);
757 unload_nls(server->nls_io); 770 unload_nls(server->nls_io);
758#endif /* CONFIG_NCPFS_NLS */ 771#endif /* CONFIG_NCPFS_NLS */
772 mutex_destroy(&server->rcv.creq_mutex);
773 mutex_destroy(&server->root_setup_lock);
774 mutex_destroy(&server->mutex);
759 775
760 if (server->info_filp) 776 if (server->info_filp)
761 fput(server->info_filp); 777 fput(server->info_filp);
@@ -771,8 +787,6 @@ static void ncp_put_super(struct super_block *sb)
771 vfree(server->packet); 787 vfree(server->packet);
772 sb->s_fs_info = NULL; 788 sb->s_fs_info = NULL;
773 kfree(server); 789 kfree(server);
774
775 unlock_kernel();
776} 790}
777 791
778static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) 792static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -851,10 +865,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
851 865
852 result = -EIO; 866 result = -EIO;
853 867
854 lock_kernel();
855
856 server = NCP_SERVER(inode); 868 server = NCP_SERVER(inode);
857 if ((!server) || !ncp_conn_valid(server)) 869 if (!server) /* How this could happen? */
858 goto out; 870 goto out;
859 871
860 /* ageing the dentry to force validation */ 872 /* ageing the dentry to force validation */
@@ -981,8 +993,6 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
981 result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode), 993 result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode),
982 inode, info_mask, &info); 994 inode, info_mask, &info);
983 if (result != 0) { 995 if (result != 0) {
984 result = -EACCES;
985
986 if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) { 996 if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) {
987 /* NetWare seems not to allow this. I 997 /* NetWare seems not to allow this. I
988 do not know why. So, just tell the 998 do not know why. So, just tell the
@@ -1005,20 +1015,21 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
1005 mark_inode_dirty(inode); 1015 mark_inode_dirty(inode);
1006 1016
1007out: 1017out:
1008 unlock_kernel(); 1018 if (result > 0)
1019 result = -EACCES;
1009 return result; 1020 return result;
1010} 1021}
1011 1022
1012static int ncp_get_sb(struct file_system_type *fs_type, 1023static struct dentry *ncp_mount(struct file_system_type *fs_type,
1013 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1024 int flags, const char *dev_name, void *data)
1014{ 1025{
1015 return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt); 1026 return mount_nodev(fs_type, flags, data, ncp_fill_super);
1016} 1027}
1017 1028
1018static struct file_system_type ncp_fs_type = { 1029static struct file_system_type ncp_fs_type = {
1019 .owner = THIS_MODULE, 1030 .owner = THIS_MODULE,
1020 .name = "ncpfs", 1031 .name = "ncpfs",
1021 .get_sb = ncp_get_sb, 1032 .mount = ncp_mount,
1022 .kill_sb = kill_anon_super, 1033 .kill_sb = kill_anon_super,
1023 .fs_flags = FS_BINARY_MOUNTDATA, 1034 .fs_flags = FS_BINARY_MOUNTDATA,
1024}; 1035};
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 84a8cfc4e38e..c2a1f9a155c3 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -35,16 +35,11 @@
35#define NCP_PACKET_SIZE_INTERNAL 65536 35#define NCP_PACKET_SIZE_INTERNAL 65536
36 36
37static int 37static int
38ncp_get_fs_info(struct ncp_server * server, struct file *file, 38ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
39 struct ncp_fs_info __user *arg) 39 struct ncp_fs_info __user *arg)
40{ 40{
41 struct inode *inode = file->f_path.dentry->d_inode;
42 struct ncp_fs_info info; 41 struct ncp_fs_info info;
43 42
44 if (file_permission(file, MAY_WRITE) != 0
45 && current_uid() != server->m.mounted_uid)
46 return -EACCES;
47
48 if (copy_from_user(&info, arg, sizeof(info))) 43 if (copy_from_user(&info, arg, sizeof(info)))
49 return -EFAULT; 44 return -EFAULT;
50 45
@@ -65,16 +60,11 @@ ncp_get_fs_info(struct ncp_server * server, struct file *file,
65} 60}
66 61
67static int 62static int
68ncp_get_fs_info_v2(struct ncp_server * server, struct file *file, 63ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
69 struct ncp_fs_info_v2 __user * arg) 64 struct ncp_fs_info_v2 __user * arg)
70{ 65{
71 struct inode *inode = file->f_path.dentry->d_inode;
72 struct ncp_fs_info_v2 info2; 66 struct ncp_fs_info_v2 info2;
73 67
74 if (file_permission(file, MAY_WRITE) != 0
75 && current_uid() != server->m.mounted_uid)
76 return -EACCES;
77
78 if (copy_from_user(&info2, arg, sizeof(info2))) 68 if (copy_from_user(&info2, arg, sizeof(info2)))
79 return -EFAULT; 69 return -EFAULT;
80 70
@@ -136,16 +126,11 @@ struct compat_ncp_privatedata_ioctl
136#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl) 126#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl)
137 127
138static int 128static int
139ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file, 129ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
140 struct compat_ncp_fs_info_v2 __user * arg) 130 struct compat_ncp_fs_info_v2 __user * arg)
141{ 131{
142 struct inode *inode = file->f_path.dentry->d_inode;
143 struct compat_ncp_fs_info_v2 info2; 132 struct compat_ncp_fs_info_v2 info2;
144 133
145 if (file_permission(file, MAY_WRITE) != 0
146 && current_uid() != server->m.mounted_uid)
147 return -EACCES;
148
149 if (copy_from_user(&info2, arg, sizeof(info2))) 134 if (copy_from_user(&info2, arg, sizeof(info2)))
150 return -EFAULT; 135 return -EFAULT;
151 136
@@ -182,11 +167,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
182 struct nls_table *iocharset; 167 struct nls_table *iocharset;
183 struct nls_table *oldset_io; 168 struct nls_table *oldset_io;
184 struct nls_table *oldset_cp; 169 struct nls_table *oldset_cp;
185 170 int utf8;
186 if (!capable(CAP_SYS_ADMIN)) 171 int err;
187 return -EACCES;
188 if (server->root_setuped)
189 return -EBUSY;
190 172
191 if (copy_from_user(&user, arg, sizeof(user))) 173 if (copy_from_user(&user, arg, sizeof(user)))
192 return -EFAULT; 174 return -EFAULT;
@@ -206,28 +188,40 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
206 user.iocharset[NCP_IOCSNAME_LEN] = 0; 188 user.iocharset[NCP_IOCSNAME_LEN] = 0;
207 if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) { 189 if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) {
208 iocharset = load_nls_default(); 190 iocharset = load_nls_default();
209 NCP_CLR_FLAG(server, NCP_FLAG_UTF8); 191 utf8 = 0;
210 } else if (!strcmp(user.iocharset, "utf8")) { 192 } else if (!strcmp(user.iocharset, "utf8")) {
211 iocharset = load_nls_default(); 193 iocharset = load_nls_default();
212 NCP_SET_FLAG(server, NCP_FLAG_UTF8); 194 utf8 = 1;
213 } else { 195 } else {
214 iocharset = load_nls(user.iocharset); 196 iocharset = load_nls(user.iocharset);
215 if (!iocharset) { 197 if (!iocharset) {
216 unload_nls(codepage); 198 unload_nls(codepage);
217 return -EBADRQC; 199 return -EBADRQC;
218 } 200 }
219 NCP_CLR_FLAG(server, NCP_FLAG_UTF8); 201 utf8 = 0;
220 } 202 }
221 203
222 oldset_cp = server->nls_vol; 204 mutex_lock(&server->root_setup_lock);
223 server->nls_vol = codepage; 205 if (server->root_setuped) {
224 oldset_io = server->nls_io; 206 oldset_cp = codepage;
225 server->nls_io = iocharset; 207 oldset_io = iocharset;
226 208 err = -EBUSY;
209 } else {
210 if (utf8)
211 NCP_SET_FLAG(server, NCP_FLAG_UTF8);
212 else
213 NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
214 oldset_cp = server->nls_vol;
215 server->nls_vol = codepage;
216 oldset_io = server->nls_io;
217 server->nls_io = iocharset;
218 err = 0;
219 }
220 mutex_unlock(&server->root_setup_lock);
227 unload_nls(oldset_cp); 221 unload_nls(oldset_cp);
228 unload_nls(oldset_io); 222 unload_nls(oldset_io);
229 223
230 return 0; 224 return err;
231} 225}
232 226
233static int 227static int
@@ -237,6 +231,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
237 int len; 231 int len;
238 232
239 memset(&user, 0, sizeof(user)); 233 memset(&user, 0, sizeof(user));
234 mutex_lock(&server->root_setup_lock);
240 if (server->nls_vol && server->nls_vol->charset) { 235 if (server->nls_vol && server->nls_vol->charset) {
241 len = strlen(server->nls_vol->charset); 236 len = strlen(server->nls_vol->charset);
242 if (len > NCP_IOCSNAME_LEN) 237 if (len > NCP_IOCSNAME_LEN)
@@ -254,6 +249,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
254 strncpy(user.iocharset, server->nls_io->charset, len); 249 strncpy(user.iocharset, server->nls_io->charset, len);
255 user.iocharset[len] = 0; 250 user.iocharset[len] = 0;
256 } 251 }
252 mutex_unlock(&server->root_setup_lock);
257 253
258 if (copy_to_user(arg, &user, sizeof(user))) 254 if (copy_to_user(arg, &user, sizeof(user)))
259 return -EFAULT; 255 return -EFAULT;
@@ -261,25 +257,19 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
261} 257}
262#endif /* CONFIG_NCPFS_NLS */ 258#endif /* CONFIG_NCPFS_NLS */
263 259
264static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 260static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg)
265{ 261{
266 struct inode *inode = filp->f_dentry->d_inode;
267 struct ncp_server *server = NCP_SERVER(inode); 262 struct ncp_server *server = NCP_SERVER(inode);
268 int result; 263 int result;
269 struct ncp_ioctl_request request; 264 struct ncp_ioctl_request request;
270 char* bouncebuffer; 265 char* bouncebuffer;
271 void __user *argp = (void __user *)arg; 266 void __user *argp = (void __user *)arg;
272 uid_t uid = current_uid();
273 267
274 switch (cmd) { 268 switch (cmd) {
275#ifdef CONFIG_COMPAT 269#ifdef CONFIG_COMPAT
276 case NCP_IOC_NCPREQUEST_32: 270 case NCP_IOC_NCPREQUEST_32:
277#endif 271#endif
278 case NCP_IOC_NCPREQUEST: 272 case NCP_IOC_NCPREQUEST:
279 if (file_permission(filp, MAY_WRITE) != 0
280 && uid != server->m.mounted_uid)
281 return -EACCES;
282
283#ifdef CONFIG_COMPAT 273#ifdef CONFIG_COMPAT
284 if (cmd == NCP_IOC_NCPREQUEST_32) { 274 if (cmd == NCP_IOC_NCPREQUEST_32) {
285 struct compat_ncp_ioctl_request request32; 275 struct compat_ncp_ioctl_request request32;
@@ -314,7 +304,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
314 server->current_size = request.size; 304 server->current_size = request.size;
315 memcpy(server->packet, bouncebuffer, request.size); 305 memcpy(server->packet, bouncebuffer, request.size);
316 306
317 result = ncp_request2(server, request.function, 307 result = ncp_request2(server, request.function,
318 bouncebuffer, NCP_PACKET_SIZE_INTERNAL); 308 bouncebuffer, NCP_PACKET_SIZE_INTERNAL);
319 if (result < 0) 309 if (result < 0)
320 result = -EIO; 310 result = -EIO;
@@ -331,69 +321,69 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
331 321
332 case NCP_IOC_CONN_LOGGED_IN: 322 case NCP_IOC_CONN_LOGGED_IN:
333 323
334 if (!capable(CAP_SYS_ADMIN))
335 return -EACCES;
336 if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE)) 324 if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE))
337 return -EINVAL; 325 return -EINVAL;
326 mutex_lock(&server->root_setup_lock);
338 if (server->root_setuped) 327 if (server->root_setuped)
339 return -EBUSY; 328 result = -EBUSY;
340 server->root_setuped = 1; 329 else {
341 return ncp_conn_logged_in(inode->i_sb); 330 result = ncp_conn_logged_in(inode->i_sb);
331 if (result == 0)
332 server->root_setuped = 1;
333 }
334 mutex_unlock(&server->root_setup_lock);
335 return result;
342 336
343 case NCP_IOC_GET_FS_INFO: 337 case NCP_IOC_GET_FS_INFO:
344 return ncp_get_fs_info(server, filp, argp); 338 return ncp_get_fs_info(server, inode, argp);
345 339
346 case NCP_IOC_GET_FS_INFO_V2: 340 case NCP_IOC_GET_FS_INFO_V2:
347 return ncp_get_fs_info_v2(server, filp, argp); 341 return ncp_get_fs_info_v2(server, inode, argp);
348 342
349#ifdef CONFIG_COMPAT 343#ifdef CONFIG_COMPAT
350 case NCP_IOC_GET_FS_INFO_V2_32: 344 case NCP_IOC_GET_FS_INFO_V2_32:
351 return ncp_get_compat_fs_info_v2(server, filp, argp); 345 return ncp_get_compat_fs_info_v2(server, inode, argp);
352#endif 346#endif
353 /* we have too many combinations of CONFIG_COMPAT, 347 /* we have too many combinations of CONFIG_COMPAT,
354 * CONFIG_64BIT and CONFIG_UID16, so just handle 348 * CONFIG_64BIT and CONFIG_UID16, so just handle
355 * any of the possible ioctls */ 349 * any of the possible ioctls */
356 case NCP_IOC_GETMOUNTUID16: 350 case NCP_IOC_GETMOUNTUID16:
357 case NCP_IOC_GETMOUNTUID32: 351 {
358 case NCP_IOC_GETMOUNTUID64:
359 if (file_permission(filp, MAY_READ) != 0
360 && uid != server->m.mounted_uid)
361 return -EACCES;
362
363 if (cmd == NCP_IOC_GETMOUNTUID16) {
364 u16 uid; 352 u16 uid;
353
365 SET_UID(uid, server->m.mounted_uid); 354 SET_UID(uid, server->m.mounted_uid);
366 if (put_user(uid, (u16 __user *)argp)) 355 if (put_user(uid, (u16 __user *)argp))
367 return -EFAULT; 356 return -EFAULT;
368 } else if (cmd == NCP_IOC_GETMOUNTUID32) { 357 return 0;
369 if (put_user(server->m.mounted_uid,
370 (u32 __user *)argp))
371 return -EFAULT;
372 } else {
373 if (put_user(server->m.mounted_uid,
374 (u64 __user *)argp))
375 return -EFAULT;
376 } 358 }
359 case NCP_IOC_GETMOUNTUID32:
360 if (put_user(server->m.mounted_uid,
361 (u32 __user *)argp))
362 return -EFAULT;
363 return 0;
364 case NCP_IOC_GETMOUNTUID64:
365 if (put_user(server->m.mounted_uid,
366 (u64 __user *)argp))
367 return -EFAULT;
377 return 0; 368 return 0;
378 369
379 case NCP_IOC_GETROOT: 370 case NCP_IOC_GETROOT:
380 { 371 {
381 struct ncp_setroot_ioctl sr; 372 struct ncp_setroot_ioctl sr;
382 373
383 if (file_permission(filp, MAY_READ) != 0 374 result = -EACCES;
384 && uid != server->m.mounted_uid) 375 mutex_lock(&server->root_setup_lock);
385 return -EACCES;
386
387 if (server->m.mounted_vol[0]) { 376 if (server->m.mounted_vol[0]) {
388 struct dentry* dentry = inode->i_sb->s_root; 377 struct dentry* dentry = inode->i_sb->s_root;
389 378
390 if (dentry) { 379 if (dentry) {
391 struct inode* s_inode = dentry->d_inode; 380 struct inode* s_inode = dentry->d_inode;
392 381
393 if (s_inode) { 382 if (s_inode) {
394 sr.volNumber = NCP_FINFO(s_inode)->volNumber; 383 sr.volNumber = NCP_FINFO(s_inode)->volNumber;
395 sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum; 384 sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum;
396 sr.namespace = server->name_space[sr.volNumber]; 385 sr.namespace = server->name_space[sr.volNumber];
386 result = 0;
397 } else 387 } else
398 DPRINTK("ncpfs: s_root->d_inode==NULL\n"); 388 DPRINTK("ncpfs: s_root->d_inode==NULL\n");
399 } else 389 } else
@@ -402,10 +392,12 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
402 sr.volNumber = -1; 392 sr.volNumber = -1;
403 sr.namespace = 0; 393 sr.namespace = 0;
404 sr.dirEntNum = 0; 394 sr.dirEntNum = 0;
395 result = 0;
405 } 396 }
406 if (copy_to_user(argp, &sr, sizeof(sr))) 397 mutex_unlock(&server->root_setup_lock);
407 return -EFAULT; 398 if (!result && copy_to_user(argp, &sr, sizeof(sr)))
408 return 0; 399 result = -EFAULT;
400 return result;
409 } 401 }
410 402
411 case NCP_IOC_SETROOT: 403 case NCP_IOC_SETROOT:
@@ -416,103 +408,114 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
416 __le32 dosde; 408 __le32 dosde;
417 struct dentry* dentry; 409 struct dentry* dentry;
418 410
419 if (!capable(CAP_SYS_ADMIN))
420 {
421 return -EACCES;
422 }
423 if (server->root_setuped) return -EBUSY;
424 if (copy_from_user(&sr, argp, sizeof(sr))) 411 if (copy_from_user(&sr, argp, sizeof(sr)))
425 return -EFAULT; 412 return -EFAULT;
426 if (sr.volNumber < 0) { 413 mutex_lock(&server->root_setup_lock);
427 server->m.mounted_vol[0] = 0; 414 if (server->root_setuped)
428 vnum = NCP_NUMBER_OF_VOLUMES; 415 result = -EBUSY;
429 de = 0; 416 else {
430 dosde = 0; 417 if (sr.volNumber < 0) {
431 } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) { 418 server->m.mounted_vol[0] = 0;
432 return -EINVAL; 419 vnum = NCP_NUMBER_OF_VOLUMES;
433 } else if (ncp_mount_subdir(server, sr.volNumber, 420 de = 0;
434 sr.namespace, sr.dirEntNum, 421 dosde = 0;
435 &vnum, &de, &dosde)) { 422 result = 0;
436 return -ENOENT; 423 } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
437 } 424 result = -EINVAL;
438 425 } else if (ncp_mount_subdir(server, sr.volNumber,
439 dentry = inode->i_sb->s_root; 426 sr.namespace, sr.dirEntNum,
440 server->root_setuped = 1; 427 &vnum, &de, &dosde)) {
441 if (dentry) { 428 result = -ENOENT;
442 struct inode* s_inode = dentry->d_inode;
443
444 if (s_inode) {
445 NCP_FINFO(s_inode)->volNumber = vnum;
446 NCP_FINFO(s_inode)->dirEntNum = de;
447 NCP_FINFO(s_inode)->DosDirNum = dosde;
448 } else 429 } else
449 DPRINTK("ncpfs: s_root->d_inode==NULL\n"); 430 result = 0;
450 } else 431
451 DPRINTK("ncpfs: s_root==NULL\n"); 432 if (result == 0) {
433 dentry = inode->i_sb->s_root;
434 if (dentry) {
435 struct inode* s_inode = dentry->d_inode;
436
437 if (s_inode) {
438 NCP_FINFO(s_inode)->volNumber = vnum;
439 NCP_FINFO(s_inode)->dirEntNum = de;
440 NCP_FINFO(s_inode)->DosDirNum = dosde;
441 server->root_setuped = 1;
442 } else {
443 DPRINTK("ncpfs: s_root->d_inode==NULL\n");
444 result = -EIO;
445 }
446 } else {
447 DPRINTK("ncpfs: s_root==NULL\n");
448 result = -EIO;
449 }
450 }
451 result = 0;
452 }
453 mutex_unlock(&server->root_setup_lock);
452 454
453 return 0; 455 return result;
454 } 456 }
455 457
456#ifdef CONFIG_NCPFS_PACKET_SIGNING 458#ifdef CONFIG_NCPFS_PACKET_SIGNING
457 case NCP_IOC_SIGN_INIT: 459 case NCP_IOC_SIGN_INIT:
458 if (file_permission(filp, MAY_WRITE) != 0 460 {
459 && uid != server->m.mounted_uid) 461 struct ncp_sign_init sign;
460 return -EACCES;
461
462 if (argp) {
463 if (server->sign_wanted)
464 {
465 struct ncp_sign_init sign;
466 462
463 if (argp)
467 if (copy_from_user(&sign, argp, sizeof(sign))) 464 if (copy_from_user(&sign, argp, sizeof(sign)))
468 return -EFAULT; 465 return -EFAULT;
469 memcpy(server->sign_root,sign.sign_root,8); 466 ncp_lock_server(server);
470 memcpy(server->sign_last,sign.sign_last,16); 467 mutex_lock(&server->rcv.creq_mutex);
471 server->sign_active = 1; 468 if (argp) {
469 if (server->sign_wanted) {
470 memcpy(server->sign_root,sign.sign_root,8);
471 memcpy(server->sign_last,sign.sign_last,16);
472 server->sign_active = 1;
473 }
474 /* ignore when signatures not wanted */
475 } else {
476 server->sign_active = 0;
472 } 477 }
473 /* ignore when signatures not wanted */ 478 mutex_unlock(&server->rcv.creq_mutex);
474 } else { 479 ncp_unlock_server(server);
475 server->sign_active = 0; 480 return 0;
476 } 481 }
477 return 0; 482
478
479 case NCP_IOC_SIGN_WANTED: 483 case NCP_IOC_SIGN_WANTED:
480 if (file_permission(filp, MAY_READ) != 0 484 {
481 && uid != server->m.mounted_uid) 485 int state;
482 return -EACCES; 486
483 487 ncp_lock_server(server);
484 if (put_user(server->sign_wanted, (int __user *)argp)) 488 state = server->sign_wanted;
485 return -EFAULT; 489 ncp_unlock_server(server);
486 return 0; 490 if (put_user(state, (int __user *)argp))
491 return -EFAULT;
492 return 0;
493 }
487 494
488 case NCP_IOC_SET_SIGN_WANTED: 495 case NCP_IOC_SET_SIGN_WANTED:
489 { 496 {
490 int newstate; 497 int newstate;
491 498
492 if (file_permission(filp, MAY_WRITE) != 0
493 && uid != server->m.mounted_uid)
494 return -EACCES;
495
496 /* get only low 8 bits... */ 499 /* get only low 8 bits... */
497 if (get_user(newstate, (unsigned char __user *)argp)) 500 if (get_user(newstate, (unsigned char __user *)argp))
498 return -EFAULT; 501 return -EFAULT;
502 result = 0;
503 ncp_lock_server(server);
499 if (server->sign_active) { 504 if (server->sign_active) {
500 /* cannot turn signatures OFF when active */ 505 /* cannot turn signatures OFF when active */
501 if (!newstate) return -EINVAL; 506 if (!newstate)
507 result = -EINVAL;
502 } else { 508 } else {
503 server->sign_wanted = newstate != 0; 509 server->sign_wanted = newstate != 0;
504 } 510 }
505 return 0; 511 ncp_unlock_server(server);
512 return result;
506 } 513 }
507 514
508#endif /* CONFIG_NCPFS_PACKET_SIGNING */ 515#endif /* CONFIG_NCPFS_PACKET_SIGNING */
509 516
510#ifdef CONFIG_NCPFS_IOCTL_LOCKING 517#ifdef CONFIG_NCPFS_IOCTL_LOCKING
511 case NCP_IOC_LOCKUNLOCK: 518 case NCP_IOC_LOCKUNLOCK:
512 if (file_permission(filp, MAY_WRITE) != 0
513 && uid != server->m.mounted_uid)
514 return -EACCES;
515
516 { 519 {
517 struct ncp_lock_ioctl rqdata; 520 struct ncp_lock_ioctl rqdata;
518 521
@@ -541,16 +544,13 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
541 { 544 {
542 return result; 545 return result;
543 } 546 }
544 result = -EIO;
545 if (!ncp_conn_valid(server))
546 goto outrel;
547 result = -EISDIR; 547 result = -EISDIR;
548 if (!S_ISREG(inode->i_mode)) 548 if (!S_ISREG(inode->i_mode))
549 goto outrel; 549 goto outrel;
550 if (rqdata.cmd == NCP_LOCK_CLEAR) 550 if (rqdata.cmd == NCP_LOCK_CLEAR)
551 { 551 {
552 result = ncp_ClearPhysicalRecord(NCP_SERVER(inode), 552 result = ncp_ClearPhysicalRecord(NCP_SERVER(inode),
553 NCP_FINFO(inode)->file_handle, 553 NCP_FINFO(inode)->file_handle,
554 rqdata.offset, 554 rqdata.offset,
555 rqdata.length); 555 rqdata.length);
556 if (result > 0) result = 0; /* no such lock */ 556 if (result > 0) result = 0; /* no such lock */
@@ -573,7 +573,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
573 rqdata.timeout); 573 rqdata.timeout);
574 if (result > 0) result = -EAGAIN; 574 if (result > 0) result = -EAGAIN;
575 } 575 }
576outrel: 576outrel:
577 ncp_inode_close(inode); 577 ncp_inode_close(inode);
578 return result; 578 return result;
579 } 579 }
@@ -581,60 +581,62 @@ outrel:
581 581
582#ifdef CONFIG_COMPAT 582#ifdef CONFIG_COMPAT
583 case NCP_IOC_GETOBJECTNAME_32: 583 case NCP_IOC_GETOBJECTNAME_32:
584 if (uid != server->m.mounted_uid)
585 return -EACCES;
586 { 584 {
587 struct compat_ncp_objectname_ioctl user; 585 struct compat_ncp_objectname_ioctl user;
588 size_t outl; 586 size_t outl;
589 587
590 if (copy_from_user(&user, argp, sizeof(user))) 588 if (copy_from_user(&user, argp, sizeof(user)))
591 return -EFAULT; 589 return -EFAULT;
590 down_read(&server->auth_rwsem);
592 user.auth_type = server->auth.auth_type; 591 user.auth_type = server->auth.auth_type;
593 outl = user.object_name_len; 592 outl = user.object_name_len;
594 user.object_name_len = server->auth.object_name_len; 593 user.object_name_len = server->auth.object_name_len;
595 if (outl > user.object_name_len) 594 if (outl > user.object_name_len)
596 outl = user.object_name_len; 595 outl = user.object_name_len;
596 result = 0;
597 if (outl) { 597 if (outl) {
598 if (copy_to_user(compat_ptr(user.object_name), 598 if (copy_to_user(compat_ptr(user.object_name),
599 server->auth.object_name, 599 server->auth.object_name,
600 outl)) return -EFAULT; 600 outl))
601 result = -EFAULT;
601 } 602 }
602 if (copy_to_user(argp, &user, sizeof(user))) 603 up_read(&server->auth_rwsem);
603 return -EFAULT; 604 if (!result && copy_to_user(argp, &user, sizeof(user)))
604 return 0; 605 result = -EFAULT;
606 return result;
605 } 607 }
606#endif 608#endif
607 609
608 case NCP_IOC_GETOBJECTNAME: 610 case NCP_IOC_GETOBJECTNAME:
609 if (uid != server->m.mounted_uid)
610 return -EACCES;
611 { 611 {
612 struct ncp_objectname_ioctl user; 612 struct ncp_objectname_ioctl user;
613 size_t outl; 613 size_t outl;
614 614
615 if (copy_from_user(&user, argp, sizeof(user))) 615 if (copy_from_user(&user, argp, sizeof(user)))
616 return -EFAULT; 616 return -EFAULT;
617 down_read(&server->auth_rwsem);
617 user.auth_type = server->auth.auth_type; 618 user.auth_type = server->auth.auth_type;
618 outl = user.object_name_len; 619 outl = user.object_name_len;
619 user.object_name_len = server->auth.object_name_len; 620 user.object_name_len = server->auth.object_name_len;
620 if (outl > user.object_name_len) 621 if (outl > user.object_name_len)
621 outl = user.object_name_len; 622 outl = user.object_name_len;
623 result = 0;
622 if (outl) { 624 if (outl) {
623 if (copy_to_user(user.object_name, 625 if (copy_to_user(user.object_name,
624 server->auth.object_name, 626 server->auth.object_name,
625 outl)) return -EFAULT; 627 outl))
628 result = -EFAULT;
626 } 629 }
627 if (copy_to_user(argp, &user, sizeof(user))) 630 up_read(&server->auth_rwsem);
628 return -EFAULT; 631 if (!result && copy_to_user(argp, &user, sizeof(user)))
629 return 0; 632 result = -EFAULT;
633 return result;
630 } 634 }
631 635
632#ifdef CONFIG_COMPAT 636#ifdef CONFIG_COMPAT
633 case NCP_IOC_SETOBJECTNAME_32: 637 case NCP_IOC_SETOBJECTNAME_32:
634#endif 638#endif
635 case NCP_IOC_SETOBJECTNAME: 639 case NCP_IOC_SETOBJECTNAME:
636 if (uid != server->m.mounted_uid)
637 return -EACCES;
638 { 640 {
639 struct ncp_objectname_ioctl user; 641 struct ncp_objectname_ioctl user;
640 void* newname; 642 void* newname;
@@ -666,9 +668,7 @@ outrel:
666 } else { 668 } else {
667 newname = NULL; 669 newname = NULL;
668 } 670 }
669 /* enter critical section */ 671 down_write(&server->auth_rwsem);
670 /* maybe that kfree can sleep so do that this way */
671 /* it is at least more SMP friendly (in future...) */
672 oldname = server->auth.object_name; 672 oldname = server->auth.object_name;
673 oldnamelen = server->auth.object_name_len; 673 oldnamelen = server->auth.object_name_len;
674 oldprivate = server->priv.data; 674 oldprivate = server->priv.data;
@@ -678,7 +678,7 @@ outrel:
678 server->auth.object_name = newname; 678 server->auth.object_name = newname;
679 server->priv.len = 0; 679 server->priv.len = 0;
680 server->priv.data = NULL; 680 server->priv.data = NULL;
681 /* leave critical section */ 681 up_write(&server->auth_rwsem);
682 kfree(oldprivate); 682 kfree(oldprivate);
683 kfree(oldname); 683 kfree(oldname);
684 return 0; 684 return 0;
@@ -688,8 +688,6 @@ outrel:
688 case NCP_IOC_GETPRIVATEDATA_32: 688 case NCP_IOC_GETPRIVATEDATA_32:
689#endif 689#endif
690 case NCP_IOC_GETPRIVATEDATA: 690 case NCP_IOC_GETPRIVATEDATA:
691 if (uid != server->m.mounted_uid)
692 return -EACCES;
693 { 691 {
694 struct ncp_privatedata_ioctl user; 692 struct ncp_privatedata_ioctl user;
695 size_t outl; 693 size_t outl;
@@ -706,14 +704,20 @@ outrel:
706 if (copy_from_user(&user, argp, sizeof(user))) 704 if (copy_from_user(&user, argp, sizeof(user)))
707 return -EFAULT; 705 return -EFAULT;
708 706
707 down_read(&server->auth_rwsem);
709 outl = user.len; 708 outl = user.len;
710 user.len = server->priv.len; 709 user.len = server->priv.len;
711 if (outl > user.len) outl = user.len; 710 if (outl > user.len) outl = user.len;
711 result = 0;
712 if (outl) { 712 if (outl) {
713 if (copy_to_user(user.data, 713 if (copy_to_user(user.data,
714 server->priv.data, 714 server->priv.data,
715 outl)) return -EFAULT; 715 outl))
716 result = -EFAULT;
716 } 717 }
718 up_read(&server->auth_rwsem);
719 if (result)
720 return result;
717#ifdef CONFIG_COMPAT 721#ifdef CONFIG_COMPAT
718 if (cmd == NCP_IOC_GETPRIVATEDATA_32) { 722 if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
719 struct compat_ncp_privatedata_ioctl user32; 723 struct compat_ncp_privatedata_ioctl user32;
@@ -733,8 +737,6 @@ outrel:
733 case NCP_IOC_SETPRIVATEDATA_32: 737 case NCP_IOC_SETPRIVATEDATA_32:
734#endif 738#endif
735 case NCP_IOC_SETPRIVATEDATA: 739 case NCP_IOC_SETPRIVATEDATA:
736 if (uid != server->m.mounted_uid)
737 return -EACCES;
738 { 740 {
739 struct ncp_privatedata_ioctl user; 741 struct ncp_privatedata_ioctl user;
740 void* new; 742 void* new;
@@ -762,12 +764,12 @@ outrel:
762 } else { 764 } else {
763 new = NULL; 765 new = NULL;
764 } 766 }
765 /* enter critical section */ 767 down_write(&server->auth_rwsem);
766 old = server->priv.data; 768 old = server->priv.data;
767 oldlen = server->priv.len; 769 oldlen = server->priv.len;
768 server->priv.len = user.len; 770 server->priv.len = user.len;
769 server->priv.data = new; 771 server->priv.data = new;
770 /* leave critical section */ 772 up_write(&server->auth_rwsem);
771 kfree(old); 773 kfree(old);
772 return 0; 774 return 0;
773 } 775 }
@@ -775,17 +777,13 @@ outrel:
775#ifdef CONFIG_NCPFS_NLS 777#ifdef CONFIG_NCPFS_NLS
776 case NCP_IOC_SETCHARSETS: 778 case NCP_IOC_SETCHARSETS:
777 return ncp_set_charsets(server, argp); 779 return ncp_set_charsets(server, argp);
778 780
779 case NCP_IOC_GETCHARSETS: 781 case NCP_IOC_GETCHARSETS:
780 return ncp_get_charsets(server, argp); 782 return ncp_get_charsets(server, argp);
781 783
782#endif /* CONFIG_NCPFS_NLS */ 784#endif /* CONFIG_NCPFS_NLS */
783 785
784 case NCP_IOC_SETDENTRYTTL: 786 case NCP_IOC_SETDENTRYTTL:
785 if (file_permission(filp, MAY_WRITE) != 0 &&
786 uid != server->m.mounted_uid)
787 return -EACCES;
788
789 { 787 {
790 u_int32_t user; 788 u_int32_t user;
791 789
@@ -795,13 +793,13 @@ outrel:
795 if (user > 20000) 793 if (user > 20000)
796 return -EINVAL; 794 return -EINVAL;
797 user = (user * HZ) / 1000; 795 user = (user * HZ) / 1000;
798 server->dentry_ttl = user; 796 atomic_set(&server->dentry_ttl, user);
799 return 0; 797 return 0;
800 } 798 }
801 799
802 case NCP_IOC_GETDENTRYTTL: 800 case NCP_IOC_GETDENTRYTTL:
803 { 801 {
804 u_int32_t user = (server->dentry_ttl * 1000) / HZ; 802 u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ;
805 if (copy_to_user(argp, &user, sizeof(user))) 803 if (copy_to_user(argp, &user, sizeof(user)))
806 return -EFAULT; 804 return -EFAULT;
807 return 0; 805 return 0;
@@ -811,59 +809,103 @@ outrel:
811 return -EINVAL; 809 return -EINVAL;
812} 810}
813 811
814static int ncp_ioctl_need_write(unsigned int cmd) 812long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
815{ 813{
814 struct inode *inode = filp->f_dentry->d_inode;
815 struct ncp_server *server = NCP_SERVER(inode);
816 uid_t uid = current_uid();
817 int need_drop_write = 0;
818 long ret;
819
816 switch (cmd) { 820 switch (cmd) {
817 case NCP_IOC_GET_FS_INFO:
818 case NCP_IOC_GET_FS_INFO_V2:
819 case NCP_IOC_NCPREQUEST:
820 case NCP_IOC_SETDENTRYTTL:
821 case NCP_IOC_SIGN_INIT:
822 case NCP_IOC_LOCKUNLOCK:
823 case NCP_IOC_SET_SIGN_WANTED:
824 return 1;
825 case NCP_IOC_GETOBJECTNAME:
826 case NCP_IOC_SETOBJECTNAME:
827 case NCP_IOC_GETPRIVATEDATA:
828 case NCP_IOC_SETPRIVATEDATA:
829 case NCP_IOC_SETCHARSETS: 821 case NCP_IOC_SETCHARSETS:
830 case NCP_IOC_GETCHARSETS:
831 case NCP_IOC_CONN_LOGGED_IN: 822 case NCP_IOC_CONN_LOGGED_IN:
832 case NCP_IOC_GETDENTRYTTL:
833 case NCP_IOC_GETMOUNTUID2:
834 case NCP_IOC_SIGN_WANTED:
835 case NCP_IOC_GETROOT:
836 case NCP_IOC_SETROOT: 823 case NCP_IOC_SETROOT:
837 return 0; 824 if (!capable(CAP_SYS_ADMIN)) {
838 default: 825 ret = -EACCES;
839 /* unknown IOCTL command, assume write */ 826 goto out;
840 return 1; 827 }
828 break;
841 } 829 }
842} 830 if (server->m.mounted_uid != uid) {
843 831 switch (cmd) {
844long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
845{
846 long ret;
847
848 lock_kernel();
849 if (ncp_ioctl_need_write(cmd)) {
850 /* 832 /*
851 * inside the ioctl(), any failures which 833 * Only mount owner can issue these ioctls. Information
852 * are because of file_permission() are 834 * necessary to authenticate to other NDS servers are
853 * -EACCESS, so it seems consistent to keep 835 * stored here.
854 * that here.
855 */ 836 */
856 if (mnt_want_write(filp->f_path.mnt)) { 837 case NCP_IOC_GETOBJECTNAME:
838 case NCP_IOC_SETOBJECTNAME:
839 case NCP_IOC_GETPRIVATEDATA:
840 case NCP_IOC_SETPRIVATEDATA:
841#ifdef CONFIG_COMPAT
842 case NCP_IOC_GETOBJECTNAME_32:
843 case NCP_IOC_SETOBJECTNAME_32:
844 case NCP_IOC_GETPRIVATEDATA_32:
845 case NCP_IOC_SETPRIVATEDATA_32:
846#endif
857 ret = -EACCES; 847 ret = -EACCES;
858 goto out; 848 goto out;
849 /*
850 * These require write access on the inode if user id
851 * does not match. Note that they do not write to the
852 * file... But old code did mnt_want_write, so I keep
853 * it as is. Of course not for mountpoint owner, as
854 * that breaks read-only mounts altogether as ncpmount
855 * needs working NCP_IOC_NCPREQUEST and
856 * NCP_IOC_GET_FS_INFO. Some of these codes (setdentryttl,
857 * signinit, setsignwanted) should be probably restricted
858 * to owner only, or even more to CAP_SYS_ADMIN).
859 */
860 case NCP_IOC_GET_FS_INFO:
861 case NCP_IOC_GET_FS_INFO_V2:
862 case NCP_IOC_NCPREQUEST:
863 case NCP_IOC_SETDENTRYTTL:
864 case NCP_IOC_SIGN_INIT:
865 case NCP_IOC_LOCKUNLOCK:
866 case NCP_IOC_SET_SIGN_WANTED:
867#ifdef CONFIG_COMPAT
868 case NCP_IOC_GET_FS_INFO_V2_32:
869 case NCP_IOC_NCPREQUEST_32:
870#endif
871 ret = mnt_want_write_file(filp);
872 if (ret)
873 goto out;
874 need_drop_write = 1;
875 ret = inode_permission(inode, MAY_WRITE);
876 if (ret)
877 goto outDropWrite;
878 break;
879 /*
880 * Read access required.
881 */
882 case NCP_IOC_GETMOUNTUID16:
883 case NCP_IOC_GETMOUNTUID32:
884 case NCP_IOC_GETMOUNTUID64:
885 case NCP_IOC_GETROOT:
886 case NCP_IOC_SIGN_WANTED:
887 ret = inode_permission(inode, MAY_READ);
888 if (ret)
889 goto out;
890 break;
891 /*
892 * Anybody can read these.
893 */
894 case NCP_IOC_GETCHARSETS:
895 case NCP_IOC_GETDENTRYTTL:
896 default:
897 /* Three codes below are protected by CAP_SYS_ADMIN above. */
898 case NCP_IOC_SETCHARSETS:
899 case NCP_IOC_CONN_LOGGED_IN:
900 case NCP_IOC_SETROOT:
901 break;
859 } 902 }
860 } 903 }
861 ret = __ncp_ioctl(filp, cmd, arg); 904 ret = __ncp_ioctl(inode, cmd, arg);
862 if (ncp_ioctl_need_write(cmd)) 905outDropWrite:
906 if (need_drop_write)
863 mnt_drop_write(filp->f_path.mnt); 907 mnt_drop_write(filp->f_path.mnt);
864
865out: 908out:
866 unlock_kernel();
867 return ret; 909 return ret;
868} 910}
869 911
@@ -872,10 +914,8 @@ long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
872{ 914{
873 long ret; 915 long ret;
874 916
875 lock_kernel();
876 arg = (unsigned long) compat_ptr(arg); 917 arg = (unsigned long) compat_ptr(arg);
877 ret = ncp_ioctl(file, cmd, arg); 918 ret = ncp_ioctl(file, cmd, arg);
878 unlock_kernel();
879 return ret; 919 return ret;
880} 920}
881#endif 921#endif
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 0ec6237a5970..a95615a0b6ac 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -107,17 +107,17 @@ ncp_reply_data(struct ncp_server *server, int offset)
107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]); 107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]);
108} 108}
109 109
110static inline u8 BVAL(void *data) 110static inline u8 BVAL(const void *data)
111{ 111{
112 return *(u8 *)data; 112 return *(const u8 *)data;
113} 113}
114 114
115static u8 ncp_reply_byte(struct ncp_server *server, int offset) 115static u8 ncp_reply_byte(struct ncp_server *server, int offset)
116{ 116{
117 return *(u8 *)ncp_reply_data(server, offset); 117 return *(const u8 *)ncp_reply_data(server, offset);
118} 118}
119 119
120static inline u16 WVAL_LH(void *data) 120static inline u16 WVAL_LH(const void *data)
121{ 121{
122 return get_unaligned_le16(data); 122 return get_unaligned_le16(data);
123} 123}
@@ -134,7 +134,7 @@ ncp_reply_be16(struct ncp_server *server, int offset)
134 return get_unaligned_be16(ncp_reply_data(server, offset)); 134 return get_unaligned_be16(ncp_reply_data(server, offset));
135} 135}
136 136
137static inline u32 DVAL_LH(void *data) 137static inline u32 DVAL_LH(const void *data)
138{ 138{
139 return get_unaligned_le32(data); 139 return get_unaligned_le32(data);
140} 140}
@@ -349,9 +349,9 @@ int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) {
349 return result; 349 return result;
350} 350}
351 351
352void ncp_extract_file_info(void *structure, struct nw_info_struct *target) 352void ncp_extract_file_info(const void *structure, struct nw_info_struct *target)
353{ 353{
354 __u8 *name_len; 354 const __u8 *name_len;
355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen); 355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen);
356 356
357 memcpy(target, structure, info_struct_size); 357 memcpy(target, structure, info_struct_size);
@@ -364,7 +364,7 @@ void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
364} 364}
365 365
366#ifdef CONFIG_NCPFS_NFS_NS 366#ifdef CONFIG_NCPFS_NFS_NS
367static inline void ncp_extract_nfs_info(unsigned char *structure, 367static inline void ncp_extract_nfs_info(const unsigned char *structure,
368 struct nw_nfs_info *target) 368 struct nw_nfs_info *target)
369{ 369{
370 target->mode = DVAL_LH(structure); 370 target->mode = DVAL_LH(structure);
@@ -417,7 +417,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
417 * Returns information for a (one-component) name relative to 417 * Returns information for a (one-component) name relative to
418 * the specified directory. 418 * the specified directory.
419 */ 419 */
420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, char *path, 420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path,
421 struct nw_info_struct *target) 421 struct nw_info_struct *target)
422{ 422{
423 __u8 volnum = NCP_FINFO(dir)->volNumber; 423 __u8 volnum = NCP_FINFO(dir)->volNumber;
@@ -452,16 +452,16 @@ out:
452#ifdef CONFIG_NCPFS_NFS_NS 452#ifdef CONFIG_NCPFS_NFS_NS
453static int 453static int
454ncp_obtain_DOS_dir_base(struct ncp_server *server, 454ncp_obtain_DOS_dir_base(struct ncp_server *server,
455 __u8 volnum, __le32 dirent, 455 __u8 ns, __u8 volnum, __le32 dirent,
456 char *path, /* At most 1 component */ 456 const char *path, /* At most 1 component */
457 __le32 *DOS_dir_base) 457 __le32 *DOS_dir_base)
458{ 458{
459 int result; 459 int result;
460 460
461 ncp_init_request(server); 461 ncp_init_request(server);
462 ncp_add_byte(server, 6); /* subfunction */ 462 ncp_add_byte(server, 6); /* subfunction */
463 ncp_add_byte(server, server->name_space[volnum]); 463 ncp_add_byte(server, ns);
464 ncp_add_byte(server, server->name_space[volnum]); 464 ncp_add_byte(server, ns);
465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ 465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
466 ncp_add_dword(server, RIM_DIRECTORY); 466 ncp_add_dword(server, RIM_DIRECTORY);
467 ncp_add_handle_path(server, volnum, dirent, 1, path); 467 ncp_add_handle_path(server, volnum, dirent, 1, path);
@@ -523,10 +523,27 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */ 523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */
524} 524}
525 525
526int
527ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
528{
529 int ns = ncp_get_known_namespace(server, volume);
530
531 if (ret_ns)
532 *ret_ns = ns;
533
534 DPRINTK("lookup_vol: namespace[%d] = %d\n",
535 volume, server->name_space[volume]);
536
537 if (server->name_space[volume] == ns)
538 return 0;
539 server->name_space[volume] = ns;
540 return 1;
541}
542
526static int 543static int
527ncp_ObtainSpecificDirBase(struct ncp_server *server, 544ncp_ObtainSpecificDirBase(struct ncp_server *server,
528 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base, 545 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base,
529 char *path, /* At most 1 component */ 546 const char *path, /* At most 1 component */
530 __le32 *dirEntNum, __le32 *DosDirNum) 547 __le32 *dirEntNum, __le32 *DosDirNum)
531{ 548{
532 int result; 549 int result;
@@ -560,14 +577,13 @@ ncp_mount_subdir(struct ncp_server *server,
560{ 577{
561 int dstNS; 578 int dstNS;
562 int result; 579 int result;
563 580
564 dstNS = ncp_get_known_namespace(server, volNumber); 581 ncp_update_known_namespace(server, volNumber, &dstNS);
565 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, 582 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber,
566 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0) 583 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0)
567 { 584 {
568 return result; 585 return result;
569 } 586 }
570 server->name_space[volNumber] = dstNS;
571 *volume = volNumber; 587 *volume = volNumber;
572 server->m.mounted_vol[1] = 0; 588 server->m.mounted_vol[1] = 0;
573 server->m.mounted_vol[0] = 'X'; 589 server->m.mounted_vol[0] = 'X';
@@ -575,11 +591,10 @@ ncp_mount_subdir(struct ncp_server *server,
575} 591}
576 592
577int 593int
578ncp_get_volume_root(struct ncp_server *server, const char *volname, 594ncp_get_volume_root(struct ncp_server *server,
579 __u32* volume, __le32* dirent, __le32* dosdirent) 595 const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent)
580{ 596{
581 int result; 597 int result;
582 __u8 volnum;
583 598
584 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); 599 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
585 600
@@ -601,21 +616,14 @@ ncp_get_volume_root(struct ncp_server *server, const char *volname,
601 return result; 616 return result;
602 } 617 }
603 *dirent = *dosdirent = ncp_reply_dword(server, 4); 618 *dirent = *dosdirent = ncp_reply_dword(server, 4);
604 volnum = ncp_reply_byte(server, 8); 619 *volume = ncp_reply_byte(server, 8);
605 ncp_unlock_server(server); 620 ncp_unlock_server(server);
606 *volume = volnum;
607
608 server->name_space[volnum] = ncp_get_known_namespace(server, volnum);
609
610 DPRINTK("lookup_vol: namespace[%d] = %d\n",
611 volnum, server->name_space[volnum]);
612
613 return 0; 621 return 0;
614} 622}
615 623
616int 624int
617ncp_lookup_volume(struct ncp_server *server, const char *volname, 625ncp_lookup_volume(struct ncp_server *server,
618 struct nw_info_struct *target) 626 const char *volname, struct nw_info_struct *target)
619{ 627{
620 int result; 628 int result;
621 629
@@ -625,6 +633,7 @@ ncp_lookup_volume(struct ncp_server *server, const char *volname,
625 if (result) { 633 if (result) {
626 return result; 634 return result;
627 } 635 }
636 ncp_update_known_namespace(server, target->volNumber, NULL);
628 target->nameLen = strlen(volname); 637 target->nameLen = strlen(volname);
629 memcpy(target->entryName, volname, target->nameLen+1); 638 memcpy(target->entryName, volname, target->nameLen+1);
630 target->attributes = aDIR; 639 target->attributes = aDIR;
@@ -676,8 +685,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
676{ 685{
677 int result = 0; 686 int result = 0;
678 687
688 ncp_init_request(server);
679 if (server->name_space[volnum] == NW_NS_NFS) { 689 if (server->name_space[volnum] == NW_NS_NFS) {
680 ncp_init_request(server);
681 ncp_add_byte(server, 25); /* subfunction */ 690 ncp_add_byte(server, 25); /* subfunction */
682 ncp_add_byte(server, server->name_space[volnum]); 691 ncp_add_byte(server, server->name_space[volnum]);
683 ncp_add_byte(server, NW_NS_NFS); 692 ncp_add_byte(server, NW_NS_NFS);
@@ -690,8 +699,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
690 ncp_add_dword_lh(server, 1); /* nlinks */ 699 ncp_add_dword_lh(server, 1); /* nlinks */
691 ncp_add_dword_lh(server, rdev); 700 ncp_add_dword_lh(server, rdev);
692 result = ncp_request(server, 87); 701 result = ncp_request(server, 87);
693 ncp_unlock_server(server);
694 } 702 }
703 ncp_unlock_server(server);
695 return result; 704 return result;
696} 705}
697#endif 706#endif
@@ -700,7 +709,7 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
700static int 709static int
701ncp_DeleteNSEntry(struct ncp_server *server, 710ncp_DeleteNSEntry(struct ncp_server *server,
702 __u8 have_dir_base, __u8 volnum, __le32 dirent, 711 __u8 have_dir_base, __u8 volnum, __le32 dirent,
703 char* name, __u8 ns, __le16 attr) 712 const char* name, __u8 ns, __le16 attr)
704{ 713{
705 int result; 714 int result;
706 715
@@ -734,23 +743,25 @@ ncp_del_file_or_subdir2(struct ncp_server *server,
734 743
735int 744int
736ncp_del_file_or_subdir(struct ncp_server *server, 745ncp_del_file_or_subdir(struct ncp_server *server,
737 struct inode *dir, char *name) 746 struct inode *dir, const char *name)
738{ 747{
739 __u8 volnum = NCP_FINFO(dir)->volNumber; 748 __u8 volnum = NCP_FINFO(dir)->volNumber;
740 __le32 dirent = NCP_FINFO(dir)->dirEntNum; 749 __le32 dirent = NCP_FINFO(dir)->dirEntNum;
750 int name_space;
741 751
752 name_space = server->name_space[volnum];
742#ifdef CONFIG_NCPFS_NFS_NS 753#ifdef CONFIG_NCPFS_NFS_NS
743 if (server->name_space[volnum]==NW_NS_NFS) 754 if (name_space == NW_NS_NFS)
744 { 755 {
745 int result; 756 int result;
746 757
747 result=ncp_obtain_DOS_dir_base(server, volnum, dirent, name, &dirent); 758 result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent);
748 if (result) return result; 759 if (result) return result;
749 return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006)); 760 name = NULL;
761 name_space = NW_NS_DOS;
750 } 762 }
751 else
752#endif /* CONFIG_NCPFS_NFS_NS */ 763#endif /* CONFIG_NCPFS_NFS_NS */
753 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, server->name_space[volnum], cpu_to_le16(0x8006)); 764 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006));
754} 765}
755 766
756static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6]) 767static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
@@ -765,7 +776,7 @@ static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
765/* If both dir and name are NULL, then in target there's already a 776/* If both dir and name are NULL, then in target there's already a
766 looked-up entry that wants to be opened. */ 777 looked-up entry that wants to be opened. */
767int ncp_open_create_file_or_subdir(struct ncp_server *server, 778int ncp_open_create_file_or_subdir(struct ncp_server *server,
768 struct inode *dir, char *name, 779 struct inode *dir, const char *name,
769 int open_create_mode, 780 int open_create_mode,
770 __le32 create_attributes, 781 __le32 create_attributes,
771 __le16 desired_acc_rights, 782 __le16 desired_acc_rights,
@@ -890,8 +901,8 @@ int ncp_search_for_fileset(struct ncp_server *server,
890 901
891static int 902static int
892ncp_RenameNSEntry(struct ncp_server *server, 903ncp_RenameNSEntry(struct ncp_server *server,
893 struct inode *old_dir, char *old_name, __le16 old_type, 904 struct inode *old_dir, const char *old_name, __le16 old_type,
894 struct inode *new_dir, char *new_name) 905 struct inode *new_dir, const char *new_name)
895{ 906{
896 int result = -EINVAL; 907 int result = -EINVAL;
897 908
@@ -929,8 +940,8 @@ out:
929} 940}
930 941
931int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 942int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
932 struct inode *old_dir, char *old_name, 943 struct inode *old_dir, const char *old_name,
933 struct inode *new_dir, char *new_name) 944 struct inode *new_dir, const char *new_name)
934{ 945{
935 int result; 946 int result;
936 __le16 old_type = cpu_to_le16(0x06); 947 __le16 old_type = cpu_to_le16(0x06);
@@ -958,7 +969,7 @@ int
958ncp_read_kernel(struct ncp_server *server, const char *file_id, 969ncp_read_kernel(struct ncp_server *server, const char *file_id,
959 __u32 offset, __u16 to_read, char *target, int *bytes_read) 970 __u32 offset, __u16 to_read, char *target, int *bytes_read)
960{ 971{
961 char *source; 972 const char *source;
962 int result; 973 int result;
963 974
964 ncp_init_request(server); 975 ncp_init_request(server);
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 2441d1ab57dc..3c57eca634ce 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -65,10 +65,11 @@ static inline void ncp_inode_close(struct inode *inode) {
65 atomic_dec(&NCP_FINFO(inode)->opened); 65 atomic_dec(&NCP_FINFO(inode)->opened);
66} 66}
67 67
68void ncp_extract_file_info(void* src, struct nw_info_struct* target); 68void ncp_extract_file_info(const void* src, struct nw_info_struct* target);
69int ncp_obtain_info(struct ncp_server *server, struct inode *, char *, 69int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *,
70 struct nw_info_struct *target); 70 struct nw_info_struct *target);
71int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target); 71int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target);
72int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns);
72int ncp_get_volume_root(struct ncp_server *server, const char *volname, 73int ncp_get_volume_root(struct ncp_server *server, const char *volname,
73 __u32 *volume, __le32 *dirent, __le32 *dosdirent); 74 __u32 *volume, __le32 *dirent, __le32 *dosdirent);
74int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *); 75int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *);
@@ -80,8 +81,8 @@ int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent,
80 __u32 mode, __u32 rdev); 81 __u32 mode, __u32 rdev);
81 82
82int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*); 83int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*);
83int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, char *); 84int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *);
84int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, char *, 85int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *,
85 int, __le32, __le16, struct ncp_entry_info *); 86 int, __le32, __le16, struct ncp_entry_info *);
86 87
87int ncp_initialize_search(struct ncp_server *, struct inode *, 88int ncp_initialize_search(struct ncp_server *, struct inode *,
@@ -93,7 +94,7 @@ int ncp_search_for_fileset(struct ncp_server *server,
93 char** rbuf, size_t* rsize); 94 char** rbuf, size_t* rsize);
94 95
95int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 96int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
96 struct inode *, char *, struct inode *, char *); 97 struct inode *, const char *, struct inode *, const char *);
97 98
98 99
99int 100int
@@ -170,13 +171,13 @@ static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
170#endif /* CONFIG_NCPFS_NLS */ 171#endif /* CONFIG_NCPFS_NLS */
171 172
172#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time) 173#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time)
173#define NCP_MAX_AGE(server) ((server)->dentry_ttl) 174#define NCP_MAX_AGE(server) atomic_read(&(server)->dentry_ttl)
174#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server)) 175#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server))
175 176
176static inline void 177static inline void
177ncp_age_dentry(struct ncp_server* server, struct dentry* dentry) 178ncp_age_dentry(struct ncp_server* server, struct dentry* dentry)
178{ 179{
179 dentry->d_time = jiffies - server->dentry_ttl; 180 dentry->d_time = jiffies - NCP_MAX_AGE(server);
180} 181}
181 182
182static inline void 183static inline void
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index 7c0b5c21e6cf..d8b2d7e6910b 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -15,21 +15,21 @@
15 15
16/* i386: 32-bit, little endian, handles mis-alignment */ 16/* i386: 32-bit, little endian, handles mis-alignment */
17#ifdef __i386__ 17#ifdef __i386__
18#define GET_LE32(p) (*(int *)(p)) 18#define GET_LE32(p) (*(const int *)(p))
19#define PUT_LE32(p,v) { *(int *)(p)=v; } 19#define PUT_LE32(p,v) { *(int *)(p)=v; }
20#else 20#else
21/* from include/ncplib.h */ 21/* from include/ncplib.h */
22#define BVAL(buf,pos) (((__u8 *)(buf))[pos]) 22#define BVAL(buf,pos) (((const __u8 *)(buf))[pos])
23#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos)) 23#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos))
24#define BSET(buf,pos,val) (BVAL(buf,pos) = (val)) 24#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val))
25 25
26static inline __u16 26static inline __u16
27WVAL_LH(__u8 * buf, int pos) 27WVAL_LH(const __u8 * buf, int pos)
28{ 28{
29 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8; 29 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
30} 30}
31static inline __u32 31static inline __u32
32DVAL_LH(__u8 * buf, int pos) 32DVAL_LH(const __u8 * buf, int pos)
33{ 33{
34 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16; 34 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
35} 35}
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index c7ff6c700a6e..668bd267346e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -746,7 +746,6 @@ static int ncp_do_request(struct ncp_server *server, int size,
746 return -EIO; 746 return -EIO;
747 } 747 }
748 if (!ncp_conn_valid(server)) { 748 if (!ncp_conn_valid(server)) {
749 printk(KERN_ERR "ncpfs: Connection invalid!\n");
750 return -EIO; 749 return -EIO;
751 } 750 }
752 { 751 {
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f7e13db613cb..ba306658a6db 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -76,13 +76,17 @@ config NFS_V4
76 76
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_V4 && EXPERIMENTAL 79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL
80 select PNFS_FILE_LAYOUT
80 help 81 help
81 This option enables support for minor version 1 of the NFSv4 protocol 82 This option enables support for minor version 1 of the NFSv4 protocol
82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. 83 (RFC 5661) in the kernel's NFS client.
83 84
84 If unsure, say N. 85 If unsure, say N.
85 86
87config PNFS_FILE_LAYOUT
88 tristate
89
86config ROOT_NFS 90config ROOT_NFS
87 bool "Root file system on NFS" 91 bool "Root file system on NFS"
88 depends on NFS_FS=y && IP_PNP 92 depends on NFS_FS=y && IP_PNP
@@ -117,3 +121,14 @@ config NFS_USE_KERNEL_DNS
117 select DNS_RESOLVER 121 select DNS_RESOLVER
118 select KEYS 122 select KEYS
119 default y 123 default y
124
125config NFS_USE_NEW_IDMAPPER
126 bool "Use the new idmapper upcall routine"
127 depends on NFS_V4 && KEYS
128 help
129 Say Y here if you want NFS to use the new idmapper upcall functions.
130 You will need /sbin/request-key (usually provided by the keyutils
131 package). For details, read
132 <file:Documentation/filesystems/nfs/idmapper.txt>.
133
134 If you are unsure, say N.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639eac..4776ff9e3814 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 delegation.o idmap.o \ 15 delegation.o idmap.o \
16 callback.o callback_xdr.o callback_proc.o \ 16 callback.o callback_xdr.o callback_proc.o \
17 nfs4namespace.o 17 nfs4namespace.o
18nfs-$(CONFIG_NFS_V4_1) += pnfs.o
18nfs-$(CONFIG_SYSCTL) += sysctl.o 19nfs-$(CONFIG_SYSCTL) += sysctl.o
19nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
21
22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e17b49e2eabd..aeec017fe814 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -109,7 +109,7 @@ nfs4_callback_up(struct svc_serv *serv)
109{ 109{
110 int ret; 110 int ret;
111 111
112 ret = svc_create_xprt(serv, "tcp", PF_INET, 112 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
113 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 113 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
114 if (ret <= 0) 114 if (ret <= 0)
115 goto out_err; 115 goto out_err;
@@ -117,7 +117,7 @@ nfs4_callback_up(struct svc_serv *serv)
117 dprintk("NFS: Callback listener port = %u (af %u)\n", 117 dprintk("NFS: Callback listener port = %u (af %u)\n",
118 nfs_callback_tcpport, PF_INET); 118 nfs_callback_tcpport, PF_INET);
119 119
120 ret = svc_create_xprt(serv, "tcp", PF_INET6, 120 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
121 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 121 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
122 if (ret > 0) { 122 if (ret > 0) {
123 nfs_callback_tcpport6 = ret; 123 nfs_callback_tcpport6 = ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 930d10fecdaf..2950fca0c61b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
118 if (delegation == NULL) 118 if (delegation == NULL)
119 return 0; 119 return 0;
120 120
121 /* seqid is 4-bytes long */ 121 if (stateid->stateid.seqid != 0)
122 if (((u32 *) &stateid->data)[0] != 0)
123 return 0; 122 return 0;
124 if (memcmp(&delegation->stateid.data[4], &stateid->data[4], 123 if (memcmp(&delegation->stateid.stateid.other,
125 sizeof(stateid->data)-4)) 124 &stateid->stateid.other,
125 NFS4_STATEID_OTHER_SIZE))
126 return 0; 126 return 0;
127 127
128 return 1; 128 return 1;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e7340729af89..0870d0d4efc0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
48#include "iostat.h" 48#include "iostat.h"
49#include "internal.h" 49#include "internal.h"
50#include "fscache.h" 50#include "fscache.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_CLIENT 53#define NFSDBG_FACILITY NFSDBG_CLIENT
53 54
@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
155 cred = rpc_lookup_machine_cred(); 156 cred = rpc_lookup_machine_cred();
156 if (!IS_ERR(cred)) 157 if (!IS_ERR(cred))
157 clp->cl_machine_cred = cred; 158 clp->cl_machine_cred = cred;
158 159#if defined(CONFIG_NFS_V4_1)
160 INIT_LIST_HEAD(&clp->cl_layouts);
161#endif
159 nfs_fscache_get_client_cookie(clp); 162 nfs_fscache_get_client_cookie(clp);
160 163
161 return clp; 164 return clp;
@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
252 nfs_free_client(clp); 255 nfs_free_client(clp);
253 } 256 }
254} 257}
258EXPORT_SYMBOL_GPL(nfs_put_client);
255 259
256#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 260#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
257/* 261/*
@@ -601,6 +605,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
601{ 605{
602 struct rpc_clnt *clnt = NULL; 606 struct rpc_clnt *clnt = NULL;
603 struct rpc_create_args args = { 607 struct rpc_create_args args = {
608 .net = &init_net,
604 .protocol = clp->cl_proto, 609 .protocol = clp->cl_proto,
605 .address = (struct sockaddr *)&clp->cl_addr, 610 .address = (struct sockaddr *)&clp->cl_addr,
606 .addrsize = clp->cl_addrlen, 611 .addrsize = clp->cl_addrlen,
@@ -635,7 +640,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
635 */ 640 */
636static void nfs_destroy_server(struct nfs_server *server) 641static void nfs_destroy_server(struct nfs_server *server)
637{ 642{
638 if (!(server->flags & NFS_MOUNT_NONLM)) 643 if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) ||
644 !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
639 nlmclnt_done(server->nlm_host); 645 nlmclnt_done(server->nlm_host);
640} 646}
641 647
@@ -657,7 +663,8 @@ static int nfs_start_lockd(struct nfs_server *server)
657 663
658 if (nlm_init.nfs_version > 3) 664 if (nlm_init.nfs_version > 3)
659 return 0; 665 return 0;
660 if (server->flags & NFS_MOUNT_NONLM) 666 if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
667 (server->flags & NFS_MOUNT_LOCAL_FCNTL))
661 return 0; 668 return 0;
662 669
663 switch (clp->cl_proto) { 670 switch (clp->cl_proto) {
@@ -898,11 +905,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
898 if (server->wsize > NFS_MAX_FILE_IO_SIZE) 905 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
899 server->wsize = NFS_MAX_FILE_IO_SIZE; 906 server->wsize = NFS_MAX_FILE_IO_SIZE;
900 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 907 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
908 set_pnfs_layoutdriver(server, fsinfo->layouttype);
909
901 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); 910 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
902 911
903 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); 912 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
904 if (server->dtsize > PAGE_CACHE_SIZE) 913 if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
905 server->dtsize = PAGE_CACHE_SIZE; 914 server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
906 if (server->dtsize > server->rsize) 915 if (server->dtsize > server->rsize)
907 server->dtsize = server->rsize; 916 server->dtsize = server->rsize;
908 917
@@ -913,6 +922,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
913 922
914 server->maxfilesize = fsinfo->maxfilesize; 923 server->maxfilesize = fsinfo->maxfilesize;
915 924
925 server->time_delta = fsinfo->time_delta;
926
916 /* We're airborne Set socket buffersize */ 927 /* We're airborne Set socket buffersize */
917 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); 928 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
918} 929}
@@ -935,6 +946,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
935 } 946 }
936 947
937 fsinfo.fattr = fattr; 948 fsinfo.fattr = fattr;
949 fsinfo.layouttype = 0;
938 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); 950 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
939 if (error < 0) 951 if (error < 0)
940 goto out_error; 952 goto out_error;
@@ -1017,6 +1029,7 @@ void nfs_free_server(struct nfs_server *server)
1017{ 1029{
1018 dprintk("--> nfs_free_server()\n"); 1030 dprintk("--> nfs_free_server()\n");
1019 1031
1032 unset_pnfs_layoutdriver(server);
1020 spin_lock(&nfs_client_lock); 1033 spin_lock(&nfs_client_lock);
1021 list_del(&server->client_link); 1034 list_del(&server->client_link);
1022 list_del(&server->master_link); 1035 list_del(&server->master_link);
@@ -1356,8 +1369,9 @@ static int nfs4_init_server(struct nfs_server *server,
1356 1369
1357 /* Initialise the client representation from the mount data */ 1370 /* Initialise the client representation from the mount data */
1358 server->flags = data->flags; 1371 server->flags = data->flags;
1359 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR| 1372 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
1360 NFS_CAP_POSIX_LOCK; 1373 if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
1374 server->caps |= NFS_CAP_READDIRPLUS;
1361 server->options = data->options; 1375 server->options = data->options;
1362 1376
1363 /* Get a client record */ 1377 /* Get a client record */
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index b9c3c43cea1d..232a7eead33a 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -71,20 +71,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
71 if (inode->i_flock == NULL) 71 if (inode->i_flock == NULL)
72 goto out; 72 goto out;
73 73
74 /* Protect inode->i_flock using the BKL */ 74 /* Protect inode->i_flock using the file locks lock */
75 lock_kernel(); 75 lock_flocks();
76 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 76 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
77 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 77 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
78 continue; 78 continue;
79 if (nfs_file_open_context(fl->fl_file) != ctx) 79 if (nfs_file_open_context(fl->fl_file) != ctx)
80 continue; 80 continue;
81 unlock_kernel(); 81 unlock_flocks();
82 status = nfs4_lock_delegation_recall(state, fl); 82 status = nfs4_lock_delegation_recall(state, fl);
83 if (status < 0) 83 if (status < 0)
84 goto out; 84 goto out;
85 lock_kernel(); 85 lock_flocks();
86 } 86 }
87 unlock_kernel(); 87 unlock_flocks();
88out: 88out:
89 return status; 89 return status;
90} 90}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e257172d438c..07ac3847e562 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,11 +33,12 @@
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/vmalloc.h>
36 37
37#include "nfs4_fs.h"
38#include "delegation.h" 38#include "delegation.h"
39#include "iostat.h" 39#include "iostat.h"
40#include "internal.h" 40#include "internal.h"
41#include "fscache.h"
41 42
42/* #define NFS_DEBUG_VERBOSE 1 */ 43/* #define NFS_DEBUG_VERBOSE 1 */
43 44
@@ -55,6 +56,7 @@ static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 56 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, int); 57static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 58static loff_t nfs_llseek_dir(struct file *, loff_t, int);
59static int nfs_readdir_clear_array(struct page*, gfp_t);
58 60
59const struct file_operations nfs_dir_operations = { 61const struct file_operations nfs_dir_operations = {
60 .llseek = nfs_llseek_dir, 62 .llseek = nfs_llseek_dir,
@@ -80,6 +82,10 @@ const struct inode_operations nfs_dir_inode_operations = {
80 .setattr = nfs_setattr, 82 .setattr = nfs_setattr,
81}; 83};
82 84
85const struct address_space_operations nfs_dir_addr_space_ops = {
86 .releasepage = nfs_readdir_clear_array,
87};
88
83#ifdef CONFIG_NFS_V3 89#ifdef CONFIG_NFS_V3
84const struct inode_operations nfs3_dir_inode_operations = { 90const struct inode_operations nfs3_dir_inode_operations = {
85 .create = nfs_create, 91 .create = nfs_create,
@@ -104,8 +110,9 @@ const struct inode_operations nfs3_dir_inode_operations = {
104#ifdef CONFIG_NFS_V4 110#ifdef CONFIG_NFS_V4
105 111
106static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); 112static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
113static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
107const struct inode_operations nfs4_dir_inode_operations = { 114const struct inode_operations nfs4_dir_inode_operations = {
108 .create = nfs_create, 115 .create = nfs_open_create,
109 .lookup = nfs_atomic_lookup, 116 .lookup = nfs_atomic_lookup,
110 .link = nfs_link, 117 .link = nfs_link,
111 .unlink = nfs_unlink, 118 .unlink = nfs_unlink,
@@ -150,51 +157,197 @@ nfs_opendir(struct inode *inode, struct file *filp)
150 return res; 157 return res;
151} 158}
152 159
153typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int); 160struct nfs_cache_array_entry {
161 u64 cookie;
162 u64 ino;
163 struct qstr string;
164};
165
166struct nfs_cache_array {
167 unsigned int size;
168 int eof_index;
169 u64 last_cookie;
170 struct nfs_cache_array_entry array[0];
171};
172
173#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
174
175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
154typedef struct { 176typedef struct {
155 struct file *file; 177 struct file *file;
156 struct page *page; 178 struct page *page;
157 unsigned long page_index; 179 unsigned long page_index;
158 __be32 *ptr;
159 u64 *dir_cookie; 180 u64 *dir_cookie;
160 loff_t current_index; 181 loff_t current_index;
161 struct nfs_entry *entry;
162 decode_dirent_t decode; 182 decode_dirent_t decode;
163 int plus; 183
164 unsigned long timestamp; 184 unsigned long timestamp;
165 unsigned long gencount; 185 unsigned long gencount;
166 int timestamp_valid; 186 unsigned int cache_entry_index;
187 unsigned int plus:1;
188 unsigned int eof:1;
167} nfs_readdir_descriptor_t; 189} nfs_readdir_descriptor_t;
168 190
169/* Now we cache directories properly, by stuffing the dirent 191/*
170 * data directly in the page cache. 192 * The caller is responsible for calling nfs_readdir_release_array(page)
171 *
172 * Inode invalidation due to refresh etc. takes care of
173 * _everything_, no sloppy entry flushing logic, no extraneous
174 * copying, network direct to page cache, the way it was meant
175 * to be.
176 *
177 * NOTE: Dirent information verification is done always by the
178 * page-in of the RPC reply, nowhere else, this simplies
179 * things substantially.
180 */ 193 */
181static 194static
182int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) 195struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
196{
197 if (page == NULL)
198 return ERR_PTR(-EIO);
199 return (struct nfs_cache_array *)kmap(page);
200}
201
202static
203void nfs_readdir_release_array(struct page *page)
204{
205 kunmap(page);
206}
207
208/*
209 * we are freeing strings created by nfs_add_to_readdir_array()
210 */
211static
212int nfs_readdir_clear_array(struct page *page, gfp_t mask)
213{
214 struct nfs_cache_array *array = nfs_readdir_get_array(page);
215 int i;
216 for (i = 0; i < array->size; i++)
217 kfree(array->array[i].string.name);
218 nfs_readdir_release_array(page);
219 return 0;
220}
221
222/*
223 * the caller is responsible for freeing qstr.name
224 * when called by nfs_readdir_add_to_array, the strings will be freed in
225 * nfs_clear_readdir_array()
226 */
227static
228int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
229{
230 string->len = len;
231 string->name = kmemdup(name, len, GFP_KERNEL);
232 if (string->name == NULL)
233 return -ENOMEM;
234 string->hash = full_name_hash(name, len);
235 return 0;
236}
237
238static
239int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
240{
241 struct nfs_cache_array *array = nfs_readdir_get_array(page);
242 struct nfs_cache_array_entry *cache_entry;
243 int ret;
244
245 if (IS_ERR(array))
246 return PTR_ERR(array);
247 ret = -EIO;
248 if (array->size >= MAX_READDIR_ARRAY)
249 goto out;
250
251 cache_entry = &array->array[array->size];
252 cache_entry->cookie = entry->prev_cookie;
253 cache_entry->ino = entry->ino;
254 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
255 if (ret)
256 goto out;
257 array->last_cookie = entry->cookie;
258 if (entry->eof == 1)
259 array->eof_index = array->size;
260 array->size++;
261out:
262 nfs_readdir_release_array(page);
263 return ret;
264}
265
266static
267int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
268{
269 loff_t diff = desc->file->f_pos - desc->current_index;
270 unsigned int index;
271
272 if (diff < 0)
273 goto out_eof;
274 if (diff >= array->size) {
275 if (array->eof_index > 0)
276 goto out_eof;
277 desc->current_index += array->size;
278 return -EAGAIN;
279 }
280
281 index = (unsigned int)diff;
282 *desc->dir_cookie = array->array[index].cookie;
283 desc->cache_entry_index = index;
284 if (index == array->eof_index)
285 desc->eof = 1;
286 return 0;
287out_eof:
288 desc->eof = 1;
289 return -EBADCOOKIE;
290}
291
292static
293int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
294{
295 int i;
296 int status = -EAGAIN;
297
298 for (i = 0; i < array->size; i++) {
299 if (i == array->eof_index) {
300 desc->eof = 1;
301 status = -EBADCOOKIE;
302 }
303 if (array->array[i].cookie == *desc->dir_cookie) {
304 desc->cache_entry_index = i;
305 status = 0;
306 break;
307 }
308 }
309
310 return status;
311}
312
313static
314int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
315{
316 struct nfs_cache_array *array;
317 int status = -EBADCOOKIE;
318
319 if (desc->dir_cookie == NULL)
320 goto out;
321
322 array = nfs_readdir_get_array(desc->page);
323 if (IS_ERR(array)) {
324 status = PTR_ERR(array);
325 goto out;
326 }
327
328 if (*desc->dir_cookie == 0)
329 status = nfs_readdir_search_for_pos(array, desc);
330 else
331 status = nfs_readdir_search_for_cookie(array, desc);
332
333 nfs_readdir_release_array(desc->page);
334out:
335 return status;
336}
337
338/* Fill a page with xdr information before transferring to the cache page */
339static
340int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
341 struct nfs_entry *entry, struct file *file, struct inode *inode)
183{ 342{
184 struct file *file = desc->file;
185 struct inode *inode = file->f_path.dentry->d_inode;
186 struct rpc_cred *cred = nfs_file_cred(file); 343 struct rpc_cred *cred = nfs_file_cred(file);
187 unsigned long timestamp, gencount; 344 unsigned long timestamp, gencount;
188 int error; 345 int error;
189 346
190 dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
191 __func__, (long long)desc->entry->cookie,
192 page->index);
193
194 again: 347 again:
195 timestamp = jiffies; 348 timestamp = jiffies;
196 gencount = nfs_inc_attr_generation_counter(); 349 gencount = nfs_inc_attr_generation_counter();
197 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, 350 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
198 NFS_SERVER(inode)->dtsize, desc->plus); 351 NFS_SERVER(inode)->dtsize, desc->plus);
199 if (error < 0) { 352 if (error < 0) {
200 /* We requested READDIRPLUS, but the server doesn't grok it */ 353 /* We requested READDIRPLUS, but the server doesn't grok it */
@@ -208,190 +361,292 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
208 } 361 }
209 desc->timestamp = timestamp; 362 desc->timestamp = timestamp;
210 desc->gencount = gencount; 363 desc->gencount = gencount;
211 desc->timestamp_valid = 1; 364error:
212 SetPageUptodate(page); 365 return error;
213 /* Ensure consistent page alignment of the data.
214 * Note: assumes we have exclusive access to this mapping either
215 * through inode->i_mutex or some other mechanism.
216 */
217 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
218 /* Should never happen */
219 nfs_zap_mapping(inode, inode->i_mapping);
220 }
221 unlock_page(page);
222 return 0;
223 error:
224 unlock_page(page);
225 return -EIO;
226} 366}
227 367
228static inline 368/* Fill in an entry based on the xdr code stored in desc->page */
229int dir_decode(nfs_readdir_descriptor_t *desc) 369static
370int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
230{ 371{
231 __be32 *p = desc->ptr; 372 __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
232 p = desc->decode(p, desc->entry, desc->plus);
233 if (IS_ERR(p)) 373 if (IS_ERR(p))
234 return PTR_ERR(p); 374 return PTR_ERR(p);
235 desc->ptr = p; 375
236 if (desc->timestamp_valid) { 376 entry->fattr->time_start = desc->timestamp;
237 desc->entry->fattr->time_start = desc->timestamp; 377 entry->fattr->gencount = desc->gencount;
238 desc->entry->fattr->gencount = desc->gencount;
239 } else
240 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
241 return 0; 378 return 0;
242} 379}
243 380
244static inline 381static
245void dir_page_release(nfs_readdir_descriptor_t *desc) 382int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
246{ 383{
247 kunmap(desc->page); 384 struct nfs_inode *node;
248 page_cache_release(desc->page); 385 if (dentry->d_inode == NULL)
249 desc->page = NULL; 386 goto different;
250 desc->ptr = NULL; 387 node = NFS_I(dentry->d_inode);
388 if (node->fh.size != entry->fh->size)
389 goto different;
390 if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
391 goto different;
392 return 1;
393different:
394 return 0;
251} 395}
252 396
253/* 397static
254 * Given a pointer to a buffer that has already been filled by a call 398void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
255 * to readdir, find the next entry with cookie '*desc->dir_cookie'.
256 *
257 * If the end of the buffer has been reached, return -EAGAIN, if not,
258 * return the offset within the buffer of the next entry to be
259 * read.
260 */
261static inline
262int find_dirent(nfs_readdir_descriptor_t *desc)
263{ 399{
264 struct nfs_entry *entry = desc->entry; 400 struct qstr filename = {
265 int loop_count = 0, 401 .len = entry->len,
266 status; 402 .name = entry->name,
403 };
404 struct dentry *dentry;
405 struct dentry *alias;
406 struct inode *dir = parent->d_inode;
407 struct inode *inode;
267 408
268 while((status = dir_decode(desc)) == 0) { 409 if (filename.name[0] == '.') {
269 dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", 410 if (filename.len == 1)
270 __func__, (unsigned long long)entry->cookie); 411 return;
271 if (entry->prev_cookie == *desc->dir_cookie) 412 if (filename.len == 2 && filename.name[1] == '.')
272 break; 413 return;
273 if (loop_count++ > 200) { 414 }
274 loop_count = 0; 415 filename.hash = full_name_hash(filename.name, filename.len);
275 schedule(); 416
417 dentry = d_lookup(parent, &filename);
418 if (dentry != NULL) {
419 if (nfs_same_file(dentry, entry)) {
420 nfs_refresh_inode(dentry->d_inode, entry->fattr);
421 goto out;
422 } else {
423 d_drop(dentry);
424 dput(dentry);
276 } 425 }
277 } 426 }
278 return status; 427
428 dentry = d_alloc(parent, &filename);
429 if (dentry == NULL)
430 return;
431
432 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
433 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
434 if (IS_ERR(inode))
435 goto out;
436
437 alias = d_materialise_unique(dentry, inode);
438 if (IS_ERR(alias))
439 goto out;
440 else if (alias) {
441 nfs_set_verifier(alias, nfs_save_change_attribute(dir));
442 dput(alias);
443 } else
444 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
445
446out:
447 dput(dentry);
448}
449
450/* Perform conversion from xdr to cache array */
451static
452void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
453 void *xdr_page, struct page *page, unsigned int buflen)
454{
455 struct xdr_stream stream;
456 struct xdr_buf buf;
457 __be32 *ptr = xdr_page;
458 int status;
459 struct nfs_cache_array *array;
460
461 buf.head->iov_base = xdr_page;
462 buf.head->iov_len = buflen;
463 buf.tail->iov_len = 0;
464 buf.page_base = 0;
465 buf.page_len = 0;
466 buf.buflen = buf.head->iov_len;
467 buf.len = buf.head->iov_len;
468
469 xdr_init_decode(&stream, &buf, ptr);
470
471
472 do {
473 status = xdr_decode(desc, entry, &stream);
474 if (status != 0)
475 break;
476
477 if (nfs_readdir_add_to_array(entry, page) == -1)
478 break;
479 if (desc->plus == 1)
480 nfs_prime_dcache(desc->file->f_path.dentry, entry);
481 } while (!entry->eof);
482
483 if (status == -EBADCOOKIE && entry->eof) {
484 array = nfs_readdir_get_array(page);
485 array->eof_index = array->size - 1;
486 status = 0;
487 nfs_readdir_release_array(page);
488 }
489}
490
491static
492void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
493{
494 unsigned int i;
495 for (i = 0; i < npages; i++)
496 put_page(pages[i]);
497}
498
499static
500void nfs_readdir_free_large_page(void *ptr, struct page **pages,
501 unsigned int npages)
502{
503 vm_unmap_ram(ptr, npages);
504 nfs_readdir_free_pagearray(pages, npages);
279} 505}
280 506
281/* 507/*
282 * Given a pointer to a buffer that has already been filled by a call 508 * nfs_readdir_large_page will allocate pages that must be freed with a call
283 * to readdir, find the entry at offset 'desc->file->f_pos'. 509 * to nfs_readdir_free_large_page
284 *
285 * If the end of the buffer has been reached, return -EAGAIN, if not,
286 * return the offset within the buffer of the next entry to be
287 * read.
288 */ 510 */
289static inline 511static
290int find_dirent_index(nfs_readdir_descriptor_t *desc) 512void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
291{ 513{
292 struct nfs_entry *entry = desc->entry; 514 void *ptr;
293 int loop_count = 0, 515 unsigned int i;
294 status; 516
517 for (i = 0; i < npages; i++) {
518 struct page *page = alloc_page(GFP_KERNEL);
519 if (page == NULL)
520 goto out_freepages;
521 pages[i] = page;
522 }
295 523
296 for(;;) { 524 ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
297 status = dir_decode(desc); 525 if (!IS_ERR_OR_NULL(ptr))
298 if (status) 526 return ptr;
299 break; 527out_freepages:
528 nfs_readdir_free_pagearray(pages, i);
529 return NULL;
530}
531
532static
533int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
534{
535 struct page *pages[NFS_MAX_READDIR_PAGES];
536 void *pages_ptr = NULL;
537 struct nfs_entry entry;
538 struct file *file = desc->file;
539 struct nfs_cache_array *array;
540 int status = 0;
541 unsigned int array_size = ARRAY_SIZE(pages);
542
543 entry.prev_cookie = 0;
544 entry.cookie = *desc->dir_cookie;
545 entry.eof = 0;
546 entry.fh = nfs_alloc_fhandle();
547 entry.fattr = nfs_alloc_fattr();
548 if (entry.fh == NULL || entry.fattr == NULL)
549 goto out;
300 550
301 dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n", 551 array = nfs_readdir_get_array(page);
302 (unsigned long long)entry->cookie, desc->current_index); 552 memset(array, 0, sizeof(struct nfs_cache_array));
553 array->eof_index = -1;
303 554
304 if (desc->file->f_pos == desc->current_index) { 555 pages_ptr = nfs_readdir_large_page(pages, array_size);
305 *desc->dir_cookie = entry->cookie; 556 if (!pages_ptr)
557 goto out_release_array;
558 do {
559 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
560
561 if (status < 0)
306 break; 562 break;
307 } 563 nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
308 desc->current_index++; 564 } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
309 if (loop_count++ > 200) { 565
310 loop_count = 0; 566 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
311 schedule(); 567out_release_array:
312 } 568 nfs_readdir_release_array(page);
313 } 569out:
570 nfs_free_fattr(entry.fattr);
571 nfs_free_fhandle(entry.fh);
314 return status; 572 return status;
315} 573}
316 574
317/* 575/*
318 * Find the given page, and call find_dirent() or find_dirent_index in 576 * Now we cache directories properly, by converting xdr information
319 * order to try to return the next entry. 577 * to an array that can be used for lookups later. This results in
578 * fewer cache pages, since we can store more information on each page.
579 * We only need to convert from xdr once so future lookups are much simpler
320 */ 580 */
321static inline 581static
322int find_dirent_page(nfs_readdir_descriptor_t *desc) 582int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
323{ 583{
324 struct inode *inode = desc->file->f_path.dentry->d_inode; 584 struct inode *inode = desc->file->f_path.dentry->d_inode;
325 struct page *page;
326 int status;
327 585
328 dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", 586 if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
329 __func__, desc->page_index, 587 goto error;
330 (long long) *desc->dir_cookie); 588 SetPageUptodate(page);
331 589
332 /* If we find the page in the page_cache, we cannot be sure 590 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
333 * how fresh the data is, so we will ignore readdir_plus attributes. 591 /* Should never happen */
334 */ 592 nfs_zap_mapping(inode, inode->i_mapping);
335 desc->timestamp_valid = 0;
336 page = read_cache_page(inode->i_mapping, desc->page_index,
337 (filler_t *)nfs_readdir_filler, desc);
338 if (IS_ERR(page)) {
339 status = PTR_ERR(page);
340 goto out;
341 } 593 }
594 unlock_page(page);
595 return 0;
596 error:
597 unlock_page(page);
598 return -EIO;
599}
342 600
343 /* NOTE: Someone else may have changed the READDIRPLUS flag */ 601static
344 desc->page = page; 602void cache_page_release(nfs_readdir_descriptor_t *desc)
345 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 603{
346 if (*desc->dir_cookie != 0) 604 page_cache_release(desc->page);
347 status = find_dirent(desc); 605 desc->page = NULL;
348 else 606}
349 status = find_dirent_index(desc); 607
350 if (status < 0) 608static
351 dir_page_release(desc); 609struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
352 out: 610{
353 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); 611 struct page *page;
354 return status; 612 page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
613 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
614 if (IS_ERR(page))
615 desc->eof = 1;
616 return page;
355} 617}
356 618
357/* 619/*
358 * Recurse through the page cache pages, and return a 620 * Returns 0 if desc->dir_cookie was found on page desc->page_index
359 * filled nfs_entry structure of the next directory entry if possible.
360 *
361 * The target for the search is '*desc->dir_cookie' if non-0,
362 * 'desc->file->f_pos' otherwise
363 */ 621 */
622static
623int find_cache_page(nfs_readdir_descriptor_t *desc)
624{
625 int res;
626
627 desc->page = get_cache_page(desc);
628 if (IS_ERR(desc->page))
629 return PTR_ERR(desc->page);
630
631 res = nfs_readdir_search_array(desc);
632 if (res == 0)
633 return 0;
634 cache_page_release(desc);
635 return res;
636}
637
638/* Search for desc->dir_cookie from the beginning of the page cache */
364static inline 639static inline
365int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 640int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
366{ 641{
367 int loop_count = 0; 642 int res = -EAGAIN;
368 int res;
369
370 /* Always search-by-index from the beginning of the cache */
371 if (*desc->dir_cookie == 0) {
372 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n",
373 (long long)desc->file->f_pos);
374 desc->page_index = 0;
375 desc->entry->cookie = desc->entry->prev_cookie = 0;
376 desc->entry->eof = 0;
377 desc->current_index = 0;
378 } else
379 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
380 (unsigned long long)*desc->dir_cookie);
381 643
382 for (;;) { 644 while (1) {
383 res = find_dirent_page(desc); 645 res = find_cache_page(desc);
384 if (res != -EAGAIN) 646 if (res != -EAGAIN)
385 break; 647 break;
386 /* Align to beginning of next page */ 648 desc->page_index++;
387 desc->page_index ++;
388 if (loop_count++ > 200) {
389 loop_count = 0;
390 schedule();
391 }
392 } 649 }
393
394 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
395 return res; 650 return res;
396} 651}
397 652
@@ -400,8 +655,6 @@ static inline unsigned int dt_type(struct inode *inode)
400 return (inode->i_mode >> 12) & 15; 655 return (inode->i_mode >> 12) & 15;
401} 656}
402 657
403static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
404
405/* 658/*
406 * Once we've found the start of the dirent within a page: fill 'er up... 659 * Once we've found the start of the dirent within a page: fill 'er up...
407 */ 660 */
@@ -410,49 +663,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
410 filldir_t filldir) 663 filldir_t filldir)
411{ 664{
412 struct file *file = desc->file; 665 struct file *file = desc->file;
413 struct nfs_entry *entry = desc->entry; 666 int i = 0;
414 struct dentry *dentry = NULL; 667 int res = 0;
415 u64 fileid; 668 struct nfs_cache_array *array = NULL;
416 int loop_count = 0, 669 unsigned int d_type = DT_UNKNOWN;
417 res; 670 struct dentry *dentry = NULL;
418
419 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
420 (unsigned long long)entry->cookie);
421
422 for(;;) {
423 unsigned d_type = DT_UNKNOWN;
424 /* Note: entry->prev_cookie contains the cookie for
425 * retrieving the current dirent on the server */
426 fileid = entry->ino;
427
428 /* Get a dentry if we have one */
429 if (dentry != NULL)
430 dput(dentry);
431 dentry = nfs_readdir_lookup(desc);
432 671
433 /* Use readdirplus info */ 672 array = nfs_readdir_get_array(desc->page);
434 if (dentry != NULL && dentry->d_inode != NULL) {
435 d_type = dt_type(dentry->d_inode);
436 fileid = NFS_FILEID(dentry->d_inode);
437 }
438 673
439 res = filldir(dirent, entry->name, entry->len, 674 for (i = desc->cache_entry_index; i < array->size; i++) {
440 file->f_pos, nfs_compat_user_ino64(fileid), 675 d_type = DT_UNKNOWN;
441 d_type); 676
677 res = filldir(dirent, array->array[i].string.name,
678 array->array[i].string.len, file->f_pos,
679 nfs_compat_user_ino64(array->array[i].ino), d_type);
442 if (res < 0) 680 if (res < 0)
443 break; 681 break;
444 file->f_pos++; 682 file->f_pos++;
445 *desc->dir_cookie = entry->cookie; 683 desc->cache_entry_index = i;
446 if (dir_decode(desc) != 0) { 684 if (i < (array->size-1))
447 desc->page_index ++; 685 *desc->dir_cookie = array->array[i+1].cookie;
686 else
687 *desc->dir_cookie = array->last_cookie;
688 if (i == array->eof_index) {
689 desc->eof = 1;
448 break; 690 break;
449 } 691 }
450 if (loop_count++ > 200) {
451 loop_count = 0;
452 schedule();
453 }
454 } 692 }
455 dir_page_release(desc); 693
694 nfs_readdir_release_array(desc->page);
695 cache_page_release(desc);
456 if (dentry != NULL) 696 if (dentry != NULL)
457 dput(dentry); 697 dput(dentry);
458 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 698 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
@@ -476,12 +716,9 @@ static inline
476int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 716int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
477 filldir_t filldir) 717 filldir_t filldir)
478{ 718{
479 struct file *file = desc->file;
480 struct inode *inode = file->f_path.dentry->d_inode;
481 struct rpc_cred *cred = nfs_file_cred(file);
482 struct page *page = NULL; 719 struct page *page = NULL;
483 int status; 720 int status;
484 unsigned long timestamp, gencount; 721 struct inode *inode = desc->file->f_path.dentry->d_inode;
485 722
486 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 723 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
487 (unsigned long long)*desc->dir_cookie); 724 (unsigned long long)*desc->dir_cookie);
@@ -491,38 +728,22 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
491 status = -ENOMEM; 728 status = -ENOMEM;
492 goto out; 729 goto out;
493 } 730 }
494 timestamp = jiffies; 731
495 gencount = nfs_inc_attr_generation_counter(); 732 if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
496 status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
497 *desc->dir_cookie, page,
498 NFS_SERVER(inode)->dtsize,
499 desc->plus);
500 desc->page = page;
501 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
502 if (status >= 0) {
503 desc->timestamp = timestamp;
504 desc->gencount = gencount;
505 desc->timestamp_valid = 1;
506 if ((status = dir_decode(desc)) == 0)
507 desc->entry->prev_cookie = *desc->dir_cookie;
508 } else
509 status = -EIO; 733 status = -EIO;
510 if (status < 0)
511 goto out_release; 734 goto out_release;
735 }
512 736
737 desc->page_index = 0;
738 desc->page = page;
513 status = nfs_do_filldir(desc, dirent, filldir); 739 status = nfs_do_filldir(desc, dirent, filldir);
514 740
515 /* Reset read descriptor so it searches the page cache from
516 * the start upon the next call to readdir_search_pagecache() */
517 desc->page_index = 0;
518 desc->entry->cookie = desc->entry->prev_cookie = 0;
519 desc->entry->eof = 0;
520 out: 741 out:
521 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 742 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
522 __func__, status); 743 __func__, status);
523 return status; 744 return status;
524 out_release: 745 out_release:
525 dir_page_release(desc); 746 cache_page_release(desc);
526 goto out; 747 goto out;
527} 748}
528 749
@@ -536,7 +757,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
536 struct inode *inode = dentry->d_inode; 757 struct inode *inode = dentry->d_inode;
537 nfs_readdir_descriptor_t my_desc, 758 nfs_readdir_descriptor_t my_desc,
538 *desc = &my_desc; 759 *desc = &my_desc;
539 struct nfs_entry my_entry;
540 int res = -ENOMEM; 760 int res = -ENOMEM;
541 761
542 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 762 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -557,26 +777,17 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
557 desc->decode = NFS_PROTO(inode)->decode_dirent; 777 desc->decode = NFS_PROTO(inode)->decode_dirent;
558 desc->plus = NFS_USE_READDIRPLUS(inode); 778 desc->plus = NFS_USE_READDIRPLUS(inode);
559 779
560 my_entry.cookie = my_entry.prev_cookie = 0;
561 my_entry.eof = 0;
562 my_entry.fh = nfs_alloc_fhandle();
563 my_entry.fattr = nfs_alloc_fattr();
564 if (my_entry.fh == NULL || my_entry.fattr == NULL)
565 goto out_alloc_failed;
566
567 desc->entry = &my_entry;
568
569 nfs_block_sillyrename(dentry); 780 nfs_block_sillyrename(dentry);
570 res = nfs_revalidate_mapping(inode, filp->f_mapping); 781 res = nfs_revalidate_mapping(inode, filp->f_mapping);
571 if (res < 0) 782 if (res < 0)
572 goto out; 783 goto out;
573 784
574 while(!desc->entry->eof) { 785 while (desc->eof != 1) {
575 res = readdir_search_pagecache(desc); 786 res = readdir_search_pagecache(desc);
576 787
577 if (res == -EBADCOOKIE) { 788 if (res == -EBADCOOKIE) {
578 /* This means either end of directory */ 789 /* This means either end of directory */
579 if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) { 790 if (*desc->dir_cookie && desc->eof == 0) {
580 /* Or that the server has 'lost' a cookie */ 791 /* Or that the server has 'lost' a cookie */
581 res = uncached_readdir(desc, dirent, filldir); 792 res = uncached_readdir(desc, dirent, filldir);
582 if (res >= 0) 793 if (res >= 0)
@@ -588,8 +799,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
588 if (res == -ETOOSMALL && desc->plus) { 799 if (res == -ETOOSMALL && desc->plus) {
589 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 800 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
590 nfs_zap_caches(inode); 801 nfs_zap_caches(inode);
802 desc->page_index = 0;
591 desc->plus = 0; 803 desc->plus = 0;
592 desc->entry->eof = 0; 804 desc->eof = 0;
593 continue; 805 continue;
594 } 806 }
595 if (res < 0) 807 if (res < 0)
@@ -605,9 +817,6 @@ out:
605 nfs_unblock_sillyrename(dentry); 817 nfs_unblock_sillyrename(dentry);
606 if (res > 0) 818 if (res > 0)
607 res = 0; 819 res = 0;
608out_alloc_failed:
609 nfs_free_fattr(my_entry.fattr);
610 nfs_free_fhandle(my_entry.fh);
611 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", 820 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
612 dentry->d_parent->d_name.name, dentry->d_name.name, 821 dentry->d_parent->d_name.name, dentry->d_name.name,
613 res); 822 res);
@@ -1029,10 +1238,63 @@ static int is_atomic_open(struct nameidata *nd)
1029 return 1; 1238 return 1;
1030} 1239}
1031 1240
1241static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
1242{
1243 struct path path = {
1244 .mnt = nd->path.mnt,
1245 .dentry = dentry,
1246 };
1247 struct nfs_open_context *ctx;
1248 struct rpc_cred *cred;
1249 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
1250
1251 cred = rpc_lookup_cred();
1252 if (IS_ERR(cred))
1253 return ERR_CAST(cred);
1254 ctx = alloc_nfs_open_context(&path, cred, fmode);
1255 put_rpccred(cred);
1256 if (ctx == NULL)
1257 return ERR_PTR(-ENOMEM);
1258 return ctx;
1259}
1260
/*
 * Open callback handed to lookup_instantiate_filp(): sets the fscache
 * inode cookie for @filp and always reports success.
 */
static int do_open(struct inode *inode, struct file *filp)
{
	nfs_fscache_set_inode_cookie(inode, filp);

	return 0;
}
1266
1267static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
1268{
1269 struct file *filp;
1270 int ret = 0;
1271
1272 /* If the open_intent is for execute, we have an extra check to make */
1273 if (ctx->mode & FMODE_EXEC) {
1274 ret = nfs_may_open(ctx->path.dentry->d_inode,
1275 ctx->cred,
1276 nd->intent.open.flags);
1277 if (ret < 0)
1278 goto out;
1279 }
1280 filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
1281 if (IS_ERR(filp))
1282 ret = PTR_ERR(filp);
1283 else
1284 nfs_file_set_open_context(filp, ctx);
1285out:
1286 put_nfs_open_context(ctx);
1287 return ret;
1288}
1289
1032static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 1290static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1033{ 1291{
1292 struct nfs_open_context *ctx;
1293 struct iattr attr;
1034 struct dentry *res = NULL; 1294 struct dentry *res = NULL;
1035 int error; 1295 struct inode *inode;
1296 int open_flags;
1297 int err;
1036 1298
1037 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", 1299 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
1038 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1300 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1054,13 +1316,32 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1054 goto out; 1316 goto out;
1055 } 1317 }
1056 1318
1319 ctx = nameidata_to_nfs_open_context(dentry, nd);
1320 res = ERR_CAST(ctx);
1321 if (IS_ERR(ctx))
1322 goto out;
1323
1324 open_flags = nd->intent.open.flags;
1325 if (nd->flags & LOOKUP_CREATE) {
1326 attr.ia_mode = nd->intent.open.create_mode;
1327 attr.ia_valid = ATTR_MODE;
1328 if (!IS_POSIXACL(dir))
1329 attr.ia_mode &= ~current_umask();
1330 } else {
1331 open_flags &= ~(O_EXCL | O_CREAT);
1332 attr.ia_valid = 0;
1333 }
1334
1057 /* Open the file on the server */ 1335 /* Open the file on the server */
1058 res = nfs4_atomic_open(dir, dentry, nd); 1336 nfs_block_sillyrename(dentry->d_parent);
1059 if (IS_ERR(res)) { 1337 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1060 error = PTR_ERR(res); 1338 if (IS_ERR(inode)) {
1061 switch (error) { 1339 nfs_unblock_sillyrename(dentry->d_parent);
1340 put_nfs_open_context(ctx);
1341 switch (PTR_ERR(inode)) {
1062 /* Make a negative dentry */ 1342 /* Make a negative dentry */
1063 case -ENOENT: 1343 case -ENOENT:
1344 d_add(dentry, NULL);
1064 res = NULL; 1345 res = NULL;
1065 goto out; 1346 goto out;
1066 /* This turned out not to be a regular file */ 1347 /* This turned out not to be a regular file */
@@ -1072,11 +1353,25 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1072 goto no_open; 1353 goto no_open;
1073 /* case -EINVAL: */ 1354 /* case -EINVAL: */
1074 default: 1355 default:
1356 res = ERR_CAST(inode);
1075 goto out; 1357 goto out;
1076 } 1358 }
1077 } else if (res != NULL) 1359 }
1360 res = d_add_unique(dentry, inode);
1361 nfs_unblock_sillyrename(dentry->d_parent);
1362 if (res != NULL) {
1363 dput(ctx->path.dentry);
1364 ctx->path.dentry = dget(res);
1078 dentry = res; 1365 dentry = res;
1366 }
1367 err = nfs_intent_set_file(nd, ctx);
1368 if (err < 0) {
1369 if (res != NULL)
1370 dput(res);
1371 return ERR_PTR(err);
1372 }
1079out: 1373out:
1374 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1080 return res; 1375 return res;
1081no_open: 1376no_open:
1082 return nfs_lookup(dir, dentry, nd); 1377 return nfs_lookup(dir, dentry, nd);
@@ -1087,12 +1382,15 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1087 struct dentry *parent = NULL; 1382 struct dentry *parent = NULL;
1088 struct inode *inode = dentry->d_inode; 1383 struct inode *inode = dentry->d_inode;
1089 struct inode *dir; 1384 struct inode *dir;
1385 struct nfs_open_context *ctx;
1090 int openflags, ret = 0; 1386 int openflags, ret = 0;
1091 1387
1092 if (!is_atomic_open(nd) || d_mountpoint(dentry)) 1388 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1093 goto no_open; 1389 goto no_open;
1390
1094 parent = dget_parent(dentry); 1391 parent = dget_parent(dentry);
1095 dir = parent->d_inode; 1392 dir = parent->d_inode;
1393
1096 /* We can't create new files in nfs_open_revalidate(), so we 1394 /* We can't create new files in nfs_open_revalidate(), so we
1097 * optimize away revalidation of negative dentries. 1395 * optimize away revalidation of negative dentries.
1098 */ 1396 */
@@ -1112,99 +1410,96 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1112 /* We can't create new files, or truncate existing ones here */ 1410 /* We can't create new files, or truncate existing ones here */
1113 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); 1411 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
1114 1412
1413 ctx = nameidata_to_nfs_open_context(dentry, nd);
1414 ret = PTR_ERR(ctx);
1415 if (IS_ERR(ctx))
1416 goto out;
1115 /* 1417 /*
1116 * Note: we're not holding inode->i_mutex and so may be racing with 1418 * Note: we're not holding inode->i_mutex and so may be racing with
1117 * operations that change the directory. We therefore save the 1419 * operations that change the directory. We therefore save the
1118 * change attribute *before* we do the RPC call. 1420 * change attribute *before* we do the RPC call.
1119 */ 1421 */
1120 ret = nfs4_open_revalidate(dir, dentry, openflags, nd); 1422 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
1423 if (IS_ERR(inode)) {
1424 ret = PTR_ERR(inode);
1425 switch (ret) {
1426 case -EPERM:
1427 case -EACCES:
1428 case -EDQUOT:
1429 case -ENOSPC:
1430 case -EROFS:
1431 goto out_put_ctx;
1432 default:
1433 goto out_drop;
1434 }
1435 }
1436 iput(inode);
1437 if (inode != dentry->d_inode)
1438 goto out_drop;
1439
1440 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1441 ret = nfs_intent_set_file(nd, ctx);
1442 if (ret >= 0)
1443 ret = 1;
1121out: 1444out:
1122 dput(parent); 1445 dput(parent);
1123 if (!ret)
1124 d_drop(dentry);
1125 return ret; 1446 return ret;
1447out_drop:
1448 d_drop(dentry);
1449 ret = 0;
1450out_put_ctx:
1451 put_nfs_open_context(ctx);
1452 goto out;
1453
1126no_open_dput: 1454no_open_dput:
1127 dput(parent); 1455 dput(parent);
1128no_open: 1456no_open:
1129 return nfs_lookup_revalidate(dentry, nd); 1457 return nfs_lookup_revalidate(dentry, nd);
1130} 1458}
1131#endif /* CONFIG_NFSV4 */
1132 1459
1133static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) 1460static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
1461 struct nameidata *nd)
1134{ 1462{
1135 struct dentry *parent = desc->file->f_path.dentry; 1463 struct nfs_open_context *ctx = NULL;
1136 struct inode *dir = parent->d_inode; 1464 struct iattr attr;
1137 struct nfs_entry *entry = desc->entry; 1465 int error;
1138 struct dentry *dentry, *alias; 1466 int open_flags = 0;
1139 struct qstr name = {
1140 .name = entry->name,
1141 .len = entry->len,
1142 };
1143 struct inode *inode;
1144 unsigned long verf = nfs_save_change_attribute(dir);
1145 1467
1146 switch (name.len) { 1468 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1147 case 2: 1469 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1148 if (name.name[0] == '.' && name.name[1] == '.')
1149 return dget_parent(parent);
1150 break;
1151 case 1:
1152 if (name.name[0] == '.')
1153 return dget(parent);
1154 }
1155 1470
1156 spin_lock(&dir->i_lock); 1471 attr.ia_mode = mode;
1157 if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) { 1472 attr.ia_valid = ATTR_MODE;
1158 spin_unlock(&dir->i_lock);
1159 return NULL;
1160 }
1161 spin_unlock(&dir->i_lock);
1162 1473
1163 name.hash = full_name_hash(name.name, name.len); 1474 if ((nd->flags & LOOKUP_CREATE) != 0) {
1164 dentry = d_lookup(parent, &name); 1475 open_flags = nd->intent.open.flags;
1165 if (dentry != NULL) {
1166 /* Is this a positive dentry that matches the readdir info? */
1167 if (dentry->d_inode != NULL &&
1168 (NFS_FILEID(dentry->d_inode) == entry->ino ||
1169 d_mountpoint(dentry))) {
1170 if (!desc->plus || entry->fh->size == 0)
1171 return dentry;
1172 if (nfs_compare_fh(NFS_FH(dentry->d_inode),
1173 entry->fh) == 0)
1174 goto out_renew;
1175 }
1176 /* No, so d_drop to allow one to be created */
1177 d_drop(dentry);
1178 dput(dentry);
1179 }
1180 if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
1181 return NULL;
1182 if (name.len > NFS_SERVER(dir)->namelen)
1183 return NULL;
1184 /* Note: caller is already holding the dir->i_mutex! */
1185 dentry = d_alloc(parent, &name);
1186 if (dentry == NULL)
1187 return NULL;
1188 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1189 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
1190 if (IS_ERR(inode)) {
1191 dput(dentry);
1192 return NULL;
1193 }
1194 1476
1195 alias = d_materialise_unique(dentry, inode); 1477 ctx = nameidata_to_nfs_open_context(dentry, nd);
1196 if (alias != NULL) { 1478 error = PTR_ERR(ctx);
1197 dput(dentry); 1479 if (IS_ERR(ctx))
1198 if (IS_ERR(alias)) 1480 goto out_err_drop;
1199 return NULL;
1200 dentry = alias;
1201 } 1481 }
1202 1482
1203out_renew: 1483 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
1204 nfs_set_verifier(dentry, verf); 1484 if (error != 0)
1205 return dentry; 1485 goto out_put_ctx;
1486 if (ctx != NULL) {
1487 error = nfs_intent_set_file(nd, ctx);
1488 if (error < 0)
1489 goto out_err;
1490 }
1491 return 0;
1492out_put_ctx:
1493 if (ctx != NULL)
1494 put_nfs_open_context(ctx);
1495out_err_drop:
1496 d_drop(dentry);
1497out_err:
1498 return error;
1206} 1499}
1207 1500
1501#endif /* CONFIG_NFSV4 */
1502
1208/* 1503/*
1209 * Code common to create, mkdir, and mknod. 1504 * Code common to create, mkdir, and mknod.
1210 */ 1505 */
@@ -1258,7 +1553,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1258{ 1553{
1259 struct iattr attr; 1554 struct iattr attr;
1260 int error; 1555 int error;
1261 int open_flags = 0;
1262 1556
1263 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1557 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1264 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1558 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1266,10 +1560,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1266 attr.ia_mode = mode; 1560 attr.ia_mode = mode;
1267 attr.ia_valid = ATTR_MODE; 1561 attr.ia_valid = ATTR_MODE;
1268 1562
1269 if ((nd->flags & LOOKUP_CREATE) != 0) 1563 error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
1270 open_flags = nd->intent.open.flags;
1271
1272 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
1273 if (error != 0) 1564 if (error != 0)
1274 goto out_err; 1565 goto out_err;
1275 return 0; 1566 return 0;
@@ -1351,76 +1642,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1351 return error; 1642 return error;
1352} 1643}
1353 1644
1354static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
1355{
1356 static unsigned int sillycounter;
1357 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
1358 const int countersize = sizeof(sillycounter)*2;
1359 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
1360 char silly[slen+1];
1361 struct qstr qsilly;
1362 struct dentry *sdentry;
1363 int error = -EIO;
1364
1365 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
1366 dentry->d_parent->d_name.name, dentry->d_name.name,
1367 atomic_read(&dentry->d_count));
1368 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
1369
1370 /*
1371 * We don't allow a dentry to be silly-renamed twice.
1372 */
1373 error = -EBUSY;
1374 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1375 goto out;
1376
1377 sprintf(silly, ".nfs%*.*Lx",
1378 fileidsize, fileidsize,
1379 (unsigned long long)NFS_FILEID(dentry->d_inode));
1380
1381 /* Return delegation in anticipation of the rename */
1382 nfs_inode_return_delegation(dentry->d_inode);
1383
1384 sdentry = NULL;
1385 do {
1386 char *suffix = silly + slen - countersize;
1387
1388 dput(sdentry);
1389 sillycounter++;
1390 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
1391
1392 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
1393 dentry->d_name.name, silly);
1394
1395 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
1396 /*
1397 * N.B. Better to return EBUSY here ... it could be
1398 * dangerous to delete the file while it's in use.
1399 */
1400 if (IS_ERR(sdentry))
1401 goto out;
1402 } while(sdentry->d_inode != NULL); /* need negative lookup */
1403
1404 qsilly.name = silly;
1405 qsilly.len = strlen(silly);
1406 if (dentry->d_inode) {
1407 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1408 dir, &qsilly);
1409 nfs_mark_for_revalidate(dentry->d_inode);
1410 } else
1411 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1412 dir, &qsilly);
1413 if (!error) {
1414 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1415 d_move(dentry, sdentry);
1416 error = nfs_async_unlink(dir, dentry);
1417 /* If we return 0 we don't unlink */
1418 }
1419 dput(sdentry);
1420out:
1421 return error;
1422}
1423
1424/* 1645/*
1425 * Remove a file after making sure there are no pending writes, 1646 * Remove a file after making sure there are no pending writes,
1426 * and after checking that the file has only one user. 1647 * and after checking that the file has only one user.
@@ -1580,7 +1801,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1580 d_drop(dentry); 1801 d_drop(dentry);
1581 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1802 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1582 if (error == 0) { 1803 if (error == 0) {
1583 atomic_inc(&inode->i_count); 1804 ihold(inode);
1584 d_add(dentry, inode); 1805 d_add(dentry, inode);
1585 } 1806 }
1586 return error; 1807 return error;
@@ -1711,14 +1932,14 @@ static void nfs_access_free_list(struct list_head *head)
1711int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 1932int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
1712{ 1933{
1713 LIST_HEAD(head); 1934 LIST_HEAD(head);
1714 struct nfs_inode *nfsi; 1935 struct nfs_inode *nfsi, *next;
1715 struct nfs_access_entry *cache; 1936 struct nfs_access_entry *cache;
1716 1937
1717 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 1938 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1718 return (nr_to_scan == 0) ? 0 : -1; 1939 return (nr_to_scan == 0) ? 0 : -1;
1719 1940
1720 spin_lock(&nfs_access_lru_lock); 1941 spin_lock(&nfs_access_lru_lock);
1721 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1942 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
1722 struct inode *inode; 1943 struct inode *inode;
1723 1944
1724 if (nr_to_scan-- == 0) 1945 if (nr_to_scan-- == 0)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 064a80961677..84d3c8b90206 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -873,7 +873,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
873 dreq->inode = inode; 873 dreq->inode = inode;
874 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 874 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
875 dreq->l_ctx = nfs_get_lock_context(dreq->ctx); 875 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
876 if (dreq->l_ctx != NULL) 876 if (dreq->l_ctx == NULL)
877 goto out_release; 877 goto out_release;
878 if (!is_sync_kiocb(iocb)) 878 if (!is_sync_kiocb(iocb))
879 dreq->iocb = iocb; 879 dreq->iocb = iocb;
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index dba50a5625db..a6e711ad130f 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -167,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
167 return 0; 167 return 0;
168 } 168 }
169 item = container_of(h, struct nfs_dns_ent, h); 169 item = container_of(h, struct nfs_dns_ent, h);
170 ttl = (long)item->h.expiry_time - (long)get_seconds(); 170 ttl = item->h.expiry_time - seconds_since_boot();
171 if (ttl < 0) 171 if (ttl < 0)
172 ttl = 0; 172 ttl = 0;
173 173
@@ -239,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
239 ttl = get_expiry(&buf); 239 ttl = get_expiry(&buf);
240 if (ttl == 0) 240 if (ttl == 0)
241 goto out; 241 goto out;
242 key.h.expiry_time = ttl + get_seconds(); 242 key.h.expiry_time = ttl + seconds_since_boot();
243 243
244 ret = -ENOMEM; 244 ret = -ENOMEM;
245 item = nfs_dns_lookup(cd, &key); 245 item = nfs_dns_lookup(cd, &key);
@@ -301,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd,
301 goto out_err; 301 goto out_err;
302 ret = -ETIMEDOUT; 302 ret = -ETIMEDOUT;
303 if (!test_bit(CACHE_VALID, &(*item)->h.flags) 303 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
304 || (*item)->h.expiry_time < get_seconds() 304 || (*item)->h.expiry_time < seconds_since_boot()
305 || cd->flush_time > (*item)->h.last_refresh) 305 || cd->flush_time > (*item)->h.last_refresh)
306 goto out_put; 306 goto out_put;
307 ret = -ENOENT; 307 ret = -ENOENT;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05bf3c0dc751..60677f9f1311 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#define NFSDBG_FACILITY NFSDBG_FILE 41#define NFSDBG_FACILITY NFSDBG_FILE
41 42
@@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
386 file->f_path.dentry->d_name.name, 387 file->f_path.dentry->d_name.name,
387 mapping->host->i_ino, len, (long long) pos); 388 mapping->host->i_ino, len, (long long) pos);
388 389
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
389start: 394start:
390 /* 395 /*
391 * Prevent starvation issues if someone is doing a consistency 396 * Prevent starvation issues if someone is doing a consistency
@@ -551,7 +556,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
551 struct file *filp = vma->vm_file; 556 struct file *filp = vma->vm_file;
552 struct dentry *dentry = filp->f_path.dentry; 557 struct dentry *dentry = filp->f_path.dentry;
553 unsigned pagelen; 558 unsigned pagelen;
554 int ret = -EINVAL; 559 int ret = VM_FAULT_NOPAGE;
555 struct address_space *mapping; 560 struct address_space *mapping;
556 561
557 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", 562 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
@@ -567,21 +572,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
567 if (mapping != dentry->d_inode->i_mapping) 572 if (mapping != dentry->d_inode->i_mapping)
568 goto out_unlock; 573 goto out_unlock;
569 574
570 ret = 0;
571 pagelen = nfs_page_length(page); 575 pagelen = nfs_page_length(page);
572 if (pagelen == 0) 576 if (pagelen == 0)
573 goto out_unlock; 577 goto out_unlock;
574 578
575 ret = nfs_flush_incompatible(filp, page); 579 ret = VM_FAULT_LOCKED;
576 if (ret != 0) 580 if (nfs_flush_incompatible(filp, page) == 0 &&
577 goto out_unlock; 581 nfs_updatepage(filp, page, 0, pagelen) == 0)
582 goto out;
578 583
579 ret = nfs_updatepage(filp, page, 0, pagelen); 584 ret = VM_FAULT_SIGBUS;
580out_unlock: 585out_unlock:
581 if (!ret)
582 return VM_FAULT_LOCKED;
583 unlock_page(page); 586 unlock_page(page);
584 return VM_FAULT_SIGBUS; 587out:
588 return ret;
585} 589}
586 590
587static const struct vm_operations_struct nfs_file_vm_ops = { 591static const struct vm_operations_struct nfs_file_vm_ops = {
@@ -684,7 +688,8 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
684 return ret; 688 return ret;
685} 689}
686 690
687static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) 691static int
692do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
688{ 693{
689 struct inode *inode = filp->f_mapping->host; 694 struct inode *inode = filp->f_mapping->host;
690 int status = 0; 695 int status = 0;
@@ -699,7 +704,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
699 if (nfs_have_delegation(inode, FMODE_READ)) 704 if (nfs_have_delegation(inode, FMODE_READ))
700 goto out_noconflict; 705 goto out_noconflict;
701 706
702 if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) 707 if (is_local)
703 goto out_noconflict; 708 goto out_noconflict;
704 709
705 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 710 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
@@ -726,7 +731,8 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
726 return res; 731 return res;
727} 732}
728 733
729static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) 734static int
735do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
730{ 736{
731 struct inode *inode = filp->f_mapping->host; 737 struct inode *inode = filp->f_mapping->host;
732 int status; 738 int status;
@@ -741,15 +747,24 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
741 * If we're signalled while cleaning up locks on process exit, we 747 * If we're signalled while cleaning up locks on process exit, we
742 * still need to complete the unlock. 748 * still need to complete the unlock.
743 */ 749 */
744 /* Use local locking if mounted with "-onolock" */ 750 /*
745 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 751 * Use local locking if mounted with "-onolock" or with appropriate
752 * "-olocal_lock="
753 */
754 if (!is_local)
746 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 755 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
747 else 756 else
748 status = do_vfs_lock(filp, fl); 757 status = do_vfs_lock(filp, fl);
749 return status; 758 return status;
750} 759}
751 760
752static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) 761static int
762is_time_granular(struct timespec *ts) {
763 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
764}
765
766static int
767do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
753{ 768{
754 struct inode *inode = filp->f_mapping->host; 769 struct inode *inode = filp->f_mapping->host;
755 int status; 770 int status;
@@ -762,20 +777,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
762 if (status != 0) 777 if (status != 0)
763 goto out; 778 goto out;
764 779
765 /* Use local locking if mounted with "-onolock" */ 780 /*
766 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 781 * Use local locking if mounted with "-onolock" or with appropriate
782 * "-olocal_lock="
783 */
784 if (!is_local)
767 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 785 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
768 else 786 else
769 status = do_vfs_lock(filp, fl); 787 status = do_vfs_lock(filp, fl);
770 if (status < 0) 788 if (status < 0)
771 goto out; 789 goto out;
790
772 /* 791 /*
773 * Make sure we clear the cache whenever we try to get the lock. 792 * Revalidate the cache if the server has time stamps granular
793 * enough to detect subsecond changes. Otherwise, clear the
794 * cache to prevent missing any changes.
795 *
774 * This makes locking act as a cache coherency point. 796 * This makes locking act as a cache coherency point.
775 */ 797 */
776 nfs_sync_mapping(filp->f_mapping); 798 nfs_sync_mapping(filp->f_mapping);
777 if (!nfs_have_delegation(inode, FMODE_READ)) 799 if (!nfs_have_delegation(inode, FMODE_READ)) {
778 nfs_zap_caches(inode); 800 if (is_time_granular(&NFS_SERVER(inode)->time_delta))
801 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
802 else
803 nfs_zap_caches(inode);
804 }
779out: 805out:
780 return status; 806 return status;
781} 807}
@@ -787,6 +813,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
787{ 813{
788 struct inode *inode = filp->f_mapping->host; 814 struct inode *inode = filp->f_mapping->host;
789 int ret = -ENOLCK; 815 int ret = -ENOLCK;
816 int is_local = 0;
790 817
791 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", 818 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
792 filp->f_path.dentry->d_parent->d_name.name, 819 filp->f_path.dentry->d_parent->d_name.name,
@@ -800,6 +827,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
800 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) 827 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
801 goto out_err; 828 goto out_err;
802 829
830 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
831 is_local = 1;
832
803 if (NFS_PROTO(inode)->lock_check_bounds != NULL) { 833 if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
804 ret = NFS_PROTO(inode)->lock_check_bounds(fl); 834 ret = NFS_PROTO(inode)->lock_check_bounds(fl);
805 if (ret < 0) 835 if (ret < 0)
@@ -807,11 +837,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
807 } 837 }
808 838
809 if (IS_GETLK(cmd)) 839 if (IS_GETLK(cmd))
810 ret = do_getlk(filp, cmd, fl); 840 ret = do_getlk(filp, cmd, fl, is_local);
811 else if (fl->fl_type == F_UNLCK) 841 else if (fl->fl_type == F_UNLCK)
812 ret = do_unlk(filp, cmd, fl); 842 ret = do_unlk(filp, cmd, fl, is_local);
813 else 843 else
814 ret = do_setlk(filp, cmd, fl); 844 ret = do_setlk(filp, cmd, fl, is_local);
815out_err: 845out_err:
816 return ret; 846 return ret;
817} 847}
@@ -821,6 +851,9 @@ out_err:
821 */ 851 */
822static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) 852static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
823{ 853{
854 struct inode *inode = filp->f_mapping->host;
855 int is_local = 0;
856
824 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", 857 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
825 filp->f_path.dentry->d_parent->d_name.name, 858 filp->f_path.dentry->d_parent->d_name.name,
826 filp->f_path.dentry->d_name.name, 859 filp->f_path.dentry->d_name.name,
@@ -829,14 +862,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
829 if (!(fl->fl_flags & FL_FLOCK)) 862 if (!(fl->fl_flags & FL_FLOCK))
830 return -ENOLCK; 863 return -ENOLCK;
831 864
865 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
866 is_local = 1;
867
832 /* We're simulating flock() locks using posix locks on the server */ 868 /* We're simulating flock() locks using posix locks on the server */
833 fl->fl_owner = (fl_owner_t)filp; 869 fl->fl_owner = (fl_owner_t)filp;
834 fl->fl_start = 0; 870 fl->fl_start = 0;
835 fl->fl_end = OFFSET_MAX; 871 fl->fl_end = OFFSET_MAX;
836 872
837 if (fl->fl_type == F_UNLCK) 873 if (fl->fl_type == F_UNLCK)
838 return do_unlk(filp, cmd, fl); 874 return do_unlk(filp, cmd, fl, is_local);
839 return do_setlk(filp, cmd, fl); 875 return do_setlk(filp, cmd, fl, is_local);
840} 876}
841 877
842/* 878/*
@@ -848,6 +884,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
848 dprintk("NFS: setlease(%s/%s, arg=%ld)\n", 884 dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
849 file->f_path.dentry->d_parent->d_name.name, 885 file->f_path.dentry->d_parent->d_name.name,
850 file->f_path.dentry->d_name.name, arg); 886 file->f_path.dentry->d_name.name, arg);
851
852 return -EINVAL; 887 return -EINVAL;
853} 888}
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e1605..ac7b814ce162 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
54 iput(inode); 54 iput(inode);
55 return -ENOMEM; 55 return -ENOMEM;
56 } 56 }
57 /* Circumvent igrab(): we know the inode is not being freed */ 57 ihold(inode);
58 atomic_inc(&inode->i_count);
59 /* 58 /*
60 * Ensure that this dentry is invisible to d_find_alias(). 59 * Ensure that this dentry is invisible to d_find_alias().
61 * Otherwise, it may be spliced into the tree by 60 * Otherwise, it may be spliced into the tree by
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 21a84d45916f..4e2d9b6b1380 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,6 +34,212 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36 36
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38
39#include <linux/slab.h>
40#include <linux/cred.h>
41#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h>
43#include <linux/key-type.h>
44#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h>
47
48#include <keys/user-type.h>
49
50#define NFS_UINT_MAXLEN 11
51
52const struct cred *id_resolver_cache;
53
54struct key_type key_type_id_resolver = {
55 .name = "id_resolver",
56 .instantiate = user_instantiate,
57 .match = user_match,
58 .revoke = user_revoke,
59 .destroy = user_destroy,
60 .describe = user_describe,
61 .read = user_read,
62};
63
64int nfs_idmap_init(void)
65{
66 struct cred *cred;
67 struct key *keyring;
68 int ret = 0;
69
70 printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
71
72 cred = prepare_kernel_cred(NULL);
73 if (!cred)
74 return -ENOMEM;
75
76 keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
77 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
78 KEY_USR_VIEW | KEY_USR_READ,
79 KEY_ALLOC_NOT_IN_QUOTA);
80 if (IS_ERR(keyring)) {
81 ret = PTR_ERR(keyring);
82 goto failed_put_cred;
83 }
84
85 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
86 if (ret < 0)
87 goto failed_put_key;
88
89 ret = register_key_type(&key_type_id_resolver);
90 if (ret < 0)
91 goto failed_put_key;
92
93 cred->thread_keyring = keyring;
94 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
95 id_resolver_cache = cred;
96 return 0;
97
98failed_put_key:
99 key_put(keyring);
100failed_put_cred:
101 put_cred(cred);
102 return ret;
103}
104
105void nfs_idmap_quit(void)
106{
107 key_revoke(id_resolver_cache->thread_keyring);
108 unregister_key_type(&key_type_id_resolver);
109 put_cred(id_resolver_cache);
110}
111
112/*
113 * Assemble the description to pass to request_key()
114 * This function will allocate a new string and update dest to point
115 * at it. The caller is responsible for freeing dest.
116 *
117 * On error 0 is returned. Otherwise, the length of dest is returned.
118 */
119static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
120 const char *type, size_t typelen, char **desc)
121{
122 char *cp;
123 size_t desclen = typelen + namelen + 2;
124
125 *desc = kmalloc(desclen, GFP_KERNEL);
126 if (!*desc)
127 return -ENOMEM;
128
129 cp = *desc;
130 memcpy(cp, type, typelen);
131 cp += typelen;
132 *cp++ = ':';
133
134 memcpy(cp, name, namelen);
135 cp += namelen;
136 *cp = '\0';
137 return desclen;
138}
139
140static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
141 const char *type, void *data, size_t data_size)
142{
143 const struct cred *saved_cred;
144 struct key *rkey;
145 char *desc;
146 struct user_key_payload *payload;
147 ssize_t ret;
148
149 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
150 if (ret <= 0)
151 goto out;
152
153 saved_cred = override_creds(id_resolver_cache);
154 rkey = request_key(&key_type_id_resolver, desc, "");
155 revert_creds(saved_cred);
156 kfree(desc);
157 if (IS_ERR(rkey)) {
158 ret = PTR_ERR(rkey);
159 goto out;
160 }
161
162 rcu_read_lock();
163 rkey->perm |= KEY_USR_VIEW;
164
165 ret = key_validate(rkey);
166 if (ret < 0)
167 goto out_up;
168
169 payload = rcu_dereference(rkey->payload.data);
170 if (IS_ERR_OR_NULL(payload)) {
171 ret = PTR_ERR(payload);
172 goto out_up;
173 }
174
175 ret = payload->datalen;
176 if (ret > 0 && ret <= data_size)
177 memcpy(data, payload->data, ret);
178 else
179 ret = -EINVAL;
180
181out_up:
182 rcu_read_unlock();
183 key_put(rkey);
184out:
185 return ret;
186}
187
188
189/* ID -> Name */
190static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
191{
192 char id_str[NFS_UINT_MAXLEN];
193 int id_len;
194 ssize_t ret;
195
196 id_len = snprintf(id_str, sizeof(id_str), "%u", id);
197 ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen);
198 if (ret < 0)
199 return -EINVAL;
200 return ret;
201}
202
203/* Name -> ID */
204static int nfs_idmap_lookup_id(const char *name, size_t namelen,
205 const char *type, __u32 *id)
206{
207 char id_str[NFS_UINT_MAXLEN];
208 long id_long;
209 ssize_t data_size;
210 int ret = 0;
211
212 data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
213 if (data_size <= 0) {
214 ret = -EINVAL;
215 } else {
216 ret = strict_strtol(id_str, 10, &id_long);
217 *id = (__u32)id_long;
218 }
219 return ret;
220}
221
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
223{
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225}
226
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
228{
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230}
231
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
233{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen);
235}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
237{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen);
239}
240
241#else /* CONFIG_NFS_USE_IDMAPPER not defined */
242
37#include <linux/module.h> 243#include <linux/module.h>
38#include <linux/mutex.h> 244#include <linux/mutex.h>
39#include <linux/init.h> 245#include <linux/init.h>
@@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
503 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
504} 710}
505 711
506int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) 712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
507{ 713{
508 struct idmap *idmap = clp->cl_idmap; 714 struct idmap *idmap = clp->cl_idmap;
509 715
510 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
511} 717}
512int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) 718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
513{ 719{
514 struct idmap *idmap = clp->cl_idmap; 720 struct idmap *idmap = clp->cl_idmap;
515 721
516 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
517} 723}
518 724
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7d2d6c72aa78..314f57164602 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
48#include "internal.h" 48#include "internal.h"
49#include "fscache.h" 49#include "fscache.h"
50#include "dns_resolve.h" 50#include "dns_resolve.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_VFS 53#define NFSDBG_FACILITY NFSDBG_VFS
53 54
@@ -234,9 +235,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
234 return 0; 235 return 0;
235} 236}
236 237
237/* Don't use READDIRPLUS on directories that we believe are too large */
238#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
239
240/* 238/*
241 * This is our front-end to iget that looks up inodes by file handle 239 * This is our front-end to iget that looks up inodes by file handle
242 * instead of inode number. 240 * instead of inode number.
@@ -291,8 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 } else if (S_ISDIR(inode->i_mode)) { 289 } else if (S_ISDIR(inode->i_mode)) {
292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; 290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
293 inode->i_fop = &nfs_dir_operations; 291 inode->i_fop = &nfs_dir_operations;
294 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) 292 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
295 && fattr->size <= NFS_LIMIT_READDIRPLUS)
296 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 293 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
297 /* Deal with crossing mountpoints */ 294 /* Deal with crossing mountpoints */
298 if ((fattr->valid & NFS_ATTR_FATTR_FSID) 295 if ((fattr->valid & NFS_ATTR_FATTR_FSID)
@@ -623,7 +620,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
623 nfs_revalidate_inode(server, inode); 620 nfs_revalidate_inode(server, inode);
624} 621}
625 622
626static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred) 623struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
627{ 624{
628 struct nfs_open_context *ctx; 625 struct nfs_open_context *ctx;
629 626
@@ -633,11 +630,13 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
633 path_get(&ctx->path); 630 path_get(&ctx->path);
634 ctx->cred = get_rpccred(cred); 631 ctx->cred = get_rpccred(cred);
635 ctx->state = NULL; 632 ctx->state = NULL;
633 ctx->mode = f_mode;
636 ctx->flags = 0; 634 ctx->flags = 0;
637 ctx->error = 0; 635 ctx->error = 0;
638 ctx->dir_cookie = 0; 636 ctx->dir_cookie = 0;
639 nfs_init_lock_context(&ctx->lock_context); 637 nfs_init_lock_context(&ctx->lock_context);
640 ctx->lock_context.open_context = ctx; 638 ctx->lock_context.open_context = ctx;
639 INIT_LIST_HEAD(&ctx->list);
641 } 640 }
642 return ctx; 641 return ctx;
643} 642}
@@ -653,11 +652,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
653{ 652{
654 struct inode *inode = ctx->path.dentry->d_inode; 653 struct inode *inode = ctx->path.dentry->d_inode;
655 654
656 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) 655 if (!list_empty(&ctx->list)) {
656 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
657 return;
658 list_del(&ctx->list);
659 spin_unlock(&inode->i_lock);
660 } else if (!atomic_dec_and_test(&ctx->lock_context.count))
657 return; 661 return;
658 list_del(&ctx->list); 662 if (inode != NULL)
659 spin_unlock(&inode->i_lock); 663 NFS_PROTO(inode)->close_context(ctx, is_sync);
660 NFS_PROTO(inode)->close_context(ctx, is_sync);
661 if (ctx->cred != NULL) 664 if (ctx->cred != NULL)
662 put_rpccred(ctx->cred); 665 put_rpccred(ctx->cred);
663 path_put(&ctx->path); 666 path_put(&ctx->path);
@@ -673,7 +676,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
673 * Ensure that mmap has a recent RPC credential for use when writing out 676 * Ensure that mmap has a recent RPC credential for use when writing out
674 * shared pages 677 * shared pages
675 */ 678 */
676static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) 679void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
677{ 680{
678 struct inode *inode = filp->f_path.dentry->d_inode; 681 struct inode *inode = filp->f_path.dentry->d_inode;
679 struct nfs_inode *nfsi = NFS_I(inode); 682 struct nfs_inode *nfsi = NFS_I(inode);
@@ -730,11 +733,10 @@ int nfs_open(struct inode *inode, struct file *filp)
730 cred = rpc_lookup_cred(); 733 cred = rpc_lookup_cred();
731 if (IS_ERR(cred)) 734 if (IS_ERR(cred))
732 return PTR_ERR(cred); 735 return PTR_ERR(cred);
733 ctx = alloc_nfs_open_context(&filp->f_path, cred); 736 ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
734 put_rpccred(cred); 737 put_rpccred(cred);
735 if (ctx == NULL) 738 if (ctx == NULL)
736 return -ENOMEM; 739 return -ENOMEM;
737 ctx->mode = filp->f_mode;
738 nfs_file_set_open_context(filp, ctx); 740 nfs_file_set_open_context(filp, ctx);
739 put_nfs_open_context(ctx); 741 put_nfs_open_context(ctx);
740 nfs_fscache_set_inode_cookie(inode, filp); 742 nfs_fscache_set_inode_cookie(inode, filp);
@@ -1409,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode)
1409{ 1411{
1410 truncate_inode_pages(&inode->i_data, 0); 1412 truncate_inode_pages(&inode->i_data, 0);
1411 end_writeback(inode); 1413 end_writeback(inode);
1414 pnfs_destroy_layout(NFS_I(inode));
1412 /* If we are holding a delegation, return it! */ 1415 /* If we are holding a delegation, return it! */
1413 nfs_inode_return_delegation_noreclaim(inode); 1416 nfs_inode_return_delegation_noreclaim(inode);
1414 /* First call standard NFS clear_inode() code */ 1417 /* First call standard NFS clear_inode() code */
@@ -1446,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
1446 nfsi->delegation = NULL; 1449 nfsi->delegation = NULL;
1447 nfsi->delegation_state = 0; 1450 nfsi->delegation_state = 0;
1448 init_rwsem(&nfsi->rwsem); 1451 init_rwsem(&nfsi->rwsem);
1452 nfsi->layout = NULL;
1449#endif 1453#endif
1450} 1454}
1451 1455
@@ -1493,7 +1497,7 @@ static int nfsiod_start(void)
1493{ 1497{
1494 struct workqueue_struct *wq; 1498 struct workqueue_struct *wq;
1495 dprintk("RPC: creating workqueue nfsiod\n"); 1499 dprintk("RPC: creating workqueue nfsiod\n");
1496 wq = create_singlethread_workqueue("nfsiod"); 1500 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
1497 if (wq == NULL) 1501 if (wq == NULL)
1498 return -ENOMEM; 1502 return -ENOMEM;
1499 nfsiod_workqueue = wq; 1503 nfsiod_workqueue = wq;
@@ -1521,6 +1525,10 @@ static int __init init_nfs_fs(void)
1521{ 1525{
1522 int err; 1526 int err;
1523 1527
1528 err = nfs_idmap_init();
1529 if (err < 0)
1530 goto out9;
1531
1524 err = nfs_dns_resolver_init(); 1532 err = nfs_dns_resolver_init();
1525 if (err < 0) 1533 if (err < 0)
1526 goto out8; 1534 goto out8;
@@ -1585,6 +1593,8 @@ out6:
1585out7: 1593out7:
1586 nfs_dns_resolver_destroy(); 1594 nfs_dns_resolver_destroy();
1587out8: 1595out8:
1596 nfs_idmap_quit();
1597out9:
1588 return err; 1598 return err;
1589} 1599}
1590 1600
@@ -1597,6 +1607,7 @@ static void __exit exit_nfs_fs(void)
1597 nfs_destroy_nfspagecache(); 1607 nfs_destroy_nfspagecache();
1598 nfs_fscache_unregister(); 1608 nfs_fscache_unregister();
1599 nfs_dns_resolver_destroy(); 1609 nfs_dns_resolver_destroy();
1610 nfs_idmap_quit();
1600#ifdef CONFIG_PROC_FS 1611#ifdef CONFIG_PROC_FS
1601 rpc_proc_unregister("nfs"); 1612 rpc_proc_unregister("nfs");
1602#endif 1613#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c961bc92c107..db08ff3ff454 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,12 @@ struct nfs_clone_mount {
63#define NFS_UNSPEC_PORT (-1) 63#define NFS_UNSPEC_PORT (-1)
64 64
65/* 65/*
66 * Maximum number of pages that readdir can use for creating
67 * a vmapped array of pages.
68 */
69#define NFS_MAX_READDIR_PAGES 8
70
71/*
66 * In-kernel mount arguments 72 * In-kernel mount arguments
67 */ 73 */
68struct nfs_parsed_mount_data { 74struct nfs_parsed_mount_data {
@@ -181,15 +187,15 @@ extern void nfs_destroy_directcache(void);
181/* nfs2xdr.c */ 187/* nfs2xdr.c */
182extern int nfs_stat_to_errno(int); 188extern int nfs_stat_to_errno(int);
183extern struct rpc_procinfo nfs_procedures[]; 189extern struct rpc_procinfo nfs_procedures[];
184extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); 190extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
185 191
186/* nfs3xdr.c */ 192/* nfs3xdr.c */
187extern struct rpc_procinfo nfs3_procedures[]; 193extern struct rpc_procinfo nfs3_procedures[];
188extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); 194extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
189 195
190/* nfs4xdr.c */ 196/* nfs4xdr.c */
191#ifdef CONFIG_NFS_V4 197#ifdef CONFIG_NFS_V4
192extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 198extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
193#endif 199#endif
194#ifdef CONFIG_NFS_V4_1 200#ifdef CONFIG_NFS_V4_1
195extern const u32 nfs41_maxread_overhead; 201extern const u32 nfs41_maxread_overhead;
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d72..eceafe74f473 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
153 .rpc_resp = &result, 153 .rpc_resp = &result,
154 }; 154 };
155 struct rpc_create_args args = { 155 struct rpc_create_args args = {
156 .net = &init_net,
156 .protocol = info->protocol, 157 .protocol = info->protocol,
157 .address = info->sap, 158 .address = info->sap,
158 .addrsize = info->salen, 159 .addrsize = info->salen,
@@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
224 .to_retries = 2, 225 .to_retries = 2,
225 }; 226 };
226 struct rpc_create_args args = { 227 struct rpc_create_args args = {
228 .net = &init_net,
227 .protocol = IPPROTO_UDP, 229 .protocol = IPPROTO_UDP,
228 .address = info->sap, 230 .address = info->sap,
229 .addrsize = info->salen, 231 .addrsize = info->salen,
@@ -436,7 +438,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
436 438
437 for (i = 0; i < entries; i++) { 439 for (i = 0; i < entries; i++) {
438 flavors[i] = ntohl(*p++); 440 flavors[i] = ntohl(*p++);
439 dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]); 441 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
440 } 442 }
441 *count = i; 443 *count = i;
442 444
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index db8846a0e82e..e6bf45710cc7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -337,10 +337,10 @@ nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
337static int 337static int
338nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) 338nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
339{ 339{
340 p = xdr_encode_fhandle(p, args->fromfh); 340 p = xdr_encode_fhandle(p, args->old_dir);
341 p = xdr_encode_array(p, args->fromname, args->fromlen); 341 p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
342 p = xdr_encode_fhandle(p, args->tofh); 342 p = xdr_encode_fhandle(p, args->new_dir);
343 p = xdr_encode_array(p, args->toname, args->tolen); 343 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
344 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 344 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
345 return 0; 345 return 0;
346} 346}
@@ -423,9 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
423 struct page **page; 423 struct page **page;
424 size_t hdrlen; 424 size_t hdrlen;
425 unsigned int pglen, recvd; 425 unsigned int pglen, recvd;
426 u32 len;
427 int status, nr = 0; 426 int status, nr = 0;
428 __be32 *end, *entry, *kaddr;
429 427
430 if ((status = ntohl(*p++))) 428 if ((status = ntohl(*p++)))
431 return nfs_stat_to_errno(status); 429 return nfs_stat_to_errno(status);
@@ -445,80 +443,59 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
445 if (pglen > recvd) 443 if (pglen > recvd)
446 pglen = recvd; 444 pglen = recvd;
447 page = rcvbuf->pages; 445 page = rcvbuf->pages;
448 kaddr = p = kmap_atomic(*page, KM_USER0);
449 end = (__be32 *)((char *)p + pglen);
450 entry = p;
451
452 /* Make sure the packet actually has a value_follows and EOF entry */
453 if ((entry + 1) > end)
454 goto short_pkt;
455
456 for (; *p++; nr++) {
457 if (p + 2 > end)
458 goto short_pkt;
459 p++; /* fileid */
460 len = ntohl(*p++);
461 p += XDR_QUADLEN(len) + 1; /* name plus cookie */
462 if (len > NFS2_MAXNAMLEN) {
463 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
464 len);
465 goto err_unmap;
466 }
467 if (p + 2 > end)
468 goto short_pkt;
469 entry = p;
470 }
471
472 /*
473 * Apparently some server sends responses that are a valid size, but
474 * contain no entries, and have value_follows==0 and EOF==0. For
475 * those, just set the EOF marker.
476 */
477 if (!nr && entry[1] == 0) {
478 dprintk("NFS: readdir reply truncated!\n");
479 entry[1] = 1;
480 }
481 out:
482 kunmap_atomic(kaddr, KM_USER0);
483 return nr; 446 return nr;
484 short_pkt: 447}
485 /* 448
486 * When we get a short packet there are 2 possibilities. We can 449static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
487 * return an error, or fix up the response to look like a valid 450{
488 * response and return what we have so far. If there are no 451 dprintk("nfs: %s: prematurely hit end of receive buffer. "
489 * entries and the packet was short, then return -EIO. If there 452 "Remaining buffer length is %tu words.\n",
490 * are valid entries in the response, return them and pretend that 453 func, xdr->end - xdr->p);
491 * the call was successful, but incomplete. The caller can retry the
492 * readdir starting at the last cookie.
493 */
494 entry[0] = entry[1] = 0;
495 if (!nr)
496 nr = -errno_NFSERR_IO;
497 goto out;
498err_unmap:
499 nr = -errno_NFSERR_IO;
500 goto out;
501} 454}
502 455
503__be32 * 456__be32 *
504nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 457nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
505{ 458{
506 if (!*p++) { 459 __be32 *p;
507 if (!*p) 460 p = xdr_inline_decode(xdr, 4);
461 if (unlikely(!p))
462 goto out_overflow;
463 if (!ntohl(*p++)) {
464 p = xdr_inline_decode(xdr, 4);
465 if (unlikely(!p))
466 goto out_overflow;
467 if (!ntohl(*p++))
508 return ERR_PTR(-EAGAIN); 468 return ERR_PTR(-EAGAIN);
509 entry->eof = 1; 469 entry->eof = 1;
510 return ERR_PTR(-EBADCOOKIE); 470 return ERR_PTR(-EBADCOOKIE);
511 } 471 }
512 472
473 p = xdr_inline_decode(xdr, 8);
474 if (unlikely(!p))
475 goto out_overflow;
476
513 entry->ino = ntohl(*p++); 477 entry->ino = ntohl(*p++);
514 entry->len = ntohl(*p++); 478 entry->len = ntohl(*p++);
479
480 p = xdr_inline_decode(xdr, entry->len + 4);
481 if (unlikely(!p))
482 goto out_overflow;
515 entry->name = (const char *) p; 483 entry->name = (const char *) p;
516 p += XDR_QUADLEN(entry->len); 484 p += XDR_QUADLEN(entry->len);
517 entry->prev_cookie = entry->cookie; 485 entry->prev_cookie = entry->cookie;
518 entry->cookie = ntohl(*p++); 486 entry->cookie = ntohl(*p++);
519 entry->eof = !p[0] && p[1]; 487
488 p = xdr_inline_peek(xdr, 8);
489 if (p != NULL)
490 entry->eof = !p[0] && p[1];
491 else
492 entry->eof = 0;
520 493
521 return p; 494 return p;
495
496out_overflow:
497 print_overflow_msg(__func__, xdr);
498 return ERR_PTR(-EIO);
522} 499}
523 500
524/* 501/*
@@ -596,7 +573,6 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
596 struct kvec *iov = rcvbuf->head; 573 struct kvec *iov = rcvbuf->head;
597 size_t hdrlen; 574 size_t hdrlen;
598 u32 len, recvd; 575 u32 len, recvd;
599 char *kaddr;
600 int status; 576 int status;
601 577
602 if ((status = ntohl(*p++))) 578 if ((status = ntohl(*p++)))
@@ -623,10 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
623 return -EIO; 599 return -EIO;
624 } 600 }
625 601
626 /* NULL terminate the string we got */ 602 xdr_terminate_string(rcvbuf, len);
627 kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
628 kaddr[len+rcvbuf->page_base] = '\0';
629 kunmap_atomic(kaddr, KM_USER0);
630 return 0; 603 return 0;
631} 604}
632 605
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fabb4f2849a1..ce939c062a52 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
313 */ 313 */
314static int 314static int
315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
316 int flags, struct nameidata *nd) 316 int flags, struct nfs_open_context *ctx)
317{ 317{
318 struct nfs3_createdata *data; 318 struct nfs3_createdata *data;
319 mode_t mode = sattr->ia_mode; 319 mode_t mode = sattr->ia_mode;
@@ -438,19 +438,38 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
438 return 1; 438 return 1;
439} 439}
440 440
441static void
442nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
443{
444 msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
445}
446
447static int
448nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
449 struct inode *new_dir)
450{
451 struct nfs_renameres *res;
452
453 if (nfs3_async_handle_jukebox(task, old_dir))
454 return 0;
455 res = task->tk_msg.rpc_resp;
456
457 nfs_post_op_update_inode(old_dir, res->old_fattr);
458 nfs_post_op_update_inode(new_dir, res->new_fattr);
459 return 1;
460}
461
441static int 462static int
442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 463nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
443 struct inode *new_dir, struct qstr *new_name) 464 struct inode *new_dir, struct qstr *new_name)
444{ 465{
445 struct nfs3_renameargs arg = { 466 struct nfs_renameargs arg = {
446 .fromfh = NFS_FH(old_dir), 467 .old_dir = NFS_FH(old_dir),
447 .fromname = old_name->name, 468 .old_name = old_name,
448 .fromlen = old_name->len, 469 .new_dir = NFS_FH(new_dir),
449 .tofh = NFS_FH(new_dir), 470 .new_name = new_name,
450 .toname = new_name->name,
451 .tolen = new_name->len
452 }; 471 };
453 struct nfs3_renameres res; 472 struct nfs_renameres res;
454 struct rpc_message msg = { 473 struct rpc_message msg = {
455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 474 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
456 .rpc_argp = &arg, 475 .rpc_argp = &arg,
@@ -460,17 +479,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
460 479
461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 480 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
462 481
463 res.fromattr = nfs_alloc_fattr(); 482 res.old_fattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr(); 483 res.new_fattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL) 484 if (res.old_fattr == NULL || res.new_fattr == NULL)
466 goto out; 485 goto out;
467 486
468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 487 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
469 nfs_post_op_update_inode(old_dir, res.fromattr); 488 nfs_post_op_update_inode(old_dir, res.old_fattr);
470 nfs_post_op_update_inode(new_dir, res.toattr); 489 nfs_post_op_update_inode(new_dir, res.new_fattr);
471out: 490out:
472 nfs_free_fattr(res.toattr); 491 nfs_free_fattr(res.old_fattr);
473 nfs_free_fattr(res.fromattr); 492 nfs_free_fattr(res.new_fattr);
474 dprintk("NFS reply rename: %d\n", status); 493 dprintk("NFS reply rename: %d\n", status);
475 return status; 494 return status;
476} 495}
@@ -611,7 +630,7 @@ out:
611 */ 630 */
612static int 631static int
613nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 632nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
614 u64 cookie, struct page *page, unsigned int count, int plus) 633 u64 cookie, struct page **pages, unsigned int count, int plus)
615{ 634{
616 struct inode *dir = dentry->d_inode; 635 struct inode *dir = dentry->d_inode;
617 __be32 *verf = NFS_COOKIEVERF(dir); 636 __be32 *verf = NFS_COOKIEVERF(dir);
@@ -621,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
621 .verf = {verf[0], verf[1]}, 640 .verf = {verf[0], verf[1]},
622 .plus = plus, 641 .plus = plus,
623 .count = count, 642 .count = count,
624 .pages = &page 643 .pages = pages
625 }; 644 };
626 struct nfs3_readdirres res = { 645 struct nfs3_readdirres res = {
627 .verf = verf, 646 .verf = verf,
@@ -652,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
652 671
653 nfs_free_fattr(res.dir_attr); 672 nfs_free_fattr(res.dir_attr);
654out: 673out:
655 dprintk("NFS reply readdir: %d\n", status); 674 dprintk("NFS reply readdir%s: %d\n",
675 plus? "plus" : "", status);
656 return status; 676 return status;
657} 677}
658 678
@@ -722,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
722 dprintk("NFS call fsstat\n"); 742 dprintk("NFS call fsstat\n");
723 nfs_fattr_init(stat->fattr); 743 nfs_fattr_init(stat->fattr);
724 status = rpc_call_sync(server->client, &msg, 0); 744 status = rpc_call_sync(server->client, &msg, 0);
725 dprintk("NFS reply statfs: %d\n", status); 745 dprintk("NFS reply fsstat: %d\n", status);
726 return status; 746 return status;
727} 747}
728 748
@@ -844,6 +864,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
844 .unlink_setup = nfs3_proc_unlink_setup, 864 .unlink_setup = nfs3_proc_unlink_setup,
845 .unlink_done = nfs3_proc_unlink_done, 865 .unlink_done = nfs3_proc_unlink_done,
846 .rename = nfs3_proc_rename, 866 .rename = nfs3_proc_rename,
867 .rename_setup = nfs3_proc_rename_setup,
868 .rename_done = nfs3_proc_rename_done,
847 .link = nfs3_proc_link, 869 .link = nfs3_proc_link,
848 .symlink = nfs3_proc_symlink, 870 .symlink = nfs3_proc_symlink,
849 .mkdir = nfs3_proc_mkdir, 871 .mkdir = nfs3_proc_mkdir,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9769704f8ce6..d9a5e832c257 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -100,6 +100,13 @@ static const umode_t nfs_type2fmt[] = {
100 [NF3FIFO] = S_IFIFO, 100 [NF3FIFO] = S_IFIFO,
101}; 101};
102 102
103static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
104{
105 dprintk("nfs: %s: prematurely hit end of receive buffer. "
106 "Remaining buffer length is %tu words.\n",
107 func, xdr->end - xdr->p);
108}
109
103/* 110/*
104 * Common NFS XDR functions as inlines 111 * Common NFS XDR functions as inlines
105 */ 112 */
@@ -119,6 +126,29 @@ xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
119 return NULL; 126 return NULL;
120} 127}
121 128
129static inline __be32 *
130xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
131{
132 __be32 *p;
133 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(!p))
135 goto out_overflow;
136 fh->size = ntohl(*p++);
137
138 if (fh->size <= NFS3_FHSIZE) {
139 p = xdr_inline_decode(xdr, fh->size);
140 if (unlikely(!p))
141 goto out_overflow;
142 memcpy(fh->data, p, fh->size);
143 return p + XDR_QUADLEN(fh->size);
144 }
145 return NULL;
146
147out_overflow:
148 print_overflow_msg(__func__, xdr);
149 return ERR_PTR(-EIO);
150}
151
122/* 152/*
123 * Encode/decode time. 153 * Encode/decode time.
124 */ 154 */
@@ -241,6 +271,26 @@ xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
241} 271}
242 272
243static inline __be32 * 273static inline __be32 *
274xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
275{
276 __be32 *p;
277
278 p = xdr_inline_decode(xdr, 4);
279 if (unlikely(!p))
280 goto out_overflow;
281 if (ntohl(*p++)) {
282 p = xdr_inline_decode(xdr, 84);
283 if (unlikely(!p))
284 goto out_overflow;
285 p = xdr_decode_fattr(p, fattr);
286 }
287 return p;
288out_overflow:
289 print_overflow_msg(__func__, xdr);
290 return ERR_PTR(-EIO);
291}
292
293static inline __be32 *
244xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr) 294xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
245{ 295{
246 if (*p++) 296 if (*p++)
@@ -442,12 +492,12 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
442 * Encode RENAME arguments 492 * Encode RENAME arguments
443 */ 493 */
444static int 494static int
445nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args) 495nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
446{ 496{
447 p = xdr_encode_fhandle(p, args->fromfh); 497 p = xdr_encode_fhandle(p, args->old_dir);
448 p = xdr_encode_array(p, args->fromname, args->fromlen); 498 p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
449 p = xdr_encode_fhandle(p, args->tofh); 499 p = xdr_encode_fhandle(p, args->new_dir);
450 p = xdr_encode_array(p, args->toname, args->tolen); 500 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
451 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 501 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
452 return 0; 502 return 0;
453} 503}
@@ -504,9 +554,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
504 struct kvec *iov = rcvbuf->head; 554 struct kvec *iov = rcvbuf->head;
505 struct page **page; 555 struct page **page;
506 size_t hdrlen; 556 size_t hdrlen;
507 u32 len, recvd, pglen; 557 u32 recvd, pglen;
508 int status, nr = 0; 558 int status, nr = 0;
509 __be32 *entry, *end, *kaddr;
510 559
511 status = ntohl(*p++); 560 status = ntohl(*p++);
512 /* Decode post_op_attrs */ 561 /* Decode post_op_attrs */
@@ -536,99 +585,38 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
536 if (pglen > recvd) 585 if (pglen > recvd)
537 pglen = recvd; 586 pglen = recvd;
538 page = rcvbuf->pages; 587 page = rcvbuf->pages;
539 kaddr = p = kmap_atomic(*page, KM_USER0);
540 end = (__be32 *)((char *)p + pglen);
541 entry = p;
542
543 /* Make sure the packet actually has a value_follows and EOF entry */
544 if ((entry + 1) > end)
545 goto short_pkt;
546
547 for (; *p++; nr++) {
548 if (p + 3 > end)
549 goto short_pkt;
550 p += 2; /* inode # */
551 len = ntohl(*p++); /* string length */
552 p += XDR_QUADLEN(len) + 2; /* name + cookie */
553 if (len > NFS3_MAXNAMLEN) {
554 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
555 len);
556 goto err_unmap;
557 }
558 588
559 if (res->plus) {
560 /* post_op_attr */
561 if (p + 2 > end)
562 goto short_pkt;
563 if (*p++) {
564 p += 21;
565 if (p + 1 > end)
566 goto short_pkt;
567 }
568 /* post_op_fh3 */
569 if (*p++) {
570 if (p + 1 > end)
571 goto short_pkt;
572 len = ntohl(*p++);
573 if (len > NFS3_FHSIZE) {
574 dprintk("NFS: giant filehandle in "
575 "readdir (len 0x%x)!\n", len);
576 goto err_unmap;
577 }
578 p += XDR_QUADLEN(len);
579 }
580 }
581
582 if (p + 2 > end)
583 goto short_pkt;
584 entry = p;
585 }
586
587 /*
588 * Apparently some server sends responses that are a valid size, but
589 * contain no entries, and have value_follows==0 and EOF==0. For
590 * those, just set the EOF marker.
591 */
592 if (!nr && entry[1] == 0) {
593 dprintk("NFS: readdir reply truncated!\n");
594 entry[1] = 1;
595 }
596 out:
597 kunmap_atomic(kaddr, KM_USER0);
598 return nr; 589 return nr;
599 short_pkt:
600 /*
601 * When we get a short packet there are 2 possibilities. We can
602 * return an error, or fix up the response to look like a valid
603 * response and return what we have so far. If there are no
604 * entries and the packet was short, then return -EIO. If there
605 * are valid entries in the response, return them and pretend that
606 * the call was successful, but incomplete. The caller can retry the
607 * readdir starting at the last cookie.
608 */
609 entry[0] = entry[1] = 0;
610 if (!nr)
611 nr = -errno_NFSERR_IO;
612 goto out;
613err_unmap:
614 nr = -errno_NFSERR_IO;
615 goto out;
616} 590}
617 591
618__be32 * 592__be32 *
619nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 593nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
620{ 594{
595 __be32 *p;
621 struct nfs_entry old = *entry; 596 struct nfs_entry old = *entry;
622 597
623 if (!*p++) { 598 p = xdr_inline_decode(xdr, 4);
624 if (!*p) 599 if (unlikely(!p))
600 goto out_overflow;
601 if (!ntohl(*p++)) {
602 p = xdr_inline_decode(xdr, 4);
603 if (unlikely(!p))
604 goto out_overflow;
605 if (!ntohl(*p++))
625 return ERR_PTR(-EAGAIN); 606 return ERR_PTR(-EAGAIN);
626 entry->eof = 1; 607 entry->eof = 1;
627 return ERR_PTR(-EBADCOOKIE); 608 return ERR_PTR(-EBADCOOKIE);
628 } 609 }
629 610
611 p = xdr_inline_decode(xdr, 12);
612 if (unlikely(!p))
613 goto out_overflow;
630 p = xdr_decode_hyper(p, &entry->ino); 614 p = xdr_decode_hyper(p, &entry->ino);
631 entry->len = ntohl(*p++); 615 entry->len = ntohl(*p++);
616
617 p = xdr_inline_decode(xdr, entry->len + 8);
618 if (unlikely(!p))
619 goto out_overflow;
632 entry->name = (const char *) p; 620 entry->name = (const char *) p;
633 p += XDR_QUADLEN(entry->len); 621 p += XDR_QUADLEN(entry->len);
634 entry->prev_cookie = entry->cookie; 622 entry->prev_cookie = entry->cookie;
@@ -636,10 +624,17 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
636 624
637 if (plus) { 625 if (plus) {
638 entry->fattr->valid = 0; 626 entry->fattr->valid = 0;
639 p = xdr_decode_post_op_attr(p, entry->fattr); 627 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
628 if (IS_ERR(p))
629 goto out_overflow_exit;
640 /* In fact, a post_op_fh3: */ 630 /* In fact, a post_op_fh3: */
631 p = xdr_inline_decode(xdr, 4);
632 if (unlikely(!p))
633 goto out_overflow;
641 if (*p++) { 634 if (*p++) {
642 p = xdr_decode_fhandle(p, entry->fh); 635 p = xdr_decode_fhandle_stream(xdr, entry->fh);
636 if (IS_ERR(p))
637 goto out_overflow_exit;
643 /* Ugh -- server reply was truncated */ 638 /* Ugh -- server reply was truncated */
644 if (p == NULL) { 639 if (p == NULL) {
645 dprintk("NFS: FH truncated\n"); 640 dprintk("NFS: FH truncated\n");
@@ -650,8 +645,18 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
650 memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); 645 memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
651 } 646 }
652 647
653 entry->eof = !p[0] && p[1]; 648 p = xdr_inline_peek(xdr, 8);
649 if (p != NULL)
650 entry->eof = !p[0] && p[1];
651 else
652 entry->eof = 0;
653
654 return p; 654 return p;
655
656out_overflow:
657 print_overflow_msg(__func__, xdr);
658out_overflow_exit:
659 return ERR_PTR(-EIO);
655} 660}
656 661
657/* 662/*
@@ -824,7 +829,6 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
824 struct kvec *iov = rcvbuf->head; 829 struct kvec *iov = rcvbuf->head;
825 size_t hdrlen; 830 size_t hdrlen;
826 u32 len, recvd; 831 u32 len, recvd;
827 char *kaddr;
828 int status; 832 int status;
829 833
830 status = ntohl(*p++); 834 status = ntohl(*p++);
@@ -857,10 +861,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
857 return -EIO; 861 return -EIO;
858 } 862 }
859 863
860 /* NULL terminate the string we got */ 864 xdr_terminate_string(rcvbuf, len);
861 kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0);
862 kaddr[len+rcvbuf->page_base] = '\0';
863 kunmap_atomic(kaddr, KM_USER0);
864 return 0; 865 return 0;
865} 866}
866 867
@@ -970,14 +971,14 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
970 * Decode RENAME reply 971 * Decode RENAME reply
971 */ 972 */
972static int 973static int
973nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) 974nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
974{ 975{
975 int status; 976 int status;
976 977
977 if ((status = ntohl(*p++)) != 0) 978 if ((status = ntohl(*p++)) != 0)
978 status = nfs_stat_to_errno(status); 979 status = nfs_stat_to_errno(status);
979 p = xdr_decode_wcc_data(p, res->fromattr); 980 p = xdr_decode_wcc_data(p, res->old_fattr);
980 p = xdr_decode_wcc_data(p, res->toattr); 981 p = xdr_decode_wcc_data(p, res->new_fattr);
981 return status; 982 return status;
982} 983}
983 984
@@ -1043,8 +1044,9 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
1043 res->wtmult = ntohl(*p++); 1044 res->wtmult = ntohl(*p++);
1044 res->dtpref = ntohl(*p++); 1045 res->dtpref = ntohl(*p++);
1045 p = xdr_decode_hyper(p, &res->maxfilesize); 1046 p = xdr_decode_hyper(p, &res->maxfilesize);
1047 p = xdr_decode_time3(p, &res->time_delta);
1046 1048
1047 /* ignore time_delta and properties */ 1049 /* ignore properties */
1048 res->lease_time = 0; 1050 res->lease_time = 0;
1049 return 0; 1051 return 0;
1050} 1052}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 311e15cc8af0..9fa496387fdf 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -242,8 +242,6 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); 244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
245extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
246extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
247extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 245extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
248extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 246extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
249 struct nfs4_fs_locations *fs_locations, struct page *page); 247 struct nfs4_fs_locations *fs_locations, struct page *page);
@@ -333,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
333extern const nfs4_stateid zero_stateid; 331extern const nfs4_stateid zero_stateid;
334 332
335/* nfs4xdr.c */ 333/* nfs4xdr.c */
336extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 334extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
337extern struct rpc_procinfo nfs4_procedures[]; 335extern struct rpc_procinfo nfs4_procedures[];
338 336
339struct nfs4_mount_data; 337struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 000000000000..2e92f0d8d654
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,280 @@
1/*
2 * Module for the pnfs nfs4 file layout driver.
3 * Defines all I/O and Policy interface operations, plus code
4 * to register itself with the pNFS client.
5 *
6 * Copyright (c) 2002
7 * The Regents of the University of Michigan
8 * All Rights Reserved
9 *
10 * Dean Hildebrand <dhildebz@umich.edu>
11 *
12 * Permission is granted to use, copy, create derivative works, and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the University of Michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. If
17 * the above copyright notice or any other identification of the
18 * University of Michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * This software is provided as is, without representation or warranty
22 * of any kind either express or implied, including without limitation
23 * the implied warranties of merchantability, fitness for a particular
24 * purpose, or noninfringement. The Regents of the University of
25 * Michigan shall not be liable for any damages, including special,
26 * indirect, incidental, or consequential damages, with respect to any
27 * claim arising out of or in connection with the use of the software,
28 * even if it has been or is hereafter advised of the possibility of
29 * such damages.
30 */
31
32#include <linux/nfs_fs.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42
43static int
44filelayout_set_layoutdriver(struct nfs_server *nfss)
45{
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
47 nfs4_fl_free_deviceid_callback);
48 if (status) {
49 printk(KERN_WARNING "%s: deviceid cache could not be "
50 "initialized\n", __func__);
51 return status;
52 }
53 dprintk("%s: deviceid cache has been initialized successfully\n",
54 __func__);
55 return 0;
56}
57
58/* Clear out the layout by destroying its device list */
59static int
60filelayout_clear_layoutdriver(struct nfs_server *nfss)
61{
62 dprintk("--> %s\n", __func__);
63
64 if (nfss->nfs_client->cl_devid_cache)
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0;
67}
68
69/*
70 * filelayout_check_layout()
71 *
72 * Make sure layout segment parameters are sane WRT the device.
73 * At this point no generic layer initialization of the lseg has occurred,
74 * and nothing has been added to the layout_hdr cache.
75 *
76 */
77static int
78filelayout_check_layout(struct pnfs_layout_hdr *lo,
79 struct nfs4_filelayout_segment *fl,
80 struct nfs4_layoutget_res *lgr,
81 struct nfs4_deviceid *id)
82{
83 struct nfs4_file_layout_dsaddr *dsaddr;
84 int status = -EINVAL;
85 struct nfs_server *nfss = NFS_SERVER(lo->inode);
86
87 dprintk("--> %s\n", __func__);
88
89 if (fl->pattern_offset > lgr->range.offset) {
90 dprintk("%s pattern_offset %lld to large\n",
91 __func__, fl->pattern_offset);
92 goto out;
93 }
94
95 if (fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n",
97 __func__, fl->stripe_unit);
98 goto out;
99 }
100
101 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
103 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->inode, id);
105 if (dsaddr == NULL)
106 goto out;
107 }
108 fl->dsaddr = dsaddr;
109
110 if (fl->first_stripe_index < 0 ||
111 fl->first_stripe_index >= dsaddr->stripe_count) {
112 dprintk("%s Bad first_stripe_index %d\n",
113 __func__, fl->first_stripe_index);
114 goto out_put;
115 }
116
117 if ((fl->stripe_type == STRIPE_SPARSE &&
118 fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
119 (fl->stripe_type == STRIPE_DENSE &&
120 fl->num_fh != dsaddr->stripe_count)) {
121 dprintk("%s num_fh %u not valid for given packing\n",
122 __func__, fl->num_fh);
123 goto out_put;
124 }
125
126 if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
127 dprintk("%s Stripe unit (%u) not aligned with rsize %u "
128 "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
129 nfss->wsize);
130 }
131
132 status = 0;
133out:
134 dprintk("--> %s returns %d\n", __func__, status);
135 return status;
136out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
138 goto out;
139}
140
141static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
142{
143 int i;
144
145 for (i = 0; i < fl->num_fh; i++) {
146 if (!fl->fh_array[i])
147 break;
148 kfree(fl->fh_array[i]);
149 }
150 kfree(fl->fh_array);
151 fl->fh_array = NULL;
152}
153
154static void
155_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
156{
157 filelayout_free_fh_array(fl);
158 kfree(fl);
159}
160
161static int
162filelayout_decode_layout(struct pnfs_layout_hdr *flo,
163 struct nfs4_filelayout_segment *fl,
164 struct nfs4_layoutget_res *lgr,
165 struct nfs4_deviceid *id)
166{
167 uint32_t *p = (uint32_t *)lgr->layout.buf;
168 uint32_t nfl_util;
169 int i;
170
171 dprintk("%s: set_layout_map Begin\n", __func__);
172
173 memcpy(id, p, sizeof(*id));
174 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
175 print_deviceid(id);
176
177 nfl_util = be32_to_cpup(p++);
178 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
179 fl->commit_through_mds = 1;
180 if (nfl_util & NFL4_UFLG_DENSE)
181 fl->stripe_type = STRIPE_DENSE;
182 else
183 fl->stripe_type = STRIPE_SPARSE;
184 fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
185
186 fl->first_stripe_index = be32_to_cpup(p++);
187 p = xdr_decode_hyper(p, &fl->pattern_offset);
188 fl->num_fh = be32_to_cpup(p++);
189
190 dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
191 __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
192 fl->pattern_offset);
193
194 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
195 GFP_KERNEL);
196 if (!fl->fh_array)
197 return -ENOMEM;
198
199 for (i = 0; i < fl->num_fh; i++) {
200 /* Do we want to use a mempool here? */
201 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
202 if (!fl->fh_array[i]) {
203 filelayout_free_fh_array(fl);
204 return -ENOMEM;
205 }
206 fl->fh_array[i]->size = be32_to_cpup(p++);
207 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
208 printk(KERN_ERR "Too big fh %d received %d\n",
209 i, fl->fh_array[i]->size);
210 filelayout_free_fh_array(fl);
211 return -EIO;
212 }
213 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
214 p += XDR_QUADLEN(fl->fh_array[i]->size);
215 dprintk("DEBUG: %s: fh len %d\n", __func__,
216 fl->fh_array[i]->size);
217 }
218
219 return 0;
220}
221
222static struct pnfs_layout_segment *
223filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
224 struct nfs4_layoutget_res *lgr)
225{
226 struct nfs4_filelayout_segment *fl;
227 int rc;
228 struct nfs4_deviceid id;
229
230 dprintk("--> %s\n", __func__);
231 fl = kzalloc(sizeof(*fl), GFP_KERNEL);
232 if (!fl)
233 return NULL;
234
235 rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
236 if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
237 _filelayout_free_lseg(fl);
238 return NULL;
239 }
240 return &fl->generic_hdr;
241}
242
243static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{
246 struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248
249 dprintk("--> %s\n", __func__);
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
251 &fl->dsaddr->deviceid);
252 _filelayout_free_lseg(fl);
253}
254
255static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver,
260 .clear_layoutdriver = filelayout_clear_layoutdriver,
261 .alloc_lseg = filelayout_alloc_lseg,
262 .free_lseg = filelayout_free_lseg,
263};
264
265static int __init nfs4filelayout_init(void)
266{
267 printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
268 __func__);
269 return pnfs_register_layoutdriver(&filelayout_type);
270}
271
272static void __exit nfs4filelayout_exit(void)
273{
274 printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
275 __func__);
276 pnfs_unregister_layoutdriver(&filelayout_type);
277}
278
279module_init(nfs4filelayout_init);
280module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 000000000000..bbf60dd2ab9d
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
1/*
2 * NFSv4 file layout driver data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_NFS4FILELAYOUT_H
31#define FS_NFS_NFS4FILELAYOUT_H
32
33#include "pnfs.h"
34
35/*
36 * Field testing shows we need to support up to 4096 stripe indices.
37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
38 * reasonable. This in turn means we support a maximum of 256
39 * RFC 5661 multipath_list4 structures.
40 */
41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
43
/*
 * Stripe packing types.
 * NOTE(review): presumably the values carried in
 * nfs4_filelayout_segment.stripe_type - confirm against the layout decoder.
 */
44enum stripetype4 {
45 STRIPE_SPARSE = 1,
46 STRIPE_DENSE = 2
47};
48
49/* Individual ip address */
/*
 * One cached data server, keyed by (ds_ip_addr, ds_port). Entries are
 * created or re-used by nfs4_pnfs_ds_add() and destroyed when ds_count
 * drops to zero in nfs4_fl_free_deviceid().
 */
50struct nfs4_pnfs_ds {
51 struct list_head ds_node; /* entry in nfs4_data_server_cache */
52 u32 ds_ip_addr; /* as written by in4_pton(); printed through ntohl() */
53 u32 ds_port; /* stored from htons(); printed through ntohs() */
54 struct nfs_client *ds_clp; /* NULL on creation; put in destroy_ds() if set */
55 atomic_t ds_count; /* 1 on allocation, +1 per extra device id using it */
56};
57
/*
 * Decoded GETDEVICEINFO result for one device id, built by decode_device().
 * The structure is allocated with room for ds_num pointers in ds_list, so
 * ds_list must stay the last member.
 */
58struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid; /* node handed to the device id cache */
60 u32 stripe_count; /* number of entries in stripe_indices */
61 u8 *stripe_indices; /* kzalloc'ed; each entry is an index below ds_num */
62 u32 ds_num; /* number of entries in ds_list */
63 struct nfs4_pnfs_ds *ds_list[1]; /* really ds_num entries, see above */
64};
65
/*
 * File layout segment. The generic pnfs_layout_segment is embedded as
 * generic_hdr; FILELAYOUT_LSEG() converts a pointer to it back into a
 * pointer to this structure.
 */
66struct nfs4_filelayout_segment {
67 struct pnfs_layout_segment generic_hdr; /* returned by filelayout_alloc_lseg() */
68 u32 stripe_type;
69 u32 commit_through_mds;
70 u32 stripe_unit;
71 u32 first_stripe_index;
72 u64 pattern_offset;
73 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
74 unsigned int num_fh; /* NOTE(review): presumably the length of fh_array - confirm */
75 struct nfs_fh **fh_array;
76};
77
78static inline struct nfs4_filelayout_segment *
79FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_filelayout_segment,
83 generic_hdr);
84}
85
/* Device id and data server helpers implemented in nfs4filelayoutdev.c */
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
87extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id);
/* Returns the cached device for dev_id, or NULL if it is not cached */
89extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
/* Issues GETDEVICEINFO for dev_id and decodes the reply; NULL on failure */
91struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93
94#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 000000000000..51fe64ace55a
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,448 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39/*
40 * Data server cache
41 *
42 * Data servers can be mapped to different device ids.
43 * nfs4_pnfs_ds reference counting
44 * - set to 1 on allocation
45 * - incremented when a device id maps a data server already in the cache.
46 * - decremented when deviceid is removed from the cache.
47 */
/*
 * nfs4_ds_cache_lock is held around every lookup, insertion and removal on
 * nfs4_data_server_cache, the list of all known nfs4_pnfs_ds entries.
 */
48DEFINE_SPINLOCK(nfs4_ds_cache_lock);
49static LIST_HEAD(nfs4_data_server_cache);
50
51/* Debug routines */
52void
53print_ds(struct nfs4_pnfs_ds *ds)
54{
55 if (ds == NULL) {
56 printk("%s NULL device\n", __func__);
57 return;
58 }
59 printk(" ip_addr %x port %hu\n"
60 " ref count %d\n"
61 " client %p\n"
62 " cl_exchange_flags %x\n",
63 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
64 atomic_read(&ds->ds_count), ds->ds_clp,
65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
66}
67
68void
69print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
70{
71 int i;
72
73 ifdebug(FACILITY) {
74 printk("%s dsaddr->ds_num %d\n", __func__,
75 dsaddr->ds_num);
76 for (i = 0; i < dsaddr->ds_num; i++)
77 print_ds(dsaddr->ds_list[i]);
78 }
79}
80
81void print_deviceid(struct nfs4_deviceid *id)
82{
83 u32 *p = (u32 *)id;
84
85 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
86 p[0], p[1], p[2], p[3]);
87}
88
89/* nfs4_ds_cache_lock is held */
90static struct nfs4_pnfs_ds *
91_data_server_lookup_locked(u32 ip_addr, u32 port)
92{
93 struct nfs4_pnfs_ds *ds;
94
95 dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
96 ntohl(ip_addr), ntohs(port));
97
98 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
99 if (ds->ds_ip_addr == ip_addr &&
100 ds->ds_port == port) {
101 return ds;
102 }
103 }
104 return NULL;
105}
106
107static void
108destroy_ds(struct nfs4_pnfs_ds *ds)
109{
110 dprintk("--> %s\n", __func__);
111 ifdebug(FACILITY)
112 print_ds(ds);
113
114 if (ds->ds_clp)
115 nfs_put_client(ds->ds_clp);
116 kfree(ds);
117}
118
119static void
120nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
121{
122 struct nfs4_pnfs_ds *ds;
123 int i;
124
125 print_deviceid(&dsaddr->deviceid.de_id);
126
127 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i];
129 if (ds != NULL) {
130 if (atomic_dec_and_lock(&ds->ds_count,
131 &nfs4_ds_cache_lock)) {
132 list_del_init(&ds->ds_node);
133 spin_unlock(&nfs4_ds_cache_lock);
134 destroy_ds(ds);
135 }
136 }
137 }
138 kfree(dsaddr->stripe_indices);
139 kfree(dsaddr);
140}
141
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{
154 struct nfs4_pnfs_ds *tmp_ds, *ds;
155
156 ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
157 if (!ds)
158 goto out;
159
160 spin_lock(&nfs4_ds_cache_lock);
161 tmp_ds = _data_server_lookup_locked(ip_addr, port);
162 if (tmp_ds == NULL) {
163 ds->ds_ip_addr = ip_addr;
164 ds->ds_port = port;
165 atomic_set(&ds->ds_count, 1);
166 INIT_LIST_HEAD(&ds->ds_node);
167 ds->ds_clp = NULL;
168 list_add(&ds->ds_node, &nfs4_data_server_cache);
169 dprintk("%s add new data server ip 0x%x\n", __func__,
170 ds->ds_ip_addr);
171 } else {
172 kfree(ds);
173 atomic_inc(&tmp_ds->ds_count);
174 dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
175 __func__, tmp_ds->ds_ip_addr,
176 atomic_read(&tmp_ds->ds_count));
177 ds = tmp_ds;
178 }
179 spin_unlock(&nfs4_ds_cache_lock);
180out:
181 return ds;
182}
183
184/*
185 * Currently only support ipv4, and one multi-path address.
186 */
187static struct nfs4_pnfs_ds *
188decode_and_add_ds(__be32 **pp, struct inode *inode)
189{
190 struct nfs4_pnfs_ds *ds = NULL;
191 char *buf;
192 const char *ipend, *pstr;
193 u32 ip_addr, port;
194 int nlen, rlen, i;
195 int tmp[2];
196 __be32 *r_netid, *r_addr, *p = *pp;
197
198 /* r_netid */
199 nlen = be32_to_cpup(p++);
200 r_netid = p;
201 p += XDR_QUADLEN(nlen);
202
203 /* r_addr */
204 rlen = be32_to_cpup(p++);
205 r_addr = p;
206 p += XDR_QUADLEN(rlen);
207 *pp = p;
208
209 /* Check that netid is "tcp" */
210 if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) {
211 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
212 goto out_err;
213 }
214
215 /* ipv6 length plus port is legal */
216 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s Invalid address, length %d\n", __func__,
218 rlen);
219 goto out_err;
220 }
221 buf = kmalloc(rlen + 1, GFP_KERNEL);
222 buf[rlen] = '\0';
223 memcpy(buf, r_addr, rlen);
224
225 /* replace the port dots with dashes for the in4_pton() delimiter*/
226 for (i = 0; i < 2; i++) {
227 char *res = strrchr(buf, '.');
228 *res = '-';
229 }
230
231 /* Currently only support ipv4 address */
232 if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
233 dprintk("%s: Only ipv4 addresses supported\n", __func__);
234 goto out_free;
235 }
236
237 /* port */
238 pstr = ipend;
239 sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
240 port = htons((tmp[0] << 8) | (tmp[1]));
241
242 ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
243 dprintk("%s Decoded address and port %s\n", __func__, buf);
244out_free:
245 kfree(buf);
246out_err:
247 return ds;
248}
249
250/* Decode opaque device data and return the result */
251static struct nfs4_file_layout_dsaddr*
252decode_device(struct inode *ino, struct pnfs_device *pdev)
253{
254 int i, dummy;
255 u32 cnt, num;
256 u8 *indexp;
257 __be32 *p = (__be32 *)pdev->area, *indicesp;
258 struct nfs4_file_layout_dsaddr *dsaddr;
259
260 /* Get the stripe count (number of stripe index) */
261 cnt = be32_to_cpup(p++);
262 dprintk("%s stripe count %d\n", __func__, cnt);
263 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
264 printk(KERN_WARNING "%s: stripe count %d greater than "
265 "supported maximum %d\n", __func__,
266 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
267 goto out_err;
268 }
269
270 /* Check the multipath list count */
271 indicesp = p;
272 p += XDR_QUADLEN(cnt << 2);
273 num = be32_to_cpup(p++);
274 dprintk("%s ds_num %u\n", __func__, num);
275 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
276 printk(KERN_WARNING "%s: multipath count %d greater than "
277 "supported maximum %d\n", __func__,
278 num, NFS4_PNFS_MAX_MULTI_CNT);
279 goto out_err;
280 }
281 dsaddr = kzalloc(sizeof(*dsaddr) +
282 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
283 GFP_KERNEL);
284 if (!dsaddr)
285 goto out_err;
286
287 dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
288 if (!dsaddr->stripe_indices)
289 goto out_err_free;
290
291 dsaddr->stripe_count = cnt;
292 dsaddr->ds_num = num;
293
294 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
295
296 /* Go back an read stripe indices */
297 p = indicesp;
298 indexp = &dsaddr->stripe_indices[0];
299 for (i = 0; i < dsaddr->stripe_count; i++) {
300 *indexp = be32_to_cpup(p++);
301 if (*indexp >= num)
302 goto out_err_free;
303 indexp++;
304 }
305 /* Skip already read multipath list count */
306 p++;
307
308 for (i = 0; i < dsaddr->ds_num; i++) {
309 int j;
310
311 dummy = be32_to_cpup(p++); /* multipath count */
312 if (dummy > 1) {
313 printk(KERN_WARNING
314 "%s: Multipath count %d not supported, "
315 "skipping all greater than 1\n", __func__,
316 dummy);
317 }
318 for (j = 0; j < dummy; j++) {
319 if (j == 0) {
320 dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
321 if (dsaddr->ds_list[i] == NULL)
322 goto out_err_free;
323 } else {
324 u32 len;
325 /* skip extra multipath */
326 len = be32_to_cpup(p++);
327 p += XDR_QUADLEN(len);
328 len = be32_to_cpup(p++);
329 p += XDR_QUADLEN(len);
330 continue;
331 }
332 }
333 }
334 return dsaddr;
335
336out_err_free:
337 nfs4_fl_free_deviceid(dsaddr);
338out_err:
339 dprintk("%s ERROR: returning NULL\n", __func__);
340 return NULL;
341}
342
343/*
344 * Decode the opaque device specified in 'dev'
345 * and add it to the list of available devices.
346 * If the deviceid is already cached, nfs4_add_deviceid will return
347 * a pointer to the cached struct and throw away the new.
348 */
349static struct nfs4_file_layout_dsaddr*
350decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
351{
352 struct nfs4_file_layout_dsaddr *dsaddr;
353 struct pnfs_deviceid_node *d;
354
355 dsaddr = decode_device(inode, dev);
356 if (!dsaddr) {
357 printk(KERN_WARNING "%s: Could not decode or add device\n",
358 __func__);
359 return NULL;
360 }
361
362 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
363 &dsaddr->deviceid);
364
365 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
366}
367
368/*
369 * Retrieve the information for dev_id, add it to the list
370 * of available devices, and return it.
371 */
372struct nfs4_file_layout_dsaddr *
373get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
374{
375 struct pnfs_device *pdev = NULL;
376 u32 max_resp_sz;
377 int max_pages;
378 struct page **pages = NULL;
379 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
380 int rc, i;
381 struct nfs_server *server = NFS_SERVER(inode);
382
383 /*
384 * Use the session max response size as the basis for setting
385 * GETDEVICEINFO's maxcount
386 */
387 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
388 max_pages = max_resp_sz >> PAGE_SHIFT;
389 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
390 __func__, inode, max_resp_sz, max_pages);
391
392 pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
393 if (pdev == NULL)
394 return NULL;
395
396 pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
397 if (pages == NULL) {
398 kfree(pdev);
399 return NULL;
400 }
401 for (i = 0; i < max_pages; i++) {
402 pages[i] = alloc_page(GFP_KERNEL);
403 if (!pages[i])
404 goto out_free;
405 }
406
407 /* set pdev->area */
408 pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
409 if (!pdev->area)
410 goto out_free;
411
412 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
413 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
414 pdev->pages = pages;
415 pdev->pgbase = 0;
416 pdev->pglen = PAGE_SIZE * max_pages;
417 pdev->mincount = 0;
418
419 rc = nfs4_proc_getdeviceinfo(server, pdev);
420 dprintk("%s getdevice info returns %d\n", __func__, rc);
421 if (rc)
422 goto out_free;
423
424 /*
425 * Found new device, need to decode it and then add it to the
426 * list of known devices for this mountpoint.
427 */
428 dsaddr = decode_and_add_device(inode, pdev);
429out_free:
430 if (pdev->area != NULL)
431 vunmap(pdev->area);
432 for (i = 0; i < max_pages; i++)
433 __free_page(pages[i]);
434 kfree(pages);
435 kfree(pdev);
436 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
437 return dsaddr;
438}
439
440struct nfs4_file_layout_dsaddr *
441nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
442{
443 struct pnfs_deviceid_node *d;
444
445 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
446 return (d == NULL) ? NULL :
447 container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
448}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 089da5b5d20a..0f24cdf2cb13 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -55,6 +55,7 @@
55#include "internal.h" 55#include "internal.h"
56#include "iostat.h" 56#include "iostat.h"
57#include "callback.h" 57#include "callback.h"
58#include "pnfs.h"
58 59
59#define NFSDBG_FACILITY NFSDBG_PROC 60#define NFSDBG_FACILITY NFSDBG_PROC
60 61
@@ -129,7 +130,8 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
129 | FATTR4_WORD0_MAXREAD 130 | FATTR4_WORD0_MAXREAD
130 | FATTR4_WORD0_MAXWRITE 131 | FATTR4_WORD0_MAXWRITE
131 | FATTR4_WORD0_LEASE_TIME, 132 | FATTR4_WORD0_LEASE_TIME,
132 0 133 FATTR4_WORD1_TIME_DELTA
134 | FATTR4_WORD1_FS_LAYOUT_TYPES
133}; 135};
134 136
135const u32 nfs4_fs_locations_bitmap[2] = { 137const u32 nfs4_fs_locations_bitmap[2] = {
@@ -255,9 +257,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
255 nfs4_state_mark_reclaim_nograce(clp, state); 257 nfs4_state_mark_reclaim_nograce(clp, state);
256 goto do_state_recovery; 258 goto do_state_recovery;
257 case -NFS4ERR_STALE_STATEID: 259 case -NFS4ERR_STALE_STATEID:
258 if (state == NULL)
259 break;
260 nfs4_state_mark_reclaim_reboot(clp, state);
261 case -NFS4ERR_STALE_CLIENTID: 260 case -NFS4ERR_STALE_CLIENTID:
262 case -NFS4ERR_EXPIRED: 261 case -NFS4ERR_EXPIRED:
263 goto do_state_recovery; 262 goto do_state_recovery;
@@ -334,10 +333,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
334 * Must be called while holding tbl->slot_tbl_lock 333 * Must be called while holding tbl->slot_tbl_lock
335 */ 334 */
336static void 335static void
337nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) 336nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
338{ 337{
338 int free_slotid = free_slot - tbl->slots;
339 int slotid = free_slotid; 339 int slotid = free_slotid;
340 340
341 BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
341 /* clear used bit in bitmap */ 342 /* clear used bit in bitmap */
342 __clear_bit(slotid, tbl->used_slots); 343 __clear_bit(slotid, tbl->used_slots);
343 344
@@ -379,7 +380,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
379 struct nfs4_slot_table *tbl; 380 struct nfs4_slot_table *tbl;
380 381
381 tbl = &res->sr_session->fc_slot_table; 382 tbl = &res->sr_session->fc_slot_table;
382 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { 383 if (!res->sr_slot) {
383 /* just wake up the next guy waiting since 384 /* just wake up the next guy waiting since
384 * we may have not consumed a slot after all */ 385 * we may have not consumed a slot after all */
385 dprintk("%s: No slot\n", __func__); 386 dprintk("%s: No slot\n", __func__);
@@ -387,17 +388,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
387 } 388 }
388 389
389 spin_lock(&tbl->slot_tbl_lock); 390 spin_lock(&tbl->slot_tbl_lock);
390 nfs4_free_slot(tbl, res->sr_slotid); 391 nfs4_free_slot(tbl, res->sr_slot);
391 nfs41_check_drain_session_complete(res->sr_session); 392 nfs41_check_drain_session_complete(res->sr_session);
392 spin_unlock(&tbl->slot_tbl_lock); 393 spin_unlock(&tbl->slot_tbl_lock);
393 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 394 res->sr_slot = NULL;
394} 395}
395 396
396static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 397static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
397{ 398{
398 unsigned long timestamp; 399 unsigned long timestamp;
399 struct nfs4_slot_table *tbl;
400 struct nfs4_slot *slot;
401 struct nfs_client *clp; 400 struct nfs_client *clp;
402 401
403 /* 402 /*
@@ -410,17 +409,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
410 res->sr_status = NFS_OK; 409 res->sr_status = NFS_OK;
411 410
412 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */ 411 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
413 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) 412 if (!res->sr_slot)
414 goto out; 413 goto out;
415 414
416 tbl = &res->sr_session->fc_slot_table;
417 slot = tbl->slots + res->sr_slotid;
418
419 /* Check the SEQUENCE operation status */ 415 /* Check the SEQUENCE operation status */
420 switch (res->sr_status) { 416 switch (res->sr_status) {
421 case 0: 417 case 0:
422 /* Update the slot's sequence and clientid lease timer */ 418 /* Update the slot's sequence and clientid lease timer */
423 ++slot->seq_nr; 419 ++res->sr_slot->seq_nr;
424 timestamp = res->sr_renewal_time; 420 timestamp = res->sr_renewal_time;
425 clp = res->sr_session->clp; 421 clp = res->sr_session->clp;
426 do_renew_lease(clp, timestamp); 422 do_renew_lease(clp, timestamp);
@@ -433,12 +429,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
433 * returned NFS4ERR_DELAY as per Section 2.10.6.2 429 * returned NFS4ERR_DELAY as per Section 2.10.6.2
434 * of RFC5661. 430 * of RFC5661.
435 */ 431 */
436 dprintk("%s: slot=%d seq=%d: Operation in progress\n", 432 dprintk("%s: slot=%td seq=%d: Operation in progress\n",
437 __func__, res->sr_slotid, slot->seq_nr); 433 __func__,
434 res->sr_slot - res->sr_session->fc_slot_table.slots,
435 res->sr_slot->seq_nr);
438 goto out_retry; 436 goto out_retry;
439 default: 437 default:
440 /* Just update the slot sequence no. */ 438 /* Just update the slot sequence no. */
441 ++slot->seq_nr; 439 ++res->sr_slot->seq_nr;
442 } 440 }
443out: 441out:
444 /* The session may be reset by one of the error handlers. */ 442 /* The session may be reset by one of the error handlers. */
@@ -505,10 +503,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
505 503
506 dprintk("--> %s\n", __func__); 504 dprintk("--> %s\n", __func__);
507 /* slot already allocated? */ 505 /* slot already allocated? */
508 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) 506 if (res->sr_slot != NULL)
509 return 0; 507 return 0;
510 508
511 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
512 tbl = &session->fc_slot_table; 509 tbl = &session->fc_slot_table;
513 510
514 spin_lock(&tbl->slot_tbl_lock); 511 spin_lock(&tbl->slot_tbl_lock);
@@ -550,7 +547,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
550 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); 547 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
551 548
552 res->sr_session = session; 549 res->sr_session = session;
553 res->sr_slotid = slotid; 550 res->sr_slot = slot;
554 res->sr_renewal_time = jiffies; 551 res->sr_renewal_time = jiffies;
555 res->sr_status_flags = 0; 552 res->sr_status_flags = 0;
556 /* 553 /*
@@ -576,8 +573,9 @@ int nfs4_setup_sequence(const struct nfs_server *server,
576 goto out; 573 goto out;
577 } 574 }
578 575
579 dprintk("--> %s clp %p session %p sr_slotid %d\n", 576 dprintk("--> %s clp %p session %p sr_slot %td\n",
580 __func__, session->clp, session, res->sr_slotid); 577 __func__, session->clp, session, res->sr_slot ?
578 res->sr_slot - session->fc_slot_table.slots : -1);
581 579
582 ret = nfs41_setup_sequence(session, args, res, cache_reply, 580 ret = nfs41_setup_sequence(session, args, res, cache_reply,
583 task); 581 task);
@@ -650,7 +648,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
650 .callback_data = &data 648 .callback_data = &data
651 }; 649 };
652 650
653 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 651 res->sr_slot = NULL;
654 if (privileged) 652 if (privileged)
655 task_setup.callback_ops = &nfs41_call_priv_sync_ops; 653 task_setup.callback_ops = &nfs41_call_priv_sync_ops;
656 task = rpc_run_task(&task_setup); 654 task = rpc_run_task(&task_setup);
@@ -735,7 +733,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
735 p->o_res.server = p->o_arg.server; 733 p->o_res.server = p->o_arg.server;
736 nfs_fattr_init(&p->f_attr); 734 nfs_fattr_init(&p->f_attr);
737 nfs_fattr_init(&p->dir_attr); 735 nfs_fattr_init(&p->dir_attr);
738 p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
739} 736}
740 737
741static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 738static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -1120,6 +1117,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1120 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1117 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1121 smp_rmb(); 1118 smp_rmb();
1122 if (state->n_rdwr != 0) { 1119 if (state->n_rdwr != 0) {
1120 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1123 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); 1121 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
1124 if (ret != 0) 1122 if (ret != 0)
1125 return ret; 1123 return ret;
@@ -1127,6 +1125,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1127 return -ESTALE; 1125 return -ESTALE;
1128 } 1126 }
1129 if (state->n_wronly != 0) { 1127 if (state->n_wronly != 0) {
1128 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1130 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); 1129 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
1131 if (ret != 0) 1130 if (ret != 0)
1132 return ret; 1131 return ret;
@@ -1134,6 +1133,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1134 return -ESTALE; 1133 return -ESTALE;
1135 } 1134 }
1136 if (state->n_rdonly != 0) { 1135 if (state->n_rdonly != 0) {
1136 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1137 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); 1137 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
1138 if (ret != 0) 1138 if (ret != 0)
1139 return ret; 1139 return ret;
@@ -1188,7 +1188,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
1188 int err; 1188 int err;
1189 do { 1189 do {
1190 err = _nfs4_do_open_reclaim(ctx, state); 1190 err = _nfs4_do_open_reclaim(ctx, state);
1191 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 1191 if (err != -NFS4ERR_DELAY)
1192 break; 1192 break;
1193 nfs4_handle_exception(server, err, &exception); 1193 nfs4_handle_exception(server, err, &exception);
1194 } while (exception.retry); 1194 } while (exception.retry);
@@ -1258,6 +1258,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1258 case -NFS4ERR_ADMIN_REVOKED: 1258 case -NFS4ERR_ADMIN_REVOKED:
1259 case -NFS4ERR_BAD_STATEID: 1259 case -NFS4ERR_BAD_STATEID:
1260 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1260 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
1261 case -EKEYEXPIRED:
1262 /*
1263 * User RPCSEC_GSS context has expired.
1264 * We cannot recover this stateid now, so
1265 * skip it and allow recovery thread to
1266 * proceed.
1267 */
1261 case -ENOMEM: 1268 case -ENOMEM:
1262 err = 0; 1269 err = 0;
1263 goto out; 1270 goto out;
@@ -1605,7 +1612,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
1605 goto out; 1612 goto out;
1606 case -NFS4ERR_GRACE: 1613 case -NFS4ERR_GRACE:
1607 case -NFS4ERR_DELAY: 1614 case -NFS4ERR_DELAY:
1608 case -EKEYEXPIRED:
1609 nfs4_handle_exception(server, err, &exception); 1615 nfs4_handle_exception(server, err, &exception);
1610 err = 0; 1616 err = 0;
1611 } 1617 }
@@ -1975,7 +1981,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1975 calldata->res.fattr = &calldata->fattr; 1981 calldata->res.fattr = &calldata->fattr;
1976 calldata->res.seqid = calldata->arg.seqid; 1982 calldata->res.seqid = calldata->arg.seqid;
1977 calldata->res.server = server; 1983 calldata->res.server = server;
1978 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
1979 path_get(path); 1984 path_get(path);
1980 calldata->path = *path; 1985 calldata->path = *path;
1981 1986
@@ -1998,120 +2003,17 @@ out:
1998 return status; 2003 return status;
1999} 2004}
2000 2005
2001static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode) 2006static struct inode *
2007nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
2002{ 2008{
2003 struct file *filp;
2004 int ret;
2005
2006 /* If the open_intent is for execute, we have an extra check to make */
2007 if (fmode & FMODE_EXEC) {
2008 ret = nfs_may_open(state->inode,
2009 state->owner->so_cred,
2010 nd->intent.open.flags);
2011 if (ret < 0)
2012 goto out_close;
2013 }
2014 filp = lookup_instantiate_filp(nd, path->dentry, NULL);
2015 if (!IS_ERR(filp)) {
2016 struct nfs_open_context *ctx;
2017 ctx = nfs_file_open_context(filp);
2018 ctx->state = state;
2019 return 0;
2020 }
2021 ret = PTR_ERR(filp);
2022out_close:
2023 nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
2024 return ret;
2025}
2026
2027struct dentry *
2028nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2029{
2030 struct path path = {
2031 .mnt = nd->path.mnt,
2032 .dentry = dentry,
2033 };
2034 struct dentry *parent;
2035 struct iattr attr;
2036 struct rpc_cred *cred;
2037 struct nfs4_state *state; 2009 struct nfs4_state *state;
2038 struct dentry *res;
2039 int open_flags = nd->intent.open.flags;
2040 fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
2041
2042 if (nd->flags & LOOKUP_CREATE) {
2043 attr.ia_mode = nd->intent.open.create_mode;
2044 attr.ia_valid = ATTR_MODE;
2045 if (!IS_POSIXACL(dir))
2046 attr.ia_mode &= ~current_umask();
2047 } else {
2048 open_flags &= ~O_EXCL;
2049 attr.ia_valid = 0;
2050 BUG_ON(open_flags & O_CREAT);
2051 }
2052 2010
2053 cred = rpc_lookup_cred();
2054 if (IS_ERR(cred))
2055 return (struct dentry *)cred;
2056 parent = dentry->d_parent;
2057 /* Protect against concurrent sillydeletes */ 2011 /* Protect against concurrent sillydeletes */
2058 nfs_block_sillyrename(parent); 2012 state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
2059 state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred); 2013 if (IS_ERR(state))
2060 put_rpccred(cred); 2014 return ERR_CAST(state);
2061 if (IS_ERR(state)) { 2015 ctx->state = state;
2062 if (PTR_ERR(state) == -ENOENT) { 2016 return igrab(state->inode);
2063 d_add(dentry, NULL);
2064 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2065 }
2066 nfs_unblock_sillyrename(parent);
2067 return (struct dentry *)state;
2068 }
2069 res = d_add_unique(dentry, igrab(state->inode));
2070 if (res != NULL)
2071 path.dentry = res;
2072 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
2073 nfs_unblock_sillyrename(parent);
2074 nfs4_intent_set_file(nd, &path, state, fmode);
2075 return res;
2076}
2077
2078int
2079nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
2080{
2081 struct path path = {
2082 .mnt = nd->path.mnt,
2083 .dentry = dentry,
2084 };
2085 struct rpc_cred *cred;
2086 struct nfs4_state *state;
2087 fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
2088
2089 cred = rpc_lookup_cred();
2090 if (IS_ERR(cred))
2091 return PTR_ERR(cred);
2092 state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
2093 put_rpccred(cred);
2094 if (IS_ERR(state)) {
2095 switch (PTR_ERR(state)) {
2096 case -EPERM:
2097 case -EACCES:
2098 case -EDQUOT:
2099 case -ENOSPC:
2100 case -EROFS:
2101 return PTR_ERR(state);
2102 default:
2103 goto out_drop;
2104 }
2105 }
2106 if (state->inode == dentry->d_inode) {
2107 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2108 nfs4_intent_set_file(nd, &path, state, fmode);
2109 return 1;
2110 }
2111 nfs4_close_sync(&path, state, fmode);
2112out_drop:
2113 d_drop(dentry);
2114 return 0;
2115} 2017}
2116 2018
2117static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) 2019static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2568,36 +2470,34 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
2568 2470
2569static int 2471static int
2570nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 2472nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2571 int flags, struct nameidata *nd) 2473 int flags, struct nfs_open_context *ctx)
2572{ 2474{
2573 struct path path = { 2475 struct path my_path = {
2574 .mnt = nd->path.mnt,
2575 .dentry = dentry, 2476 .dentry = dentry,
2576 }; 2477 };
2478 struct path *path = &my_path;
2577 struct nfs4_state *state; 2479 struct nfs4_state *state;
2578 struct rpc_cred *cred; 2480 struct rpc_cred *cred = NULL;
2579 fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE); 2481 fmode_t fmode = 0;
2580 int status = 0; 2482 int status = 0;
2581 2483
2582 cred = rpc_lookup_cred(); 2484 if (ctx != NULL) {
2583 if (IS_ERR(cred)) { 2485 cred = ctx->cred;
2584 status = PTR_ERR(cred); 2486 path = &ctx->path;
2585 goto out; 2487 fmode = ctx->mode;
2586 } 2488 }
2587 state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred); 2489 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
2588 d_drop(dentry); 2490 d_drop(dentry);
2589 if (IS_ERR(state)) { 2491 if (IS_ERR(state)) {
2590 status = PTR_ERR(state); 2492 status = PTR_ERR(state);
2591 goto out_putcred; 2493 goto out;
2592 } 2494 }
2593 d_add(dentry, igrab(state->inode)); 2495 d_add(dentry, igrab(state->inode));
2594 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 2496 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2595 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2497 if (ctx != NULL)
2596 status = nfs4_intent_set_file(nd, &path, state, fmode); 2498 ctx->state = state;
2597 else 2499 else
2598 nfs4_close_sync(&path, state, fmode); 2500 nfs4_close_sync(path, state, fmode);
2599out_putcred:
2600 put_rpccred(cred);
2601out: 2501out:
2602 return status; 2502 return status;
2603} 2503}
@@ -2655,6 +2555,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
2655 2555
2656 args->bitmask = server->cache_consistency_bitmask; 2556 args->bitmask = server->cache_consistency_bitmask;
2657 res->server = server; 2557 res->server = server;
2558 res->seq_res.sr_slot = NULL;
2658 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 2559 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
2659} 2560}
2660 2561
@@ -2671,18 +2572,46 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2671 return 1; 2572 return 1;
2672} 2573}
2673 2574
2575static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
2576{
2577 struct nfs_server *server = NFS_SERVER(dir);
2578 struct nfs_renameargs *arg = msg->rpc_argp;
2579 struct nfs_renameres *res = msg->rpc_resp;
2580
2581 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
2582 arg->bitmask = server->attr_bitmask;
2583 res->server = server;
2584}
2585
2586static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
2587 struct inode *new_dir)
2588{
2589 struct nfs_renameres *res = task->tk_msg.rpc_resp;
2590
2591 if (!nfs4_sequence_done(task, &res->seq_res))
2592 return 0;
2593 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2594 return 0;
2595
2596 update_changeattr(old_dir, &res->old_cinfo);
2597 nfs_post_op_update_inode(old_dir, res->old_fattr);
2598 update_changeattr(new_dir, &res->new_cinfo);
2599 nfs_post_op_update_inode(new_dir, res->new_fattr);
2600 return 1;
2601}
2602
2674static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, 2603static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2675 struct inode *new_dir, struct qstr *new_name) 2604 struct inode *new_dir, struct qstr *new_name)
2676{ 2605{
2677 struct nfs_server *server = NFS_SERVER(old_dir); 2606 struct nfs_server *server = NFS_SERVER(old_dir);
2678 struct nfs4_rename_arg arg = { 2607 struct nfs_renameargs arg = {
2679 .old_dir = NFS_FH(old_dir), 2608 .old_dir = NFS_FH(old_dir),
2680 .new_dir = NFS_FH(new_dir), 2609 .new_dir = NFS_FH(new_dir),
2681 .old_name = old_name, 2610 .old_name = old_name,
2682 .new_name = new_name, 2611 .new_name = new_name,
2683 .bitmask = server->attr_bitmask, 2612 .bitmask = server->attr_bitmask,
2684 }; 2613 };
2685 struct nfs4_rename_res res = { 2614 struct nfs_renameres res = {
2686 .server = server, 2615 .server = server,
2687 }; 2616 };
2688 struct rpc_message msg = { 2617 struct rpc_message msg = {
@@ -2896,15 +2825,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2896} 2825}
2897 2826
2898static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2827static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2899 u64 cookie, struct page *page, unsigned int count, int plus) 2828 u64 cookie, struct page **pages, unsigned int count, int plus)
2900{ 2829{
2901 struct inode *dir = dentry->d_inode; 2830 struct inode *dir = dentry->d_inode;
2902 struct nfs4_readdir_arg args = { 2831 struct nfs4_readdir_arg args = {
2903 .fh = NFS_FH(dir), 2832 .fh = NFS_FH(dir),
2904 .pages = &page, 2833 .pages = pages,
2905 .pgbase = 0, 2834 .pgbase = 0,
2906 .count = count, 2835 .count = count,
2907 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, 2836 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
2837 .plus = plus,
2908 }; 2838 };
2909 struct nfs4_readdir_res res; 2839 struct nfs4_readdir_res res;
2910 struct rpc_message msg = { 2840 struct rpc_message msg = {
@@ -2932,14 +2862,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2932} 2862}
2933 2863
2934static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2864static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2935 u64 cookie, struct page *page, unsigned int count, int plus) 2865 u64 cookie, struct page **pages, unsigned int count, int plus)
2936{ 2866{
2937 struct nfs4_exception exception = { }; 2867 struct nfs4_exception exception = { };
2938 int err; 2868 int err;
2939 do { 2869 do {
2940 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), 2870 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
2941 _nfs4_proc_readdir(dentry, cred, cookie, 2871 _nfs4_proc_readdir(dentry, cred, cookie,
2942 page, count, plus), 2872 pages, count, plus),
2943 &exception); 2873 &exception);
2944 } while (exception.retry); 2874 } while (exception.retry);
2945 return err; 2875 return err;
@@ -3490,9 +3420,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3490 nfs4_state_mark_reclaim_nograce(clp, state); 3420 nfs4_state_mark_reclaim_nograce(clp, state);
3491 goto do_state_recovery; 3421 goto do_state_recovery;
3492 case -NFS4ERR_STALE_STATEID: 3422 case -NFS4ERR_STALE_STATEID:
3493 if (state == NULL)
3494 break;
3495 nfs4_state_mark_reclaim_reboot(clp, state);
3496 case -NFS4ERR_STALE_CLIENTID: 3423 case -NFS4ERR_STALE_CLIENTID:
3497 case -NFS4ERR_EXPIRED: 3424 case -NFS4ERR_EXPIRED:
3498 goto do_state_recovery; 3425 goto do_state_recovery;
@@ -3626,7 +3553,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3626 case -NFS4ERR_RESOURCE: 3553 case -NFS4ERR_RESOURCE:
3627 /* The IBM lawyers misread another document! */ 3554 /* The IBM lawyers misread another document! */
3628 case -NFS4ERR_DELAY: 3555 case -NFS4ERR_DELAY:
3629 case -EKEYEXPIRED:
3630 err = nfs4_delay(clp->cl_rpcclient, &timeout); 3556 err = nfs4_delay(clp->cl_rpcclient, &timeout);
3631 } 3557 }
3632 } while (err == 0); 3558 } while (err == 0);
@@ -3721,7 +3647,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3721 memcpy(&data->stateid, stateid, sizeof(data->stateid)); 3647 memcpy(&data->stateid, stateid, sizeof(data->stateid));
3722 data->res.fattr = &data->fattr; 3648 data->res.fattr = &data->fattr;
3723 data->res.server = server; 3649 data->res.server = server;
3724 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3725 nfs_fattr_init(data->res.fattr); 3650 nfs_fattr_init(data->res.fattr);
3726 data->timestamp = jiffies; 3651 data->timestamp = jiffies;
3727 data->rpc_status = 0; 3652 data->rpc_status = 0;
@@ -3874,7 +3799,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3874 p->arg.fl = &p->fl; 3799 p->arg.fl = &p->fl;
3875 p->arg.seqid = seqid; 3800 p->arg.seqid = seqid;
3876 p->res.seqid = seqid; 3801 p->res.seqid = seqid;
3877 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3878 p->arg.stateid = &lsp->ls_stateid; 3802 p->arg.stateid = &lsp->ls_stateid;
3879 p->lsp = lsp; 3803 p->lsp = lsp;
3880 atomic_inc(&lsp->ls_count); 3804 atomic_inc(&lsp->ls_count);
@@ -4054,7 +3978,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
4054 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 3978 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
4055 p->arg.lock_owner.id = lsp->ls_id.id; 3979 p->arg.lock_owner.id = lsp->ls_id.id;
4056 p->res.lock_seqid = p->arg.lock_seqid; 3980 p->res.lock_seqid = p->arg.lock_seqid;
4057 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4058 p->lsp = lsp; 3981 p->lsp = lsp;
4059 p->server = server; 3982 p->server = server;
4060 atomic_inc(&lsp->ls_count); 3983 atomic_inc(&lsp->ls_count);
@@ -4241,7 +4164,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
4241 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) 4164 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
4242 return 0; 4165 return 0;
4243 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); 4166 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
4244 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 4167 if (err != -NFS4ERR_DELAY)
4245 break; 4168 break;
4246 nfs4_handle_exception(server, err, &exception); 4169 nfs4_handle_exception(server, err, &exception);
4247 } while (exception.retry); 4170 } while (exception.retry);
@@ -4266,7 +4189,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
4266 goto out; 4189 goto out;
4267 case -NFS4ERR_GRACE: 4190 case -NFS4ERR_GRACE:
4268 case -NFS4ERR_DELAY: 4191 case -NFS4ERR_DELAY:
4269 case -EKEYEXPIRED:
4270 nfs4_handle_exception(server, err, &exception); 4192 nfs4_handle_exception(server, err, &exception);
4271 err = 0; 4193 err = 0;
4272 } 4194 }
@@ -4412,13 +4334,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4412 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4334 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
4413 err = 0; 4335 err = 0;
4414 goto out; 4336 goto out;
4337 case -EKEYEXPIRED:
4338 /*
4339 * User RPCSEC_GSS context has expired.
4340 * We cannot recover this stateid now, so
4341 * skip it and allow recovery thread to
4342 * proceed.
4343 */
4344 err = 0;
4345 goto out;
4415 case -ENOMEM: 4346 case -ENOMEM:
4416 case -NFS4ERR_DENIED: 4347 case -NFS4ERR_DENIED:
4417 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 4348 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
4418 err = 0; 4349 err = 0;
4419 goto out; 4350 goto out;
4420 case -NFS4ERR_DELAY: 4351 case -NFS4ERR_DELAY:
4421 case -EKEYEXPIRED:
4422 break; 4352 break;
4423 } 4353 }
4424 err = nfs4_handle_exception(server, err, &exception); 4354 err = nfs4_handle_exception(server, err, &exception);
@@ -4647,7 +4577,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4647 switch (task->tk_status) { 4577 switch (task->tk_status) {
4648 case -NFS4ERR_DELAY: 4578 case -NFS4ERR_DELAY:
4649 case -NFS4ERR_GRACE: 4579 case -NFS4ERR_GRACE:
4650 case -EKEYEXPIRED:
4651 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); 4580 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4652 rpc_delay(task, NFS4_POLL_RETRY_MIN); 4581 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4653 task->tk_status = 0; 4582 task->tk_status = 0;
@@ -4687,7 +4616,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4687 }; 4616 };
4688 int status; 4617 int status;
4689 4618
4690 res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4691 dprintk("--> %s\n", __func__); 4619 dprintk("--> %s\n", __func__);
4692 task = rpc_run_task(&task_setup); 4620 task = rpc_run_task(&task_setup);
4693 4621
@@ -4914,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4914 args->bc_attrs.max_reqs); 4842 args->bc_attrs.max_reqs);
4915} 4843}
4916 4844
4917static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd) 4845static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4918{ 4846{
4919 if (rcvd <= sent) 4847 struct nfs4_channel_attrs *sent = &args->fc_attrs;
4920 return 0; 4848 struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
4921 printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. " 4849
4922 "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd); 4850 if (rcvd->headerpadsz > sent->headerpadsz)
4923 return -EINVAL; 4851 return -EINVAL;
4852 if (rcvd->max_resp_sz > sent->max_resp_sz)
4853 return -EINVAL;
4854 /*
4855 * Our requested max_ops is the minimum we need; we're not
4856 * prepared to break up compounds into smaller pieces than that.
4857 * So, no point even trying to continue if the server won't
4858 * cooperate:
4859 */
4860 if (rcvd->max_ops < sent->max_ops)
4861 return -EINVAL;
4862 if (rcvd->max_reqs == 0)
4863 return -EINVAL;
4864 return 0;
4924} 4865}
4925 4866
4926#define _verify_fore_channel_attr(_name_) \ 4867static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4927 _verify_channel_attr("fore", #_name_, \ 4868{
4928 args->fc_attrs._name_, \ 4869 struct nfs4_channel_attrs *sent = &args->bc_attrs;
4929 session->fc_attrs._name_) 4870 struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
4930 4871
4931#define _verify_back_channel_attr(_name_) \ 4872 if (rcvd->max_rqst_sz > sent->max_rqst_sz)
4932 _verify_channel_attr("back", #_name_, \ 4873 return -EINVAL;
4933 args->bc_attrs._name_, \ 4874 if (rcvd->max_resp_sz < sent->max_resp_sz)
4934 session->bc_attrs._name_) 4875 return -EINVAL;
4876 if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
4877 return -EINVAL;
4878 /* These would render the backchannel useless: */
4879 if (rcvd->max_ops == 0)
4880 return -EINVAL;
4881 if (rcvd->max_reqs == 0)
4882 return -EINVAL;
4883 return 0;
4884}
4935 4885
4936/*
4937 * The server is not allowed to increase the fore channel header pad size,
4938 * maximum response size, or maximum number of operations.
4939 *
4940 * The back channel attributes are only negotiatied down: We send what the
4941 * (back channel) server insists upon.
4942 */
4943static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, 4886static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
4944 struct nfs4_session *session) 4887 struct nfs4_session *session)
4945{ 4888{
4946 int ret = 0; 4889 int ret;
4947
4948 ret |= _verify_fore_channel_attr(headerpadsz);
4949 ret |= _verify_fore_channel_attr(max_resp_sz);
4950 ret |= _verify_fore_channel_attr(max_ops);
4951
4952 ret |= _verify_back_channel_attr(headerpadsz);
4953 ret |= _verify_back_channel_attr(max_rqst_sz);
4954 ret |= _verify_back_channel_attr(max_resp_sz);
4955 ret |= _verify_back_channel_attr(max_resp_sz_cached);
4956 ret |= _verify_back_channel_attr(max_ops);
4957 ret |= _verify_back_channel_attr(max_reqs);
4958 4890
4959 return ret; 4891 ret = nfs4_verify_fore_channel_attrs(args, session);
4892 if (ret)
4893 return ret;
4894 return nfs4_verify_back_channel_attrs(args, session);
4960} 4895}
4961 4896
4962static int _nfs4_proc_create_session(struct nfs_client *clp) 4897static int _nfs4_proc_create_session(struct nfs_client *clp)
@@ -5111,7 +5046,6 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
5111{ 5046{
5112 switch(task->tk_status) { 5047 switch(task->tk_status) {
5113 case -NFS4ERR_DELAY: 5048 case -NFS4ERR_DELAY:
5114 case -EKEYEXPIRED:
5115 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5049 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5116 return -EAGAIN; 5050 return -EAGAIN;
5117 default: 5051 default:
@@ -5180,12 +5114,11 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
5180 5114
5181 if (!atomic_inc_not_zero(&clp->cl_count)) 5115 if (!atomic_inc_not_zero(&clp->cl_count))
5182 return ERR_PTR(-EIO); 5116 return ERR_PTR(-EIO);
5183 calldata = kmalloc(sizeof(*calldata), GFP_NOFS); 5117 calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
5184 if (calldata == NULL) { 5118 if (calldata == NULL) {
5185 nfs_put_client(clp); 5119 nfs_put_client(clp);
5186 return ERR_PTR(-ENOMEM); 5120 return ERR_PTR(-ENOMEM);
5187 } 5121 }
5188 calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5189 msg.rpc_argp = &calldata->args; 5122 msg.rpc_argp = &calldata->args;
5190 msg.rpc_resp = &calldata->res; 5123 msg.rpc_resp = &calldata->res;
5191 calldata->clp = clp; 5124 calldata->clp = clp;
@@ -5254,7 +5187,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
5254 case -NFS4ERR_WRONG_CRED: /* What to do here? */ 5187 case -NFS4ERR_WRONG_CRED: /* What to do here? */
5255 break; 5188 break;
5256 case -NFS4ERR_DELAY: 5189 case -NFS4ERR_DELAY:
5257 case -EKEYEXPIRED:
5258 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5190 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5259 return -EAGAIN; 5191 return -EAGAIN;
5260 default: 5192 default:
@@ -5317,7 +5249,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5317 goto out; 5249 goto out;
5318 calldata->clp = clp; 5250 calldata->clp = clp;
5319 calldata->arg.one_fs = 0; 5251 calldata->arg.one_fs = 0;
5320 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5321 5252
5322 msg.rpc_argp = &calldata->arg; 5253 msg.rpc_argp = &calldata->arg;
5323 msg.rpc_resp = &calldata->res; 5254 msg.rpc_resp = &calldata->res;
@@ -5333,6 +5264,147 @@ out:
5333 dprintk("<-- %s status=%d\n", __func__, status); 5264 dprintk("<-- %s status=%d\n", __func__, status);
5334 return status; 5265 return status;
5335} 5266}
5267
5268static void
5269nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
5270{
5271 struct nfs4_layoutget *lgp = calldata;
5272 struct inode *ino = lgp->args.inode;
5273 struct nfs_server *server = NFS_SERVER(ino);
5274
5275 dprintk("--> %s\n", __func__);
5276 if (nfs4_setup_sequence(server, &lgp->args.seq_args,
5277 &lgp->res.seq_res, 0, task))
5278 return;
5279 rpc_call_start(task);
5280}
5281
5282static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
5283{
5284 struct nfs4_layoutget *lgp = calldata;
5285 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5286
5287 dprintk("--> %s\n", __func__);
5288
5289 if (!nfs4_sequence_done(task, &lgp->res.seq_res))
5290 return;
5291
5292 switch (task->tk_status) {
5293 case 0:
5294 break;
5295 case -NFS4ERR_LAYOUTTRYLATER:
5296 case -NFS4ERR_RECALLCONFLICT:
5297 task->tk_status = -NFS4ERR_DELAY;
5298 /* Fall through */
5299 default:
5300 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5301 rpc_restart_call_prepare(task);
5302 return;
5303 }
5304 }
5305 lgp->status = task->tk_status;
5306 dprintk("<-- %s\n", __func__);
5307}
5308
5309static void nfs4_layoutget_release(void *calldata)
5310{
5311 struct nfs4_layoutget *lgp = calldata;
5312
5313 dprintk("--> %s\n", __func__);
5314 put_layout_hdr(lgp->args.inode);
5315 if (lgp->res.layout.buf != NULL)
5316 free_page((unsigned long) lgp->res.layout.buf);
5317 put_nfs_open_context(lgp->args.ctx);
5318 kfree(calldata);
5319 dprintk("<-- %s\n", __func__);
5320}
5321
5322static const struct rpc_call_ops nfs4_layoutget_call_ops = {
5323 .rpc_call_prepare = nfs4_layoutget_prepare,
5324 .rpc_call_done = nfs4_layoutget_done,
5325 .rpc_release = nfs4_layoutget_release,
5326};
5327
5328int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5329{
5330 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5331 struct rpc_task *task;
5332 struct rpc_message msg = {
5333 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
5334 .rpc_argp = &lgp->args,
5335 .rpc_resp = &lgp->res,
5336 };
5337 struct rpc_task_setup task_setup_data = {
5338 .rpc_client = server->client,
5339 .rpc_message = &msg,
5340 .callback_ops = &nfs4_layoutget_call_ops,
5341 .callback_data = lgp,
5342 .flags = RPC_TASK_ASYNC,
5343 };
5344 int status = 0;
5345
5346 dprintk("--> %s\n", __func__);
5347
5348 lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
5349 if (lgp->res.layout.buf == NULL) {
5350 nfs4_layoutget_release(lgp);
5351 return -ENOMEM;
5352 }
5353
5354 lgp->res.seq_res.sr_slot = NULL;
5355 task = rpc_run_task(&task_setup_data);
5356 if (IS_ERR(task))
5357 return PTR_ERR(task);
5358 status = nfs4_wait_for_completion_rpc_task(task);
5359 if (status != 0)
5360 goto out;
5361 status = lgp->status;
5362 if (status != 0)
5363 goto out;
5364 status = pnfs_layout_process(lgp);
5365out:
5366 rpc_put_task(task);
5367 dprintk("<-- %s status=%d\n", __func__, status);
5368 return status;
5369}
5370
5371static int
5372_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5373{
5374 struct nfs4_getdeviceinfo_args args = {
5375 .pdev = pdev,
5376 };
5377 struct nfs4_getdeviceinfo_res res = {
5378 .pdev = pdev,
5379 };
5380 struct rpc_message msg = {
5381 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
5382 .rpc_argp = &args,
5383 .rpc_resp = &res,
5384 };
5385 int status;
5386
5387 dprintk("--> %s\n", __func__);
5388 status = nfs4_call_sync(server, &msg, &args, &res, 0);
5389 dprintk("<-- %s status=%d\n", __func__, status);
5390
5391 return status;
5392}
5393
5394int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5395{
5396 struct nfs4_exception exception = { };
5397 int err;
5398
5399 do {
5400 err = nfs4_handle_exception(server,
5401 _nfs4_proc_getdeviceinfo(server, pdev),
5402 &exception);
5403 } while (exception.retry);
5404 return err;
5405}
5406EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
5407
5336#endif /* CONFIG_NFS_V4_1 */ 5408#endif /* CONFIG_NFS_V4_1 */
5337 5409
5338struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 5410struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5443,6 +5515,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5443 .unlink_setup = nfs4_proc_unlink_setup, 5515 .unlink_setup = nfs4_proc_unlink_setup,
5444 .unlink_done = nfs4_proc_unlink_done, 5516 .unlink_done = nfs4_proc_unlink_done,
5445 .rename = nfs4_proc_rename, 5517 .rename = nfs4_proc_rename,
5518 .rename_setup = nfs4_proc_rename_setup,
5519 .rename_done = nfs4_proc_rename_done,
5446 .link = nfs4_proc_link, 5520 .link = nfs4_proc_link,
5447 .symlink = nfs4_proc_symlink, 5521 .symlink = nfs4_proc_symlink,
5448 .mkdir = nfs4_proc_mkdir, 5522 .mkdir = nfs4_proc_mkdir,
@@ -5463,6 +5537,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5463 .lock = nfs4_proc_lock, 5537 .lock = nfs4_proc_lock,
5464 .clear_acl_cache = nfs4_zap_acl_attr, 5538 .clear_acl_cache = nfs4_zap_acl_attr,
5465 .close_context = nfs4_close_context, 5539 .close_context = nfs4_close_context,
5540 .open_context = nfs4_atomic_open,
5466}; 5541};
5467 5542
5468/* 5543/*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3e2f19b04c06..f575a3126737 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -40,12 +40,13 @@
40 40
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/smp_lock.h> 43#include <linux/fs.h>
44#include <linux/nfs_fs.h> 44#include <linux/nfs_fs.h>
45#include <linux/nfs_idmap.h> 45#include <linux/nfs_idmap.h>
46#include <linux/kthread.h> 46#include <linux/kthread.h>
47#include <linux/module.h> 47#include <linux/module.h>
48#include <linux/random.h> 48#include <linux/random.h>
49#include <linux/ratelimit.h>
49#include <linux/workqueue.h> 50#include <linux/workqueue.h>
50#include <linux/bitops.h> 51#include <linux/bitops.h>
51 52
@@ -53,6 +54,7 @@
53#include "callback.h" 54#include "callback.h"
54#include "delegation.h" 55#include "delegation.h"
55#include "internal.h" 56#include "internal.h"
57#include "pnfs.h"
56 58
57#define OPENOWNER_POOL_SIZE 8 59#define OPENOWNER_POOL_SIZE 8
58 60
@@ -970,13 +972,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
970 /* Guard against delegation returns and new lock/unlock calls */ 972 /* Guard against delegation returns and new lock/unlock calls */
971 down_write(&nfsi->rwsem); 973 down_write(&nfsi->rwsem);
972 /* Protect inode->i_flock using the BKL */ 974 /* Protect inode->i_flock using the BKL */
973 lock_kernel(); 975 lock_flocks();
974 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 976 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
975 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 977 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
976 continue; 978 continue;
977 if (nfs_file_open_context(fl->fl_file)->state != state) 979 if (nfs_file_open_context(fl->fl_file)->state != state)
978 continue; 980 continue;
979 unlock_kernel(); 981 unlock_flocks();
980 status = ops->recover_lock(state, fl); 982 status = ops->recover_lock(state, fl);
981 switch (status) { 983 switch (status) {
982 case 0: 984 case 0:
@@ -1003,9 +1005,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1003 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 1005 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
1004 status = 0; 1006 status = 0;
1005 } 1007 }
1006 lock_kernel(); 1008 lock_flocks();
1007 } 1009 }
1008 unlock_kernel(); 1010 unlock_flocks();
1009out: 1011out:
1010 up_write(&nfsi->rwsem); 1012 up_write(&nfsi->rwsem);
1011 return status; 1013 return status;
@@ -1063,6 +1065,14 @@ restart:
1063 /* Mark the file as being 'closed' */ 1065 /* Mark the file as being 'closed' */
1064 state->state = 0; 1066 state->state = 0;
1065 break; 1067 break;
1068 case -EKEYEXPIRED:
1069 /*
1070 * User RPCSEC_GSS context has expired.
1071 * We cannot recover this stateid now, so
1072 * skip it and allow recovery thread to
1073 * proceed.
1074 */
1075 break;
1066 case -NFS4ERR_ADMIN_REVOKED: 1076 case -NFS4ERR_ADMIN_REVOKED:
1067 case -NFS4ERR_STALE_STATEID: 1077 case -NFS4ERR_STALE_STATEID:
1068 case -NFS4ERR_BAD_STATEID: 1078 case -NFS4ERR_BAD_STATEID:
@@ -1138,16 +1148,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
1138 (void)ops->reclaim_complete(clp); 1148 (void)ops->reclaim_complete(clp);
1139} 1149}
1140 1150
1141static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) 1151static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
1142{ 1152{
1143 struct nfs4_state_owner *sp; 1153 struct nfs4_state_owner *sp;
1144 struct rb_node *pos; 1154 struct rb_node *pos;
1145 struct nfs4_state *state; 1155 struct nfs4_state *state;
1146 1156
1147 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1157 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1148 return; 1158 return 0;
1149
1150 nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
1151 1159
1152 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1160 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
1153 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1161 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1161,6 +1169,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1161 } 1169 }
1162 1170
1163 nfs_delegation_reap_unclaimed(clp); 1171 nfs_delegation_reap_unclaimed(clp);
1172 return 1;
1173}
1174
1175static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1176{
1177 if (!nfs4_state_clear_reclaim_reboot(clp))
1178 return;
1179 nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
1164} 1180}
1165 1181
1166static void nfs_delegation_clear_all(struct nfs_client *clp) 1182static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1175,6 +1191,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1175 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); 1191 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1176} 1192}
1177 1193
1194static void nfs4_warn_keyexpired(const char *s)
1195{
1196 printk_ratelimited(KERN_WARNING "Error: state manager"
1197 " encountered RPCSEC_GSS session"
1198 " expired against NFSv4 server %s.\n",
1199 s);
1200}
1201
1178static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) 1202static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1179{ 1203{
1180 switch (error) { 1204 switch (error) {
@@ -1187,7 +1211,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1187 case -NFS4ERR_STALE_CLIENTID: 1211 case -NFS4ERR_STALE_CLIENTID:
1188 case -NFS4ERR_LEASE_MOVED: 1212 case -NFS4ERR_LEASE_MOVED:
1189 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1213 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1190 nfs4_state_end_reclaim_reboot(clp); 1214 nfs4_state_clear_reclaim_reboot(clp);
1191 nfs4_state_start_reclaim_reboot(clp); 1215 nfs4_state_start_reclaim_reboot(clp);
1192 break; 1216 break;
1193 case -NFS4ERR_EXPIRED: 1217 case -NFS4ERR_EXPIRED:
@@ -1204,6 +1228,10 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1204 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); 1228 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
1205 /* Zero session reset errors */ 1229 /* Zero session reset errors */
1206 return 0; 1230 return 0;
1231 case -EKEYEXPIRED:
1232 /* Nothing we can do */
1233 nfs4_warn_keyexpired(clp->cl_hostname);
1234 return 0;
1207 } 1235 }
1208 return error; 1236 return error;
1209} 1237}
@@ -1414,9 +1442,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
1414 case -NFS4ERR_DELAY: 1442 case -NFS4ERR_DELAY:
1415 case -NFS4ERR_CLID_INUSE: 1443 case -NFS4ERR_CLID_INUSE:
1416 case -EAGAIN: 1444 case -EAGAIN:
1417 case -EKEYEXPIRED:
1418 break; 1445 break;
1419 1446
1447 case -EKEYEXPIRED:
1448 nfs4_warn_keyexpired(clp->cl_hostname);
1420 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1449 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1421 * in nfs4_exchange_id */ 1450 * in nfs4_exchange_id */
1422 default: 1451 default:
@@ -1447,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1447 } 1476 }
1448 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1477 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1449 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); 1478 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
1479 pnfs_destroy_all_layouts(clp);
1450 } 1480 }
1451 1481
1452 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { 1482 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 08ef91291132..f313c4cce7e4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
52#include <linux/nfs_idmap.h> 52#include <linux/nfs_idmap.h>
53#include "nfs4_fs.h" 53#include "nfs4_fs.h"
54#include "internal.h" 54#include "internal.h"
55#include "pnfs.h"
55 56
56#define NFSDBG_FACILITY NFSDBG_XDR 57#define NFSDBG_FACILITY NFSDBG_XDR
57 58
@@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int);
310 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 311 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
311#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 312#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
312#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 313#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
314#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
315 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
316#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
317 1 /* layout type */ + \
318 1 /* opaque devaddr4 length */ + \
319 /* devaddr4 payload is read into page */ \
320 1 /* notification bitmap length */ + \
321 1 /* notification bitmap */)
322#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
323 encode_stateid_maxsz)
324#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
325 decode_stateid_maxsz + \
326 XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
313#else /* CONFIG_NFS_V4_1 */ 327#else /* CONFIG_NFS_V4_1 */
314#define encode_sequence_maxsz 0 328#define encode_sequence_maxsz 0
315#define decode_sequence_maxsz 0 329#define decode_sequence_maxsz 0
@@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int);
699#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 713#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
700 decode_sequence_maxsz + \ 714 decode_sequence_maxsz + \
701 decode_reclaim_complete_maxsz) 715 decode_reclaim_complete_maxsz)
716#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
717 encode_sequence_maxsz +\
718 encode_getdeviceinfo_maxsz)
719#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
720 decode_sequence_maxsz + \
721 decode_getdeviceinfo_maxsz)
722#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \
723 encode_sequence_maxsz + \
724 encode_putfh_maxsz + \
725 encode_layoutget_maxsz)
726#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \
727 decode_sequence_maxsz + \
728 decode_putfh_maxsz + \
729 decode_layoutget_maxsz)
702 730
703const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 731const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
704 compound_encode_hdr_maxsz + 732 compound_encode_hdr_maxsz +
@@ -816,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
816 if (iap->ia_valid & ATTR_MODE) 844 if (iap->ia_valid & ATTR_MODE)
817 len += 4; 845 len += 4;
818 if (iap->ia_valid & ATTR_UID) { 846 if (iap->ia_valid & ATTR_UID) {
819 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); 847 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
820 if (owner_namelen < 0) { 848 if (owner_namelen < 0) {
821 dprintk("nfs: couldn't resolve uid %d to string\n", 849 dprintk("nfs: couldn't resolve uid %d to string\n",
822 iap->ia_uid); 850 iap->ia_uid);
@@ -828,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
828 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
829 } 857 }
830 if (iap->ia_valid & ATTR_GID) { 858 if (iap->ia_valid & ATTR_GID) {
831 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); 859 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
832 if (owner_grouplen < 0) { 860 if (owner_grouplen < 0) {
833 dprintk("nfs: couldn't resolve gid %d to string\n", 861 dprintk("nfs: couldn't resolve gid %d to string\n",
834 iap->ia_gid); 862 iap->ia_gid);
@@ -1385,24 +1413,35 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1385 1413
1386static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) 1414static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
1387{ 1415{
1388 uint32_t attrs[2] = { 1416 uint32_t attrs[2] = {0, 0};
1389 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, 1417 uint32_t dircount = readdir->count >> 1;
1390 FATTR4_WORD1_MOUNTED_ON_FILEID,
1391 };
1392 __be32 *p; 1418 __be32 *p;
1393 1419
1420 if (readdir->plus) {
1421 attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
1422 FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
1423 attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
1424 FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
1425 FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
1426 FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
1427 dircount >>= 1;
1428 }
1429 attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
1430 attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
1431 /* Switch to mounted_on_fileid if the server supports it */
1432 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
1433 attrs[0] &= ~FATTR4_WORD0_FILEID;
1434 else
1435 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1436
1394 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); 1437 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
1395 *p++ = cpu_to_be32(OP_READDIR); 1438 *p++ = cpu_to_be32(OP_READDIR);
1396 p = xdr_encode_hyper(p, readdir->cookie); 1439 p = xdr_encode_hyper(p, readdir->cookie);
1397 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); 1440 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
1398 *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ 1441 *p++ = cpu_to_be32(dircount);
1399 *p++ = cpu_to_be32(readdir->count); 1442 *p++ = cpu_to_be32(readdir->count);
1400 *p++ = cpu_to_be32(2); 1443 *p++ = cpu_to_be32(2);
1401 /* Switch to mounted_on_fileid if the server supports it */ 1444
1402 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
1403 attrs[0] &= ~FATTR4_WORD0_FILEID;
1404 else
1405 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1406 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); 1445 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1407 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); 1446 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1408 hdr->nops++; 1447 hdr->nops++;
@@ -1726,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr,
1726#endif /* CONFIG_NFS_V4_1 */ 1765#endif /* CONFIG_NFS_V4_1 */
1727} 1766}
1728 1767
1768#ifdef CONFIG_NFS_V4_1
1769static void
1770encode_getdeviceinfo(struct xdr_stream *xdr,
1771 const struct nfs4_getdeviceinfo_args *args,
1772 struct compound_hdr *hdr)
1773{
1774 __be32 *p;
1775
1776 p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
1777 *p++ = cpu_to_be32(OP_GETDEVICEINFO);
1778 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1779 NFS4_DEVICEID4_SIZE);
1780 *p++ = cpu_to_be32(args->pdev->layout_type);
1781 *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
1782 *p++ = cpu_to_be32(0); /* bitmap length 0 */
1783 hdr->nops++;
1784 hdr->replen += decode_getdeviceinfo_maxsz;
1785}
1786
1787static void
1788encode_layoutget(struct xdr_stream *xdr,
1789 const struct nfs4_layoutget_args *args,
1790 struct compound_hdr *hdr)
1791{
1792 nfs4_stateid stateid;
1793 __be32 *p;
1794
1795 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
1796 *p++ = cpu_to_be32(OP_LAYOUTGET);
1797 *p++ = cpu_to_be32(0); /* Signal layout available */
1798 *p++ = cpu_to_be32(args->type);
1799 *p++ = cpu_to_be32(args->range.iomode);
1800 p = xdr_encode_hyper(p, args->range.offset);
1801 p = xdr_encode_hyper(p, args->range.length);
1802 p = xdr_encode_hyper(p, args->minlength);
1803 pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
1804 args->ctx->state);
1805 p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
1806 *p = cpu_to_be32(args->maxcount);
1807
1808 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
1809 __func__,
1810 args->type,
1811 args->range.iomode,
1812 (unsigned long)args->range.offset,
1813 (unsigned long)args->range.length,
1814 args->maxcount);
1815 hdr->nops++;
1816 hdr->replen += decode_layoutget_maxsz;
1817}
1818#endif /* CONFIG_NFS_V4_1 */
1819
1729/* 1820/*
1730 * END OF "GENERIC" ENCODE ROUTINES. 1821 * END OF "GENERIC" ENCODE ROUTINES.
1731 */ 1822 */
@@ -1823,7 +1914,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
1823/* 1914/*
1824 * Encode RENAME request 1915 * Encode RENAME request
1825 */ 1916 */
1826static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args) 1917static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
1827{ 1918{
1828 struct xdr_stream xdr; 1919 struct xdr_stream xdr;
1829 struct compound_hdr hdr = { 1920 struct compound_hdr hdr = {
@@ -2543,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
2543 return 0; 2634 return 0;
2544} 2635}
2545 2636
2637/*
2638 * Encode GETDEVICEINFO request
2639 */
2640static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
2641 struct nfs4_getdeviceinfo_args *args)
2642{
2643 struct xdr_stream xdr;
2644 struct compound_hdr hdr = {
2645 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2646 };
2647
2648 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2649 encode_compound_hdr(&xdr, req, &hdr);
2650 encode_sequence(&xdr, &args->seq_args, &hdr);
2651 encode_getdeviceinfo(&xdr, args, &hdr);
2652
2653 /* set up reply kvec. Subtract notification bitmap max size (2)
2654 * so that notification bitmap is put in xdr_buf tail */
2655 xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
2656 args->pdev->pages, args->pdev->pgbase,
2657 args->pdev->pglen);
2658
2659 encode_nops(&hdr);
2660 return 0;
2661}
2662
2663/*
2664 * Encode LAYOUTGET request
2665 */
2666static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
2667 struct nfs4_layoutget_args *args)
2668{
2669 struct xdr_stream xdr;
2670 struct compound_hdr hdr = {
2671 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2672 };
2673
2674 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2675 encode_compound_hdr(&xdr, req, &hdr);
2676 encode_sequence(&xdr, &args->seq_args, &hdr);
2677 encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
2678 encode_layoutget(&xdr, args, &hdr);
2679 encode_nops(&hdr);
2680 return 0;
2681}
2546#endif /* CONFIG_NFS_V4_1 */ 2682#endif /* CONFIG_NFS_V4_1 */
2547 2683
2548static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 2684static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2676,7 +2812,10 @@ out_overflow:
2676static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) 2812static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
2677{ 2813{
2678 if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) { 2814 if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
2679 decode_attr_bitmap(xdr, bitmask); 2815 int ret;
2816 ret = decode_attr_bitmap(xdr, bitmask);
2817 if (unlikely(ret < 0))
2818 return ret;
2680 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; 2819 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
2681 } else 2820 } else
2682 bitmask[0] = bitmask[1] = 0; 2821 bitmask[0] = bitmask[1] = 0;
@@ -2848,6 +2987,56 @@ out_overflow:
2848 return -EIO; 2987 return -EIO;
2849} 2988}
2850 2989
2990static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
2991{
2992 __be32 *p;
2993
2994 if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
2995 return -EIO;
2996 if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
2997 p = xdr_inline_decode(xdr, 4);
2998 if (unlikely(!p))
2999 goto out_overflow;
3000 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
3001 }
3002 return 0;
3003out_overflow:
3004 print_overflow_msg(__func__, xdr);
3005 return -EIO;
3006}
3007
3008static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
3009{
3010 __be32 *p;
3011 int len;
3012
3013 if (fh != NULL)
3014 memset(fh, 0, sizeof(*fh));
3015
3016 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
3017 return -EIO;
3018 if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
3019 p = xdr_inline_decode(xdr, 4);
3020 if (unlikely(!p))
3021 goto out_overflow;
3022 len = be32_to_cpup(p);
3023 if (len > NFS4_FHSIZE)
3024 return -EIO;
3025 p = xdr_inline_decode(xdr, len);
3026 if (unlikely(!p))
3027 goto out_overflow;
3028 if (fh != NULL) {
3029 memcpy(fh->data, p, len);
3030 fh->size = len;
3031 }
3032 bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
3033 }
3034 return 0;
3035out_overflow:
3036 print_overflow_msg(__func__, xdr);
3037 return -EIO;
3038}
3039
2851static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 3040static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2852{ 3041{
2853 __be32 *p; 3042 __be32 *p;
@@ -3521,6 +3710,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
3521 return status; 3710 return status;
3522} 3711}
3523 3712
3713static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
3714 struct timespec *time)
3715{
3716 int status = 0;
3717
3718 time->tv_sec = 0;
3719 time->tv_nsec = 0;
3720 if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
3721 return -EIO;
3722 if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
3723 status = decode_attr_time(xdr, time);
3724 bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
3725 }
3726 dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
3727 (long)time->tv_nsec);
3728 return status;
3729}
3730
3524static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) 3731static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
3525{ 3732{
3526 int status = 0; 3733 int status = 0;
@@ -3744,29 +3951,14 @@ xdr_error:
3744 return status; 3951 return status;
3745} 3952}
3746 3953
3747static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, 3954static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3955 struct nfs_fattr *fattr, struct nfs_fh *fh,
3748 const struct nfs_server *server, int may_sleep) 3956 const struct nfs_server *server, int may_sleep)
3749{ 3957{
3750 __be32 *savep;
3751 uint32_t attrlen,
3752 bitmap[2] = {0},
3753 type;
3754 int status; 3958 int status;
3755 umode_t fmode = 0; 3959 umode_t fmode = 0;
3756 uint64_t fileid; 3960 uint64_t fileid;
3757 3961 uint32_t type;
3758 status = decode_op_hdr(xdr, OP_GETATTR);
3759 if (status < 0)
3760 goto xdr_error;
3761
3762 status = decode_attr_bitmap(xdr, bitmap);
3763 if (status < 0)
3764 goto xdr_error;
3765
3766 status = decode_attr_length(xdr, &attrlen, &savep);
3767 if (status < 0)
3768 goto xdr_error;
3769
3770 3962
3771 status = decode_attr_type(xdr, bitmap, &type); 3963 status = decode_attr_type(xdr, bitmap, &type);
3772 if (status < 0) 3964 if (status < 0)
@@ -3792,6 +3984,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3792 goto xdr_error; 3984 goto xdr_error;
3793 fattr->valid |= status; 3985 fattr->valid |= status;
3794 3986
3987 status = decode_attr_error(xdr, bitmap);
3988 if (status < 0)
3989 goto xdr_error;
3990
3991 status = decode_attr_filehandle(xdr, bitmap, fh);
3992 if (status < 0)
3993 goto xdr_error;
3994
3795 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); 3995 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
3796 if (status < 0) 3996 if (status < 0)
3797 goto xdr_error; 3997 goto xdr_error;
@@ -3862,12 +4062,101 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3862 fattr->valid |= status; 4062 fattr->valid |= status;
3863 } 4063 }
3864 4064
4065xdr_error:
4066 dprintk("%s: xdr returned %d\n", __func__, -status);
4067 return status;
4068}
4069
4070static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4071 struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
4072{
4073 __be32 *savep;
4074 uint32_t attrlen,
4075 bitmap[2] = {0};
4076 int status;
4077
4078 status = decode_op_hdr(xdr, OP_GETATTR);
4079 if (status < 0)
4080 goto xdr_error;
4081
4082 status = decode_attr_bitmap(xdr, bitmap);
4083 if (status < 0)
4084 goto xdr_error;
4085
4086 status = decode_attr_length(xdr, &attrlen, &savep);
4087 if (status < 0)
4088 goto xdr_error;
4089
4090 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
4091 if (status < 0)
4092 goto xdr_error;
4093
3865 status = verify_attr_len(xdr, savep, attrlen); 4094 status = verify_attr_len(xdr, savep, attrlen);
3866xdr_error: 4095xdr_error:
3867 dprintk("%s: xdr returned %d\n", __func__, -status); 4096 dprintk("%s: xdr returned %d\n", __func__, -status);
3868 return status; 4097 return status;
3869} 4098}
3870 4099
4100static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4101 const struct nfs_server *server, int may_sleep)
4102{
4103 return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
4104}
4105
4106/*
4107 * Decode potentially multiple layout types. Currently we only support
4108 * one layout driver per file system.
4109 */
4110static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
4111 uint32_t *layouttype)
4112{
4113 uint32_t *p;
4114 int num;
4115
4116 p = xdr_inline_decode(xdr, 4);
4117 if (unlikely(!p))
4118 goto out_overflow;
4119 num = be32_to_cpup(p);
4120
4121 /* pNFS is not supported by the underlying file system */
4122 if (num == 0) {
4123 *layouttype = 0;
4124 return 0;
4125 }
4126 if (num > 1)
4127 printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
4128 "per filesystem not supported\n", __func__);
4129
4130 /* Decode and set first layout type, move xdr->p past unused types */
4131 p = xdr_inline_decode(xdr, num * 4);
4132 if (unlikely(!p))
4133 goto out_overflow;
4134 *layouttype = be32_to_cpup(p);
4135 return 0;
4136out_overflow:
4137 print_overflow_msg(__func__, xdr);
4138 return -EIO;
4139}
4140
4141/*
4142 * The type of file system exported.
4143 * Note we must ensure that layouttype is set in any non-error case.
4144 */
4145static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
4146 uint32_t *layouttype)
4147{
4148 int status = 0;
4149
4150 dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
4151 if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
4152 return -EIO;
4153 if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
4154 status = decode_first_pnfs_layout_type(xdr, layouttype);
4155 bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
4156 } else
4157 *layouttype = 0;
4158 return status;
4159}
3871 4160
3872static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) 4161static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
3873{ 4162{
@@ -3894,6 +4183,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
3894 if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) 4183 if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
3895 goto xdr_error; 4184 goto xdr_error;
3896 fsinfo->wtpref = fsinfo->wtmax; 4185 fsinfo->wtpref = fsinfo->wtmax;
4186 status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
4187 if (status != 0)
4188 goto xdr_error;
4189 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
4190 if (status != 0)
4191 goto xdr_error;
3897 4192
3898 status = verify_attr_len(xdr, savep, attrlen); 4193 status = verify_attr_len(xdr, savep, attrlen);
3899xdr_error: 4194xdr_error:
@@ -3950,13 +4245,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3950 __be32 *p; 4245 __be32 *p;
3951 uint32_t namelen, type; 4246 uint32_t namelen, type;
3952 4247
3953 p = xdr_inline_decode(xdr, 32); 4248 p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
3954 if (unlikely(!p)) 4249 if (unlikely(!p))
3955 goto out_overflow; 4250 goto out_overflow;
3956 p = xdr_decode_hyper(p, &offset); 4251 p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
3957 p = xdr_decode_hyper(p, &length); 4252 p = xdr_decode_hyper(p, &length);
3958 type = be32_to_cpup(p++); 4253 type = be32_to_cpup(p++); /* 4 byte read */
3959 if (fl != NULL) { 4254 if (fl != NULL) { /* manipulate file lock */
3960 fl->fl_start = (loff_t)offset; 4255 fl->fl_start = (loff_t)offset;
3961 fl->fl_end = fl->fl_start + (loff_t)length - 1; 4256 fl->fl_end = fl->fl_start + (loff_t)length - 1;
3962 if (length == ~(uint64_t)0) 4257 if (length == ~(uint64_t)0)
@@ -3966,9 +4261,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3966 fl->fl_type = F_RDLCK; 4261 fl->fl_type = F_RDLCK;
3967 fl->fl_pid = 0; 4262 fl->fl_pid = 0;
3968 } 4263 }
3969 p = xdr_decode_hyper(p, &clientid); 4264 p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
3970 namelen = be32_to_cpup(p); 4265 namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
3971 p = xdr_inline_decode(xdr, namelen); 4266 p = xdr_inline_decode(xdr, namelen); /* variable size field */
3972 if (likely(p)) 4267 if (likely(p))
3973 return -NFS4ERR_DENIED; 4268 return -NFS4ERR_DENIED;
3974out_overflow: 4269out_overflow:
@@ -4200,12 +4495,9 @@ out_overflow:
4200static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) 4495static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
4201{ 4496{
4202 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 4497 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
4203 struct page *page = *rcvbuf->pages;
4204 struct kvec *iov = rcvbuf->head; 4498 struct kvec *iov = rcvbuf->head;
4205 size_t hdrlen; 4499 size_t hdrlen;
4206 u32 recvd, pglen = rcvbuf->page_len; 4500 u32 recvd, pglen = rcvbuf->page_len;
4207 __be32 *end, *entry, *p, *kaddr;
4208 unsigned int nr = 0;
4209 int status; 4501 int status;
4210 4502
4211 status = decode_op_hdr(xdr, OP_READDIR); 4503 status = decode_op_hdr(xdr, OP_READDIR);
@@ -4225,71 +4517,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
4225 pglen = recvd; 4517 pglen = recvd;
4226 xdr_read_pages(xdr, pglen); 4518 xdr_read_pages(xdr, pglen);
4227 4519
4228 BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE); 4520
4229 kaddr = p = kmap_atomic(page, KM_USER0);
4230 end = p + ((pglen + readdir->pgbase) >> 2);
4231 entry = p;
4232
4233 /* Make sure the packet actually has a value_follows and EOF entry */
4234 if ((entry + 1) > end)
4235 goto short_pkt;
4236
4237 for (; *p++; nr++) {
4238 u32 len, attrlen, xlen;
4239 if (end - p < 3)
4240 goto short_pkt;
4241 dprintk("cookie = %Lu, ", *((unsigned long long *)p));
4242 p += 2; /* cookie */
4243 len = ntohl(*p++); /* filename length */
4244 if (len > NFS4_MAXNAMLEN) {
4245 dprintk("NFS: giant filename in readdir (len 0x%x)\n",
4246 len);
4247 goto err_unmap;
4248 }
4249 xlen = XDR_QUADLEN(len);
4250 if (end - p < xlen + 1)
4251 goto short_pkt;
4252 dprintk("filename = %*s\n", len, (char *)p);
4253 p += xlen;
4254 len = ntohl(*p++); /* bitmap length */
4255 if (end - p < len + 1)
4256 goto short_pkt;
4257 p += len;
4258 attrlen = XDR_QUADLEN(ntohl(*p++));
4259 if (end - p < attrlen + 2)
4260 goto short_pkt;
4261 p += attrlen; /* attributes */
4262 entry = p;
4263 }
4264 /*
4265 * Apparently some server sends responses that are a valid size, but
4266 * contain no entries, and have value_follows==0 and EOF==0. For
4267 * those, just set the EOF marker.
4268 */
4269 if (!nr && entry[1] == 0) {
4270 dprintk("NFS: readdir reply truncated!\n");
4271 entry[1] = 1;
4272 }
4273out:
4274 kunmap_atomic(kaddr, KM_USER0);
4275 return 0; 4521 return 0;
4276short_pkt:
4277 /*
4278 * When we get a short packet there are 2 possibilities. We can
4279 * return an error, or fix up the response to look like a valid
4280 * response and return what we have so far. If there are no
4281 * entries and the packet was short, then return -EIO. If there
4282 * are valid entries in the response, return them and pretend that
4283 * the call was successful, but incomplete. The caller can retry the
4284 * readdir starting at the last cookie.
4285 */
4286 dprintk("%s: short packet at entry %d\n", __func__, nr);
4287 entry[0] = entry[1] = 0;
4288 if (nr)
4289 goto out;
4290err_unmap:
4291 kunmap_atomic(kaddr, KM_USER0);
4292 return -errno_NFSERR_IO;
4293} 4522}
4294 4523
4295static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) 4524static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -4299,7 +4528,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
4299 size_t hdrlen; 4528 size_t hdrlen;
4300 u32 len, recvd; 4529 u32 len, recvd;
4301 __be32 *p; 4530 __be32 *p;
4302 char *kaddr;
4303 int status; 4531 int status;
4304 4532
4305 status = decode_op_hdr(xdr, OP_READLINK); 4533 status = decode_op_hdr(xdr, OP_READLINK);
@@ -4330,9 +4558,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
4330 * and and null-terminate the text (the VFS expects 4558 * and and null-terminate the text (the VFS expects
4331 * null-termination). 4559 * null-termination).
4332 */ 4560 */
4333 kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); 4561 xdr_terminate_string(rcvbuf, len);
4334 kaddr[len+rcvbuf->page_base] = '\0';
4335 kunmap_atomic(kaddr, KM_USER0);
4336 return 0; 4562 return 0;
4337out_overflow: 4563out_overflow:
4338 print_overflow_msg(__func__, xdr); 4564 print_overflow_msg(__func__, xdr);
@@ -4668,7 +4894,6 @@ static int decode_sequence(struct xdr_stream *xdr,
4668 struct rpc_rqst *rqstp) 4894 struct rpc_rqst *rqstp)
4669{ 4895{
4670#if defined(CONFIG_NFS_V4_1) 4896#if defined(CONFIG_NFS_V4_1)
4671 struct nfs4_slot *slot;
4672 struct nfs4_sessionid id; 4897 struct nfs4_sessionid id;
4673 u32 dummy; 4898 u32 dummy;
4674 int status; 4899 int status;
@@ -4700,15 +4925,14 @@ static int decode_sequence(struct xdr_stream *xdr,
4700 goto out_overflow; 4925 goto out_overflow;
4701 4926
4702 /* seqid */ 4927 /* seqid */
4703 slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
4704 dummy = be32_to_cpup(p++); 4928 dummy = be32_to_cpup(p++);
4705 if (dummy != slot->seq_nr) { 4929 if (dummy != res->sr_slot->seq_nr) {
4706 dprintk("%s Invalid sequence number\n", __func__); 4930 dprintk("%s Invalid sequence number\n", __func__);
4707 goto out_err; 4931 goto out_err;
4708 } 4932 }
4709 /* slot id */ 4933 /* slot id */
4710 dummy = be32_to_cpup(p++); 4934 dummy = be32_to_cpup(p++);
4711 if (dummy != res->sr_slotid) { 4935 if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) {
4712 dprintk("%s Invalid slot id\n", __func__); 4936 dprintk("%s Invalid slot id\n", __func__);
4713 goto out_err; 4937 goto out_err;
4714 } 4938 }
@@ -4731,6 +4955,134 @@ out_overflow:
4731#endif /* CONFIG_NFS_V4_1 */ 4955#endif /* CONFIG_NFS_V4_1 */
4732} 4956}
4733 4957
4958#if defined(CONFIG_NFS_V4_1)
4959
4960static int decode_getdeviceinfo(struct xdr_stream *xdr,
4961 struct pnfs_device *pdev)
4962{
4963 __be32 *p;
4964 uint32_t len, type;
4965 int status;
4966
4967 status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
4968 if (status) {
4969 if (status == -ETOOSMALL) {
4970 p = xdr_inline_decode(xdr, 4);
4971 if (unlikely(!p))
4972 goto out_overflow;
4973 pdev->mincount = be32_to_cpup(p);
4974 dprintk("%s: Min count too small. mincnt = %u\n",
4975 __func__, pdev->mincount);
4976 }
4977 return status;
4978 }
4979
4980 p = xdr_inline_decode(xdr, 8);
4981 if (unlikely(!p))
4982 goto out_overflow;
4983 type = be32_to_cpup(p++);
4984 if (type != pdev->layout_type) {
4985 dprintk("%s: layout mismatch req: %u pdev: %u\n",
4986 __func__, pdev->layout_type, type);
4987 return -EINVAL;
4988 }
4989 /*
4990 * Get the length of the opaque device_addr4. xdr_read_pages places
4991 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
4992 * and places the remaining xdr data in xdr_buf->tail
4993 */
4994 pdev->mincount = be32_to_cpup(p);
4995 xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
4996
4997 /* Parse notification bitmap, verifying that it is zero. */
4998 p = xdr_inline_decode(xdr, 4);
4999 if (unlikely(!p))
5000 goto out_overflow;
5001 len = be32_to_cpup(p);
5002 if (len) {
5003 int i;
5004
5005 p = xdr_inline_decode(xdr, 4 * len);
5006 if (unlikely(!p))
5007 goto out_overflow;
5008 for (i = 0; i < len; i++, p++) {
5009 if (be32_to_cpup(p)) {
5010 dprintk("%s: notifications not supported\n",
5011 __func__);
5012 return -EIO;
5013 }
5014 }
5015 }
5016 return 0;
5017out_overflow:
5018 print_overflow_msg(__func__, xdr);
5019 return -EIO;
5020}
5021
5022static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
5023 struct nfs4_layoutget_res *res)
5024{
5025 __be32 *p;
5026 int status;
5027 u32 layout_count;
5028
5029 status = decode_op_hdr(xdr, OP_LAYOUTGET);
5030 if (status)
5031 return status;
5032 p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
5033 if (unlikely(!p))
5034 goto out_overflow;
5035 res->return_on_close = be32_to_cpup(p++);
5036 p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
5037 layout_count = be32_to_cpup(p);
5038 if (!layout_count) {
5039 dprintk("%s: server responded with empty layout array\n",
5040 __func__);
5041 return -EINVAL;
5042 }
5043
5044 p = xdr_inline_decode(xdr, 24);
5045 if (unlikely(!p))
5046 goto out_overflow;
5047 p = xdr_decode_hyper(p, &res->range.offset);
5048 p = xdr_decode_hyper(p, &res->range.length);
5049 res->range.iomode = be32_to_cpup(p++);
5050 res->type = be32_to_cpup(p++);
5051
5052 status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
5053 if (unlikely(status))
5054 return status;
5055
5056 dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
5057 __func__,
5058 (unsigned long)res->range.offset,
5059 (unsigned long)res->range.length,
5060 res->range.iomode,
5061 res->type,
5062 res->layout.len);
5063
5064 /* nfs4_proc_layoutget allocated a single page */
5065 if (res->layout.len > PAGE_SIZE)
5066 return -ENOMEM;
5067 memcpy(res->layout.buf, p, res->layout.len);
5068
5069 if (layout_count > 1) {
5070 /* We only handle a length one array at the moment. Any
5071 * further entries are just ignored. Note that this means
5072 * the client may see a response that is less than the
5073 * minimum it requested.
5074 */
5075 dprintk("%s: server responded with %d layouts, dropping tail\n",
5076 __func__, layout_count);
5077 }
5078
5079 return 0;
5080out_overflow:
5081 print_overflow_msg(__func__, xdr);
5082 return -EIO;
5083}
5084#endif /* CONFIG_NFS_V4_1 */
5085
4734/* 5086/*
4735 * END OF "GENERIC" DECODE ROUTINES. 5087 * END OF "GENERIC" DECODE ROUTINES.
4736 */ 5088 */
@@ -4873,7 +5225,7 @@ out:
4873/* 5225/*
4874 * Decode RENAME response 5226 * Decode RENAME response
4875 */ 5227 */
4876static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res) 5228static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
4877{ 5229{
4878 struct xdr_stream xdr; 5230 struct xdr_stream xdr;
4879 struct compound_hdr hdr; 5231 struct compound_hdr hdr;
@@ -5758,25 +6110,84 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
5758 status = decode_reclaim_complete(&xdr, (void *)NULL); 6110 status = decode_reclaim_complete(&xdr, (void *)NULL);
5759 return status; 6111 return status;
5760} 6112}
6113
6114/*
6115 * Decode GETDEVINFO response
6116 */
6117static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
6118 struct nfs4_getdeviceinfo_res *res)
6119{
6120 struct xdr_stream xdr;
6121 struct compound_hdr hdr;
6122 int status;
6123
6124 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
6125 status = decode_compound_hdr(&xdr, &hdr);
6126 if (status != 0)
6127 goto out;
6128 status = decode_sequence(&xdr, &res->seq_res, rqstp);
6129 if (status != 0)
6130 goto out;
6131 status = decode_getdeviceinfo(&xdr, res->pdev);
6132out:
6133 return status;
6134}
6135
6136/*
6137 * Decode LAYOUTGET response
6138 */
6139static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
6140 struct nfs4_layoutget_res *res)
6141{
6142 struct xdr_stream xdr;
6143 struct compound_hdr hdr;
6144 int status;
6145
6146 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
6147 status = decode_compound_hdr(&xdr, &hdr);
6148 if (status)
6149 goto out;
6150 status = decode_sequence(&xdr, &res->seq_res, rqstp);
6151 if (status)
6152 goto out;
6153 status = decode_putfh(&xdr);
6154 if (status)
6155 goto out;
6156 status = decode_layoutget(&xdr, rqstp, res);
6157out:
6158 return status;
6159}
5761#endif /* CONFIG_NFS_V4_1 */ 6160#endif /* CONFIG_NFS_V4_1 */
5762 6161
5763__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 6162__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6163 struct nfs_server *server, int plus)
5764{ 6164{
5765 uint32_t bitmap[2] = {0}; 6165 uint32_t bitmap[2] = {0};
5766 uint32_t len; 6166 uint32_t len;
5767 6167 __be32 *p = xdr_inline_decode(xdr, 4);
5768 if (!*p++) { 6168 if (unlikely(!p))
5769 if (!*p) 6169 goto out_overflow;
6170 if (!ntohl(*p++)) {
6171 p = xdr_inline_decode(xdr, 4);
6172 if (unlikely(!p))
6173 goto out_overflow;
6174 if (!ntohl(*p++))
5770 return ERR_PTR(-EAGAIN); 6175 return ERR_PTR(-EAGAIN);
5771 entry->eof = 1; 6176 entry->eof = 1;
5772 return ERR_PTR(-EBADCOOKIE); 6177 return ERR_PTR(-EBADCOOKIE);
5773 } 6178 }
5774 6179
6180 p = xdr_inline_decode(xdr, 12);
6181 if (unlikely(!p))
6182 goto out_overflow;
5775 entry->prev_cookie = entry->cookie; 6183 entry->prev_cookie = entry->cookie;
5776 p = xdr_decode_hyper(p, &entry->cookie); 6184 p = xdr_decode_hyper(p, &entry->cookie);
5777 entry->len = ntohl(*p++); 6185 entry->len = ntohl(*p++);
6186
6187 p = xdr_inline_decode(xdr, entry->len);
6188 if (unlikely(!p))
6189 goto out_overflow;
5778 entry->name = (const char *) p; 6190 entry->name = (const char *) p;
5779 p += XDR_QUADLEN(entry->len);
5780 6191
5781 /* 6192 /*
5782 * In case the server doesn't return an inode number, 6193 * In case the server doesn't return an inode number,
@@ -5784,32 +6195,33 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
5784 * since glibc seems to choke on it...) 6195 * since glibc seems to choke on it...)
5785 */ 6196 */
5786 entry->ino = 1; 6197 entry->ino = 1;
6198 entry->fattr->valid = 0;
5787 6199
5788 len = ntohl(*p++); /* bitmap length */ 6200 if (decode_attr_bitmap(xdr, bitmap) < 0)
5789 if (len-- > 0) { 6201 goto out_overflow;
5790 bitmap[0] = ntohl(*p++); 6202
5791 if (len-- > 0) { 6203 if (decode_attr_length(xdr, &len, &p) < 0)
5792 bitmap[1] = ntohl(*p++); 6204 goto out_overflow;
5793 p += len; 6205
5794 } 6206 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
5795 } 6207 goto out_overflow;
5796 len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */ 6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
5797 if (len > 0) { 6209 entry->ino = entry->fattr->fileid;
5798 if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) { 6210
5799 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; 6211 if (verify_attr_len(xdr, p, len) < 0)
5800 /* Ignore the return value of rdattr_error for now */ 6212 goto out_overflow;
5801 p++; 6213
5802 len--; 6214 p = xdr_inline_peek(xdr, 8);
5803 } 6215 if (p != NULL)
5804 if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) 6216 entry->eof = !p[0] && p[1];
5805 xdr_decode_hyper(p, &entry->ino); 6217 else
5806 else if (bitmap[0] == FATTR4_WORD0_FILEID) 6218 entry->eof = 0;
5807 xdr_decode_hyper(p, &entry->ino);
5808 p += len;
5809 }
5810 6219
5811 entry->eof = !p[0] && p[1];
5812 return p; 6220 return p;
6221
6222out_overflow:
6223 print_overflow_msg(__func__, xdr);
6224 return ERR_PTR(-EIO);
5813} 6225}
5814 6226
5815/* 6227/*
@@ -5936,6 +6348,8 @@ struct rpc_procinfo nfs4_procedures[] = {
5936 PROC(SEQUENCE, enc_sequence, dec_sequence), 6348 PROC(SEQUENCE, enc_sequence, dec_sequence),
5937 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 6349 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
5938 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6350 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6351 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6352 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
5939#endif /* CONFIG_NFS_V4_1 */ 6353#endif /* CONFIG_NFS_V4_1 */
5940}; 6354};
5941 6355
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index df101d9f546a..903908a20023 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -3,9 +3,10 @@
3 * 3 *
4 * Allow an NFS filesystem to be mounted as root. The way this works is: 4 * Allow an NFS filesystem to be mounted as root. The way this works is:
5 * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. 5 * (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
6 * (2) Handle RPC negotiation with the system which replied to RARP or 6 * (2) Construct the device string and the options string using DHCP
7 * was reported as a boot server by BOOTP or manually. 7 * option 17 and/or kernel command line options.
8 * (3) The actual mounting is done later, when init() is running. 8 * (3) When mount_root() sets up the root file system, pass these strings
9 * to the NFS client's regular mount interface via sys_mount().
9 * 10 *
10 * 11 *
11 * Changes: 12 * Changes:
@@ -65,470 +66,245 @@
65 * Hua Qin : Support for mounting root file system via 66 * Hua Qin : Support for mounting root file system via
66 * NFS over TCP. 67 * NFS over TCP.
67 * Fabian Frederick: Option parser rebuilt (using parser lib) 68 * Fabian Frederick: Option parser rebuilt (using parser lib)
68*/ 69 * Chuck Lever : Use super.c's text-based mount option parsing
70 * Chuck Lever : Add "nfsrootdebug".
71 */
69 72
70#include <linux/types.h> 73#include <linux/types.h>
71#include <linux/string.h> 74#include <linux/string.h>
72#include <linux/kernel.h>
73#include <linux/time.h>
74#include <linux/fs.h>
75#include <linux/init.h> 75#include <linux/init.h>
76#include <linux/sunrpc/clnt.h>
77#include <linux/sunrpc/xprtsock.h>
78#include <linux/nfs.h> 76#include <linux/nfs.h>
79#include <linux/nfs_fs.h> 77#include <linux/nfs_fs.h>
80#include <linux/nfs_mount.h>
81#include <linux/in.h>
82#include <linux/major.h>
83#include <linux/utsname.h> 78#include <linux/utsname.h>
84#include <linux/inet.h>
85#include <linux/root_dev.h> 79#include <linux/root_dev.h>
86#include <net/ipconfig.h> 80#include <net/ipconfig.h>
87#include <linux/parser.h>
88 81
89#include "internal.h" 82#include "internal.h"
90 83
91/* Define this to allow debugging output */
92#undef NFSROOT_DEBUG
93#define NFSDBG_FACILITY NFSDBG_ROOT 84#define NFSDBG_FACILITY NFSDBG_ROOT
94 85
95/* Default port to use if server is not running a portmapper */
96#define NFS_MNT_PORT 627
97
98/* Default path we try to mount. "%s" gets replaced by our IP address */ 86/* Default path we try to mount. "%s" gets replaced by our IP address */
99#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
100 88
101/* Parameters passed from the kernel command line */ 89/* Parameters passed from the kernel command line */
102static char nfs_root_name[256] __initdata = ""; 90static char nfs_root_parms[256] __initdata = "";
91
92/* Text-based mount options passed to super.c */
93static char nfs_root_options[256] __initdata = "";
103 94
104/* Address of NFS server */ 95/* Address of NFS server */
105static __be32 servaddr __initdata = 0; 96static __be32 servaddr __initdata = htonl(INADDR_NONE);
106 97
107/* Name of directory to mount */ 98/* Name of directory to mount */
108static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, }; 99static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
109
110/* NFS-related data */
111static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
112static int nfs_port __initdata = 0; /* Port to connect to for NFS */
113static int mount_port __initdata = 0; /* Mount daemon port number */
114
115
116/***************************************************************************
117
118 Parsing of options
119
120 ***************************************************************************/
121
122enum {
123 /* Options that take integer arguments */
124 Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
125 Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
126 /* Options that take no arguments */
127 Opt_soft, Opt_hard, Opt_intr,
128 Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac,
129 Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
130 Opt_acl, Opt_noacl,
131 /* Error token */
132 Opt_err
133};
134
135static const match_table_t tokens __initconst = {
136 {Opt_port, "port=%u"},
137 {Opt_rsize, "rsize=%u"},
138 {Opt_wsize, "wsize=%u"},
139 {Opt_timeo, "timeo=%u"},
140 {Opt_retrans, "retrans=%u"},
141 {Opt_acregmin, "acregmin=%u"},
142 {Opt_acregmax, "acregmax=%u"},
143 {Opt_acdirmin, "acdirmin=%u"},
144 {Opt_acdirmax, "acdirmax=%u"},
145 {Opt_soft, "soft"},
146 {Opt_hard, "hard"},
147 {Opt_intr, "intr"},
148 {Opt_nointr, "nointr"},
149 {Opt_posix, "posix"},
150 {Opt_noposix, "noposix"},
151 {Opt_cto, "cto"},
152 {Opt_nocto, "nocto"},
153 {Opt_ac, "ac"},
154 {Opt_noac, "noac"},
155 {Opt_lock, "lock"},
156 {Opt_nolock, "nolock"},
157 {Opt_v2, "nfsvers=2"},
158 {Opt_v2, "v2"},
159 {Opt_v3, "nfsvers=3"},
160 {Opt_v3, "v3"},
161 {Opt_udp, "proto=udp"},
162 {Opt_udp, "udp"},
163 {Opt_tcp, "proto=tcp"},
164 {Opt_tcp, "tcp"},
165 {Opt_acl, "acl"},
166 {Opt_noacl, "noacl"},
167 {Opt_err, NULL}
168
169};
170 100
101/* server:export path string passed to super.c */
102static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
103
104#ifdef RPC_DEBUG
171/* 105/*
172 * Parse option string. 106 * When the "nfsrootdebug" kernel command line option is specified,
107 * enable debugging messages for NFSROOT.
173 */ 108 */
174 109static int __init nfs_root_debug(char *__unused)
175static int __init root_nfs_parse(char *name, char *buf)
176{ 110{
177 111 nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
178 char *p;
179 substring_t args[MAX_OPT_ARGS];
180 int option;
181
182 if (!name)
183 return 1;
184
185 /* Set the NFS remote path */
186 p = strsep(&name, ",");
187 if (p[0] != '\0' && strcmp(p, "default") != 0)
188 strlcpy(buf, p, NFS_MAXPATHLEN);
189
190 while ((p = strsep (&name, ",")) != NULL) {
191 int token;
192 if (!*p)
193 continue;
194 token = match_token(p, tokens, args);
195
196 /* %u tokens only. Beware if you add new tokens! */
197 if (token < Opt_soft && match_int(&args[0], &option))
198 return 0;
199 switch (token) {
200 case Opt_port:
201 nfs_port = option;
202 break;
203 case Opt_rsize:
204 nfs_data.rsize = option;
205 break;
206 case Opt_wsize:
207 nfs_data.wsize = option;
208 break;
209 case Opt_timeo:
210 nfs_data.timeo = option;
211 break;
212 case Opt_retrans:
213 nfs_data.retrans = option;
214 break;
215 case Opt_acregmin:
216 nfs_data.acregmin = option;
217 break;
218 case Opt_acregmax:
219 nfs_data.acregmax = option;
220 break;
221 case Opt_acdirmin:
222 nfs_data.acdirmin = option;
223 break;
224 case Opt_acdirmax:
225 nfs_data.acdirmax = option;
226 break;
227 case Opt_soft:
228 nfs_data.flags |= NFS_MOUNT_SOFT;
229 break;
230 case Opt_hard:
231 nfs_data.flags &= ~NFS_MOUNT_SOFT;
232 break;
233 case Opt_intr:
234 case Opt_nointr:
235 break;
236 case Opt_posix:
237 nfs_data.flags |= NFS_MOUNT_POSIX;
238 break;
239 case Opt_noposix:
240 nfs_data.flags &= ~NFS_MOUNT_POSIX;
241 break;
242 case Opt_cto:
243 nfs_data.flags &= ~NFS_MOUNT_NOCTO;
244 break;
245 case Opt_nocto:
246 nfs_data.flags |= NFS_MOUNT_NOCTO;
247 break;
248 case Opt_ac:
249 nfs_data.flags &= ~NFS_MOUNT_NOAC;
250 break;
251 case Opt_noac:
252 nfs_data.flags |= NFS_MOUNT_NOAC;
253 break;
254 case Opt_lock:
255 nfs_data.flags &= ~NFS_MOUNT_NONLM;
256 break;
257 case Opt_nolock:
258 nfs_data.flags |= NFS_MOUNT_NONLM;
259 break;
260 case Opt_v2:
261 nfs_data.flags &= ~NFS_MOUNT_VER3;
262 break;
263 case Opt_v3:
264 nfs_data.flags |= NFS_MOUNT_VER3;
265 break;
266 case Opt_udp:
267 nfs_data.flags &= ~NFS_MOUNT_TCP;
268 break;
269 case Opt_tcp:
270 nfs_data.flags |= NFS_MOUNT_TCP;
271 break;
272 case Opt_acl:
273 nfs_data.flags &= ~NFS_MOUNT_NOACL;
274 break;
275 case Opt_noacl:
276 nfs_data.flags |= NFS_MOUNT_NOACL;
277 break;
278 default:
279 printk(KERN_WARNING "Root-NFS: unknown "
280 "option: %s\n", p);
281 return 0;
282 }
283 }
284
285 return 1; 112 return 1;
286} 113}
287 114
115__setup("nfsrootdebug", nfs_root_debug);
116#endif
117
288/* 118/*
289 * Prepare the NFS data structure and parse all options. 119 * Parse NFS server and directory information passed on the kernel
120 * command line.
121 *
122 * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
123 *
124 * If there is a "%s" token in the <root-dir> string, it is replaced
125 * by the ASCII-representation of the client's IP address.
290 */ 126 */
291static int __init root_nfs_name(char *name) 127static int __init nfs_root_setup(char *line)
292{ 128{
293 static char buf[NFS_MAXPATHLEN] __initdata; 129 ROOT_DEV = Root_NFS;
294 char *cp; 130
295 131 if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
296 /* Set some default values */ 132 strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
297 memset(&nfs_data, 0, sizeof(nfs_data)); 133 } else {
298 nfs_port = -1; 134 size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
299 nfs_data.version = NFS_MOUNT_VERSION; 135 if (n >= sizeof(nfs_root_parms))
300 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ 136 line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
301 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; 137 sprintf(nfs_root_parms, NFS_ROOT, line);
302 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
303 nfs_data.acregmin = NFS_DEF_ACREGMIN;
304 nfs_data.acregmax = NFS_DEF_ACREGMAX;
305 nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
306 nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
307 strcpy(buf, NFS_ROOT);
308
309 /* Process options received from the remote server */
310 root_nfs_parse(root_server_path, buf);
311
312 /* Override them by options set on kernel command-line */
313 root_nfs_parse(name, buf);
314
315 cp = utsname()->nodename;
316 if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
317 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
318 return -1;
319 } 138 }
320 sprintf(nfs_export_path, buf, cp); 139
140 /*
141 * Extract the IP address of the NFS server containing our
142 * root file system, if one was specified.
143 *
144 * Note: root_nfs_parse_addr() removes the server-ip from
145 * nfs_root_parms, if it exists.
146 */
147 root_server_addr = root_nfs_parse_addr(nfs_root_parms);
321 148
322 return 1; 149 return 1;
323} 150}
324 151
152__setup("nfsroot=", nfs_root_setup);
325 153
326/* 154static int __init root_nfs_copy(char *dest, const char *src,
327 * Get NFS server address. 155 const size_t destlen)
328 */
329static int __init root_nfs_addr(void)
330{ 156{
331 if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) { 157 if (strlcpy(dest, src, destlen) > destlen)
332 printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n");
333 return -1; 158 return -1;
334 } 159 return 0;
160}
335 161
336 snprintf(nfs_data.hostname, sizeof(nfs_data.hostname), 162static int __init root_nfs_cat(char *dest, const char *src,
337 "%pI4", &servaddr); 163 const size_t destlen)
164{
165 if (strlcat(dest, src, destlen) > destlen)
166 return -1;
338 return 0; 167 return 0;
339} 168}
340 169
341/* 170/*
342 * Tell the user what's going on. 171 * Parse out root export path and mount options from
172 * passed-in string @incoming.
173 *
174 * Copy the export path into @exppath.
343 */ 175 */
344#ifdef NFSROOT_DEBUG 176static int __init root_nfs_parse_options(char *incoming, char *exppath,
345static void __init root_nfs_print(void) 177 const size_t exppathlen)
346{ 178{
347 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", 179 char *p;
348 nfs_export_path, nfs_data.hostname);
349 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
350 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
351 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
352 nfs_data.acregmin, nfs_data.acregmax,
353 nfs_data.acdirmin, nfs_data.acdirmax);
354 printk(KERN_NOTICE "Root-NFS: nfsd port = %d, mountd port = %d, flags = %08x\n",
355 nfs_port, mount_port, nfs_data.flags);
356}
357#endif
358
359 180
360static int __init root_nfs_init(void) 181 /*
361{ 182 * Set the NFS remote path
362#ifdef NFSROOT_DEBUG 183 */
363 nfs_debug |= NFSDBG_ROOT; 184 p = strsep(&incoming, ",");
364#endif 185 if (*p != '\0' && strcmp(p, "default") != 0)
186 if (root_nfs_copy(exppath, p, exppathlen))
187 return -1;
365 188
366 /* 189 /*
367 * Decode the root directory path name and NFS options from 190 * @incoming now points to the rest of the string; if it
368 * the kernel command line. This has to go here in order to 191 * contains something, append it to our root options buffer
369 * be able to use the client IP address for the remote root
370 * directory (necessary for pure RARP booting).
371 */ 192 */
372 if (root_nfs_name(nfs_root_name) < 0 || 193 if (incoming != NULL && *incoming != '\0')
373 root_nfs_addr() < 0) 194 if (root_nfs_cat(nfs_root_options, incoming,
374 return -1; 195 sizeof(nfs_root_options)))
196 return -1;
375 197
376#ifdef NFSROOT_DEBUG 198 /*
377 root_nfs_print(); 199 * Possibly prepare for more options to be appended
378#endif 200 */
201 if (nfs_root_options[0] != '\0' &&
202 nfs_root_options[strlen(nfs_root_options)] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
379 206
380 return 0; 207 return 0;
381} 208}
382 209
383
384/* 210/*
385 * Parse NFS server and directory information passed on the kernel 211 * Decode the export directory path name and NFS options from
386 * command line. 212 * the kernel command line. This has to be done late in order to
213 * use a dynamically acquired client IP address for the remote
214 * root directory path.
215 *
216 * Returns zero if successful; otherwise -1 is returned.
387 */ 217 */
388static int __init nfs_root_setup(char *line) 218static int __init root_nfs_data(char *cmdline)
389{ 219{
390 ROOT_DEV = Root_NFS; 220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
391 if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { 221 int len, retval = -1;
392 strlcpy(nfs_root_name, line, sizeof(nfs_root_name)); 222 char *tmp = NULL;
393 } else { 223 const size_t tmplen = sizeof(nfs_export_path);
394 int n = strlen(line) + sizeof(NFS_ROOT) - 1; 224
395 if (n >= sizeof(nfs_root_name)) 225 tmp = kzalloc(tmplen, GFP_KERNEL);
396 line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0'; 226 if (tmp == NULL)
397 sprintf(nfs_root_name, NFS_ROOT, line); 227 goto out_nomem;
228 strcpy(tmp, NFS_ROOT);
229
230 if (root_server_path[0] != '\0') {
231 dprintk("Root-NFS: DHCPv4 option 17: %s\n",
232 root_server_path);
233 if (root_nfs_parse_options(root_server_path, tmp, tmplen))
234 goto out_optionstoolong;
398 } 235 }
399 root_server_addr = root_nfs_parse_addr(nfs_root_name);
400 return 1;
401}
402
403__setup("nfsroot=", nfs_root_setup);
404
405/***************************************************************************
406 236
407 Routines to actually mount the root directory 237 if (cmdline[0] != '\0') {
238 dprintk("Root-NFS: nfsroot=%s\n", cmdline);
239 if (root_nfs_parse_options(cmdline, tmp, tmplen))
240 goto out_optionstoolong;
241 }
408 242
409 ***************************************************************************/ 243 /*
244 * Append mandatory options for nfsroot so they override
245 * what has come before
246 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
248 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option,
250 sizeof(nfs_root_options)))
251 goto out_optionstoolong;
410 252
411/* 253 /*
412 * Construct sockaddr_in from address and port number. 254 * Set up nfs_root_device. For NFS mounts, this looks like
413 */ 255 *
414static inline void 256 * server:/path
415set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) 257 *
416{ 258 * At this point, utsname()->nodename contains our local
417 sin->sin_family = AF_INET; 259 * IP address or hostname, set by ipconfig. If "%s" exists
418 sin->sin_addr.s_addr = addr; 260 * in tmp, substitute the nodename, then shovel the whole
419 sin->sin_port = port; 261 * mess into nfs_root_device.
420} 262 */
263 len = snprintf(nfs_export_path, sizeof(nfs_export_path),
264 tmp, utsname()->nodename);
265 if (len > (int)sizeof(nfs_export_path))
266 goto out_devnametoolong;
267 len = snprintf(nfs_root_device, sizeof(nfs_root_device),
268 "%pI4:%s", &servaddr, nfs_export_path);
269 if (len > (int)sizeof(nfs_root_device))
270 goto out_devnametoolong;
421 271
422/* 272 retval = 0;
423 * Query server portmapper for the port of a daemon program.
424 */
425static int __init root_nfs_getport(int program, int version, int proto)
426{
427 struct sockaddr_in sin;
428 273
429 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n", 274out:
430 program, version, &servaddr); 275 kfree(tmp);
431 set_sockaddr(&sin, servaddr, 0); 276 return retval;
432 return rpcb_getport_sync(&sin, program, version, proto); 277out_nomem:
278 printk(KERN_ERR "Root-NFS: could not allocate memory\n");
279 goto out;
280out_optionstoolong:
281 printk(KERN_ERR "Root-NFS: mount options string too long\n");
282 goto out;
283out_devnametoolong:
284 printk(KERN_ERR "Root-NFS: root device name too long.\n");
285 goto out;
433} 286}
434 287
435 288/**
436/* 289 * nfs_root_data - Return prepared 'data' for NFSROOT mount
437 * Use portmapper to find mountd and nfsd port numbers if not overriden 290 * @root_device: OUT: address of string containing NFSROOT device
438 * by the user. Use defaults if portmapper is not available. 291 * @root_data: OUT: address of string containing NFSROOT mount options
439 * XXX: Is there any nfs server with no portmapper? 292 *
293 * Returns zero and sets @root_device and @root_data if successful,
294 * otherwise -1 is returned.
440 */ 295 */
441static int __init root_nfs_ports(void) 296int __init nfs_root_data(char **root_device, char **root_data)
442{ 297{
443 int port; 298 servaddr = root_server_addr;
444 int nfsd_ver, mountd_ver; 299 if (servaddr == htonl(INADDR_NONE)) {
445 int nfsd_port, mountd_port; 300 printk(KERN_ERR "Root-NFS: no NFS server address\n");
446 int proto; 301 return -1;
447
448 if (nfs_data.flags & NFS_MOUNT_VER3) {
449 nfsd_ver = NFS3_VERSION;
450 mountd_ver = NFS_MNT3_VERSION;
451 nfsd_port = NFS_PORT;
452 mountd_port = NFS_MNT_PORT;
453 } else {
454 nfsd_ver = NFS2_VERSION;
455 mountd_ver = NFS_MNT_VERSION;
456 nfsd_port = NFS_PORT;
457 mountd_port = NFS_MNT_PORT;
458 }
459
460 proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
461
462 if (nfs_port < 0) {
463 if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) {
464 printk(KERN_ERR "Root-NFS: Unable to get nfsd port "
465 "number from server, using default\n");
466 port = nfsd_port;
467 }
468 nfs_port = port;
469 dprintk("Root-NFS: Portmapper on server returned %d "
470 "as nfsd port\n", port);
471 } 302 }
472 303
473 if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { 304 if (root_nfs_data(nfs_root_parms) < 0)
474 printk(KERN_ERR "Root-NFS: Unable to get mountd port " 305 return -1;
475 "number from server, using default\n");
476 port = mountd_port;
477 }
478 mount_port = port;
479 dprintk("Root-NFS: mountd port is %d\n", port);
480 306
307 *root_device = nfs_root_device;
308 *root_data = nfs_root_options;
481 return 0; 309 return 0;
482} 310}
483
484
485/*
486 * Get a file handle from the server for the directory which is to be
487 * mounted.
488 */
489static int __init root_nfs_get_handle(void)
490{
491 struct sockaddr_in sin;
492 unsigned int auth_flav_len = 0;
493 struct nfs_mount_request request = {
494 .sap = (struct sockaddr *)&sin,
495 .salen = sizeof(sin),
496 .dirpath = nfs_export_path,
497 .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
498 NFS_MNT3_VERSION : NFS_MNT_VERSION,
499 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
500 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
501 .auth_flav_len = &auth_flav_len,
502 };
503 int status = -ENOMEM;
504
505 request.fh = nfs_alloc_fhandle();
506 if (!request.fh)
507 goto out;
508 set_sockaddr(&sin, servaddr, htons(mount_port));
509 status = nfs_mount(&request);
510 if (status < 0)
511 printk(KERN_ERR "Root-NFS: Server returned error %d "
512 "while mounting %s\n", status, nfs_export_path);
513 else {
514 nfs_data.root.size = request.fh->size;
515 memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
516 }
517 nfs_free_fhandle(request.fh);
518out:
519 return status;
520}
521
522/*
523 * Get the NFS port numbers and file handle, and return the prepared 'data'
524 * argument for mount() if everything went OK. Return NULL otherwise.
525 */
526void * __init nfs_root_data(void)
527{
528 if (root_nfs_init() < 0
529 || root_nfs_ports() < 0
530 || root_nfs_get_handle() < 0)
531 return NULL;
532 set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
533 return (void*)&nfs_data;
534}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 919490232e17..137b549e63db 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -65,6 +65,13 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
65 if (req == NULL) 65 if (req == NULL)
66 return ERR_PTR(-ENOMEM); 66 return ERR_PTR(-ENOMEM);
67 67
68 /* get lock context early so we can deal with alloc failures */
69 req->wb_lock_context = nfs_get_lock_context(ctx);
70 if (req->wb_lock_context == NULL) {
71 nfs_page_free(req);
72 return ERR_PTR(-ENOMEM);
73 }
74
68 /* Initialize the request struct. Initially, we assume a 75 /* Initialize the request struct. Initially, we assume a
69 * long write-back delay. This will be adjusted in 76 * long write-back delay. This will be adjusted in
70 * update_nfs_request below if the region is not locked. */ 77 * update_nfs_request below if the region is not locked. */
@@ -79,7 +86,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
79 req->wb_pgbase = offset; 86 req->wb_pgbase = offset;
80 req->wb_bytes = count; 87 req->wb_bytes = count;
81 req->wb_context = get_nfs_open_context(ctx); 88 req->wb_context = get_nfs_open_context(ctx);
82 req->wb_lock_context = nfs_get_lock_context(ctx);
83 kref_init(&req->wb_kref); 89 kref_init(&req->wb_kref);
84 return req; 90 return req;
85} 91}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 000000000000..db773428f95f
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,783 @@
1/*
2 * pNFS functions to call and manage layout drivers.
3 *
4 * Copyright (c) 2002 [year of first publication]
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#include <linux/nfs_fs.h>
31#include "internal.h"
32#include "pnfs.h"
33
34#define NFSDBG_FACILITY NFSDBG_PNFS
35
36/* Locking:
37 *
38 * pnfs_spinlock:
39 * protects pnfs_modules_tbl.
40 */
41static DEFINE_SPINLOCK(pnfs_spinlock);
42
43/*
44 * pnfs_modules_tbl holds all pnfs modules
45 */
46static LIST_HEAD(pnfs_modules_tbl);
47
48/* Return the registered pnfs layout driver module matching given id */
49static struct pnfs_layoutdriver_type *
50find_pnfs_driver_locked(u32 id)
51{
52 struct pnfs_layoutdriver_type *local;
53
54 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
55 if (local->id == id)
56 goto out;
57 local = NULL;
58out:
59 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
60 return local;
61}
62
63static struct pnfs_layoutdriver_type *
64find_pnfs_driver(u32 id)
65{
66 struct pnfs_layoutdriver_type *local;
67
68 spin_lock(&pnfs_spinlock);
69 local = find_pnfs_driver_locked(id);
70 spin_unlock(&pnfs_spinlock);
71 return local;
72}
73
74void
75unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{
77 if (nfss->pnfs_curr_ld) {
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL;
82}
83
84/*
85 * Try to set the server's pnfs module to the pnfs layout type specified by id.
86 * Currently only one pNFS layout driver per filesystem is supported.
87 *
88 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
89 */
90void
91set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
92{
93 struct pnfs_layoutdriver_type *ld_type = NULL;
94
95 if (id == 0)
96 goto out_no_driver;
97 if (!(server->nfs_client->cl_exchange_flags &
98 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
99 printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
100 id, server->nfs_client->cl_exchange_flags);
101 goto out_no_driver;
102 }
103 ld_type = find_pnfs_driver(id);
104 if (!ld_type) {
105 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
106 ld_type = find_pnfs_driver(id);
107 if (!ld_type) {
108 dprintk("%s: No pNFS module found for %u.\n",
109 __func__, id);
110 goto out_no_driver;
111 }
112 }
113 if (!try_module_get(ld_type->owner)) {
114 dprintk("%s: Could not grab reference on module\n", __func__);
115 goto out_no_driver;
116 }
117 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) {
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return;
127
128out_no_driver:
129 dprintk("%s: Using NFSv4 I/O\n", __func__);
130 server->pnfs_curr_ld = NULL;
131}
132
133int
134pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
135{
136 int status = -EINVAL;
137 struct pnfs_layoutdriver_type *tmp;
138
139 if (ld_type->id == 0) {
140 printk(KERN_ERR "%s id 0 is reserved\n", __func__);
141 return status;
142 }
143 if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
144 printk(KERN_ERR "%s Layout driver must provide "
145 "alloc_lseg and free_lseg.\n", __func__);
146 return status;
147 }
148
149 spin_lock(&pnfs_spinlock);
150 tmp = find_pnfs_driver_locked(ld_type->id);
151 if (!tmp) {
152 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
153 status = 0;
154 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
155 ld_type->name);
156 } else {
157 printk(KERN_ERR "%s Module with id %d already loaded!\n",
158 __func__, ld_type->id);
159 }
160 spin_unlock(&pnfs_spinlock);
161
162 return status;
163}
164EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
165
166void
167pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
168{
169 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
170 spin_lock(&pnfs_spinlock);
171 list_del(&ld_type->pnfs_tblid);
172 spin_unlock(&pnfs_spinlock);
173}
174EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
175
176/*
177 * pNFS client layout cache
178 */
179
180static void
181get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
182{
183 assert_spin_locked(&lo->inode->i_lock);
184 lo->refcount++;
185}
186
187static void
188put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
189{
190 assert_spin_locked(&lo->inode->i_lock);
191 BUG_ON(lo->refcount == 0);
192
193 lo->refcount--;
194 if (!lo->refcount) {
195 dprintk("%s: freeing layout cache %p\n", __func__, lo);
196 BUG_ON(!list_empty(&lo->layouts));
197 NFS_I(lo->inode)->layout = NULL;
198 kfree(lo);
199 }
200}
201
202void
203put_layout_hdr(struct inode *inode)
204{
205 spin_lock(&inode->i_lock);
206 put_layout_hdr_locked(NFS_I(inode)->layout);
207 spin_unlock(&inode->i_lock);
208}
209
210static void
211init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
212{
213 INIT_LIST_HEAD(&lseg->fi_list);
214 kref_init(&lseg->kref);
215 lseg->layout = lo;
216}
217
218/* Called without i_lock held, as the free_lseg call may sleep */
219static void
220destroy_lseg(struct kref *kref)
221{
222 struct pnfs_layout_segment *lseg =
223 container_of(kref, struct pnfs_layout_segment, kref);
224 struct inode *ino = lseg->layout->inode;
225
226 dprintk("--> %s\n", __func__);
227 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
228 /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
229 put_layout_hdr(ino);
230}
231
232static void
233put_lseg(struct pnfs_layout_segment *lseg)
234{
235 if (!lseg)
236 return;
237
238 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
239 atomic_read(&lseg->kref.refcount));
240 kref_put(&lseg->kref, destroy_lseg);
241}
242
243static void
244pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
245{
246 struct pnfs_layout_segment *lseg, *next;
247 struct nfs_client *clp;
248
249 dprintk("%s:Begin lo %p\n", __func__, lo);
250
251 assert_spin_locked(&lo->inode->i_lock);
252 list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
253 dprintk("%s: freeing lseg %p\n", __func__, lseg);
254 list_move(&lseg->fi_list, tmp_list);
255 }
256 clp = NFS_SERVER(lo->inode)->nfs_client;
257 spin_lock(&clp->cl_lock);
258 /* List does not take a reference, so no need for put here */
259 list_del_init(&lo->layouts);
260 spin_unlock(&clp->cl_lock);
261 write_seqlock(&lo->seqlock);
262 clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
263 write_sequnlock(&lo->seqlock);
264
265 dprintk("%s:Return\n", __func__);
266}
267
268static void
269pnfs_free_lseg_list(struct list_head *tmp_list)
270{
271 struct pnfs_layout_segment *lseg;
272
273 while (!list_empty(tmp_list)) {
274 lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
275 fi_list);
276 dprintk("%s calling put_lseg on %p\n", __func__, lseg);
277 list_del(&lseg->fi_list);
278 put_lseg(lseg);
279 }
280}
281
282void
283pnfs_destroy_layout(struct nfs_inode *nfsi)
284{
285 struct pnfs_layout_hdr *lo;
286 LIST_HEAD(tmp_list);
287
288 spin_lock(&nfsi->vfs_inode.i_lock);
289 lo = nfsi->layout;
290 if (lo) {
291 pnfs_clear_lseg_list(lo, &tmp_list);
292 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
293 put_layout_hdr_locked(lo);
294 }
295 spin_unlock(&nfsi->vfs_inode.i_lock);
296 pnfs_free_lseg_list(&tmp_list);
297}
298
299/*
300 * Called by the state manger to remove all layouts established under an
301 * expired lease.
302 */
303void
304pnfs_destroy_all_layouts(struct nfs_client *clp)
305{
306 struct pnfs_layout_hdr *lo;
307 LIST_HEAD(tmp_list);
308
309 spin_lock(&clp->cl_lock);
310 list_splice_init(&clp->cl_layouts, &tmp_list);
311 spin_unlock(&clp->cl_lock);
312
313 while (!list_empty(&tmp_list)) {
314 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
315 layouts);
316 dprintk("%s freeing layout for inode %lu\n", __func__,
317 lo->inode->i_ino);
318 pnfs_destroy_layout(NFS_I(lo->inode));
319 }
320}
321
322/* update lo->stateid with new if is more recent
323 *
324 * lo->stateid could be the open stateid, in which case we just use what given.
325 */
326static void
327pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
328 const nfs4_stateid *new)
329{
330 nfs4_stateid *old = &lo->stateid;
331 bool overwrite = false;
332
333 write_seqlock(&lo->seqlock);
334 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
335 memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
336 overwrite = true;
337 else {
338 u32 oldseq, newseq;
339
340 oldseq = be32_to_cpu(old->stateid.seqid);
341 newseq = be32_to_cpu(new->stateid.seqid);
342 if ((int)(newseq - oldseq) > 0)
343 overwrite = true;
344 }
345 if (overwrite)
346 memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
347 write_sequnlock(&lo->seqlock);
348}
349
350static void
351pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
352 struct nfs4_state *state)
353{
354 int seq;
355
356 dprintk("--> %s\n", __func__);
357 write_seqlock(&lo->seqlock);
358 do {
359 seq = read_seqbegin(&state->seqlock);
360 memcpy(lo->stateid.data, state->stateid.data,
361 sizeof(state->stateid.data));
362 } while (read_seqretry(&state->seqlock, seq));
363 set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
364 write_sequnlock(&lo->seqlock);
365 dprintk("<-- %s\n", __func__);
366}
367
368void
369pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
370 struct nfs4_state *open_state)
371{
372 int seq;
373
374 dprintk("--> %s\n", __func__);
375 do {
376 seq = read_seqbegin(&lo->seqlock);
377 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
378 /* This will trigger retry of the read */
379 pnfs_layout_from_open_stateid(lo, open_state);
380 } else
381 memcpy(dst->data, lo->stateid.data,
382 sizeof(lo->stateid.data));
383 } while (read_seqretry(&lo->seqlock, seq));
384 dprintk("<-- %s\n", __func__);
385}
386
387/*
388* Get layout from server.
389* for now, assume that whole file layouts are requested.
390* arg->offset: 0
391* arg->length: all ones
392*/
393static struct pnfs_layout_segment *
394send_layoutget(struct pnfs_layout_hdr *lo,
395 struct nfs_open_context *ctx,
396 u32 iomode)
397{
398 struct inode *ino = lo->inode;
399 struct nfs_server *server = NFS_SERVER(ino);
400 struct nfs4_layoutget *lgp;
401 struct pnfs_layout_segment *lseg = NULL;
402
403 dprintk("--> %s\n", __func__);
404
405 BUG_ON(ctx == NULL);
406 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
407 if (lgp == NULL) {
408 put_layout_hdr(lo->inode);
409 return NULL;
410 }
411 lgp->args.minlength = NFS4_MAX_UINT64;
412 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
413 lgp->args.range.iomode = iomode;
414 lgp->args.range.offset = 0;
415 lgp->args.range.length = NFS4_MAX_UINT64;
416 lgp->args.type = server->pnfs_curr_ld->id;
417 lgp->args.inode = ino;
418 lgp->args.ctx = get_nfs_open_context(ctx);
419 lgp->lsegpp = &lseg;
420
421 /* Synchronously retrieve layout information from server and
422 * store in lseg.
423 */
424 nfs4_proc_layoutget(lgp);
425 if (!lseg) {
426 /* remember that LAYOUTGET failed and suspend trying */
427 set_bit(lo_fail_bit(iomode), &lo->state);
428 }
429 return lseg;
430}
431
432/*
433 * Compare two layout segments for sorting into layout cache.
434 * We want to preferentially return RW over RO layouts, so ensure those
435 * are seen first.
436 */
437static s64
438cmp_layout(u32 iomode1, u32 iomode2)
439{
440 /* read > read/write */
441 return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
442}
443
444static void
445pnfs_insert_layout(struct pnfs_layout_hdr *lo,
446 struct pnfs_layout_segment *lseg)
447{
448 struct pnfs_layout_segment *lp;
449 int found = 0;
450
451 dprintk("%s:Begin\n", __func__);
452
453 assert_spin_locked(&lo->inode->i_lock);
454 if (list_empty(&lo->segs)) {
455 struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
456
457 spin_lock(&clp->cl_lock);
458 BUG_ON(!list_empty(&lo->layouts));
459 list_add_tail(&lo->layouts, &clp->cl_layouts);
460 spin_unlock(&clp->cl_lock);
461 }
462 list_for_each_entry(lp, &lo->segs, fi_list) {
463 if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
464 continue;
465 list_add_tail(&lseg->fi_list, &lp->fi_list);
466 dprintk("%s: inserted lseg %p "
467 "iomode %d offset %llu length %llu before "
468 "lp %p iomode %d offset %llu length %llu\n",
469 __func__, lseg, lseg->range.iomode,
470 lseg->range.offset, lseg->range.length,
471 lp, lp->range.iomode, lp->range.offset,
472 lp->range.length);
473 found = 1;
474 break;
475 }
476 if (!found) {
477 list_add_tail(&lseg->fi_list, &lo->segs);
478 dprintk("%s: inserted lseg %p "
479 "iomode %d offset %llu length %llu at tail\n",
480 __func__, lseg, lseg->range.iomode,
481 lseg->range.offset, lseg->range.length);
482 }
483 get_layout_hdr_locked(lo);
484
485 dprintk("%s:Return\n", __func__);
486}
487
488static struct pnfs_layout_hdr *
489alloc_init_layout_hdr(struct inode *ino)
490{
491 struct pnfs_layout_hdr *lo;
492
493 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
494 if (!lo)
495 return NULL;
496 lo->refcount = 1;
497 INIT_LIST_HEAD(&lo->layouts);
498 INIT_LIST_HEAD(&lo->segs);
499 seqlock_init(&lo->seqlock);
500 lo->inode = ino;
501 return lo;
502}
503
504static struct pnfs_layout_hdr *
505pnfs_find_alloc_layout(struct inode *ino)
506{
507 struct nfs_inode *nfsi = NFS_I(ino);
508 struct pnfs_layout_hdr *new = NULL;
509
510 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
511
512 assert_spin_locked(&ino->i_lock);
513 if (nfsi->layout)
514 return nfsi->layout;
515
516 spin_unlock(&ino->i_lock);
517 new = alloc_init_layout_hdr(ino);
518 spin_lock(&ino->i_lock);
519
520 if (likely(nfsi->layout == NULL)) /* Won the race? */
521 nfsi->layout = new;
522 else
523 kfree(new);
524 return nfsi->layout;
525}
526
527/*
528 * iomode matching rules:
529 * iomode lseg match
530 * ----- ----- -----
531 * ANY READ true
532 * ANY RW true
533 * RW READ false
534 * RW RW true
535 * READ READ true
536 * READ RW true
537 */
538static int
539is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
540{
541 return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
542}
543
544/*
545 * lookup range in layout
546 */
547static struct pnfs_layout_segment *
548pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
549{
550 struct pnfs_layout_segment *lseg, *ret = NULL;
551
552 dprintk("%s:Begin\n", __func__);
553
554 assert_spin_locked(&lo->inode->i_lock);
555 list_for_each_entry(lseg, &lo->segs, fi_list) {
556 if (is_matching_lseg(lseg, iomode)) {
557 ret = lseg;
558 break;
559 }
560 if (cmp_layout(iomode, lseg->range.iomode) > 0)
561 break;
562 }
563
564 dprintk("%s:Return lseg %p ref %d\n",
565 __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
566 return ret;
567}
568
569/*
570 * Layout segment is retreived from the server if not cached.
571 * The appropriate layout segment is referenced and returned to the caller.
572 */
573struct pnfs_layout_segment *
574pnfs_update_layout(struct inode *ino,
575 struct nfs_open_context *ctx,
576 enum pnfs_iomode iomode)
577{
578 struct nfs_inode *nfsi = NFS_I(ino);
579 struct pnfs_layout_hdr *lo;
580 struct pnfs_layout_segment *lseg = NULL;
581
582 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
583 return NULL;
584 spin_lock(&ino->i_lock);
585 lo = pnfs_find_alloc_layout(ino);
586 if (lo == NULL) {
587 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
588 goto out_unlock;
589 }
590
591 /* Check to see if the layout for the given range already exists */
592 lseg = pnfs_has_layout(lo, iomode);
593 if (lseg) {
594 dprintk("%s: Using cached lseg %p for iomode %d)\n",
595 __func__, lseg, iomode);
596 goto out_unlock;
597 }
598
599 /* if LAYOUTGET already failed once we don't try again */
600 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
601 goto out_unlock;
602
603 get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
604 spin_unlock(&ino->i_lock);
605
606 lseg = send_layoutget(lo, ctx, iomode);
607out:
608 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
609 nfsi->layout->state, lseg);
610 return lseg;
611out_unlock:
612 spin_unlock(&ino->i_lock);
613 goto out;
614}
615
616int
617pnfs_layout_process(struct nfs4_layoutget *lgp)
618{
619 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
620 struct nfs4_layoutget_res *res = &lgp->res;
621 struct pnfs_layout_segment *lseg;
622 struct inode *ino = lo->inode;
623 int status = 0;
624
625 /* Inject layout blob into I/O device driver */
626 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
627 if (!lseg || IS_ERR(lseg)) {
628 if (!lseg)
629 status = -ENOMEM;
630 else
631 status = PTR_ERR(lseg);
632 dprintk("%s: Could not allocate layout: error %d\n",
633 __func__, status);
634 goto out;
635 }
636
637 spin_lock(&ino->i_lock);
638 init_lseg(lo, lseg);
639 lseg->range = res->range;
640 *lgp->lsegpp = lseg;
641 pnfs_insert_layout(lo, lseg);
642
643 /* Done processing layoutget. Set the layout stateid */
644 pnfs_set_layout_stateid(lo, &res->stateid);
645 spin_unlock(&ino->i_lock);
646out:
647 return status;
648}
649
650/*
651 * Device ID cache. Currently supports one layout type per struct nfs_client.
652 * Add layout type to the lookup key to expand to support multiple types.
653 */
654int
655pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
656 void (*free_callback)(struct pnfs_deviceid_node *))
657{
658 struct pnfs_deviceid_cache *c;
659
660 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
661 if (!c)
662 return -ENOMEM;
663 spin_lock(&clp->cl_lock);
664 if (clp->cl_devid_cache != NULL) {
665 atomic_inc(&clp->cl_devid_cache->dc_ref);
666 dprintk("%s [kref [%d]]\n", __func__,
667 atomic_read(&clp->cl_devid_cache->dc_ref));
668 kfree(c);
669 } else {
670 /* kzalloc initializes hlists */
671 spin_lock_init(&c->dc_lock);
672 atomic_set(&c->dc_ref, 1);
673 c->dc_free_callback = free_callback;
674 clp->cl_devid_cache = c;
675 dprintk("%s [new]\n", __func__);
676 }
677 spin_unlock(&clp->cl_lock);
678 return 0;
679}
680EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
681
682/*
683 * Called from pnfs_layoutdriver_type->free_lseg
684 * last layout segment reference frees deviceid
685 */
686void
687pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
688 struct pnfs_deviceid_node *devid)
689{
690 struct nfs4_deviceid *id = &devid->de_id;
691 struct pnfs_deviceid_node *d;
692 struct hlist_node *n;
693 long h = nfs4_deviceid_hash(id);
694
695 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
696 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
697 return;
698
699 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
700 if (!memcmp(&d->de_id, id, sizeof(*id))) {
701 hlist_del_rcu(&d->de_node);
702 spin_unlock(&c->dc_lock);
703 synchronize_rcu();
704 c->dc_free_callback(devid);
705 return;
706 }
707 spin_unlock(&c->dc_lock);
708 /* Why wasn't it found in the list? */
709 BUG();
710}
711EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
712
713/* Find and reference a deviceid */
714struct pnfs_deviceid_node *
715pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
716{
717 struct pnfs_deviceid_node *d;
718 struct hlist_node *n;
719 long hash = nfs4_deviceid_hash(id);
720
721 dprintk("--> %s hash %ld\n", __func__, hash);
722 rcu_read_lock();
723 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
724 if (!memcmp(&d->de_id, id, sizeof(*id))) {
725 if (!atomic_inc_not_zero(&d->de_ref)) {
726 goto fail;
727 } else {
728 rcu_read_unlock();
729 return d;
730 }
731 }
732 }
733fail:
734 rcu_read_unlock();
735 return NULL;
736}
737EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
738
739/*
740 * Add a deviceid to the cache.
741 * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
742 */
743struct pnfs_deviceid_node *
744pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
745{
746 struct pnfs_deviceid_node *d;
747 long hash = nfs4_deviceid_hash(&new->de_id);
748
749 dprintk("--> %s hash %ld\n", __func__, hash);
750 spin_lock(&c->dc_lock);
751 d = pnfs_find_get_deviceid(c, &new->de_id);
752 if (d) {
753 spin_unlock(&c->dc_lock);
754 dprintk("%s [discard]\n", __func__);
755 c->dc_free_callback(new);
756 return d;
757 }
758 INIT_HLIST_NODE(&new->de_node);
759 atomic_set(&new->de_ref, 1);
760 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
761 spin_unlock(&c->dc_lock);
762 dprintk("%s [new]\n", __func__);
763 return new;
764}
765EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
766
767void
768pnfs_put_deviceid_cache(struct nfs_client *clp)
769{
770 struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
771
772 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
773 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
774 int i;
775 /* Verify cache is empty */
776 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
777 BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
778 clp->cl_devid_cache = NULL;
779 spin_unlock(&clp->cl_lock);
780 kfree(local);
781 }
782}
783EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 000000000000..e12367d50489
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,189 @@
1/*
2 * pNFS client data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H
32
/* Generic part of one cached layout segment. */
33struct pnfs_layout_segment {
34 struct list_head fi_list; /* link in pnfs_layout_hdr.segs (or a temporary list during teardown) */
35 struct pnfs_layout_range range; /* copied from the LAYOUTGET result */
36 struct kref kref; /* reference count of this segment */
37 struct pnfs_layout_hdr *layout; /* layout header this segment belongs to */
38};
39
40#ifdef CONFIG_NFS_V4_1
41
42#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
43
/* Bit numbers used with set_bit()/test_bit() on pnfs_layout_hdr.state */
44enum {
45 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
46 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
47 NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
48};
49
50/* Per-layout driver specific registration structure */
51struct pnfs_layoutdriver_type {
52 struct list_head pnfs_tblid; /* link in the table of registered drivers */
53 const u32 id; /* layout type; 0 is reserved and rejected at registration */
54 const char *name; /* printed in the registration debug message */
55 struct module *owner; /* module pinned while a server uses this driver */
56 int (*set_layoutdriver) (struct nfs_server *); /* called when the driver is attached to a server; non-zero means failure */
57 int (*clear_layoutdriver) (struct nfs_server *); /* called when the driver is detached from a server */
58 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); /* mandatory; builds a segment from a LAYOUTGET result, may return NULL or an ERR_PTR */
59 void (*free_lseg) (struct pnfs_layout_segment *lseg); /* mandatory; releases a segment once its last reference is gone */
60};
61
/* Per-inode layout header; reachable through the nfs_inode's layout pointer. */
62struct pnfs_layout_hdr {
63 unsigned long refcount; /* plain counter, changed with inode->i_lock held */
64 struct list_head layouts; /* other client layouts */
65 struct list_head segs; /* layout segments list */
66 seqlock_t seqlock; /* Protects the stateid */
67 nfs4_stateid stateid; /* current layout stateid */
68 unsigned long state; /* NFS_LAYOUT_* bit flags */
69 struct inode *inode; /* inode this layout belongs to */
70};
71
/*
 * NOTE(review): presumably the argument/result buffer description for
 * nfs4_proc_getdeviceinfo(), which takes a pointer to this structure;
 * the users of the individual fields are not visible here - confirm.
 */
72struct pnfs_device {
73 struct nfs4_deviceid dev_id;
74 unsigned int layout_type;
75 unsigned int mincount;
76 struct page **pages;
77 void *area;
78 unsigned int pgbase;
79 unsigned int pglen;
80};
81
82/*
83 * Device ID RCU cache. A device ID is unique per client ID and layout type.
84 */
85#define NFS4_DEVICE_ID_HASH_BITS 5
86#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
87#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
88
89static inline u32
90nfs4_deviceid_hash(struct nfs4_deviceid *id)
91{
92 unsigned char *cptr = (unsigned char *)id->data;
93 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
94 u32 x = 0;
95
96 while (nbytes--) {
97 x *= 37;
98 x += *cptr++;
99 }
100 return x & NFS4_DEVICE_ID_HASH_MASK;
101}
102
/* One entry of the device ID cache. */
103struct pnfs_deviceid_node {
104 struct hlist_node de_node; /* link in a pnfs_deviceid_cache hash bucket */
105 struct nfs4_deviceid de_id; /* lookup key */
106 atomic_t de_ref; /* reference count; the entry is unhashed and freed when it drops to zero */
107};
108
/* Device ID hash table attached to a struct nfs_client. */
109struct pnfs_deviceid_cache {
110 spinlock_t dc_lock; /* held while adding to or removing from dc_deviceids */
111 atomic_t dc_ref; /* number of users of the cache */
112 void (*dc_free_callback)(struct pnfs_deviceid_node *); /* releases a node that is discarded or no longer referenced */
113 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; /* buckets indexed by nfs4_deviceid_hash() */
114};
115
116extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
117 void (*free_callback)(struct pnfs_deviceid_node *));
118extern void pnfs_put_deviceid_cache(struct nfs_client *);
119extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
120 struct pnfs_deviceid_cache *,
121 struct nfs4_deviceid *);
122extern struct pnfs_deviceid_node *pnfs_add_deviceid(
123 struct pnfs_deviceid_cache *,
124 struct pnfs_deviceid_node *);
125extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
126 struct pnfs_deviceid_node *devid);
127
128extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
129extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
130
131/* nfs4proc.c */
132extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
133 struct pnfs_device *dev);
134extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
135
136/* pnfs.c */
137struct pnfs_layout_segment *
138pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
139 enum pnfs_iomode access_type);
140void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
141void unset_pnfs_layoutdriver(struct nfs_server *);
142int pnfs_layout_process(struct nfs4_layoutget *lgp);
143void pnfs_destroy_layout(struct nfs_inode *);
144void pnfs_destroy_all_layouts(struct nfs_client *);
145void put_layout_hdr(struct inode *inode);
146void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
147 struct nfs4_state *open_state);
148
149
150static inline int lo_fail_bit(u32 iomode)
151{
152 return iomode == IOMODE_RW ?
153 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
154}
155
156/* Return true if a layout driver is being used for this mountpoint */
157static inline int pnfs_enabled_sb(struct nfs_server *nfss)
158{
159 return nfss->pnfs_curr_ld != NULL;
160}
161
162#else /* CONFIG_NFS_V4_1 */
163
/*
 * Stubs used when CONFIG_NFS_V4_1 is not set: they do nothing, and
 * pnfs_update_layout() always reports that no layout segment is available.
 */
164static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
165{
166}
167
168static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
169{
170}
171
172static inline struct pnfs_layout_segment *
173pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
174 enum pnfs_iomode access_type)
175{
176 return NULL;
177}
178
179static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
180{
181}
182
183static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
184{
185}
186
187#endif /* CONFIG_NFS_V4_1 */
188
189#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 611bec22f552..58e7f84fc1fd 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)
258 258
259static int 259static int
260nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 260nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
261 int flags, struct nameidata *nd) 261 int flags, struct nfs_open_context *ctx)
262{ 262{
263 struct nfs_createdata *data; 263 struct nfs_createdata *data;
264 struct rpc_message msg = { 264 struct rpc_message msg = {
@@ -365,17 +365,32 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
365 return 1; 365 return 1;
366} 366}
367 367
368static void
369nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
370{
371 msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
372}
373
/*
 * Completion step for a RENAME call.
 *
 * Returns 0 when nfs_async_handle_expired_key() reports that it dealt with
 * the task.  Otherwise both directories are marked for revalidation and 1
 * is returned.
 */
static int
nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
		     struct inode *new_dir)
{
	if (!nfs_async_handle_expired_key(task)) {
		nfs_mark_for_revalidate(old_dir);
		nfs_mark_for_revalidate(new_dir);
		return 1;
	}
	return 0;
}
384
368static int 385static int
369nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, 386nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
370 struct inode *new_dir, struct qstr *new_name) 387 struct inode *new_dir, struct qstr *new_name)
371{ 388{
372 struct nfs_renameargs arg = { 389 struct nfs_renameargs arg = {
373 .fromfh = NFS_FH(old_dir), 390 .old_dir = NFS_FH(old_dir),
374 .fromname = old_name->name, 391 .old_name = old_name,
375 .fromlen = old_name->len, 392 .new_dir = NFS_FH(new_dir),
376 .tofh = NFS_FH(new_dir), 393 .new_name = new_name,
377 .toname = new_name->name,
378 .tolen = new_name->len
379 }; 394 };
380 struct rpc_message msg = { 395 struct rpc_message msg = {
381 .rpc_proc = &nfs_procedures[NFSPROC_RENAME], 396 .rpc_proc = &nfs_procedures[NFSPROC_RENAME],
@@ -519,14 +534,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
519 */ 534 */
520static int 535static int
521nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 536nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
522 u64 cookie, struct page *page, unsigned int count, int plus) 537 u64 cookie, struct page **pages, unsigned int count, int plus)
523{ 538{
524 struct inode *dir = dentry->d_inode; 539 struct inode *dir = dentry->d_inode;
525 struct nfs_readdirargs arg = { 540 struct nfs_readdirargs arg = {
526 .fh = NFS_FH(dir), 541 .fh = NFS_FH(dir),
527 .cookie = cookie, 542 .cookie = cookie,
528 .count = count, 543 .count = count,
529 .pages = &page, 544 .pages = pages,
530 }; 545 };
531 struct rpc_message msg = { 546 struct rpc_message msg = {
532 .rpc_proc = &nfs_procedures[NFSPROC_READDIR], 547 .rpc_proc = &nfs_procedures[NFSPROC_READDIR],
@@ -705,6 +720,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
705 .unlink_setup = nfs_proc_unlink_setup, 720 .unlink_setup = nfs_proc_unlink_setup,
706 .unlink_done = nfs_proc_unlink_done, 721 .unlink_done = nfs_proc_unlink_done,
707 .rename = nfs_proc_rename, 722 .rename = nfs_proc_rename,
723 .rename_setup = nfs_proc_rename_setup,
724 .rename_done = nfs_proc_rename_done,
708 .link = nfs_proc_link, 725 .link = nfs_proc_link,
709 .symlink = nfs_proc_symlink, 726 .symlink = nfs_proc_symlink,
710 .mkdir = nfs_proc_mkdir, 727 .mkdir = nfs_proc_mkdir,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 87adc2744246..e4b62c6f5a6e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -25,6 +25,7 @@
25#include "internal.h" 25#include "internal.h"
26#include "iostat.h" 26#include "iostat.h"
27#include "fscache.h" 27#include "fscache.h"
28#include "pnfs.h"
28 29
29#define NFSDBG_FACILITY NFSDBG_PAGECACHE 30#define NFSDBG_FACILITY NFSDBG_PAGECACHE
30 31
@@ -46,7 +47,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
46 memset(p, 0, sizeof(*p)); 47 memset(p, 0, sizeof(*p));
47 INIT_LIST_HEAD(&p->pages); 48 INIT_LIST_HEAD(&p->pages);
48 p->npages = pagecount; 49 p->npages = pagecount;
49 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
50 if (pagecount <= ARRAY_SIZE(p->page_array)) 50 if (pagecount <= ARRAY_SIZE(p->page_array))
51 p->pagevec = p->page_array; 51 p->pagevec = p->page_array;
52 else { 52 else {
@@ -121,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
121 len = nfs_page_length(page); 121 len = nfs_page_length(page);
122 if (len == 0) 122 if (len == 0)
123 return nfs_return_empty_page(page); 123 return nfs_return_empty_page(page);
124 pnfs_update_layout(inode, ctx, IOMODE_READ);
124 new = nfs_create_request(ctx, inode, page, 0, len); 125 new = nfs_create_request(ctx, inode, page, 0, len);
125 if (IS_ERR(new)) { 126 if (IS_ERR(new)) {
126 unlock_page(page); 127 unlock_page(page);
@@ -625,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
625 if (ret == 0) 626 if (ret == 0)
626 goto read_complete; /* all pages were read */ 627 goto read_complete; /* all pages were read */
627 628
629 pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
628 if (rsize < PAGE_CACHE_SIZE) 630 if (rsize < PAGE_CACHE_SIZE)
629 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 631 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
630 else 632 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f4cbf0c306c6..0a42e8f4adcb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -100,6 +100,7 @@ enum {
100 Opt_addr, Opt_mountaddr, Opt_clientaddr, 100 Opt_addr, Opt_mountaddr, Opt_clientaddr,
101 Opt_lookupcache, 101 Opt_lookupcache,
102 Opt_fscache_uniq, 102 Opt_fscache_uniq,
103 Opt_local_lock,
103 104
104 /* Special mount options */ 105 /* Special mount options */
105 Opt_userspace, Opt_deprecated, Opt_sloppy, 106 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -171,6 +172,7 @@ static const match_table_t nfs_mount_option_tokens = {
171 172
172 { Opt_lookupcache, "lookupcache=%s" }, 173 { Opt_lookupcache, "lookupcache=%s" },
173 { Opt_fscache_uniq, "fsc=%s" }, 174 { Opt_fscache_uniq, "fsc=%s" },
175 { Opt_local_lock, "local_lock=%s" },
174 176
175 { Opt_err, NULL } 177 { Opt_err, NULL }
176}; 178};
@@ -236,14 +238,30 @@ static match_table_t nfs_lookupcache_tokens = {
236 { Opt_lookupcache_err, NULL } 238 { Opt_lookupcache_err, NULL }
237}; 239};
238 240
241enum {
242 Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix,
243 Opt_local_lock_none,
244
245 Opt_local_lock_err
246};
247
248static match_table_t nfs_local_lock_tokens = {
249 { Opt_local_lock_all, "all" },
250 { Opt_local_lock_flock, "flock" },
251 { Opt_local_lock_posix, "posix" },
252 { Opt_local_lock_none, "none" },
253
254 { Opt_local_lock_err, NULL }
255};
256
239 257
240static void nfs_umount_begin(struct super_block *); 258static void nfs_umount_begin(struct super_block *);
241static int nfs_statfs(struct dentry *, struct kstatfs *); 259static int nfs_statfs(struct dentry *, struct kstatfs *);
242static int nfs_show_options(struct seq_file *, struct vfsmount *); 260static int nfs_show_options(struct seq_file *, struct vfsmount *);
243static int nfs_show_stats(struct seq_file *, struct vfsmount *); 261static int nfs_show_stats(struct seq_file *, struct vfsmount *);
244static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 262static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
245static int nfs_xdev_get_sb(struct file_system_type *fs_type, 263static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
246 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 264 int flags, const char *dev_name, void *raw_data);
247static void nfs_put_super(struct super_block *); 265static void nfs_put_super(struct super_block *);
248static void nfs_kill_super(struct super_block *); 266static void nfs_kill_super(struct super_block *);
249static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); 267static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
@@ -259,7 +277,7 @@ static struct file_system_type nfs_fs_type = {
259struct file_system_type nfs_xdev_fs_type = { 277struct file_system_type nfs_xdev_fs_type = {
260 .owner = THIS_MODULE, 278 .owner = THIS_MODULE,
261 .name = "nfs", 279 .name = "nfs",
262 .get_sb = nfs_xdev_get_sb, 280 .mount = nfs_xdev_mount,
263 .kill_sb = nfs_kill_super, 281 .kill_sb = nfs_kill_super,
264 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 282 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
265}; 283};
@@ -284,14 +302,14 @@ static int nfs4_try_mount(int flags, const char *dev_name,
284 struct nfs_parsed_mount_data *data, struct vfsmount *mnt); 302 struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
285static int nfs4_get_sb(struct file_system_type *fs_type, 303static int nfs4_get_sb(struct file_system_type *fs_type,
286 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 304 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
287static int nfs4_remote_get_sb(struct file_system_type *fs_type, 305static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
288 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 306 int flags, const char *dev_name, void *raw_data);
289static int nfs4_xdev_get_sb(struct file_system_type *fs_type, 307static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
290 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 308 int flags, const char *dev_name, void *raw_data);
291static int nfs4_referral_get_sb(struct file_system_type *fs_type, 309static int nfs4_referral_get_sb(struct file_system_type *fs_type,
292 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 310 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
293static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, 311static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
294 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 312 int flags, const char *dev_name, void *raw_data);
295static void nfs4_kill_super(struct super_block *sb); 313static void nfs4_kill_super(struct super_block *sb);
296 314
297static struct file_system_type nfs4_fs_type = { 315static struct file_system_type nfs4_fs_type = {
@@ -305,7 +323,7 @@ static struct file_system_type nfs4_fs_type = {
305static struct file_system_type nfs4_remote_fs_type = { 323static struct file_system_type nfs4_remote_fs_type = {
306 .owner = THIS_MODULE, 324 .owner = THIS_MODULE,
307 .name = "nfs4", 325 .name = "nfs4",
308 .get_sb = nfs4_remote_get_sb, 326 .mount = nfs4_remote_mount,
309 .kill_sb = nfs4_kill_super, 327 .kill_sb = nfs4_kill_super,
310 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 328 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
311}; 329};
@@ -313,7 +331,7 @@ static struct file_system_type nfs4_remote_fs_type = {
313struct file_system_type nfs4_xdev_fs_type = { 331struct file_system_type nfs4_xdev_fs_type = {
314 .owner = THIS_MODULE, 332 .owner = THIS_MODULE,
315 .name = "nfs4", 333 .name = "nfs4",
316 .get_sb = nfs4_xdev_get_sb, 334 .mount = nfs4_xdev_mount,
317 .kill_sb = nfs4_kill_super, 335 .kill_sb = nfs4_kill_super,
318 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 336 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
319}; 337};
@@ -321,7 +339,7 @@ struct file_system_type nfs4_xdev_fs_type = {
321static struct file_system_type nfs4_remote_referral_fs_type = { 339static struct file_system_type nfs4_remote_referral_fs_type = {
322 .owner = THIS_MODULE, 340 .owner = THIS_MODULE,
323 .name = "nfs4", 341 .name = "nfs4",
324 .get_sb = nfs4_remote_referral_get_sb, 342 .mount = nfs4_remote_referral_mount,
325 .kill_sb = nfs4_kill_super, 343 .kill_sb = nfs4_kill_super,
326 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 344 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
327}; 345};
@@ -622,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
622 const struct proc_nfs_info *nfs_infop; 640 const struct proc_nfs_info *nfs_infop;
623 struct nfs_client *clp = nfss->nfs_client; 641 struct nfs_client *clp = nfss->nfs_client;
624 u32 version = clp->rpc_ops->version; 642 u32 version = clp->rpc_ops->version;
643 int local_flock, local_fcntl;
625 644
626 seq_printf(m, ",vers=%u", version); 645 seq_printf(m, ",vers=%u", version);
627 seq_printf(m, ",rsize=%u", nfss->rsize); 646 seq_printf(m, ",rsize=%u", nfss->rsize);
@@ -670,6 +689,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
670 else 689 else
671 seq_printf(m, ",lookupcache=pos"); 690 seq_printf(m, ",lookupcache=pos");
672 } 691 }
692
693 local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
694 local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
695
696 if (!local_flock && !local_fcntl)
697 seq_printf(m, ",local_lock=none");
698 else if (local_flock && local_fcntl)
699 seq_printf(m, ",local_lock=all");
700 else if (local_flock)
701 seq_printf(m, ",local_lock=flock");
702 else
703 seq_printf(m, ",local_lock=posix");
673} 704}
674 705
675/* 706/*
@@ -1017,9 +1048,13 @@ static int nfs_parse_mount_options(char *raw,
1017 break; 1048 break;
1018 case Opt_lock: 1049 case Opt_lock:
1019 mnt->flags &= ~NFS_MOUNT_NONLM; 1050 mnt->flags &= ~NFS_MOUNT_NONLM;
1051 mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
1052 NFS_MOUNT_LOCAL_FCNTL);
1020 break; 1053 break;
1021 case Opt_nolock: 1054 case Opt_nolock:
1022 mnt->flags |= NFS_MOUNT_NONLM; 1055 mnt->flags |= NFS_MOUNT_NONLM;
1056 mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
1057 NFS_MOUNT_LOCAL_FCNTL);
1023 break; 1058 break;
1024 case Opt_v2: 1059 case Opt_v2:
1025 mnt->flags &= ~NFS_MOUNT_VER3; 1060 mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1420,6 +1455,34 @@ static int nfs_parse_mount_options(char *raw,
1420 mnt->fscache_uniq = string; 1455 mnt->fscache_uniq = string;
1421 mnt->options |= NFS_OPTION_FSCACHE; 1456 mnt->options |= NFS_OPTION_FSCACHE;
1422 break; 1457 break;
1458 case Opt_local_lock:
1459 string = match_strdup(args);
1460 if (string == NULL)
1461 goto out_nomem;
1462 token = match_token(string, nfs_local_lock_tokens,
1463 args);
1464 kfree(string);
1465 switch (token) {
1466 case Opt_local_lock_all:
1467 mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
1468 NFS_MOUNT_LOCAL_FCNTL);
1469 break;
1470 case Opt_local_lock_flock:
1471 mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
1472 break;
1473 case Opt_local_lock_posix:
1474 mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
1475 break;
1476 case Opt_local_lock_none:
1477 mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
1478 NFS_MOUNT_LOCAL_FCNTL);
1479 break;
1480 default:
1481 dfprintk(MOUNT, "NFS: invalid "
1482 "local_lock argument\n");
1483 return 0;
1484 };
1485 break;
1423 1486
1424 /* 1487 /*
1425 * Special options 1488 * Special options
@@ -1825,6 +1888,12 @@ static int nfs_validate_mount_data(void *options,
1825 if (!args->nfs_server.hostname) 1888 if (!args->nfs_server.hostname)
1826 goto out_nomem; 1889 goto out_nomem;
1827 1890
1891 if (!(data->flags & NFS_MOUNT_NONLM))
1892 args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
1893 NFS_MOUNT_LOCAL_FCNTL);
1894 else
1895 args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
1896 NFS_MOUNT_LOCAL_FCNTL);
1828 /* 1897 /*
1829 * The legacy version 6 binary mount data from userspace has a 1898 * The legacy version 6 binary mount data from userspace has a
1830 * field used only to transport selinux information into the 1899 * field used only to transport selinux information into the
@@ -2328,9 +2397,9 @@ static void nfs_kill_super(struct super_block *s)
2328/* 2397/*
2329 * Clone an NFS2/3 server record on xdev traversal (FSID-change) 2398 * Clone an NFS2/3 server record on xdev traversal (FSID-change)
2330 */ 2399 */
2331static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, 2400static struct dentry *
2332 const char *dev_name, void *raw_data, 2401nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2333 struct vfsmount *mnt) 2402 const char *dev_name, void *raw_data)
2334{ 2403{
2335 struct nfs_clone_mount *data = raw_data; 2404 struct nfs_clone_mount *data = raw_data;
2336 struct super_block *s; 2405 struct super_block *s;
@@ -2342,7 +2411,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2342 }; 2411 };
2343 int error; 2412 int error;
2344 2413
2345 dprintk("--> nfs_xdev_get_sb()\n"); 2414 dprintk("--> nfs_xdev_mount()\n");
2346 2415
2347 /* create a new volume representation */ 2416 /* create a new volume representation */
2348 server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); 2417 server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2389,28 +2458,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2389 } 2458 }
2390 2459
2391 s->s_flags |= MS_ACTIVE; 2460 s->s_flags |= MS_ACTIVE;
2392 mnt->mnt_sb = s;
2393 mnt->mnt_root = mntroot;
2394 2461
2395 /* clone any lsm security options from the parent to the new sb */ 2462 /* clone any lsm security options from the parent to the new sb */
2396 security_sb_clone_mnt_opts(data->sb, s); 2463 security_sb_clone_mnt_opts(data->sb, s);
2397 2464
2398 dprintk("<-- nfs_xdev_get_sb() = 0\n"); 2465 dprintk("<-- nfs_xdev_mount() = 0\n");
2399 return 0; 2466 return mntroot;
2400 2467
2401out_err_nosb: 2468out_err_nosb:
2402 nfs_free_server(server); 2469 nfs_free_server(server);
2403out_err_noserver: 2470out_err_noserver:
2404 dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error); 2471 dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
2405 return error; 2472 return ERR_PTR(error);
2406 2473
2407error_splat_super: 2474error_splat_super:
2408 if (server && !s->s_root) 2475 if (server && !s->s_root)
2409 bdi_unregister(&server->backing_dev_info); 2476 bdi_unregister(&server->backing_dev_info);
2410error_splat_bdi: 2477error_splat_bdi:
2411 deactivate_locked_super(s); 2478 deactivate_locked_super(s);
2412 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2479 dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
2413 return error; 2480 return ERR_PTR(error);
2414} 2481}
2415 2482
2416#ifdef CONFIG_NFS_V4 2483#ifdef CONFIG_NFS_V4
@@ -2441,7 +2508,8 @@ static void nfs4_fill_super(struct super_block *sb)
2441 2508
2442static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) 2509static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
2443{ 2510{
2444 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); 2511 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
2512 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
2445} 2513}
2446 2514
2447static int nfs4_validate_text_mount_data(void *options, 2515static int nfs4_validate_text_mount_data(void *options,
@@ -2579,8 +2647,9 @@ out_no_address:
2579/* 2647/*
2580 * Get the superblock for the NFS4 root partition 2648 * Get the superblock for the NFS4 root partition
2581 */ 2649 */
2582static int nfs4_remote_get_sb(struct file_system_type *fs_type, 2650static struct dentry *
2583 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2651nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2652 const char *dev_name, void *raw_data)
2584{ 2653{
2585 struct nfs_parsed_mount_data *data = raw_data; 2654 struct nfs_parsed_mount_data *data = raw_data;
2586 struct super_block *s; 2655 struct super_block *s;
@@ -2644,15 +2713,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2644 goto error_splat_root; 2713 goto error_splat_root;
2645 2714
2646 s->s_flags |= MS_ACTIVE; 2715 s->s_flags |= MS_ACTIVE;
2647 mnt->mnt_sb = s; 2716
2648 mnt->mnt_root = mntroot; 2717 security_free_mnt_opts(&data->lsm_opts);
2649 error = 0; 2718 nfs_free_fhandle(mntfh);
2719 return mntroot;
2650 2720
2651out: 2721out:
2652 security_free_mnt_opts(&data->lsm_opts); 2722 security_free_mnt_opts(&data->lsm_opts);
2653out_free_fh: 2723out_free_fh:
2654 nfs_free_fhandle(mntfh); 2724 nfs_free_fhandle(mntfh);
2655 return error; 2725 return ERR_PTR(error);
2656 2726
2657out_free: 2727out_free:
2658 nfs_free_server(server); 2728 nfs_free_server(server);
@@ -2898,9 +2968,9 @@ static void nfs4_kill_super(struct super_block *sb)
2898/* 2968/*
2899 * Clone an NFS4 server record on xdev traversal (FSID-change) 2969 * Clone an NFS4 server record on xdev traversal (FSID-change)
2900 */ 2970 */
2901static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, 2971static struct dentry *
2902 const char *dev_name, void *raw_data, 2972nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
2903 struct vfsmount *mnt) 2973 const char *dev_name, void *raw_data)
2904{ 2974{
2905 struct nfs_clone_mount *data = raw_data; 2975 struct nfs_clone_mount *data = raw_data;
2906 struct super_block *s; 2976 struct super_block *s;
@@ -2912,7 +2982,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2912 }; 2982 };
2913 int error; 2983 int error;
2914 2984
2915 dprintk("--> nfs4_xdev_get_sb()\n"); 2985 dprintk("--> nfs4_xdev_mount()\n");
2916 2986
2917 /* create a new volume representation */ 2987 /* create a new volume representation */
2918 server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); 2988 server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2959,32 +3029,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2959 } 3029 }
2960 3030
2961 s->s_flags |= MS_ACTIVE; 3031 s->s_flags |= MS_ACTIVE;
2962 mnt->mnt_sb = s;
2963 mnt->mnt_root = mntroot;
2964 3032
2965 security_sb_clone_mnt_opts(data->sb, s); 3033 security_sb_clone_mnt_opts(data->sb, s);
2966 3034
2967 dprintk("<-- nfs4_xdev_get_sb() = 0\n"); 3035 dprintk("<-- nfs4_xdev_mount() = 0\n");
2968 return 0; 3036 return mntroot;
2969 3037
2970out_err_nosb: 3038out_err_nosb:
2971 nfs_free_server(server); 3039 nfs_free_server(server);
2972out_err_noserver: 3040out_err_noserver:
2973 dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error); 3041 dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
2974 return error; 3042 return ERR_PTR(error);
2975 3043
2976error_splat_super: 3044error_splat_super:
2977 if (server && !s->s_root) 3045 if (server && !s->s_root)
2978 bdi_unregister(&server->backing_dev_info); 3046 bdi_unregister(&server->backing_dev_info);
2979error_splat_bdi: 3047error_splat_bdi:
2980 deactivate_locked_super(s); 3048 deactivate_locked_super(s);
2981 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 3049 dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
2982 return error; 3050 return ERR_PTR(error);
2983} 3051}
2984 3052
2985static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, 3053static struct dentry *
2986 int flags, const char *dev_name, void *raw_data, 3054nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
2987 struct vfsmount *mnt) 3055 const char *dev_name, void *raw_data)
2988{ 3056{
2989 struct nfs_clone_mount *data = raw_data; 3057 struct nfs_clone_mount *data = raw_data;
2990 struct super_block *s; 3058 struct super_block *s;
@@ -3048,14 +3116,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
3048 } 3116 }
3049 3117
3050 s->s_flags |= MS_ACTIVE; 3118 s->s_flags |= MS_ACTIVE;
3051 mnt->mnt_sb = s;
3052 mnt->mnt_root = mntroot;
3053 3119
3054 security_sb_clone_mnt_opts(data->sb, s); 3120 security_sb_clone_mnt_opts(data->sb, s);
3055 3121
3056 nfs_free_fhandle(mntfh); 3122 nfs_free_fhandle(mntfh);
3057 dprintk("<-- nfs4_referral_get_sb() = 0\n"); 3123 dprintk("<-- nfs4_referral_get_sb() = 0\n");
3058 return 0; 3124 return mntroot;
3059 3125
3060out_err_nosb: 3126out_err_nosb:
3061 nfs_free_server(server); 3127 nfs_free_server(server);
@@ -3063,7 +3129,7 @@ out_err_noserver:
3063 nfs_free_fhandle(mntfh); 3129 nfs_free_fhandle(mntfh);
3064out_err_nofh: 3130out_err_nofh:
3065 dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); 3131 dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
3066 return error; 3132 return ERR_PTR(error);
3067 3133
3068error_splat_super: 3134error_splat_super:
3069 if (server && !s->s_root) 3135 if (server && !s->s_root)
@@ -3072,7 +3138,7 @@ error_splat_bdi:
3072 deactivate_locked_super(s); 3138 deactivate_locked_super(s);
3073 nfs_free_fhandle(mntfh); 3139 nfs_free_fhandle(mntfh);
3074 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 3140 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
3075 return error; 3141 return ERR_PTR(error);
3076} 3142}
3077 3143
3078/* 3144/*
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index ad4d2e787b20..978aaeb8a093 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = {
32 .extra1 = (int *)&nfs_set_port_min, 32 .extra1 = (int *)&nfs_set_port_min,
33 .extra2 = (int *)&nfs_set_port_max, 33 .extra2 = (int *)&nfs_set_port_max,
34 }, 34 },
35#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
35 { 36 {
36 .procname = "idmap_cache_timeout", 37 .procname = "idmap_cache_timeout",
37 .data = &nfs_idmap_cache_timeout, 38 .data = &nfs_idmap_cache_timeout,
@@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = {
39 .mode = 0644, 40 .mode = 0644,
40 .proc_handler = proc_dointvec_jiffies, 41 .proc_handler = proc_dointvec_jiffies,
41 }, 42 },
43#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
42#endif 44#endif
43 { 45 {
44 .procname = "nfs_mountpoint_timeout", 46 .procname = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 2f84adaad427..7bdec8531400 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,9 +13,12 @@
13#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/wait.h> 15#include <linux/wait.h>
16#include <linux/namei.h>
16 17
17#include "internal.h" 18#include "internal.h"
18#include "nfs4_fs.h" 19#include "nfs4_fs.h"
20#include "iostat.h"
21#include "delegation.h"
19 22
20struct nfs_unlinkdata { 23struct nfs_unlinkdata {
21 struct hlist_node list; 24 struct hlist_node list;
@@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry)
244 * @dir: parent directory of dentry 247 * @dir: parent directory of dentry
245 * @dentry: dentry to unlink 248 * @dentry: dentry to unlink
246 */ 249 */
247int 250static int
248nfs_async_unlink(struct inode *dir, struct dentry *dentry) 251nfs_async_unlink(struct inode *dir, struct dentry *dentry)
249{ 252{
250 struct nfs_unlinkdata *data; 253 struct nfs_unlinkdata *data;
@@ -259,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
259 status = PTR_ERR(data->cred); 262 status = PTR_ERR(data->cred);
260 goto out_free; 263 goto out_free;
261 } 264 }
262 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
263 data->res.dir_attr = &data->dir_attr; 265 data->res.dir_attr = &data->dir_attr;
264 266
265 status = -EBUSY; 267 status = -EBUSY;
@@ -303,3 +305,256 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
303 if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) 305 if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
304 nfs_free_unlinkdata(data); 306 nfs_free_unlinkdata(data);
305} 307}
308
309/* Cancel a queued async unlink. Called when a sillyrename run fails. */
310static void
311nfs_cancel_async_unlink(struct dentry *dentry)
312{
313 spin_lock(&dentry->d_lock);
314 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
315 struct nfs_unlinkdata *data = dentry->d_fsdata;
316
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
318 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data);
320 return;
321 }
322 spin_unlock(&dentry->d_lock);
323}
324
325struct nfs_renamedata {
326 struct nfs_renameargs args;
327 struct nfs_renameres res;
328 struct rpc_cred *cred;
329 struct inode *old_dir;
330 struct dentry *old_dentry;
331 struct nfs_fattr old_fattr;
332 struct inode *new_dir;
333 struct dentry *new_dentry;
334 struct nfs_fattr new_fattr;
335};
336
337/**
338 * nfs_async_rename_done - Sillyrename post-processing
339 * @task: rpc_task of the sillyrename
340 * @calldata: nfs_renamedata for the sillyrename
341 *
342 * Do the directory attribute updates and the d_move
343 */
344static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
345{
346 struct nfs_renamedata *data = calldata;
347 struct inode *old_dir = data->old_dir;
348 struct inode *new_dir = data->new_dir;
349
350 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
351 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
352 return;
353 }
354
355 if (task->tk_status != 0) {
356 nfs_cancel_async_unlink(data->old_dentry);
357 return;
358 }
359
360 nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir));
361 d_move(data->old_dentry, data->new_dentry);
362}
363
364/**
365 * nfs_async_rename_release - Release the sillyrename data.
366 * @calldata: the struct nfs_renamedata to be released
367 */
368static void nfs_async_rename_release(void *calldata)
369{
370 struct nfs_renamedata *data = calldata;
371 struct super_block *sb = data->old_dir->i_sb;
372
373 if (data->old_dentry->d_inode)
374 nfs_mark_for_revalidate(data->old_dentry->d_inode);
375
376 dput(data->old_dentry);
377 dput(data->new_dentry);
378 iput(data->old_dir);
379 iput(data->new_dir);
380 nfs_sb_deactive(sb);
381 put_rpccred(data->cred);
382 kfree(data);
383}
384
385#if defined(CONFIG_NFS_V4_1)
386static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
387{
388 struct nfs_renamedata *data = calldata;
389 struct nfs_server *server = NFS_SERVER(data->old_dir);
390
391 if (nfs4_setup_sequence(server, &data->args.seq_args,
392 &data->res.seq_res, 1, task))
393 return;
394 rpc_call_start(task);
395}
396#endif /* CONFIG_NFS_V4_1 */
397
398static const struct rpc_call_ops nfs_rename_ops = {
399 .rpc_call_done = nfs_async_rename_done,
400 .rpc_release = nfs_async_rename_release,
401#if defined(CONFIG_NFS_V4_1)
402 .rpc_call_prepare = nfs_rename_prepare,
403#endif /* CONFIG_NFS_V4_1 */
404};
405
406/**
407 * nfs_async_rename - perform an asynchronous rename operation
408 * @old_dir: directory that currently holds the dentry to be renamed
409 * @new_dir: target directory for the rename
410 * @old_dentry: original dentry to be renamed
411 * @new_dentry: dentry to which the old_dentry should be renamed
412 *
413 * It's expected that valid references to the dentries and inodes are held
414 */
415static struct rpc_task *
416nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
417 struct dentry *old_dentry, struct dentry *new_dentry)
418{
419 struct nfs_renamedata *data;
420 struct rpc_message msg = { };
421 struct rpc_task_setup task_setup_data = {
422 .rpc_message = &msg,
423 .callback_ops = &nfs_rename_ops,
424 .workqueue = nfsiod_workqueue,
425 .rpc_client = NFS_CLIENT(old_dir),
426 .flags = RPC_TASK_ASYNC,
427 };
428
429 data = kzalloc(sizeof(*data), GFP_KERNEL);
430 if (data == NULL)
431 return ERR_PTR(-ENOMEM);
432 task_setup_data.callback_data = data,
433
434 data->cred = rpc_lookup_cred();
435 if (IS_ERR(data->cred)) {
436 struct rpc_task *task = ERR_CAST(data->cred);
437 kfree(data);
438 return task;
439 }
440
441 msg.rpc_argp = &data->args;
442 msg.rpc_resp = &data->res;
443 msg.rpc_cred = data->cred;
444
445 /* set up nfs_renamedata */
446 data->old_dir = old_dir;
447 ihold(old_dir);
448 data->new_dir = new_dir;
449 ihold(new_dir);
450 data->old_dentry = dget(old_dentry);
451 data->new_dentry = dget(new_dentry);
452 nfs_fattr_init(&data->old_fattr);
453 nfs_fattr_init(&data->new_fattr);
454
455 /* set up nfs_renameargs */
456 data->args.old_dir = NFS_FH(old_dir);
457 data->args.old_name = &old_dentry->d_name;
458 data->args.new_dir = NFS_FH(new_dir);
459 data->args.new_name = &new_dentry->d_name;
460
461 /* set up nfs_renameres */
462 data->res.old_fattr = &data->old_fattr;
463 data->res.new_fattr = &data->new_fattr;
464
465 nfs_sb_active(old_dir->i_sb);
466
467 NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
468
469 return rpc_run_task(&task_setup_data);
470}
471
472/**
473 * nfs_sillyrename - Perform a silly-rename of a dentry
474 * @dir: inode of directory that contains dentry
475 * @dentry: dentry to be sillyrenamed
476 *
477 * NFSv2/3 is stateless and the server doesn't know when the client is
478 * holding a file open. To prevent application problems when a file is
479 * unlinked while it's still open, the client performs a "silly-rename".
480 * That is, it renames the file to a hidden file in the same directory,
481 * and only performs the unlink once the last reference to it is put.
482 *
483 * The final cleanup is done during dentry_iput.
484 */
485int
486nfs_sillyrename(struct inode *dir, struct dentry *dentry)
487{
488 static unsigned int sillycounter;
489 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
490 const int countersize = sizeof(sillycounter)*2;
491 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
492 char silly[slen+1];
493 struct dentry *sdentry;
494 struct rpc_task *task;
495 int error = -EIO;
496
497 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
498 dentry->d_parent->d_name.name, dentry->d_name.name,
499 atomic_read(&dentry->d_count));
500 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
501
502 /*
503 * We don't allow a dentry to be silly-renamed twice.
504 */
505 error = -EBUSY;
506 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
507 goto out;
508
509 sprintf(silly, ".nfs%*.*Lx",
510 fileidsize, fileidsize,
511 (unsigned long long)NFS_FILEID(dentry->d_inode));
512
513 /* Return delegation in anticipation of the rename */
514 nfs_inode_return_delegation(dentry->d_inode);
515
516 sdentry = NULL;
517 do {
518 char *suffix = silly + slen - countersize;
519
520 dput(sdentry);
521 sillycounter++;
522 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
523
524 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
525 dentry->d_name.name, silly);
526
527 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
528 /*
529 * N.B. Better to return EBUSY here ... it could be
530 * dangerous to delete the file while it's in use.
531 */
532 if (IS_ERR(sdentry))
533 goto out;
534 } while (sdentry->d_inode != NULL); /* need negative lookup */
535
536 /* queue unlink first. Can't do this from rpc_release as it
537 * has to allocate memory
538 */
539 error = nfs_async_unlink(dir, dentry);
540 if (error)
541 goto out_dput;
542
543 /* run the rename task, undo unlink if it fails */
544 task = nfs_async_rename(dir, dir, dentry, sdentry);
545 if (IS_ERR(task)) {
546 error = -EBUSY;
547 nfs_cancel_async_unlink(dentry);
548 goto out_dput;
549 }
550
551 /* wait for the RPC task to complete, unless a SIGKILL intervenes */
552 error = rpc_wait_for_completion_task(task);
553 if (error == 0)
554 error = task->tk_status;
555 rpc_put_task(task);
556out_dput:
557 dput(sdentry);
558out:
559 return error;
560}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 874972d9427c..4c14c17a5276 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
55 if (p) { 55 if (p) {
56 memset(p, 0, sizeof(*p)); 56 memset(p, 0, sizeof(*p));
57 INIT_LIST_HEAD(&p->pages); 57 INIT_LIST_HEAD(&p->pages);
58 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
59 } 58 }
60 return p; 59 return p;
61} 60}
@@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
75 memset(p, 0, sizeof(*p)); 74 memset(p, 0, sizeof(*p));
76 INIT_LIST_HEAD(&p->pages); 75 INIT_LIST_HEAD(&p->pages);
77 p->npages = pagecount; 76 p->npages = pagecount;
78 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
79 if (pagecount <= ARRAY_SIZE(p->page_array)) 77 if (pagecount <= ARRAY_SIZE(p->page_array))
80 p->pagevec = p->page_array; 78 p->pagevec = p->page_array;
81 else { 79 else {
@@ -292,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
292 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 290 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
293 291
294 nfs_pageio_cond_complete(pgio, page->index); 292 nfs_pageio_cond_complete(pgio, page->index);
295 ret = nfs_page_async_flush(pgio, page, 293 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
296 wbc->sync_mode == WB_SYNC_NONE ||
297 wbc->nonblocking != 0);
298 if (ret == -EAGAIN) { 294 if (ret == -EAGAIN) {
299 redirty_page_for_writepage(wbc, page); 295 redirty_page_for_writepage(wbc, page);
300 ret = 0; 296 ret = 0;
@@ -1433,15 +1429,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
1433 int flags = FLUSH_SYNC; 1429 int flags = FLUSH_SYNC;
1434 int ret = 0; 1430 int ret = 0;
1435 1431
1436 /* Don't commit yet if this is a non-blocking flush and there are 1432 if (wbc->sync_mode == WB_SYNC_NONE) {
1437 * lots of outstanding writes for this mapping. 1433 /* Don't commit yet if this is a non-blocking flush and there
1438 */ 1434 * are a lot of outstanding writes for this mapping.
1439 if (wbc->sync_mode == WB_SYNC_NONE && 1435 */
1440 nfsi->ncommit <= (nfsi->npages >> 1)) 1436 if (nfsi->ncommit <= (nfsi->npages >> 1))
1441 goto out_mark_dirty; 1437 goto out_mark_dirty;
1442 1438
1443 if (wbc->nonblocking || wbc->for_background) 1439 /* don't wait for the COMMIT response */
1444 flags = 0; 1440 flags = 0;
1441 }
1442
1445 ret = nfs_commit_inode(inode, flags); 1443 ret = nfs_commit_inode(inode, flags);
1446 if (ret >= 0) { 1444 if (ret >= 0) {
1447 if (wbc->sync_mode == WB_SYNC_NONE) { 1445 if (wbc->sync_mode == WB_SYNC_NONE) {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 4264377552e2..18b3e8975fe0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -28,6 +28,18 @@ config NFSD
28 28
29 If unsure, say N. 29 If unsure, say N.
30 30
31config NFSD_DEPRECATED
32 bool "Include support for deprecated syscall interface to NFSD"
33 depends on NFSD
34 default y
35 help
36 The syscall interface to nfsd was obsoleted in 2.6.0 by a new
37 filesystem based interface. The old interface is due for removal
38 in 2.6.40. If you wish to remove the interface before then
39 say N.
40
41 If unsure, say Y.
42
31config NFSD_V2_ACL 43config NFSD_V2_ACL
32 bool 44 bool
33 depends on NFSD 45 depends on NFSD
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c2a4f71d87dd..c0fcb7ab7f6d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -28,9 +28,6 @@
28typedef struct auth_domain svc_client; 28typedef struct auth_domain svc_client;
29typedef struct svc_export svc_export; 29typedef struct svc_export svc_export;
30 30
31static void exp_do_unexport(svc_export *unexp);
32static int exp_verify_string(char *cp, int max);
33
34/* 31/*
35 * We have two caches. 32 * We have two caches.
36 * One maps client+vfsmnt+dentry to export options - the export map 33 * One maps client+vfsmnt+dentry to export options - the export map
@@ -802,6 +799,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
802 return ek; 799 return ek;
803} 800}
804 801
802#ifdef CONFIG_NFSD_DEPRECATED
805static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, 803static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
806 struct svc_export *exp) 804 struct svc_export *exp)
807{ 805{
@@ -852,6 +850,7 @@ exp_get_fsid_key(svc_client *clp, int fsid)
852 850
853 return exp_find_key(clp, FSID_NUM, fsidv, NULL); 851 return exp_find_key(clp, FSID_NUM, fsidv, NULL);
854} 852}
853#endif
855 854
856static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, 855static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
857 struct cache_req *reqp) 856 struct cache_req *reqp)
@@ -893,6 +892,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
893 return exp; 892 return exp;
894} 893}
895 894
895#ifdef CONFIG_NFSD_DEPRECATED
896/* 896/*
897 * Hashtable locking. Write locks are placed only by user processes 897 * Hashtable locking. Write locks are placed only by user processes
898 * wanting to modify export information. 898 * wanting to modify export information.
@@ -925,6 +925,19 @@ exp_writeunlock(void)
925{ 925{
926 up_write(&hash_sem); 926 up_write(&hash_sem);
927} 927}
928#else
929
930/* hash_sem not needed once deprecated interface is removed */
931void exp_readlock(void) {}
932static inline void exp_writelock(void){}
933void exp_readunlock(void) {}
934static inline void exp_writeunlock(void){}
935
936#endif
937
938#ifdef CONFIG_NFSD_DEPRECATED
939static void exp_do_unexport(svc_export *unexp);
940static int exp_verify_string(char *cp, int max);
928 941
929static void exp_fsid_unhash(struct svc_export *exp) 942static void exp_fsid_unhash(struct svc_export *exp)
930{ 943{
@@ -935,10 +948,9 @@ static void exp_fsid_unhash(struct svc_export *exp)
935 948
936 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); 949 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
937 if (!IS_ERR(ek)) { 950 if (!IS_ERR(ek)) {
938 ek->h.expiry_time = get_seconds()-1; 951 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
939 cache_put(&ek->h, &svc_expkey_cache); 952 cache_put(&ek->h, &svc_expkey_cache);
940 } 953 }
941 svc_expkey_cache.nextcheck = get_seconds();
942} 954}
943 955
944static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) 956static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
@@ -973,10 +985,9 @@ static void exp_unhash(struct svc_export *exp)
973 985
974 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); 986 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
975 if (!IS_ERR(ek)) { 987 if (!IS_ERR(ek)) {
976 ek->h.expiry_time = get_seconds()-1; 988 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
977 cache_put(&ek->h, &svc_expkey_cache); 989 cache_put(&ek->h, &svc_expkey_cache);
978 } 990 }
979 svc_expkey_cache.nextcheck = get_seconds();
980} 991}
981 992
982/* 993/*
@@ -1097,8 +1108,7 @@ out:
1097static void 1108static void
1098exp_do_unexport(svc_export *unexp) 1109exp_do_unexport(svc_export *unexp)
1099{ 1110{
1100 unexp->h.expiry_time = get_seconds()-1; 1111 sunrpc_invalidate(&unexp->h, &svc_export_cache);
1101 svc_export_cache.nextcheck = get_seconds();
1102 exp_unhash(unexp); 1112 exp_unhash(unexp);
1103 exp_fsid_unhash(unexp); 1113 exp_fsid_unhash(unexp);
1104} 1114}
@@ -1150,6 +1160,7 @@ out_unlock:
1150 exp_writeunlock(); 1160 exp_writeunlock();
1151 return err; 1161 return err;
1152} 1162}
1163#endif /* CONFIG_NFSD_DEPRECATED */
1153 1164
1154/* 1165/*
1155 * Obtain the root fh on behalf of a client. 1166 * Obtain the root fh on behalf of a client.
@@ -1459,25 +1470,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags)
1459 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); 1470 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
1460} 1471}
1461 1472
1473static bool secinfo_flags_equal(int f, int g)
1474{
1475 f &= NFSEXP_SECINFO_FLAGS;
1476 g &= NFSEXP_SECINFO_FLAGS;
1477 return f == g;
1478}
1479
1480static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
1481{
1482 int flags;
1483
1484 flags = (*fp)->flags;
1485 seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
1486 (*fp)++;
1487 while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
1488 seq_printf(m, ":%d", (*fp)->pseudoflavor);
1489 (*fp)++;
1490 }
1491 return flags;
1492}
1493
1462static void show_secinfo(struct seq_file *m, struct svc_export *exp) 1494static void show_secinfo(struct seq_file *m, struct svc_export *exp)
1463{ 1495{
1464 struct exp_flavor_info *f; 1496 struct exp_flavor_info *f;
1465 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; 1497 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
1466 int lastflags = 0, first = 0; 1498 int flags;
1467 1499
1468 if (exp->ex_nflavors == 0) 1500 if (exp->ex_nflavors == 0)
1469 return; 1501 return;
1470 for (f = exp->ex_flavors; f < end; f++) { 1502 f = exp->ex_flavors;
1471 if (first || f->flags != lastflags) { 1503 flags = show_secinfo_run(m, &f, end);
1472 if (!first) 1504 if (!secinfo_flags_equal(flags, exp->ex_flags))
1473 show_secinfo_flags(m, lastflags); 1505 show_secinfo_flags(m, flags);
1474 seq_printf(m, ",sec=%d", f->pseudoflavor); 1506 while (f != end) {
1475 lastflags = f->flags; 1507 flags = show_secinfo_run(m, &f, end);
1476 } else { 1508 show_secinfo_flags(m, flags);
1477 seq_printf(m, ":%d", f->pseudoflavor);
1478 }
1479 } 1509 }
1480 show_secinfo_flags(m, lastflags);
1481} 1510}
1482 1511
1483static void exp_flags(struct seq_file *m, int flag, int fsid, 1512static void exp_flags(struct seq_file *m, int flag, int fsid,
@@ -1532,6 +1561,7 @@ const struct seq_operations nfs_exports_op = {
1532 .show = e_show, 1561 .show = e_show,
1533}; 1562};
1534 1563
1564#ifdef CONFIG_NFSD_DEPRECATED
1535/* 1565/*
1536 * Add or modify a client. 1566 * Add or modify a client.
1537 * Change requests may involve the list of host addresses. The list of 1567 * Change requests may involve the list of host addresses. The list of
@@ -1563,7 +1593,7 @@ exp_addclient(struct nfsctl_client *ncp)
1563 /* Insert client into hashtable. */ 1593 /* Insert client into hashtable. */
1564 for (i = 0; i < ncp->cl_naddr; i++) { 1594 for (i = 0; i < ncp->cl_naddr; i++) {
1565 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); 1595 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
1566 auth_unix_add_addr(&addr6, dom); 1596 auth_unix_add_addr(&init_net, &addr6, dom);
1567 } 1597 }
1568 auth_unix_forget_old(dom); 1598 auth_unix_forget_old(dom);
1569 auth_domain_put(dom); 1599 auth_domain_put(dom);
@@ -1621,6 +1651,7 @@ exp_verify_string(char *cp, int max)
1621 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); 1651 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
1622 return 0; 1652 return 0;
1623} 1653}
1654#endif /* CONFIG_NFSD_DEPRECATED */
1624 1655
1625/* 1656/*
1626 * Initialize the exports module. 1657 * Initialize the exports module.
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 988cbb3a19b6..143da2eecd7b 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
41 41
42#define NFSPROC4_CB_NULL 0 42#define NFSPROC4_CB_NULL 0
43#define NFSPROC4_CB_COMPOUND 1 43#define NFSPROC4_CB_COMPOUND 1
44#define NFS4_STATEID_SIZE 16
45 44
46/* Index of predefined Linux callback client operations */ 45/* Index of predefined Linux callback client operations */
47 46
@@ -248,10 +247,11 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
248} 247}
249 248
250static void 249static void
251encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, 250encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
252 struct nfs4_cb_compound_hdr *hdr) 251 struct nfs4_cb_compound_hdr *hdr)
253{ 252{
254 __be32 *p; 253 __be32 *p;
254 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
255 255
256 if (hdr->minorversion == 0) 256 if (hdr->minorversion == 0)
257 return; 257 return;
@@ -259,8 +259,8 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); 259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
260 260
261 WRITE32(OP_CB_SEQUENCE); 261 WRITE32(OP_CB_SEQUENCE);
262 WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); 262 WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
263 WRITE32(args->cbs_clp->cl_cb_seq_nr); 263 WRITE32(ses->se_cb_seq_nr);
264 WRITE32(0); /* slotid, always 0 */ 264 WRITE32(0); /* slotid, always 0 */
265 WRITE32(0); /* highest slotid always 0 */ 265 WRITE32(0); /* highest slotid always 0 */
266 WRITE32(0); /* cachethis always 0 */ 266 WRITE32(0); /* cachethis always 0 */
@@ -280,18 +280,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
280 280
281static int 281static int
282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, 282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
283 struct nfs4_rpc_args *rpc_args) 283 struct nfsd4_callback *cb)
284{ 284{
285 struct xdr_stream xdr; 285 struct xdr_stream xdr;
286 struct nfs4_delegation *args = rpc_args->args_op; 286 struct nfs4_delegation *args = cb->cb_op;
287 struct nfs4_cb_compound_hdr hdr = { 287 struct nfs4_cb_compound_hdr hdr = {
288 .ident = args->dl_ident, 288 .ident = cb->cb_clp->cl_cb_ident,
289 .minorversion = rpc_args->args_seq.cbs_minorversion, 289 .minorversion = cb->cb_minorversion,
290 }; 290 };
291 291
292 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 292 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
293 encode_cb_compound_hdr(&xdr, &hdr); 293 encode_cb_compound_hdr(&xdr, &hdr);
294 encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); 294 encode_cb_sequence(&xdr, cb, &hdr);
295 encode_cb_recall(&xdr, args, &hdr); 295 encode_cb_recall(&xdr, args, &hdr);
296 encode_cb_nops(&hdr); 296 encode_cb_nops(&hdr);
297 return 0; 297 return 0;
@@ -339,15 +339,16 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
339 * with a single slot. 339 * with a single slot.
340 */ 340 */
341static int 341static int
342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res, 342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
343 struct rpc_rqst *rqstp) 343 struct rpc_rqst *rqstp)
344{ 344{
345 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
345 struct nfs4_sessionid id; 346 struct nfs4_sessionid id;
346 int status; 347 int status;
347 u32 dummy; 348 u32 dummy;
348 __be32 *p; 349 __be32 *p;
349 350
350 if (res->cbs_minorversion == 0) 351 if (cb->cb_minorversion == 0)
351 return 0; 352 return 0;
352 353
353 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE); 354 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
@@ -363,13 +364,12 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
363 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); 364 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
364 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 365 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
365 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); 366 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
366 if (memcmp(id.data, res->cbs_clp->cl_sessionid.data, 367 if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
367 NFS4_MAX_SESSIONID_LEN)) {
368 dprintk("%s Invalid session id\n", __func__); 368 dprintk("%s Invalid session id\n", __func__);
369 goto out; 369 goto out;
370 } 370 }
371 READ32(dummy); 371 READ32(dummy);
372 if (dummy != res->cbs_clp->cl_cb_seq_nr) { 372 if (dummy != ses->se_cb_seq_nr) {
373 dprintk("%s Invalid sequence number\n", __func__); 373 dprintk("%s Invalid sequence number\n", __func__);
374 goto out; 374 goto out;
375 } 375 }
@@ -393,7 +393,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
393 393
394static int 394static int
395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, 395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
396 struct nfsd4_cb_sequence *seq) 396 struct nfsd4_callback *cb)
397{ 397{
398 struct xdr_stream xdr; 398 struct xdr_stream xdr;
399 struct nfs4_cb_compound_hdr hdr; 399 struct nfs4_cb_compound_hdr hdr;
@@ -403,8 +403,8 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
403 status = decode_cb_compound_hdr(&xdr, &hdr); 403 status = decode_cb_compound_hdr(&xdr, &hdr);
404 if (status) 404 if (status)
405 goto out; 405 goto out;
406 if (seq) { 406 if (cb) {
407 status = decode_cb_sequence(&xdr, seq, rqstp); 407 status = decode_cb_sequence(&xdr, cb, rqstp);
408 if (status) 408 if (status)
409 goto out; 409 goto out;
410 } 410 }
@@ -473,30 +473,34 @@ static int max_cb_time(void)
473/* Reference counting, callback cleanup, etc., all look racy as heck. 473/* Reference counting, callback cleanup, etc., all look racy as heck.
474 * And why is cl_cb_set an atomic? */ 474 * And why is cl_cb_set an atomic? */
475 475
476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
477{ 477{
478 struct rpc_timeout timeparms = { 478 struct rpc_timeout timeparms = {
479 .to_initval = max_cb_time(), 479 .to_initval = max_cb_time(),
480 .to_retries = 0, 480 .to_retries = 0,
481 }; 481 };
482 struct rpc_create_args args = { 482 struct rpc_create_args args = {
483 .protocol = XPRT_TRANSPORT_TCP, 483 .net = &init_net,
484 .address = (struct sockaddr *) &cb->cb_addr, 484 .address = (struct sockaddr *) &conn->cb_addr,
485 .addrsize = cb->cb_addrlen, 485 .addrsize = conn->cb_addrlen,
486 .timeout = &timeparms, 486 .timeout = &timeparms,
487 .program = &cb_program, 487 .program = &cb_program,
488 .prognumber = cb->cb_prog,
489 .version = 0, 488 .version = 0,
490 .authflavor = clp->cl_flavor, 489 .authflavor = clp->cl_flavor,
491 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 490 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
492 .client_name = clp->cl_principal,
493 }; 491 };
494 struct rpc_clnt *client; 492 struct rpc_clnt *client;
495 493
496 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 494 if (clp->cl_minorversion == 0) {
497 return -EINVAL; 495 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
498 if (cb->cb_minorversion) { 496 return -EINVAL;
499 args.bc_xprt = cb->cb_xprt; 497 args.client_name = clp->cl_principal;
498 args.prognumber = conn->cb_prog,
499 args.protocol = XPRT_TRANSPORT_TCP;
500 clp->cl_cb_ident = conn->cb_ident;
501 } else {
502 args.bc_xprt = conn->cb_xprt;
503 args.prognumber = clp->cl_cb_session->se_cb_prog;
500 args.protocol = XPRT_TRANSPORT_BC_TCP; 504 args.protocol = XPRT_TRANSPORT_BC_TCP;
501 } 505 }
502 /* Create RPC client */ 506 /* Create RPC client */
@@ -506,7 +510,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
506 PTR_ERR(client)); 510 PTR_ERR(client));
507 return PTR_ERR(client); 511 return PTR_ERR(client);
508 } 512 }
509 nfsd4_set_callback_client(clp, client); 513 clp->cl_cb_client = client;
510 return 0; 514 return 0;
511 515
512} 516}
@@ -519,7 +523,7 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
519 523
520static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 524static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
521{ 525{
522 struct nfs4_client *clp = calldata; 526 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
523 527
524 if (task->tk_status) 528 if (task->tk_status)
525 warn_no_callback_path(clp, task->tk_status); 529 warn_no_callback_path(clp, task->tk_status);
@@ -528,6 +532,8 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
528} 532}
529 533
530static const struct rpc_call_ops nfsd4_cb_probe_ops = { 534static const struct rpc_call_ops nfsd4_cb_probe_ops = {
535 /* XXX: release method to ensure we set the cb channel down if
536 * necessary on early failure? */
531 .rpc_call_done = nfsd4_cb_probe_done, 537 .rpc_call_done = nfsd4_cb_probe_done,
532}; 538};
533 539
@@ -543,38 +549,42 @@ int set_callback_cred(void)
543 return 0; 549 return 0;
544} 550}
545 551
552static struct workqueue_struct *callback_wq;
546 553
547void do_probe_callback(struct nfs4_client *clp) 554static void do_probe_callback(struct nfs4_client *clp)
548{ 555{
549 struct rpc_message msg = { 556 struct nfsd4_callback *cb = &clp->cl_cb_null;
550 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
551 .rpc_argp = clp,
552 .rpc_cred = callback_cred
553 };
554 int status;
555 557
556 status = rpc_call_async(clp->cl_cb_client, &msg, 558 cb->cb_op = NULL;
557 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 559 cb->cb_clp = clp;
558 &nfsd4_cb_probe_ops, (void *)clp); 560
559 if (status) 561 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
560 warn_no_callback_path(clp, status); 562 cb->cb_msg.rpc_argp = NULL;
563 cb->cb_msg.rpc_resp = NULL;
564 cb->cb_msg.rpc_cred = callback_cred;
565
566 cb->cb_ops = &nfsd4_cb_probe_ops;
567
568 queue_work(callback_wq, &cb->cb_work);
561} 569}
562 570
563/* 571/*
564 * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 572 * Poke the callback thread to process any updates to the callback
573 * parameters, and send a null probe.
565 */ 574 */
566void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 575void nfsd4_probe_callback(struct nfs4_client *clp)
567{ 576{
568 int status; 577 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
578 do_probe_callback(clp);
579}
569 580
581void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
582{
570 BUG_ON(atomic_read(&clp->cl_cb_set)); 583 BUG_ON(atomic_read(&clp->cl_cb_set));
571 584
572 status = setup_callback_client(clp, cb); 585 spin_lock(&clp->cl_lock);
573 if (status) { 586 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
574 warn_no_callback_path(clp, status); 587 spin_unlock(&clp->cl_lock);
575 return;
576 }
577 do_probe_callback(clp);
578} 588}
579 589
580/* 590/*
@@ -585,8 +595,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
585static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, 595static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
586 struct rpc_task *task) 596 struct rpc_task *task)
587{ 597{
588 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; 598 u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
589 u32 *ptr = (u32 *)clp->cl_sessionid.data;
590 int status = 0; 599 int status = 0;
591 600
592 dprintk("%s: %u:%u:%u:%u\n", __func__, 601 dprintk("%s: %u:%u:%u:%u\n", __func__,
@@ -598,14 +607,6 @@ static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
598 status = -EAGAIN; 607 status = -EAGAIN;
599 goto out; 608 goto out;
600 } 609 }
601
602 /*
603 * We'll need the clp during XDR encoding and decoding,
604 * and the sequence during decoding to verify the reply
605 */
606 args->args_seq.cbs_clp = clp;
607 task->tk_msg.rpc_resp = &args->args_seq;
608
609out: 610out:
610 dprintk("%s status=%d\n", __func__, status); 611 dprintk("%s status=%d\n", __func__, status);
611 return status; 612 return status;
@@ -617,13 +618,13 @@ out:
617 */ 618 */
618static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) 619static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
619{ 620{
620 struct nfs4_delegation *dp = calldata; 621 struct nfsd4_callback *cb = calldata;
622 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
621 struct nfs4_client *clp = dp->dl_client; 623 struct nfs4_client *clp = dp->dl_client;
622 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; 624 u32 minorversion = clp->cl_minorversion;
623 u32 minorversion = clp->cl_cb_conn.cb_minorversion;
624 int status = 0; 625 int status = 0;
625 626
626 args->args_seq.cbs_minorversion = minorversion; 627 cb->cb_minorversion = minorversion;
627 if (minorversion) { 628 if (minorversion) {
628 status = nfsd41_cb_setup_sequence(clp, task); 629 status = nfsd41_cb_setup_sequence(clp, task);
629 if (status) { 630 if (status) {
@@ -640,19 +641,20 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
640 641
641static void nfsd4_cb_done(struct rpc_task *task, void *calldata) 642static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
642{ 643{
643 struct nfs4_delegation *dp = calldata; 644 struct nfsd4_callback *cb = calldata;
645 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
644 struct nfs4_client *clp = dp->dl_client; 646 struct nfs4_client *clp = dp->dl_client;
645 647
646 dprintk("%s: minorversion=%d\n", __func__, 648 dprintk("%s: minorversion=%d\n", __func__,
647 clp->cl_cb_conn.cb_minorversion); 649 clp->cl_minorversion);
648 650
649 if (clp->cl_cb_conn.cb_minorversion) { 651 if (clp->cl_minorversion) {
650 /* No need for lock, access serialized in nfsd4_cb_prepare */ 652 /* No need for lock, access serialized in nfsd4_cb_prepare */
651 ++clp->cl_cb_seq_nr; 653 ++clp->cl_cb_session->se_cb_seq_nr;
652 clear_bit(0, &clp->cl_cb_slot_busy); 654 clear_bit(0, &clp->cl_cb_slot_busy);
653 rpc_wake_up_next(&clp->cl_cb_waitq); 655 rpc_wake_up_next(&clp->cl_cb_waitq);
654 dprintk("%s: freed slot, new seqid=%d\n", __func__, 656 dprintk("%s: freed slot, new seqid=%d\n", __func__,
655 clp->cl_cb_seq_nr); 657 clp->cl_cb_session->se_cb_seq_nr);
656 658
657 /* We're done looking into the sequence information */ 659 /* We're done looking into the sequence information */
658 task->tk_msg.rpc_resp = NULL; 660 task->tk_msg.rpc_resp = NULL;
@@ -662,7 +664,8 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
662 664
663static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 665static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
664{ 666{
665 struct nfs4_delegation *dp = calldata; 667 struct nfsd4_callback *cb = calldata;
668 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
666 struct nfs4_client *clp = dp->dl_client; 669 struct nfs4_client *clp = dp->dl_client;
667 struct rpc_clnt *current_rpc_client = clp->cl_cb_client; 670 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
668 671
@@ -707,7 +710,8 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
707 710
708static void nfsd4_cb_recall_release(void *calldata) 711static void nfsd4_cb_recall_release(void *calldata)
709{ 712{
710 struct nfs4_delegation *dp = calldata; 713 struct nfsd4_callback *cb = calldata;
714 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
711 715
712 nfs4_put_delegation(dp); 716 nfs4_put_delegation(dp);
713} 717}
@@ -718,8 +722,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
718 .rpc_release = nfsd4_cb_recall_release, 722 .rpc_release = nfsd4_cb_recall_release,
719}; 723};
720 724
721static struct workqueue_struct *callback_wq;
722
723int nfsd4_create_callback_queue(void) 725int nfsd4_create_callback_queue(void)
724{ 726{
725 callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); 727 callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
@@ -734,57 +736,88 @@ void nfsd4_destroy_callback_queue(void)
734} 736}
735 737
736/* must be called under the state lock */ 738/* must be called under the state lock */
737void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) 739void nfsd4_shutdown_callback(struct nfs4_client *clp)
738{ 740{
739 struct rpc_clnt *old = clp->cl_cb_client; 741 set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
740
741 clp->cl_cb_client = new;
742 /* 742 /*
743 * After this, any work that saw the old value of cl_cb_client will 743 * Note this won't actually result in a null callback;
744 * be gone: 744 * instead, nfsd4_do_callback_rpc() will detect the killed
745 * client, destroy the rpc client, and stop:
745 */ 746 */
747 do_probe_callback(clp);
746 flush_workqueue(callback_wq); 748 flush_workqueue(callback_wq);
747 /* So we can safely shut it down: */
748 if (old)
749 rpc_shutdown_client(old);
750} 749}
751 750
752/* 751void nfsd4_release_cb(struct nfsd4_callback *cb)
753 * called with dp->dl_count inc'ed.
754 */
755static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
756{ 752{
757 struct nfs4_client *clp = dp->dl_client; 753 if (cb->cb_ops->rpc_release)
758 struct rpc_clnt *clnt = clp->cl_cb_client; 754 cb->cb_ops->rpc_release(cb);
759 struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; 755}
760 struct rpc_message msg = {
761 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
762 .rpc_cred = callback_cred
763 };
764 756
765 if (clnt == NULL) { 757void nfsd4_process_cb_update(struct nfsd4_callback *cb)
766 nfs4_put_delegation(dp); 758{
767 return; /* Client is shutting down; give up. */ 759 struct nfs4_cb_conn conn;
760 struct nfs4_client *clp = cb->cb_clp;
761 int err;
762
763 /*
764 * This is either an update, or the client dying; in either case,
765 * kill the old client:
766 */
767 if (clp->cl_cb_client) {
768 rpc_shutdown_client(clp->cl_cb_client);
769 clp->cl_cb_client = NULL;
768 } 770 }
771 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
772 return;
773 spin_lock(&clp->cl_lock);
774 /*
775 * Only serialized callback code is allowed to clear these
776 * flags; main nfsd code can only set them:
777 */
778 BUG_ON(!clp->cl_cb_flags);
779 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
780 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
781 spin_unlock(&clp->cl_lock);
769 782
770 args->args_op = dp; 783 err = setup_callback_client(clp, &conn);
771 msg.rpc_argp = args; 784 if (err)
772 dp->dl_retries = 1; 785 warn_no_callback_path(clp, err);
773 rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp);
774} 786}
775 787
776void nfsd4_do_callback_rpc(struct work_struct *w) 788void nfsd4_do_callback_rpc(struct work_struct *w)
777{ 789{
778 /* XXX: for now, just send off delegation recall. */ 790 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
779 /* In future, generalize to handle any sort of callback. */ 791 struct nfs4_client *clp = cb->cb_clp;
780 struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); 792 struct rpc_clnt *clnt;
781 struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
782 793
783 _nfsd4_cb_recall(dp); 794 if (clp->cl_cb_flags)
784} 795 nfsd4_process_cb_update(cb);
785 796
797 clnt = clp->cl_cb_client;
798 if (!clnt) {
799 /* Callback channel broken, or client killed; give up: */
800 nfsd4_release_cb(cb);
801 return;
802 }
803 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
804 cb->cb_ops, cb);
805}
786 806
787void nfsd4_cb_recall(struct nfs4_delegation *dp) 807void nfsd4_cb_recall(struct nfs4_delegation *dp)
788{ 808{
809 struct nfsd4_callback *cb = &dp->dl_recall;
810
811 dp->dl_retries = 1;
812 cb->cb_op = dp;
813 cb->cb_clp = dp->dl_client;
814 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
815 cb->cb_msg.rpc_argp = cb;
816 cb->cb_msg.rpc_resp = cb;
817 cb->cb_msg.rpc_cred = callback_cred;
818
819 cb->cb_ops = &nfsd4_cb_recall_ops;
820 dp->dl_retries = 1;
821
789 queue_work(callback_wq, &dp->dl_recall.cb_work); 822 queue_work(callback_wq, &dp->dl_recall.cb_work);
790} 823}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c78dbf493424..f0695e815f0e 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -482,109 +482,26 @@ nfsd_idmap_shutdown(void)
482 cache_unregister(&nametoid_cache); 482 cache_unregister(&nametoid_cache);
483} 483}
484 484
485/*
486 * Deferred request handling
487 */
488
489struct idmap_defer_req {
490 struct cache_req req;
491 struct cache_deferred_req deferred_req;
492 wait_queue_head_t waitq;
493 atomic_t count;
494};
495
496static inline void
497put_mdr(struct idmap_defer_req *mdr)
498{
499 if (atomic_dec_and_test(&mdr->count))
500 kfree(mdr);
501}
502
503static inline void
504get_mdr(struct idmap_defer_req *mdr)
505{
506 atomic_inc(&mdr->count);
507}
508
509static void
510idmap_revisit(struct cache_deferred_req *dreq, int toomany)
511{
512 struct idmap_defer_req *mdr =
513 container_of(dreq, struct idmap_defer_req, deferred_req);
514
515 wake_up(&mdr->waitq);
516 put_mdr(mdr);
517}
518
519static struct cache_deferred_req *
520idmap_defer(struct cache_req *req)
521{
522 struct idmap_defer_req *mdr =
523 container_of(req, struct idmap_defer_req, req);
524
525 mdr->deferred_req.revisit = idmap_revisit;
526 get_mdr(mdr);
527 return (&mdr->deferred_req);
528}
529
530static inline int
531do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key,
532 struct cache_detail *detail, struct ent **item,
533 struct idmap_defer_req *mdr)
534{
535 *item = lookup_fn(key);
536 if (!*item)
537 return -ENOMEM;
538 return cache_check(detail, &(*item)->h, &mdr->req);
539}
540
541static inline int
542do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
543 struct ent *key, struct cache_detail *detail,
544 struct ent **item)
545{
546 int ret = -ENOMEM;
547
548 *item = lookup_fn(key);
549 if (!*item)
550 goto out_err;
551 ret = -ETIMEDOUT;
552 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
553 || (*item)->h.expiry_time < get_seconds()
554 || detail->flush_time > (*item)->h.last_refresh)
555 goto out_put;
556 ret = -ENOENT;
557 if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
558 goto out_put;
559 return 0;
560out_put:
561 cache_put(&(*item)->h, detail);
562out_err:
563 *item = NULL;
564 return ret;
565}
566
567static int 485static int
568idmap_lookup(struct svc_rqst *rqstp, 486idmap_lookup(struct svc_rqst *rqstp,
569 struct ent *(*lookup_fn)(struct ent *), struct ent *key, 487 struct ent *(*lookup_fn)(struct ent *), struct ent *key,
570 struct cache_detail *detail, struct ent **item) 488 struct cache_detail *detail, struct ent **item)
571{ 489{
572 struct idmap_defer_req *mdr;
573 int ret; 490 int ret;
574 491
575 mdr = kzalloc(sizeof(*mdr), GFP_KERNEL); 492 *item = lookup_fn(key);
576 if (!mdr) 493 if (!*item)
577 return -ENOMEM; 494 return -ENOMEM;
578 atomic_set(&mdr->count, 1); 495 retry:
579 init_waitqueue_head(&mdr->waitq); 496 ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
580 mdr->req.defer = idmap_defer; 497
581 ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr); 498 if (ret == -ETIMEDOUT) {
582 if (ret == -EAGAIN) { 499 struct ent *prev_item = *item;
583 wait_event_interruptible_timeout(mdr->waitq, 500 *item = lookup_fn(key);
584 test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ); 501 if (*item != prev_item)
585 ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item); 502 goto retry;
503 cache_put(&(*item)->h, detail);
586 } 504 }
587 put_mdr(mdr);
588 return ret; 505 return ret;
589} 506}
590 507
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 59ec449b0c7f..0cdfd022bb7b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1031,8 +1031,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1031 resp->cstate.session = NULL; 1031 resp->cstate.session = NULL;
1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); 1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); 1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1034 /* Use the deferral mechanism only for NFSv4.0 compounds */ 1034 /*
1035 rqstp->rq_usedeferral = (args->minorversion == 0); 1035 * Don't use the deferral mechanism for NFSv4; compounds make it
1036 * too hard to avoid non-idempotency problems.
1037 */
1038 rqstp->rq_usedeferral = 0;
1036 1039
1037 /* 1040 /*
1038 * According to RFC3010, this takes precedence over all other errors. 1041 * According to RFC3010, this takes precedence over all other errors.
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cf0d2ffb3c84..ad2bfa68d534 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -33,7 +33,7 @@
33*/ 33*/
34 34
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/smp_lock.h> 36#include <linux/fs.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/swap.h> 39#include <linux/swap.h>
@@ -207,7 +207,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
207{ 207{
208 struct nfs4_delegation *dp; 208 struct nfs4_delegation *dp;
209 struct nfs4_file *fp = stp->st_file; 209 struct nfs4_file *fp = stp->st_file;
210 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
211 210
212 dprintk("NFSD alloc_init_deleg\n"); 211 dprintk("NFSD alloc_init_deleg\n");
213 /* 212 /*
@@ -234,7 +233,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
234 nfs4_file_get_access(fp, O_RDONLY); 233 nfs4_file_get_access(fp, O_RDONLY);
235 dp->dl_flock = NULL; 234 dp->dl_flock = NULL;
236 dp->dl_type = type; 235 dp->dl_type = type;
237 dp->dl_ident = cb->cb_ident;
238 dp->dl_stateid.si_boot = boot_time; 236 dp->dl_stateid.si_boot = boot_time;
239 dp->dl_stateid.si_stateownerid = current_delegid++; 237 dp->dl_stateid.si_stateownerid = current_delegid++;
240 dp->dl_stateid.si_fileid = 0; 238 dp->dl_stateid.si_fileid = 0;
@@ -535,171 +533,262 @@ gen_sessionid(struct nfsd4_session *ses)
535 */ 533 */
536#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) 534#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
537 535
536static void
537free_session_slots(struct nfsd4_session *ses)
538{
539 int i;
540
541 for (i = 0; i < ses->se_fchannel.maxreqs; i++)
542 kfree(ses->se_slots[i]);
543}
544
538/* 545/*
539 * Give the client the number of ca_maxresponsesize_cached slots it 546 * We don't actually need to cache the rpc and session headers, so we
540 * requests, of size bounded by NFSD_SLOT_CACHE_SIZE, 547 * can allocate a little less for each slot:
541 * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more 548 */
542 * than NFSD_MAX_SLOTS_PER_SESSION. 549static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
543 * 550{
544 * If we run out of reserved DRC memory we should (up to a point) 551 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
552}
553
554static int nfsd4_sanitize_slot_size(u32 size)
555{
556 size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
557 size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
558
559 return size;
560}
561
562/*
563 * XXX: If we run out of reserved DRC memory we could (up to a point)
545 * re-negotiate active sessions and reduce their slot usage to make 564 * re-negotiate active sessions and reduce their slot usage to make
546 * rooom for new connections. For now we just fail the create session. 565 * rooom for new connections. For now we just fail the create session.
547 */ 566 */
548static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) 567static int nfsd4_get_drc_mem(int slotsize, u32 num)
549{ 568{
550 int mem, size = fchan->maxresp_cached; 569 int avail;
551 570
552 if (fchan->maxreqs < 1) 571 num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
553 return nfserr_inval;
554 572
555 if (size < NFSD_MIN_HDR_SEQ_SZ) 573 spin_lock(&nfsd_drc_lock);
556 size = NFSD_MIN_HDR_SEQ_SZ; 574 avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
557 size -= NFSD_MIN_HDR_SEQ_SZ; 575 nfsd_drc_max_mem - nfsd_drc_mem_used);
558 if (size > NFSD_SLOT_CACHE_SIZE) 576 num = min_t(int, num, avail / slotsize);
559 size = NFSD_SLOT_CACHE_SIZE; 577 nfsd_drc_mem_used += num * slotsize;
560 578 spin_unlock(&nfsd_drc_lock);
561 /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */ 579
562 mem = fchan->maxreqs * size; 580 return num;
563 if (mem > NFSD_MAX_MEM_PER_SESSION) { 581}
564 fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
565 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
566 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
567 mem = fchan->maxreqs * size;
568 }
569 582
583static void nfsd4_put_drc_mem(int slotsize, int num)
584{
570 spin_lock(&nfsd_drc_lock); 585 spin_lock(&nfsd_drc_lock);
571 /* bound the total session drc memory ussage */ 586 nfsd_drc_mem_used -= slotsize * num;
572 if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
573 fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
574 mem = fchan->maxreqs * size;
575 }
576 nfsd_drc_mem_used += mem;
577 spin_unlock(&nfsd_drc_lock); 587 spin_unlock(&nfsd_drc_lock);
588}
578 589
579 if (fchan->maxreqs == 0) 590static struct nfsd4_session *alloc_session(int slotsize, int numslots)
580 return nfserr_jukebox; 591{
592 struct nfsd4_session *new;
593 int mem, i;
581 594
582 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; 595 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
583 return 0; 596 + sizeof(struct nfsd4_session) > PAGE_SIZE);
597 mem = numslots * sizeof(struct nfsd4_slot *);
598
599 new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
600 if (!new)
601 return NULL;
602 /* allocate each struct nfsd4_slot and data cache in one piece */
603 for (i = 0; i < numslots; i++) {
604 mem = sizeof(struct nfsd4_slot) + slotsize;
605 new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
606 if (!new->se_slots[i])
607 goto out_free;
608 }
609 return new;
610out_free:
611 while (i--)
612 kfree(new->se_slots[i]);
613 kfree(new);
614 return NULL;
584} 615}
585 616
586/* 617static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
587 * fchan holds the client values on input, and the server values on output
588 * sv_max_mesg is the maximum payload plus one page for overhead.
589 */
590static int init_forechannel_attrs(struct svc_rqst *rqstp,
591 struct nfsd4_channel_attrs *session_fchan,
592 struct nfsd4_channel_attrs *fchan)
593{ 618{
594 int status = 0; 619 u32 maxrpc = nfsd_serv->sv_max_mesg;
595 __u32 maxcount = nfsd_serv->sv_max_mesg;
596 620
597 /* headerpadsz set to zero in encode routine */ 621 new->maxreqs = numslots;
622 new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
623 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
624 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
625 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
626}
598 627
599 /* Use the client's max request and max response size if possible */ 628static void free_conn(struct nfsd4_conn *c)
600 if (fchan->maxreq_sz > maxcount) 629{
601 fchan->maxreq_sz = maxcount; 630 svc_xprt_put(c->cn_xprt);
602 session_fchan->maxreq_sz = fchan->maxreq_sz; 631 kfree(c);
632}
603 633
604 if (fchan->maxresp_sz > maxcount) 634static void nfsd4_conn_lost(struct svc_xpt_user *u)
605 fchan->maxresp_sz = maxcount; 635{
606 session_fchan->maxresp_sz = fchan->maxresp_sz; 636 struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
637 struct nfs4_client *clp = c->cn_session->se_client;
607 638
608 /* Use the client's maxops if possible */ 639 spin_lock(&clp->cl_lock);
609 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) 640 if (!list_empty(&c->cn_persession)) {
610 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; 641 list_del(&c->cn_persession);
611 session_fchan->maxops = fchan->maxops; 642 free_conn(c);
643 }
644 spin_unlock(&clp->cl_lock);
645}
612 646
613 /* FIXME: Error means no more DRC pages so the server should 647static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
614 * recover pages from existing sessions. For now fail session 648{
615 * creation. 649 struct nfsd4_conn *conn;
616 */
617 status = set_forechannel_drc_size(fchan);
618 650
619 session_fchan->maxresp_cached = fchan->maxresp_cached; 651 conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
620 session_fchan->maxreqs = fchan->maxreqs; 652 if (!conn)
653 return NULL;
654 svc_xprt_get(rqstp->rq_xprt);
655 conn->cn_xprt = rqstp->rq_xprt;
656 conn->cn_flags = flags;
657 INIT_LIST_HEAD(&conn->cn_xpt_user.list);
658 return conn;
659}
621 660
622 dprintk("%s status %d\n", __func__, status); 661static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
623 return status; 662{
663 conn->cn_session = ses;
664 list_add(&conn->cn_persession, &ses->se_conns);
624} 665}
625 666
626static void 667static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
627free_session_slots(struct nfsd4_session *ses)
628{ 668{
629 int i; 669 struct nfs4_client *clp = ses->se_client;
630 670
631 for (i = 0; i < ses->se_fchannel.maxreqs; i++) 671 spin_lock(&clp->cl_lock);
632 kfree(ses->se_slots[i]); 672 __nfsd4_hash_conn(conn, ses);
673 spin_unlock(&clp->cl_lock);
633} 674}
634 675
635/* 676static int nfsd4_register_conn(struct nfsd4_conn *conn)
636 * We don't actually need to cache the rpc and session headers, so we
637 * can allocate a little less for each slot:
638 */
639static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
640{ 677{
641 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; 678 conn->cn_xpt_user.callback = nfsd4_conn_lost;
679 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
642} 680}
643 681
644static int 682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
645alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
646 struct nfsd4_create_session *cses)
647{ 683{
648 struct nfsd4_session *new, tmp; 684 struct nfsd4_conn *conn;
649 struct nfsd4_slot *sp; 685 u32 flags = NFS4_CDFC4_FORE;
650 int idx, slotsize, cachesize, i; 686 int ret;
651 int status;
652 687
653 memset(&tmp, 0, sizeof(tmp)); 688 if (ses->se_flags & SESSION4_BACK_CHAN)
689 flags |= NFS4_CDFC4_BACK;
690 conn = alloc_conn(rqstp, flags);
691 if (!conn)
692 return nfserr_jukebox;
693 nfsd4_hash_conn(conn, ses);
694 ret = nfsd4_register_conn(conn);
695 if (ret)
696 /* oops; xprt is already down: */
697 nfsd4_conn_lost(&conn->cn_xpt_user);
698 return nfs_ok;
699}
654 700
655 /* FIXME: For now, we just accept the client back channel attributes. */ 701static void nfsd4_del_conns(struct nfsd4_session *s)
656 tmp.se_bchannel = cses->back_channel; 702{
657 status = init_forechannel_attrs(rqstp, &tmp.se_fchannel, 703 struct nfs4_client *clp = s->se_client;
658 &cses->fore_channel); 704 struct nfsd4_conn *c;
659 if (status)
660 goto out;
661 705
662 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) 706 spin_lock(&clp->cl_lock);
663 + sizeof(struct nfsd4_session) > PAGE_SIZE); 707 while (!list_empty(&s->se_conns)) {
708 c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
709 list_del_init(&c->cn_persession);
710 spin_unlock(&clp->cl_lock);
664 711
665 status = nfserr_jukebox; 712 unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
666 /* allocate struct nfsd4_session and slot table pointers in one piece */ 713 free_conn(c);
667 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
668 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
669 if (!new)
670 goto out;
671 714
672 memcpy(new, &tmp, sizeof(*new)); 715 spin_lock(&clp->cl_lock);
716 }
717 spin_unlock(&clp->cl_lock);
718}
673 719
674 /* allocate each struct nfsd4_slot and data cache in one piece */ 720void free_session(struct kref *kref)
675 cachesize = slot_bytes(&new->se_fchannel); 721{
676 for (i = 0; i < new->se_fchannel.maxreqs; i++) { 722 struct nfsd4_session *ses;
677 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); 723 int mem;
678 if (!sp) 724
679 goto out_free; 725 ses = container_of(kref, struct nfsd4_session, se_ref);
680 new->se_slots[i] = sp; 726 nfsd4_del_conns(ses);
727 spin_lock(&nfsd_drc_lock);
728 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
729 nfsd_drc_mem_used -= mem;
730 spin_unlock(&nfsd_drc_lock);
731 free_session_slots(ses);
732 kfree(ses);
733}
734
735static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
736{
737 struct nfsd4_session *new;
738 struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
739 int numslots, slotsize;
740 int status;
741 int idx;
742
743 /*
744 * Note decreasing slot size below client's request may
745 * make it difficult for client to function correctly, whereas
746 * decreasing the number of slots will (just?) affect
747 * performance. When short on memory we therefore prefer to
748 * decrease number of slots instead of their size.
749 */
750 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
751 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
752
753 new = alloc_session(slotsize, numslots);
754 if (!new) {
755 nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
756 return NULL;
681 } 757 }
758 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
682 759
683 new->se_client = clp; 760 new->se_client = clp;
684 gen_sessionid(new); 761 gen_sessionid(new);
685 idx = hash_sessionid(&new->se_sessionid);
686 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
687 NFS4_MAX_SESSIONID_LEN);
688 762
763 INIT_LIST_HEAD(&new->se_conns);
764
765 new->se_cb_seq_nr = 1;
689 new->se_flags = cses->flags; 766 new->se_flags = cses->flags;
767 new->se_cb_prog = cses->callback_prog;
690 kref_init(&new->se_ref); 768 kref_init(&new->se_ref);
769 idx = hash_sessionid(&new->se_sessionid);
691 spin_lock(&client_lock); 770 spin_lock(&client_lock);
692 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 771 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
693 list_add(&new->se_perclnt, &clp->cl_sessions); 772 list_add(&new->se_perclnt, &clp->cl_sessions);
694 spin_unlock(&client_lock); 773 spin_unlock(&client_lock);
695 774
696 status = nfs_ok; 775 status = nfsd4_new_conn(rqstp, new);
697out: 776 /* whoops: benny points out, status is ignored! (err, or bogus) */
698 return status; 777 if (status) {
699out_free: 778 free_session(&new->se_ref);
700 free_session_slots(new); 779 return NULL;
701 kfree(new); 780 }
702 goto out; 781 if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
782 struct sockaddr *sa = svc_addr(rqstp);
783
784 clp->cl_cb_session = new;
785 clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
786 svc_xprt_get(rqstp->rq_xprt);
787 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
788 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
789 nfsd4_probe_callback(clp);
790 }
791 return new;
703} 792}
704 793
705/* caller must hold client_lock */ 794/* caller must hold client_lock */
@@ -731,21 +820,6 @@ unhash_session(struct nfsd4_session *ses)
731 list_del(&ses->se_perclnt); 820 list_del(&ses->se_perclnt);
732} 821}
733 822
734void
735free_session(struct kref *kref)
736{
737 struct nfsd4_session *ses;
738 int mem;
739
740 ses = container_of(kref, struct nfsd4_session, se_ref);
741 spin_lock(&nfsd_drc_lock);
742 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
743 nfsd_drc_mem_used -= mem;
744 spin_unlock(&nfsd_drc_lock);
745 free_session_slots(ses);
746 kfree(ses);
747}
748
749/* must be called under the client_lock */ 823/* must be called under the client_lock */
750static inline void 824static inline void
751renew_client_locked(struct nfs4_client *clp) 825renew_client_locked(struct nfs4_client *clp)
@@ -812,6 +886,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
812static inline void 886static inline void
813free_client(struct nfs4_client *clp) 887free_client(struct nfs4_client *clp)
814{ 888{
889 while (!list_empty(&clp->cl_sessions)) {
890 struct nfsd4_session *ses;
891 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
892 se_perclnt);
893 list_del(&ses->se_perclnt);
894 nfsd4_put_session(ses);
895 }
815 if (clp->cl_cred.cr_group_info) 896 if (clp->cl_cred.cr_group_info)
816 put_group_info(clp->cl_cred.cr_group_info); 897 put_group_info(clp->cl_cred.cr_group_info);
817 kfree(clp->cl_principal); 898 kfree(clp->cl_principal);
@@ -838,15 +919,12 @@ release_session_client(struct nfsd4_session *session)
838static inline void 919static inline void
839unhash_client_locked(struct nfs4_client *clp) 920unhash_client_locked(struct nfs4_client *clp)
840{ 921{
922 struct nfsd4_session *ses;
923
841 mark_client_expired(clp); 924 mark_client_expired(clp);
842 list_del(&clp->cl_lru); 925 list_del(&clp->cl_lru);
843 while (!list_empty(&clp->cl_sessions)) { 926 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
844 struct nfsd4_session *ses; 927 list_del_init(&ses->se_hash);
845 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
846 se_perclnt);
847 unhash_session(ses);
848 nfsd4_put_session(ses);
849 }
850} 928}
851 929
852static void 930static void
@@ -875,7 +953,7 @@ expire_client(struct nfs4_client *clp)
875 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 953 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
876 release_openowner(sop); 954 release_openowner(sop);
877 } 955 }
878 nfsd4_set_callback_client(clp, NULL); 956 nfsd4_shutdown_callback(clp);
879 if (clp->cl_cb_conn.cb_xprt) 957 if (clp->cl_cb_conn.cb_xprt)
880 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 958 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
881 list_del(&clp->cl_idhash); 959 list_del(&clp->cl_idhash);
@@ -960,6 +1038,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
960 if (clp == NULL) 1038 if (clp == NULL)
961 return NULL; 1039 return NULL;
962 1040
1041 INIT_LIST_HEAD(&clp->cl_sessions);
1042
963 princ = svc_gss_principal(rqstp); 1043 princ = svc_gss_principal(rqstp);
964 if (princ) { 1044 if (princ) {
965 clp->cl_principal = kstrdup(princ, GFP_KERNEL); 1045 clp->cl_principal = kstrdup(princ, GFP_KERNEL);
@@ -976,8 +1056,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
976 INIT_LIST_HEAD(&clp->cl_strhash); 1056 INIT_LIST_HEAD(&clp->cl_strhash);
977 INIT_LIST_HEAD(&clp->cl_openowners); 1057 INIT_LIST_HEAD(&clp->cl_openowners);
978 INIT_LIST_HEAD(&clp->cl_delegations); 1058 INIT_LIST_HEAD(&clp->cl_delegations);
979 INIT_LIST_HEAD(&clp->cl_sessions);
980 INIT_LIST_HEAD(&clp->cl_lru); 1059 INIT_LIST_HEAD(&clp->cl_lru);
1060 spin_lock_init(&clp->cl_lock);
1061 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
981 clp->cl_time = get_seconds(); 1062 clp->cl_time = get_seconds();
982 clear_bit(0, &clp->cl_cb_slot_busy); 1063 clear_bit(0, &clp->cl_cb_slot_busy);
983 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1064 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -986,7 +1067,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
986 clp->cl_flavor = rqstp->rq_flavor; 1067 clp->cl_flavor = rqstp->rq_flavor;
987 copy_cred(&clp->cl_cred, &rqstp->rq_cred); 1068 copy_cred(&clp->cl_cred, &rqstp->rq_cred);
988 gen_confirm(clp); 1069 gen_confirm(clp);
989 1070 clp->cl_cb_session = NULL;
990 return clp; 1071 return clp;
991} 1072}
992 1073
@@ -1098,7 +1179,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
1098static void 1179static void
1099gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) 1180gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1100{ 1181{
1101 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 1182 struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
1102 unsigned short expected_family; 1183 unsigned short expected_family;
1103 1184
1104 /* Currently, we only support tcp and tcp6 for the callback channel */ 1185 /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1111,24 +1192,23 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1111 else 1192 else
1112 goto out_err; 1193 goto out_err;
1113 1194
1114 cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, 1195 conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
1115 se->se_callback_addr_len, 1196 se->se_callback_addr_len,
1116 (struct sockaddr *) &cb->cb_addr, 1197 (struct sockaddr *)&conn->cb_addr,
1117 sizeof(cb->cb_addr)); 1198 sizeof(conn->cb_addr));
1118 1199
1119 if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family) 1200 if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
1120 goto out_err; 1201 goto out_err;
1121 1202
1122 if (cb->cb_addr.ss_family == AF_INET6) 1203 if (conn->cb_addr.ss_family == AF_INET6)
1123 ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid; 1204 ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
1124 1205
1125 cb->cb_minorversion = 0; 1206 conn->cb_prog = se->se_callback_prog;
1126 cb->cb_prog = se->se_callback_prog; 1207 conn->cb_ident = se->se_callback_ident;
1127 cb->cb_ident = se->se_callback_ident;
1128 return; 1208 return;
1129out_err: 1209out_err:
1130 cb->cb_addr.ss_family = AF_UNSPEC; 1210 conn->cb_addr.ss_family = AF_UNSPEC;
1131 cb->cb_addrlen = 0; 1211 conn->cb_addrlen = 0;
1132 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " 1212 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
1133 "will not receive delegations\n", 1213 "will not receive delegations\n",
1134 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); 1214 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -1415,7 +1495,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1415{ 1495{
1416 struct sockaddr *sa = svc_addr(rqstp); 1496 struct sockaddr *sa = svc_addr(rqstp);
1417 struct nfs4_client *conf, *unconf; 1497 struct nfs4_client *conf, *unconf;
1498 struct nfsd4_session *new;
1418 struct nfsd4_clid_slot *cs_slot = NULL; 1499 struct nfsd4_clid_slot *cs_slot = NULL;
1500 bool confirm_me = false;
1419 int status = 0; 1501 int status = 0;
1420 1502
1421 nfs4_lock_state(); 1503 nfs4_lock_state();
@@ -1438,7 +1520,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1438 cs_slot->sl_seqid, cr_ses->seqid); 1520 cs_slot->sl_seqid, cr_ses->seqid);
1439 goto out; 1521 goto out;
1440 } 1522 }
1441 cs_slot->sl_seqid++;
1442 } else if (unconf) { 1523 } else if (unconf) {
1443 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1524 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1444 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 1525 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1451,25 +1532,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1451 if (status) { 1532 if (status) {
1452 /* an unconfirmed replay returns misordered */ 1533 /* an unconfirmed replay returns misordered */
1453 status = nfserr_seq_misordered; 1534 status = nfserr_seq_misordered;
1454 goto out_cache; 1535 goto out;
1455 } 1536 }
1456 1537
1457 cs_slot->sl_seqid++; /* from 0 to 1 */ 1538 confirm_me = true;
1458 move_to_confirmed(unconf);
1459
1460 if (cr_ses->flags & SESSION4_BACK_CHAN) {
1461 unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
1462 svc_xprt_get(rqstp->rq_xprt);
1463 rpc_copy_addr(
1464 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
1465 sa);
1466 unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
1467 unconf->cl_cb_conn.cb_minorversion =
1468 cstate->minorversion;
1469 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
1470 unconf->cl_cb_seq_nr = 1;
1471 nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
1472 }
1473 conf = unconf; 1539 conf = unconf;
1474 } else { 1540 } else {
1475 status = nfserr_stale_clientid; 1541 status = nfserr_stale_clientid;
@@ -1477,22 +1543,30 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1477 } 1543 }
1478 1544
1479 /* 1545 /*
1546 * XXX: we should probably set this at creation time, and check
1547 * for consistent minorversion use throughout:
1548 */
1549 conf->cl_minorversion = 1;
1550 /*
1480 * We do not support RDMA or persistent sessions 1551 * We do not support RDMA or persistent sessions
1481 */ 1552 */
1482 cr_ses->flags &= ~SESSION4_PERSIST; 1553 cr_ses->flags &= ~SESSION4_PERSIST;
1483 cr_ses->flags &= ~SESSION4_RDMA; 1554 cr_ses->flags &= ~SESSION4_RDMA;
1484 1555
1485 status = alloc_init_session(rqstp, conf, cr_ses); 1556 status = nfserr_jukebox;
1486 if (status) 1557 new = alloc_init_session(rqstp, conf, cr_ses);
1558 if (!new)
1487 goto out; 1559 goto out;
1488 1560 status = nfs_ok;
1489 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, 1561 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1490 NFS4_MAX_SESSIONID_LEN); 1562 NFS4_MAX_SESSIONID_LEN);
1563 cs_slot->sl_seqid++;
1491 cr_ses->seqid = cs_slot->sl_seqid; 1564 cr_ses->seqid = cs_slot->sl_seqid;
1492 1565
1493out_cache:
1494 /* cache solo and embedded create sessions under the state lock */ 1566 /* cache solo and embedded create sessions under the state lock */
1495 nfsd4_cache_create_session(cr_ses, cs_slot, status); 1567 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1568 if (confirm_me)
1569 move_to_confirmed(conf);
1496out: 1570out:
1497 nfs4_unlock_state(); 1571 nfs4_unlock_state();
1498 dprintk("%s returns %d\n", __func__, ntohl(status)); 1572 dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1546,8 +1620,11 @@ nfsd4_destroy_session(struct svc_rqst *r,
1546 1620
1547 nfs4_lock_state(); 1621 nfs4_lock_state();
1548 /* wait for callbacks */ 1622 /* wait for callbacks */
1549 nfsd4_set_callback_client(ses->se_client, NULL); 1623 nfsd4_shutdown_callback(ses->se_client);
1550 nfs4_unlock_state(); 1624 nfs4_unlock_state();
1625
1626 nfsd4_del_conns(ses);
1627
1551 nfsd4_put_session(ses); 1628 nfsd4_put_session(ses);
1552 status = nfs_ok; 1629 status = nfs_ok;
1553out: 1630out:
@@ -1555,6 +1632,40 @@ out:
1555 return status; 1632 return status;
1556} 1633}
1557 1634
1635static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
1636{
1637 struct nfsd4_conn *c;
1638
1639 list_for_each_entry(c, &s->se_conns, cn_persession) {
1640 if (c->cn_xprt == xpt) {
1641 return c;
1642 }
1643 }
1644 return NULL;
1645}
1646
1647static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
1648{
1649 struct nfs4_client *clp = ses->se_client;
1650 struct nfsd4_conn *c;
1651 int ret;
1652
1653 spin_lock(&clp->cl_lock);
1654 c = __nfsd4_find_conn(new->cn_xprt, ses);
1655 if (c) {
1656 spin_unlock(&clp->cl_lock);
1657 free_conn(new);
1658 return;
1659 }
1660 __nfsd4_hash_conn(new, ses);
1661 spin_unlock(&clp->cl_lock);
1662 ret = nfsd4_register_conn(new);
1663 if (ret)
1664 /* oops; xprt is already down: */
1665 nfsd4_conn_lost(&new->cn_xpt_user);
1666 return;
1667}
1668
1558__be32 1669__be32
1559nfsd4_sequence(struct svc_rqst *rqstp, 1670nfsd4_sequence(struct svc_rqst *rqstp,
1560 struct nfsd4_compound_state *cstate, 1671 struct nfsd4_compound_state *cstate,
@@ -1563,11 +1674,20 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1563 struct nfsd4_compoundres *resp = rqstp->rq_resp; 1674 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1564 struct nfsd4_session *session; 1675 struct nfsd4_session *session;
1565 struct nfsd4_slot *slot; 1676 struct nfsd4_slot *slot;
1677 struct nfsd4_conn *conn;
1566 int status; 1678 int status;
1567 1679
1568 if (resp->opcnt != 1) 1680 if (resp->opcnt != 1)
1569 return nfserr_sequence_pos; 1681 return nfserr_sequence_pos;
1570 1682
1683 /*
1684 * Will be either used or freed by nfsd4_sequence_check_conn
1685 * below.
1686 */
1687 conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
1688 if (!conn)
1689 return nfserr_jukebox;
1690
1571 spin_lock(&client_lock); 1691 spin_lock(&client_lock);
1572 status = nfserr_badsession; 1692 status = nfserr_badsession;
1573 session = find_in_sessionid_hashtbl(&seq->sessionid); 1693 session = find_in_sessionid_hashtbl(&seq->sessionid);
@@ -1599,6 +1719,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1599 if (status) 1719 if (status)
1600 goto out; 1720 goto out;
1601 1721
1722 nfsd4_sequence_check_conn(conn, session);
1723 conn = NULL;
1724
1602 /* Success! bump slot seqid */ 1725 /* Success! bump slot seqid */
1603 slot->sl_inuse = true; 1726 slot->sl_inuse = true;
1604 slot->sl_seqid = seq->seqid; 1727 slot->sl_seqid = seq->seqid;
@@ -1613,6 +1736,7 @@ out:
1613 nfsd4_get_session(cstate->session); 1736 nfsd4_get_session(cstate->session);
1614 atomic_inc(&session->se_client->cl_refcount); 1737 atomic_inc(&session->se_client->cl_refcount);
1615 } 1738 }
1739 kfree(conn);
1616 spin_unlock(&client_lock); 1740 spin_unlock(&client_lock);
1617 dprintk("%s: return %d\n", __func__, ntohl(status)); 1741 dprintk("%s: return %d\n", __func__, ntohl(status));
1618 return status; 1742 return status;
@@ -1747,6 +1871,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1747 goto out; 1871 goto out;
1748 gen_clid(new); 1872 gen_clid(new);
1749 } 1873 }
1874 /*
1875 * XXX: we should probably set this at creation time, and check
1876 * for consistent minorversion use throughout:
1877 */
1878 new->cl_minorversion = 0;
1750 gen_callback(new, setclid, rpc_get_scope_id(sa)); 1879 gen_callback(new, setclid, rpc_get_scope_id(sa));
1751 add_to_unconfirmed(new, strhashval); 1880 add_to_unconfirmed(new, strhashval);
1752 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1881 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
@@ -1807,7 +1936,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1807 status = nfserr_clid_inuse; 1936 status = nfserr_clid_inuse;
1808 else { 1937 else {
1809 atomic_set(&conf->cl_cb_set, 0); 1938 atomic_set(&conf->cl_cb_set, 0);
1810 nfsd4_probe_callback(conf, &unconf->cl_cb_conn); 1939 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
1940 nfsd4_probe_callback(conf);
1811 expire_client(unconf); 1941 expire_client(unconf);
1812 status = nfs_ok; 1942 status = nfs_ok;
1813 1943
@@ -1841,7 +1971,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1841 } 1971 }
1842 move_to_confirmed(unconf); 1972 move_to_confirmed(unconf);
1843 conf = unconf; 1973 conf = unconf;
1844 nfsd4_probe_callback(conf, &conf->cl_cb_conn); 1974 nfsd4_probe_callback(conf);
1845 status = nfs_ok; 1975 status = nfs_ok;
1846 } 1976 }
1847 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) 1977 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -2188,22 +2318,6 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
2188} 2318}
2189 2319
2190/* 2320/*
2191 * Set the delegation file_lock back pointer.
2192 *
2193 * Called from setlease() with lock_kernel() held.
2194 */
2195static
2196void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
2197{
2198 struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner;
2199
2200 dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp);
2201 if (!dp)
2202 return;
2203 dp->dl_flock = new;
2204}
2205
2206/*
2207 * Called from setlease() with lock_kernel() held 2321 * Called from setlease() with lock_kernel() held
2208 */ 2322 */
2209static 2323static
@@ -2233,7 +2347,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2233static const struct lock_manager_operations nfsd_lease_mng_ops = { 2347static const struct lock_manager_operations nfsd_lease_mng_ops = {
2234 .fl_break = nfsd_break_deleg_cb, 2348 .fl_break = nfsd_break_deleg_cb,
2235 .fl_release_private = nfsd_release_deleg_cb, 2349 .fl_release_private = nfsd_release_deleg_cb,
2236 .fl_copy_lock = nfsd_copy_lock_deleg_cb,
2237 .fl_mylease = nfsd_same_client_deleg_cb, 2350 .fl_mylease = nfsd_same_client_deleg_cb,
2238 .fl_change = nfsd_change_deleg_cb, 2351 .fl_change = nfsd_change_deleg_cb,
2239}; 2352};
@@ -2492,7 +2605,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2492 struct nfs4_delegation *dp; 2605 struct nfs4_delegation *dp;
2493 struct nfs4_stateowner *sop = stp->st_stateowner; 2606 struct nfs4_stateowner *sop = stp->st_stateowner;
2494 int cb_up = atomic_read(&sop->so_client->cl_cb_set); 2607 int cb_up = atomic_read(&sop->so_client->cl_cb_set);
2495 struct file_lock fl, *flp = &fl; 2608 struct file_lock *fl;
2496 int status, flag = 0; 2609 int status, flag = 0;
2497 2610
2498 flag = NFS4_OPEN_DELEGATE_NONE; 2611 flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2526,21 +2639,28 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2526 flag = NFS4_OPEN_DELEGATE_NONE; 2639 flag = NFS4_OPEN_DELEGATE_NONE;
2527 goto out; 2640 goto out;
2528 } 2641 }
2529 locks_init_lock(&fl); 2642 status = -ENOMEM;
2530 fl.fl_lmops = &nfsd_lease_mng_ops; 2643 fl = locks_alloc_lock();
2531 fl.fl_flags = FL_LEASE; 2644 if (!fl)
2532 fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; 2645 goto out;
2533 fl.fl_end = OFFSET_MAX; 2646 locks_init_lock(fl);
2534 fl.fl_owner = (fl_owner_t)dp; 2647 fl->fl_lmops = &nfsd_lease_mng_ops;
2535 fl.fl_file = find_readable_file(stp->st_file); 2648 fl->fl_flags = FL_LEASE;
2536 BUG_ON(!fl.fl_file); 2649 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2537 fl.fl_pid = current->tgid; 2650 fl->fl_end = OFFSET_MAX;
2651 fl->fl_owner = (fl_owner_t)dp;
2652 fl->fl_file = find_readable_file(stp->st_file);
2653 BUG_ON(!fl->fl_file);
2654 fl->fl_pid = current->tgid;
2655 dp->dl_flock = fl;
2538 2656
2539 /* vfs_setlease checks to see if delegation should be handed out. 2657 /* vfs_setlease checks to see if delegation should be handed out.
2540 * the lock_manager callbacks fl_mylease and fl_change are used 2658 * the lock_manager callbacks fl_mylease and fl_change are used
2541 */ 2659 */
2542 if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) { 2660 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2543 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2661 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2662 dp->dl_flock = NULL;
2663 locks_free_lock(fl);
2544 unhash_delegation(dp); 2664 unhash_delegation(dp);
2545 flag = NFS4_OPEN_DELEGATE_NONE; 2665 flag = NFS4_OPEN_DELEGATE_NONE;
2546 goto out; 2666 goto out;
@@ -2944,7 +3064,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2944 if (STALE_STATEID(stateid)) 3064 if (STALE_STATEID(stateid))
2945 goto out; 3065 goto out;
2946 3066
2947 status = nfserr_bad_stateid; 3067 /*
3068 * We assume that any stateid that has the current boot time,
3069 * but that we can't find, is expired:
3070 */
3071 status = nfserr_expired;
2948 if (is_delegation_stateid(stateid)) { 3072 if (is_delegation_stateid(stateid)) {
2949 dp = find_delegation_stateid(ino, stateid); 3073 dp = find_delegation_stateid(ino, stateid);
2950 if (!dp) 3074 if (!dp)
@@ -2964,6 +3088,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2964 stp = find_stateid(stateid, flags); 3088 stp = find_stateid(stateid, flags);
2965 if (!stp) 3089 if (!stp)
2966 goto out; 3090 goto out;
3091 status = nfserr_bad_stateid;
2967 if (nfs4_check_fh(current_fh, stp)) 3092 if (nfs4_check_fh(current_fh, stp))
2968 goto out; 3093 goto out;
2969 if (!stp->st_stateowner->so_confirmed) 3094 if (!stp->st_stateowner->so_confirmed)
@@ -3038,8 +3163,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3038 * a replayed close: 3163 * a replayed close:
3039 */ 3164 */
3040 sop = search_close_lru(stateid->si_stateownerid, flags); 3165 sop = search_close_lru(stateid->si_stateownerid, flags);
3166 /* It's not stale; let's assume it's expired: */
3041 if (sop == NULL) 3167 if (sop == NULL)
3042 return nfserr_bad_stateid; 3168 return nfserr_expired;
3043 *sopp = sop; 3169 *sopp = sop;
3044 goto check_replay; 3170 goto check_replay;
3045 } 3171 }
@@ -3304,6 +3430,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3304 status = nfserr_bad_stateid; 3430 status = nfserr_bad_stateid;
3305 if (!is_delegation_stateid(stateid)) 3431 if (!is_delegation_stateid(stateid))
3306 goto out; 3432 goto out;
3433 status = nfserr_expired;
3307 dp = find_delegation_stateid(inode, stateid); 3434 dp = find_delegation_stateid(inode, stateid);
3308 if (!dp) 3435 if (!dp)
3309 goto out; 3436 goto out;
@@ -3895,7 +4022,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
3895 struct inode *inode = filp->fi_inode; 4022 struct inode *inode = filp->fi_inode;
3896 int status = 0; 4023 int status = 0;
3897 4024
3898 lock_kernel(); 4025 lock_flocks();
3899 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 4026 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
3900 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 4027 if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
3901 status = 1; 4028 status = 1;
@@ -3903,7 +4030,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
3903 } 4030 }
3904 } 4031 }
3905out: 4032out:
3906 unlock_kernel(); 4033 unlock_flocks();
3907 return status; 4034 return status;
3908} 4035}
3909 4036
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1a468bbd330f..f35a94a04026 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1805,19 +1805,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1805 goto out_nfserr; 1805 goto out_nfserr;
1806 } 1806 }
1807 } 1807 }
1808 if ((buflen -= 16) < 0)
1809 goto out_resource;
1810 1808
1811 if (unlikely(bmval2)) { 1809 if (bmval2) {
1810 if ((buflen -= 16) < 0)
1811 goto out_resource;
1812 WRITE32(3); 1812 WRITE32(3);
1813 WRITE32(bmval0); 1813 WRITE32(bmval0);
1814 WRITE32(bmval1); 1814 WRITE32(bmval1);
1815 WRITE32(bmval2); 1815 WRITE32(bmval2);
1816 } else if (likely(bmval1)) { 1816 } else if (bmval1) {
1817 if ((buflen -= 12) < 0)
1818 goto out_resource;
1817 WRITE32(2); 1819 WRITE32(2);
1818 WRITE32(bmval0); 1820 WRITE32(bmval0);
1819 WRITE32(bmval1); 1821 WRITE32(bmval1);
1820 } else { 1822 } else {
1823 if ((buflen -= 8) < 0)
1824 goto out_resource;
1821 WRITE32(1); 1825 WRITE32(1);
1822 WRITE32(bmval0); 1826 WRITE32(bmval0);
1823 } 1827 }
@@ -1828,15 +1832,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1828 u32 word1 = nfsd_suppattrs1(minorversion); 1832 u32 word1 = nfsd_suppattrs1(minorversion);
1829 u32 word2 = nfsd_suppattrs2(minorversion); 1833 u32 word2 = nfsd_suppattrs2(minorversion);
1830 1834
1831 if ((buflen -= 12) < 0)
1832 goto out_resource;
1833 if (!aclsupport) 1835 if (!aclsupport)
1834 word0 &= ~FATTR4_WORD0_ACL; 1836 word0 &= ~FATTR4_WORD0_ACL;
1835 if (!word2) { 1837 if (!word2) {
1838 if ((buflen -= 12) < 0)
1839 goto out_resource;
1836 WRITE32(2); 1840 WRITE32(2);
1837 WRITE32(word0); 1841 WRITE32(word0);
1838 WRITE32(word1); 1842 WRITE32(word1);
1839 } else { 1843 } else {
1844 if ((buflen -= 16) < 0)
1845 goto out_resource;
1840 WRITE32(3); 1846 WRITE32(3);
1841 WRITE32(word0); 1847 WRITE32(word0);
1842 WRITE32(word1); 1848 WRITE32(word1);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b53b1d042f1f..4514ebbee4d6 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23enum { 23enum {
24 NFSD_Root = 1, 24 NFSD_Root = 1,
25#ifdef CONFIG_NFSD_DEPRECATED
25 NFSD_Svc, 26 NFSD_Svc,
26 NFSD_Add, 27 NFSD_Add,
27 NFSD_Del, 28 NFSD_Del,
@@ -29,6 +30,7 @@ enum {
29 NFSD_Unexport, 30 NFSD_Unexport,
30 NFSD_Getfd, 31 NFSD_Getfd,
31 NFSD_Getfs, 32 NFSD_Getfs,
33#endif
32 NFSD_List, 34 NFSD_List,
33 NFSD_Export_features, 35 NFSD_Export_features,
34 NFSD_Fh, 36 NFSD_Fh,
@@ -54,6 +56,7 @@ enum {
54/* 56/*
55 * write() for these nodes. 57 * write() for these nodes.
56 */ 58 */
59#ifdef CONFIG_NFSD_DEPRECATED
57static ssize_t write_svc(struct file *file, char *buf, size_t size); 60static ssize_t write_svc(struct file *file, char *buf, size_t size);
58static ssize_t write_add(struct file *file, char *buf, size_t size); 61static ssize_t write_add(struct file *file, char *buf, size_t size);
59static ssize_t write_del(struct file *file, char *buf, size_t size); 62static ssize_t write_del(struct file *file, char *buf, size_t size);
@@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size);
61static ssize_t write_unexport(struct file *file, char *buf, size_t size); 64static ssize_t write_unexport(struct file *file, char *buf, size_t size);
62static ssize_t write_getfd(struct file *file, char *buf, size_t size); 65static ssize_t write_getfd(struct file *file, char *buf, size_t size);
63static ssize_t write_getfs(struct file *file, char *buf, size_t size); 66static ssize_t write_getfs(struct file *file, char *buf, size_t size);
67#endif
64static ssize_t write_filehandle(struct file *file, char *buf, size_t size); 68static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
65static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); 69static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
66static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); 70static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
76#endif 80#endif
77 81
78static ssize_t (*write_op[])(struct file *, char *, size_t) = { 82static ssize_t (*write_op[])(struct file *, char *, size_t) = {
83#ifdef CONFIG_NFSD_DEPRECATED
79 [NFSD_Svc] = write_svc, 84 [NFSD_Svc] = write_svc,
80 [NFSD_Add] = write_add, 85 [NFSD_Add] = write_add,
81 [NFSD_Del] = write_del, 86 [NFSD_Del] = write_del,
@@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
83 [NFSD_Unexport] = write_unexport, 88 [NFSD_Unexport] = write_unexport,
84 [NFSD_Getfd] = write_getfd, 89 [NFSD_Getfd] = write_getfd,
85 [NFSD_Getfs] = write_getfs, 90 [NFSD_Getfs] = write_getfs,
91#endif
86 [NFSD_Fh] = write_filehandle, 92 [NFSD_Fh] = write_filehandle,
87 [NFSD_FO_UnlockIP] = write_unlock_ip, 93 [NFSD_FO_UnlockIP] = write_unlock_ip,
88 [NFSD_FO_UnlockFS] = write_unlock_fs, 94 [NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -121,6 +127,14 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
121 127
122static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
123{ 129{
130 static int warned;
131 if (file->f_dentry->d_name.name[0] == '.' && !warned) {
132 printk(KERN_INFO
133 "Warning: \"%s\" uses deprecated NFSD interface: %s."
134 " This will be removed in 2.6.40\n",
135 current->comm, file->f_dentry->d_name.name);
136 warned = 1;
137 }
124 if (! file->private_data) { 138 if (! file->private_data) {
125 /* An attempt to read a transaction file without writing 139 /* An attempt to read a transaction file without writing
126 * causes a 0-byte write so that the file can return 140 * causes a 0-byte write so that the file can return
@@ -137,6 +151,7 @@ static const struct file_operations transaction_ops = {
137 .write = nfsctl_transaction_write, 151 .write = nfsctl_transaction_write,
138 .read = nfsctl_transaction_read, 152 .read = nfsctl_transaction_read,
139 .release = simple_transaction_release, 153 .release = simple_transaction_release,
154 .llseek = default_llseek,
140}; 155};
141 156
142static int exports_open(struct inode *inode, struct file *file) 157static int exports_open(struct inode *inode, struct file *file)
@@ -186,6 +201,7 @@ static const struct file_operations pool_stats_operations = {
186 * payload - write methods 201 * payload - write methods
187 */ 202 */
188 203
204#ifdef CONFIG_NFSD_DEPRECATED
189/** 205/**
190 * write_svc - Start kernel's NFSD server 206 * write_svc - Start kernel's NFSD server
191 * 207 *
@@ -401,7 +417,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
401 417
402 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); 418 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
403 419
404 clp = auth_unix_lookup(&in6); 420 clp = auth_unix_lookup(&init_net, &in6);
405 if (!clp) 421 if (!clp)
406 err = -EPERM; 422 err = -EPERM;
407 else { 423 else {
@@ -464,7 +480,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
464 480
465 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); 481 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
466 482
467 clp = auth_unix_lookup(&in6); 483 clp = auth_unix_lookup(&init_net, &in6);
468 if (!clp) 484 if (!clp)
469 err = -EPERM; 485 err = -EPERM;
470 else { 486 else {
@@ -481,6 +497,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
481 out: 497 out:
482 return err; 498 return err;
483} 499}
500#endif /* CONFIG_NFSD_DEPRECATED */
484 501
485/** 502/**
486 * write_unlock_ip - Release all locks used by a client 503 * write_unlock_ip - Release all locks used by a client
@@ -999,12 +1016,12 @@ static ssize_t __write_ports_addxprt(char *buf)
999 if (err != 0) 1016 if (err != 0)
1000 return err; 1017 return err;
1001 1018
1002 err = svc_create_xprt(nfsd_serv, transport, 1019 err = svc_create_xprt(nfsd_serv, transport, &init_net,
1003 PF_INET, port, SVC_SOCK_ANONYMOUS); 1020 PF_INET, port, SVC_SOCK_ANONYMOUS);
1004 if (err < 0) 1021 if (err < 0)
1005 goto out_err; 1022 goto out_err;
1006 1023
1007 err = svc_create_xprt(nfsd_serv, transport, 1024 err = svc_create_xprt(nfsd_serv, transport, &init_net,
1008 PF_INET6, port, SVC_SOCK_ANONYMOUS); 1025 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1009 if (err < 0 && err != -EAFNOSUPPORT) 1026 if (err < 0 && err != -EAFNOSUPPORT)
1010 goto out_close; 1027 goto out_close;
@@ -1355,6 +1372,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
1355static int nfsd_fill_super(struct super_block * sb, void * data, int silent) 1372static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1356{ 1373{
1357 static struct tree_descr nfsd_files[] = { 1374 static struct tree_descr nfsd_files[] = {
1375#ifdef CONFIG_NFSD_DEPRECATED
1358 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, 1376 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
1359 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, 1377 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
1360 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, 1378 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
@@ -1362,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1362 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, 1380 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
1363 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, 1381 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
1364 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, 1382 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
1383#endif
1365 [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, 1384 [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
1366 [NFSD_Export_features] = {"export_features", 1385 [NFSD_Export_features] = {"export_features",
1367 &export_features_operations, S_IRUGO}, 1386 &export_features_operations, S_IRUGO},
@@ -1386,16 +1405,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1386 return simple_fill_super(sb, 0x6e667364, nfsd_files); 1405 return simple_fill_super(sb, 0x6e667364, nfsd_files);
1387} 1406}
1388 1407
1389static int nfsd_get_sb(struct file_system_type *fs_type, 1408static struct dentry *nfsd_mount(struct file_system_type *fs_type,
1390 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1409 int flags, const char *dev_name, void *data)
1391{ 1410{
1392 return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt); 1411 return mount_single(fs_type, flags, data, nfsd_fill_super);
1393} 1412}
1394 1413
1395static struct file_system_type nfsd_fs_type = { 1414static struct file_system_type nfsd_fs_type = {
1396 .owner = THIS_MODULE, 1415 .owner = THIS_MODULE,
1397 .name = "nfsd", 1416 .name = "nfsd",
1398 .get_sb = nfsd_get_sb, 1417 .mount = nfsd_mount,
1399 .kill_sb = kill_litter_super, 1418 .kill_sb = kill_litter_super,
1400}; 1419};
1401 1420
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index b76ac3a82e39..6b641cf2c19a 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -249,7 +249,7 @@ extern time_t nfsd4_grace;
249#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 249#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
250#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 250#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
251 251
252#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ 252#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
253 253
254/* 254/*
255 * The following attributes are currently not supported by the NFSv4 server: 255 * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e2c43464f237..2bae1d86f5f2 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -16,6 +16,7 @@
16#include <linux/lockd/bind.h> 16#include <linux/lockd/bind.h>
17#include <linux/nfsacl.h> 17#include <linux/nfsacl.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <net/net_namespace.h>
19#include "nfsd.h" 20#include "nfsd.h"
20#include "cache.h" 21#include "cache.h"
21#include "vfs.h" 22#include "vfs.h"
@@ -186,12 +187,12 @@ static int nfsd_init_socks(int port)
186 if (!list_empty(&nfsd_serv->sv_permsocks)) 187 if (!list_empty(&nfsd_serv->sv_permsocks))
187 return 0; 188 return 0;
188 189
189 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, 190 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port,
190 SVC_SOCK_DEFAULTS); 191 SVC_SOCK_DEFAULTS);
191 if (error < 0) 192 if (error < 0)
192 return error; 193 return error;
193 194
194 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, 195 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port,
195 SVC_SOCK_DEFAULTS); 196 SVC_SOCK_DEFAULTS);
196 if (error < 0) 197 if (error < 0)
197 return error; 198 return error;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 322518c88e4b..39adc27b0685 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
35#ifndef _NFSD4_STATE_H 35#ifndef _NFSD4_STATE_H
36#define _NFSD4_STATE_H 36#define _NFSD4_STATE_H
37 37
38#include <linux/sunrpc/svc_xprt.h>
38#include <linux/nfsd/nfsfh.h> 39#include <linux/nfsd/nfsfh.h>
39#include "nfsfh.h" 40#include "nfsfh.h"
40 41
@@ -64,19 +65,12 @@ typedef struct {
64 (s)->si_fileid, \ 65 (s)->si_fileid, \
65 (s)->si_generation 66 (s)->si_generation
66 67
67struct nfsd4_cb_sequence {
68 /* args/res */
69 u32 cbs_minorversion;
70 struct nfs4_client *cbs_clp;
71};
72
73struct nfs4_rpc_args {
74 void *args_op;
75 struct nfsd4_cb_sequence args_seq;
76};
77
78struct nfsd4_callback { 68struct nfsd4_callback {
79 struct nfs4_rpc_args cb_args; 69 void *cb_op;
70 struct nfs4_client *cb_clp;
71 u32 cb_minorversion;
72 struct rpc_message cb_msg;
73 const struct rpc_call_ops *cb_ops;
80 struct work_struct cb_work; 74 struct work_struct cb_work;
81}; 75};
82 76
@@ -91,7 +85,6 @@ struct nfs4_delegation {
91 u32 dl_type; 85 u32 dl_type;
92 time_t dl_time; 86 time_t dl_time;
93/* For recall: */ 87/* For recall: */
94 u32 dl_ident;
95 stateid_t dl_stateid; 88 stateid_t dl_stateid;
96 struct knfsd_fh dl_fh; 89 struct knfsd_fh dl_fh;
97 int dl_retries; 90 int dl_retries;
@@ -103,8 +96,8 @@ struct nfs4_cb_conn {
103 /* SETCLIENTID info */ 96 /* SETCLIENTID info */
104 struct sockaddr_storage cb_addr; 97 struct sockaddr_storage cb_addr;
105 size_t cb_addrlen; 98 size_t cb_addrlen;
106 u32 cb_prog; 99 u32 cb_prog; /* used only in 4.0 case;
107 u32 cb_minorversion; 100 per-session otherwise */
108 u32 cb_ident; /* minorversion 0 only */ 101 u32 cb_ident; /* minorversion 0 only */
109 struct svc_xprt *cb_xprt; /* minorversion 1 only */ 102 struct svc_xprt *cb_xprt; /* minorversion 1 only */
110}; 103};
@@ -160,6 +153,15 @@ struct nfsd4_clid_slot {
160 struct nfsd4_create_session sl_cr_ses; 153 struct nfsd4_create_session sl_cr_ses;
161}; 154};
162 155
156struct nfsd4_conn {
157 struct list_head cn_persession;
158 struct svc_xprt *cn_xprt;
159 struct svc_xpt_user cn_xpt_user;
160 struct nfsd4_session *cn_session;
161/* CDFC4_FORE, CDFC4_BACK: */
162 unsigned char cn_flags;
163};
164
163struct nfsd4_session { 165struct nfsd4_session {
164 struct kref se_ref; 166 struct kref se_ref;
165 struct list_head se_hash; /* hash by sessionid */ 167 struct list_head se_hash; /* hash by sessionid */
@@ -169,6 +171,9 @@ struct nfsd4_session {
169 struct nfs4_sessionid se_sessionid; 171 struct nfs4_sessionid se_sessionid;
170 struct nfsd4_channel_attrs se_fchannel; 172 struct nfsd4_channel_attrs se_fchannel;
171 struct nfsd4_channel_attrs se_bchannel; 173 struct nfsd4_channel_attrs se_bchannel;
174 struct list_head se_conns;
175 u32 se_cb_prog;
176 u32 se_cb_seq_nr;
172 struct nfsd4_slot *se_slots[]; /* forward channel slots */ 177 struct nfsd4_slot *se_slots[]; /* forward channel slots */
173}; 178};
174 179
@@ -221,24 +226,32 @@ struct nfs4_client {
221 clientid_t cl_clientid; /* generated by server */ 226 clientid_t cl_clientid; /* generated by server */
222 nfs4_verifier cl_confirm; /* generated by server */ 227 nfs4_verifier cl_confirm; /* generated by server */
223 u32 cl_firststate; /* recovery dir creation */ 228 u32 cl_firststate; /* recovery dir creation */
229 u32 cl_minorversion;
224 230
225 /* for v4.0 and v4.1 callbacks: */ 231 /* for v4.0 and v4.1 callbacks: */
226 struct nfs4_cb_conn cl_cb_conn; 232 struct nfs4_cb_conn cl_cb_conn;
233#define NFSD4_CLIENT_CB_UPDATE 1
234#define NFSD4_CLIENT_KILL 2
235 unsigned long cl_cb_flags;
227 struct rpc_clnt *cl_cb_client; 236 struct rpc_clnt *cl_cb_client;
237 u32 cl_cb_ident;
228 atomic_t cl_cb_set; 238 atomic_t cl_cb_set;
239 struct nfsd4_callback cl_cb_null;
240 struct nfsd4_session *cl_cb_session;
241
242 /* for all client information that callback code might need: */
243 spinlock_t cl_lock;
229 244
230 /* for nfs41 */ 245 /* for nfs41 */
231 struct list_head cl_sessions; 246 struct list_head cl_sessions;
232 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ 247 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
233 u32 cl_exchange_flags; 248 u32 cl_exchange_flags;
234 struct nfs4_sessionid cl_sessionid;
235 /* number of rpc's in progress over an associated session: */ 249 /* number of rpc's in progress over an associated session: */
236 atomic_t cl_refcount; 250 atomic_t cl_refcount;
237 251
238 /* for nfs41 callbacks */ 252 /* for nfs41 callbacks */
239 /* We currently support a single back channel with a single slot */ 253 /* We currently support a single back channel with a single slot */
240 unsigned long cl_cb_slot_busy; 254 unsigned long cl_cb_slot_busy;
241 u32 cl_cb_seq_nr;
242 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 255 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
243 /* wait here for slots */ 256 /* wait here for slots */
244}; 257};
@@ -440,12 +453,13 @@ extern int nfs4_in_grace(void);
440extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 453extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
441extern void nfs4_free_stateowner(struct kref *kref); 454extern void nfs4_free_stateowner(struct kref *kref);
442extern int set_callback_cred(void); 455extern int set_callback_cred(void);
443extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 456extern void nfsd4_probe_callback(struct nfs4_client *clp);
457extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
444extern void nfsd4_do_callback_rpc(struct work_struct *); 458extern void nfsd4_do_callback_rpc(struct work_struct *);
445extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 459extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
446extern int nfsd4_create_callback_queue(void); 460extern int nfsd4_create_callback_queue(void);
447extern void nfsd4_destroy_callback_queue(void); 461extern void nfsd4_destroy_callback_queue(void);
448extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *); 462extern void nfsd4_shutdown_callback(struct nfs4_client *);
449extern void nfs4_put_delegation(struct nfs4_delegation *dp); 463extern void nfs4_put_delegation(struct nfs4_delegation *dp);
450extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 464extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
451extern void nfsd4_init_recdir(char *recdir_name); 465extern void nfsd4_init_recdir(char *recdir_name);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 661a6cf8e826..184938fcff04 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp)
281{ 281{
282 struct inode *inode = fhp->fh_dentry->d_inode; 282 struct inode *inode = fhp->fh_dentry->d_inode;
283 const struct export_operations *export_ops = inode->i_sb->s_export_op; 283 const struct export_operations *export_ops = inode->i_sb->s_export_op;
284 int error = 0;
285 284
286 if (!EX_ISSYNC(fhp->fh_export)) 285 if (!EX_ISSYNC(fhp->fh_export))
287 return 0; 286 return 0;
288 287
289 if (export_ops->commit_metadata) { 288 if (export_ops->commit_metadata)
290 error = export_ops->commit_metadata(inode); 289 return export_ops->commit_metadata(inode);
291 } else { 290 return sync_inode_metadata(inode, 1);
292 struct writeback_control wbc = {
293 .sync_mode = WB_SYNC_ALL,
294 .nr_to_write = 0, /* metadata only */
295 };
296
297 error = sync_inode(inode, &wbc);
298 }
299
300 return error;
301} 291}
302 292
303/* 293/*
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index df3e62c1ddc5..85c98737a146 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ 2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \ 3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ 4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o 5 ifile.o alloc.o gcinode.o ioctl.o
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3dbdc1d356bf..8b782b062baa 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -533,18 +533,20 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
533 nilfs_btree_init_gc(bmap); 533 nilfs_btree_init_gc(bmap);
534} 534}
535 535
536void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 536void nilfs_bmap_save(const struct nilfs_bmap *bmap,
537 struct nilfs_bmap_store *store)
537{ 538{
538 memcpy(gcbmap, bmap, sizeof(*bmap)); 539 memcpy(store->data, bmap->b_u.u_data, sizeof(store->data));
539 init_rwsem(&gcbmap->b_sem); 540 store->last_allocated_key = bmap->b_last_allocated_key;
540 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 541 store->last_allocated_ptr = bmap->b_last_allocated_ptr;
541 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; 542 store->state = bmap->b_state;
542} 543}
543 544
544void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 545void nilfs_bmap_restore(struct nilfs_bmap *bmap,
546 const struct nilfs_bmap_store *store)
545{ 547{
546 memcpy(bmap, gcbmap, sizeof(*bmap)); 548 memcpy(bmap->b_u.u_data, store->data, sizeof(store->data));
547 init_rwsem(&bmap->b_sem); 549 bmap->b_last_allocated_key = store->last_allocated_key;
548 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 550 bmap->b_last_allocated_ptr = store->last_allocated_ptr;
549 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; 551 bmap->b_state = store->state;
550} 552}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index a20569b19929..bde1c0aa2e15 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -135,6 +135,12 @@ struct nilfs_bmap {
135/* state */ 135/* state */
136#define NILFS_BMAP_DIRTY 0x00000001 136#define NILFS_BMAP_DIRTY 0x00000001
137 137
138struct nilfs_bmap_store {
139 __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)];
140 __u64 last_allocated_key;
141 __u64 last_allocated_ptr;
142 int state;
143};
138 144
139int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); 145int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
140int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); 146int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
@@ -153,9 +159,9 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
153int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); 159int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
154 160
155void nilfs_bmap_init_gc(struct nilfs_bmap *); 161void nilfs_bmap_init_gc(struct nilfs_bmap *);
156void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
157void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
158 162
163void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
164void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
159 165
160static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, 166static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
161 __u64 *ptr) 167 __u64 *ptr)
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f78ab1044d1d..5115814cb745 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -37,15 +37,7 @@
37 37
38void nilfs_btnode_cache_init_once(struct address_space *btnc) 38void nilfs_btnode_cache_init_once(struct address_space *btnc)
39{ 39{
40 memset(btnc, 0, sizeof(*btnc)); 40 nilfs_mapping_init_once(btnc);
41 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
42 spin_lock_init(&btnc->tree_lock);
43 INIT_LIST_HEAD(&btnc->private_list);
44 spin_lock_init(&btnc->private_lock);
45
46 spin_lock_init(&btnc->i_mmap_lock);
47 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
48 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
49} 41}
50 42
51static const struct address_space_operations def_btnode_aops = { 43static const struct address_space_operations def_btnode_aops = {
@@ -55,12 +47,7 @@ static const struct address_space_operations def_btnode_aops = {
55void nilfs_btnode_cache_init(struct address_space *btnc, 47void nilfs_btnode_cache_init(struct address_space *btnc,
56 struct backing_dev_info *bdi) 48 struct backing_dev_info *bdi)
57{ 49{
58 btnc->host = NULL; /* can safely set to host inode ? */ 50 nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
59 btnc->flags = 0;
60 mapping_set_gfp_mask(btnc, GFP_NOFS);
61 btnc->assoc_mapping = NULL;
62 btnc->backing_dev_info = bdi;
63 btnc->a_ops = &def_btnode_aops;
64} 51}
65 52
66void nilfs_btnode_cache_clear(struct address_space *btnc) 53void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 18737818db63..5ff15a8a1024 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -863,26 +863,19 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
863 */ 863 */
864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) 864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
865{ 865{
866 struct the_nilfs *nilfs;
867 int ret; 866 int ret;
868 867
869 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
870
871 switch (mode) { 868 switch (mode) {
872 case NILFS_CHECKPOINT: 869 case NILFS_CHECKPOINT:
873 /* 870 if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno))
874 * Check for protecting existing snapshot mounts: 871 /*
875 * ns_mount_mutex is used to make this operation atomic and 872 * Current implementation does not have to protect
876 * exclusive with a new mount job. Though it doesn't cover 873 * plain read-only mounts since they are exclusive
877 * umount, it's enough for the purpose. 874 * with a read/write mount and are protected from the
878 */ 875 * cleaner.
879 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { 876 */
880 /* Current implementation does not have to protect
881 plain read-only mounts since they are exclusive
882 with a read/write mount and are protected from the
883 cleaner. */
884 ret = -EBUSY; 877 ret = -EBUSY;
885 } else 878 else
886 ret = nilfs_cpfile_clear_snapshot(cpfile, cno); 879 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
887 return ret; 880 return ret;
888 case NILFS_SNAPSHOT: 881 case NILFS_SNAPSHOT:
@@ -933,27 +926,40 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
933} 926}
934 927
935/** 928/**
936 * nilfs_cpfile_read - read cpfile inode 929 * nilfs_cpfile_read - read or get cpfile inode
937 * @cpfile: cpfile inode 930 * @sb: super block instance
938 * @raw_inode: on-disk cpfile inode
939 */
940int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
941{
942 return nilfs_read_inode_common(cpfile, raw_inode);
943}
944
945/**
946 * nilfs_cpfile_new - create cpfile
947 * @nilfs: nilfs object
948 * @cpsize: size of a checkpoint entry 931 * @cpsize: size of a checkpoint entry
932 * @raw_inode: on-disk cpfile inode
933 * @inodep: buffer to store the inode
949 */ 934 */
950struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize) 935int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
936 struct nilfs_inode *raw_inode, struct inode **inodep)
951{ 937{
952 struct inode *cpfile; 938 struct inode *cpfile;
939 int err;
940
941 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
942 if (unlikely(!cpfile))
943 return -ENOMEM;
944 if (!(cpfile->i_state & I_NEW))
945 goto out;
946
947 err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
948 if (err)
949 goto failed;
953 950
954 cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0); 951 nilfs_mdt_set_entry_size(cpfile, cpsize,
955 if (cpfile) 952 sizeof(struct nilfs_cpfile_header));
956 nilfs_mdt_set_entry_size(cpfile, cpsize, 953
957 sizeof(struct nilfs_cpfile_header)); 954 err = nilfs_read_inode_common(cpfile, raw_inode);
958 return cpfile; 955 if (err)
956 goto failed;
957
958 unlock_new_inode(cpfile);
959 out:
960 *inodep = cpfile;
961 return 0;
962 failed:
963 iget_failed(cpfile);
964 return err;
959} 965}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index bc0809e0ab43..a242b9a314f9 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,7 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, 40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
41 size_t); 41 size_t);
42 42
43int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode); 43int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
44struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize); 44 struct nilfs_inode *raw_inode, struct inode **inodep);
45 45
46#endif /* _NILFS_CPFILE_H */ 46#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 013146755683..49c844dab33a 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -36,6 +36,7 @@
36struct nilfs_dat_info { 36struct nilfs_dat_info {
37 struct nilfs_mdt_info mi; 37 struct nilfs_mdt_info mi;
38 struct nilfs_palloc_cache palloc_cache; 38 struct nilfs_palloc_cache palloc_cache;
39 struct nilfs_shadow_map shadow;
39}; 40};
40 41
41static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) 42static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
@@ -102,7 +103,8 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
102 nilfs_palloc_abort_alloc_entry(dat, req); 103 nilfs_palloc_abort_alloc_entry(dat, req);
103} 104}
104 105
105void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) 106static void nilfs_dat_commit_free(struct inode *dat,
107 struct nilfs_palloc_req *req)
106{ 108{
107 struct nilfs_dat_entry *entry; 109 struct nilfs_dat_entry *entry;
108 void *kaddr; 110 void *kaddr;
@@ -327,6 +329,23 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
327 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); 329 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
328 if (ret < 0) 330 if (ret < 0)
329 return ret; 331 return ret;
332
333 /*
334 * The given disk block number (blocknr) is not yet written to
335 * the device at this point.
336 *
337 * To prevent nilfs_dat_translate() from returning the
338 * uncommited block number, this makes a copy of the entry
339 * buffer and redirects nilfs_dat_translate() to the copy.
340 */
341 if (!buffer_nilfs_redirected(entry_bh)) {
342 ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
343 if (ret) {
344 brelse(entry_bh);
345 return ret;
346 }
347 }
348
330 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 349 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
331 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 350 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
332 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { 351 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
@@ -371,7 +390,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
371 */ 390 */
372int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) 391int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
373{ 392{
374 struct buffer_head *entry_bh; 393 struct buffer_head *entry_bh, *bh;
375 struct nilfs_dat_entry *entry; 394 struct nilfs_dat_entry *entry;
376 sector_t blocknr; 395 sector_t blocknr;
377 void *kaddr; 396 void *kaddr;
@@ -381,6 +400,15 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
381 if (ret < 0) 400 if (ret < 0)
382 return ret; 401 return ret;
383 402
403 if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
404 bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
405 if (bh) {
406 WARN_ON(!buffer_uptodate(bh));
407 brelse(entry_bh);
408 entry_bh = bh;
409 }
410 }
411
384 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 412 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
385 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 413 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
386 blocknr = le64_to_cpu(entry->de_blocknr); 414 blocknr = le64_to_cpu(entry->de_blocknr);
@@ -436,38 +464,48 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
436} 464}
437 465
438/** 466/**
439 * nilfs_dat_read - read dat inode 467 * nilfs_dat_read - read or get dat inode
440 * @dat: dat inode 468 * @sb: super block instance
441 * @raw_inode: on-disk dat inode
442 */
443int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
444{
445 return nilfs_read_inode_common(dat, raw_inode);
446}
447
448/**
449 * nilfs_dat_new - create dat file
450 * @nilfs: nilfs object
451 * @entry_size: size of a dat entry 469 * @entry_size: size of a dat entry
470 * @raw_inode: on-disk dat inode
471 * @inodep: buffer to store the inode
452 */ 472 */
453struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size) 473int nilfs_dat_read(struct super_block *sb, size_t entry_size,
474 struct nilfs_inode *raw_inode, struct inode **inodep)
454{ 475{
455 static struct lock_class_key dat_lock_key; 476 static struct lock_class_key dat_lock_key;
456 struct inode *dat; 477 struct inode *dat;
457 struct nilfs_dat_info *di; 478 struct nilfs_dat_info *di;
458 int err; 479 int err;
459 480
460 dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di)); 481 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
461 if (dat) { 482 if (unlikely(!dat))
462 err = nilfs_palloc_init_blockgroup(dat, entry_size); 483 return -ENOMEM;
463 if (unlikely(err)) { 484 if (!(dat->i_state & I_NEW))
464 nilfs_mdt_destroy(dat); 485 goto out;
465 return NULL;
466 }
467 486
468 di = NILFS_DAT_I(dat); 487 err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
469 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); 488 if (err)
470 nilfs_palloc_setup_cache(dat, &di->palloc_cache); 489 goto failed;
471 } 490
472 return dat; 491 err = nilfs_palloc_init_blockgroup(dat, entry_size);
492 if (err)
493 goto failed;
494
495 di = NILFS_DAT_I(dat);
496 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
497 nilfs_palloc_setup_cache(dat, &di->palloc_cache);
498 nilfs_mdt_setup_shadow_map(dat, &di->shadow);
499
500 err = nilfs_read_inode_common(dat, raw_inode);
501 if (err)
502 goto failed;
503
504 unlock_new_inode(dat);
505 out:
506 *inodep = dat;
507 return 0;
508 failed:
509 iget_failed(dat);
510 return err;
473} 511}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d31c3aab0efe..cbd8e9732503 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,7 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
53int nilfs_dat_move(struct inode *, __u64, sector_t); 53int nilfs_dat_move(struct inode *, __u64, sector_t);
54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); 54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
55 55
56int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode); 56int nilfs_dat_read(struct super_block *sb, size_t entry_size,
57struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size); 57 struct nilfs_inode *raw_inode, struct inode **inodep);
58 58
59#endif /* _NILFS_DAT_H */ 59#endif /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
new file mode 100644
index 000000000000..a71cc412b651
--- /dev/null
+++ b/fs/nilfs2/export.h
@@ -0,0 +1,17 @@
1#ifndef NILFS_EXPORT_H
2#define NILFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations nilfs_export_ops;
7
8struct nilfs_fid {
9 u64 cno;
10 u64 ino;
11 u32 gen;
12
13 u32 parent_gen;
14 u64 parent_ino;
15} __attribute__ ((packed));
16
17#endif
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
deleted file mode 100644
index 84a45d1d5464..000000000000
--- a/fs/nilfs2/gcdat.c
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_palloc_clear_cache(dat);
65 nilfs_palloc_clear_cache(gcdat);
66 nilfs_clear_dirty_pages(mapping);
67 nilfs_copy_back_pages(mapping, gmapping);
68 /* note: mdt dirty flags should be cleared by segctor. */
69
70 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
71 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
72
73 up_write(&NILFS_MDT(dat)->mi_sem);
74}
75
76void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
77{
78 struct inode *gcdat = nilfs->ns_gc_dat;
79 struct nilfs_inode_info *gii = NILFS_I(gcdat);
80
81 gcdat->i_state = I_FREEING | I_CLEAR;
82 gii->i_flags = 0;
83
84 nilfs_palloc_clear_cache(gcdat);
85 truncate_inode_pages(gcdat->i_mapping, 0);
86 truncate_inode_pages(&gii->i_btnode_cache, 0);
87}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index bed3a783129b..33ad25ddd5c4 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,13 +28,6 @@
28 * gcinodes), and this file provides lookup function of the dummy 28 * gcinodes), and this file provides lookup function of the dummy
29 * inodes and their buffer read function. 29 * inodes and their buffer read function.
30 * 30 *
31 * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
32 * has to treat blocks that belong to a same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to be specified with a
36 * checkpoint number argument as well as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes will be released each 31 * Buffers and pages held by the dummy inodes will be released each
39 * time after they are copied to a new log. Dirty blocks made on the 32 * time after they are copied to a new log. Dirty blocks made on the
40 * current generation and the blocks to be moved by GC never overlap 33 * current generation and the blocks to be moved by GC never overlap
@@ -175,125 +168,46 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
175 } 168 }
176 nilfs_btnode_mark_dirty(bh); 169 nilfs_btnode_mark_dirty(bh);
177 } else { 170 } else {
178 nilfs_mdt_mark_buffer_dirty(bh); 171 nilfs_mark_buffer_dirty(bh);
179 } 172 }
180 return 0; 173 return 0;
181} 174}
182 175
183/* 176int nilfs_init_gcinode(struct inode *inode)
184 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
185 * @nilfs - the_nilfs
186 *
187 * Return Value: On success, 0.
188 * On error, a negative error code is returned.
189 */
190int nilfs_init_gccache(struct the_nilfs *nilfs)
191{ 177{
192 int loop; 178 struct nilfs_inode_info *ii = NILFS_I(inode);
193 179 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
194 BUG_ON(nilfs->ns_gc_inodes_h);
195
196 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
197
198 nilfs->ns_gc_inodes_h =
199 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
200 GFP_NOFS);
201 if (nilfs->ns_gc_inodes_h == NULL)
202 return -ENOMEM;
203
204 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
205 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
206 return 0;
207}
208
209/*
210 * nilfs_destroy_gccache() - free gc_inode hash table
211 * @nilfs - the nilfs
212 */
213void nilfs_destroy_gccache(struct the_nilfs *nilfs)
214{
215 if (nilfs->ns_gc_inodes_h) {
216 nilfs_remove_all_gcinode(nilfs);
217 kfree(nilfs->ns_gc_inodes_h);
218 nilfs->ns_gc_inodes_h = NULL;
219 }
220}
221
222static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
223 __u64 cno)
224{
225 struct inode *inode;
226 struct nilfs_inode_info *ii;
227
228 inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
229 if (!inode)
230 return NULL;
231 180
232 inode->i_op = NULL; 181 inode->i_mode = S_IFREG;
233 inode->i_fop = NULL; 182 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
234 inode->i_mapping->a_ops = &def_gcinode_aops; 183 inode->i_mapping->a_ops = &def_gcinode_aops;
184 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
235 185
236 ii = NILFS_I(inode);
237 ii->i_cno = cno;
238 ii->i_flags = 0; 186 ii->i_flags = 0;
239 ii->i_state = 1 << NILFS_I_GCINODE;
240 ii->i_bh = NULL;
241 nilfs_bmap_init_gc(ii->i_bmap); 187 nilfs_bmap_init_gc(ii->i_bmap);
242 188
243 return inode; 189 /*
244} 190 * Add the inode to GC inode list. Garbage Collection
245 191 * is serialized and no two processes manipulate the
246static unsigned long ihash(ino_t ino, __u64 cno) 192 * list simultaneously.
247{ 193 */
248 return hash_long((unsigned long)((ino << 2) + cno), 194 igrab(inode);
249 NILFS_GCINODE_HASH_BITS); 195 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
250}
251
252/*
253 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
254 */
255struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
256{
257 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
258 struct hlist_node *node;
259 struct inode *inode;
260
261 hlist_for_each_entry(inode, node, head, i_hash) {
262 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
263 return inode;
264 }
265 196
266 inode = alloc_gcinode(nilfs, ino, cno); 197 return 0;
267 if (likely(inode)) {
268 hlist_add_head(&inode->i_hash, head);
269 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
270 }
271 return inode;
272}
273
274/*
275 * nilfs_clear_gcinode() - clear and free a gc inode
276 */
277void nilfs_clear_gcinode(struct inode *inode)
278{
279 nilfs_mdt_destroy(inode);
280} 198}
281 199
282/* 200/**
283 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs 201 * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
284 */ 202 */
285void nilfs_remove_all_gcinode(struct the_nilfs *nilfs) 203void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
286{ 204{
287 struct hlist_head *head = nilfs->ns_gc_inodes_h; 205 struct list_head *head = &nilfs->ns_gc_inodes;
288 struct hlist_node *node, *n; 206 struct nilfs_inode_info *ii;
289 struct inode *inode;
290 int loop;
291 207
292 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) { 208 while (!list_empty(head)) {
293 hlist_for_each_entry_safe(inode, node, n, head, i_hash) { 209 ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
294 hlist_del_init(&inode->i_hash); 210 list_del_init(&ii->i_dirty);
295 list_del_init(&NILFS_I(inode)->i_dirty); 211 iput(&ii->vfs_inode);
296 nilfs_clear_gcinode(inode); /* might sleep */
297 }
298 } 212 }
299} 213}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 922d9dd42c8f..9f8a2da67f90 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -161,25 +161,46 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
161} 161}
162 162
163/** 163/**
164 * nilfs_ifile_new - create inode file 164 * nilfs_ifile_read - read or get ifile inode
165 * @sbi: nilfs_sb_info struct 165 * @sb: super block instance
166 * @root: root object
166 * @inode_size: size of an inode 167 * @inode_size: size of an inode
168 * @raw_inode: on-disk ifile inode
169 * @inodep: buffer to store the inode
167 */ 170 */
168struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size) 171int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
172 size_t inode_size, struct nilfs_inode *raw_inode,
173 struct inode **inodep)
169{ 174{
170 struct inode *ifile; 175 struct inode *ifile;
171 int err; 176 int err;
172 177
173 ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO, 178 ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
174 sizeof(struct nilfs_ifile_info)); 179 if (unlikely(!ifile))
175 if (ifile) { 180 return -ENOMEM;
176 err = nilfs_palloc_init_blockgroup(ifile, inode_size); 181 if (!(ifile->i_state & I_NEW))
177 if (unlikely(err)) { 182 goto out;
178 nilfs_mdt_destroy(ifile); 183
179 return NULL; 184 err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
180 } 185 sizeof(struct nilfs_ifile_info));
181 nilfs_palloc_setup_cache(ifile, 186 if (err)
182 &NILFS_IFILE_I(ifile)->palloc_cache); 187 goto failed;
183 } 188
184 return ifile; 189 err = nilfs_palloc_init_blockgroup(ifile, inode_size);
190 if (err)
191 goto failed;
192
193 nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
194
195 err = nilfs_read_inode_common(ifile, raw_inode);
196 if (err)
197 goto failed;
198
199 unlock_new_inode(ifile);
200 out:
201 *inodep = ifile;
202 return 0;
203 failed:
204 iget_failed(ifile);
205 return err;
185} 206}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index cbca32e498f2..59b6f2b51df6 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
49int nilfs_ifile_delete_inode(struct inode *, ino_t); 49int nilfs_ifile_delete_inode(struct inode *, ino_t);
50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); 50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
51 51
52struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size); 52int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
53 size_t inode_size, struct nilfs_inode *raw_inode,
54 struct inode **inodep);
53 55
54#endif /* _NILFS_IFILE_H */ 56#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index eccb2f2e2315..71d4bc8464e0 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -34,6 +34,12 @@
34#include "cpfile.h" 34#include "cpfile.h"
35#include "ifile.h" 35#include "ifile.h"
36 36
37struct nilfs_iget_args {
38 u64 ino;
39 __u64 cno;
40 struct nilfs_root *root;
41 int for_gc;
42};
37 43
38/** 44/**
39 * nilfs_get_block() - get a file block on the filesystem (callback function) 45 * nilfs_get_block() - get a file block on the filesystem (callback function)
@@ -279,6 +285,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
279 struct nilfs_sb_info *sbi = NILFS_SB(sb); 285 struct nilfs_sb_info *sbi = NILFS_SB(sb);
280 struct inode *inode; 286 struct inode *inode;
281 struct nilfs_inode_info *ii; 287 struct nilfs_inode_info *ii;
288 struct nilfs_root *root;
282 int err = -ENOMEM; 289 int err = -ENOMEM;
283 ino_t ino; 290 ino_t ino;
284 291
@@ -289,15 +296,17 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
289 mapping_set_gfp_mask(inode->i_mapping, 296 mapping_set_gfp_mask(inode->i_mapping,
290 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 297 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
291 298
299 root = NILFS_I(dir)->i_root;
292 ii = NILFS_I(inode); 300 ii = NILFS_I(inode);
293 ii->i_state = 1 << NILFS_I_NEW; 301 ii->i_state = 1 << NILFS_I_NEW;
302 ii->i_root = root;
294 303
295 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); 304 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
296 if (unlikely(err)) 305 if (unlikely(err))
297 goto failed_ifile_create_inode; 306 goto failed_ifile_create_inode;
298 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 307 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
299 308
300 atomic_inc(&sbi->s_inodes_count); 309 atomic_inc(&root->inodes_count);
301 inode_init_owner(inode, dir, mode); 310 inode_init_owner(inode, dir, mode);
302 inode->i_ino = ino; 311 inode->i_ino = ino;
303 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 312 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -320,7 +329,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
320 /* ii->i_file_acl = 0; */ 329 /* ii->i_file_acl = 0; */
321 /* ii->i_dir_acl = 0; */ 330 /* ii->i_dir_acl = 0; */
322 ii->i_dir_start_lookup = 0; 331 ii->i_dir_start_lookup = 0;
323 ii->i_cno = 0;
324 nilfs_set_inode_flags(inode); 332 nilfs_set_inode_flags(inode);
325 spin_lock(&sbi->s_next_gen_lock); 333 spin_lock(&sbi->s_next_gen_lock);
326 inode->i_generation = sbi->s_next_generation++; 334 inode->i_generation = sbi->s_next_generation++;
@@ -350,16 +358,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
350 return ERR_PTR(err); 358 return ERR_PTR(err);
351} 359}
352 360
353void nilfs_free_inode(struct inode *inode)
354{
355 struct super_block *sb = inode->i_sb;
356 struct nilfs_sb_info *sbi = NILFS_SB(sb);
357
358 /* XXX: check error code? Is there any thing I can do? */
359 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
360 atomic_dec(&sbi->s_inodes_count);
361}
362
363void nilfs_set_inode_flags(struct inode *inode) 361void nilfs_set_inode_flags(struct inode *inode)
364{ 362{
365 unsigned int flags = NILFS_I(inode)->i_flags; 363 unsigned int flags = NILFS_I(inode)->i_flags;
@@ -410,7 +408,6 @@ int nilfs_read_inode_common(struct inode *inode,
410 0 : le32_to_cpu(raw_inode->i_dir_acl); 408 0 : le32_to_cpu(raw_inode->i_dir_acl);
411#endif 409#endif
412 ii->i_dir_start_lookup = 0; 410 ii->i_dir_start_lookup = 0;
413 ii->i_cno = 0;
414 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 411 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
415 412
416 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 413 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -424,7 +421,8 @@ int nilfs_read_inode_common(struct inode *inode,
424 return 0; 421 return 0;
425} 422}
426 423
427static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, 424static int __nilfs_read_inode(struct super_block *sb,
425 struct nilfs_root *root, unsigned long ino,
428 struct inode *inode) 426 struct inode *inode)
429{ 427{
430 struct nilfs_sb_info *sbi = NILFS_SB(sb); 428 struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -434,11 +432,11 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
434 int err; 432 int err;
435 433
436 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 434 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
437 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh); 435 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
438 if (unlikely(err)) 436 if (unlikely(err))
439 goto bad_inode; 437 goto bad_inode;
440 438
441 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 439 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
442 440
443 err = nilfs_read_inode_common(inode, raw_inode); 441 err = nilfs_read_inode_common(inode, raw_inode);
444 if (err) 442 if (err)
@@ -461,14 +459,14 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
461 inode, inode->i_mode, 459 inode, inode->i_mode,
462 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 460 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
463 } 461 }
464 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 462 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
465 brelse(bh); 463 brelse(bh);
466 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 464 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
467 nilfs_set_inode_flags(inode); 465 nilfs_set_inode_flags(inode);
468 return 0; 466 return 0;
469 467
470 failed_unmap: 468 failed_unmap:
471 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 469 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
472 brelse(bh); 470 brelse(bh);
473 471
474 bad_inode: 472 bad_inode:
@@ -476,18 +474,95 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
476 return err; 474 return err;
477} 475}
478 476
479struct inode *nilfs_iget(struct super_block *sb, unsigned long ino) 477static int nilfs_iget_test(struct inode *inode, void *opaque)
478{
479 struct nilfs_iget_args *args = opaque;
480 struct nilfs_inode_info *ii;
481
482 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
483 return 0;
484
485 ii = NILFS_I(inode);
486 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
487 return !args->for_gc;
488
489 return args->for_gc && args->cno == ii->i_cno;
490}
491
492static int nilfs_iget_set(struct inode *inode, void *opaque)
493{
494 struct nilfs_iget_args *args = opaque;
495
496 inode->i_ino = args->ino;
497 if (args->for_gc) {
498 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
499 NILFS_I(inode)->i_cno = args->cno;
500 NILFS_I(inode)->i_root = NULL;
501 } else {
502 if (args->root && args->ino == NILFS_ROOT_INO)
503 nilfs_get_root(args->root);
504 NILFS_I(inode)->i_root = args->root;
505 }
506 return 0;
507}
508
509struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
510 unsigned long ino)
511{
512 struct nilfs_iget_args args = {
513 .ino = ino, .root = root, .cno = 0, .for_gc = 0
514 };
515
516 return ilookup5(sb, ino, nilfs_iget_test, &args);
517}
518
519struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
520 unsigned long ino)
521{
522 struct nilfs_iget_args args = {
523 .ino = ino, .root = root, .cno = 0, .for_gc = 0
524 };
525
526 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
527}
528
529struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
530 unsigned long ino)
480{ 531{
481 struct inode *inode; 532 struct inode *inode;
482 int err; 533 int err;
483 534
484 inode = iget_locked(sb, ino); 535 inode = nilfs_iget_locked(sb, root, ino);
485 if (unlikely(!inode)) 536 if (unlikely(!inode))
486 return ERR_PTR(-ENOMEM); 537 return ERR_PTR(-ENOMEM);
487 if (!(inode->i_state & I_NEW)) 538 if (!(inode->i_state & I_NEW))
488 return inode; 539 return inode;
489 540
490 err = __nilfs_read_inode(sb, ino, inode); 541 err = __nilfs_read_inode(sb, root, ino, inode);
542 if (unlikely(err)) {
543 iget_failed(inode);
544 return ERR_PTR(err);
545 }
546 unlock_new_inode(inode);
547 return inode;
548}
549
550struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
551 __u64 cno)
552{
553 struct nilfs_iget_args args = {
554 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
555 };
556 struct inode *inode;
557 int err;
558
559 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
560 if (unlikely(!inode))
561 return ERR_PTR(-ENOMEM);
562 if (!(inode->i_state & I_NEW))
563 return inode;
564
565 err = nilfs_init_gcinode(inode);
491 if (unlikely(err)) { 566 if (unlikely(err)) {
492 iget_failed(inode); 567 iget_failed(inode);
493 return ERR_PTR(err); 568 return ERR_PTR(err);
@@ -528,21 +603,20 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
528{ 603{
529 ino_t ino = inode->i_ino; 604 ino_t ino = inode->i_ino;
530 struct nilfs_inode_info *ii = NILFS_I(inode); 605 struct nilfs_inode_info *ii = NILFS_I(inode);
531 struct super_block *sb = inode->i_sb; 606 struct inode *ifile = ii->i_root->ifile;
532 struct nilfs_sb_info *sbi = NILFS_SB(sb);
533 struct nilfs_inode *raw_inode; 607 struct nilfs_inode *raw_inode;
534 608
535 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); 609 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
536 610
537 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 611 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
538 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); 612 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
539 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 613 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
540 614
541 nilfs_write_inode_common(inode, raw_inode, 0); 615 nilfs_write_inode_common(inode, raw_inode, 0);
542 /* XXX: call with has_bmap = 0 is a workaround to avoid 616 /* XXX: call with has_bmap = 0 is a workaround to avoid
543 deadlock of bmap. This delays update of i_bmap to just 617 deadlock of bmap. This delays update of i_bmap to just
544 before writing */ 618 before writing */
545 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); 619 nilfs_ifile_unmap_inode(ifile, ino, ibh);
546} 620}
547 621
548#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 622#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
@@ -617,6 +691,7 @@ void nilfs_truncate(struct inode *inode)
617static void nilfs_clear_inode(struct inode *inode) 691static void nilfs_clear_inode(struct inode *inode)
618{ 692{
619 struct nilfs_inode_info *ii = NILFS_I(inode); 693 struct nilfs_inode_info *ii = NILFS_I(inode);
694 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
620 695
621 /* 696 /*
622 * Free resources allocated in nilfs_read_inode(), here. 697 * Free resources allocated in nilfs_read_inode(), here.
@@ -625,10 +700,16 @@ static void nilfs_clear_inode(struct inode *inode)
625 brelse(ii->i_bh); 700 brelse(ii->i_bh);
626 ii->i_bh = NULL; 701 ii->i_bh = NULL;
627 702
703 if (mdi && mdi->mi_palloc_cache)
704 nilfs_palloc_destroy_cache(inode);
705
628 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 706 if (test_bit(NILFS_I_BMAP, &ii->i_state))
629 nilfs_bmap_clear(ii->i_bmap); 707 nilfs_bmap_clear(ii->i_bmap);
630 708
631 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 709 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
710
711 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
712 nilfs_put_root(ii->i_root);
632} 713}
633 714
634void nilfs_evict_inode(struct inode *inode) 715void nilfs_evict_inode(struct inode *inode)
@@ -637,7 +718,7 @@ void nilfs_evict_inode(struct inode *inode)
637 struct super_block *sb = inode->i_sb; 718 struct super_block *sb = inode->i_sb;
638 struct nilfs_inode_info *ii = NILFS_I(inode); 719 struct nilfs_inode_info *ii = NILFS_I(inode);
639 720
640 if (inode->i_nlink || unlikely(is_bad_inode(inode))) { 721 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
641 if (inode->i_data.nrpages) 722 if (inode->i_data.nrpages)
642 truncate_inode_pages(&inode->i_data, 0); 723 truncate_inode_pages(&inode->i_data, 0);
643 end_writeback(inode); 724 end_writeback(inode);
@@ -649,12 +730,16 @@ void nilfs_evict_inode(struct inode *inode)
649 if (inode->i_data.nrpages) 730 if (inode->i_data.nrpages)
650 truncate_inode_pages(&inode->i_data, 0); 731 truncate_inode_pages(&inode->i_data, 0);
651 732
733 /* TODO: some of the following operations may fail. */
652 nilfs_truncate_bmap(ii, 0); 734 nilfs_truncate_bmap(ii, 0);
653 nilfs_mark_inode_dirty(inode); 735 nilfs_mark_inode_dirty(inode);
654 end_writeback(inode); 736 end_writeback(inode);
737
738 nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
739 atomic_dec(&ii->i_root->inodes_count);
740
655 nilfs_clear_inode(inode); 741 nilfs_clear_inode(inode);
656 nilfs_free_inode(inode); 742
657 /* nilfs_free_inode() marks inode buffer dirty */
658 if (IS_SYNC(inode)) 743 if (IS_SYNC(inode))
659 nilfs_set_transaction_flag(NILFS_TI_SYNC); 744 nilfs_set_transaction_flag(NILFS_TI_SYNC);
660 nilfs_transaction_commit(sb); 745 nilfs_transaction_commit(sb);
@@ -700,6 +785,17 @@ out_err:
700 return err; 785 return err;
701} 786}
702 787
788int nilfs_permission(struct inode *inode, int mask)
789{
790 struct nilfs_root *root = NILFS_I(inode)->i_root;
791
792 if ((mask & MAY_WRITE) && root &&
793 root->cno != NILFS_CPTREE_CURRENT_CNO)
794 return -EROFS; /* snapshot is not writable */
795
796 return generic_permission(inode, mask, NULL);
797}
798
703int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 799int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
704 struct buffer_head **pbh) 800 struct buffer_head **pbh)
705{ 801{
@@ -709,8 +805,8 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
709 spin_lock(&sbi->s_inode_lock); 805 spin_lock(&sbi->s_inode_lock);
710 if (ii->i_bh == NULL) { 806 if (ii->i_bh == NULL) {
711 spin_unlock(&sbi->s_inode_lock); 807 spin_unlock(&sbi->s_inode_lock);
712 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, 808 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
713 pbh); 809 inode->i_ino, pbh);
714 if (unlikely(err)) 810 if (unlikely(err))
715 return err; 811 return err;
716 spin_lock(&sbi->s_inode_lock); 812 spin_lock(&sbi->s_inode_lock);
@@ -790,7 +886,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
790 } 886 }
791 nilfs_update_inode(inode, ibh); 887 nilfs_update_inode(inode, ibh);
792 nilfs_mdt_mark_buffer_dirty(ibh); 888 nilfs_mdt_mark_buffer_dirty(ibh);
793 nilfs_mdt_mark_dirty(sbi->s_ifile); 889 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
794 brelse(ibh); 890 brelse(ibh);
795 return 0; 891 return 0;
796} 892}
@@ -808,6 +904,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
808void nilfs_dirty_inode(struct inode *inode) 904void nilfs_dirty_inode(struct inode *inode)
809{ 905{
810 struct nilfs_transaction_info ti; 906 struct nilfs_transaction_info ti;
907 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
811 908
812 if (is_bad_inode(inode)) { 909 if (is_bad_inode(inode)) {
813 nilfs_warning(inode->i_sb, __func__, 910 nilfs_warning(inode->i_sb, __func__,
@@ -815,6 +912,10 @@ void nilfs_dirty_inode(struct inode *inode)
815 dump_stack(); 912 dump_stack();
816 return; 913 return;
817 } 914 }
915 if (mdi) {
916 nilfs_mdt_mark_dirty(inode);
917 return;
918 }
818 nilfs_transaction_begin(inode->i_sb, &ti, 0); 919 nilfs_transaction_begin(inode->i_sb, &ti, 0);
819 nilfs_mark_inode_dirty(inode); 920 nilfs_mark_inode_dirty(inode);
820 nilfs_transaction_commit(inode->i_sb); /* never fails */ 921 nilfs_transaction_commit(inode->i_sb); /* never fails */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f90a33d9a5b0..3e90f86d5bfe 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -22,7 +22,6 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/slab.h> 25#include <linux/slab.h>
27#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
28#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
@@ -118,7 +117,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
118 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 117 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
119 goto out; 118 goto out;
120 119
121 mutex_lock(&nilfs->ns_mount_mutex); 120 down_read(&inode->i_sb->s_umount);
122 121
123 nilfs_transaction_begin(inode->i_sb, &ti, 0); 122 nilfs_transaction_begin(inode->i_sb, &ti, 0);
124 ret = nilfs_cpfile_change_cpmode( 123 ret = nilfs_cpfile_change_cpmode(
@@ -128,7 +127,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
128 else 127 else
129 nilfs_transaction_commit(inode->i_sb); /* never fails */ 128 nilfs_transaction_commit(inode->i_sb); /* never fails */
130 129
131 mutex_unlock(&nilfs->ns_mount_mutex); 130 up_read(&inode->i_sb->s_umount);
132out: 131out:
133 mnt_drop_write(filp->f_path.mnt); 132 mnt_drop_write(filp->f_path.mnt);
134 return ret; 133 return ret;
@@ -334,7 +333,7 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
334 return 0; 333 return 0;
335} 334}
336 335
337static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, 336static int nilfs_ioctl_move_blocks(struct super_block *sb,
338 struct nilfs_argv *argv, void *buf) 337 struct nilfs_argv *argv, void *buf)
339{ 338{
340 size_t nmembs = argv->v_nmembs; 339 size_t nmembs = argv->v_nmembs;
@@ -349,7 +348,7 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
349 for (i = 0, vdesc = buf; i < nmembs; ) { 348 for (i = 0, vdesc = buf; i < nmembs; ) {
350 ino = vdesc->vd_ino; 349 ino = vdesc->vd_ino;
351 cno = vdesc->vd_cno; 350 cno = vdesc->vd_cno;
352 inode = nilfs_gc_iget(nilfs, ino, cno); 351 inode = nilfs_iget_for_gc(sb, ino, cno);
353 if (unlikely(inode == NULL)) { 352 if (unlikely(inode == NULL)) {
354 ret = -ENOMEM; 353 ret = -ENOMEM;
355 goto failed; 354 goto failed;
@@ -357,11 +356,15 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
357 do { 356 do {
358 ret = nilfs_ioctl_move_inode_block(inode, vdesc, 357 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
359 &buffers); 358 &buffers);
360 if (unlikely(ret < 0)) 359 if (unlikely(ret < 0)) {
360 iput(inode);
361 goto failed; 361 goto failed;
362 }
362 vdesc++; 363 vdesc++;
363 } while (++i < nmembs && 364 } while (++i < nmembs &&
364 vdesc->vd_ino == ino && vdesc->vd_cno == cno); 365 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
366
367 iput(inode); /* The inode still remains in GC inode list */
365 } 368 }
366 369
367 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { 370 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
@@ -567,7 +570,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
567 } 570 }
568 571
569 /* 572 /*
570 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), 573 * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(),
571 * which will operates an inode list without blocking. 574 * which will operates an inode list without blocking.
572 * To protect the list from concurrent operations, 575 * To protect the list from concurrent operations,
573 * nilfs_ioctl_move_blocks should be atomic operation. 576 * nilfs_ioctl_move_blocks should be atomic operation.
@@ -577,15 +580,16 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
577 goto out_free; 580 goto out_free;
578 } 581 }
579 582
580 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); 583 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
584
585 ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
581 if (ret < 0) 586 if (ret < 0)
582 printk(KERN_ERR "NILFS: GC failed during preparation: " 587 printk(KERN_ERR "NILFS: GC failed during preparation: "
583 "cannot read source blocks: err=%d\n", ret); 588 "cannot read source blocks: err=%d\n", ret);
584 else 589 else
585 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 590 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
586 591
587 if (ret < 0) 592 nilfs_remove_all_gcinodes(nilfs);
588 nilfs_remove_all_gcinode(nilfs);
589 clear_nilfs_gc_running(nilfs); 593 clear_nilfs_gc_running(nilfs);
590 594
591out_free: 595out_free:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d01aff4957d9..39a5b84e2c9f 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -36,7 +36,6 @@
36 36
37#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) 37#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
38 38
39#define INIT_UNUSED_INODE_FIELDS
40 39
41static int 40static int
42nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, 41nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
@@ -78,25 +77,11 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
78 struct buffer_head *, 77 struct buffer_head *,
79 void *)) 78 void *))
80{ 79{
81 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
82 struct super_block *sb = inode->i_sb; 80 struct super_block *sb = inode->i_sb;
83 struct nilfs_transaction_info ti; 81 struct nilfs_transaction_info ti;
84 struct buffer_head *bh; 82 struct buffer_head *bh;
85 int err; 83 int err;
86 84
87 if (!sb) {
88 /*
89 * Make sure this function is not called from any
90 * read-only context.
91 */
92 if (!nilfs->ns_writer) {
93 WARN_ON(1);
94 err = -EROFS;
95 goto out;
96 }
97 sb = nilfs->ns_writer->s_super;
98 }
99
100 nilfs_transaction_begin(sb, &ti, 0); 85 nilfs_transaction_begin(sb, &ti, 0);
101 86
102 err = -ENOMEM; 87 err = -ENOMEM;
@@ -112,7 +97,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
112 if (buffer_uptodate(bh)) 97 if (buffer_uptodate(bh))
113 goto failed_bh; 98 goto failed_bh;
114 99
115 bh->b_bdev = nilfs->ns_bdev; 100 bh->b_bdev = sb->s_bdev;
116 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); 101 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
117 if (likely(!err)) { 102 if (likely(!err)) {
118 get_bh(bh); 103 get_bh(bh);
@@ -129,7 +114,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
129 err = nilfs_transaction_commit(sb); 114 err = nilfs_transaction_commit(sb);
130 else 115 else
131 nilfs_transaction_abort(sb); 116 nilfs_transaction_abort(sb);
132 out: 117
133 return err; 118 return err;
134} 119}
135 120
@@ -167,9 +152,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
167 unlock_buffer(bh); 152 unlock_buffer(bh);
168 goto failed_bh; 153 goto failed_bh;
169 } 154 }
170 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; 155 map_bh(bh, inode->i_sb, (sector_t)blknum);
171 bh->b_blocknr = (sector_t)blknum;
172 set_buffer_mapped(bh);
173 156
174 bh->b_end_io = end_buffer_read_sync; 157 bh->b_end_io = end_buffer_read_sync;
175 get_bh(bh); 158 get_bh(bh);
@@ -398,35 +381,24 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
398static int 381static int
399nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) 382nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
400{ 383{
401 struct inode *inode = container_of(page->mapping, 384 struct inode *inode;
402 struct inode, i_data); 385 struct super_block *sb;
403 struct super_block *sb = inode->i_sb;
404 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
405 struct nilfs_sb_info *writer = NULL;
406 int err = 0; 386 int err = 0;
407 387
408 redirty_page_for_writepage(wbc, page); 388 redirty_page_for_writepage(wbc, page);
409 unlock_page(page); 389 unlock_page(page);
410 390
411 if (page->mapping->assoc_mapping) 391 inode = page->mapping->host;
412 return 0; /* Do not request flush for shadow page cache */ 392 if (!inode)
413 if (!sb) { 393 return 0;
414 down_read(&nilfs->ns_writer_sem); 394
415 writer = nilfs->ns_writer; 395 sb = inode->i_sb;
416 if (!writer) {
417 up_read(&nilfs->ns_writer_sem);
418 return -EROFS;
419 }
420 sb = writer->s_super;
421 }
422 396
423 if (wbc->sync_mode == WB_SYNC_ALL) 397 if (wbc->sync_mode == WB_SYNC_ALL)
424 err = nilfs_construct_segment(sb); 398 err = nilfs_construct_segment(sb);
425 else if (wbc->for_reclaim) 399 else if (wbc->for_reclaim)
426 nilfs_flush_segment(sb, inode->i_ino); 400 nilfs_flush_segment(sb, inode->i_ino);
427 401
428 if (writer)
429 up_read(&nilfs->ns_writer_sem);
430 return err; 402 return err;
431} 403}
432 404
@@ -439,105 +411,27 @@ static const struct address_space_operations def_mdt_aops = {
439static const struct inode_operations def_mdt_iops; 411static const struct inode_operations def_mdt_iops;
440static const struct file_operations def_mdt_fops; 412static const struct file_operations def_mdt_fops;
441 413
442/* 414
443 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, 415int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
444 * ifile, or gcinodes. This allows the B-tree code and segment constructor
445 * to treat them like regular files, and this helps to simplify the
446 * implementation.
447 * On the other hand, some of the pseudo inodes have an irregular point:
448 * They don't have valid inode->i_sb pointer because their lifetimes are
449 * longer than those of the super block structs; they may continue for
450 * several consecutive mounts/umounts. This would need discussions.
451 */
452/**
453 * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
454 * @nilfs: nilfs object
455 * @sb: super block instance the metadata file belongs to
456 * @ino: inode number
457 * @gfp_mask: gfp mask for data pages
458 * @objsz: size of the private object attached to inode->i_private
459 */
460struct inode *
461nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
462 ino_t ino, gfp_t gfp_mask, size_t objsz)
463{ 416{
464 struct inode *inode = nilfs_alloc_inode_common(nilfs); 417 struct nilfs_mdt_info *mi;
465 418
466 if (!inode) 419 mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
467 return NULL; 420 if (!mi)
468 else { 421 return -ENOMEM;
469 struct address_space * const mapping = &inode->i_data;
470 struct nilfs_mdt_info *mi;
471
472 mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
473 if (!mi) {
474 nilfs_destroy_inode(inode);
475 return NULL;
476 }
477 mi->mi_nilfs = nilfs;
478 init_rwsem(&mi->mi_sem);
479
480 inode->i_sb = sb; /* sb may be NULL for some meta data files */
481 inode->i_blkbits = nilfs->ns_blocksize_bits;
482 inode->i_flags = 0;
483 atomic_set(&inode->i_count, 1);
484 inode->i_nlink = 1;
485 inode->i_ino = ino;
486 inode->i_mode = S_IFREG;
487 inode->i_private = mi;
488
489#ifdef INIT_UNUSED_INODE_FIELDS
490 atomic_set(&inode->i_writecount, 0);
491 inode->i_size = 0;
492 inode->i_blocks = 0;
493 inode->i_bytes = 0;
494 inode->i_generation = 0;
495#ifdef CONFIG_QUOTA
496 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
497#endif
498 inode->i_pipe = NULL;
499 inode->i_bdev = NULL;
500 inode->i_cdev = NULL;
501 inode->i_rdev = 0;
502#ifdef CONFIG_SECURITY
503 inode->i_security = NULL;
504#endif
505 inode->dirtied_when = 0;
506
507 INIT_LIST_HEAD(&inode->i_list);
508 INIT_LIST_HEAD(&inode->i_sb_list);
509 inode->i_state = 0;
510#endif
511
512 spin_lock_init(&inode->i_lock);
513 mutex_init(&inode->i_mutex);
514 init_rwsem(&inode->i_alloc_sem);
515
516 mapping->host = NULL; /* instead of inode */
517 mapping->flags = 0;
518 mapping_set_gfp_mask(mapping, gfp_mask);
519 mapping->assoc_mapping = NULL;
520 mapping->backing_dev_info = nilfs->ns_bdi;
521
522 inode->i_mapping = mapping;
523 }
524 422
525 return inode; 423 init_rwsem(&mi->mi_sem);
526} 424 inode->i_private = mi;
527 425
528struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, 426 inode->i_mode = S_IFREG;
529 ino_t ino, size_t objsz) 427 mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
530{ 428 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
531 struct inode *inode;
532
533 inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
534 if (!inode)
535 return NULL;
536 429
537 inode->i_op = &def_mdt_iops; 430 inode->i_op = &def_mdt_iops;
538 inode->i_fop = &def_mdt_fops; 431 inode->i_fop = &def_mdt_fops;
539 inode->i_mapping->a_ops = &def_mdt_aops; 432 inode->i_mapping->a_ops = &def_mdt_aops;
540 return inode; 433
434 return 0;
541} 435}
542 436
543void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, 437void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
@@ -550,34 +444,159 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
550 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 444 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
551} 445}
552 446
553void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) 447static const struct address_space_operations shadow_map_aops = {
448 .sync_page = block_sync_page,
449};
450
451/**
452 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
453 * @inode: inode of the metadata file
454 * @shadow: shadow mapping
455 */
456int nilfs_mdt_setup_shadow_map(struct inode *inode,
457 struct nilfs_shadow_map *shadow)
554{ 458{
555 shadow->i_mapping->assoc_mapping = orig->i_mapping; 459 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
556 NILFS_I(shadow)->i_btnode_cache.assoc_mapping = 460 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
557 &NILFS_I(orig)->i_btnode_cache; 461
462 INIT_LIST_HEAD(&shadow->frozen_buffers);
463 nilfs_mapping_init_once(&shadow->frozen_data);
464 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
465 nilfs_mapping_init_once(&shadow->frozen_btnodes);
466 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
467 mi->mi_shadow = shadow;
468 return 0;
558} 469}
559 470
560static void nilfs_mdt_clear(struct inode *inode) 471/**
472 * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
473 * @inode: inode of the metadata file
474 */
475int nilfs_mdt_save_to_shadow_map(struct inode *inode)
561{ 476{
477 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
562 struct nilfs_inode_info *ii = NILFS_I(inode); 478 struct nilfs_inode_info *ii = NILFS_I(inode);
479 struct nilfs_shadow_map *shadow = mi->mi_shadow;
480 int ret;
563 481
564 invalidate_mapping_pages(inode->i_mapping, 0, -1); 482 ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
565 truncate_inode_pages(inode->i_mapping, 0); 483 if (ret)
484 goto out;
485
486 ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
487 &ii->i_btnode_cache);
488 if (ret)
489 goto out;
566 490
567 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 491 nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
568 nilfs_bmap_clear(ii->i_bmap); 492 out:
569 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 493 return ret;
570} 494}
571 495
572void nilfs_mdt_destroy(struct inode *inode) 496int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
573{ 497{
574 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 498 struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
499 struct buffer_head *bh_frozen;
500 struct page *page;
501 int blkbits = inode->i_blkbits;
502 int ret = -ENOMEM;
503
504 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
505 if (!page)
506 return ret;
507
508 if (!page_has_buffers(page))
509 create_empty_buffers(page, 1 << blkbits, 0);
510
511 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
512 if (bh_frozen) {
513 if (!buffer_uptodate(bh_frozen))
514 nilfs_copy_buffer(bh_frozen, bh);
515 if (list_empty(&bh_frozen->b_assoc_buffers)) {
516 list_add_tail(&bh_frozen->b_assoc_buffers,
517 &shadow->frozen_buffers);
518 set_buffer_nilfs_redirected(bh);
519 } else {
520 brelse(bh_frozen); /* already frozen */
521 }
522 ret = 0;
523 }
524 unlock_page(page);
525 page_cache_release(page);
526 return ret;
527}
528
529struct buffer_head *
530nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
531{
532 struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
533 struct buffer_head *bh_frozen = NULL;
534 struct page *page;
535 int n;
536
537 page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
538 if (page) {
539 if (page_has_buffers(page)) {
540 n = bh_offset(bh) >> inode->i_blkbits;
541 bh_frozen = nilfs_page_get_nth_block(page, n);
542 }
543 unlock_page(page);
544 page_cache_release(page);
545 }
546 return bh_frozen;
547}
548
549static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
550{
551 struct list_head *head = &shadow->frozen_buffers;
552 struct buffer_head *bh;
553
554 while (!list_empty(head)) {
555 bh = list_first_entry(head, struct buffer_head,
556 b_assoc_buffers);
557 list_del_init(&bh->b_assoc_buffers);
558 brelse(bh); /* drop ref-count to make it releasable */
559 }
560}
561
562/**
563 * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
564 * @inode: inode of the metadata file
565 */
566void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
567{
568 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
569 struct nilfs_inode_info *ii = NILFS_I(inode);
570 struct nilfs_shadow_map *shadow = mi->mi_shadow;
571
572 down_write(&mi->mi_sem);
575 573
576 if (mdi->mi_palloc_cache) 574 if (mi->mi_palloc_cache)
577 nilfs_palloc_destroy_cache(inode); 575 nilfs_palloc_clear_cache(inode);
578 nilfs_mdt_clear(inode); 576
577 nilfs_clear_dirty_pages(inode->i_mapping);
578 nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
579
580 nilfs_clear_dirty_pages(&ii->i_btnode_cache);
581 nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
582
583 nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
584
585 up_write(&mi->mi_sem);
586}
587
588/**
589 * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
590 * @inode: inode of the metadata file
591 */
592void nilfs_mdt_clear_shadow_map(struct inode *inode)
593{
594 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
595 struct nilfs_shadow_map *shadow = mi->mi_shadow;
579 596
580 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ 597 down_write(&mi->mi_sem);
581 kfree(mdi); 598 nilfs_release_frozen_buffers(shadow);
582 nilfs_destroy_inode(inode); 599 truncate_inode_pages(&shadow->frozen_data, 0);
600 truncate_inode_pages(&shadow->frozen_btnodes, 0);
601 up_write(&mi->mi_sem);
583} 602}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 6c4bbb0470fc..b13734bf3521 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -28,26 +28,33 @@
28#include "nilfs.h" 28#include "nilfs.h"
29#include "page.h" 29#include "page.h"
30 30
31struct nilfs_shadow_map {
32 struct nilfs_bmap_store bmap_store;
33 struct address_space frozen_data;
34 struct address_space frozen_btnodes;
35 struct list_head frozen_buffers;
36};
37
31/** 38/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files 39 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations 40 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking 41 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry 42 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry 43 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block 44 * @mi_entries_per_block: number of entries in a block
39 * @mi_palloc_cache: persistent object allocator cache 45 * @mi_palloc_cache: persistent object allocator cache
46 * @mi_shadow: shadow of bmap and page caches
40 * @mi_blocks_per_group: number of blocks in a group 47 * @mi_blocks_per_group: number of blocks in a group
41 * @mi_blocks_per_desc_block: number of blocks per descriptor block 48 * @mi_blocks_per_desc_block: number of blocks per descriptor block
42 */ 49 */
43struct nilfs_mdt_info { 50struct nilfs_mdt_info {
44 struct the_nilfs *mi_nilfs;
45 struct rw_semaphore mi_sem; 51 struct rw_semaphore mi_sem;
46 struct blockgroup_lock *mi_bgl; 52 struct blockgroup_lock *mi_bgl;
47 unsigned mi_entry_size; 53 unsigned mi_entry_size;
48 unsigned mi_first_entry_offset; 54 unsigned mi_first_entry_offset;
49 unsigned long mi_entries_per_block; 55 unsigned long mi_entries_per_block;
50 struct nilfs_palloc_cache *mi_palloc_cache; 56 struct nilfs_palloc_cache *mi_palloc_cache;
57 struct nilfs_shadow_map *mi_shadow;
51 unsigned long mi_blocks_per_group; 58 unsigned long mi_blocks_per_group;
52 unsigned long mi_blocks_per_desc_block; 59 unsigned long mi_blocks_per_desc_block;
53}; 60};
@@ -59,9 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
59 66
60static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) 67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
61{ 68{
62 struct super_block *sb = inode->i_sb; 69 return NILFS_SB(inode->i_sb)->s_nilfs;
63
64 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
65} 70}
66 71
67/* Default GFP flags using highmem */ 72/* Default GFP flags using highmem */
@@ -76,14 +81,17 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
76int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); 81int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
77int nilfs_mdt_fetch_dirty(struct inode *); 82int nilfs_mdt_fetch_dirty(struct inode *);
78 83
79struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, 84int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
80 size_t);
81struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
82 ino_t, gfp_t, size_t);
83void nilfs_mdt_destroy(struct inode *);
84void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); 85void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
85void nilfs_mdt_set_shadow(struct inode *, struct inode *);
86 86
87int nilfs_mdt_setup_shadow_map(struct inode *inode,
88 struct nilfs_shadow_map *shadow);
89int nilfs_mdt_save_to_shadow_map(struct inode *inode);
90void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
91void nilfs_mdt_clear_shadow_map(struct inode *inode);
92int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
93struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
94 struct buffer_head *bh);
87 95
88#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) 96#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
89 97
@@ -100,7 +108,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
100 108
101static inline __u64 nilfs_mdt_cno(struct inode *inode) 109static inline __u64 nilfs_mdt_cno(struct inode *inode)
102{ 110{
103 return NILFS_MDT(inode)->mi_nilfs->ns_cno; 111 return NILFS_I_NILFS(inode)->ns_cno;
104} 112}
105 113
106#define nilfs_mdt_bgl_lock(inode, bg) \ 114#define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ad6ed2cf19b4..6e9557ecf161 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -40,7 +40,11 @@
40 40
41#include <linux/pagemap.h> 41#include <linux/pagemap.h>
42#include "nilfs.h" 42#include "nilfs.h"
43#include "export.h"
43 44
45#define NILFS_FID_SIZE_NON_CONNECTABLE \
46 (offsetof(struct nilfs_fid, parent_gen) / 4)
47#define NILFS_FID_SIZE_CONNECTABLE (sizeof(struct nilfs_fid) / 4)
44 48
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) 49static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{ 50{
@@ -70,29 +74,13 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
70 ino = nilfs_inode_by_name(dir, &dentry->d_name); 74 ino = nilfs_inode_by_name(dir, &dentry->d_name);
71 inode = NULL; 75 inode = NULL;
72 if (ino) { 76 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino); 77 inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
74 if (IS_ERR(inode)) 78 if (IS_ERR(inode))
75 return ERR_CAST(inode); 79 return ERR_CAST(inode);
76 } 80 }
77 return d_splice_alias(inode, dentry); 81 return d_splice_alias(inode, dentry);
78} 82}
79 83
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct qstr dotdot = {.name = "..", .len = 2};
85
86 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
87 if (!ino)
88 return ERR_PTR(-ENOENT);
89
90 inode = nilfs_iget(child->d_inode->i_sb, ino);
91 if (IS_ERR(inode))
92 return ERR_CAST(inode);
93 return d_obtain_alias(inode);
94}
95
96/* 84/*
97 * By the time this is called, we already have created 85 * By the time this is called, we already have created
98 * the directory cache entry for the new file, but it 86 * the directory cache entry for the new file, but it
@@ -219,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
219 207
220 inode->i_ctime = CURRENT_TIME; 208 inode->i_ctime = CURRENT_TIME;
221 inode_inc_link_count(inode); 209 inode_inc_link_count(inode);
222 atomic_inc(&inode->i_count); 210 ihold(inode);
223 211
224 err = nilfs_add_nondir(dentry, inode); 212 err = nilfs_add_nondir(dentry, inode);
225 if (!err) 213 if (!err)
@@ -468,6 +456,115 @@ out:
468 return err; 456 return err;
469} 457}
470 458
459/*
460 * Export operations
461 */
462static struct dentry *nilfs_get_parent(struct dentry *child)
463{
464 unsigned long ino;
465 struct inode *inode;
466 struct qstr dotdot = {.name = "..", .len = 2};
467 struct nilfs_root *root;
468
469 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
470 if (!ino)
471 return ERR_PTR(-ENOENT);
472
473 root = NILFS_I(child->d_inode)->i_root;
474
475 inode = nilfs_iget(child->d_inode->i_sb, root, ino);
476 if (IS_ERR(inode))
477 return ERR_CAST(inode);
478
479 return d_obtain_alias(inode);
480}
481
482static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
483 u64 ino, u32 gen)
484{
485 struct nilfs_root *root;
486 struct inode *inode;
487
488 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
489 return ERR_PTR(-ESTALE);
490
491 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
492 if (!root)
493 return ERR_PTR(-ESTALE);
494
495 inode = nilfs_iget(sb, root, ino);
496 nilfs_put_root(root);
497
498 if (IS_ERR(inode))
499 return ERR_CAST(inode);
500 if (gen && inode->i_generation != gen) {
501 iput(inode);
502 return ERR_PTR(-ESTALE);
503 }
504 return d_obtain_alias(inode);
505}
506
507static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
508 int fh_len, int fh_type)
509{
510 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
511
512 if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
513 fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
514 (fh_type != FILEID_NILFS_WITH_PARENT &&
515 fh_type != FILEID_NILFS_WITHOUT_PARENT))
516 return NULL;
517
518 return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen);
519}
520
521static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
522 int fh_len, int fh_type)
523{
524 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
525
526 if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
527 fh_type != FILEID_NILFS_WITH_PARENT)
528 return NULL;
529
530 return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
531}
532
533static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
534 int connectable)
535{
536 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
537 struct inode *inode = dentry->d_inode;
538 struct nilfs_root *root = NILFS_I(inode)->i_root;
539 int type;
540
541 if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
542 (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
543 return 255;
544
545 fid->cno = root->cno;
546 fid->ino = inode->i_ino;
547 fid->gen = inode->i_generation;
548
549 if (connectable && !S_ISDIR(inode->i_mode)) {
550 struct inode *parent;
551
552 spin_lock(&dentry->d_lock);
553 parent = dentry->d_parent->d_inode;
554 fid->parent_ino = parent->i_ino;
555 fid->parent_gen = parent->i_generation;
556 spin_unlock(&dentry->d_lock);
557
558 type = FILEID_NILFS_WITH_PARENT;
559 *lenp = NILFS_FID_SIZE_CONNECTABLE;
560 } else {
561 type = FILEID_NILFS_WITHOUT_PARENT;
562 *lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
563 }
564
565 return type;
566}
567
471const struct inode_operations nilfs_dir_inode_operations = { 568const struct inode_operations nilfs_dir_inode_operations = {
472 .create = nilfs_create, 569 .create = nilfs_create,
473 .lookup = nilfs_lookup, 570 .lookup = nilfs_lookup,
@@ -491,4 +588,12 @@ const struct inode_operations nilfs_symlink_inode_operations = {
491 .readlink = generic_readlink, 588 .readlink = generic_readlink,
492 .follow_link = page_follow_link_light, 589 .follow_link = page_follow_link_light,
493 .put_link = page_put_link, 590 .put_link = page_put_link,
591 .permission = nilfs_permission,
592};
593
594const struct export_operations nilfs_export_ops = {
595 .encode_fh = nilfs_encode_fh,
596 .fh_to_dentry = nilfs_fh_to_dentry,
597 .fh_to_parent = nilfs_fh_to_parent,
598 .get_parent = nilfs_get_parent,
494}; 599};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index d3d54046e5f8..f7560da5a567 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -59,6 +59,7 @@ struct nilfs_inode_info {
59#endif 59#endif
60 struct buffer_head *i_bh; /* i_bh contains a new or dirty 60 struct buffer_head *i_bh; /* i_bh contains a new or dirty
61 disk inode */ 61 disk inode */
62 struct nilfs_root *i_root;
62 struct inode vfs_inode; 63 struct inode vfs_inode;
63}; 64};
64 65
@@ -100,7 +101,6 @@ enum {
100 NILFS_I_INODE_DIRTY, /* write_inode is requested */ 101 NILFS_I_INODE_DIRTY, /* write_inode is requested */
101 NILFS_I_BMAP, /* has bmap and btnode_cache */ 102 NILFS_I_BMAP, /* has bmap and btnode_cache */
102 NILFS_I_GCINODE, /* inode for GC, on memory only */ 103 NILFS_I_GCINODE, /* inode for GC, on memory only */
103 NILFS_I_GCDAT, /* shadow DAT, on memory only */
104}; 104};
105 105
106/* 106/*
@@ -192,7 +192,7 @@ static inline int nilfs_doing_construction(void)
192 192
193static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs) 193static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
194{ 194{
195 return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat; 195 return nilfs->ns_dat;
196} 196}
197 197
198/* 198/*
@@ -200,12 +200,9 @@ static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
200 */ 200 */
201#ifdef CONFIG_NILFS_POSIX_ACL 201#ifdef CONFIG_NILFS_POSIX_ACL
202#error "NILFS: not yet supported POSIX ACL" 202#error "NILFS: not yet supported POSIX ACL"
203extern int nilfs_permission(struct inode *, int, struct nameidata *);
204extern int nilfs_acl_chmod(struct inode *); 203extern int nilfs_acl_chmod(struct inode *);
205extern int nilfs_init_acl(struct inode *, struct inode *); 204extern int nilfs_init_acl(struct inode *, struct inode *);
206#else 205#else
207#define nilfs_permission NULL
208
209static inline int nilfs_acl_chmod(struct inode *inode) 206static inline int nilfs_acl_chmod(struct inode *inode)
210{ 207{
211 return 0; 208 return 0;
@@ -247,11 +244,19 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
247extern void nilfs_set_inode_flags(struct inode *); 244extern void nilfs_set_inode_flags(struct inode *);
248extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); 245extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
249extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); 246extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
250extern struct inode *nilfs_iget(struct super_block *, unsigned long); 247struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
248 unsigned long ino);
249struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
250 unsigned long ino);
251struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
252 unsigned long ino);
253extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
254 unsigned long ino, __u64 cno);
251extern void nilfs_update_inode(struct inode *, struct buffer_head *); 255extern void nilfs_update_inode(struct inode *, struct buffer_head *);
252extern void nilfs_truncate(struct inode *); 256extern void nilfs_truncate(struct inode *);
253extern void nilfs_evict_inode(struct inode *); 257extern void nilfs_evict_inode(struct inode *);
254extern int nilfs_setattr(struct dentry *, struct iattr *); 258extern int nilfs_setattr(struct dentry *, struct iattr *);
259int nilfs_permission(struct inode *inode, int mask);
255extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, 260extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
256 struct buffer_head **); 261 struct buffer_head **);
257extern int nilfs_inode_dirty(struct inode *); 262extern int nilfs_inode_dirty(struct inode *);
@@ -260,11 +265,7 @@ extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
260extern int nilfs_mark_inode_dirty(struct inode *); 265extern int nilfs_mark_inode_dirty(struct inode *);
261extern void nilfs_dirty_inode(struct inode *); 266extern void nilfs_dirty_inode(struct inode *);
262 267
263/* namei.c */
264extern struct dentry *nilfs_get_parent(struct dentry *);
265
266/* super.c */ 268/* super.c */
267extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
268extern struct inode *nilfs_alloc_inode(struct super_block *); 269extern struct inode *nilfs_alloc_inode(struct super_block *);
269extern void nilfs_destroy_inode(struct inode *); 270extern void nilfs_destroy_inode(struct inode *);
270extern void nilfs_error(struct super_block *, const char *, const char *, ...) 271extern void nilfs_error(struct super_block *, const char *, const char *, ...)
@@ -283,8 +284,9 @@ extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
283 int flip); 284 int flip);
284extern int nilfs_commit_super(struct nilfs_sb_info *, int); 285extern int nilfs_commit_super(struct nilfs_sb_info *, int);
285extern int nilfs_cleanup_super(struct nilfs_sb_info *); 286extern int nilfs_cleanup_super(struct nilfs_sb_info *);
286extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); 287int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
287extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); 288 struct nilfs_root **root);
289int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
288 290
289/* gcinode.c */ 291/* gcinode.c */
290int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, 292int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
@@ -292,16 +294,8 @@ int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
292int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, 294int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
293 struct buffer_head **); 295 struct buffer_head **);
294int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); 296int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
295int nilfs_init_gccache(struct the_nilfs *); 297int nilfs_init_gcinode(struct inode *inode);
296void nilfs_destroy_gccache(struct the_nilfs *); 298void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
297void nilfs_clear_gcinode(struct inode *);
298struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
299void nilfs_remove_all_gcinode(struct the_nilfs *);
300
301/* gcdat.c */
302int nilfs_init_gcdat_inode(struct the_nilfs *);
303void nilfs_commit_gcdat_inode(struct the_nilfs *);
304void nilfs_clear_gcdat_inode(struct the_nilfs *);
305 299
306/* 300/*
307 * Inodes and files operations 301 * Inodes and files operations
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index aab11db2cb08..a6c3c2e817f8 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -79,8 +79,8 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
79{ 79{
80 int blkbits = inode->i_blkbits; 80 int blkbits = inode->i_blkbits;
81 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); 81 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
82 struct page *page, *opage; 82 struct page *page;
83 struct buffer_head *bh, *obh; 83 struct buffer_head *bh;
84 84
85 page = grab_cache_page(mapping, index); 85 page = grab_cache_page(mapping, index);
86 if (unlikely(!page)) 86 if (unlikely(!page))
@@ -92,30 +92,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
92 page_cache_release(page); 92 page_cache_release(page);
93 return NULL; 93 return NULL;
94 } 94 }
95 if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
96 /*
97 * Shadow page cache uses assoc_mapping to point its original
98 * page cache. The following code tries the original cache
99 * if the given cache is a shadow and it didn't hit.
100 */
101 opage = find_lock_page(mapping->assoc_mapping, index);
102 if (!opage)
103 return bh;
104
105 obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
106 b_state);
107 if (buffer_uptodate(obh)) {
108 nilfs_copy_buffer(bh, obh);
109 if (buffer_dirty(obh)) {
110 nilfs_mark_buffer_dirty(bh);
111 if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
112 nilfs_mdt_mark_dirty(inode);
113 }
114 }
115 brelse(obh);
116 unlock_page(opage);
117 page_cache_release(opage);
118 }
119 return bh; 95 return bh;
120} 96}
121 97
@@ -131,6 +107,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
131 lock_buffer(bh); 107 lock_buffer(bh);
132 clear_buffer_nilfs_volatile(bh); 108 clear_buffer_nilfs_volatile(bh);
133 clear_buffer_nilfs_checked(bh); 109 clear_buffer_nilfs_checked(bh);
110 clear_buffer_nilfs_redirected(bh);
134 clear_buffer_dirty(bh); 111 clear_buffer_dirty(bh);
135 if (nilfs_page_buffers_clean(page)) 112 if (nilfs_page_buffers_clean(page))
136 __nilfs_clear_page_dirty(page); 113 __nilfs_clear_page_dirty(page);
@@ -483,6 +460,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
483 clear_buffer_dirty(bh); 460 clear_buffer_dirty(bh);
484 clear_buffer_nilfs_volatile(bh); 461 clear_buffer_nilfs_volatile(bh);
485 clear_buffer_nilfs_checked(bh); 462 clear_buffer_nilfs_checked(bh);
463 clear_buffer_nilfs_redirected(bh);
486 clear_buffer_uptodate(bh); 464 clear_buffer_uptodate(bh);
487 clear_buffer_mapped(bh); 465 clear_buffer_mapped(bh);
488 unlock_buffer(bh); 466 unlock_buffer(bh);
@@ -513,6 +491,31 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
513 } 491 }
514 return nc; 492 return nc;
515} 493}
494
495void nilfs_mapping_init_once(struct address_space *mapping)
496{
497 memset(mapping, 0, sizeof(*mapping));
498 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
499 spin_lock_init(&mapping->tree_lock);
500 INIT_LIST_HEAD(&mapping->private_list);
501 spin_lock_init(&mapping->private_lock);
502
503 spin_lock_init(&mapping->i_mmap_lock);
504 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
505 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
506}
507
508void nilfs_mapping_init(struct address_space *mapping,
509 struct backing_dev_info *bdi,
510 const struct address_space_operations *aops)
511{
512 mapping->host = NULL;
513 mapping->flags = 0;
514 mapping_set_gfp_mask(mapping, GFP_NOFS);
515 mapping->assoc_mapping = NULL;
516 mapping->backing_dev_info = bdi;
517 mapping->a_ops = aops;
518}
516 519
517/* 520/*
518 * NILFS2 needs clear_page_dirty() in the following two cases: 521 * NILFS2 needs clear_page_dirty() in the following two cases:
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index f53d8da41ed7..fb9e8a8a2038 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -35,12 +35,14 @@ enum {
35 BH_NILFS_Node, 35 BH_NILFS_Node,
36 BH_NILFS_Volatile, 36 BH_NILFS_Volatile,
37 BH_NILFS_Checked, 37 BH_NILFS_Checked,
38 BH_NILFS_Redirected,
38}; 39};
39 40
40BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ 41BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
41BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ 42BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
42BUFFER_FNS(NILFS_Volatile, nilfs_volatile) 43BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
43BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ 44BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */
45BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */
44 46
45 47
46void nilfs_mark_buffer_dirty(struct buffer_head *bh); 48void nilfs_mark_buffer_dirty(struct buffer_head *bh);
@@ -59,6 +61,10 @@ void nilfs_free_private_page(struct page *);
59int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); 61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
60void nilfs_copy_back_pages(struct address_space *, struct address_space *); 62void nilfs_copy_back_pages(struct address_space *, struct address_space *);
61void nilfs_clear_dirty_pages(struct address_space *); 63void nilfs_clear_dirty_pages(struct address_space *);
64void nilfs_mapping_init_once(struct address_space *mapping);
65void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi,
67 const struct address_space_operations *aops);
62unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
63 69
64#define NILFS_PAGE_BUG(page, m, a...) \ 70#define NILFS_PAGE_BUG(page, m, a...) \
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d0c35ef39f6a..5d2711c28da7 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -440,7 +440,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
440 segnum[2] = ri->ri_segnum; 440 segnum[2] = ri->ri_segnum;
441 segnum[3] = ri->ri_nextnum; 441 segnum[3] = ri->ri_nextnum;
442 442
443 nilfs_attach_writer(nilfs, sbi);
444 /* 443 /*
445 * Releasing the next segment of the latest super root. 444 * Releasing the next segment of the latest super root.
446 * The next segment is invalidated by this recovery. 445 * The next segment is invalidated by this recovery.
@@ -480,7 +479,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
480 479
481 failed: 480 failed:
482 /* No need to recover sufile because it will be destroyed on error */ 481 /* No need to recover sufile because it will be destroyed on error */
483 nilfs_detach_writer(nilfs, sbi);
484 return err; 482 return err;
485} 483}
486 484
@@ -504,6 +502,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
504 502
505static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, 503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
506 struct nilfs_sb_info *sbi, 504 struct nilfs_sb_info *sbi,
505 struct nilfs_root *root,
507 struct list_head *head, 506 struct list_head *head,
508 unsigned long *nr_salvaged_blocks) 507 unsigned long *nr_salvaged_blocks)
509{ 508{
@@ -515,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
515 int err = 0, err2 = 0; 514 int err = 0, err2 = 0;
516 515
517 list_for_each_entry_safe(rb, n, head, list) { 516 list_for_each_entry_safe(rb, n, head, list) {
518 inode = nilfs_iget(sbi->s_super, rb->ino); 517 inode = nilfs_iget(sbi->s_super, root, rb->ino);
519 if (IS_ERR(inode)) { 518 if (IS_ERR(inode)) {
520 err = PTR_ERR(inode); 519 err = PTR_ERR(inode);
521 inode = NULL; 520 inode = NULL;
@@ -578,6 +577,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
578 */ 577 */
579static int nilfs_do_roll_forward(struct the_nilfs *nilfs, 578static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
580 struct nilfs_sb_info *sbi, 579 struct nilfs_sb_info *sbi,
580 struct nilfs_root *root,
581 struct nilfs_recovery_info *ri) 581 struct nilfs_recovery_info *ri)
582{ 582{
583 struct buffer_head *bh_sum = NULL; 583 struct buffer_head *bh_sum = NULL;
@@ -597,7 +597,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
597 }; 597 };
598 int state = RF_INIT_ST; 598 int state = RF_INIT_ST;
599 599
600 nilfs_attach_writer(nilfs, sbi);
601 pseg_start = ri->ri_lsegs_start; 600 pseg_start = ri->ri_lsegs_start;
602 seg_seq = ri->ri_lsegs_start_seq; 601 seg_seq = ri->ri_lsegs_start_seq;
603 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); 602 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
@@ -649,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
649 goto failed; 648 goto failed;
650 if (flags & NILFS_SS_LOGEND) { 649 if (flags & NILFS_SS_LOGEND) {
651 err = nilfs_recover_dsync_blocks( 650 err = nilfs_recover_dsync_blocks(
652 nilfs, sbi, &dsync_blocks, 651 nilfs, sbi, root, &dsync_blocks,
653 &nsalvaged_blocks); 652 &nsalvaged_blocks);
654 if (unlikely(err)) 653 if (unlikely(err))
655 goto failed; 654 goto failed;
@@ -688,7 +687,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
688 out: 687 out:
689 brelse(bh_sum); 688 brelse(bh_sum);
690 dispose_recovery_list(&dsync_blocks); 689 dispose_recovery_list(&dsync_blocks);
691 nilfs_detach_writer(nilfs, sbi);
692 return err; 690 return err;
693 691
694 confused: 692 confused:
@@ -746,19 +744,20 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
746 struct nilfs_sb_info *sbi, 744 struct nilfs_sb_info *sbi,
747 struct nilfs_recovery_info *ri) 745 struct nilfs_recovery_info *ri)
748{ 746{
747 struct nilfs_root *root;
749 int err; 748 int err;
750 749
751 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) 750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
752 return 0; 751 return 0;
753 752
754 err = nilfs_attach_checkpoint(sbi, ri->ri_cno); 753 err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
755 if (unlikely(err)) { 754 if (unlikely(err)) {
756 printk(KERN_ERR 755 printk(KERN_ERR
757 "NILFS: error loading the latest checkpoint.\n"); 756 "NILFS: error loading the latest checkpoint.\n");
758 return err; 757 return err;
759 } 758 }
760 759
761 err = nilfs_do_roll_forward(nilfs, sbi, ri); 760 err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
762 if (unlikely(err)) 761 if (unlikely(err))
763 goto failed; 762 goto failed;
764 763
@@ -770,7 +769,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
770 goto failed; 769 goto failed;
771 } 770 }
772 771
773 err = nilfs_attach_segment_constructor(sbi); 772 err = nilfs_attach_segment_constructor(sbi, root);
774 if (unlikely(err)) 773 if (unlikely(err))
775 goto failed; 774 goto failed;
776 775
@@ -788,7 +787,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
788 } 787 }
789 788
790 failed: 789 failed:
791 nilfs_detach_checkpoint(sbi); 790 nilfs_put_root(root);
792 return err; 791 return err;
793} 792}
794 793
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 0776ccc2504a..35a07157b980 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -42,11 +42,6 @@ struct nilfs_sc_info;
42 * NILFS super-block data in memory 42 * NILFS super-block data in memory
43 */ 43 */
44struct nilfs_sb_info { 44struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */ 45 /* Mount options */
51 unsigned long s_mount_opt; 46 unsigned long s_mount_opt;
52 uid_t s_resuid; 47 uid_t s_resuid;
@@ -59,8 +54,6 @@ struct nilfs_sb_info {
59 /* Fundamental members */ 54 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */ 55 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs; 56 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63 atomic_t s_count; /* reference count */
64 57
65 /* Segment constructor */ 58 /* Segment constructor */
66 struct list_head s_dirty_files; /* dirty files list */ 59 struct list_head s_dirty_files; /* dirty files list */
@@ -68,9 +61,6 @@ struct nilfs_sb_info {
68 spinlock_t s_inode_lock; /* Lock for the nilfs inode. 61 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
69 It covers s_dirty_files list */ 62 It covers s_dirty_files list */
70 63
71 /* Metadata files */
72 struct inode *s_ifile; /* index file inode */
73
74 /* Inode allocator */ 64 /* Inode allocator */
75 spinlock_t s_next_gen_lock; 65 spinlock_t s_next_gen_lock;
76 u32 s_next_generation; 66 u32 s_next_generation;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 4588fb9e93df..0f83e93935b2 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -371,7 +371,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
371 struct bio *bio = wi->bio; 371 struct bio *bio = wi->bio;
372 int err; 372 int err;
373 373
374 if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) { 374 if (segbuf->sb_nbio > 0 &&
375 bdi_write_congested(segbuf->sb_super->s_bdi)) {
375 wait_for_completion(&segbuf->sb_bio_event); 376 wait_for_completion(&segbuf->sb_bio_event);
376 segbuf->sb_nbio--; 377 segbuf->sb_nbio--;
377 if (unlikely(atomic_read(&segbuf->sb_err))) { 378 if (unlikely(atomic_read(&segbuf->sb_err))) {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9fd051a33c4f..687d090cea34 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -191,6 +191,8 @@ int nilfs_transaction_begin(struct super_block *sb,
191 if (ret > 0) 191 if (ret > 0)
192 return 0; 192 return 0;
193 193
194 vfs_check_frozen(sb, SB_FREEZE_WRITE);
195
194 sbi = NILFS_SB(sb); 196 sbi = NILFS_SB(sb);
195 nilfs = sbi->s_nilfs; 197 nilfs = sbi->s_nilfs;
196 down_read(&nilfs->ns_segctor_sem); 198 down_read(&nilfs->ns_segctor_sem);
@@ -366,8 +368,7 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
366 368
367 if (nilfs_doing_gc()) 369 if (nilfs_doing_gc())
368 flags = NILFS_SS_GC; 370 flags = NILFS_SS_GC;
369 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, 371 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno);
370 sci->sc_sbi->s_nilfs->ns_cno);
371 if (unlikely(err)) 372 if (unlikely(err))
372 return err; 373 return err;
373 374
@@ -440,17 +441,26 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
440 struct nilfs_finfo *finfo; 441 struct nilfs_finfo *finfo;
441 struct nilfs_inode_info *ii; 442 struct nilfs_inode_info *ii;
442 struct nilfs_segment_buffer *segbuf; 443 struct nilfs_segment_buffer *segbuf;
444 __u64 cno;
443 445
444 if (sci->sc_blk_cnt == 0) 446 if (sci->sc_blk_cnt == 0)
445 return; 447 return;
446 448
447 ii = NILFS_I(inode); 449 ii = NILFS_I(inode);
450
451 if (test_bit(NILFS_I_GCINODE, &ii->i_state))
452 cno = ii->i_cno;
453 else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
454 cno = 0;
455 else
456 cno = sci->sc_cno;
457
448 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, 458 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
449 sizeof(*finfo)); 459 sizeof(*finfo));
450 finfo->fi_ino = cpu_to_le64(inode->i_ino); 460 finfo->fi_ino = cpu_to_le64(inode->i_ino);
451 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); 461 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
452 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); 462 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
453 finfo->fi_cno = cpu_to_le64(ii->i_cno); 463 finfo->fi_cno = cpu_to_le64(cno);
454 464
455 segbuf = sci->sc_curseg; 465 segbuf = sci->sc_curseg;
456 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + 466 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
@@ -755,12 +765,12 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
755 } 765 }
756} 766}
757 767
758static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi) 768static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
769 struct nilfs_root *root)
759{ 770{
760 struct the_nilfs *nilfs = sbi->s_nilfs;
761 int ret = 0; 771 int ret = 0;
762 772
763 if (nilfs_mdt_fetch_dirty(sbi->s_ifile)) 773 if (nilfs_mdt_fetch_dirty(root->ifile))
764 ret++; 774 ret++;
765 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) 775 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
766 ret++; 776 ret++;
@@ -785,7 +795,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
785 struct nilfs_sb_info *sbi = sci->sc_sbi; 795 struct nilfs_sb_info *sbi = sci->sc_sbi;
786 int ret = 0; 796 int ret = 0;
787 797
788 if (nilfs_test_metadata_dirty(sbi)) 798 if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root))
789 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 799 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
790 800
791 spin_lock(&sbi->s_inode_lock); 801 spin_lock(&sbi->s_inode_lock);
@@ -801,7 +811,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
801 struct nilfs_sb_info *sbi = sci->sc_sbi; 811 struct nilfs_sb_info *sbi = sci->sc_sbi;
802 struct the_nilfs *nilfs = sbi->s_nilfs; 812 struct the_nilfs *nilfs = sbi->s_nilfs;
803 813
804 nilfs_mdt_clear_dirty(sbi->s_ifile); 814 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
805 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 815 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
806 nilfs_mdt_clear_dirty(nilfs->ns_sufile); 816 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
807 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); 817 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
@@ -848,9 +858,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
848 raw_cp->cp_snapshot_list.ssl_next = 0; 858 raw_cp->cp_snapshot_list.ssl_next = 0;
849 raw_cp->cp_snapshot_list.ssl_prev = 0; 859 raw_cp->cp_snapshot_list.ssl_prev = 0;
850 raw_cp->cp_inodes_count = 860 raw_cp->cp_inodes_count =
851 cpu_to_le64(atomic_read(&sbi->s_inodes_count)); 861 cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
852 raw_cp->cp_blocks_count = 862 raw_cp->cp_blocks_count =
853 cpu_to_le64(atomic_read(&sbi->s_blocks_count)); 863 cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
854 raw_cp->cp_nblk_inc = 864 raw_cp->cp_nblk_inc =
855 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); 865 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
856 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); 866 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
@@ -861,7 +871,8 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
861 else 871 else
862 nilfs_checkpoint_set_minor(raw_cp); 872 nilfs_checkpoint_set_minor(raw_cp);
863 873
864 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); 874 nilfs_write_inode_common(sci->sc_root->ifile,
875 &raw_cp->cp_ifile_inode, 1);
865 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); 876 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
866 return 0; 877 return 0;
867 878
@@ -886,13 +897,12 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
886 } 897 }
887} 898}
888 899
889static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, 900static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
890 struct inode *ifile)
891{ 901{
892 struct nilfs_inode_info *ii; 902 struct nilfs_inode_info *ii;
893 903
894 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { 904 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
895 nilfs_fill_in_file_bmap(ifile, ii); 905 nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii);
896 set_bit(NILFS_I_COLLECTED, &ii->i_state); 906 set_bit(NILFS_I_COLLECTED, &ii->i_state);
897 } 907 }
898} 908}
@@ -1135,7 +1145,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1135 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; 1145 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1136 /* Fall through */ 1146 /* Fall through */
1137 case NILFS_ST_IFILE: 1147 case NILFS_ST_IFILE:
1138 err = nilfs_segctor_scan_file(sci, sbi->s_ifile, 1148 err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
1139 &nilfs_sc_file_ops); 1149 &nilfs_sc_file_ops);
1140 if (unlikely(err)) 1150 if (unlikely(err))
1141 break; 1151 break;
@@ -1599,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1599 kunmap_atomic(kaddr, KM_USER0); 1609 kunmap_atomic(kaddr, KM_USER0);
1600 1610
1601 if (!TestSetPageWriteback(clone_page)) 1611 if (!TestSetPageWriteback(clone_page))
1602 inc_zone_page_state(clone_page, NR_WRITEBACK); 1612 account_page_writeback(clone_page);
1603 unlock_page(clone_page); 1613 unlock_page(clone_page);
1604 1614
1605 return 0; 1615 return 0;
@@ -1900,6 +1910,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1900 set_buffer_uptodate(bh); 1910 set_buffer_uptodate(bh);
1901 clear_buffer_dirty(bh); 1911 clear_buffer_dirty(bh);
1902 clear_buffer_nilfs_volatile(bh); 1912 clear_buffer_nilfs_volatile(bh);
1913 clear_buffer_nilfs_redirected(bh);
1903 if (bh == segbuf->sb_super_root) { 1914 if (bh == segbuf->sb_super_root) {
1904 if (bh->b_page != bd_page) { 1915 if (bh->b_page != bd_page) {
1905 end_page_writeback(bd_page); 1916 end_page_writeback(bd_page);
@@ -1936,11 +1947,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1936 1947
1937 nilfs_drop_collected_inodes(&sci->sc_dirty_files); 1948 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
1938 1949
1939 if (nilfs_doing_gc()) { 1950 if (nilfs_doing_gc())
1940 nilfs_drop_collected_inodes(&sci->sc_gc_inodes); 1951 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
1941 if (update_sr) 1952 else
1942 nilfs_commit_gcdat_inode(nilfs);
1943 } else
1944 nilfs->ns_nongc_ctime = sci->sc_seg_ctime; 1953 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
1945 1954
1946 sci->sc_nblk_inc += sci->sc_nblk_this_inc; 1955 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
@@ -1976,7 +1985,7 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1976 struct nilfs_sb_info *sbi) 1985 struct nilfs_sb_info *sbi)
1977{ 1986{
1978 struct nilfs_inode_info *ii, *n; 1987 struct nilfs_inode_info *ii, *n;
1979 __u64 cno = sbi->s_nilfs->ns_cno; 1988 struct inode *ifile = sci->sc_root->ifile;
1980 1989
1981 spin_lock(&sbi->s_inode_lock); 1990 spin_lock(&sbi->s_inode_lock);
1982 retry: 1991 retry:
@@ -1987,14 +1996,14 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1987 1996
1988 spin_unlock(&sbi->s_inode_lock); 1997 spin_unlock(&sbi->s_inode_lock);
1989 err = nilfs_ifile_get_inode_block( 1998 err = nilfs_ifile_get_inode_block(
1990 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh); 1999 ifile, ii->vfs_inode.i_ino, &ibh);
1991 if (unlikely(err)) { 2000 if (unlikely(err)) {
1992 nilfs_warning(sbi->s_super, __func__, 2001 nilfs_warning(sbi->s_super, __func__,
1993 "failed to get inode block.\n"); 2002 "failed to get inode block.\n");
1994 return err; 2003 return err;
1995 } 2004 }
1996 nilfs_mdt_mark_buffer_dirty(ibh); 2005 nilfs_mdt_mark_buffer_dirty(ibh);
1997 nilfs_mdt_mark_dirty(sbi->s_ifile); 2006 nilfs_mdt_mark_dirty(ifile);
1998 spin_lock(&sbi->s_inode_lock); 2007 spin_lock(&sbi->s_inode_lock);
1999 if (likely(!ii->i_bh)) 2008 if (likely(!ii->i_bh))
2000 ii->i_bh = ibh; 2009 ii->i_bh = ibh;
@@ -2002,7 +2011,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2002 brelse(ibh); 2011 brelse(ibh);
2003 goto retry; 2012 goto retry;
2004 } 2013 }
2005 ii->i_cno = cno;
2006 2014
2007 clear_bit(NILFS_I_QUEUED, &ii->i_state); 2015 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2008 set_bit(NILFS_I_BUSY, &ii->i_state); 2016 set_bit(NILFS_I_BUSY, &ii->i_state);
@@ -2011,8 +2019,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2011 } 2019 }
2012 spin_unlock(&sbi->s_inode_lock); 2020 spin_unlock(&sbi->s_inode_lock);
2013 2021
2014 NILFS_I(sbi->s_ifile)->i_cno = cno;
2015
2016 return 0; 2022 return 0;
2017} 2023}
2018 2024
@@ -2021,19 +2027,13 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2021{ 2027{
2022 struct nilfs_transaction_info *ti = current->journal_info; 2028 struct nilfs_transaction_info *ti = current->journal_info;
2023 struct nilfs_inode_info *ii, *n; 2029 struct nilfs_inode_info *ii, *n;
2024 __u64 cno = sbi->s_nilfs->ns_cno;
2025 2030
2026 spin_lock(&sbi->s_inode_lock); 2031 spin_lock(&sbi->s_inode_lock);
2027 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 2032 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2028 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || 2033 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2029 test_bit(NILFS_I_DIRTY, &ii->i_state)) { 2034 test_bit(NILFS_I_DIRTY, &ii->i_state))
2030 /* The current checkpoint number (=nilfs->ns_cno) is
2031 changed between check-in and check-out only if the
2032 super root is written out. So, we can update i_cno
2033 for the inodes that remain in the dirty list. */
2034 ii->i_cno = cno;
2035 continue; 2035 continue;
2036 } 2036
2037 clear_bit(NILFS_I_BUSY, &ii->i_state); 2037 clear_bit(NILFS_I_BUSY, &ii->i_state);
2038 brelse(ii->i_bh); 2038 brelse(ii->i_bh);
2039 ii->i_bh = NULL; 2039 ii->i_bh = NULL;
@@ -2054,12 +2054,13 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2054 int err; 2054 int err;
2055 2055
2056 sci->sc_stage.scnt = NILFS_ST_INIT; 2056 sci->sc_stage.scnt = NILFS_ST_INIT;
2057 sci->sc_cno = nilfs->ns_cno;
2057 2058
2058 err = nilfs_segctor_check_in_files(sci, sbi); 2059 err = nilfs_segctor_check_in_files(sci, sbi);
2059 if (unlikely(err)) 2060 if (unlikely(err))
2060 goto out; 2061 goto out;
2061 2062
2062 if (nilfs_test_metadata_dirty(sbi)) 2063 if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
2063 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2064 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2064 2065
2065 if (nilfs_segctor_clean(sci)) 2066 if (nilfs_segctor_clean(sci))
@@ -2091,7 +2092,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2091 goto failed; 2092 goto failed;
2092 2093
2093 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2094 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2094 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); 2095 nilfs_segctor_fill_in_file_bmap(sci);
2095 2096
2096 if (mode == SC_LSEG_SR && 2097 if (mode == SC_LSEG_SR &&
2097 sci->sc_stage.scnt >= NILFS_ST_CPFILE) { 2098 sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
@@ -2452,9 +2453,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2452 list_for_each_entry_safe(ii, n, head, i_dirty) { 2453 list_for_each_entry_safe(ii, n, head, i_dirty) {
2453 if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) 2454 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2454 continue; 2455 continue;
2455 hlist_del_init(&ii->vfs_inode.i_hash);
2456 list_del_init(&ii->i_dirty); 2456 list_del_init(&ii->i_dirty);
2457 nilfs_clear_gcinode(&ii->vfs_inode); 2457 iput(&ii->vfs_inode);
2458 } 2458 }
2459} 2459}
2460 2460
@@ -2472,13 +2472,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2472 2472
2473 nilfs_transaction_lock(sbi, &ti, 1); 2473 nilfs_transaction_lock(sbi, &ti, 1);
2474 2474
2475 err = nilfs_init_gcdat_inode(nilfs); 2475 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
2476 if (unlikely(err)) 2476 if (unlikely(err))
2477 goto out_unlock; 2477 goto out_unlock;
2478 2478
2479 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); 2479 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
2480 if (unlikely(err)) 2480 if (unlikely(err)) {
2481 nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
2481 goto out_unlock; 2482 goto out_unlock;
2483 }
2482 2484
2483 sci->sc_freesegs = kbufs[4]; 2485 sci->sc_freesegs = kbufs[4];
2484 sci->sc_nfreesegs = argv[4].v_nmembs; 2486 sci->sc_nfreesegs = argv[4].v_nmembs;
@@ -2510,7 +2512,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2510 out_unlock: 2512 out_unlock:
2511 sci->sc_freesegs = NULL; 2513 sci->sc_freesegs = NULL;
2512 sci->sc_nfreesegs = 0; 2514 sci->sc_nfreesegs = 0;
2513 nilfs_clear_gcdat_inode(nilfs); 2515 nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
2514 nilfs_transaction_unlock(sbi); 2516 nilfs_transaction_unlock(sbi);
2515 return err; 2517 return err;
2516} 2518}
@@ -2672,6 +2674,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2672} 2674}
2673 2675
2674static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) 2676static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2677 __acquires(&sci->sc_state_lock)
2678 __releases(&sci->sc_state_lock)
2675{ 2679{
2676 sci->sc_state |= NILFS_SEGCTOR_QUIT; 2680 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2677 2681
@@ -2686,7 +2690,8 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2686/* 2690/*
2687 * Setup & clean-up functions 2691 * Setup & clean-up functions
2688 */ 2692 */
2689static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) 2693static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
2694 struct nilfs_root *root)
2690{ 2695{
2691 struct nilfs_sc_info *sci; 2696 struct nilfs_sc_info *sci;
2692 2697
@@ -2697,6 +2702,9 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2697 sci->sc_sbi = sbi; 2702 sci->sc_sbi = sbi;
2698 sci->sc_super = sbi->s_super; 2703 sci->sc_super = sbi->s_super;
2699 2704
2705 nilfs_get_root(root);
2706 sci->sc_root = root;
2707
2700 init_waitqueue_head(&sci->sc_wait_request); 2708 init_waitqueue_head(&sci->sc_wait_request);
2701 init_waitqueue_head(&sci->sc_wait_daemon); 2709 init_waitqueue_head(&sci->sc_wait_daemon);
2702 init_waitqueue_head(&sci->sc_wait_task); 2710 init_waitqueue_head(&sci->sc_wait_task);
@@ -2771,6 +2779,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2771 WARN_ON(!list_empty(&sci->sc_segbufs)); 2779 WARN_ON(!list_empty(&sci->sc_segbufs));
2772 WARN_ON(!list_empty(&sci->sc_write_logs)); 2780 WARN_ON(!list_empty(&sci->sc_write_logs));
2773 2781
2782 nilfs_put_root(sci->sc_root);
2783
2774 down_write(&sbi->s_nilfs->ns_segctor_sem); 2784 down_write(&sbi->s_nilfs->ns_segctor_sem);
2775 2785
2776 del_timer_sync(&sci->sc_timer); 2786 del_timer_sync(&sci->sc_timer);
@@ -2780,6 +2790,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2780/** 2790/**
2781 * nilfs_attach_segment_constructor - attach a segment constructor 2791 * nilfs_attach_segment_constructor - attach a segment constructor
2782 * @sbi: nilfs_sb_info 2792 * @sbi: nilfs_sb_info
2793 * @root: root object of the current filesystem tree
2783 * 2794 *
2784 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2795 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2785 * initializes it, and starts the segment constructor. 2796 * initializes it, and starts the segment constructor.
@@ -2789,9 +2800,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2789 * 2800 *
2790 * %-ENOMEM - Insufficient memory available. 2801 * %-ENOMEM - Insufficient memory available.
2791 */ 2802 */
2792int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) 2803int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
2804 struct nilfs_root *root)
2793{ 2805{
2794 struct the_nilfs *nilfs = sbi->s_nilfs;
2795 int err; 2806 int err;
2796 2807
2797 if (NILFS_SC(sbi)) { 2808 if (NILFS_SC(sbi)) {
@@ -2803,14 +2814,12 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2803 nilfs_detach_segment_constructor(sbi); 2814 nilfs_detach_segment_constructor(sbi);
2804 } 2815 }
2805 2816
2806 sbi->s_sc_info = nilfs_segctor_new(sbi); 2817 sbi->s_sc_info = nilfs_segctor_new(sbi, root);
2807 if (!sbi->s_sc_info) 2818 if (!sbi->s_sc_info)
2808 return -ENOMEM; 2819 return -ENOMEM;
2809 2820
2810 nilfs_attach_writer(nilfs, sbi);
2811 err = nilfs_segctor_start_thread(NILFS_SC(sbi)); 2821 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2812 if (err) { 2822 if (err) {
2813 nilfs_detach_writer(nilfs, sbi);
2814 kfree(sbi->s_sc_info); 2823 kfree(sbi->s_sc_info);
2815 sbi->s_sc_info = NULL; 2824 sbi->s_sc_info = NULL;
2816 } 2825 }
@@ -2847,5 +2856,4 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2847 up_write(&nilfs->ns_segctor_sem); 2856 up_write(&nilfs->ns_segctor_sem);
2848 2857
2849 nilfs_dispose_list(sbi, &garbage_list, 1); 2858 nilfs_dispose_list(sbi, &garbage_list, 1);
2850 nilfs_detach_writer(nilfs, sbi);
2851} 2859}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 17c487bd8152..cd8056e7cbed 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -29,6 +29,8 @@
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "sb.h" 30#include "sb.h"
31 31
32struct nilfs_root;
33
32/** 34/**
33 * struct nilfs_recovery_info - Recovery information 35 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 36 * @ri_need_recovery: Recovery status
@@ -87,6 +89,7 @@ struct nilfs_segsum_pointer {
87 * struct nilfs_sc_info - Segment constructor information 89 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct 90 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct 91 * @sc_sbi: Back pointer to nilfs_sb_info struct
92 * @sc_root: root object of the current filesystem tree
90 * @sc_nblk_inc: Block count of current generation 93 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written 94 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written 95 * @sc_gc_inodes: List of GC inodes having blocks to be written
@@ -107,6 +110,7 @@ struct nilfs_segsum_pointer {
107 * @sc_datablk_cnt: Data block count of a file 110 * @sc_datablk_cnt: Data block count of a file
108 * @sc_nblk_this_inc: Number of blocks included in the current logical segment 111 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
109 * @sc_seg_ctime: Creation time 112 * @sc_seg_ctime: Creation time
113 * @sc_cno: checkpoint number of current log
110 * @sc_flags: Internal flags 114 * @sc_flags: Internal flags
111 * @sc_state_lock: spinlock for sc_state and so on 115 * @sc_state_lock: spinlock for sc_state and so on
112 * @sc_state: Segctord state flags 116 * @sc_state: Segctord state flags
@@ -128,6 +132,7 @@ struct nilfs_segsum_pointer {
128struct nilfs_sc_info { 132struct nilfs_sc_info {
129 struct super_block *sc_super; 133 struct super_block *sc_super;
130 struct nilfs_sb_info *sc_sbi; 134 struct nilfs_sb_info *sc_sbi;
135 struct nilfs_root *sc_root;
131 136
132 unsigned long sc_nblk_inc; 137 unsigned long sc_nblk_inc;
133 138
@@ -156,7 +161,7 @@ struct nilfs_sc_info {
156 unsigned long sc_datablk_cnt; 161 unsigned long sc_datablk_cnt;
157 unsigned long sc_nblk_this_inc; 162 unsigned long sc_nblk_this_inc;
158 time_t sc_seg_ctime; 163 time_t sc_seg_ctime;
159 164 __u64 sc_cno;
160 unsigned long sc_flags; 165 unsigned long sc_flags;
161 166
162 spinlock_t sc_state_lock; 167 spinlock_t sc_state_lock;
@@ -230,7 +235,8 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
230extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 235extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
231 void **); 236 void **);
232 237
233extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); 238int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
239 struct nilfs_root *root);
234extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); 240extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
235 241
236/* recovery.c */ 242/* recovery.c */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3c6cc6005c2e..1d6f488ccae8 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -505,7 +505,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
505{ 505{
506 struct buffer_head *header_bh; 506 struct buffer_head *header_bh;
507 struct nilfs_sufile_header *header; 507 struct nilfs_sufile_header *header;
508 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 508 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
509 void *kaddr; 509 void *kaddr;
510 int ret; 510 int ret;
511 511
@@ -583,7 +583,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
583 struct nilfs_segment_usage *su; 583 struct nilfs_segment_usage *su;
584 struct nilfs_suinfo *si = buf; 584 struct nilfs_suinfo *si = buf;
585 size_t susz = NILFS_MDT(sufile)->mi_entry_size; 585 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
586 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 586 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
587 void *kaddr; 587 void *kaddr;
588 unsigned long nsegs, segusages_per_block; 588 unsigned long nsegs, segusages_per_block;
589 ssize_t n; 589 ssize_t n;
@@ -635,46 +635,55 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
635} 635}
636 636
637/** 637/**
638 * nilfs_sufile_read - read sufile inode 638 * nilfs_sufile_read - read or get sufile inode
639 * @sufile: sufile inode 639 * @sb: super block instance
640 * @susize: size of a segment usage entry
640 * @raw_inode: on-disk sufile inode 641 * @raw_inode: on-disk sufile inode
642 * @inodep: buffer to store the inode
641 */ 643 */
642int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode) 644int nilfs_sufile_read(struct super_block *sb, size_t susize,
645 struct nilfs_inode *raw_inode, struct inode **inodep)
643{ 646{
644 struct nilfs_sufile_info *sui = NILFS_SUI(sufile); 647 struct inode *sufile;
648 struct nilfs_sufile_info *sui;
645 struct buffer_head *header_bh; 649 struct buffer_head *header_bh;
646 struct nilfs_sufile_header *header; 650 struct nilfs_sufile_header *header;
647 void *kaddr; 651 void *kaddr;
648 int ret; 652 int err;
649 653
650 ret = nilfs_read_inode_common(sufile, raw_inode); 654 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
651 if (ret < 0) 655 if (unlikely(!sufile))
652 return ret; 656 return -ENOMEM;
657 if (!(sufile->i_state & I_NEW))
658 goto out;
653 659
654 ret = nilfs_sufile_get_header_block(sufile, &header_bh); 660 err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
655 if (!ret) { 661 if (err)
656 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 662 goto failed;
657 header = kaddr + bh_offset(header_bh);
658 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
659 kunmap_atomic(kaddr, KM_USER0);
660 brelse(header_bh);
661 }
662 return ret;
663}
664 663
665/** 664 nilfs_mdt_set_entry_size(sufile, susize,
666 * nilfs_sufile_new - create sufile 665 sizeof(struct nilfs_sufile_header));
667 * @nilfs: nilfs object 666
668 * @susize: size of a segment usage entry 667 err = nilfs_read_inode_common(sufile, raw_inode);
669 */ 668 if (err)
670struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize) 669 goto failed;
671{ 670
672 struct inode *sufile; 671 err = nilfs_sufile_get_header_block(sufile, &header_bh);
672 if (err)
673 goto failed;
673 674
674 sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO, 675 sui = NILFS_SUI(sufile);
675 sizeof(struct nilfs_sufile_info)); 676 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
676 if (sufile) 677 header = kaddr + bh_offset(header_bh);
677 nilfs_mdt_set_entry_size(sufile, susize, 678 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
678 sizeof(struct nilfs_sufile_header)); 679 kunmap_atomic(kaddr, KM_USER0);
679 return sufile; 680 brelse(header_bh);
681
682 unlock_new_inode(sufile);
683 out:
684 *inodep = sufile;
685 return 0;
686 failed:
687 iget_failed(sufile);
688 return err;
680} 689}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 15163b8aff7d..a943fbacb45b 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,7 +31,7 @@
31 31
32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) 32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
33{ 33{
34 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; 34 return NILFS_I_NILFS(sufile)->ns_nsegments;
35} 35}
36 36
37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); 37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
@@ -61,8 +61,8 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, 61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
62 struct buffer_head *); 62 struct buffer_head *);
63 63
64int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode); 64int nilfs_sufile_read(struct super_block *sb, size_t susize,
65struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize); 65 struct nilfs_inode *raw_inode, struct inode **inodep);
66 66
67/** 67/**
68 * nilfs_sufile_scrap - make a segment garbage 68 * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 922263393c76..f804d41ec9d3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -45,14 +45,13 @@
45#include <linux/parser.h> 45#include <linux/parser.h>
46#include <linux/random.h> 46#include <linux/random.h>
47#include <linux/crc32.h> 47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h> 48#include <linux/vfs.h>
50#include <linux/writeback.h> 49#include <linux/writeback.h>
51#include <linux/kobject.h> 50#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include <linux/seq_file.h> 51#include <linux/seq_file.h>
54#include <linux/mount.h> 52#include <linux/mount.h>
55#include "nilfs.h" 53#include "nilfs.h"
54#include "export.h"
56#include "mdt.h" 55#include "mdt.h"
57#include "alloc.h" 56#include "alloc.h"
58#include "btree.h" 57#include "btree.h"
@@ -69,11 +68,12 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
69 "(NILFS)"); 68 "(NILFS)");
70MODULE_LICENSE("GPL"); 69MODULE_LICENSE("GPL");
71 70
72struct kmem_cache *nilfs_inode_cachep; 71static struct kmem_cache *nilfs_inode_cachep;
73struct kmem_cache *nilfs_transaction_cachep; 72struct kmem_cache *nilfs_transaction_cachep;
74struct kmem_cache *nilfs_segbuf_cachep; 73struct kmem_cache *nilfs_segbuf_cachep;
75struct kmem_cache *nilfs_btree_path_cache; 74struct kmem_cache *nilfs_btree_path_cache;
76 75
76static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount);
77static int nilfs_remount(struct super_block *sb, int *flags, char *data); 77static int nilfs_remount(struct super_block *sb, int *flags, char *data);
78 78
79static void nilfs_set_error(struct nilfs_sb_info *sbi) 79static void nilfs_set_error(struct nilfs_sb_info *sbi)
@@ -147,7 +147,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
147} 147}
148 148
149 149
150struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) 150struct inode *nilfs_alloc_inode(struct super_block *sb)
151{ 151{
152 struct nilfs_inode_info *ii; 152 struct nilfs_inode_info *ii;
153 153
@@ -156,18 +156,20 @@ struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
156 return NULL; 156 return NULL;
157 ii->i_bh = NULL; 157 ii->i_bh = NULL;
158 ii->i_state = 0; 158 ii->i_state = 0;
159 ii->i_cno = 0;
159 ii->vfs_inode.i_version = 1; 160 ii->vfs_inode.i_version = 1;
160 nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi); 161 nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
161 return &ii->vfs_inode; 162 return &ii->vfs_inode;
162} 163}
163 164
164struct inode *nilfs_alloc_inode(struct super_block *sb)
165{
166 return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
167}
168
169void nilfs_destroy_inode(struct inode *inode) 165void nilfs_destroy_inode(struct inode *inode)
170{ 166{
167 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
168
169 if (mdi) {
170 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
171 kfree(mdi);
172 }
171 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 173 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
172} 174}
173 175
@@ -178,17 +180,9 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
178 180
179 retry: 181 retry:
180 set_buffer_dirty(nilfs->ns_sbh[0]); 182 set_buffer_dirty(nilfs->ns_sbh[0]);
181
182 if (nilfs_test_opt(sbi, BARRIER)) { 183 if (nilfs_test_opt(sbi, BARRIER)) {
183 err = __sync_dirty_buffer(nilfs->ns_sbh[0], 184 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
184 WRITE_SYNC | WRITE_BARRIER); 185 WRITE_SYNC | WRITE_FLUSH_FUA);
185 if (err == -EOPNOTSUPP) {
186 nilfs_warning(sbi->s_super, __func__,
187 "barrier-based sync failed. "
188 "disabling barriers\n");
189 nilfs_clear_opt(sbi, BARRIER);
190 goto retry;
191 }
192 } else { 186 } else {
193 err = sync_dirty_buffer(nilfs->ns_sbh[0]); 187 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
194 } 188 }
@@ -342,8 +336,6 @@ static void nilfs_put_super(struct super_block *sb)
342 struct nilfs_sb_info *sbi = NILFS_SB(sb); 336 struct nilfs_sb_info *sbi = NILFS_SB(sb);
343 struct the_nilfs *nilfs = sbi->s_nilfs; 337 struct the_nilfs *nilfs = sbi->s_nilfs;
344 338
345 lock_kernel();
346
347 nilfs_detach_segment_constructor(sbi); 339 nilfs_detach_segment_constructor(sbi);
348 340
349 if (!(sb->s_flags & MS_RDONLY)) { 341 if (!(sb->s_flags & MS_RDONLY)) {
@@ -351,18 +343,15 @@ static void nilfs_put_super(struct super_block *sb)
351 nilfs_cleanup_super(sbi); 343 nilfs_cleanup_super(sbi);
352 up_write(&nilfs->ns_sem); 344 up_write(&nilfs->ns_sem);
353 } 345 }
354 down_write(&nilfs->ns_super_sem);
355 if (nilfs->ns_current == sbi)
356 nilfs->ns_current = NULL;
357 up_write(&nilfs->ns_super_sem);
358 346
359 nilfs_detach_checkpoint(sbi); 347 iput(nilfs->ns_sufile);
360 put_nilfs(sbi->s_nilfs); 348 iput(nilfs->ns_cpfile);
349 iput(nilfs->ns_dat);
350
351 destroy_nilfs(nilfs);
361 sbi->s_super = NULL; 352 sbi->s_super = NULL;
362 sb->s_fs_info = NULL; 353 sb->s_fs_info = NULL;
363 nilfs_put_sbinfo(sbi); 354 kfree(sbi);
364
365 unlock_kernel();
366} 355}
367 356
368static int nilfs_sync_fs(struct super_block *sb, int wait) 357static int nilfs_sync_fs(struct super_block *sb, int wait)
@@ -389,21 +378,22 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
389 return err; 378 return err;
390} 379}
391 380
392int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) 381int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
382 struct nilfs_root **rootp)
393{ 383{
394 struct the_nilfs *nilfs = sbi->s_nilfs; 384 struct the_nilfs *nilfs = sbi->s_nilfs;
385 struct nilfs_root *root;
395 struct nilfs_checkpoint *raw_cp; 386 struct nilfs_checkpoint *raw_cp;
396 struct buffer_head *bh_cp; 387 struct buffer_head *bh_cp;
397 int err; 388 int err = -ENOMEM;
398 389
399 down_write(&nilfs->ns_super_sem); 390 root = nilfs_find_or_create_root(
400 list_add(&sbi->s_list, &nilfs->ns_supers); 391 nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno);
401 up_write(&nilfs->ns_super_sem); 392 if (!root)
393 return err;
402 394
403 err = -ENOMEM; 395 if (root->ifile)
404 sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); 396 goto reuse; /* already attached checkpoint */
405 if (!sbi->s_ifile)
406 goto delist;
407 397
408 down_read(&nilfs->ns_segctor_sem); 398 down_read(&nilfs->ns_segctor_sem);
409 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 399 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -419,45 +409,64 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
419 } 409 }
420 goto failed; 410 goto failed;
421 } 411 }
422 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); 412
423 if (unlikely(err)) 413 err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size,
414 &raw_cp->cp_ifile_inode, &root->ifile);
415 if (err)
424 goto failed_bh; 416 goto failed_bh;
425 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); 417
426 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); 418 atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
419 atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
427 420
428 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 421 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
422
423 reuse:
424 *rootp = root;
429 return 0; 425 return 0;
430 426
431 failed_bh: 427 failed_bh:
432 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 428 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
433 failed: 429 failed:
434 nilfs_mdt_destroy(sbi->s_ifile); 430 nilfs_put_root(root);
435 sbi->s_ifile = NULL; 431
432 return err;
433}
434
435static int nilfs_freeze(struct super_block *sb)
436{
437 struct nilfs_sb_info *sbi = NILFS_SB(sb);
438 struct the_nilfs *nilfs = sbi->s_nilfs;
439 int err;
436 440
437 delist: 441 if (sb->s_flags & MS_RDONLY)
438 down_write(&nilfs->ns_super_sem); 442 return 0;
439 list_del_init(&sbi->s_list);
440 up_write(&nilfs->ns_super_sem);
441 443
444 /* Mark super block clean */
445 down_write(&nilfs->ns_sem);
446 err = nilfs_cleanup_super(sbi);
447 up_write(&nilfs->ns_sem);
442 return err; 448 return err;
443} 449}
444 450
445void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) 451static int nilfs_unfreeze(struct super_block *sb)
446{ 452{
453 struct nilfs_sb_info *sbi = NILFS_SB(sb);
447 struct the_nilfs *nilfs = sbi->s_nilfs; 454 struct the_nilfs *nilfs = sbi->s_nilfs;
448 455
449 nilfs_mdt_destroy(sbi->s_ifile); 456 if (sb->s_flags & MS_RDONLY)
450 sbi->s_ifile = NULL; 457 return 0;
451 down_write(&nilfs->ns_super_sem); 458
452 list_del_init(&sbi->s_list); 459 down_write(&nilfs->ns_sem);
453 up_write(&nilfs->ns_super_sem); 460 nilfs_setup_super(sbi, false);
461 up_write(&nilfs->ns_sem);
462 return 0;
454} 463}
455 464
456static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) 465static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
457{ 466{
458 struct super_block *sb = dentry->d_sb; 467 struct super_block *sb = dentry->d_sb;
459 struct nilfs_sb_info *sbi = NILFS_SB(sb); 468 struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
460 struct the_nilfs *nilfs = sbi->s_nilfs; 469 struct the_nilfs *nilfs = root->nilfs;
461 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 470 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
462 unsigned long long blocks; 471 unsigned long long blocks;
463 unsigned long overhead; 472 unsigned long overhead;
@@ -493,7 +502,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
493 buf->f_bfree = nfreeblocks; 502 buf->f_bfree = nfreeblocks;
494 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? 503 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
495 (buf->f_bfree - nrsvblocks) : 0; 504 (buf->f_bfree - nrsvblocks) : 0;
496 buf->f_files = atomic_read(&sbi->s_inodes_count); 505 buf->f_files = atomic_read(&root->inodes_count);
497 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ 506 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
498 buf->f_namelen = NILFS_NAME_LEN; 507 buf->f_namelen = NILFS_NAME_LEN;
499 buf->f_fsid.val[0] = (u32)id; 508 buf->f_fsid.val[0] = (u32)id;
@@ -506,12 +515,12 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
506{ 515{
507 struct super_block *sb = vfs->mnt_sb; 516 struct super_block *sb = vfs->mnt_sb;
508 struct nilfs_sb_info *sbi = NILFS_SB(sb); 517 struct nilfs_sb_info *sbi = NILFS_SB(sb);
518 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
509 519
510 if (!nilfs_test_opt(sbi, BARRIER)) 520 if (!nilfs_test_opt(sbi, BARRIER))
511 seq_puts(seq, ",nobarrier"); 521 seq_puts(seq, ",nobarrier");
512 if (nilfs_test_opt(sbi, SNAPSHOT)) 522 if (root->cno != NILFS_CPTREE_CURRENT_CNO)
513 seq_printf(seq, ",cp=%llu", 523 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
514 (unsigned long long int)sbi->s_snapshot_cno);
515 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 524 if (nilfs_test_opt(sbi, ERRORS_PANIC))
516 seq_puts(seq, ",errors=panic"); 525 seq_puts(seq, ",errors=panic");
517 if (nilfs_test_opt(sbi, ERRORS_CONT)) 526 if (nilfs_test_opt(sbi, ERRORS_CONT))
@@ -537,6 +546,8 @@ static const struct super_operations nilfs_sops = {
537 .put_super = nilfs_put_super, 546 .put_super = nilfs_put_super,
538 /* .write_super = nilfs_write_super, */ 547 /* .write_super = nilfs_write_super, */
539 .sync_fs = nilfs_sync_fs, 548 .sync_fs = nilfs_sync_fs,
549 .freeze_fs = nilfs_freeze,
550 .unfreeze_fs = nilfs_unfreeze,
540 /* .write_super_lockfs */ 551 /* .write_super_lockfs */
541 /* .unlockfs */ 552 /* .unlockfs */
542 .statfs = nilfs_statfs, 553 .statfs = nilfs_statfs,
@@ -545,48 +556,6 @@ static const struct super_operations nilfs_sops = {
545 .show_options = nilfs_show_options 556 .show_options = nilfs_show_options
546}; 557};
547 558
548static struct inode *
549nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
550{
551 struct inode *inode;
552
553 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
554 ino != NILFS_SKETCH_INO)
555 return ERR_PTR(-ESTALE);
556
557 inode = nilfs_iget(sb, ino);
558 if (IS_ERR(inode))
559 return ERR_CAST(inode);
560 if (generation && inode->i_generation != generation) {
561 iput(inode);
562 return ERR_PTR(-ESTALE);
563 }
564
565 return inode;
566}
567
568static struct dentry *
569nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
570 int fh_type)
571{
572 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
573 nilfs_nfs_get_inode);
574}
575
576static struct dentry *
577nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
578 int fh_type)
579{
580 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
581 nilfs_nfs_get_inode);
582}
583
584static const struct export_operations nilfs_export_ops = {
585 .fh_to_dentry = nilfs_fh_to_dentry,
586 .fh_to_parent = nilfs_fh_to_parent,
587 .get_parent = nilfs_get_parent,
588};
589
590enum { 559enum {
591 Opt_err_cont, Opt_err_panic, Opt_err_ro, 560 Opt_err_cont, Opt_err_panic, Opt_err_ro,
592 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 561 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
@@ -612,7 +581,6 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
612 struct nilfs_sb_info *sbi = NILFS_SB(sb); 581 struct nilfs_sb_info *sbi = NILFS_SB(sb);
613 char *p; 582 char *p;
614 substring_t args[MAX_OPT_ARGS]; 583 substring_t args[MAX_OPT_ARGS];
615 int option;
616 584
617 if (!options) 585 if (!options)
618 return 1; 586 return 1;
@@ -650,30 +618,12 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); 618 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
651 break; 619 break;
652 case Opt_snapshot: 620 case Opt_snapshot:
653 if (match_int(&args[0], &option) || option <= 0)
654 return 0;
655 if (is_remount) { 621 if (is_remount) {
656 if (!nilfs_test_opt(sbi, SNAPSHOT)) { 622 printk(KERN_ERR
657 printk(KERN_ERR 623 "NILFS: \"%s\" option is invalid "
658 "NILFS: cannot change regular " 624 "for remount.\n", p);
659 "mount to snapshot.\n");
660 return 0;
661 } else if (option != sbi->s_snapshot_cno) {
662 printk(KERN_ERR
663 "NILFS: cannot remount to a "
664 "different snapshot.\n");
665 return 0;
666 }
667 break;
668 }
669 if (!(sb->s_flags & MS_RDONLY)) {
670 printk(KERN_ERR "NILFS: cannot mount snapshot "
671 "read/write. A read-only option is "
672 "required.\n");
673 return 0; 625 return 0;
674 } 626 }
675 sbi->s_snapshot_cno = option;
676 nilfs_set_opt(sbi, SNAPSHOT);
677 break; 627 break;
678 case Opt_norecovery: 628 case Opt_norecovery:
679 nilfs_set_opt(sbi, NORECOVERY); 629 nilfs_set_opt(sbi, NORECOVERY);
@@ -701,7 +651,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
701 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; 651 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
702} 652}
703 653
704static int nilfs_setup_super(struct nilfs_sb_info *sbi) 654static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount)
705{ 655{
706 struct the_nilfs *nilfs = sbi->s_nilfs; 656 struct the_nilfs *nilfs = sbi->s_nilfs;
707 struct nilfs_super_block **sbp; 657 struct nilfs_super_block **sbp;
@@ -713,6 +663,9 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
713 if (!sbp) 663 if (!sbp)
714 return -EIO; 664 return -EIO;
715 665
666 if (!is_mount)
667 goto skip_mount_setup;
668
716 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); 669 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
717 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); 670 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
718 671
@@ -729,9 +682,11 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
729 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); 682 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
730 683
731 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); 684 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
685 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
686
687skip_mount_setup:
732 sbp[0]->s_state = 688 sbp[0]->s_state =
733 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); 689 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
734 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
735 /* synchronize sbp[1] with sbp[0] */ 690 /* synchronize sbp[1] with sbp[0] */
736 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 691 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
737 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 692 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
@@ -798,22 +753,156 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
798 return 0; 753 return 0;
799} 754}
800 755
756static int nilfs_get_root_dentry(struct super_block *sb,
757 struct nilfs_root *root,
758 struct dentry **root_dentry)
759{
760 struct inode *inode;
761 struct dentry *dentry;
762 int ret = 0;
763
764 inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
765 if (IS_ERR(inode)) {
766 printk(KERN_ERR "NILFS: get root inode failed\n");
767 ret = PTR_ERR(inode);
768 goto out;
769 }
770 if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
771 iput(inode);
772 printk(KERN_ERR "NILFS: corrupt root inode.\n");
773 ret = -EINVAL;
774 goto out;
775 }
776
777 if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
778 dentry = d_find_alias(inode);
779 if (!dentry) {
780 dentry = d_alloc_root(inode);
781 if (!dentry) {
782 iput(inode);
783 ret = -ENOMEM;
784 goto failed_dentry;
785 }
786 } else {
787 iput(inode);
788 }
789 } else {
790 dentry = d_obtain_alias(inode);
791 if (IS_ERR(dentry)) {
792 ret = PTR_ERR(dentry);
793 goto failed_dentry;
794 }
795 }
796 *root_dentry = dentry;
797 out:
798 return ret;
799
800 failed_dentry:
801 printk(KERN_ERR "NILFS: get root dentry failed\n");
802 goto out;
803}
804
805static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
806 struct dentry **root_dentry)
807{
808 struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs;
809 struct nilfs_root *root;
810 int ret;
811
812 down_read(&nilfs->ns_segctor_sem);
813 ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno);
814 up_read(&nilfs->ns_segctor_sem);
815 if (ret < 0) {
816 ret = (ret == -ENOENT) ? -EINVAL : ret;
817 goto out;
818 } else if (!ret) {
819 printk(KERN_ERR "NILFS: The specified checkpoint is "
820 "not a snapshot (checkpoint number=%llu).\n",
821 (unsigned long long)cno);
822 ret = -EINVAL;
823 goto out;
824 }
825
826 ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root);
827 if (ret) {
828 printk(KERN_ERR "NILFS: error loading snapshot "
829 "(checkpoint number=%llu).\n",
830 (unsigned long long)cno);
831 goto out;
832 }
833 ret = nilfs_get_root_dentry(s, root, root_dentry);
834 nilfs_put_root(root);
835 out:
836 return ret;
837}
838
839static int nilfs_tree_was_touched(struct dentry *root_dentry)
840{
841 return atomic_read(&root_dentry->d_count) > 1;
842}
843
844/**
845 * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint
846 * @root_dentry: root dentry of the tree to be shrunk
847 *
848 * This function returns true if the tree was in-use.
849 */
850static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
851{
852 if (have_submounts(root_dentry))
853 return true;
854 shrink_dcache_parent(root_dentry);
855 return nilfs_tree_was_touched(root_dentry);
856}
857
858int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
859{
860 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
861 struct nilfs_root *root;
862 struct inode *inode;
863 struct dentry *dentry;
864 int ret;
865
866 if (cno < 0 || cno > nilfs->ns_cno)
867 return false;
868
869 if (cno >= nilfs_last_cno(nilfs))
870 return true; /* protect recent checkpoints */
871
872 ret = false;
873 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
874 if (root) {
875 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
876 if (inode) {
877 dentry = d_find_alias(inode);
878 if (dentry) {
879 if (nilfs_tree_was_touched(dentry))
880 ret = nilfs_try_to_shrink_tree(dentry);
881 dput(dentry);
882 }
883 iput(inode);
884 }
885 nilfs_put_root(root);
886 }
887 return ret;
888}
889
801/** 890/**
802 * nilfs_fill_super() - initialize a super block instance 891 * nilfs_fill_super() - initialize a super block instance
803 * @sb: super_block 892 * @sb: super_block
804 * @data: mount options 893 * @data: mount options
805 * @silent: silent mode flag 894 * @silent: silent mode flag
806 * @nilfs: the_nilfs struct
807 * 895 *
808 * This function is called exclusively by nilfs->ns_mount_mutex. 896 * This function is called exclusively by nilfs->ns_mount_mutex.
809 * So, the recovery process is protected from other simultaneous mounts. 897 * So, the recovery process is protected from other simultaneous mounts.
810 */ 898 */
811static int 899static int
812nilfs_fill_super(struct super_block *sb, void *data, int silent, 900nilfs_fill_super(struct super_block *sb, void *data, int silent)
813 struct the_nilfs *nilfs)
814{ 901{
902 struct the_nilfs *nilfs;
815 struct nilfs_sb_info *sbi; 903 struct nilfs_sb_info *sbi;
816 struct inode *root; 904 struct nilfs_root *fsroot;
905 struct backing_dev_info *bdi;
817 __u64 cno; 906 __u64 cno;
818 int err; 907 int err;
819 908
@@ -822,19 +911,21 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
822 return -ENOMEM; 911 return -ENOMEM;
823 912
824 sb->s_fs_info = sbi; 913 sb->s_fs_info = sbi;
914 sbi->s_super = sb;
825 915
826 get_nilfs(nilfs); 916 nilfs = alloc_nilfs(sb->s_bdev);
917 if (!nilfs) {
918 err = -ENOMEM;
919 goto failed_sbi;
920 }
827 sbi->s_nilfs = nilfs; 921 sbi->s_nilfs = nilfs;
828 sbi->s_super = sb;
829 atomic_set(&sbi->s_count, 1);
830 922
831 err = init_nilfs(nilfs, sbi, (char *)data); 923 err = init_nilfs(nilfs, sbi, (char *)data);
832 if (err) 924 if (err)
833 goto failed_sbi; 925 goto failed_nilfs;
834 926
835 spin_lock_init(&sbi->s_inode_lock); 927 spin_lock_init(&sbi->s_inode_lock);
836 INIT_LIST_HEAD(&sbi->s_dirty_files); 928 INIT_LIST_HEAD(&sbi->s_dirty_files);
837 INIT_LIST_HEAD(&sbi->s_list);
838 929
839 /* 930 /*
840 * Following initialization is overlapped because 931 * Following initialization is overlapped because
@@ -850,94 +941,59 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
850 sb->s_export_op = &nilfs_export_ops; 941 sb->s_export_op = &nilfs_export_ops;
851 sb->s_root = NULL; 942 sb->s_root = NULL;
852 sb->s_time_gran = 1; 943 sb->s_time_gran = 1;
853 sb->s_bdi = nilfs->ns_bdi; 944
945 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
946 sb->s_bdi = bdi ? : &default_backing_dev_info;
854 947
855 err = load_nilfs(nilfs, sbi); 948 err = load_nilfs(nilfs, sbi);
856 if (err) 949 if (err)
857 goto failed_sbi; 950 goto failed_nilfs;
858 951
859 cno = nilfs_last_cno(nilfs); 952 cno = nilfs_last_cno(nilfs);
860 953 err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
861 if (sb->s_flags & MS_RDONLY) {
862 if (nilfs_test_opt(sbi, SNAPSHOT)) {
863 down_read(&nilfs->ns_segctor_sem);
864 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
865 sbi->s_snapshot_cno);
866 up_read(&nilfs->ns_segctor_sem);
867 if (err < 0) {
868 if (err == -ENOENT)
869 err = -EINVAL;
870 goto failed_sbi;
871 }
872 if (!err) {
873 printk(KERN_ERR
874 "NILFS: The specified checkpoint is "
875 "not a snapshot "
876 "(checkpoint number=%llu).\n",
877 (unsigned long long)sbi->s_snapshot_cno);
878 err = -EINVAL;
879 goto failed_sbi;
880 }
881 cno = sbi->s_snapshot_cno;
882 }
883 }
884
885 err = nilfs_attach_checkpoint(sbi, cno);
886 if (err) { 954 if (err) {
887 printk(KERN_ERR "NILFS: error loading a checkpoint" 955 printk(KERN_ERR "NILFS: error loading last checkpoint "
888 " (checkpoint number=%llu).\n", (unsigned long long)cno); 956 "(checkpoint number=%llu).\n", (unsigned long long)cno);
889 goto failed_sbi; 957 goto failed_unload;
890 } 958 }
891 959
892 if (!(sb->s_flags & MS_RDONLY)) { 960 if (!(sb->s_flags & MS_RDONLY)) {
893 err = nilfs_attach_segment_constructor(sbi); 961 err = nilfs_attach_segment_constructor(sbi, fsroot);
894 if (err) 962 if (err)
895 goto failed_checkpoint; 963 goto failed_checkpoint;
896 } 964 }
897 965
898 root = nilfs_iget(sb, NILFS_ROOT_INO); 966 err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root);
899 if (IS_ERR(root)) { 967 if (err)
900 printk(KERN_ERR "NILFS: get root inode failed\n");
901 err = PTR_ERR(root);
902 goto failed_segctor;
903 }
904 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
905 iput(root);
906 printk(KERN_ERR "NILFS: corrupt root inode.\n");
907 err = -EINVAL;
908 goto failed_segctor;
909 }
910 sb->s_root = d_alloc_root(root);
911 if (!sb->s_root) {
912 iput(root);
913 printk(KERN_ERR "NILFS: get root dentry failed\n");
914 err = -ENOMEM;
915 goto failed_segctor; 968 goto failed_segctor;
916 } 969
970 nilfs_put_root(fsroot);
917 971
918 if (!(sb->s_flags & MS_RDONLY)) { 972 if (!(sb->s_flags & MS_RDONLY)) {
919 down_write(&nilfs->ns_sem); 973 down_write(&nilfs->ns_sem);
920 nilfs_setup_super(sbi); 974 nilfs_setup_super(sbi, true);
921 up_write(&nilfs->ns_sem); 975 up_write(&nilfs->ns_sem);
922 } 976 }
923 977
924 down_write(&nilfs->ns_super_sem);
925 if (!nilfs_test_opt(sbi, SNAPSHOT))
926 nilfs->ns_current = sbi;
927 up_write(&nilfs->ns_super_sem);
928
929 return 0; 978 return 0;
930 979
931 failed_segctor: 980 failed_segctor:
932 nilfs_detach_segment_constructor(sbi); 981 nilfs_detach_segment_constructor(sbi);
933 982
934 failed_checkpoint: 983 failed_checkpoint:
935 nilfs_detach_checkpoint(sbi); 984 nilfs_put_root(fsroot);
985
986 failed_unload:
987 iput(nilfs->ns_sufile);
988 iput(nilfs->ns_cpfile);
989 iput(nilfs->ns_dat);
990
991 failed_nilfs:
992 destroy_nilfs(nilfs);
936 993
937 failed_sbi: 994 failed_sbi:
938 put_nilfs(nilfs);
939 sb->s_fs_info = NULL; 995 sb->s_fs_info = NULL;
940 nilfs_put_sbinfo(sbi); 996 kfree(sbi);
941 return err; 997 return err;
942} 998}
943 999
@@ -947,15 +1003,10 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
947 struct the_nilfs *nilfs = sbi->s_nilfs; 1003 struct the_nilfs *nilfs = sbi->s_nilfs;
948 unsigned long old_sb_flags; 1004 unsigned long old_sb_flags;
949 struct nilfs_mount_options old_opts; 1005 struct nilfs_mount_options old_opts;
950 int was_snapshot, err; 1006 int err;
951
952 lock_kernel();
953 1007
954 down_write(&nilfs->ns_super_sem);
955 old_sb_flags = sb->s_flags; 1008 old_sb_flags = sb->s_flags;
956 old_opts.mount_opt = sbi->s_mount_opt; 1009 old_opts.mount_opt = sbi->s_mount_opt;
957 old_opts.snapshot_cno = sbi->s_snapshot_cno;
958 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
959 1010
960 if (!parse_options(data, sb, 1)) { 1011 if (!parse_options(data, sb, 1)) {
961 err = -EINVAL; 1012 err = -EINVAL;
@@ -964,11 +1015,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
964 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 1015 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
965 1016
966 err = -EINVAL; 1017 err = -EINVAL;
967 if (was_snapshot && !(*flags & MS_RDONLY)) {
968 printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
969 "read/write.\n", sb->s_id);
970 goto restore_opts;
971 }
972 1018
973 if (!nilfs_valid_fs(nilfs)) { 1019 if (!nilfs_valid_fs(nilfs)) {
974 printk(KERN_WARNING "NILFS (device %s): couldn't " 1020 printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -993,6 +1039,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
993 up_write(&nilfs->ns_sem); 1039 up_write(&nilfs->ns_sem);
994 } else { 1040 } else {
995 __u64 features; 1041 __u64 features;
1042 struct nilfs_root *root;
996 1043
997 /* 1044 /*
998 * Mounting a RDONLY partition read-write, so reread and 1045 * Mounting a RDONLY partition read-write, so reread and
@@ -1014,25 +1061,21 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1014 1061
1015 sb->s_flags &= ~MS_RDONLY; 1062 sb->s_flags &= ~MS_RDONLY;
1016 1063
1017 err = nilfs_attach_segment_constructor(sbi); 1064 root = NILFS_I(sb->s_root->d_inode)->i_root;
1065 err = nilfs_attach_segment_constructor(sbi, root);
1018 if (err) 1066 if (err)
1019 goto restore_opts; 1067 goto restore_opts;
1020 1068
1021 down_write(&nilfs->ns_sem); 1069 down_write(&nilfs->ns_sem);
1022 nilfs_setup_super(sbi); 1070 nilfs_setup_super(sbi, true);
1023 up_write(&nilfs->ns_sem); 1071 up_write(&nilfs->ns_sem);
1024 } 1072 }
1025 out: 1073 out:
1026 up_write(&nilfs->ns_super_sem);
1027 unlock_kernel();
1028 return 0; 1074 return 0;
1029 1075
1030 restore_opts: 1076 restore_opts:
1031 sb->s_flags = old_sb_flags; 1077 sb->s_flags = old_sb_flags;
1032 sbi->s_mount_opt = old_opts.mount_opt; 1078 sbi->s_mount_opt = old_opts.mount_opt;
1033 sbi->s_snapshot_cno = old_opts.snapshot_cno;
1034 up_write(&nilfs->ns_super_sem);
1035 unlock_kernel();
1036 return err; 1079 return err;
1037} 1080}
1038 1081
@@ -1052,7 +1095,7 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1052{ 1095{
1053 char *p, *options = data; 1096 char *p, *options = data;
1054 substring_t args[MAX_OPT_ARGS]; 1097 substring_t args[MAX_OPT_ARGS];
1055 int option, token; 1098 int token;
1056 int ret = 0; 1099 int ret = 0;
1057 1100
1058 do { 1101 do {
@@ -1060,16 +1103,18 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1060 if (p != NULL && *p) { 1103 if (p != NULL && *p) {
1061 token = match_token(p, tokens, args); 1104 token = match_token(p, tokens, args);
1062 if (token == Opt_snapshot) { 1105 if (token == Opt_snapshot) {
1063 if (!(sd->flags & MS_RDONLY)) 1106 if (!(sd->flags & MS_RDONLY)) {
1064 ret++; 1107 ret++;
1065 else { 1108 } else {
1066 ret = match_int(&args[0], &option); 1109 sd->cno = simple_strtoull(args[0].from,
1067 if (!ret) { 1110 NULL, 0);
1068 if (option > 0) 1111 /*
1069 sd->cno = option; 1112 * No need to see the end pointer;
1070 else 1113 * match_token() has done syntax
1071 ret++; 1114 * checking.
1072 } 1115 */
1116 if (sd->cno == 0)
1117 ret++;
1073 } 1118 }
1074 } 1119 }
1075 if (ret) 1120 if (ret)
@@ -1086,43 +1131,33 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1086 1131
1087static int nilfs_set_bdev_super(struct super_block *s, void *data) 1132static int nilfs_set_bdev_super(struct super_block *s, void *data)
1088{ 1133{
1089 struct nilfs_super_data *sd = data; 1134 s->s_bdev = data;
1090
1091 s->s_bdev = sd->bdev;
1092 s->s_dev = s->s_bdev->bd_dev; 1135 s->s_dev = s->s_bdev->bd_dev;
1093 return 0; 1136 return 0;
1094} 1137}
1095 1138
1096static int nilfs_test_bdev_super(struct super_block *s, void *data) 1139static int nilfs_test_bdev_super(struct super_block *s, void *data)
1097{ 1140{
1098 struct nilfs_super_data *sd = data; 1141 return (void *)s->s_bdev == data;
1099
1100 return sd->sbi && s->s_fs_info == (void *)sd->sbi;
1101} 1142}
1102 1143
1103static int 1144static struct dentry *
1104nilfs_get_sb(struct file_system_type *fs_type, int flags, 1145nilfs_mount(struct file_system_type *fs_type, int flags,
1105 const char *dev_name, void *data, struct vfsmount *mnt) 1146 const char *dev_name, void *data)
1106{ 1147{
1107 struct nilfs_super_data sd; 1148 struct nilfs_super_data sd;
1108 struct super_block *s; 1149 struct super_block *s;
1109 fmode_t mode = FMODE_READ; 1150 fmode_t mode = FMODE_READ;
1110 struct the_nilfs *nilfs; 1151 struct dentry *root_dentry;
1111 int err, need_to_close = 1; 1152 int err, s_new = false;
1112 1153
1113 if (!(flags & MS_RDONLY)) 1154 if (!(flags & MS_RDONLY))
1114 mode |= FMODE_WRITE; 1155 mode |= FMODE_WRITE;
1115 1156
1116 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1157 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1117 if (IS_ERR(sd.bdev)) 1158 if (IS_ERR(sd.bdev))
1118 return PTR_ERR(sd.bdev); 1159 return ERR_CAST(sd.bdev);
1119 1160
1120 /*
1121 * To get mount instance using sget() vfs-routine, NILFS needs
1122 * much more information than normal filesystems to identify mount
1123 * instance. For snapshot mounts, not only a mount type (ro-mount
1124 * or rw-mount) but also a checkpoint number is required.
1125 */
1126 sd.cno = 0; 1161 sd.cno = 0;
1127 sd.flags = flags; 1162 sd.flags = flags;
1128 if (nilfs_identify((char *)data, &sd)) { 1163 if (nilfs_identify((char *)data, &sd)) {
@@ -1130,101 +1165,91 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1130 goto failed; 1165 goto failed;
1131 } 1166 }
1132 1167
1133 nilfs = find_or_create_nilfs(sd.bdev);
1134 if (!nilfs) {
1135 err = -ENOMEM;
1136 goto failed;
1137 }
1138
1139 mutex_lock(&nilfs->ns_mount_mutex);
1140
1141 if (!sd.cno) {
1142 /*
1143 * Check if an exclusive mount exists or not.
1144 * Snapshot mounts coexist with a current mount
1145 * (i.e. rw-mount or ro-mount), whereas rw-mount and
1146 * ro-mount are mutually exclusive.
1147 */
1148 down_read(&nilfs->ns_super_sem);
1149 if (nilfs->ns_current &&
1150 ((nilfs->ns_current->s_super->s_flags ^ flags)
1151 & MS_RDONLY)) {
1152 up_read(&nilfs->ns_super_sem);
1153 err = -EBUSY;
1154 goto failed_unlock;
1155 }
1156 up_read(&nilfs->ns_super_sem);
1157 }
1158
1159 /*
1160 * Find existing nilfs_sb_info struct
1161 */
1162 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1163
1164 /* 1168 /*
1165 * Get super block instance holding the nilfs_sb_info struct. 1169 * once the super is inserted into the list by sget, s_umount
1166 * A new instance is allocated if no existing mount is present or 1170 * will protect the lockfs code from trying to start a snapshot
1167 * existing instance has been unmounted. 1171 * while we are mounting
1168 */ 1172 */
1169 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); 1173 mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
1170 if (sd.sbi) 1174 if (sd.bdev->bd_fsfreeze_count > 0) {
1171 nilfs_put_sbinfo(sd.sbi); 1175 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1172 1176 err = -EBUSY;
1177 goto failed;
1178 }
1179 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
1180 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1173 if (IS_ERR(s)) { 1181 if (IS_ERR(s)) {
1174 err = PTR_ERR(s); 1182 err = PTR_ERR(s);
1175 goto failed_unlock; 1183 goto failed;
1176 } 1184 }
1177 1185
1178 if (!s->s_root) { 1186 if (!s->s_root) {
1179 char b[BDEVNAME_SIZE]; 1187 char b[BDEVNAME_SIZE];
1180 1188
1189 s_new = true;
1190
1181 /* New superblock instance created */ 1191 /* New superblock instance created */
1182 s->s_flags = flags; 1192 s->s_flags = flags;
1183 s->s_mode = mode; 1193 s->s_mode = mode;
1184 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1194 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1185 sb_set_blocksize(s, block_size(sd.bdev)); 1195 sb_set_blocksize(s, block_size(sd.bdev));
1186 1196
1187 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, 1197 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1188 nilfs);
1189 if (err) 1198 if (err)
1190 goto cancel_new; 1199 goto failed_super;
1191 1200
1192 s->s_flags |= MS_ACTIVE; 1201 s->s_flags |= MS_ACTIVE;
1193 need_to_close = 0; 1202 } else if (!sd.cno) {
1203 int busy = false;
1204
1205 if (nilfs_tree_was_touched(s->s_root)) {
1206 busy = nilfs_try_to_shrink_tree(s->s_root);
1207 if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
1208 printk(KERN_ERR "NILFS: the device already "
1209 "has a %s mount.\n",
1210 (s->s_flags & MS_RDONLY) ?
1211 "read-only" : "read/write");
1212 err = -EBUSY;
1213 goto failed_super;
1214 }
1215 }
1216 if (!busy) {
1217 /*
1218 * Try remount to setup mount states if the current
1219 * tree is not mounted and only snapshots use this sb.
1220 */
1221 err = nilfs_remount(s, &flags, data);
1222 if (err)
1223 goto failed_super;
1224 }
1194 } 1225 }
1195 1226
1196 mutex_unlock(&nilfs->ns_mount_mutex); 1227 if (sd.cno) {
1197 put_nilfs(nilfs); 1228 err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
1198 if (need_to_close) 1229 if (err)
1199 close_bdev_exclusive(sd.bdev, mode); 1230 goto failed_super;
1200 simple_set_mnt(mnt, s); 1231 } else {
1201 return 0; 1232 root_dentry = dget(s->s_root);
1233 }
1202 1234
1203 failed_unlock: 1235 if (!s_new)
1204 mutex_unlock(&nilfs->ns_mount_mutex); 1236 close_bdev_exclusive(sd.bdev, mode);
1205 put_nilfs(nilfs);
1206 failed:
1207 close_bdev_exclusive(sd.bdev, mode);
1208 1237
1209 return err; 1238 return root_dentry;
1210 1239
1211 cancel_new: 1240 failed_super:
1212 /* Abandoning the newly allocated superblock */
1213 mutex_unlock(&nilfs->ns_mount_mutex);
1214 put_nilfs(nilfs);
1215 deactivate_locked_super(s); 1241 deactivate_locked_super(s);
1216 /* 1242
1217 * deactivate_locked_super() invokes close_bdev_exclusive(). 1243 failed:
1218 * We must finish all post-cleaning before this call; 1244 if (!s_new)
1219 * put_nilfs() needs the block device. 1245 close_bdev_exclusive(sd.bdev, mode);
1220 */ 1246 return ERR_PTR(err);
1221 return err;
1222} 1247}
1223 1248
1224struct file_system_type nilfs_fs_type = { 1249struct file_system_type nilfs_fs_type = {
1225 .owner = THIS_MODULE, 1250 .owner = THIS_MODULE,
1226 .name = "nilfs2", 1251 .name = "nilfs2",
1227 .get_sb = nilfs_get_sb, 1252 .mount = nilfs_mount,
1228 .kill_sb = kill_block_super, 1253 .kill_sb = kill_block_super,
1229 .fs_flags = FS_REQUIRES_DEV, 1254 .fs_flags = FS_REQUIRES_DEV,
1230}; 1255};
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ba7c10c917fc..0254be2d73c6 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,9 +35,6 @@
35#include "segbuf.h" 35#include "segbuf.h"
36 36
37 37
38static LIST_HEAD(nilfs_objects);
39static DEFINE_SPINLOCK(nilfs_lock);
40
41static int nilfs_valid_sb(struct nilfs_super_block *sbp); 38static int nilfs_valid_sb(struct nilfs_super_block *sbp);
42 39
43void nilfs_set_last_segment(struct the_nilfs *nilfs, 40void nilfs_set_last_segment(struct the_nilfs *nilfs,
@@ -61,16 +58,13 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
61} 58}
62 59
63/** 60/**
64 * alloc_nilfs - allocate the_nilfs structure 61 * alloc_nilfs - allocate a nilfs object
65 * @bdev: block device to which the_nilfs is related 62 * @bdev: block device to which the_nilfs is related
66 * 63 *
67 * alloc_nilfs() allocates memory for the_nilfs and
68 * initializes its reference count and locks.
69 *
70 * Return Value: On success, pointer to the_nilfs is returned. 64 * Return Value: On success, pointer to the_nilfs is returned.
71 * On error, NULL is returned. 65 * On error, NULL is returned.
72 */ 66 */
73static struct the_nilfs *alloc_nilfs(struct block_device *bdev) 67struct the_nilfs *alloc_nilfs(struct block_device *bdev)
74{ 68{
75 struct the_nilfs *nilfs; 69 struct the_nilfs *nilfs;
76 70
@@ -79,103 +73,38 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
79 return NULL; 73 return NULL;
80 74
81 nilfs->ns_bdev = bdev; 75 nilfs->ns_bdev = bdev;
82 atomic_set(&nilfs->ns_count, 1);
83 atomic_set(&nilfs->ns_ndirtyblks, 0); 76 atomic_set(&nilfs->ns_ndirtyblks, 0);
84 init_rwsem(&nilfs->ns_sem); 77 init_rwsem(&nilfs->ns_sem);
85 init_rwsem(&nilfs->ns_super_sem); 78 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
86 mutex_init(&nilfs->ns_mount_mutex);
87 init_rwsem(&nilfs->ns_writer_sem);
88 INIT_LIST_HEAD(&nilfs->ns_list);
89 INIT_LIST_HEAD(&nilfs->ns_supers);
90 spin_lock_init(&nilfs->ns_last_segment_lock); 79 spin_lock_init(&nilfs->ns_last_segment_lock);
91 nilfs->ns_gc_inodes_h = NULL; 80 nilfs->ns_cptree = RB_ROOT;
81 spin_lock_init(&nilfs->ns_cptree_lock);
92 init_rwsem(&nilfs->ns_segctor_sem); 82 init_rwsem(&nilfs->ns_segctor_sem);
93 83
94 return nilfs; 84 return nilfs;
95} 85}
96 86
97/** 87/**
98 * find_or_create_nilfs - find or create nilfs object 88 * destroy_nilfs - destroy nilfs object
99 * @bdev: block device to which the_nilfs is related 89 * @nilfs: nilfs object to be released
100 *
101 * find_nilfs() looks up an existent nilfs object created on the
102 * device and gets the reference count of the object. If no nilfs object
103 * is found on the device, a new nilfs object is allocated.
104 *
105 * Return Value: On success, pointer to the nilfs object is returned.
106 * On error, NULL is returned.
107 */
108struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
109{
110 struct the_nilfs *nilfs, *new = NULL;
111
112 retry:
113 spin_lock(&nilfs_lock);
114 list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
115 if (nilfs->ns_bdev == bdev) {
116 get_nilfs(nilfs);
117 spin_unlock(&nilfs_lock);
118 if (new)
119 put_nilfs(new);
120 return nilfs; /* existing object */
121 }
122 }
123 if (new) {
124 list_add_tail(&new->ns_list, &nilfs_objects);
125 spin_unlock(&nilfs_lock);
126 return new; /* new object */
127 }
128 spin_unlock(&nilfs_lock);
129
130 new = alloc_nilfs(bdev);
131 if (new)
132 goto retry;
133 return NULL; /* insufficient memory */
134}
135
136/**
137 * put_nilfs - release a reference to the_nilfs
138 * @nilfs: the_nilfs structure to be released
139 *
140 * put_nilfs() decrements a reference counter of the_nilfs.
141 * If the reference count reaches zero, the_nilfs is freed.
142 */ 90 */
143void put_nilfs(struct the_nilfs *nilfs) 91void destroy_nilfs(struct the_nilfs *nilfs)
144{ 92{
145 spin_lock(&nilfs_lock);
146 if (!atomic_dec_and_test(&nilfs->ns_count)) {
147 spin_unlock(&nilfs_lock);
148 return;
149 }
150 list_del_init(&nilfs->ns_list);
151 spin_unlock(&nilfs_lock);
152
153 /*
154 * Increment of ns_count never occurs below because the caller
155 * of get_nilfs() holds at least one reference to the_nilfs.
156 * Thus its exclusion control is not required here.
157 */
158
159 might_sleep(); 93 might_sleep();
160 if (nilfs_loaded(nilfs)) {
161 nilfs_mdt_destroy(nilfs->ns_sufile);
162 nilfs_mdt_destroy(nilfs->ns_cpfile);
163 nilfs_mdt_destroy(nilfs->ns_dat);
164 nilfs_mdt_destroy(nilfs->ns_gc_dat);
165 }
166 if (nilfs_init(nilfs)) { 94 if (nilfs_init(nilfs)) {
167 nilfs_destroy_gccache(nilfs);
168 brelse(nilfs->ns_sbh[0]); 95 brelse(nilfs->ns_sbh[0]);
169 brelse(nilfs->ns_sbh[1]); 96 brelse(nilfs->ns_sbh[1]);
170 } 97 }
171 kfree(nilfs); 98 kfree(nilfs);
172} 99}
173 100
174static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block) 101static int nilfs_load_super_root(struct the_nilfs *nilfs,
102 struct super_block *sb, sector_t sr_block)
175{ 103{
176 struct buffer_head *bh_sr; 104 struct buffer_head *bh_sr;
177 struct nilfs_super_root *raw_sr; 105 struct nilfs_super_root *raw_sr;
178 struct nilfs_super_block **sbp = nilfs->ns_sbp; 106 struct nilfs_super_block **sbp = nilfs->ns_sbp;
107 struct nilfs_inode *rawi;
179 unsigned dat_entry_size, segment_usage_size, checkpoint_size; 108 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
180 unsigned inode_size; 109 unsigned inode_size;
181 int err; 110 int err;
@@ -192,40 +121,22 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
192 121
193 inode_size = nilfs->ns_inode_size; 122 inode_size = nilfs->ns_inode_size;
194 123
195 err = -ENOMEM; 124 rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size);
196 nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size); 125 err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat);
197 if (unlikely(!nilfs->ns_dat)) 126 if (err)
198 goto failed; 127 goto failed;
199 128
200 nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size); 129 rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size);
201 if (unlikely(!nilfs->ns_gc_dat)) 130 err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile);
131 if (err)
202 goto failed_dat; 132 goto failed_dat;
203 133
204 nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size); 134 rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size);
205 if (unlikely(!nilfs->ns_cpfile)) 135 err = nilfs_sufile_read(sb, segment_usage_size, rawi,
206 goto failed_gc_dat; 136 &nilfs->ns_sufile);
207 137 if (err)
208 nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
209 if (unlikely(!nilfs->ns_sufile))
210 goto failed_cpfile; 138 goto failed_cpfile;
211 139
212 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
213
214 err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
215 NILFS_SR_DAT_OFFSET(inode_size));
216 if (unlikely(err))
217 goto failed_sufile;
218
219 err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
220 NILFS_SR_CPFILE_OFFSET(inode_size));
221 if (unlikely(err))
222 goto failed_sufile;
223
224 err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
225 NILFS_SR_SUFILE_OFFSET(inode_size));
226 if (unlikely(err))
227 goto failed_sufile;
228
229 raw_sr = (struct nilfs_super_root *)bh_sr->b_data; 140 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
230 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); 141 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
231 142
@@ -233,17 +144,11 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
233 brelse(bh_sr); 144 brelse(bh_sr);
234 return err; 145 return err;
235 146
236 failed_sufile:
237 nilfs_mdt_destroy(nilfs->ns_sufile);
238
239 failed_cpfile: 147 failed_cpfile:
240 nilfs_mdt_destroy(nilfs->ns_cpfile); 148 iput(nilfs->ns_cpfile);
241
242 failed_gc_dat:
243 nilfs_mdt_destroy(nilfs->ns_gc_dat);
244 149
245 failed_dat: 150 failed_dat:
246 nilfs_mdt_destroy(nilfs->ns_dat); 151 iput(nilfs->ns_dat);
247 goto failed; 152 goto failed;
248} 153}
249 154
@@ -306,15 +211,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
306 int valid_fs = nilfs_valid_fs(nilfs); 211 int valid_fs = nilfs_valid_fs(nilfs);
307 int err; 212 int err;
308 213
309 if (nilfs_loaded(nilfs)) {
310 if (valid_fs ||
311 ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
312 return 0;
313 printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
314 "recovery state.\n");
315 return -EINVAL;
316 }
317
318 if (!valid_fs) { 214 if (!valid_fs) {
319 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); 215 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
320 if (s_flags & MS_RDONLY) { 216 if (s_flags & MS_RDONLY) {
@@ -375,7 +271,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
375 goto scan_error; 271 goto scan_error;
376 } 272 }
377 273
378 err = nilfs_load_super_root(nilfs, ri.ri_super_root); 274 err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root);
379 if (unlikely(err)) { 275 if (unlikely(err)) {
380 printk(KERN_ERR "NILFS: error loading super root.\n"); 276 printk(KERN_ERR "NILFS: error loading super root.\n");
381 goto failed; 277 goto failed;
@@ -443,10 +339,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
443 goto failed; 339 goto failed;
444 340
445 failed_unload: 341 failed_unload:
446 nilfs_mdt_destroy(nilfs->ns_cpfile); 342 iput(nilfs->ns_cpfile);
447 nilfs_mdt_destroy(nilfs->ns_sufile); 343 iput(nilfs->ns_sufile);
448 nilfs_mdt_destroy(nilfs->ns_dat); 344 iput(nilfs->ns_dat);
449 nilfs_mdt_destroy(nilfs->ns_gc_dat);
450 345
451 failed: 346 failed:
452 nilfs_clear_recovery_info(&ri); 347 nilfs_clear_recovery_info(&ri);
@@ -468,8 +363,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
468static int nilfs_store_disk_layout(struct the_nilfs *nilfs, 363static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
469 struct nilfs_super_block *sbp) 364 struct nilfs_super_block *sbp)
470{ 365{
471 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { 366 if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
472 printk(KERN_ERR "NILFS: revision mismatch " 367 printk(KERN_ERR "NILFS: unsupported revision "
473 "(superblock rev.=%d.%d, current rev.=%d.%d). " 368 "(superblock rev.=%d.%d, current rev.=%d.%d). "
474 "Please check the version of mkfs.nilfs.\n", 369 "Please check the version of mkfs.nilfs.\n",
475 le32_to_cpu(sbp->s_rev_level), 370 le32_to_cpu(sbp->s_rev_level),
@@ -631,12 +526,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
631 * 526 *
632 * init_nilfs() performs common initialization per block device (e.g. 527 * init_nilfs() performs common initialization per block device (e.g.
633 * reading the super block, getting disk layout information, initializing 528 * reading the super block, getting disk layout information, initializing
634 * shared fields in the_nilfs). It takes on some portion of the jobs 529 * shared fields in the_nilfs).
635 * typically done by a fill_super() routine. This division arises from
636 * the nature that multiple NILFS instances may be simultaneously
637 * mounted on a device.
638 * For multiple mounts on the same device, only the first mount
639 * invokes these tasks.
640 * 530 *
641 * Return Value: On success, 0 is returned. On error, a negative error 531 * Return Value: On success, 0 is returned. On error, a negative error
642 * code is returned. 532 * code is returned.
@@ -645,32 +535,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
645{ 535{
646 struct super_block *sb = sbi->s_super; 536 struct super_block *sb = sbi->s_super;
647 struct nilfs_super_block *sbp; 537 struct nilfs_super_block *sbp;
648 struct backing_dev_info *bdi;
649 int blocksize; 538 int blocksize;
650 int err; 539 int err;
651 540
652 down_write(&nilfs->ns_sem); 541 down_write(&nilfs->ns_sem);
653 if (nilfs_init(nilfs)) {
654 /* Load values from existing the_nilfs */
655 sbp = nilfs->ns_sbp[0];
656 err = nilfs_store_magic_and_option(sb, sbp, data);
657 if (err)
658 goto out;
659
660 err = nilfs_check_feature_compatibility(sb, sbp);
661 if (err)
662 goto out;
663
664 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
665 if (sb->s_blocksize != blocksize &&
666 !sb_set_blocksize(sb, blocksize)) {
667 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
668 blocksize);
669 err = -EINVAL;
670 }
671 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
672 goto out;
673 }
674 542
675 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); 543 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
676 if (!blocksize) { 544 if (!blocksize) {
@@ -729,18 +597,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
729 597
730 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 598 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
731 599
732 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
733 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
734
735 err = nilfs_store_log_cursor(nilfs, sbp); 600 err = nilfs_store_log_cursor(nilfs, sbp);
736 if (err) 601 if (err)
737 goto failed_sbh; 602 goto failed_sbh;
738 603
739 /* Initialize gcinode cache */
740 err = nilfs_init_gccache(nilfs);
741 if (err)
742 goto failed_sbh;
743
744 set_nilfs_init(nilfs); 604 set_nilfs_init(nilfs);
745 err = 0; 605 err = 0;
746 out: 606 out:
@@ -775,9 +635,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
775 ret = blkdev_issue_discard(nilfs->ns_bdev, 635 ret = blkdev_issue_discard(nilfs->ns_bdev,
776 start * sects_per_block, 636 start * sects_per_block,
777 nblocks * sects_per_block, 637 nblocks * sects_per_block,
778 GFP_NOFS, 638 GFP_NOFS, 0);
779 BLKDEV_IFL_WAIT |
780 BLKDEV_IFL_BARRIER);
781 if (ret < 0) 639 if (ret < 0)
782 return ret; 640 return ret;
783 nblocks = 0; 641 nblocks = 0;
@@ -787,8 +645,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
787 ret = blkdev_issue_discard(nilfs->ns_bdev, 645 ret = blkdev_issue_discard(nilfs->ns_bdev,
788 start * sects_per_block, 646 start * sects_per_block,
789 nblocks * sects_per_block, 647 nblocks * sects_per_block,
790 GFP_NOFS, 648 GFP_NOFS, 0);
791 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
792 return ret; 649 return ret;
793} 650}
794 651
@@ -815,79 +672,92 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
815 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; 672 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
816} 673}
817 674
818/** 675struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
819 * nilfs_find_sbinfo - find existing nilfs_sb_info structure
820 * @nilfs: nilfs object
821 * @rw_mount: mount type (non-zero value for read/write mount)
822 * @cno: checkpoint number (zero for read-only mount)
823 *
824 * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
825 * @rw_mount and @cno (in case of snapshots) matched. If no instance
826 * was found, NULL is returned. Although the super block instance can
827 * be unmounted after this function returns, the nilfs_sb_info struct
828 * is kept on memory until nilfs_put_sbinfo() is called.
829 */
830struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
831 int rw_mount, __u64 cno)
832{ 676{
833 struct nilfs_sb_info *sbi; 677 struct rb_node *n;
834 678 struct nilfs_root *root;
835 down_read(&nilfs->ns_super_sem); 679
836 /* 680 spin_lock(&nilfs->ns_cptree_lock);
837 * The SNAPSHOT flag and sb->s_flags are supposed to be 681 n = nilfs->ns_cptree.rb_node;
838 * protected with nilfs->ns_super_sem. 682 while (n) {
839 */ 683 root = rb_entry(n, struct nilfs_root, rb_node);
840 sbi = nilfs->ns_current; 684
841 if (rw_mount) { 685 if (cno < root->cno) {
842 if (sbi && !(sbi->s_super->s_flags & MS_RDONLY)) 686 n = n->rb_left;
843 goto found; /* read/write mount */ 687 } else if (cno > root->cno) {
844 else 688 n = n->rb_right;
845 goto out; 689 } else {
846 } else if (cno == 0) { 690 atomic_inc(&root->count);
847 if (sbi && (sbi->s_super->s_flags & MS_RDONLY)) 691 spin_unlock(&nilfs->ns_cptree_lock);
848 goto found; /* read-only mount */ 692 return root;
849 else 693 }
850 goto out;
851 } 694 }
695 spin_unlock(&nilfs->ns_cptree_lock);
852 696
853 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
854 if (nilfs_test_opt(sbi, SNAPSHOT) &&
855 sbi->s_snapshot_cno == cno)
856 goto found; /* snapshot mount */
857 }
858 out:
859 up_read(&nilfs->ns_super_sem);
860 return NULL; 697 return NULL;
861
862 found:
863 atomic_inc(&sbi->s_count);
864 up_read(&nilfs->ns_super_sem);
865 return sbi;
866} 698}
867 699
868int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, 700struct nilfs_root *
869 int snapshot_mount) 701nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
870{ 702{
871 struct nilfs_sb_info *sbi; 703 struct rb_node **p, *parent;
872 int ret = 0; 704 struct nilfs_root *root, *new;
873 705
874 down_read(&nilfs->ns_super_sem); 706 root = nilfs_lookup_root(nilfs, cno);
875 if (cno == 0 || cno > nilfs->ns_cno) 707 if (root)
876 goto out_unlock; 708 return root;
877 709
878 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { 710 new = kmalloc(sizeof(*root), GFP_KERNEL);
879 if (sbi->s_snapshot_cno == cno && 711 if (!new)
880 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) { 712 return NULL;
881 /* exclude read-only mounts */ 713
882 ret++; 714 spin_lock(&nilfs->ns_cptree_lock);
883 break; 715
716 p = &nilfs->ns_cptree.rb_node;
717 parent = NULL;
718
719 while (*p) {
720 parent = *p;
721 root = rb_entry(parent, struct nilfs_root, rb_node);
722
723 if (cno < root->cno) {
724 p = &(*p)->rb_left;
725 } else if (cno > root->cno) {
726 p = &(*p)->rb_right;
727 } else {
728 atomic_inc(&root->count);
729 spin_unlock(&nilfs->ns_cptree_lock);
730 kfree(new);
731 return root;
884 } 732 }
885 } 733 }
886 /* for protecting recent checkpoints */
887 if (cno >= nilfs_last_cno(nilfs))
888 ret++;
889 734
890 out_unlock: 735 new->cno = cno;
891 up_read(&nilfs->ns_super_sem); 736 new->ifile = NULL;
892 return ret; 737 new->nilfs = nilfs;
738 atomic_set(&new->count, 1);
739 atomic_set(&new->inodes_count, 0);
740 atomic_set(&new->blocks_count, 0);
741
742 rb_link_node(&new->rb_node, parent, p);
743 rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
744
745 spin_unlock(&nilfs->ns_cptree_lock);
746
747 return new;
748}
749
750void nilfs_put_root(struct nilfs_root *root)
751{
752 if (atomic_dec_and_test(&root->count)) {
753 struct the_nilfs *nilfs = root->nilfs;
754
755 spin_lock(&nilfs->ns_cptree_lock);
756 rb_erase(&root->rb_node, &nilfs->ns_cptree);
757 spin_unlock(&nilfs->ns_cptree_lock);
758 if (root->ifile)
759 iput(root->ifile);
760
761 kfree(root);
762 }
893} 763}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index f785a7b0ab99..69226e14b745 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -26,6 +26,7 @@
26 26
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/rbtree.h>
29#include <linux/fs.h> 30#include <linux/fs.h>
30#include <linux/blkdev.h> 31#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
@@ -45,22 +46,13 @@ enum {
45/** 46/**
46 * struct the_nilfs - struct to supervise multiple nilfs mount points 47 * struct the_nilfs - struct to supervise multiple nilfs mount points
47 * @ns_flags: flags 48 * @ns_flags: flags
48 * @ns_count: reference count
49 * @ns_list: list head for nilfs_list
50 * @ns_bdev: block device 49 * @ns_bdev: block device
51 * @ns_bdi: backing dev info
52 * @ns_writer: back pointer to writable nilfs_sb_info
53 * @ns_sem: semaphore for shared states 50 * @ns_sem: semaphore for shared states
54 * @ns_super_sem: semaphore for global operations across super block instances
55 * @ns_mount_mutex: mutex protecting mount process of nilfs
56 * @ns_writer_sem: semaphore protecting ns_writer attach/detach
57 * @ns_current: back pointer to current mount
58 * @ns_sbh: buffer heads of on-disk super blocks 51 * @ns_sbh: buffer heads of on-disk super blocks
59 * @ns_sbp: pointers to super block data 52 * @ns_sbp: pointers to super block data
60 * @ns_sbwtime: previous write time of super block 53 * @ns_sbwtime: previous write time of super block
61 * @ns_sbwcount: write count of super block 54 * @ns_sbwcount: write count of super block
62 * @ns_sbsize: size of valid data in super block 55 * @ns_sbsize: size of valid data in super block
63 * @ns_supers: list of nilfs super block structs
64 * @ns_seg_seq: segment sequence counter 56 * @ns_seg_seq: segment sequence counter
65 * @ns_segnum: index number of the latest full segment. 57 * @ns_segnum: index number of the latest full segment.
66 * @ns_nextnum: index number of the full segment index to be used next 58 * @ns_nextnum: index number of the full segment index to be used next
@@ -79,9 +71,9 @@ enum {
79 * @ns_dat: DAT file inode 71 * @ns_dat: DAT file inode
80 * @ns_cpfile: checkpoint file inode 72 * @ns_cpfile: checkpoint file inode
81 * @ns_sufile: segusage file inode 73 * @ns_sufile: segusage file inode
82 * @ns_gc_dat: shadow inode of the DAT file inode for GC 74 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
75 * @ns_cptree_lock: lock protecting @ns_cptree
83 * @ns_gc_inodes: dummy inodes to keep live blocks 76 * @ns_gc_inodes: dummy inodes to keep live blocks
84 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
85 * @ns_blocksize_bits: bit length of block size 77 * @ns_blocksize_bits: bit length of block size
86 * @ns_blocksize: block size 78 * @ns_blocksize: block size
87 * @ns_nsegments: number of segments in filesystem 79 * @ns_nsegments: number of segments in filesystem
@@ -95,22 +87,9 @@ enum {
95 */ 87 */
96struct the_nilfs { 88struct the_nilfs {
97 unsigned long ns_flags; 89 unsigned long ns_flags;
98 atomic_t ns_count;
99 struct list_head ns_list;
100 90
101 struct block_device *ns_bdev; 91 struct block_device *ns_bdev;
102 struct backing_dev_info *ns_bdi;
103 struct nilfs_sb_info *ns_writer;
104 struct rw_semaphore ns_sem; 92 struct rw_semaphore ns_sem;
105 struct rw_semaphore ns_super_sem;
106 struct mutex ns_mount_mutex;
107 struct rw_semaphore ns_writer_sem;
108
109 /*
110 * components protected by ns_super_sem
111 */
112 struct nilfs_sb_info *ns_current;
113 struct list_head ns_supers;
114 93
115 /* 94 /*
116 * used for 95 * used for
@@ -163,11 +142,13 @@ struct the_nilfs {
163 struct inode *ns_dat; 142 struct inode *ns_dat;
164 struct inode *ns_cpfile; 143 struct inode *ns_cpfile;
165 struct inode *ns_sufile; 144 struct inode *ns_sufile;
166 struct inode *ns_gc_dat;
167 145
168 /* GC inode list and hash table head */ 146 /* Checkpoint tree */
147 struct rb_root ns_cptree;
148 spinlock_t ns_cptree_lock;
149
150 /* GC inode list */
169 struct list_head ns_gc_inodes; 151 struct list_head ns_gc_inodes;
170 struct hlist_head *ns_gc_inodes_h;
171 152
172 /* Disk layout information (static) */ 153 /* Disk layout information (static) */
173 unsigned int ns_blocksize_bits; 154 unsigned int ns_blocksize_bits;
@@ -182,9 +163,6 @@ struct the_nilfs {
182 u32 ns_crc_seed; 163 u32 ns_crc_seed;
183}; 164};
184 165
185#define NILFS_GCINODE_HASH_BITS 8
186#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
187
188#define THE_NILFS_FNS(bit, name) \ 166#define THE_NILFS_FNS(bit, name) \
189static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ 167static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
190{ \ 168{ \
@@ -205,6 +183,32 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
205THE_NILFS_FNS(GC_RUNNING, gc_running) 183THE_NILFS_FNS(GC_RUNNING, gc_running)
206THE_NILFS_FNS(SB_DIRTY, sb_dirty) 184THE_NILFS_FNS(SB_DIRTY, sb_dirty)
207 185
186/**
187 * struct nilfs_root - nilfs root object
188 * @cno: checkpoint number
189 * @rb_node: red-black tree node
190 * @count: refcount of this structure
191 * @nilfs: nilfs object
192 * @ifile: inode file
193 * @root: root inode
194 * @inodes_count: number of inodes
195 * @blocks_count: number of blocks (Reserved)
196 */
197struct nilfs_root {
198 __u64 cno;
199 struct rb_node rb_node;
200
201 atomic_t count;
202 struct the_nilfs *nilfs;
203 struct inode *ifile;
204
205 atomic_t inodes_count;
206 atomic_t blocks_count;
207};
208
209/* Special checkpoint number */
210#define NILFS_CPTREE_CURRENT_CNO 0
211
208/* Minimum interval of periodical update of superblocks (in seconds) */ 212/* Minimum interval of periodical update of superblocks (in seconds) */
209#define NILFS_SB_FREQ 10 213#define NILFS_SB_FREQ 10
210 214
@@ -221,46 +225,25 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
221} 225}
222 226
223void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 227void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
224struct the_nilfs *find_or_create_nilfs(struct block_device *); 228struct the_nilfs *alloc_nilfs(struct block_device *bdev);
225void put_nilfs(struct the_nilfs *); 229void destroy_nilfs(struct the_nilfs *nilfs);
226int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 230int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
227int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 231int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
228int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 232int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
229int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 233int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
234struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
235struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
236 __u64 cno);
237void nilfs_put_root(struct nilfs_root *root);
230struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 238struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
231int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
232int nilfs_near_disk_full(struct the_nilfs *); 239int nilfs_near_disk_full(struct the_nilfs *);
233void nilfs_fall_back_super_block(struct the_nilfs *); 240void nilfs_fall_back_super_block(struct the_nilfs *);
234void nilfs_swap_super_block(struct the_nilfs *); 241void nilfs_swap_super_block(struct the_nilfs *);
235 242
236 243
237static inline void get_nilfs(struct the_nilfs *nilfs) 244static inline void nilfs_get_root(struct nilfs_root *root)
238{
239 /* Caller must have at least one reference of the_nilfs. */
240 atomic_inc(&nilfs->ns_count);
241}
242
243static inline void
244nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
245{
246 down_write(&nilfs->ns_writer_sem);
247 nilfs->ns_writer = sbi;
248 up_write(&nilfs->ns_writer_sem);
249}
250
251static inline void
252nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
253{
254 down_write(&nilfs->ns_writer_sem);
255 if (sbi == nilfs->ns_writer)
256 nilfs->ns_writer = NULL;
257 up_write(&nilfs->ns_writer_sem);
258}
259
260static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
261{ 245{
262 if (atomic_dec_and_test(&sbi->s_count)) 246 atomic_inc(&root->count);
263 kfree(sbi);
264} 247}
265 248
266static inline int nilfs_valid_fs(struct the_nilfs *nilfs) 249static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
diff --git a/fs/no-block.c b/fs/no-block.c
index d269a93d3467..6e40e42a43de 100644
--- a/fs/no-block.c
+++ b/fs/no-block.c
@@ -19,4 +19,5 @@ static int no_blkdev_open(struct inode * inode, struct file * filp)
19 19
20const struct file_operations def_blk_fops = { 20const struct file_operations def_blk_fops = {
21 .open = no_blkdev_open, 21 .open = no_blkdev_open,
22 .llseek = noop_llseek,
22}; 23};
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index b388443c3a09..22c629eedd82 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
3 3
4source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
6#source "fs/notify/fanotify/Kconfig" 6source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 85366c78cc37..b04f88eed09e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -131,6 +131,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
131 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); 131 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
132 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); 132 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
133 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); 133 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
134 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
134 135
135 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 136 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
136 137
@@ -160,20 +161,21 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
160 __u32 event_mask, void *data, int data_type) 161 __u32 event_mask, void *data, int data_type)
161{ 162{
162 __u32 marks_mask, marks_ignored_mask; 163 __u32 marks_mask, marks_ignored_mask;
164 struct path *path = data;
163 165
164 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " 166 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
165 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 167 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
166 inode_mark, vfsmnt_mark, event_mask, data, data_type); 168 inode_mark, vfsmnt_mark, event_mask, data, data_type);
167 169
168 /* sorry, fanotify only gives a damn about files and dirs */
169 if (!S_ISREG(to_tell->i_mode) &&
170 !S_ISDIR(to_tell->i_mode))
171 return false;
172
173 /* if we don't have enough info to send an event to userspace say no */ 170 /* if we don't have enough info to send an event to userspace say no */
174 if (data_type != FSNOTIFY_EVENT_PATH) 171 if (data_type != FSNOTIFY_EVENT_PATH)
175 return false; 172 return false;
176 173
174 /* sorry, fanotify only gives a damn about files and dirs */
175 if (!S_ISREG(path->dentry->d_inode->i_mode) &&
176 !S_ISDIR(path->dentry->d_inode->i_mode))
177 return false;
178
177 if (inode_mark && vfsmnt_mark) { 179 if (inode_mark && vfsmnt_mark) {
178 marks_mask = (vfsmnt_mark->mask | inode_mark->mask); 180 marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
179 marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); 181 marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
@@ -194,16 +196,29 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
194 BUG(); 196 BUG();
195 } 197 }
196 198
199 if (S_ISDIR(path->dentry->d_inode->i_mode) &&
200 (marks_ignored_mask & FS_ISDIR))
201 return false;
202
197 if (event_mask & marks_mask & ~marks_ignored_mask) 203 if (event_mask & marks_mask & ~marks_ignored_mask)
198 return true; 204 return true;
199 205
200 return false; 206 return false;
201} 207}
202 208
209static void fanotify_free_group_priv(struct fsnotify_group *group)
210{
211 struct user_struct *user;
212
213 user = group->fanotify_data.user;
214 atomic_dec(&user->fanotify_listeners);
215 free_uid(user);
216}
217
203const struct fsnotify_ops fanotify_fsnotify_ops = { 218const struct fsnotify_ops fanotify_fsnotify_ops = {
204 .handle_event = fanotify_handle_event, 219 .handle_event = fanotify_handle_event,
205 .should_send_event = fanotify_should_send_event, 220 .should_send_event = fanotify_should_send_event,
206 .free_group_priv = NULL, 221 .free_group_priv = fanotify_free_group_priv,
207 .free_event_priv = NULL, 222 .free_event_priv = NULL,
208 .freeing_mark = NULL, 223 .freeing_mark = NULL,
209}; 224};
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 5ed8e58d7bfc..063224812b7e 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,10 @@
16 16
17#include <asm/ioctls.h> 17#include <asm/ioctls.h>
18 18
19#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
20#define FANOTIFY_DEFAULT_MAX_MARKS 8192
21#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
22
19extern const struct fsnotify_ops fanotify_fsnotify_ops; 23extern const struct fsnotify_ops fanotify_fsnotify_ops;
20 24
21static struct kmem_cache *fanotify_mark_cache __read_mostly; 25static struct kmem_cache *fanotify_mark_cache __read_mostly;
@@ -326,7 +330,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
326 ret = -EAGAIN; 330 ret = -EAGAIN;
327 if (file->f_flags & O_NONBLOCK) 331 if (file->f_flags & O_NONBLOCK)
328 break; 332 break;
329 ret = -EINTR; 333 ret = -ERESTARTSYS;
330 if (signal_pending(current)) 334 if (signal_pending(current))
331 break; 335 break;
332 336
@@ -372,11 +376,10 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
372static int fanotify_release(struct inode *ignored, struct file *file) 376static int fanotify_release(struct inode *ignored, struct file *file)
373{ 377{
374 struct fsnotify_group *group = file->private_data; 378 struct fsnotify_group *group = file->private_data;
375 struct fanotify_response_event *re, *lre;
376
377 pr_debug("%s: file=%p group=%p\n", __func__, file, group);
378 379
379#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 380#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
381 struct fanotify_response_event *re, *lre;
382
380 mutex_lock(&group->fanotify_data.access_mutex); 383 mutex_lock(&group->fanotify_data.access_mutex);
381 384
382 group->fanotify_data.bypass_perm = true; 385 group->fanotify_data.bypass_perm = true;
@@ -433,6 +436,7 @@ static const struct file_operations fanotify_fops = {
433 .release = fanotify_release, 436 .release = fanotify_release,
434 .unlocked_ioctl = fanotify_ioctl, 437 .unlocked_ioctl = fanotify_ioctl,
435 .compat_ioctl = fanotify_ioctl, 438 .compat_ioctl = fanotify_ioctl,
439 .llseek = noop_llseek,
436}; 440};
437 441
438static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) 442static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
@@ -553,18 +557,24 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
553 __u32 mask, 557 __u32 mask,
554 unsigned int flags) 558 unsigned int flags)
555{ 559{
556 __u32 oldmask; 560 __u32 oldmask = -1;
557 561
558 spin_lock(&fsn_mark->lock); 562 spin_lock(&fsn_mark->lock);
559 if (!(flags & FAN_MARK_IGNORED_MASK)) { 563 if (!(flags & FAN_MARK_IGNORED_MASK)) {
560 oldmask = fsn_mark->mask; 564 oldmask = fsn_mark->mask;
561 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); 565 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
562 } else { 566 } else {
563 oldmask = fsn_mark->ignored_mask; 567 __u32 tmask = fsn_mark->ignored_mask | mask;
564 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask)); 568 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
565 if (flags & FAN_MARK_IGNORED_SURV_MODIFY) 569 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
566 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; 570 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
567 } 571 }
572
573 if (!(flags & FAN_MARK_ONDIR)) {
574 __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
575 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
576 }
577
568 spin_unlock(&fsn_mark->lock); 578 spin_unlock(&fsn_mark->lock);
569 579
570 return mask & ~oldmask; 580 return mask & ~oldmask;
@@ -581,6 +591,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
581 if (!fsn_mark) { 591 if (!fsn_mark) {
582 int ret; 592 int ret;
583 593
594 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
595 return -ENOSPC;
596
584 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 597 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
585 if (!fsn_mark) 598 if (!fsn_mark)
586 return -ENOMEM; 599 return -ENOMEM;
@@ -609,10 +622,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
609 622
610 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 623 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
611 624
625 /*
626 * If some other task has this inode open for write we should not add
627 * an ignored mark, unless that ignored mark is supposed to survive
628 * modification changes anyway.
629 */
630 if ((flags & FAN_MARK_IGNORED_MASK) &&
631 !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
632 (atomic_read(&inode->i_writecount) > 0))
633 return 0;
634
612 fsn_mark = fsnotify_find_inode_mark(group, inode); 635 fsn_mark = fsnotify_find_inode_mark(group, inode);
613 if (!fsn_mark) { 636 if (!fsn_mark) {
614 int ret; 637 int ret;
615 638
639 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
640 return -ENOSPC;
641
616 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 642 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
617 if (!fsn_mark) 643 if (!fsn_mark)
618 return -ENOMEM; 644 return -ENOMEM;
@@ -636,6 +662,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
636{ 662{
637 struct fsnotify_group *group; 663 struct fsnotify_group *group;
638 int f_flags, fd; 664 int f_flags, fd;
665 struct user_struct *user;
639 666
640 pr_debug("%s: flags=%d event_f_flags=%d\n", 667 pr_debug("%s: flags=%d event_f_flags=%d\n",
641 __func__, flags, event_f_flags); 668 __func__, flags, event_f_flags);
@@ -646,6 +673,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
646 if (flags & ~FAN_ALL_INIT_FLAGS) 673 if (flags & ~FAN_ALL_INIT_FLAGS)
647 return -EINVAL; 674 return -EINVAL;
648 675
676 user = get_current_user();
677 if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
678 free_uid(user);
679 return -EMFILE;
680 }
681
649 f_flags = O_RDWR | FMODE_NONOTIFY; 682 f_flags = O_RDWR | FMODE_NONOTIFY;
650 if (flags & FAN_CLOEXEC) 683 if (flags & FAN_CLOEXEC)
651 f_flags |= O_CLOEXEC; 684 f_flags |= O_CLOEXEC;
@@ -657,12 +690,47 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
657 if (IS_ERR(group)) 690 if (IS_ERR(group))
658 return PTR_ERR(group); 691 return PTR_ERR(group);
659 692
693 group->fanotify_data.user = user;
694 atomic_inc(&user->fanotify_listeners);
695
660 group->fanotify_data.f_flags = event_f_flags; 696 group->fanotify_data.f_flags = event_f_flags;
661#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 697#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
662 mutex_init(&group->fanotify_data.access_mutex); 698 mutex_init(&group->fanotify_data.access_mutex);
663 init_waitqueue_head(&group->fanotify_data.access_waitq); 699 init_waitqueue_head(&group->fanotify_data.access_waitq);
664 INIT_LIST_HEAD(&group->fanotify_data.access_list); 700 INIT_LIST_HEAD(&group->fanotify_data.access_list);
665#endif 701#endif
702 switch (flags & FAN_ALL_CLASS_BITS) {
703 case FAN_CLASS_NOTIF:
704 group->priority = FS_PRIO_0;
705 break;
706 case FAN_CLASS_CONTENT:
707 group->priority = FS_PRIO_1;
708 break;
709 case FAN_CLASS_PRE_CONTENT:
710 group->priority = FS_PRIO_2;
711 break;
712 default:
713 fd = -EINVAL;
714 goto out_put_group;
715 }
716
717 if (flags & FAN_UNLIMITED_QUEUE) {
718 fd = -EPERM;
719 if (!capable(CAP_SYS_ADMIN))
720 goto out_put_group;
721 group->max_events = UINT_MAX;
722 } else {
723 group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
724 }
725
726 if (flags & FAN_UNLIMITED_MARKS) {
727 fd = -EPERM;
728 if (!capable(CAP_SYS_ADMIN))
729 goto out_put_group;
730 group->fanotify_data.max_marks = UINT_MAX;
731 } else {
732 group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
733 }
666 734
667 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); 735 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
668 if (fd < 0) 736 if (fd < 0)
@@ -703,6 +771,12 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
703 default: 771 default:
704 return -EINVAL; 772 return -EINVAL;
705 } 773 }
774
775 if (mask & FAN_ONDIR) {
776 flags |= FAN_MARK_ONDIR;
777 mask &= ~FAN_ONDIR;
778 }
779
706#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 780#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
707 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) 781 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
708#else 782#else
@@ -718,6 +792,16 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
718 ret = -EINVAL; 792 ret = -EINVAL;
719 if (unlikely(filp->f_op != &fanotify_fops)) 793 if (unlikely(filp->f_op != &fanotify_fops))
720 goto fput_and_out; 794 goto fput_and_out;
795 group = filp->private_data;
796
797 /*
798 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
799 * allowed to set permissions events.
800 */
801 ret = -EINVAL;
802 if (mask & FAN_ALL_PERM_EVENTS &&
803 group->priority == FS_PRIO_0)
804 goto fput_and_out;
721 805
722 ret = fanotify_find_path(dfd, pathname, &path, flags); 806 ret = fanotify_find_path(dfd, pathname, &path, flags);
723 if (ret) 807 if (ret)
@@ -728,7 +812,6 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
728 inode = path.dentry->d_inode; 812 inode = path.dentry->d_inode;
729 else 813 else
730 mnt = path.mnt; 814 mnt = path.mnt;
731 group = filp->private_data;
732 815
733 /* create/update an inode mark */ 816 /* create/update an inode mark */
734 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 817 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 36802420d69a..20dc218707ca 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -84,59 +84,39 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
84} 84}
85 85
86/* Notify this dentry's parent about a child's events. */ 86/* Notify this dentry's parent about a child's events. */
87void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) 87int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
88{ 88{
89 struct dentry *parent; 89 struct dentry *parent;
90 struct inode *p_inode; 90 struct inode *p_inode;
91 bool send = false; 91 int ret = 0;
92 bool should_update_children = false;
93 92
94 if (!dentry) 93 if (!dentry)
95 dentry = path->dentry; 94 dentry = path->dentry;
96 95
97 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) 96 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
98 return; 97 return 0;
99 98
100 spin_lock(&dentry->d_lock); 99 parent = dget_parent(dentry);
101 parent = dentry->d_parent;
102 p_inode = parent->d_inode; 100 p_inode = parent->d_inode;
103 101
104 if (fsnotify_inode_watches_children(p_inode)) { 102 if (unlikely(!fsnotify_inode_watches_children(p_inode)))
105 if (p_inode->i_fsnotify_mask & mask) { 103 __fsnotify_update_child_dentry_flags(p_inode);
106 dget(parent); 104 else if (p_inode->i_fsnotify_mask & mask) {
107 send = true;
108 }
109 } else {
110 /*
111 * The parent doesn't care about events on it's children but
112 * at least one child thought it did. We need to run all the
113 * children and update their d_flags to let them know p_inode
114 * doesn't care about them any more.
115 */
116 dget(parent);
117 should_update_children = true;
118 }
119
120 spin_unlock(&dentry->d_lock);
121
122 if (send) {
123 /* we are notifying a parent so come up with the new mask which 105 /* we are notifying a parent so come up with the new mask which
124 * specifies these are events which came from a child. */ 106 * specifies these are events which came from a child. */
125 mask |= FS_EVENT_ON_CHILD; 107 mask |= FS_EVENT_ON_CHILD;
126 108
127 if (path) 109 if (path)
128 fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, 110 ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
129 dentry->d_name.name, 0); 111 dentry->d_name.name, 0);
130 else 112 else
131 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, 113 ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
132 dentry->d_name.name, 0); 114 dentry->d_name.name, 0);
133 dput(parent);
134 } 115 }
135 116
136 if (unlikely(should_update_children)) { 117 dput(parent);
137 __fsnotify_update_child_dentry_flags(p_inode); 118
138 dput(parent); 119 return ret;
139 }
140} 120}
141EXPORT_SYMBOL_GPL(__fsnotify_parent); 121EXPORT_SYMBOL_GPL(__fsnotify_parent);
142 122
@@ -275,20 +255,23 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
275 255
276 if (inode_group > vfsmount_group) { 256 if (inode_group > vfsmount_group) {
277 /* handle inode */ 257 /* handle inode */
278 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, 258 ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
279 data_is, cookie, file_name, &event); 259 data_is, cookie, file_name, &event);
280 /* we didn't use the vfsmount_mark */ 260 /* we didn't use the vfsmount_mark */
281 vfsmount_group = NULL; 261 vfsmount_group = NULL;
282 } else if (vfsmount_group > inode_group) { 262 } else if (vfsmount_group > inode_group) {
283 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, 263 ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
284 data_is, cookie, file_name, &event); 264 data_is, cookie, file_name, &event);
285 inode_group = NULL; 265 inode_group = NULL;
286 } else { 266 } else {
287 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, 267 ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
288 mask, data, data_is, cookie, file_name, 268 mask, data, data_is, cookie, file_name,
289 &event); 269 &event);
290 } 270 }
291 271
272 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
273 goto out;
274
292 if (inode_group) 275 if (inode_group)
293 inode_node = srcu_dereference(inode_node->next, 276 inode_node = srcu_dereference(inode_node->next,
294 &fsnotify_mark_srcu); 277 &fsnotify_mark_srcu);
@@ -296,7 +279,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
296 vfsmount_node = srcu_dereference(vfsmount_node->next, 279 vfsmount_node = srcu_dereference(vfsmount_node->next,
297 &fsnotify_mark_srcu); 280 &fsnotify_mark_srcu);
298 } 281 }
299 282 ret = 0;
283out:
300 srcu_read_unlock(&fsnotify_mark_srcu, idx); 284 srcu_read_unlock(&fsnotify_mark_srcu, idx);
301 /* 285 /*
302 * fsnotify_create_event() took a reference so the event can't be cleaned 286 * fsnotify_create_event() took a reference so the event can't be cleaned
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 33297c005060..4c29fcf557d1 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -177,7 +177,8 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
177 * Attach an initialized mark to a given inode. 177 * Attach an initialized mark to a given inode.
178 * These marks may be used for the fsnotify backend to determine which 178 * These marks may be used for the fsnotify backend to determine which
179 * event types should be delivered to which group and for which inodes. These 179 * event types should be delivered to which group and for which inodes. These
180 * marks are ordered according to the group's location in memory. 180 * marks are ordered according to priority, highest number first, and then by
181 * the group's location in memory.
181 */ 182 */
182int fsnotify_add_inode_mark(struct fsnotify_mark *mark, 183int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
183 struct fsnotify_group *group, struct inode *inode, 184 struct fsnotify_group *group, struct inode *inode,
@@ -211,7 +212,11 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
211 goto out; 212 goto out;
212 } 213 }
213 214
214 if (mark->group < lmark->group) 215 if (mark->group->priority < lmark->group->priority)
216 continue;
217
218 if ((mark->group->priority == lmark->group->priority) &&
219 (mark->group < lmark->group))
215 continue; 220 continue;
216 221
217 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); 222 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
@@ -240,6 +245,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
240{ 245{
241 struct inode *inode, *next_i, *need_iput = NULL; 246 struct inode *inode, *next_i, *need_iput = NULL;
242 247
248 spin_lock(&inode_lock);
243 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
244 struct inode *need_iput_tmp; 250 struct inode *need_iput_tmp;
245 251
@@ -297,4 +303,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
297 303
298 spin_lock(&inode_lock); 304 spin_lock(&inode_lock);
299 } 305 }
306 spin_unlock(&inode_lock);
300} 307}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index bf7f6d776c31..444c305a468c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -344,6 +344,7 @@ static const struct file_operations inotify_fops = {
344 .release = inotify_release, 344 .release = inotify_release,
345 .unlocked_ioctl = inotify_ioctl, 345 .unlocked_ioctl = inotify_ioctl,
346 .compat_ioctl = inotify_ioctl, 346 .compat_ioctl = inotify_ioctl,
347 .llseek = noop_llseek,
347}; 348};
348 349
349 350
@@ -861,7 +862,7 @@ static int __init inotify_user_setup(void)
861 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); 862 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
862 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); 863 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
863 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); 864 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
864 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); 865 BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
865 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); 866 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
866 867
867 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); 868 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 56772b578fbd..85eebff6d0d7 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -169,7 +169,11 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
169 goto out; 169 goto out;
170 } 170 }
171 171
172 if (mark->group < lmark->group) 172 if (mark->group->priority < lmark->group->priority)
173 continue;
174
175 if ((mark->group->priority == lmark->group->priority) &&
176 (mark->group < lmark->group))
173 continue; 177 continue;
174 178
175 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); 179 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 512806171bfa..a30ecacc01f2 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -30,7 +30,6 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h>
34#include <linux/bitmap.h> 33#include <linux/bitmap.h>
35 34
36#include "sysctl.h" 35#include "sysctl.h"
@@ -445,7 +444,6 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
445 444
446 ntfs_debug("Entering with remount options string: %s", opt); 445 ntfs_debug("Entering with remount options string: %s", opt);
447 446
448 lock_kernel();
449#ifndef NTFS_RW 447#ifndef NTFS_RW
450 /* For read-only compiled driver, enforce read-only flag. */ 448 /* For read-only compiled driver, enforce read-only flag. */
451 *flags |= MS_RDONLY; 449 *flags |= MS_RDONLY;
@@ -469,18 +467,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
469 if (NVolErrors(vol)) { 467 if (NVolErrors(vol)) {
470 ntfs_error(sb, "Volume has errors and is read-only%s", 468 ntfs_error(sb, "Volume has errors and is read-only%s",
471 es); 469 es);
472 unlock_kernel();
473 return -EROFS; 470 return -EROFS;
474 } 471 }
475 if (vol->vol_flags & VOLUME_IS_DIRTY) { 472 if (vol->vol_flags & VOLUME_IS_DIRTY) {
476 ntfs_error(sb, "Volume is dirty and read-only%s", es); 473 ntfs_error(sb, "Volume is dirty and read-only%s", es);
477 unlock_kernel();
478 return -EROFS; 474 return -EROFS;
479 } 475 }
480 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { 476 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
481 ntfs_error(sb, "Volume has been modified by chkdsk " 477 ntfs_error(sb, "Volume has been modified by chkdsk "
482 "and is read-only%s", es); 478 "and is read-only%s", es);
483 unlock_kernel();
484 return -EROFS; 479 return -EROFS;
485 } 480 }
486 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { 481 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -488,13 +483,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
488 "(0x%x) and is read-only%s", 483 "(0x%x) and is read-only%s",
489 (unsigned)le16_to_cpu(vol->vol_flags), 484 (unsigned)le16_to_cpu(vol->vol_flags),
490 es); 485 es);
491 unlock_kernel();
492 return -EROFS; 486 return -EROFS;
493 } 487 }
494 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { 488 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
495 ntfs_error(sb, "Failed to set dirty bit in volume " 489 ntfs_error(sb, "Failed to set dirty bit in volume "
496 "information flags%s", es); 490 "information flags%s", es);
497 unlock_kernel();
498 return -EROFS; 491 return -EROFS;
499 } 492 }
500#if 0 493#if 0
@@ -514,21 +507,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
514 ntfs_error(sb, "Failed to empty journal $LogFile%s", 507 ntfs_error(sb, "Failed to empty journal $LogFile%s",
515 es); 508 es);
516 NVolSetErrors(vol); 509 NVolSetErrors(vol);
517 unlock_kernel();
518 return -EROFS; 510 return -EROFS;
519 } 511 }
520 if (!ntfs_mark_quotas_out_of_date(vol)) { 512 if (!ntfs_mark_quotas_out_of_date(vol)) {
521 ntfs_error(sb, "Failed to mark quotas out of date%s", 513 ntfs_error(sb, "Failed to mark quotas out of date%s",
522 es); 514 es);
523 NVolSetErrors(vol); 515 NVolSetErrors(vol);
524 unlock_kernel();
525 return -EROFS; 516 return -EROFS;
526 } 517 }
527 if (!ntfs_stamp_usnjrnl(vol)) { 518 if (!ntfs_stamp_usnjrnl(vol)) {
528 ntfs_error(sb, "Failed to stamp transation log " 519 ntfs_error(sb, "Failed to stamp transation log "
529 "($UsnJrnl)%s", es); 520 "($UsnJrnl)%s", es);
530 NVolSetErrors(vol); 521 NVolSetErrors(vol);
531 unlock_kernel();
532 return -EROFS; 522 return -EROFS;
533 } 523 }
534 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 524 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -544,11 +534,9 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
544 534
545 // TODO: Deal with *flags. 535 // TODO: Deal with *flags.
546 536
547 if (!parse_options(vol, opt)) { 537 if (!parse_options(vol, opt))
548 unlock_kernel();
549 return -EINVAL; 538 return -EINVAL;
550 } 539
551 unlock_kernel();
552 ntfs_debug("Done."); 540 ntfs_debug("Done.");
553 return 0; 541 return 0;
554} 542}
@@ -2261,8 +2249,6 @@ static void ntfs_put_super(struct super_block *sb)
2261 2249
2262 ntfs_debug("Entering."); 2250 ntfs_debug("Entering.");
2263 2251
2264 lock_kernel();
2265
2266#ifdef NTFS_RW 2252#ifdef NTFS_RW
2267 /* 2253 /*
2268 * Commit all inodes while they are still open in case some of them 2254 * Commit all inodes while they are still open in case some of them
@@ -2433,8 +2419,6 @@ static void ntfs_put_super(struct super_block *sb)
2433 2419
2434 sb->s_fs_info = NULL; 2420 sb->s_fs_info = NULL;
2435 kfree(vol); 2421 kfree(vol);
2436
2437 unlock_kernel();
2438} 2422}
2439 2423
2440/** 2424/**
@@ -2772,8 +2756,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2772 init_rwsem(&vol->mftbmp_lock); 2756 init_rwsem(&vol->mftbmp_lock);
2773 init_rwsem(&vol->lcnbmp_lock); 2757 init_rwsem(&vol->lcnbmp_lock);
2774 2758
2775 unlock_kernel();
2776
2777 /* By default, enable sparse support. */ 2759 /* By default, enable sparse support. */
2778 NVolSetSparseEnabled(vol); 2760 NVolSetSparseEnabled(vol);
2779 2761
@@ -2929,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2929 goto unl_upcase_iput_tmp_ino_err_out_now; 2911 goto unl_upcase_iput_tmp_ino_err_out_now;
2930 } 2912 }
2931 if ((sb->s_root = d_alloc_root(vol->root_ino))) { 2913 if ((sb->s_root = d_alloc_root(vol->root_ino))) {
2932 /* We increment i_count simulating an ntfs_iget(). */ 2914 /* We grab a reference, simulating an ntfs_iget(). */
2933 atomic_inc(&vol->root_ino->i_count); 2915 ihold(vol->root_ino);
2934 ntfs_debug("Exiting, status successful."); 2916 ntfs_debug("Exiting, status successful.");
2935 /* Release the default upcase if it has no users. */ 2917 /* Release the default upcase if it has no users. */
2936 mutex_lock(&ntfs_lock); 2918 mutex_lock(&ntfs_lock);
@@ -2940,7 +2922,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2940 } 2922 }
2941 mutex_unlock(&ntfs_lock); 2923 mutex_unlock(&ntfs_lock);
2942 sb->s_export_op = &ntfs_export_ops; 2924 sb->s_export_op = &ntfs_export_ops;
2943 lock_kernel();
2944 lockdep_on(); 2925 lockdep_on();
2945 return 0; 2926 return 0;
2946 } 2927 }
@@ -3040,24 +3021,8 @@ iput_tmp_ino_err_out_now:
3040 if (vol->mft_ino && vol->mft_ino != tmp_ino) 3021 if (vol->mft_ino && vol->mft_ino != tmp_ino)
3041 iput(vol->mft_ino); 3022 iput(vol->mft_ino);
3042 vol->mft_ino = NULL; 3023 vol->mft_ino = NULL;
3043 /*
3044 * This is needed to get ntfs_clear_extent_inode() called for each
3045 * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
3046 * leak resources and B) a subsequent mount fails automatically due to
3047 * ntfs_iget() never calling down into our ntfs_read_locked_inode()
3048 * method again... FIXME: Do we need to do this twice now because of
3049 * attribute inodes? I think not, so leave as is for now... (AIA)
3050 */
3051 if (invalidate_inodes(sb)) {
3052 ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
3053 "driver bug.");
3054 /* Copied from fs/super.c. I just love this message. (-; */
3055 printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
3056 "seconds. Have a nice day...\n");
3057 }
3058 /* Errors at this stage are irrelevant. */ 3024 /* Errors at this stage are irrelevant. */
3059err_out_now: 3025err_out_now:
3060 lock_kernel();
3061 sb->s_fs_info = NULL; 3026 sb->s_fs_info = NULL;
3062 kfree(vol); 3027 kfree(vol);
3063 ntfs_debug("Failed, returning -EINVAL."); 3028 ntfs_debug("Failed, returning -EINVAL.");
@@ -3094,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache;
3094/* Driver wide mutex. */ 3059/* Driver wide mutex. */
3095DEFINE_MUTEX(ntfs_lock); 3060DEFINE_MUTEX(ntfs_lock);
3096 3061
3097static int ntfs_get_sb(struct file_system_type *fs_type, 3062static struct dentry *ntfs_mount(struct file_system_type *fs_type,
3098 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3063 int flags, const char *dev_name, void *data)
3099{ 3064{
3100 return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super, 3065 return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
3101 mnt);
3102} 3066}
3103 3067
3104static struct file_system_type ntfs_fs_type = { 3068static struct file_system_type ntfs_fs_type = {
3105 .owner = THIS_MODULE, 3069 .owner = THIS_MODULE,
3106 .name = "ntfs", 3070 .name = "ntfs",
3107 .get_sb = ntfs_get_sb, 3071 .mount = ntfs_mount,
3108 .kill_sb = kill_block_super, 3072 .kill_sb = kill_block_super,
3109 .fs_flags = FS_REQUIRES_DEV, 3073 .fs_flags = FS_REQUIRES_DEV,
3110}; 3074};
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5cfeee118158..f1e962cb3b73 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
165 * ocfs2 never allocates in this function - the only time we 165 * ocfs2 never allocates in this function - the only time we
166 * need to use BH_New is when we're extending i_size on a file 166 * need to use BH_New is when we're extending i_size on a file
167 * system which doesn't support holes, in which case BH_New 167 * system which doesn't support holes, in which case BH_New
168 * allows block_prepare_write() to zero. 168 * allows __block_write_begin() to zero.
169 * 169 *
170 * If we see this on a sparse file system, then a truncate has 170 * If we see this on a sparse file system, then a truncate has
171 * raced us and removed the cluster. In this case, we clear 171 * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
407 return ret; 407 return ret;
408} 408}
409 409
410/*
411 * This is called from ocfs2_write_zero_page() which has handled it's
412 * own cluster locking and has ensured allocation exists for those
413 * blocks to be written.
414 */
415int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
416 unsigned from, unsigned to)
417{
418 int ret;
419
420 ret = block_prepare_write(page, from, to, ocfs2_get_block);
421
422 return ret;
423}
424
425/* Taken from ext3. We don't necessarily need the full blown 410/* Taken from ext3. We don't necessarily need the full blown
426 * functionality yet, but IMHO it's better to cut and paste the whole 411 * functionality yet, but IMHO it's better to cut and paste the whole
427 * thing so we can avoid introducing our own bugs (and easily pick up 412 * thing so we can avoid introducing our own bugs (and easily pick up
@@ -732,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
732} 717}
733 718
734/* 719/*
735 * Some of this taken from block_prepare_write(). We already have our 720 * Some of this taken from __block_write_begin(). We already have our
736 * mapping by now though, and the entire write will be allocating or 721 * mapping by now though, and the entire write will be allocating or
737 * it won't, so not much need to use BH_New. 722 * it won't, so not much need to use BH_New.
738 * 723 *
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 7606f663da6d..76bfdfda691a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
22#ifndef OCFS2_AOPS_H 22#ifndef OCFS2_AOPS_H
23#define OCFS2_AOPS_H 23#define OCFS2_AOPS_H
24 24
25int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
26 unsigned from, unsigned to);
27
28handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 25handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
29 struct page *page, 26 struct page *page,
30 unsigned from, 27 unsigned from,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 96fa7ebc530c..15fdbdf9eb4b 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -129,7 +129,7 @@ struct o2net_node {
129 129
130struct o2net_sock_container { 130struct o2net_sock_container {
131 struct kref sc_kref; 131 struct kref sc_kref;
132 /* the next two are vaild for the life time of the sc */ 132 /* the next two are valid for the life time of the sc */
133 struct socket *sc_sock; 133 struct socket *sc_sock;
134 struct o2nm_node *sc_node; 134 struct o2nm_node *sc_node;
135 135
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index c2903b84bb7a..b2df490a19ed 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
400 if (inode) { 400 if (inode) {
401 ip = DLMFS_I(inode); 401 ip = DLMFS_I(inode);
402 402
403 inode->i_ino = get_next_ino();
403 inode->i_mode = mode; 404 inode->i_mode = mode;
404 inode->i_uid = current_fsuid(); 405 inode->i_uid = current_fsuid();
405 inode->i_gid = current_fsgid(); 406 inode->i_gid = current_fsgid();
@@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
425 if (!inode) 426 if (!inode)
426 return NULL; 427 return NULL;
427 428
429 inode->i_ino = get_next_ino();
428 inode->i_mode = mode; 430 inode->i_mode = mode;
429 inode->i_uid = current_fsuid(); 431 inode->i_uid = current_fsuid();
430 inode->i_gid = current_fsgid(); 432 inode->i_gid = current_fsgid();
@@ -612,6 +614,7 @@ static const struct file_operations dlmfs_file_operations = {
612 .poll = dlmfs_file_poll, 614 .poll = dlmfs_file_poll,
613 .read = dlmfs_file_read, 615 .read = dlmfs_file_read,
614 .write = dlmfs_file_write, 616 .write = dlmfs_file_write,
617 .llseek = default_llseek,
615}; 618};
616 619
617static const struct inode_operations dlmfs_dir_inode_operations = { 620static const struct inode_operations dlmfs_dir_inode_operations = {
@@ -640,16 +643,16 @@ static const struct inode_operations dlmfs_file_inode_operations = {
640 .setattr = dlmfs_file_setattr, 643 .setattr = dlmfs_file_setattr,
641}; 644};
642 645
643static int dlmfs_get_sb(struct file_system_type *fs_type, 646static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
644 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 647 int flags, const char *dev_name, void *data)
645{ 648{
646 return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); 649 return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
647} 650}
648 651
649static struct file_system_type dlmfs_fs_type = { 652static struct file_system_type dlmfs_fs_type = {
650 .owner = THIS_MODULE, 653 .owner = THIS_MODULE,
651 .name = "ocfs2_dlmfs", 654 .name = "ocfs2_dlmfs",
652 .get_sb = dlmfs_get_sb, 655 .mount = dlmfs_mount,
653 .kill_sb = kill_litter_super, 656 .kill_sb = kill_litter_super,
654}; 657};
655 658
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9e8cc4346b76..77b4c04a2809 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -187,8 +187,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
187 * platter 187 * platter
188 */ 188 */
189 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 189 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
190 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 190 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
191 NULL, BLKDEV_IFL_WAIT);
192 goto bail; 191 goto bail;
193 } 192 }
194 193
@@ -797,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
797 block_end = block_start + (1 << inode->i_blkbits); 796 block_end = block_start + (1 << inode->i_blkbits);
798 797
799 /* 798 /*
800 * block_start is block-aligned. Bump it by one to 799 * block_start is block-aligned. Bump it by one to force
801 * force ocfs2_{prepare,commit}_write() to zero the 800 * __block_write_begin and block_commit_write to zero the
802 * whole block. 801 * whole block.
803 */ 802 */
804 ret = ocfs2_prepare_write_nolock(inode, page, 803 ret = __block_write_begin(page, block_start + 1, 0,
805 block_start + 1, 804 ocfs2_get_block);
806 block_start + 1);
807 if (ret < 0) { 805 if (ret < 0) {
808 mlog_errno(ret); 806 mlog_errno(ret);
809 goto out_unlock; 807 goto out_unlock;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e7bde21149ae..ff5744e1e36f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -742,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry,
742 goto out_commit; 742 goto out_commit;
743 } 743 }
744 744
745 atomic_inc(&inode->i_count); 745 ihold(inode);
746 dentry->d_op = &ocfs2_dentry_ops; 746 dentry->d_op = &ocfs2_dentry_ops;
747 d_instantiate(dentry, inode); 747 d_instantiate(dentry, inode);
748 748
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d8408217e3bd..1efea3615589 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,9 @@ struct ocfs2_lock_res {
159 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
160 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
161 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
162 unsigned char l_level; 162 char l_level;
163 char l_requested;
164 char l_blocking;
163 165
164 /* Data packed - type enum ocfs2_lock_type */ 166 /* Data packed - type enum ocfs2_lock_type */
165 unsigned char l_type; 167 unsigned char l_type;
@@ -169,8 +171,6 @@ struct ocfs2_lock_res {
169 unsigned char l_action; 171 unsigned char l_action;
170 /* Data packed - enum type ocfs2_unlock_action */ 172 /* Data packed - enum type ocfs2_unlock_action */
171 unsigned char l_unlock_action; 173 unsigned char l_unlock_action;
172 unsigned char l_requested;
173 unsigned char l_blocking;
174 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
175 175
176 spinlock_t l_lock; 176 spinlock_t l_lock;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca0688..252e7c82f929 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/smp_lock.h>
26#include <linux/reboot.h> 25#include <linux/reboot.h>
27#include <asm/uaccess.h> 26#include <asm/uaccess.h>
28 27
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
612 return -ENOMEM; 611 return -ENOMEM;
613 p->op_this_node = -1; 612 p->op_this_node = -1;
614 613
615 lock_kernel();
616 mutex_lock(&ocfs2_control_lock); 614 mutex_lock(&ocfs2_control_lock);
617 file->private_data = p; 615 file->private_data = p;
618 list_add(&p->op_list, &ocfs2_control_private_list); 616 list_add(&p->op_list, &ocfs2_control_private_list);
619 mutex_unlock(&ocfs2_control_lock); 617 mutex_unlock(&ocfs2_control_lock);
620 unlock_kernel();
621 618
622 return 0; 619 return 0;
623} 620}
@@ -628,6 +625,7 @@ static const struct file_operations ocfs2_control_fops = {
628 .read = ocfs2_control_read, 625 .read = ocfs2_control_read,
629 .write = ocfs2_control_write, 626 .write = ocfs2_control_write,
630 .owner = THIS_MODULE, 627 .owner = THIS_MODULE,
628 .llseek = default_llseek,
631}; 629};
632 630
633static struct miscdevice ocfs2_control_device = { 631static struct miscdevice ocfs2_control_device = {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a8a0ca44f88f..f02c0ef31578 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -630,8 +630,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
630 struct ocfs2_super *osb = OCFS2_SB(sb); 630 struct ocfs2_super *osb = OCFS2_SB(sb);
631 u32 tmp; 631 u32 tmp;
632 632
633 lock_kernel();
634
635 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 633 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
636 !ocfs2_check_set_options(sb, &parsed_options)) { 634 !ocfs2_check_set_options(sb, &parsed_options)) {
637 ret = -EINVAL; 635 ret = -EINVAL;
@@ -739,7 +737,6 @@ unlock_osb:
739 MS_POSIXACL : 0); 737 MS_POSIXACL : 0);
740 } 738 }
741out: 739out:
742 unlock_kernel();
743 return ret; 740 return ret;
744} 741}
745 742
@@ -1239,14 +1236,12 @@ read_super_error:
1239 return status; 1236 return status;
1240} 1237}
1241 1238
1242static int ocfs2_get_sb(struct file_system_type *fs_type, 1239static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1243 int flags, 1240 int flags,
1244 const char *dev_name, 1241 const char *dev_name,
1245 void *data, 1242 void *data)
1246 struct vfsmount *mnt)
1247{ 1243{
1248 return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super, 1244 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1249 mnt);
1250} 1245}
1251 1246
1252static void ocfs2_kill_sb(struct super_block *sb) 1247static void ocfs2_kill_sb(struct super_block *sb)
@@ -1270,8 +1265,7 @@ out:
1270static struct file_system_type ocfs2_fs_type = { 1265static struct file_system_type ocfs2_fs_type = {
1271 .owner = THIS_MODULE, 1266 .owner = THIS_MODULE,
1272 .name = "ocfs2", 1267 .name = "ocfs2",
1273 .get_sb = ocfs2_get_sb, /* is this called when we mount 1268 .mount = ocfs2_mount,
1274 * the fs? */
1275 .kill_sb = ocfs2_kill_sb, 1269 .kill_sb = ocfs2_kill_sb,
1276 1270
1277 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1271 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
@@ -1696,13 +1690,9 @@ static void ocfs2_put_super(struct super_block *sb)
1696{ 1690{
1697 mlog_entry("(0x%p)\n", sb); 1691 mlog_entry("(0x%p)\n", sb);
1698 1692
1699 lock_kernel();
1700
1701 ocfs2_sync_blockdev(sb); 1693 ocfs2_sync_blockdev(sb);
1702 ocfs2_dismount_volume(sb, 0); 1694 ocfs2_dismount_volume(sb, 0);
1703 1695
1704 unlock_kernel();
1705
1706 mlog_exit_void(); 1696 mlog_exit_void();
1707} 1697}
1708 1698
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 14a22863291a..e043c4cb9a97 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -557,17 +557,16 @@ end:
557 return ret; 557 return ret;
558} 558}
559 559
560static int omfs_get_sb(struct file_system_type *fs_type, 560static struct dentry *omfs_mount(struct file_system_type *fs_type,
561 int flags, const char *dev_name, 561 int flags, const char *dev_name, void *data)
562 void *data, struct vfsmount *m)
563{ 562{
564 return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m); 563 return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
565} 564}
566 565
567static struct file_system_type omfs_fs_type = { 566static struct file_system_type omfs_fs_type = {
568 .owner = THIS_MODULE, 567 .owner = THIS_MODULE,
569 .name = "omfs", 568 .name = "omfs",
570 .get_sb = omfs_get_sb, 569 .mount = omfs_mount,
571 .kill_sb = kill_block_super, 570 .kill_sb = kill_block_super,
572 .fs_flags = FS_REQUIRES_DEV, 571 .fs_flags = FS_REQUIRES_DEV,
573}; 572};
diff --git a/fs/open.c b/fs/open.c
index d74e1983e8dc..4197b9ed023d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -786,11 +786,11 @@ struct file *nameidata_to_filp(struct nameidata *nd)
786 /* Pick up the filp from the open intent */ 786 /* Pick up the filp from the open intent */
787 filp = nd->intent.open.file; 787 filp = nd->intent.open.file;
788 /* Has the filesystem initialised the file for us? */ 788 /* Has the filesystem initialised the file for us? */
789 if (filp->f_path.dentry == NULL) 789 if (filp->f_path.dentry == NULL) {
790 path_get(&nd->path);
790 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, 791 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
791 NULL, cred); 792 NULL, cred);
792 else 793 }
793 path_put(&nd->path);
794 return filp; 794 return filp;
795} 795}
796 796
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ffcd04f0012c..911e61f348fc 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -415,16 +415,16 @@ out_no_root:
415 return ret; 415 return ret;
416} 416}
417 417
418static int openprom_get_sb(struct file_system_type *fs_type, 418static struct dentry *openprom_mount(struct file_system_type *fs_type,
419 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 419 int flags, const char *dev_name, void *data)
420{ 420{
421 return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt); 421 return mount_single(fs_type, flags, data, openprom_fill_super);
422} 422}
423 423
424static struct file_system_type openprom_fs_type = { 424static struct file_system_type openprom_fs_type = {
425 .owner = THIS_MODULE, 425 .owner = THIS_MODULE,
426 .name = "openpromfs", 426 .name = "openpromfs",
427 .get_sb = openprom_get_sb, 427 .mount = openprom_mount,
428 .kill_sb = kill_anon_super, 428 .kill_sb = kill_anon_super,
429}; 429};
430 430
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..0a8b0ad0c7e2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -352,6 +352,7 @@ static void part_release(struct device *dev)
352{ 352{
353 struct hd_struct *p = dev_to_part(dev); 353 struct hd_struct *p = dev_to_part(dev);
354 free_part_stats(p); 354 free_part_stats(p);
355 free_part_info(p);
355 kfree(p); 356 kfree(p);
356} 357}
357 358
@@ -401,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
401 whole_disk_show, NULL); 402 whole_disk_show, NULL);
402 403
403struct hd_struct *add_partition(struct gendisk *disk, int partno, 404struct hd_struct *add_partition(struct gendisk *disk, int partno,
404 sector_t start, sector_t len, int flags) 405 sector_t start, sector_t len, int flags,
406 struct partition_meta_info *info)
405{ 407{
406 struct hd_struct *p; 408 struct hd_struct *p;
407 dev_t devt = MKDEV(0, 0); 409 dev_t devt = MKDEV(0, 0);
@@ -438,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
438 p->partno = partno; 440 p->partno = partno;
439 p->policy = get_disk_ro(disk); 441 p->policy = get_disk_ro(disk);
440 442
443 if (info) {
444 struct partition_meta_info *pinfo = alloc_part_info(disk);
445 if (!pinfo)
446 goto out_free_stats;
447 memcpy(pinfo, info, sizeof(*info));
448 p->info = pinfo;
449 }
450
441 dname = dev_name(ddev); 451 dname = dev_name(ddev);
442 if (isdigit(dname[strlen(dname) - 1])) 452 if (isdigit(dname[strlen(dname) - 1]))
443 dev_set_name(pdev, "%sp%d", dname, partno); 453 dev_set_name(pdev, "%sp%d", dname, partno);
@@ -451,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
451 461
452 err = blk_alloc_devt(p, &devt); 462 err = blk_alloc_devt(p, &devt);
453 if (err) 463 if (err)
454 goto out_free_stats; 464 goto out_free_info;
455 pdev->devt = devt; 465 pdev->devt = devt;
456 466
457 /* delay uevent until 'holders' subdir is created */ 467 /* delay uevent until 'holders' subdir is created */
@@ -481,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
481 491
482 return p; 492 return p;
483 493
494out_free_info:
495 free_part_info(p);
484out_free_stats: 496out_free_stats:
485 free_part_stats(p); 497 free_part_stats(p);
486out_free: 498out_free:
@@ -513,14 +525,14 @@ void register_disk(struct gendisk *disk)
513 525
514 if (device_add(ddev)) 526 if (device_add(ddev))
515 return; 527 return;
516#ifndef CONFIG_SYSFS_DEPRECATED 528 if (!sysfs_deprecated) {
517 err = sysfs_create_link(block_depr, &ddev->kobj, 529 err = sysfs_create_link(block_depr, &ddev->kobj,
518 kobject_name(&ddev->kobj)); 530 kobject_name(&ddev->kobj));
519 if (err) { 531 if (err) {
520 device_del(ddev); 532 device_del(ddev);
521 return; 533 return;
534 }
522 } 535 }
523#endif
524 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); 536 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
525 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); 537 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
526 538
@@ -642,6 +654,7 @@ rescan:
642 /* add partitions */ 654 /* add partitions */
643 for (p = 1; p < state->limit; p++) { 655 for (p = 1; p < state->limit; p++) {
644 sector_t size, from; 656 sector_t size, from;
657 struct partition_meta_info *info = NULL;
645 658
646 size = state->parts[p].size; 659 size = state->parts[p].size;
647 if (!size) 660 if (!size)
@@ -675,8 +688,12 @@ rescan:
675 size = get_capacity(disk) - from; 688 size = get_capacity(disk) - from;
676 } 689 }
677 } 690 }
691
692 if (state->parts[p].has_info)
693 info = &state->parts[p].info;
678 part = add_partition(disk, p, from, size, 694 part = add_partition(disk, p, from, size,
679 state->parts[p].flags); 695 state->parts[p].flags,
696 &state->parts[p].info);
680 if (IS_ERR(part)) { 697 if (IS_ERR(part)) {
681 printk(KERN_ERR " %s: p%d could not be added: %ld\n", 698 printk(KERN_ERR " %s: p%d could not be added: %ld\n",
682 disk->disk_name, p, -PTR_ERR(part)); 699 disk->disk_name, p, -PTR_ERR(part));
@@ -737,8 +754,7 @@ void del_gendisk(struct gendisk *disk)
737 kobject_put(disk->part0.holder_dir); 754 kobject_put(disk->part0.holder_dir);
738 kobject_put(disk->slave_dir); 755 kobject_put(disk->slave_dir);
739 disk->driverfs_dev = NULL; 756 disk->driverfs_dev = NULL;
740#ifndef CONFIG_SYSFS_DEPRECATED 757 if (!sysfs_deprecated)
741 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); 758 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
742#endif
743 device_del(disk_to_dev(disk)); 759 device_del(disk_to_dev(disk));
744} 760}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 8e4e103ba216..d68bf4dc3bc2 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
1#include <linux/pagemap.h> 1#include <linux/pagemap.h>
2#include <linux/blkdev.h> 2#include <linux/blkdev.h>
3#include <linux/genhd.h>
3 4
4/* 5/*
5 * add_gd_partition adds a partitions details to the devices partition 6 * add_gd_partition adds a partitions details to the devices partition
@@ -12,6 +13,8 @@ struct parsed_partitions {
12 sector_t from; 13 sector_t from;
13 sector_t size; 14 sector_t size;
14 int flags; 15 int flags;
16 bool has_info;
17 struct partition_meta_info info;
15 } parts[DISK_MAX_PARTS]; 18 } parts[DISK_MAX_PARTS];
16 int next; 19 int next;
17 int limit; 20 int limit;
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index dbb44d4bb8a7..ac0ccb5026a2 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
94 * 94 *
95 ************************************************************/ 95 ************************************************************/
96#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/ctype.h>
97#include <linux/math64.h> 98#include <linux/math64.h>
98#include <linux/slab.h> 99#include <linux/slab.h>
99#include "check.h" 100#include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
604 gpt_entry *ptes = NULL; 605 gpt_entry *ptes = NULL;
605 u32 i; 606 u32 i;
606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512; 607 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
608 u8 unparsed_guid[37];
607 609
608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { 610 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
609 kfree(gpt); 611 kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
614 pr_debug("GUID Partition Table is valid! Yea!\n"); 616 pr_debug("GUID Partition Table is valid! Yea!\n");
615 617
616 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 618 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
619 struct partition_meta_info *info;
620 unsigned label_count = 0;
621 unsigned label_max;
617 u64 start = le64_to_cpu(ptes[i].starting_lba); 622 u64 start = le64_to_cpu(ptes[i].starting_lba);
618 u64 size = le64_to_cpu(ptes[i].ending_lba) - 623 u64 size = le64_to_cpu(ptes[i].ending_lba) -
619 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 624 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state)
627 if (!efi_guidcmp(ptes[i].partition_type_guid, 632 if (!efi_guidcmp(ptes[i].partition_type_guid,
628 PARTITION_LINUX_RAID_GUID)) 633 PARTITION_LINUX_RAID_GUID))
629 state->parts[i + 1].flags = ADDPART_FLAG_RAID; 634 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
635
636 info = &state->parts[i + 1].info;
637 /* Instead of doing a manual swap to big endian, reuse the
638 * common ASCII hex format as the interim.
639 */
640 efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
641 part_pack_uuid(unparsed_guid, info->uuid);
642
643 /* Naively convert UTF16-LE to 7 bits. */
644 label_max = min(sizeof(info->volname) - 1,
645 sizeof(ptes[i].partition_name));
646 info->volname[label_max] = 0;
647 while (label_count < label_max) {
648 u8 c = ptes[i].partition_name[label_count] & 0xff;
649 if (c && !isprint(c))
650 c = '!';
651 info->volname[label_count] = c;
652 label_count++;
653 }
654 state->parts[i + 1].has_info = true;
630 } 655 }
631 kfree(ptes); 656 kfree(ptes);
632 kfree(gpt); 657 kfree(gpt);
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 5bf8a04b5d9b..789c625c7aa5 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it under 10 * This program is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU General Public License as published by the Free Software 11 * the terms of the GNU General Public License as published by the Free Software
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d1fb50b28d86..374242c0971a 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it 10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free 11 * under the terms of the GNU General Public License as published by the Free
diff --git a/fs/pipe.c b/fs/pipe.c
index 279eef96c51c..a8012a955720 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -382,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
382 error = ops->confirm(pipe, buf); 382 error = ops->confirm(pipe, buf);
383 if (error) { 383 if (error) {
384 if (!ret) 384 if (!ret)
385 error = ret; 385 ret = error;
386 break; 386 break;
387 } 387 }
388 388
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
954 if (!inode) 954 if (!inode)
955 goto fail_inode; 955 goto fail_inode;
956 956
957 inode->i_ino = get_next_ino();
958
957 pipe = alloc_pipe_info(inode); 959 pipe = alloc_pipe_info(inode);
958 if (!pipe) 960 if (!pipe)
959 goto fail_iput; 961 goto fail_iput;
@@ -1245,16 +1247,15 @@ out:
1245 * any operations on the root directory. However, we need a non-trivial 1247 * any operations on the root directory. However, we need a non-trivial
1246 * d_name - pipe: will go nicely and kill the special-casing in procfs. 1248 * d_name - pipe: will go nicely and kill the special-casing in procfs.
1247 */ 1249 */
1248static int pipefs_get_sb(struct file_system_type *fs_type, 1250static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1249 int flags, const char *dev_name, void *data, 1251 int flags, const char *dev_name, void *data)
1250 struct vfsmount *mnt)
1251{ 1252{
1252 return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); 1253 return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
1253} 1254}
1254 1255
1255static struct file_system_type pipe_fs_type = { 1256static struct file_system_type pipe_fs_type = {
1256 .name = "pipefs", 1257 .name = "pipefs",
1257 .get_sb = pipefs_get_sb, 1258 .mount = pipefs_mount,
1258 .kill_sb = kill_anon_super, 1259 .kill_sb = kill_anon_super,
1259}; 1260};
1260 1261
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f06..6a0068841d96 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -33,8 +33,8 @@ config PROC_KCORE
33 depends on PROC_FS && MMU 33 depends on PROC_FS && MMU
34 34
35config PROC_VMCORE 35config PROC_VMCORE
36 bool "/proc/vmcore support (EXPERIMENTAL)" 36 bool "/proc/vmcore support"
37 depends on PROC_FS && CRASH_DUMP 37 depends on PROC_FS && CRASH_DUMP
38 default y 38 default y
39 help 39 help
40 Exports the dump image of crashed kernel in ELF format. 40 Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e4addaa5424..f3d02ca461ec 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
226{ 226{
227 struct mm_struct *mm; 227 struct mm_struct *mm;
228 228
229 if (mutex_lock_killable(&task->cred_guard_mutex)) 229 if (mutex_lock_killable(&task->signal->cred_guard_mutex))
230 return NULL; 230 return NULL;
231 231
232 mm = get_task_mm(task); 232 mm = get_task_mm(task);
@@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
235 mmput(mm); 235 mmput(mm);
236 mm = NULL; 236 mm = NULL;
237 } 237 }
238 mutex_unlock(&task->cred_guard_mutex); 238 mutex_unlock(&task->signal->cred_guard_mutex);
239 239
240 return mm; 240 return mm;
241} 241}
@@ -771,6 +771,8 @@ static const struct file_operations proc_single_file_operations = {
771static int mem_open(struct inode* inode, struct file* file) 771static int mem_open(struct inode* inode, struct file* file)
772{ 772{
773 file->private_data = (void*)((long)current->self_exec_id); 773 file->private_data = (void*)((long)current->self_exec_id);
774 /* OK to pass negative loff_t, we can catch out-of-range */
775 file->f_mode |= FMODE_UNSIGNED_OFFSET;
774 return 0; 776 return 0;
775} 777}
776 778
@@ -1023,28 +1025,47 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1023 memset(buffer, 0, sizeof(buffer)); 1025 memset(buffer, 0, sizeof(buffer));
1024 if (count > sizeof(buffer) - 1) 1026 if (count > sizeof(buffer) - 1)
1025 count = sizeof(buffer) - 1; 1027 count = sizeof(buffer) - 1;
1026 if (copy_from_user(buffer, buf, count)) 1028 if (copy_from_user(buffer, buf, count)) {
1027 return -EFAULT; 1029 err = -EFAULT;
1030 goto out;
1031 }
1028 1032
1029 err = strict_strtol(strstrip(buffer), 0, &oom_adjust); 1033 err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
1030 if (err) 1034 if (err)
1031 return -EINVAL; 1035 goto out;
1032 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && 1036 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1033 oom_adjust != OOM_DISABLE) 1037 oom_adjust != OOM_DISABLE) {
1034 return -EINVAL; 1038 err = -EINVAL;
1039 goto out;
1040 }
1035 1041
1036 task = get_proc_task(file->f_path.dentry->d_inode); 1042 task = get_proc_task(file->f_path.dentry->d_inode);
1037 if (!task) 1043 if (!task) {
1038 return -ESRCH; 1044 err = -ESRCH;
1045 goto out;
1046 }
1047
1048 task_lock(task);
1049 if (!task->mm) {
1050 err = -EINVAL;
1051 goto err_task_lock;
1052 }
1053
1039 if (!lock_task_sighand(task, &flags)) { 1054 if (!lock_task_sighand(task, &flags)) {
1040 put_task_struct(task); 1055 err = -ESRCH;
1041 return -ESRCH; 1056 goto err_task_lock;
1042 } 1057 }
1043 1058
1044 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { 1059 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1045 unlock_task_sighand(task, &flags); 1060 err = -EACCES;
1046 put_task_struct(task); 1061 goto err_sighand;
1047 return -EACCES; 1062 }
1063
1064 if (oom_adjust != task->signal->oom_adj) {
1065 if (oom_adjust == OOM_DISABLE)
1066 atomic_inc(&task->mm->oom_disable_count);
1067 if (task->signal->oom_adj == OOM_DISABLE)
1068 atomic_dec(&task->mm->oom_disable_count);
1048 } 1069 }
1049 1070
1050 /* 1071 /*
@@ -1065,10 +1086,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1065 else 1086 else
1066 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / 1087 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
1067 -OOM_DISABLE; 1088 -OOM_DISABLE;
1089err_sighand:
1068 unlock_task_sighand(task, &flags); 1090 unlock_task_sighand(task, &flags);
1091err_task_lock:
1092 task_unlock(task);
1069 put_task_struct(task); 1093 put_task_struct(task);
1070 1094out:
1071 return count; 1095 return err < 0 ? err : count;
1072} 1096}
1073 1097
1074static const struct file_operations proc_oom_adjust_operations = { 1098static const struct file_operations proc_oom_adjust_operations = {
@@ -1109,30 +1133,49 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1109 memset(buffer, 0, sizeof(buffer)); 1133 memset(buffer, 0, sizeof(buffer));
1110 if (count > sizeof(buffer) - 1) 1134 if (count > sizeof(buffer) - 1)
1111 count = sizeof(buffer) - 1; 1135 count = sizeof(buffer) - 1;
1112 if (copy_from_user(buffer, buf, count)) 1136 if (copy_from_user(buffer, buf, count)) {
1113 return -EFAULT; 1137 err = -EFAULT;
1138 goto out;
1139 }
1114 1140
1115 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); 1141 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
1116 if (err) 1142 if (err)
1117 return -EINVAL; 1143 goto out;
1118 if (oom_score_adj < OOM_SCORE_ADJ_MIN || 1144 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1119 oom_score_adj > OOM_SCORE_ADJ_MAX) 1145 oom_score_adj > OOM_SCORE_ADJ_MAX) {
1120 return -EINVAL; 1146 err = -EINVAL;
1147 goto out;
1148 }
1121 1149
1122 task = get_proc_task(file->f_path.dentry->d_inode); 1150 task = get_proc_task(file->f_path.dentry->d_inode);
1123 if (!task) 1151 if (!task) {
1124 return -ESRCH; 1152 err = -ESRCH;
1153 goto out;
1154 }
1155
1156 task_lock(task);
1157 if (!task->mm) {
1158 err = -EINVAL;
1159 goto err_task_lock;
1160 }
1161
1125 if (!lock_task_sighand(task, &flags)) { 1162 if (!lock_task_sighand(task, &flags)) {
1126 put_task_struct(task); 1163 err = -ESRCH;
1127 return -ESRCH; 1164 goto err_task_lock;
1128 } 1165 }
1166
1129 if (oom_score_adj < task->signal->oom_score_adj && 1167 if (oom_score_adj < task->signal->oom_score_adj &&
1130 !capable(CAP_SYS_RESOURCE)) { 1168 !capable(CAP_SYS_RESOURCE)) {
1131 unlock_task_sighand(task, &flags); 1169 err = -EACCES;
1132 put_task_struct(task); 1170 goto err_sighand;
1133 return -EACCES;
1134 } 1171 }
1135 1172
1173 if (oom_score_adj != task->signal->oom_score_adj) {
1174 if (oom_score_adj == OOM_SCORE_ADJ_MIN)
1175 atomic_inc(&task->mm->oom_disable_count);
1176 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1177 atomic_dec(&task->mm->oom_disable_count);
1178 }
1136 task->signal->oom_score_adj = oom_score_adj; 1179 task->signal->oom_score_adj = oom_score_adj;
1137 /* 1180 /*
1138 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is 1181 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
@@ -1143,14 +1186,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1143 else 1186 else
1144 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / 1187 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
1145 OOM_SCORE_ADJ_MAX; 1188 OOM_SCORE_ADJ_MAX;
1189err_sighand:
1146 unlock_task_sighand(task, &flags); 1190 unlock_task_sighand(task, &flags);
1191err_task_lock:
1192 task_unlock(task);
1147 put_task_struct(task); 1193 put_task_struct(task);
1148 return count; 1194out:
1195 return err < 0 ? err : count;
1149} 1196}
1150 1197
1151static const struct file_operations proc_oom_score_adj_operations = { 1198static const struct file_operations proc_oom_score_adj_operations = {
1152 .read = oom_score_adj_read, 1199 .read = oom_score_adj_read,
1153 .write = oom_score_adj_write, 1200 .write = oom_score_adj_write,
1201 .llseek = default_llseek,
1154}; 1202};
1155 1203
1156#ifdef CONFIG_AUDITSYSCALL 1204#ifdef CONFIG_AUDITSYSCALL
@@ -1600,6 +1648,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1600 1648
1601 /* Common stuff */ 1649 /* Common stuff */
1602 ei = PROC_I(inode); 1650 ei = PROC_I(inode);
1651 inode->i_ino = get_next_ino();
1603 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1652 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1604 inode->i_op = &proc_def_inode_operations; 1653 inode->i_op = &proc_def_inode_operations;
1605 1654
@@ -2039,11 +2088,13 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
2039static const struct file_operations proc_fdinfo_file_operations = { 2088static const struct file_operations proc_fdinfo_file_operations = {
2040 .open = nonseekable_open, 2089 .open = nonseekable_open,
2041 .read = proc_fdinfo_read, 2090 .read = proc_fdinfo_read,
2091 .llseek = no_llseek,
2042}; 2092};
2043 2093
2044static const struct file_operations proc_fd_operations = { 2094static const struct file_operations proc_fd_operations = {
2045 .read = generic_read_dir, 2095 .read = generic_read_dir,
2046 .readdir = proc_readfd, 2096 .readdir = proc_readfd,
2097 .llseek = default_llseek,
2047}; 2098};
2048 2099
2049/* 2100/*
@@ -2112,6 +2163,7 @@ static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
2112static const struct file_operations proc_fdinfo_operations = { 2163static const struct file_operations proc_fdinfo_operations = {
2113 .read = generic_read_dir, 2164 .read = generic_read_dir,
2114 .readdir = proc_readfdinfo, 2165 .readdir = proc_readfdinfo,
2166 .llseek = default_llseek,
2115}; 2167};
2116 2168
2117/* 2169/*
@@ -2302,14 +2354,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2302 goto out_free; 2354 goto out_free;
2303 2355
2304 /* Guard against adverse ptrace interaction */ 2356 /* Guard against adverse ptrace interaction */
2305 length = mutex_lock_interruptible(&task->cred_guard_mutex); 2357 length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
2306 if (length < 0) 2358 if (length < 0)
2307 goto out_free; 2359 goto out_free;
2308 2360
2309 length = security_setprocattr(task, 2361 length = security_setprocattr(task,
2310 (char*)file->f_path.dentry->d_name.name, 2362 (char*)file->f_path.dentry->d_name.name,
2311 (void*)page, count); 2363 (void*)page, count);
2312 mutex_unlock(&task->cred_guard_mutex); 2364 mutex_unlock(&task->signal->cred_guard_mutex);
2313out_free: 2365out_free:
2314 free_page((unsigned long) page); 2366 free_page((unsigned long) page);
2315out: 2367out:
@@ -2343,6 +2395,7 @@ static int proc_attr_dir_readdir(struct file * filp,
2343static const struct file_operations proc_attr_dir_operations = { 2395static const struct file_operations proc_attr_dir_operations = {
2344 .read = generic_read_dir, 2396 .read = generic_read_dir,
2345 .readdir = proc_attr_dir_readdir, 2397 .readdir = proc_attr_dir_readdir,
2398 .llseek = default_llseek,
2346}; 2399};
2347 2400
2348static struct dentry *proc_attr_dir_lookup(struct inode *dir, 2401static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@ -2542,6 +2595,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2542 2595
2543 /* Initialize the inode */ 2596 /* Initialize the inode */
2544 ei = PROC_I(inode); 2597 ei = PROC_I(inode);
2598 inode->i_ino = get_next_ino();
2545 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 2599 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2546 2600
2547 /* 2601 /*
@@ -2751,6 +2805,7 @@ static int proc_tgid_base_readdir(struct file * filp,
2751static const struct file_operations proc_tgid_base_operations = { 2805static const struct file_operations proc_tgid_base_operations = {
2752 .read = generic_read_dir, 2806 .read = generic_read_dir,
2753 .readdir = proc_tgid_base_readdir, 2807 .readdir = proc_tgid_base_readdir,
2808 .llseek = default_llseek,
2754}; 2809};
2755 2810
2756static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ 2811static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -3088,6 +3143,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
3088static const struct file_operations proc_tid_base_operations = { 3143static const struct file_operations proc_tid_base_operations = {
3089 .read = generic_read_dir, 3144 .read = generic_read_dir,
3090 .readdir = proc_tid_base_readdir, 3145 .readdir = proc_tid_base_readdir,
3146 .llseek = default_llseek,
3091}; 3147};
3092 3148
3093static const struct inode_operations proc_tid_base_inode_operations = { 3149static const struct inode_operations proc_tid_base_inode_operations = {
@@ -3324,4 +3380,5 @@ static const struct inode_operations proc_task_inode_operations = {
3324static const struct file_operations proc_task_operations = { 3380static const struct file_operations proc_task_operations = {
3325 .read = generic_read_dir, 3381 .read = generic_read_dir,
3326 .readdir = proc_task_readdir, 3382 .readdir = proc_task_readdir,
3383 .llseek = default_llseek,
3327}; 3384};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5be436ea088e..b652cb00906b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
23 if (!inode) 23 if (!inode)
24 goto out; 24 goto out;
25 25
26 inode->i_ino = get_next_ino();
27
26 sysctl_head_get(head); 28 sysctl_head_get(head);
27 ei = PROC_I(inode); 29 ei = PROC_I(inode);
28 ei->sysctl = head; 30 ei->sysctl = head;
@@ -364,6 +366,7 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
364static const struct file_operations proc_sys_file_operations = { 366static const struct file_operations proc_sys_file_operations = {
365 .read = proc_sys_read, 367 .read = proc_sys_read,
366 .write = proc_sys_write, 368 .write = proc_sys_write,
369 .llseek = default_llseek,
367}; 370};
368 371
369static const struct file_operations proc_sys_dir_file_operations = { 372static const struct file_operations proc_sys_dir_file_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384ed22d..ef9fa8e24ad6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data)
35 return set_anon_super(sb, NULL); 35 return set_anon_super(sb, NULL);
36} 36}
37 37
38static int proc_get_sb(struct file_system_type *fs_type, 38static struct dentry *proc_mount(struct file_system_type *fs_type,
39 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 39 int flags, const char *dev_name, void *data)
40{ 40{
41 int err; 41 int err;
42 struct super_block *sb; 42 struct super_block *sb;
@@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type,
61 61
62 sb = sget(fs_type, proc_test_super, proc_set_super, ns); 62 sb = sget(fs_type, proc_test_super, proc_set_super, ns);
63 if (IS_ERR(sb)) 63 if (IS_ERR(sb))
64 return PTR_ERR(sb); 64 return ERR_CAST(sb);
65 65
66 if (!sb->s_root) { 66 if (!sb->s_root) {
67 sb->s_flags = flags; 67 sb->s_flags = flags;
68 err = proc_fill_super(sb); 68 err = proc_fill_super(sb);
69 if (err) { 69 if (err) {
70 deactivate_locked_super(sb); 70 deactivate_locked_super(sb);
71 return err; 71 return ERR_PTR(err);
72 } 72 }
73 73
74 ei = PROC_I(sb->s_root->d_inode); 74 ei = PROC_I(sb->s_root->d_inode);
@@ -79,11 +79,9 @@ static int proc_get_sb(struct file_system_type *fs_type,
79 } 79 }
80 80
81 sb->s_flags |= MS_ACTIVE; 81 sb->s_flags |= MS_ACTIVE;
82 ns->proc_mnt = mnt;
83 } 82 }
84 83
85 simple_set_mnt(mnt, sb); 84 return dget(sb->s_root);
86 return 0;
87} 85}
88 86
89static void proc_kill_sb(struct super_block *sb) 87static void proc_kill_sb(struct super_block *sb)
@@ -97,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb)
97 95
98static struct file_system_type proc_fs_type = { 96static struct file_system_type proc_fs_type = {
99 .name = "proc", 97 .name = "proc",
100 .get_sb = proc_get_sb, 98 .mount = proc_mount,
101 .kill_sb = proc_kill_sb, 99 .kill_sb = proc_kill_sb,
102}; 100};
103 101
@@ -115,6 +113,7 @@ void __init proc_root_init(void)
115 return; 113 return;
116 } 114 }
117 115
116 init_pid_ns.proc_mnt = proc_mnt;
118 proc_symlink("mounts", NULL, "self/mounts"); 117 proc_symlink("mounts", NULL, "self/mounts");
119 118
120 proc_net_init(); 119 proc_net_init();
@@ -179,6 +178,7 @@ static int proc_root_readdir(struct file * filp,
179static const struct file_operations proc_root_operations = { 178static const struct file_operations proc_root_operations = {
180 .read = generic_read_dir, 179 .read = generic_read_dir,
181 .readdir = proc_root_readdir, 180 .readdir = proc_root_readdir,
181 .llseek = default_llseek,
182}; 182};
183 183
184/* 184/*
@@ -212,6 +212,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
212 if (IS_ERR(mnt)) 212 if (IS_ERR(mnt))
213 return PTR_ERR(mnt); 213 return PTR_ERR(mnt);
214 214
215 ns->proc_mnt = mnt;
215 return 0; 216 return 0;
216} 217}
217 218
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f17..37994737c983 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,13 +10,13 @@ static int show_softirqs(struct seq_file *p, void *v)
10{ 10{
11 int i, j; 11 int i, j;
12 12
13 seq_printf(p, " "); 13 seq_printf(p, " ");
14 for_each_possible_cpu(i) 14 for_each_possible_cpu(i)
15 seq_printf(p, "CPU%-8d", i); 15 seq_printf(p, "CPU%-8d", i);
16 seq_printf(p, "\n"); 16 seq_printf(p, "\n");
17 17
18 for (i = 0; i < NR_SOFTIRQS; i++) { 18 for (i = 0; i < NR_SOFTIRQS; i++) {
19 seq_printf(p, "%8s:", softirq_to_name[i]); 19 seq_printf(p, "%12s:", softirq_to_name[i]);
20 for_each_possible_cpu(j) 20 for_each_possible_cpu(j)
21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); 21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
22 seq_printf(p, "\n"); 22 seq_printf(p, "\n");
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf31b03fc275..e15a19c93bae 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v)
31 u64 sum_softirq = 0; 31 u64 sum_softirq = 0;
32 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; 32 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
33 struct timespec boottime; 33 struct timespec boottime;
34 unsigned int per_irq_sum;
35 34
36 user = nice = system = idle = iowait = 35 user = nice = system = idle = iowait =
37 irq = softirq = steal = cputime64_zero; 36 irq = softirq = steal = cputime64_zero;
@@ -52,9 +51,7 @@ static int show_stat(struct seq_file *p, void *v)
52 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 51 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
53 guest_nice = cputime64_add(guest_nice, 52 guest_nice = cputime64_add(guest_nice,
54 kstat_cpu(i).cpustat.guest_nice); 53 kstat_cpu(i).cpustat.guest_nice);
55 for_each_irq_nr(j) { 54 sum += kstat_cpu_irqs_sum(i);
56 sum += kstat_irqs_cpu(j, i);
57 }
58 sum += arch_irq_stat_cpu(i); 55 sum += arch_irq_stat_cpu(i);
59 56
60 for (j = 0; j < NR_SOFTIRQS; j++) { 57 for (j = 0; j < NR_SOFTIRQS; j++) {
@@ -110,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v)
110 seq_printf(p, "intr %llu", (unsigned long long)sum); 107 seq_printf(p, "intr %llu", (unsigned long long)sum);
111 108
112 /* sum again ? it could be updated? */ 109 /* sum again ? it could be updated? */
113 for_each_irq_nr(j) { 110 for_each_irq_nr(j)
114 per_irq_sum = 0; 111 seq_printf(p, " %u", kstat_irqs(j));
115 for_each_possible_cpu(i)
116 per_irq_sum += kstat_irqs_cpu(j, i);
117
118 seq_printf(p, " %u", per_irq_sum);
119 }
120 112
121 seq_printf(p, 113 seq_printf(p,
122 "\nctxt %llu\n" 114 "\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1dbca4e8cc16..da6b01d70f01 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -327,6 +327,7 @@ struct mem_size_stats {
327 unsigned long private_clean; 327 unsigned long private_clean;
328 unsigned long private_dirty; 328 unsigned long private_dirty;
329 unsigned long referenced; 329 unsigned long referenced;
330 unsigned long anonymous;
330 unsigned long swap; 331 unsigned long swap;
331 u64 pss; 332 u64 pss;
332}; 333};
@@ -357,6 +358,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
357 if (!page) 358 if (!page)
358 continue; 359 continue;
359 360
361 if (PageAnon(page))
362 mss->anonymous += PAGE_SIZE;
363
360 mss->resident += PAGE_SIZE; 364 mss->resident += PAGE_SIZE;
361 /* Accumulate the size in pages that have been accessed. */ 365 /* Accumulate the size in pages that have been accessed. */
362 if (pte_young(ptent) || PageReferenced(page)) 366 if (pte_young(ptent) || PageReferenced(page))
@@ -410,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v)
410 "Private_Clean: %8lu kB\n" 414 "Private_Clean: %8lu kB\n"
411 "Private_Dirty: %8lu kB\n" 415 "Private_Dirty: %8lu kB\n"
412 "Referenced: %8lu kB\n" 416 "Referenced: %8lu kB\n"
417 "Anonymous: %8lu kB\n"
413 "Swap: %8lu kB\n" 418 "Swap: %8lu kB\n"
414 "KernelPageSize: %8lu kB\n" 419 "KernelPageSize: %8lu kB\n"
415 "MMUPageSize: %8lu kB\n", 420 "MMUPageSize: %8lu kB\n",
@@ -421,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v)
421 mss.private_clean >> 10, 426 mss.private_clean >> 10,
422 mss.private_dirty >> 10, 427 mss.private_dirty >> 10,
423 mss.referenced >> 10, 428 mss.referenced >> 10,
429 mss.anonymous >> 10,
424 mss.swap >> 10, 430 mss.swap >> 10,
425 vma_kernel_pagesize(vma) >> 10, 431 vma_kernel_pagesize(vma) >> 10,
426 vma_mmu_pagesize(vma) >> 10); 432 vma_mmu_pagesize(vma) >> 10);
@@ -539,6 +545,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
539 545
540const struct file_operations proc_clear_refs_operations = { 546const struct file_operations proc_clear_refs_operations = {
541 .write = clear_refs_write, 547 .write = clear_refs_write,
548 .llseek = noop_llseek,
542}; 549};
543 550
544struct pagemapread { 551struct pagemapread {
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6e8fc62b40a8..7b0329468a5d 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. 11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
12 */ 12 */
13 13
14#include <linux/smp_lock.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include "qnx4.h" 15#include "qnx4.h"
17 16
@@ -29,8 +28,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
29 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); 28 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
30 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); 29 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
31 30
32 lock_kernel();
33
34 while (filp->f_pos < inode->i_size) { 31 while (filp->f_pos < inode->i_size) {
35 blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); 32 blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
36 bh = sb_bread(inode->i_sb, blknum); 33 bh = sb_bread(inode->i_sb, blknum);
@@ -71,7 +68,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
71 brelse(bh); 68 brelse(bh);
72 } 69 }
73out: 70out:
74 unlock_kernel();
75 return 0; 71 return 0;
76} 72}
77 73
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 16829722be93..fcada42f1aa3 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -16,7 +16,6 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/highuid.h> 18#include <linux/highuid.h>
19#include <linux/smp_lock.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -157,8 +156,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
157 struct super_block *sb = dentry->d_sb; 156 struct super_block *sb = dentry->d_sb;
158 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 157 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
159 158
160 lock_kernel();
161
162 buf->f_type = sb->s_magic; 159 buf->f_type = sb->s_magic;
163 buf->f_bsize = sb->s_blocksize; 160 buf->f_bsize = sb->s_blocksize;
164 buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8; 161 buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8;
@@ -168,8 +165,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
168 buf->f_fsid.val[0] = (u32)id; 165 buf->f_fsid.val[0] = (u32)id;
169 buf->f_fsid.val[1] = (u32)(id >> 32); 166 buf->f_fsid.val[1] = (u32)(id >> 32);
170 167
171 unlock_kernel();
172
173 return 0; 168 return 0;
174} 169}
175 170
@@ -283,7 +278,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
283 goto outi; 278 goto outi;
284 279
285 brelse(bh); 280 brelse(bh);
286
287 return 0; 281 return 0;
288 282
289 outi: 283 outi:
@@ -460,17 +454,16 @@ static void destroy_inodecache(void)
460 kmem_cache_destroy(qnx4_inode_cachep); 454 kmem_cache_destroy(qnx4_inode_cachep);
461} 455}
462 456
463static int qnx4_get_sb(struct file_system_type *fs_type, 457static struct dentry *qnx4_mount(struct file_system_type *fs_type,
464 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 458 int flags, const char *dev_name, void *data)
465{ 459{
466 return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super, 460 return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
467 mnt);
468} 461}
469 462
470static struct file_system_type qnx4_fs_type = { 463static struct file_system_type qnx4_fs_type = {
471 .owner = THIS_MODULE, 464 .owner = THIS_MODULE,
472 .name = "qnx4", 465 .name = "qnx4",
473 .get_sb = qnx4_get_sb, 466 .mount = qnx4_mount,
474 .kill_sb = kill_block_super, 467 .kill_sb = kill_block_super,
475 .fs_flags = FS_REQUIRES_DEV, 468 .fs_flags = FS_REQUIRES_DEV,
476}; 469};
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 58703ebba879..275327b5615e 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink. 12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
13 */ 13 */
14 14
15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include "qnx4.h" 16#include "qnx4.h"
18 17
@@ -109,7 +108,6 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
109 int len = dentry->d_name.len; 108 int len = dentry->d_name.len;
110 struct inode *foundinode = NULL; 109 struct inode *foundinode = NULL;
111 110
112 lock_kernel();
113 if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino))) 111 if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino)))
114 goto out; 112 goto out;
115 /* The entry is linked, let's get the real info */ 113 /* The entry is linked, let's get the real info */
@@ -123,13 +121,11 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
123 121
124 foundinode = qnx4_iget(dir->i_sb, ino); 122 foundinode = qnx4_iget(dir->i_sb, ino);
125 if (IS_ERR(foundinode)) { 123 if (IS_ERR(foundinode)) {
126 unlock_kernel();
127 QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n", 124 QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
128 PTR_ERR(foundinode))); 125 PTR_ERR(foundinode)));
129 return ERR_CAST(foundinode); 126 return ERR_CAST(foundinode);
130 } 127 }
131out: 128out:
132 unlock_kernel();
133 d_add(dentry, foundinode); 129 d_add(dentry, foundinode);
134 130
135 return NULL; 131 return NULL;
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 3e21b1e2ad3a..880fd9884366 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -4,6 +4,7 @@
4 4
5config QUOTA 5config QUOTA
6 bool "Quota support" 6 bool "Quota support"
7 select QUOTACTL
7 help 8 help
8 If you say Y here, you will be able to set per user limits for disk 9 If you say Y here, you will be able to set per user limits for disk
9 usage (also called disk quotas). Currently, it works for the 10 usage (also called disk quotas). Currently, it works for the
@@ -65,8 +66,7 @@ config QFMT_V2
65 66
66config QUOTACTL 67config QUOTACTL
67 bool 68 bool
68 depends on XFS_QUOTA || QUOTA 69 default n
69 default y
70 70
71config QUOTACTL_COMPAT 71config QUOTACTL_COMPAT
72 bool 72 bool
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index aad1316a977f..0fed41e6efcd 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1386,6 +1386,9 @@ static void __dquot_initialize(struct inode *inode, int type)
1386 /* Avoid races with quotaoff() */ 1386 /* Avoid races with quotaoff() */
1387 if (!sb_has_quota_active(sb, cnt)) 1387 if (!sb_has_quota_active(sb, cnt))
1388 continue; 1388 continue;
1389 /* We could race with quotaon or dqget() could have failed */
1390 if (!got[cnt])
1391 continue;
1389 if (!inode->i_dquot[cnt]) { 1392 if (!inode->i_dquot[cnt]) {
1390 inode->i_dquot[cnt] = got[cnt]; 1393 inode->i_dquot[cnt] = got[cnt];
1391 got[cnt] = NULL; 1394 got[cnt] = NULL;
@@ -1736,6 +1739,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1736 qsize_t rsv_space = 0; 1739 qsize_t rsv_space = 0;
1737 struct dquot *transfer_from[MAXQUOTAS] = {}; 1740 struct dquot *transfer_from[MAXQUOTAS] = {};
1738 int cnt, ret = 0; 1741 int cnt, ret = 0;
1742 char is_valid[MAXQUOTAS] = {};
1739 char warntype_to[MAXQUOTAS]; 1743 char warntype_to[MAXQUOTAS];
1740 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1744 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
1741 1745
@@ -1757,8 +1761,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1757 space = cur_space + rsv_space; 1761 space = cur_space + rsv_space;
1758 /* Build the transfer_from list and check the limits */ 1762 /* Build the transfer_from list and check the limits */
1759 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1763 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1764 /*
1765 * Skip changes for same uid or gid or for turned off quota-type.
1766 */
1760 if (!transfer_to[cnt]) 1767 if (!transfer_to[cnt])
1761 continue; 1768 continue;
1769 /* Avoid races with quotaoff() */
1770 if (!sb_has_quota_active(inode->i_sb, cnt))
1771 continue;
1772 is_valid[cnt] = 1;
1762 transfer_from[cnt] = inode->i_dquot[cnt]; 1773 transfer_from[cnt] = inode->i_dquot[cnt];
1763 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); 1774 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
1764 if (ret) 1775 if (ret)
@@ -1772,12 +1783,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1772 * Finally perform the needed transfer from transfer_from to transfer_to 1783 * Finally perform the needed transfer from transfer_from to transfer_to
1773 */ 1784 */
1774 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1785 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1775 /* 1786 if (!is_valid[cnt])
1776 * Skip changes for same uid or gid or for turned off quota-type.
1777 */
1778 if (!transfer_to[cnt])
1779 continue; 1787 continue;
1780
1781 /* Due to IO error we might not have transfer_from[] structure */ 1788 /* Due to IO error we might not have transfer_from[] structure */
1782 if (transfer_from[cnt]) { 1789 if (transfer_from[cnt]) {
1783 warntype_from_inodes[cnt] = 1790 warntype_from_inodes[cnt] =
@@ -1801,18 +1808,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1801 1808
1802 mark_all_dquot_dirty(transfer_from); 1809 mark_all_dquot_dirty(transfer_from);
1803 mark_all_dquot_dirty(transfer_to); 1810 mark_all_dquot_dirty(transfer_to);
1804 /* Pass back references to put */
1805 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1806 transfer_to[cnt] = transfer_from[cnt];
1807warn:
1808 flush_warnings(transfer_to, warntype_to); 1811 flush_warnings(transfer_to, warntype_to);
1809 flush_warnings(transfer_from, warntype_from_inodes); 1812 flush_warnings(transfer_from, warntype_from_inodes);
1810 flush_warnings(transfer_from, warntype_from_space); 1813 flush_warnings(transfer_from, warntype_from_space);
1811 return ret; 1814 /* Pass back references to put */
1815 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1816 if (is_valid[cnt])
1817 transfer_to[cnt] = transfer_from[cnt];
1818 return 0;
1812over_quota: 1819over_quota:
1813 spin_unlock(&dq_data_lock); 1820 spin_unlock(&dq_data_lock);
1814 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1821 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1815 goto warn; 1822 flush_warnings(transfer_to, warntype_to);
1823 return ret;
1816} 1824}
1817EXPORT_SYMBOL(__dquot_transfer); 1825EXPORT_SYMBOL(__dquot_transfer);
1818 1826
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6d..eacb166fb259 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
58 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
59 59
60 if (inode) { 60 if (inode) {
61 inode->i_ino = get_next_ino();
61 inode_init_owner(inode, dir, mode); 62 inode_init_owner(inode, dir, mode);
62 inode->i_mapping->a_ops = &ramfs_aops; 63 inode->i_mapping->a_ops = &ramfs_aops;
63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 64 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
@@ -254,17 +255,16 @@ fail:
254 return err; 255 return err;
255} 256}
256 257
257int ramfs_get_sb(struct file_system_type *fs_type, 258struct dentry *ramfs_mount(struct file_system_type *fs_type,
258 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 259 int flags, const char *dev_name, void *data)
259{ 260{
260 return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt); 261 return mount_nodev(fs_type, flags, data, ramfs_fill_super);
261} 262}
262 263
263static int rootfs_get_sb(struct file_system_type *fs_type, 264static struct dentry *rootfs_mount(struct file_system_type *fs_type,
264 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 265 int flags, const char *dev_name, void *data)
265{ 266{
266 return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super, 267 return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
267 mnt);
268} 268}
269 269
270static void ramfs_kill_sb(struct super_block *sb) 270static void ramfs_kill_sb(struct super_block *sb)
@@ -275,12 +275,12 @@ static void ramfs_kill_sb(struct super_block *sb)
275 275
276static struct file_system_type ramfs_fs_type = { 276static struct file_system_type ramfs_fs_type = {
277 .name = "ramfs", 277 .name = "ramfs",
278 .get_sb = ramfs_get_sb, 278 .mount = ramfs_mount,
279 .kill_sb = ramfs_kill_sb, 279 .kill_sb = ramfs_kill_sb,
280}; 280};
281static struct file_system_type rootfs_fs_type = { 281static struct file_system_type rootfs_fs_type = {
282 .name = "rootfs", 282 .name = "rootfs",
283 .get_sb = rootfs_get_sb, 283 .mount = rootfs_mount,
284 .kill_sb = kill_litter_super, 284 .kill_sb = kill_litter_super,
285}; 285};
286 286
diff --git a/fs/read_write.c b/fs/read_write.c
index 74e36586e4d3..431a0ed610c8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = {
31 31
32EXPORT_SYMBOL(generic_ro_fops); 32EXPORT_SYMBOL(generic_ro_fops);
33 33
34static int
35__negative_fpos_check(struct file *file, loff_t pos, size_t count)
36{
37 /*
38 * pos or pos+count is negative here, check overflow.
39 * too big "count" will be caught in rw_verify_area().
40 */
41 if ((pos < 0) && (pos + count < pos))
42 return -EOVERFLOW;
43 if (file->f_mode & FMODE_UNSIGNED_OFFSET)
44 return 0;
45 return -EINVAL;
46}
47
34/** 48/**
35 * generic_file_llseek_unlocked - lockless generic llseek implementation 49 * generic_file_llseek_unlocked - lockless generic llseek implementation
36 * @file: file structure to seek on 50 * @file: file structure to seek on
@@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
62 break; 76 break;
63 } 77 }
64 78
65 if (offset < 0 || offset > inode->i_sb->s_maxbytes) 79 if (offset < 0 && __negative_fpos_check(file, offset, 0))
80 return -EINVAL;
81 if (offset > inode->i_sb->s_maxbytes)
66 return -EINVAL; 82 return -EINVAL;
67 83
68 /* Special lock needed here? */ 84 /* Special lock needed here? */
@@ -124,7 +140,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
124{ 140{
125 loff_t retval; 141 loff_t retval;
126 142
127 lock_kernel(); 143 mutex_lock(&file->f_dentry->d_inode->i_mutex);
128 switch (origin) { 144 switch (origin) {
129 case SEEK_END: 145 case SEEK_END:
130 offset += i_size_read(file->f_path.dentry->d_inode); 146 offset += i_size_read(file->f_path.dentry->d_inode);
@@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
137 offset += file->f_pos; 153 offset += file->f_pos;
138 } 154 }
139 retval = -EINVAL; 155 retval = -EINVAL;
140 if (offset >= 0) { 156 if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
141 if (offset != file->f_pos) { 157 if (offset != file->f_pos) {
142 file->f_pos = offset; 158 file->f_pos = offset;
143 file->f_version = 0; 159 file->f_version = 0;
@@ -145,7 +161,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
145 retval = offset; 161 retval = offset;
146 } 162 }
147out: 163out:
148 unlock_kernel(); 164 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
149 return retval; 165 return retval;
150} 166}
151EXPORT_SYMBOL(default_llseek); 167EXPORT_SYMBOL(default_llseek);
@@ -156,7 +172,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
156 172
157 fn = no_llseek; 173 fn = no_llseek;
158 if (file->f_mode & FMODE_LSEEK) { 174 if (file->f_mode & FMODE_LSEEK) {
159 fn = default_llseek;
160 if (file->f_op && file->f_op->llseek) 175 if (file->f_op && file->f_op->llseek)
161 fn = file->f_op->llseek; 176 fn = file->f_op->llseek;
162 } 177 }
@@ -222,13 +237,12 @@ bad:
222} 237}
223#endif 238#endif
224 239
240
225/* 241/*
226 * rw_verify_area doesn't like huge counts. We limit 242 * rw_verify_area doesn't like huge counts. We limit
227 * them to something that fits in "int" so that others 243 * them to something that fits in "int" so that others
228 * won't have to do range checks all the time. 244 * won't have to do range checks all the time.
229 */ 245 */
230#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
231
232int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) 246int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
233{ 247{
234 struct inode *inode; 248 struct inode *inode;
@@ -239,8 +253,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
239 if (unlikely((ssize_t) count < 0)) 253 if (unlikely((ssize_t) count < 0))
240 return retval; 254 return retval;
241 pos = *ppos; 255 pos = *ppos;
242 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) 256 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
243 return retval; 257 retval = __negative_fpos_check(file, pos, count);
258 if (retval)
259 return retval;
260 }
244 261
245 if (unlikely(inode->i_flock && mandatory_lock(inode))) { 262 if (unlikely(inode->i_flock && mandatory_lock(inode))) {
246 retval = locks_mandatory_area( 263 retval = locks_mandatory_area(
@@ -565,65 +582,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
565 unsigned long nr_segs, unsigned long fast_segs, 582 unsigned long nr_segs, unsigned long fast_segs,
566 struct iovec *fast_pointer, 583 struct iovec *fast_pointer,
567 struct iovec **ret_pointer) 584 struct iovec **ret_pointer)
568 { 585{
569 unsigned long seg; 586 unsigned long seg;
570 ssize_t ret; 587 ssize_t ret;
571 struct iovec *iov = fast_pointer; 588 struct iovec *iov = fast_pointer;
572 589
573 /* 590 /*
574 * SuS says "The readv() function *may* fail if the iovcnt argument 591 * SuS says "The readv() function *may* fail if the iovcnt argument
575 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has 592 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
576 * traditionally returned zero for zero segments, so... 593 * traditionally returned zero for zero segments, so...
577 */ 594 */
578 if (nr_segs == 0) { 595 if (nr_segs == 0) {
579 ret = 0; 596 ret = 0;
580 goto out; 597 goto out;
581 } 598 }
582 599
583 /* 600 /*
584 * First get the "struct iovec" from user memory and 601 * First get the "struct iovec" from user memory and
585 * verify all the pointers 602 * verify all the pointers
586 */ 603 */
587 if (nr_segs > UIO_MAXIOV) { 604 if (nr_segs > UIO_MAXIOV) {
588 ret = -EINVAL; 605 ret = -EINVAL;
589 goto out; 606 goto out;
590 } 607 }
591 if (nr_segs > fast_segs) { 608 if (nr_segs > fast_segs) {
592 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 609 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
593 if (iov == NULL) { 610 if (iov == NULL) {
594 ret = -ENOMEM; 611 ret = -ENOMEM;
595 goto out; 612 goto out;
596 } 613 }
597 } 614 }
598 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { 615 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
599 ret = -EFAULT; 616 ret = -EFAULT;
600 goto out; 617 goto out;
601 } 618 }
602 619
603 /* 620 /*
604 * According to the Single Unix Specification we should return EINVAL 621 * According to the Single Unix Specification we should return EINVAL
605 * if an element length is < 0 when cast to ssize_t or if the 622 * if an element length is < 0 when cast to ssize_t or if the
606 * total length would overflow the ssize_t return value of the 623 * total length would overflow the ssize_t return value of the
607 * system call. 624 * system call.
608 */ 625 *
626 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
627 * overflow case.
628 */
609 ret = 0; 629 ret = 0;
610 for (seg = 0; seg < nr_segs; seg++) { 630 for (seg = 0; seg < nr_segs; seg++) {
611 void __user *buf = iov[seg].iov_base; 631 void __user *buf = iov[seg].iov_base;
612 ssize_t len = (ssize_t)iov[seg].iov_len; 632 ssize_t len = (ssize_t)iov[seg].iov_len;
613 633
614 /* see if we we're about to use an invalid len or if 634 /* see if we we're about to use an invalid len or if
615 * it's about to overflow ssize_t */ 635 * it's about to overflow ssize_t */
616 if (len < 0 || (ret + len < ret)) { 636 if (len < 0) {
617 ret = -EINVAL; 637 ret = -EINVAL;
618 goto out; 638 goto out;
619 } 639 }
620 if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { 640 if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
621 ret = -EFAULT; 641 ret = -EFAULT;
622 goto out; 642 goto out;
643 }
644 if (len > MAX_RW_COUNT - ret) {
645 len = MAX_RW_COUNT - ret;
646 iov[seg].iov_len = len;
623 } 647 }
624
625 ret += len; 648 ret += len;
626 } 649 }
627out: 650out:
628 *ret_pointer = iov; 651 *ret_pointer = iov;
629 return ret; 652 return ret;
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 513f431038f9..7cd46666ba2c 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -10,7 +10,8 @@ config REISERFS_FS
10 10
11 In general, ReiserFS is as fast as ext2, but is very efficient with 11 In general, ReiserFS is as fast as ext2, but is very efficient with
12 large directories and small files. Additional patches are needed 12 large directories and small files. Additional patches are needed
13 for NFS and quotas, please see <http://www.namesys.com/> for links. 13 for NFS and quotas, please see
14 <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
14 15
15 It is more easily extended to have features currently found in 16 It is more easily extended to have features currently found in
16 database and keyword search systems than block allocation based file 17 database and keyword search systems than block allocation based file
@@ -18,7 +19,8 @@ config REISERFS_FS
18 plugins consistent with our motto ``It takes more than a license to 19 plugins consistent with our motto ``It takes more than a license to
19 make source code open.'' 20 make source code open.''
20 21
21 Read <http://www.namesys.com/> to learn more about reiserfs. 22 Read <https://reiser4.wiki.kernel.org/index.php/Main_Page>
23 to learn more about reiserfs.
22 24
23 Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. 25 Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
24 26
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 14e8c9d460e5..e2f7a264e3ff 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -43,7 +43,7 @@ to address the fair crediting issue in the next GPL version.)
43[END LICENSING] 43[END LICENSING]
44 44
45Reiserfs is a file system based on balanced tree algorithms, which is 45Reiserfs is a file system based on balanced tree algorithms, which is
46described at http://devlinux.com/namesys. 46described at https://reiser4.wiki.kernel.org/index.php/Main_Page
47 47
48Stop reading here. Go there, then return. 48Stop reading here. Go there, then return.
49 49
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6846371498b6..91f080cc76c8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
152 barrier_done = reiserfs_commit_for_inode(inode); 152 barrier_done = reiserfs_commit_for_inode(inode);
153 reiserfs_write_unlock(inode->i_sb); 153 reiserfs_write_unlock(inode->i_sb);
154 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 154 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
155 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 155 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
156 BLKDEV_IFL_WAIT);
157 if (barrier_done < 0) 156 if (barrier_done < 0)
158 return barrier_done; 157 return barrier_done;
159 return (err < 0) ? -EIO : 0; 158 return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index caa758377d66..41656d40dc5c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,8 +22,6 @@
22 22
23int reiserfs_commit_write(struct file *f, struct page *page, 23int reiserfs_commit_write(struct file *f, struct page *page,
24 unsigned from, unsigned to); 24 unsigned from, unsigned to);
25int reiserfs_prepare_write(struct file *f, struct page *page,
26 unsigned from, unsigned to);
27 25
28void reiserfs_evict_inode(struct inode *inode) 26void reiserfs_evict_inode(struct inode *inode)
29{ 27{
@@ -165,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
165** but tail is still sitting in a direct item, and we can't write to 163** but tail is still sitting in a direct item, and we can't write to
166** it. So, look through this page, and check all the mapped buffers 164** it. So, look through this page, and check all the mapped buffers
167** to make sure they have valid block numbers. Any that don't need 165** to make sure they have valid block numbers. Any that don't need
168** to be unmapped, so that block_prepare_write will correctly call 166** to be unmapped, so that __block_write_begin will correctly call
169** reiserfs_get_block to convert the tail into an unformatted node 167** reiserfs_get_block to convert the tail into an unformatted node
170*/ 168*/
171static inline void fix_tail_page_for_writing(struct page *page) 169static inline void fix_tail_page_for_writing(struct page *page)
@@ -439,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
439} 437}
440 438
441/* special version of get_block that is only used by grab_tail_page right 439/* special version of get_block that is only used by grab_tail_page right
442** now. It is sent to block_prepare_write, and when you try to get a 440** now. It is sent to __block_write_begin, and when you try to get a
443** block past the end of the file (or a block from a hole) it returns 441** block past the end of the file (or a block from a hole) it returns
444** -ENOENT instead of a valid buffer. block_prepare_write expects to 442** -ENOENT instead of a valid buffer. __block_write_begin expects to
445** be able to do i/o on the buffers returned, unless an error value 443** be able to do i/o on the buffers returned, unless an error value
446** is also returned. 444** is also returned.
447** 445**
448** So, this allows block_prepare_write to be used for reading a single block 446** So, this allows __block_write_begin to be used for reading a single block
449** in a page. Where it does not produce a valid page for holes, or past the 447** in a page. Where it does not produce a valid page for holes, or past the
450** end of the file. This turns out to be exactly what we need for reading 448** end of the file. This turns out to be exactly what we need for reading
451** tails for conversion. 449** tails for conversion.
@@ -558,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode,
558 ** 556 **
559 ** We must fix the tail page for writing because it might have buffers 557 ** We must fix the tail page for writing because it might have buffers
560 ** that are mapped, but have a block number of 0. This indicates tail 558 ** that are mapped, but have a block number of 0. This indicates tail
561 ** data that has been read directly into the page, and block_prepare_write 559 ** data that has been read directly into the page, and
562 ** won't trigger a get_block in this case. 560 ** __block_write_begin won't trigger a get_block in this case.
563 */ 561 */
564 fix_tail_page_for_writing(tail_page); 562 fix_tail_page_for_writing(tail_page);
565 retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); 563 retval = __reiserfs_write_begin(tail_page, tail_start,
564 tail_end - tail_start);
566 if (retval) 565 if (retval)
567 goto unlock; 566 goto unlock;
568 567
@@ -2033,7 +2032,7 @@ static int grab_tail_page(struct inode *inode,
2033 /* start within the page of the last block in the file */ 2032 /* start within the page of the last block in the file */
2034 start = (offset / blocksize) * blocksize; 2033 start = (offset / blocksize) * blocksize;
2035 2034
2036 error = block_prepare_write(page, start, offset, 2035 error = __block_write_begin(page, start, offset - start,
2037 reiserfs_get_block_create_0); 2036 reiserfs_get_block_create_0);
2038 if (error) 2037 if (error)
2039 goto unlock; 2038 goto unlock;
@@ -2438,7 +2437,7 @@ static int reiserfs_write_full_page(struct page *page,
2438 /* from this point on, we know the buffer is mapped to a 2437 /* from this point on, we know the buffer is mapped to a
2439 * real block and not a direct item 2438 * real block and not a direct item
2440 */ 2439 */
2441 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 2440 if (wbc->sync_mode != WB_SYNC_NONE) {
2442 lock_buffer(bh); 2441 lock_buffer(bh);
2443 } else { 2442 } else {
2444 if (!trylock_buffer(bh)) { 2443 if (!trylock_buffer(bh)) {
@@ -2628,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file,
2628 return ret; 2627 return ret;
2629} 2628}
2630 2629
2631int reiserfs_prepare_write(struct file *f, struct page *page, 2630int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
2632 unsigned from, unsigned to)
2633{ 2631{
2634 struct inode *inode = page->mapping->host; 2632 struct inode *inode = page->mapping->host;
2635 int ret; 2633 int ret;
@@ -2650,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
2650 th->t_refcount++; 2648 th->t_refcount++;
2651 } 2649 }
2652 2650
2653 ret = block_prepare_write(page, from, to, reiserfs_get_block); 2651 ret = __block_write_begin(page, from, len, reiserfs_get_block);
2654 if (ret && reiserfs_transaction_running(inode->i_sb)) { 2652 if (ret && reiserfs_transaction_running(inode->i_sb)) {
2655 struct reiserfs_transaction_handle *th = current->journal_info; 2653 struct reiserfs_transaction_handle *th = current->journal_info;
2656 /* this gets a little ugly. If reiserfs_get_block returned an 2654 /* this gets a little ugly. If reiserfs_get_block returned an
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 5cbb81e134ac..adf22b485cea 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
160 160
161int reiserfs_commit_write(struct file *f, struct page *page, 161int reiserfs_commit_write(struct file *f, struct page *page,
162 unsigned from, unsigned to); 162 unsigned from, unsigned to);
163int reiserfs_prepare_write(struct file *f, struct page *page,
164 unsigned from, unsigned to);
165/* 163/*
166** reiserfs_unpack 164** reiserfs_unpack
167** Function try to convert tail from direct item into indirect. 165** Function try to convert tail from direct item into indirect.
@@ -200,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
200 } 198 }
201 199
202 /* we unpack by finding the page with the tail, and calling 200 /* we unpack by finding the page with the tail, and calling
203 ** reiserfs_prepare_write on that page. This will force a 201 ** __reiserfs_write_begin on that page. This will force a
204 ** reiserfs_get_block to unpack the tail for us. 202 ** reiserfs_get_block to unpack the tail for us.
205 */ 203 */
206 index = inode->i_size >> PAGE_CACHE_SHIFT; 204 index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -210,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
210 if (!page) { 208 if (!page) {
211 goto out; 209 goto out;
212 } 210 }
213 retval = reiserfs_prepare_write(NULL, page, write_from, write_from); 211 retval = __reiserfs_write_begin(page, write_from, 0);
214 if (retval) 212 if (retval)
215 goto out_unlock; 213 goto out_unlock;
216 214
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 812e2c05aa29..076c8b194682 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -138,13 +138,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
138 return 0; 138 return 0;
139} 139}
140 140
141static void disable_barrier(struct super_block *s)
142{
143 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
144 printk("reiserfs: disabling flush barriers on %s\n",
145 reiserfs_bdevname(s));
146}
147
148static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block 141static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
149 *sb) 142 *sb)
150{ 143{
@@ -677,30 +670,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
677 submit_bh(WRITE, bh); 670 submit_bh(WRITE, bh);
678} 671}
679 672
680static int submit_barrier_buffer(struct buffer_head *bh)
681{
682 get_bh(bh);
683 bh->b_end_io = reiserfs_end_ordered_io;
684 clear_buffer_dirty(bh);
685 if (!buffer_uptodate(bh))
686 BUG();
687 return submit_bh(WRITE_BARRIER, bh);
688}
689
690static void check_barrier_completion(struct super_block *s,
691 struct buffer_head *bh)
692{
693 if (buffer_eopnotsupp(bh)) {
694 clear_buffer_eopnotsupp(bh);
695 disable_barrier(s);
696 set_buffer_uptodate(bh);
697 set_buffer_dirty(bh);
698 reiserfs_write_unlock(s);
699 sync_dirty_buffer(bh);
700 reiserfs_write_lock(s);
701 }
702}
703
704#define CHUNK_SIZE 32 673#define CHUNK_SIZE 32
705struct buffer_chunk { 674struct buffer_chunk {
706 struct buffer_head *bh[CHUNK_SIZE]; 675 struct buffer_head *bh[CHUNK_SIZE];
@@ -1009,7 +978,6 @@ static int flush_commit_list(struct super_block *s,
1009 struct buffer_head *tbh = NULL; 978 struct buffer_head *tbh = NULL;
1010 unsigned int trans_id = jl->j_trans_id; 979 unsigned int trans_id = jl->j_trans_id;
1011 struct reiserfs_journal *journal = SB_JOURNAL(s); 980 struct reiserfs_journal *journal = SB_JOURNAL(s);
1012 int barrier = 0;
1013 int retval = 0; 981 int retval = 0;
1014 int write_len; 982 int write_len;
1015 983
@@ -1094,24 +1062,6 @@ static int flush_commit_list(struct super_block *s,
1094 } 1062 }
1095 atomic_dec(&journal->j_async_throttle); 1063 atomic_dec(&journal->j_async_throttle);
1096 1064
1097 /* We're skipping the commit if there's an error */
1098 if (retval || reiserfs_is_journal_aborted(journal))
1099 barrier = 0;
1100
1101 /* wait on everything written so far before writing the commit
1102 * if we are in barrier mode, send the commit down now
1103 */
1104 barrier = reiserfs_barrier_flush(s);
1105 if (barrier) {
1106 int ret;
1107 lock_buffer(jl->j_commit_bh);
1108 ret = submit_barrier_buffer(jl->j_commit_bh);
1109 if (ret == -EOPNOTSUPP) {
1110 set_buffer_uptodate(jl->j_commit_bh);
1111 disable_barrier(s);
1112 barrier = 0;
1113 }
1114 }
1115 for (i = 0; i < (jl->j_len + 1); i++) { 1065 for (i = 0; i < (jl->j_len + 1); i++) {
1116 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + 1066 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1117 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); 1067 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1143,27 +1093,22 @@ static int flush_commit_list(struct super_block *s,
1143 1093
1144 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); 1094 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
1145 1095
1146 if (!barrier) { 1096 /* If there was a write error in the journal - we can't commit
1147 /* If there was a write error in the journal - we can't commit 1097 * this transaction - it will be invalid and, if successful,
1148 * this transaction - it will be invalid and, if successful, 1098 * will just end up propagating the write error out to
1149 * will just end up propagating the write error out to 1099 * the file system. */
1150 * the file system. */ 1100 if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
1151 if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { 1101 if (buffer_dirty(jl->j_commit_bh))
1152 if (buffer_dirty(jl->j_commit_bh)) 1102 BUG();
1153 BUG(); 1103 mark_buffer_dirty(jl->j_commit_bh) ;
1154 mark_buffer_dirty(jl->j_commit_bh) ;
1155 reiserfs_write_unlock(s);
1156 sync_dirty_buffer(jl->j_commit_bh) ;
1157 reiserfs_write_lock(s);
1158 }
1159 } else {
1160 reiserfs_write_unlock(s); 1104 reiserfs_write_unlock(s);
1161 wait_on_buffer(jl->j_commit_bh); 1105 if (reiserfs_barrier_flush(s))
1106 __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
1107 else
1108 sync_dirty_buffer(jl->j_commit_bh);
1162 reiserfs_write_lock(s); 1109 reiserfs_write_lock(s);
1163 } 1110 }
1164 1111
1165 check_barrier_completion(s, jl->j_commit_bh);
1166
1167 /* If there was a write error in the journal - we can't commit this 1112 /* If there was a write error in the journal - we can't commit this
1168 * transaction - it will be invalid and, if successful, will just end 1113 * transaction - it will be invalid and, if successful, will just end
1169 * up propagating the write error out to the filesystem. */ 1114 * up propagating the write error out to the filesystem. */
@@ -1319,26 +1264,15 @@ static int _update_journal_header_block(struct super_block *sb,
1319 jh->j_first_unflushed_offset = cpu_to_le32(offset); 1264 jh->j_first_unflushed_offset = cpu_to_le32(offset);
1320 jh->j_mount_id = cpu_to_le32(journal->j_mount_id); 1265 jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
1321 1266
1322 if (reiserfs_barrier_flush(sb)) { 1267 set_buffer_dirty(journal->j_header_bh);
1323 int ret; 1268 reiserfs_write_unlock(sb);
1324 lock_buffer(journal->j_header_bh); 1269
1325 ret = submit_barrier_buffer(journal->j_header_bh); 1270 if (reiserfs_barrier_flush(sb))
1326 if (ret == -EOPNOTSUPP) { 1271 __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
1327 set_buffer_uptodate(journal->j_header_bh); 1272 else
1328 disable_barrier(sb);
1329 goto sync;
1330 }
1331 reiserfs_write_unlock(sb);
1332 wait_on_buffer(journal->j_header_bh);
1333 reiserfs_write_lock(sb);
1334 check_barrier_completion(sb, journal->j_header_bh);
1335 } else {
1336 sync:
1337 set_buffer_dirty(journal->j_header_bh);
1338 reiserfs_write_unlock(sb);
1339 sync_dirty_buffer(journal->j_header_bh); 1273 sync_dirty_buffer(journal->j_header_bh);
1340 reiserfs_write_lock(sb); 1274
1341 } 1275 reiserfs_write_lock(sb);
1342 if (!buffer_uptodate(journal->j_header_bh)) { 1276 if (!buffer_uptodate(journal->j_header_bh)) {
1343 reiserfs_warning(sb, "journal-837", 1277 reiserfs_warning(sb, "journal-837",
1344 "IO error during journal replay"); 1278 "IO error during journal replay");
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086a..ba5f51ec3458 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1156 inode->i_ctime = CURRENT_TIME_SEC; 1156 inode->i_ctime = CURRENT_TIME_SEC;
1157 reiserfs_update_sd(&th, inode); 1157 reiserfs_update_sd(&th, inode);
1158 1158
1159 atomic_inc(&inode->i_count); 1159 ihold(inode);
1160 d_instantiate(dentry, inode); 1160 d_instantiate(dentry, inode);
1161 retval = journal_end(&th, dir->i_sb, jbegin_count); 1161 retval = journal_end(&th, dir->i_sb, jbegin_count);
1162 reiserfs_write_unlock(dir->i_sb); 1162 reiserfs_write_unlock(dir->i_sb);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index e15ff612002d..3bf7a6457f4d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2213,12 +2213,11 @@ out:
2213 2213
2214#endif 2214#endif
2215 2215
2216static int get_super_block(struct file_system_type *fs_type, 2216static struct dentry *get_super_block(struct file_system_type *fs_type,
2217 int flags, const char *dev_name, 2217 int flags, const char *dev_name,
2218 void *data, struct vfsmount *mnt) 2218 void *data)
2219{ 2219{
2220 return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super, 2220 return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
2221 mnt);
2222} 2221}
2223 2222
2224static int __init init_reiserfs_fs(void) 2223static int __init init_reiserfs_fs(void)
@@ -2253,7 +2252,7 @@ static void __exit exit_reiserfs_fs(void)
2253struct file_system_type reiserfs_fs_type = { 2252struct file_system_type reiserfs_fs_type = {
2254 .owner = THIS_MODULE, 2253 .owner = THIS_MODULE,
2255 .name = "reiserfs", 2254 .name = "reiserfs",
2256 .get_sb = get_super_block, 2255 .mount = get_super_block,
2257 .kill_sb = reiserfs_kill_sb, 2256 .kill_sb = reiserfs_kill_sb,
2258 .fs_flags = FS_REQUIRES_DEV, 2257 .fs_flags = FS_REQUIRES_DEV,
2259}; 2258};
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c672..5d04a7828e7a 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len)
418 418
419int reiserfs_commit_write(struct file *f, struct page *page, 419int reiserfs_commit_write(struct file *f, struct page *page,
420 unsigned from, unsigned to); 420 unsigned from, unsigned to);
421int reiserfs_prepare_write(struct file *f, struct page *page,
422 unsigned from, unsigned to);
423 421
424static void update_ctime(struct inode *inode) 422static void update_ctime(struct inode *inode)
425{ 423{
426 struct timespec now = current_fs_time(inode->i_sb); 424 struct timespec now = current_fs_time(inode->i_sb);
427 if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink || 425 if (inode_unhashed(inode) || !inode->i_nlink ||
428 timespec_equal(&inode->i_ctime, &now)) 426 timespec_equal(&inode->i_ctime, &now))
429 return; 427 return;
430 428
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
532 rxh->h_hash = cpu_to_le32(xahash); 530 rxh->h_hash = cpu_to_le32(xahash);
533 } 531 }
534 532
535 err = reiserfs_prepare_write(NULL, page, page_offset, 533 err = __reiserfs_write_begin(page, page_offset, chunk + skip);
536 page_offset + chunk + skip);
537 if (!err) { 534 if (!err) {
538 if (buffer) 535 if (buffer)
539 memcpy(data + skip, buffer + buffer_pos, chunk); 536 memcpy(data + skip, buffer + buffer_pos, chunk);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 42d213546894..6647f90e55cd 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -282,6 +282,7 @@ error:
282static const struct file_operations romfs_dir_operations = { 282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir, 283 .read = generic_read_dir,
284 .readdir = romfs_readdir, 284 .readdir = romfs_readdir,
285 .llseek = default_llseek,
285}; 286};
286 287
287static const struct inode_operations romfs_dir_inode_operations = { 288static const struct inode_operations romfs_dir_inode_operations = {
@@ -551,20 +552,19 @@ error_rsb:
551/* 552/*
552 * get a superblock for mounting 553 * get a superblock for mounting
553 */ 554 */
554static int romfs_get_sb(struct file_system_type *fs_type, 555static struct dentry *romfs_mount(struct file_system_type *fs_type,
555 int flags, const char *dev_name, 556 int flags, const char *dev_name,
556 void *data, struct vfsmount *mnt) 557 void *data)
557{ 558{
558 int ret = -EINVAL; 559 struct dentry *ret = ERR_PTR(-EINVAL);
559 560
560#ifdef CONFIG_ROMFS_ON_MTD 561#ifdef CONFIG_ROMFS_ON_MTD
561 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super, 562 ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
562 mnt);
563#endif 563#endif
564#ifdef CONFIG_ROMFS_ON_BLOCK 564#ifdef CONFIG_ROMFS_ON_BLOCK
565 if (ret == -EINVAL) 565 if (ret == ERR_PTR(-EINVAL))
566 ret = get_sb_bdev(fs_type, flags, dev_name, data, 566 ret = mount_bdev(fs_type, flags, dev_name, data,
567 romfs_fill_super, mnt); 567 romfs_fill_super);
568#endif 568#endif
569 return ret; 569 return ret;
570} 570}
@@ -591,7 +591,7 @@ static void romfs_kill_sb(struct super_block *sb)
591static struct file_system_type romfs_fs_type = { 591static struct file_system_type romfs_fs_type = {
592 .owner = THIS_MODULE, 592 .owner = THIS_MODULE,
593 .name = "romfs", 593 .name = "romfs",
594 .get_sb = romfs_get_sb, 594 .mount = romfs_mount,
595 .kill_sb = romfs_kill_sb, 595 .kill_sb = romfs_kill_sb,
596 .fs_flags = FS_REQUIRES_DEV, 596 .fs_flags = FS_REQUIRES_DEV,
597}; 597};
diff --git a/fs/select.c b/fs/select.c
index 500a669f7790..b7b10aa30861 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
67 return slack; 67 return slack;
68} 68}
69 69
70static long estimate_accuracy(struct timespec *tv) 70long select_estimate_accuracy(struct timespec *tv)
71{ 71{
72 unsigned long ret; 72 unsigned long ret;
73 struct timespec now; 73 struct timespec now;
@@ -417,7 +417,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
417 } 417 }
418 418
419 if (end_time && !timed_out) 419 if (end_time && !timed_out)
420 slack = estimate_accuracy(end_time); 420 slack = select_estimate_accuracy(end_time);
421 421
422 retval = 0; 422 retval = 0;
423 for (;;) { 423 for (;;) {
@@ -769,7 +769,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
769 } 769 }
770 770
771 if (end_time && !timed_out) 771 if (end_time && !timed_out)
772 slack = estimate_accuracy(end_time); 772 slack = select_estimate_accuracy(end_time);
773 773
774 for (;;) { 774 for (;;) {
775 struct poll_list *walk; 775 struct poll_list *walk;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e1f437be6c3c..05d6b0e78c95 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -131,7 +131,7 @@ Eoverflow:
131 */ 131 */
132ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) 132ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
133{ 133{
134 struct seq_file *m = (struct seq_file *)file->private_data; 134 struct seq_file *m = file->private_data;
135 size_t copied = 0; 135 size_t copied = 0;
136 loff_t pos; 136 loff_t pos;
137 size_t n; 137 size_t n;
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(seq_read);
280 */ 280 */
281loff_t seq_lseek(struct file *file, loff_t offset, int origin) 281loff_t seq_lseek(struct file *file, loff_t offset, int origin)
282{ 282{
283 struct seq_file *m = (struct seq_file *)file->private_data; 283 struct seq_file *m = file->private_data;
284 loff_t retval = -EINVAL; 284 loff_t retval = -EINVAL;
285 285
286 mutex_lock(&m->lock); 286 mutex_lock(&m->lock);
@@ -324,7 +324,7 @@ EXPORT_SYMBOL(seq_lseek);
324 */ 324 */
325int seq_release(struct inode *inode, struct file *file) 325int seq_release(struct inode *inode, struct file *file)
326{ 326{
327 struct seq_file *m = (struct seq_file *)file->private_data; 327 struct seq_file *m = file->private_data;
328 kfree(m->buf); 328 kfree(m->buf);
329 kfree(m); 329 kfree(m);
330 return 0; 330 return 0;
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
462 if (size) { 462 if (size) {
463 char *p; 463 char *p;
464 464
465 spin_lock(&dcache_lock);
466 p = __d_path(path, root, buf, size); 465 p = __d_path(path, root, buf, size);
467 spin_unlock(&dcache_lock);
468 res = PTR_ERR(p); 466 res = PTR_ERR(p);
469 if (!IS_ERR(p)) { 467 if (!IS_ERR(p)) {
470 char *end = mangle_path(buf, p, esc); 468 char *end = mangle_path(buf, p, esc);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1c5a6add779d..492465b451dd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
99#ifdef __ARCH_SI_TRAPNO 99#ifdef __ARCH_SI_TRAPNO
100 err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); 100 err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
101#endif 101#endif
102#ifdef BUS_MCEERR_AO
103 /*
104 * Other callers might not initialize the si_lsb field,
105 * so check explicitly for the right codes here.
106 */
107 if (kinfo->si_code == BUS_MCEERR_AR ||
108 kinfo->si_code == BUS_MCEERR_AO)
109 err |= __put_user((short) kinfo->si_addr_lsb,
110 &uinfo->ssi_addr_lsb);
111#endif
102 break; 112 break;
103 case __SI_CHLD: 113 case __SI_CHLD:
104 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); 114 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
@@ -206,6 +216,7 @@ static const struct file_operations signalfd_fops = {
206 .release = signalfd_release, 216 .release = signalfd_release,
207 .poll = signalfd_poll, 217 .poll = signalfd_poll,
208 .read = signalfd_read, 218 .read = signalfd_read,
219 .llseek = noop_llseek,
209}; 220};
210 221
211SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, 222SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
deleted file mode 100644
index e668127c8b2e..000000000000
--- a/fs/smbfs/Kconfig
+++ /dev/null
@@ -1,55 +0,0 @@
1config SMB_FS
2 tristate "SMB file system support (OBSOLETE, please use CIFS)"
3 depends on INET
4 select NLS
5 help
6 SMB (Server Message Block) is the protocol Windows for Workgroups
7 (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
8 files and printers over local networks. Saying Y here allows you to
9 mount their file systems (often called "shares" in this context) and
10 access them just like any other Unix directory. Currently, this
11 works only if the Windows machines use TCP/IP as the underlying
12 transport protocol, and not NetBEUI. For details, read
13 <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
14 available from <http://www.tldp.org/docs.html#howto>.
15
16 Note: if you just want your box to act as an SMB *server* and make
17 files and printing services available to Windows clients (which need
18 to have a TCP/IP stack), you don't need to say Y here; you can use
19 the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
20 for that.
21
22 General information about how to connect Linux, Windows machines and
23 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
24
25 To compile the SMB support as a module, choose M here:
26 the module will be called smbfs. Most people say N, however.
27
28config SMB_NLS_DEFAULT
29 bool "Use a default NLS"
30 depends on SMB_FS
31 help
32 Enabling this will make smbfs use nls translations by default. You
33 need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
34 settings and you need to give the default nls for the SMB server as
35 CONFIG_SMB_NLS_REMOTE.
36
37 The nls settings can be changed at mount time, if your smbmount
38 supports that, using the codepage and iocharset parameters.
39
40 smbmount from samba 2.2.0 or later supports this.
41
42config SMB_NLS_REMOTE
43 string "Default Remote NLS Option"
44 depends on SMB_NLS_DEFAULT
45 default "cp437"
46 help
47 This setting allows you to specify a default value for which
48 codepage the server uses. If this field is left blank no
49 translations will be done by default. The local codepage/charset
50 default to CONFIG_NLS_DEFAULT.
51
52 The nls settings can be changed at mount time, if your smbmount
53 supports that, using the codepage and iocharset parameters.
54
55 smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
deleted file mode 100644
index 4faf8c4722c3..000000000000
--- a/fs/smbfs/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
1#
2# Makefile for the linux smb-filesystem routines.
3#
4
5obj-$(CONFIG_SMB_FS) += smbfs.o
6
7smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
8 symlink.o smbiod.o request.o
9
10# If you want debugging output, you may add these flags to the EXTRA_CFLAGS
11# SMBFS_PARANOIA should normally be enabled.
12
13EXTRA_CFLAGS += -DSMBFS_PARANOIA
14#EXTRA_CFLAGS += -DSMBFS_DEBUG
15#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
16#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
17#EXTRA_CFLAGS += -Werror
18
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
deleted file mode 100644
index 8c177eb7e344..000000000000
--- a/fs/smbfs/cache.c
+++ /dev/null
@@ -1,208 +0,0 @@
1/*
2 * cache.c
3 *
4 * Copyright (C) 1997 by Bill Hawes
5 *
6 * Routines to support directory cacheing using the page cache.
7 * This cache code is almost directly taken from ncpfs.
8 *
9 * Please add a note about your changes to smbfs in the ChangeLog file.
10 */
11
12#include <linux/time.h>
13#include <linux/errno.h>
14#include <linux/kernel.h>
15#include <linux/mm.h>
16#include <linux/smb_fs.h>
17#include <linux/pagemap.h>
18#include <linux/net.h>
19
20#include <asm/page.h>
21
22#include "smb_debug.h"
23#include "proto.h"
24
25/*
26 * Force the next attempt to use the cache to be a timeout.
27 * If we can't find the page that's fine, it will cause a refresh.
28 */
29void
30smb_invalid_dir_cache(struct inode * dir)
31{
32 struct smb_sb_info *server = server_from_inode(dir);
33 union smb_dir_cache *cache = NULL;
34 struct page *page = NULL;
35
36 page = grab_cache_page(&dir->i_data, 0);
37 if (!page)
38 goto out;
39
40 if (!PageUptodate(page))
41 goto out_unlock;
42
43 cache = kmap(page);
44 cache->head.time = jiffies - SMB_MAX_AGE(server);
45
46 kunmap(page);
47 SetPageUptodate(page);
48out_unlock:
49 unlock_page(page);
50 page_cache_release(page);
51out:
52 return;
53}
54
55/*
56 * Mark all dentries for 'parent' as invalid, forcing them to be re-read
57 */
58void
59smb_invalidate_dircache_entries(struct dentry *parent)
60{
61 struct smb_sb_info *server = server_from_dentry(parent);
62 struct list_head *next;
63 struct dentry *dentry;
64
65 spin_lock(&dcache_lock);
66 next = parent->d_subdirs.next;
67 while (next != &parent->d_subdirs) {
68 dentry = list_entry(next, struct dentry, d_u.d_child);
69 dentry->d_fsdata = NULL;
70 smb_age_dentry(server, dentry);
71 next = next->next;
72 }
73 spin_unlock(&dcache_lock);
74}
75
76/*
77 * dget, but require that fpos and parent matches what the dentry contains.
78 * dentry is not known to be a valid pointer at entry.
79 */
80struct dentry *
81smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
82{
83 struct dentry *dent = dentry;
84 struct list_head *next;
85
86 if (d_validate(dent, parent)) {
87 if (dent->d_name.len <= SMB_MAXNAMELEN &&
88 (unsigned long)dent->d_fsdata == fpos) {
89 if (!dent->d_inode) {
90 dput(dent);
91 dent = NULL;
92 }
93 return dent;
94 }
95 dput(dent);
96 }
97
98 /* If a pointer is invalid, we search the dentry. */
99 spin_lock(&dcache_lock);
100 next = parent->d_subdirs.next;
101 while (next != &parent->d_subdirs) {
102 dent = list_entry(next, struct dentry, d_u.d_child);
103 if ((unsigned long)dent->d_fsdata == fpos) {
104 if (dent->d_inode)
105 dget_locked(dent);
106 else
107 dent = NULL;
108 goto out_unlock;
109 }
110 next = next->next;
111 }
112 dent = NULL;
113out_unlock:
114 spin_unlock(&dcache_lock);
115 return dent;
116}
117
118
119/*
120 * Create dentry/inode for this file and add it to the dircache.
121 */
122int
123smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
124 struct smb_cache_control *ctrl, struct qstr *qname,
125 struct smb_fattr *entry)
126{
127 struct dentry *newdent, *dentry = filp->f_path.dentry;
128 struct inode *newino, *inode = dentry->d_inode;
129 struct smb_cache_control ctl = *ctrl;
130 int valid = 0;
131 int hashed = 0;
132 ino_t ino = 0;
133
134 qname->hash = full_name_hash(qname->name, qname->len);
135
136 if (dentry->d_op && dentry->d_op->d_hash)
137 if (dentry->d_op->d_hash(dentry, qname) != 0)
138 goto end_advance;
139
140 newdent = d_lookup(dentry, qname);
141
142 if (!newdent) {
143 newdent = d_alloc(dentry, qname);
144 if (!newdent)
145 goto end_advance;
146 } else {
147 hashed = 1;
148 memcpy((char *) newdent->d_name.name, qname->name,
149 newdent->d_name.len);
150 }
151
152 if (!newdent->d_inode) {
153 smb_renew_times(newdent);
154 entry->f_ino = iunique(inode->i_sb, 2);
155 newino = smb_iget(inode->i_sb, entry);
156 if (newino) {
157 smb_new_dentry(newdent);
158 d_instantiate(newdent, newino);
159 if (!hashed)
160 d_rehash(newdent);
161 }
162 } else
163 smb_set_inode_attr(newdent->d_inode, entry);
164
165 if (newdent->d_inode) {
166 ino = newdent->d_inode->i_ino;
167 newdent->d_fsdata = (void *) ctl.fpos;
168 smb_new_dentry(newdent);
169 }
170
171 if (ctl.idx >= SMB_DIRCACHE_SIZE) {
172 if (ctl.page) {
173 kunmap(ctl.page);
174 SetPageUptodate(ctl.page);
175 unlock_page(ctl.page);
176 page_cache_release(ctl.page);
177 }
178 ctl.cache = NULL;
179 ctl.idx -= SMB_DIRCACHE_SIZE;
180 ctl.ofs += 1;
181 ctl.page = grab_cache_page(&inode->i_data, ctl.ofs);
182 if (ctl.page)
183 ctl.cache = kmap(ctl.page);
184 }
185 if (ctl.cache) {
186 ctl.cache->dentry[ctl.idx] = newdent;
187 valid = 1;
188 }
189 dput(newdent);
190
191end_advance:
192 if (!valid)
193 ctl.valid = 0;
194 if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
195 if (!ino)
196 ino = find_inode_number(dentry, qname);
197 if (!ino)
198 ino = iunique(inode->i_sb, 2);
199 ctl.filled = filldir(dirent, qname->name, qname->len,
200 filp->f_pos, ino, DT_UNKNOWN);
201 if (!ctl.filled)
202 filp->f_pos += 1;
203 }
204 ctl.fpos += 1;
205 ctl.idx += 1;
206 *ctrl = ctl;
207 return (ctl.valid || !ctl.filled);
208}
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
deleted file mode 100644
index 00a70cab1f36..000000000000
--- a/fs/smbfs/dir.c
+++ /dev/null
@@ -1,702 +0,0 @@
1/*
2 * dir.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/time.h>
11#include <linux/errno.h>
12#include <linux/kernel.h>
13#include <linux/smp_lock.h>
14#include <linux/ctype.h>
15#include <linux/net.h>
16#include <linux/sched.h>
17
18#include <linux/smb_fs.h>
19#include <linux/smb_mount.h>
20#include <linux/smbno.h>
21
22#include "smb_debug.h"
23#include "proto.h"
24
25static int smb_readdir(struct file *, void *, filldir_t);
26static int smb_dir_open(struct inode *, struct file *);
27
28static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *);
29static int smb_create(struct inode *, struct dentry *, int, struct nameidata *);
30static int smb_mkdir(struct inode *, struct dentry *, int);
31static int smb_rmdir(struct inode *, struct dentry *);
32static int smb_unlink(struct inode *, struct dentry *);
33static int smb_rename(struct inode *, struct dentry *,
34 struct inode *, struct dentry *);
35static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
36static int smb_link(struct dentry *, struct inode *, struct dentry *);
37
38const struct file_operations smb_dir_operations =
39{
40 .llseek = generic_file_llseek,
41 .read = generic_read_dir,
42 .readdir = smb_readdir,
43 .unlocked_ioctl = smb_ioctl,
44 .open = smb_dir_open,
45};
46
47const struct inode_operations smb_dir_inode_operations =
48{
49 .create = smb_create,
50 .lookup = smb_lookup,
51 .unlink = smb_unlink,
52 .mkdir = smb_mkdir,
53 .rmdir = smb_rmdir,
54 .rename = smb_rename,
55 .getattr = smb_getattr,
56 .setattr = smb_notify_change,
57};
58
59const struct inode_operations smb_dir_inode_operations_unix =
60{
61 .create = smb_create,
62 .lookup = smb_lookup,
63 .unlink = smb_unlink,
64 .mkdir = smb_mkdir,
65 .rmdir = smb_rmdir,
66 .rename = smb_rename,
67 .getattr = smb_getattr,
68 .setattr = smb_notify_change,
69 .symlink = smb_symlink,
70 .mknod = smb_make_node,
71 .link = smb_link,
72};
73
74/*
75 * Read a directory, using filldir to fill the dirent memory.
76 * smb_proc_readdir does the actual reading from the smb server.
77 *
78 * The cache code is almost directly taken from ncpfs
79 */
80static int
81smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
82{
83 struct dentry *dentry = filp->f_path.dentry;
84 struct inode *dir = dentry->d_inode;
85 struct smb_sb_info *server = server_from_dentry(dentry);
86 union smb_dir_cache *cache = NULL;
87 struct smb_cache_control ctl;
88 struct page *page = NULL;
89 int result;
90
91 ctl.page = NULL;
92 ctl.cache = NULL;
93
94 VERBOSE("reading %s/%s, f_pos=%d\n",
95 DENTRY_PATH(dentry), (int) filp->f_pos);
96
97 result = 0;
98
99 lock_kernel();
100
101 switch ((unsigned int) filp->f_pos) {
102 case 0:
103 if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
104 goto out;
105 filp->f_pos = 1;
106 /* fallthrough */
107 case 1:
108 if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
109 goto out;
110 filp->f_pos = 2;
111 }
112
113 /*
114 * Make sure our inode is up-to-date.
115 */
116 result = smb_revalidate_inode(dentry);
117 if (result)
118 goto out;
119
120
121 page = grab_cache_page(&dir->i_data, 0);
122 if (!page)
123 goto read_really;
124
125 ctl.cache = cache = kmap(page);
126 ctl.head = cache->head;
127
128 if (!PageUptodate(page) || !ctl.head.eof) {
129 VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
130 DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof);
131 goto init_cache;
132 }
133
134 if (filp->f_pos == 2) {
135 if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
136 goto init_cache;
137
138 /*
139 * N.B. ncpfs checks mtime of dentry too here, we don't.
140 * 1. common smb servers do not update mtime on dir changes
141 * 2. it requires an extra smb request
142 * (revalidate has the same timeout as ctl.head.time)
143 *
144 * Instead smbfs invalidates its own cache on local changes
145 * and remote changes are not seen until timeout.
146 */
147 }
148
149 if (filp->f_pos > ctl.head.end)
150 goto finished;
151
152 ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
153 ctl.ofs = ctl.fpos / SMB_DIRCACHE_SIZE;
154 ctl.idx = ctl.fpos % SMB_DIRCACHE_SIZE;
155
156 for (;;) {
157 if (ctl.ofs != 0) {
158 ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
159 if (!ctl.page)
160 goto invalid_cache;
161 ctl.cache = kmap(ctl.page);
162 if (!PageUptodate(ctl.page))
163 goto invalid_cache;
164 }
165 while (ctl.idx < SMB_DIRCACHE_SIZE) {
166 struct dentry *dent;
167 int res;
168
169 dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
170 dentry, filp->f_pos);
171 if (!dent)
172 goto invalid_cache;
173
174 res = filldir(dirent, dent->d_name.name,
175 dent->d_name.len, filp->f_pos,
176 dent->d_inode->i_ino, DT_UNKNOWN);
177 dput(dent);
178 if (res)
179 goto finished;
180 filp->f_pos += 1;
181 ctl.idx += 1;
182 if (filp->f_pos > ctl.head.end)
183 goto finished;
184 }
185 if (ctl.page) {
186 kunmap(ctl.page);
187 SetPageUptodate(ctl.page);
188 unlock_page(ctl.page);
189 page_cache_release(ctl.page);
190 ctl.page = NULL;
191 }
192 ctl.idx = 0;
193 ctl.ofs += 1;
194 }
195invalid_cache:
196 if (ctl.page) {
197 kunmap(ctl.page);
198 unlock_page(ctl.page);
199 page_cache_release(ctl.page);
200 ctl.page = NULL;
201 }
202 ctl.cache = cache;
203init_cache:
204 smb_invalidate_dircache_entries(dentry);
205 ctl.head.time = jiffies;
206 ctl.head.eof = 0;
207 ctl.fpos = 2;
208 ctl.ofs = 0;
209 ctl.idx = SMB_DIRCACHE_START;
210 ctl.filled = 0;
211 ctl.valid = 1;
212read_really:
213 result = server->ops->readdir(filp, dirent, filldir, &ctl);
214 if (result == -ERESTARTSYS && page)
215 ClearPageUptodate(page);
216 if (ctl.idx == -1)
217 goto invalid_cache; /* retry */
218 ctl.head.end = ctl.fpos - 1;
219 ctl.head.eof = ctl.valid;
220finished:
221 if (page) {
222 cache->head = ctl.head;
223 kunmap(page);
224 if (result != -ERESTARTSYS)
225 SetPageUptodate(page);
226 unlock_page(page);
227 page_cache_release(page);
228 }
229 if (ctl.page) {
230 kunmap(ctl.page);
231 SetPageUptodate(ctl.page);
232 unlock_page(ctl.page);
233 page_cache_release(ctl.page);
234 }
235out:
236 unlock_kernel();
237 return result;
238}
239
240static int
241smb_dir_open(struct inode *dir, struct file *file)
242{
243 struct dentry *dentry = file->f_path.dentry;
244 struct smb_sb_info *server;
245 int error = 0;
246
247 VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name,
248 file->f_path.dentry->d_name.name);
249
250 /*
251 * Directory timestamps in the core protocol aren't updated
252 * when a file is added, so we give them a very short TTL.
253 */
254 lock_kernel();
255 server = server_from_dentry(dentry);
256 if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) {
257 unsigned long age = jiffies - SMB_I(dir)->oldmtime;
258 if (age > 2*HZ)
259 smb_invalid_dir_cache(dir);
260 }
261
262 /*
263 * Note: in order to allow the smbmount process to open the
264 * mount point, we only revalidate if the connection is valid or
265 * if the process is trying to access something other than the root.
266 */
267 if (server->state == CONN_VALID || !IS_ROOT(dentry))
268 error = smb_revalidate_inode(dentry);
269 unlock_kernel();
270 return error;
271}
272
273/*
274 * Dentry operations routines
275 */
276static int smb_lookup_validate(struct dentry *, struct nameidata *);
277static int smb_hash_dentry(struct dentry *, struct qstr *);
278static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
279static int smb_delete_dentry(struct dentry *);
280
281static const struct dentry_operations smbfs_dentry_operations =
282{
283 .d_revalidate = smb_lookup_validate,
284 .d_hash = smb_hash_dentry,
285 .d_compare = smb_compare_dentry,
286 .d_delete = smb_delete_dentry,
287};
288
289static const struct dentry_operations smbfs_dentry_operations_case =
290{
291 .d_revalidate = smb_lookup_validate,
292 .d_delete = smb_delete_dentry,
293};
294
295
296/*
297 * This is the callback when the dcache has a lookup hit.
298 */
299static int
300smb_lookup_validate(struct dentry * dentry, struct nameidata *nd)
301{
302 struct smb_sb_info *server = server_from_dentry(dentry);
303 struct inode * inode = dentry->d_inode;
304 unsigned long age = jiffies - dentry->d_time;
305 int valid;
306
307 /*
308 * The default validation is based on dentry age:
309 * we believe in dentries for a few seconds. (But each
310 * successful server lookup renews the timestamp.)
311 */
312 valid = (age <= SMB_MAX_AGE(server));
313#ifdef SMBFS_DEBUG_VERBOSE
314 if (!valid)
315 VERBOSE("%s/%s not valid, age=%lu\n",
316 DENTRY_PATH(dentry), age);
317#endif
318
319 if (inode) {
320 lock_kernel();
321 if (is_bad_inode(inode)) {
322 PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry));
323 valid = 0;
324 } else if (!valid)
325 valid = (smb_revalidate_inode(dentry) == 0);
326 unlock_kernel();
327 } else {
328 /*
329 * What should we do for negative dentries?
330 */
331 }
332 return valid;
333}
334
335static int
336smb_hash_dentry(struct dentry *dir, struct qstr *this)
337{
338 unsigned long hash;
339 int i;
340
341 hash = init_name_hash();
342 for (i=0; i < this->len ; i++)
343 hash = partial_name_hash(tolower(this->name[i]), hash);
344 this->hash = end_name_hash(hash);
345
346 return 0;
347}
348
349static int
350smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b)
351{
352 int i, result = 1;
353
354 if (a->len != b->len)
355 goto out;
356 for (i=0; i < a->len; i++) {
357 if (tolower(a->name[i]) != tolower(b->name[i]))
358 goto out;
359 }
360 result = 0;
361out:
362 return result;
363}
364
365/*
366 * This is the callback from dput() when d_count is going to 0.
367 * We use this to unhash dentries with bad inodes.
368 */
369static int
370smb_delete_dentry(struct dentry * dentry)
371{
372 if (dentry->d_inode) {
373 if (is_bad_inode(dentry->d_inode)) {
374 PARANOIA("bad inode, unhashing %s/%s\n",
375 DENTRY_PATH(dentry));
376 return 1;
377 }
378 } else {
379 /* N.B. Unhash negative dentries? */
380 }
381 return 0;
382}
383
384/*
385 * Initialize a new dentry
386 */
387void
388smb_new_dentry(struct dentry *dentry)
389{
390 struct smb_sb_info *server = server_from_dentry(dentry);
391
392 if (server->mnt->flags & SMB_MOUNT_CASE)
393 dentry->d_op = &smbfs_dentry_operations_case;
394 else
395 dentry->d_op = &smbfs_dentry_operations;
396 dentry->d_time = jiffies;
397}
398
399
400/*
401 * Whenever a lookup succeeds, we know the parent directories
402 * are all valid, so we want to update the dentry timestamps.
403 * N.B. Move this to dcache?
404 */
405void
406smb_renew_times(struct dentry * dentry)
407{
408 dget(dentry);
409 spin_lock(&dentry->d_lock);
410 for (;;) {
411 struct dentry *parent;
412
413 dentry->d_time = jiffies;
414 if (IS_ROOT(dentry))
415 break;
416 parent = dentry->d_parent;
417 dget(parent);
418 spin_unlock(&dentry->d_lock);
419 dput(dentry);
420 dentry = parent;
421 spin_lock(&dentry->d_lock);
422 }
423 spin_unlock(&dentry->d_lock);
424 dput(dentry);
425}
426
427static struct dentry *
428smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
429{
430 struct smb_fattr finfo;
431 struct inode *inode;
432 int error;
433 struct smb_sb_info *server;
434
435 error = -ENAMETOOLONG;
436 if (dentry->d_name.len > SMB_MAXNAMELEN)
437 goto out;
438
439 /* Do not allow lookup of names with backslashes in */
440 error = -EINVAL;
441 if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
442 goto out;
443
444 lock_kernel();
445 error = smb_proc_getattr(dentry, &finfo);
446#ifdef SMBFS_PARANOIA
447 if (error && error != -ENOENT)
448 PARANOIA("find %s/%s failed, error=%d\n",
449 DENTRY_PATH(dentry), error);
450#endif
451
452 inode = NULL;
453 if (error == -ENOENT)
454 goto add_entry;
455 if (!error) {
456 error = -EACCES;
457 finfo.f_ino = iunique(dentry->d_sb, 2);
458 inode = smb_iget(dir->i_sb, &finfo);
459 if (inode) {
460 add_entry:
461 server = server_from_dentry(dentry);
462 if (server->mnt->flags & SMB_MOUNT_CASE)
463 dentry->d_op = &smbfs_dentry_operations_case;
464 else
465 dentry->d_op = &smbfs_dentry_operations;
466
467 d_add(dentry, inode);
468 smb_renew_times(dentry);
469 error = 0;
470 }
471 }
472 unlock_kernel();
473out:
474 return ERR_PTR(error);
475}
476
477/*
478 * This code is common to all routines creating a new inode.
479 */
480static int
481smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id)
482{
483 struct smb_sb_info *server = server_from_dentry(dentry);
484 struct inode *inode;
485 int error;
486 struct smb_fattr fattr;
487
488 VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid);
489
490 error = smb_proc_getattr(dentry, &fattr);
491 if (error)
492 goto out_close;
493
494 smb_renew_times(dentry);
495 fattr.f_ino = iunique(dentry->d_sb, 2);
496 inode = smb_iget(dentry->d_sb, &fattr);
497 if (!inode)
498 goto out_no_inode;
499
500 if (have_id) {
501 struct smb_inode_info *ei = SMB_I(inode);
502 ei->fileid = fileid;
503 ei->access = SMB_O_RDWR;
504 ei->open = server->generation;
505 }
506 d_instantiate(dentry, inode);
507out:
508 return error;
509
510out_no_inode:
511 error = -EACCES;
512out_close:
513 if (have_id) {
514 PARANOIA("%s/%s failed, error=%d, closing %u\n",
515 DENTRY_PATH(dentry), error, fileid);
516 smb_close_fileid(dentry, fileid);
517 }
518 goto out;
519}
520
521/* N.B. How should the mode argument be used? */
522static int
523smb_create(struct inode *dir, struct dentry *dentry, int mode,
524 struct nameidata *nd)
525{
526 struct smb_sb_info *server = server_from_dentry(dentry);
527 __u16 fileid;
528 int error;
529 struct iattr attr;
530
531 VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode);
532
533 lock_kernel();
534 smb_invalid_dir_cache(dir);
535 error = smb_proc_create(dentry, 0, get_seconds(), &fileid);
536 if (!error) {
537 if (server->opt.capabilities & SMB_CAP_UNIX) {
538 /* Set attributes for new file */
539 attr.ia_valid = ATTR_MODE;
540 attr.ia_mode = mode;
541 error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
542 }
543 error = smb_instantiate(dentry, fileid, 1);
544 } else {
545 PARANOIA("%s/%s failed, error=%d\n",
546 DENTRY_PATH(dentry), error);
547 }
548 unlock_kernel();
549 return error;
550}
551
552/* N.B. How should the mode argument be used? */
553static int
554smb_mkdir(struct inode *dir, struct dentry *dentry, int mode)
555{
556 struct smb_sb_info *server = server_from_dentry(dentry);
557 int error;
558 struct iattr attr;
559
560 lock_kernel();
561 smb_invalid_dir_cache(dir);
562 error = smb_proc_mkdir(dentry);
563 if (!error) {
564 if (server->opt.capabilities & SMB_CAP_UNIX) {
565 /* Set attributes for new directory */
566 attr.ia_valid = ATTR_MODE;
567 attr.ia_mode = mode;
568 error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
569 }
570 error = smb_instantiate(dentry, 0, 0);
571 }
572 unlock_kernel();
573 return error;
574}
575
576static int
577smb_rmdir(struct inode *dir, struct dentry *dentry)
578{
579 struct inode *inode = dentry->d_inode;
580 int error;
581
582 /*
583 * Close the directory if it's open.
584 */
585 lock_kernel();
586 smb_close(inode);
587
588 /*
589 * Check that nobody else is using the directory..
590 */
591 error = -EBUSY;
592 if (!d_unhashed(dentry))
593 goto out;
594
595 smb_invalid_dir_cache(dir);
596 error = smb_proc_rmdir(dentry);
597
598out:
599 unlock_kernel();
600 return error;
601}
602
603static int
604smb_unlink(struct inode *dir, struct dentry *dentry)
605{
606 int error;
607
608 /*
609 * Close the file if it's open.
610 */
611 lock_kernel();
612 smb_close(dentry->d_inode);
613
614 smb_invalid_dir_cache(dir);
615 error = smb_proc_unlink(dentry);
616 if (!error)
617 smb_renew_times(dentry);
618 unlock_kernel();
619 return error;
620}
621
622static int
623smb_rename(struct inode *old_dir, struct dentry *old_dentry,
624 struct inode *new_dir, struct dentry *new_dentry)
625{
626 int error;
627
628 /*
629 * Close any open files, and check whether to delete the
630 * target before attempting the rename.
631 */
632 lock_kernel();
633 if (old_dentry->d_inode)
634 smb_close(old_dentry->d_inode);
635 if (new_dentry->d_inode) {
636 smb_close(new_dentry->d_inode);
637 error = smb_proc_unlink(new_dentry);
638 if (error) {
639 VERBOSE("unlink %s/%s, error=%d\n",
640 DENTRY_PATH(new_dentry), error);
641 goto out;
642 }
643 /* FIXME */
644 d_delete(new_dentry);
645 }
646
647 smb_invalid_dir_cache(old_dir);
648 smb_invalid_dir_cache(new_dir);
649 error = smb_proc_mv(old_dentry, new_dentry);
650 if (!error) {
651 smb_renew_times(old_dentry);
652 smb_renew_times(new_dentry);
653 }
654out:
655 unlock_kernel();
656 return error;
657}
658
659/*
660 * FIXME: samba servers won't let you create device nodes unless uid/gid
661 * matches the connection credentials (and we don't know which those are ...)
662 */
663static int
664smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
665{
666 int error;
667 struct iattr attr;
668
669 attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
670 attr.ia_mode = mode;
671 current_euid_egid(&attr.ia_uid, &attr.ia_gid);
672
673 if (!new_valid_dev(dev))
674 return -EINVAL;
675
676 smb_invalid_dir_cache(dir);
677 error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev));
678 if (!error) {
679 error = smb_instantiate(dentry, 0, 0);
680 }
681 return error;
682}
683
/*
 * ->link: create a hard link on the server.
 *
 * dentry	= existing file
 * new_dentry	= new file
 *
 * Returns the error from smb_proc_link(), or the result of
 * instantiating new_dentry.
 */
static int
smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry)
{
	int rc;

	DEBUG1("smb_link old=%s/%s new=%s/%s\n",
	       DENTRY_PATH(dentry), DENTRY_PATH(new_dentry));

	smb_invalid_dir_cache(dir);
	rc = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry);
	if (rc)
		return rc;

	smb_renew_times(dentry);
	return smb_instantiate(new_dentry, 0, 0);
}
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
deleted file mode 100644
index 8e187a0f94bb..000000000000
--- a/fs/smbfs/file.c
+++ /dev/null
@@ -1,454 +0,0 @@
1/*
2 * file.c
3 *
4 * Copyright (C) 1995, 1996, 1997 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/time.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/fcntl.h>
14#include <linux/stat.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/net.h>
19#include <linux/aio.h>
20
21#include <asm/uaccess.h>
22#include <asm/system.h>
23
24#include <linux/smbno.h>
25#include <linux/smb_fs.h>
26
27#include "smb_debug.h"
28#include "proto.h"
29
30static int
31smb_fsync(struct file *file, int datasync)
32{
33 struct dentry *dentry = file->f_path.dentry;
34 struct smb_sb_info *server = server_from_dentry(dentry);
35 int result;
36
37 VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry));
38
39 /*
40 * The VFS will writepage() all dirty pages for us, but we
41 * should send a SMBflush to the server, letting it know that
42 * we want things synchronized with actual storage.
43 *
44 * Note: this function requires all pages to have been written already
45 * (should be ok with writepage_sync)
46 */
47 result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid);
48 return result;
49}
50
51/*
52 * Read a page synchronously.
53 */
54static int
55smb_readpage_sync(struct dentry *dentry, struct page *page)
56{
57 char *buffer = kmap(page);
58 loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
59 struct smb_sb_info *server = server_from_dentry(dentry);
60 unsigned int rsize = smb_get_rsize(server);
61 int count = PAGE_SIZE;
62 int result;
63
64 VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n",
65 DENTRY_PATH(dentry), count, offset, rsize);
66
67 result = smb_open(dentry, SMB_O_RDONLY);
68 if (result < 0)
69 goto io_error;
70
71 do {
72 if (count < rsize)
73 rsize = count;
74
75 result = server->ops->read(dentry->d_inode,offset,rsize,buffer);
76 if (result < 0)
77 goto io_error;
78
79 count -= result;
80 offset += result;
81 buffer += result;
82 dentry->d_inode->i_atime =
83 current_fs_time(dentry->d_inode->i_sb);
84 if (result < rsize)
85 break;
86 } while (count);
87
88 memset(buffer, 0, count);
89 flush_dcache_page(page);
90 SetPageUptodate(page);
91 result = 0;
92
93io_error:
94 kunmap(page);
95 unlock_page(page);
96 return result;
97}
98
99/*
100 * We are called with the page locked and we unlock it when done.
101 */
102static int
103smb_readpage(struct file *file, struct page *page)
104{
105 int error;
106 struct dentry *dentry = file->f_path.dentry;
107
108 page_cache_get(page);
109 error = smb_readpage_sync(dentry, page);
110 page_cache_release(page);
111 return error;
112}
113
114/*
115 * Write a page synchronously.
116 * Offset is the data offset within the page.
117 */
118static int
119smb_writepage_sync(struct inode *inode, struct page *page,
120 unsigned long pageoffset, unsigned int count)
121{
122 loff_t offset;
123 char *buffer = kmap(page) + pageoffset;
124 struct smb_sb_info *server = server_from_inode(inode);
125 unsigned int wsize = smb_get_wsize(server);
126 int ret = 0;
127
128 offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset;
129 VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n",
130 inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize);
131
132 do {
133 int write_ret;
134
135 if (count < wsize)
136 wsize = count;
137
138 write_ret = server->ops->write(inode, offset, wsize, buffer);
139 if (write_ret < 0) {
140 PARANOIA("failed write, wsize=%d, write_ret=%d\n",
141 wsize, write_ret);
142 ret = write_ret;
143 break;
144 }
145 /* N.B. what if result < wsize?? */
146#ifdef SMBFS_PARANOIA
147 if (write_ret < wsize)
148 PARANOIA("short write, wsize=%d, write_ret=%d\n",
149 wsize, write_ret);
150#endif
151 buffer += wsize;
152 offset += wsize;
153 count -= wsize;
154 /*
155 * Update the inode now rather than waiting for a refresh.
156 */
157 inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb);
158 SMB_I(inode)->flags |= SMB_F_LOCALWRITE;
159 if (offset > inode->i_size)
160 inode->i_size = offset;
161 } while (count);
162
163 kunmap(page);
164 return ret;
165}
166
167/*
168 * Write a page to the server. This will be used for NFS swapping only
169 * (for now), and we currently do this synchronously only.
170 *
171 * We are called with the page locked and we unlock it when done.
172 */
173static int
174smb_writepage(struct page *page, struct writeback_control *wbc)
175{
176 struct address_space *mapping = page->mapping;
177 struct inode *inode;
178 unsigned long end_index;
179 unsigned offset = PAGE_CACHE_SIZE;
180 int err;
181
182 BUG_ON(!mapping);
183 inode = mapping->host;
184 BUG_ON(!inode);
185
186 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
187
188 /* easy case */
189 if (page->index < end_index)
190 goto do_it;
191 /* things got complicated... */
192 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
193 /* OK, are we completely out? */
194 if (page->index >= end_index+1 || !offset)
195 return 0; /* truncated - don't care */
196do_it:
197 page_cache_get(page);
198 err = smb_writepage_sync(inode, page, 0, offset);
199 SetPageUptodate(page);
200 unlock_page(page);
201 page_cache_release(page);
202 return err;
203}
204
205static int
206smb_updatepage(struct file *file, struct page *page, unsigned long offset,
207 unsigned int count)
208{
209 struct dentry *dentry = file->f_path.dentry;
210
211 DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
212 ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
213
214 return smb_writepage_sync(dentry->d_inode, page, offset, count);
215}
216
217static ssize_t
218smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
219 unsigned long nr_segs, loff_t pos)
220{
221 struct file * file = iocb->ki_filp;
222 struct dentry * dentry = file->f_path.dentry;
223 ssize_t status;
224
225 VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
226 (unsigned long) iocb->ki_left, (unsigned long) pos);
227
228 status = smb_revalidate_inode(dentry);
229 if (status) {
230 PARANOIA("%s/%s validation failed, error=%Zd\n",
231 DENTRY_PATH(dentry), status);
232 goto out;
233 }
234
235 VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
236 (long)dentry->d_inode->i_size,
237 dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
238
239 status = generic_file_aio_read(iocb, iov, nr_segs, pos);
240out:
241 return status;
242}
243
244static int
245smb_file_mmap(struct file * file, struct vm_area_struct * vma)
246{
247 struct dentry * dentry = file->f_path.dentry;
248 int status;
249
250 VERBOSE("file %s/%s, address %lu - %lu\n",
251 DENTRY_PATH(dentry), vma->vm_start, vma->vm_end);
252
253 status = smb_revalidate_inode(dentry);
254 if (status) {
255 PARANOIA("%s/%s validation failed, error=%d\n",
256 DENTRY_PATH(dentry), status);
257 goto out;
258 }
259 status = generic_file_mmap(file, vma);
260out:
261 return status;
262}
263
264static ssize_t
265smb_file_splice_read(struct file *file, loff_t *ppos,
266 struct pipe_inode_info *pipe, size_t count,
267 unsigned int flags)
268{
269 struct dentry *dentry = file->f_path.dentry;
270 ssize_t status;
271
272 VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
273 DENTRY_PATH(dentry), *ppos, count);
274
275 status = smb_revalidate_inode(dentry);
276 if (status) {
277 PARANOIA("%s/%s validation failed, error=%Zd\n",
278 DENTRY_PATH(dentry), status);
279 goto out;
280 }
281 status = generic_file_splice_read(file, ppos, pipe, count, flags);
282out:
283 return status;
284}
285
286/*
287 * This does the "real" work of the write. The generic routine has
288 * allocated the page, locked it, done all the page alignment stuff
289 * calculations etc. Now we should just copy the data from user
290 * space and write it back to the real medium..
291 *
292 * If the writer ends up delaying the write, the writer needs to
293 * increment the page use counts until he is done with the page.
294 */
295static int smb_write_begin(struct file *file, struct address_space *mapping,
296 loff_t pos, unsigned len, unsigned flags,
297 struct page **pagep, void **fsdata)
298{
299 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
300 *pagep = grab_cache_page_write_begin(mapping, index, flags);
301 if (!*pagep)
302 return -ENOMEM;
303 return 0;
304}
305
306static int smb_write_end(struct file *file, struct address_space *mapping,
307 loff_t pos, unsigned len, unsigned copied,
308 struct page *page, void *fsdata)
309{
310 int status;
311 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
312
313 lock_kernel();
314 status = smb_updatepage(file, page, offset, copied);
315 unlock_kernel();
316
317 if (!status) {
318 if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
319 SetPageUptodate(page);
320 status = copied;
321 }
322
323 unlock_page(page);
324 page_cache_release(page);
325
326 return status;
327}
328
329const struct address_space_operations smb_file_aops = {
330 .readpage = smb_readpage,
331 .writepage = smb_writepage,
332 .write_begin = smb_write_begin,
333 .write_end = smb_write_end,
334};
335
336/*
337 * Write to a file (through the page cache).
338 */
339static ssize_t
340smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
341 unsigned long nr_segs, loff_t pos)
342{
343 struct file * file = iocb->ki_filp;
344 struct dentry * dentry = file->f_path.dentry;
345 ssize_t result;
346
347 VERBOSE("file %s/%s, count=%lu@%lu\n",
348 DENTRY_PATH(dentry),
349 (unsigned long) iocb->ki_left, (unsigned long) pos);
350
351 result = smb_revalidate_inode(dentry);
352 if (result) {
353 PARANOIA("%s/%s validation failed, error=%Zd\n",
354 DENTRY_PATH(dentry), result);
355 goto out;
356 }
357
358 result = smb_open(dentry, SMB_O_WRONLY);
359 if (result)
360 goto out;
361
362 if (iocb->ki_left > 0) {
363 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
364 VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
365 (long) file->f_pos, (long) dentry->d_inode->i_size,
366 dentry->d_inode->i_mtime.tv_sec,
367 dentry->d_inode->i_atime.tv_sec);
368 }
369out:
370 return result;
371}
372
373static int
374smb_file_open(struct inode *inode, struct file * file)
375{
376 int result;
377 struct dentry *dentry = file->f_path.dentry;
378 int smb_mode = (file->f_mode & O_ACCMODE) - 1;
379
380 lock_kernel();
381 result = smb_open(dentry, smb_mode);
382 if (result)
383 goto out;
384 SMB_I(inode)->openers++;
385out:
386 unlock_kernel();
387 return result;
388}
389
390static int
391smb_file_release(struct inode *inode, struct file * file)
392{
393 lock_kernel();
394 if (!--SMB_I(inode)->openers) {
395 /* We must flush any dirty pages now as we won't be able to
396 write anything after close. mmap can trigger this.
397 "openers" should perhaps include mmap'ers ... */
398 filemap_write_and_wait(inode->i_mapping);
399 smb_close(inode);
400 }
401 unlock_kernel();
402 return 0;
403}
404
405/*
406 * Check whether the required access is compatible with
407 * an inode's permission. SMB doesn't recognize superuser
408 * privileges, so we need our own check for this.
409 */
410static int
411smb_file_permission(struct inode *inode, int mask)
412{
413 int mode = inode->i_mode;
414 int error = 0;
415
416 VERBOSE("mode=%x, mask=%x\n", mode, mask);
417
418 /* Look at user permissions */
419 mode >>= 6;
420 if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
421 error = -EACCES;
422 return error;
423}
424
425static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
426{
427 loff_t ret;
428 lock_kernel();
429 ret = generic_file_llseek_unlocked(file, offset, origin);
430 unlock_kernel();
431 return ret;
432}
433
434const struct file_operations smb_file_operations =
435{
436 .llseek = smb_remote_llseek,
437 .read = do_sync_read,
438 .aio_read = smb_file_aio_read,
439 .write = do_sync_write,
440 .aio_write = smb_file_aio_write,
441 .unlocked_ioctl = smb_ioctl,
442 .mmap = smb_file_mmap,
443 .open = smb_file_open,
444 .release = smb_file_release,
445 .fsync = smb_fsync,
446 .splice_read = smb_file_splice_read,
447};
448
449const struct inode_operations smb_file_inode_operations =
450{
451 .permission = smb_file_permission,
452 .getattr = smb_getattr,
453 .setattr = smb_notify_change,
454};
diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c
deleted file mode 100644
index 7ae0f5273ab1..000000000000
--- a/fs/smbfs/getopt.c
+++ /dev/null
@@ -1,64 +0,0 @@
1/*
2 * getopt.c
3 */
4
5#include <linux/kernel.h>
6#include <linux/string.h>
7#include <linux/net.h>
8
9#include "getopt.h"
10
11/**
12 * smb_getopt - option parser
13 * @caller: name of the caller, for error messages
14 * @options: the options string
15 * @opts: an array of &struct option entries controlling parser operations
16 * @optopt: output; will contain the current option
17 * @optarg: output; will contain the value (if one exists)
18 * @flag: output; may be NULL; should point to a long for or'ing flags
19 * @value: output; may be NULL; will be overwritten with the integer value
20 * of the current argument.
21 *
22 * Helper to parse options on the format used by mount ("a=b,c=d,e,f").
23 * Returns opts->val if a matching entry in the 'opts' array is found,
24 * 0 when no more tokens are found, -1 if an error is encountered.
25 */
26int smb_getopt(char *caller, char **options, struct option *opts,
27 char **optopt, char **optarg, unsigned long *flag,
28 unsigned long *value)
29{
30 char *token;
31 char *val;
32 int i;
33
34 do {
35 if ((token = strsep(options, ",")) == NULL)
36 return 0;
37 } while (*token == '\0');
38 *optopt = token;
39
40 *optarg = NULL;
41 if ((val = strchr (token, '=')) != NULL) {
42 *val++ = 0;
43 if (value)
44 *value = simple_strtoul(val, NULL, 0);
45 *optarg = val;
46 }
47
48 for (i = 0; opts[i].name != NULL; i++) {
49 if (!strcmp(opts[i].name, token)) {
50 if (!opts[i].flag && (!val || !*val)) {
51 printk("%s: the %s option requires an argument\n",
52 caller, token);
53 return -1;
54 }
55
56 if (flag && opts[i].flag)
57 *flag |= opts[i].flag;
58
59 return opts[i].val;
60 }
61 }
62 printk("%s: Unrecognized mount option %s\n", caller, token);
63 return -1;
64}
diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h
deleted file mode 100644
index 146219ac7c46..000000000000
--- a/fs/smbfs/getopt.h
+++ /dev/null
@@ -1,14 +0,0 @@
1#ifndef _LINUX_GETOPT_H
2#define _LINUX_GETOPT_H
3
/*
 * One entry of the option table consumed by smb_getopt().
 * The table is terminated by an entry whose name is NULL.
 */
struct option {
	const char *name;	/* option name as written in the mount string */
	unsigned long flag;	/* bits or'ed into the caller's flag word; 0 means an argument is required */
	int val;		/* value smb_getopt() returns when this entry matches */
};
9
10extern int smb_getopt(char *caller, char **options, struct option *opts,
11 char **optopt, char **optarg, unsigned long *flag,
12 unsigned long *value);
13
14#endif /* _LINUX_GETOPT_H */
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
deleted file mode 100644
index 450c91941988..000000000000
--- a/fs/smbfs/inode.c
+++ /dev/null
@@ -1,839 +0,0 @@
1/*
2 * inode.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/module.h>
11#include <linux/time.h>
12#include <linux/kernel.h>
13#include <linux/mm.h>
14#include <linux/string.h>
15#include <linux/stat.h>
16#include <linux/errno.h>
17#include <linux/slab.h>
18#include <linux/init.h>
19#include <linux/file.h>
20#include <linux/dcache.h>
21#include <linux/smp_lock.h>
22#include <linux/nls.h>
23#include <linux/seq_file.h>
24#include <linux/mount.h>
25#include <linux/net.h>
26#include <linux/vfs.h>
27#include <linux/highuid.h>
28#include <linux/sched.h>
29#include <linux/smb_fs.h>
30#include <linux/smbno.h>
31#include <linux/smb_mount.h>
32
33#include <asm/system.h>
34#include <asm/uaccess.h>
35
36#include "smb_debug.h"
37#include "getopt.h"
38#include "proto.h"
39
40/* Always pick a default string */
41#ifdef CONFIG_SMB_NLS_REMOTE
42#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE
43#else
44#define SMB_NLS_REMOTE ""
45#endif
46
47#define SMB_TTL_DEFAULT 1000
48
49static void smb_evict_inode(struct inode *);
50static void smb_put_super(struct super_block *);
51static int smb_statfs(struct dentry *, struct kstatfs *);
52static int smb_show_options(struct seq_file *, struct vfsmount *);
53
54static struct kmem_cache *smb_inode_cachep;
55
56static struct inode *smb_alloc_inode(struct super_block *sb)
57{
58 struct smb_inode_info *ei;
59 ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
60 if (!ei)
61 return NULL;
62 return &ei->vfs_inode;
63}
64
65static void smb_destroy_inode(struct inode *inode)
66{
67 kmem_cache_free(smb_inode_cachep, SMB_I(inode));
68}
69
70static void init_once(void *foo)
71{
72 struct smb_inode_info *ei = (struct smb_inode_info *) foo;
73
74 inode_init_once(&ei->vfs_inode);
75}
76
77static int init_inodecache(void)
78{
79 smb_inode_cachep = kmem_cache_create("smb_inode_cache",
80 sizeof(struct smb_inode_info),
81 0, (SLAB_RECLAIM_ACCOUNT|
82 SLAB_MEM_SPREAD),
83 init_once);
84 if (smb_inode_cachep == NULL)
85 return -ENOMEM;
86 return 0;
87}
88
/* Destroy the slab cache created by init_inodecache(). */
static void destroy_inodecache(void)
{
	kmem_cache_destroy(smb_inode_cachep);
}
93
94static int smb_remount(struct super_block *sb, int *flags, char *data)
95{
96 *flags |= MS_NODIRATIME;
97 return 0;
98}
99
100static const struct super_operations smb_sops =
101{
102 .alloc_inode = smb_alloc_inode,
103 .destroy_inode = smb_destroy_inode,
104 .drop_inode = generic_delete_inode,
105 .evict_inode = smb_evict_inode,
106 .put_super = smb_put_super,
107 .statfs = smb_statfs,
108 .show_options = smb_show_options,
109 .remount_fs = smb_remount,
110};
111
112
113/* We are always generating a new inode here */
114struct inode *
115smb_iget(struct super_block *sb, struct smb_fattr *fattr)
116{
117 struct smb_sb_info *server = SMB_SB(sb);
118 struct inode *result;
119
120 DEBUG1("smb_iget: %p\n", fattr);
121
122 result = new_inode(sb);
123 if (!result)
124 return result;
125 result->i_ino = fattr->f_ino;
126 SMB_I(result)->open = 0;
127 SMB_I(result)->fileid = 0;
128 SMB_I(result)->access = 0;
129 SMB_I(result)->flags = 0;
130 SMB_I(result)->closed = 0;
131 SMB_I(result)->openers = 0;
132 smb_set_inode_attr(result, fattr);
133 if (S_ISREG(result->i_mode)) {
134 result->i_op = &smb_file_inode_operations;
135 result->i_fop = &smb_file_operations;
136 result->i_data.a_ops = &smb_file_aops;
137 } else if (S_ISDIR(result->i_mode)) {
138 if (server->opt.capabilities & SMB_CAP_UNIX)
139 result->i_op = &smb_dir_inode_operations_unix;
140 else
141 result->i_op = &smb_dir_inode_operations;
142 result->i_fop = &smb_dir_operations;
143 } else if (S_ISLNK(result->i_mode)) {
144 result->i_op = &smb_link_inode_operations;
145 } else {
146 init_special_inode(result, result->i_mode, fattr->f_rdev);
147 }
148 insert_inode_hash(result);
149 return result;
150}
151
152/*
153 * Copy the inode data to a smb_fattr structure.
154 */
155void
156smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
157{
158 memset(fattr, 0, sizeof(struct smb_fattr));
159 fattr->f_mode = inode->i_mode;
160 fattr->f_nlink = inode->i_nlink;
161 fattr->f_ino = inode->i_ino;
162 fattr->f_uid = inode->i_uid;
163 fattr->f_gid = inode->i_gid;
164 fattr->f_size = inode->i_size;
165 fattr->f_mtime = inode->i_mtime;
166 fattr->f_ctime = inode->i_ctime;
167 fattr->f_atime = inode->i_atime;
168 fattr->f_blocks = inode->i_blocks;
169
170 fattr->attr = SMB_I(inode)->attr;
171 /*
172 * Keep the attributes in sync with the inode permissions.
173 */
174 if (fattr->f_mode & S_IWUSR)
175 fattr->attr &= ~aRONLY;
176 else
177 fattr->attr |= aRONLY;
178}
179
180/*
181 * Update the inode, possibly causing it to invalidate its pages if mtime/size
182 * is different from last time.
183 */
184void
185smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
186{
187 struct smb_inode_info *ei = SMB_I(inode);
188
189 /*
190 * A size change should have a different mtime, or same mtime
191 * but different size.
192 */
193 time_t last_time = inode->i_mtime.tv_sec;
194 loff_t last_sz = inode->i_size;
195
196 inode->i_mode = fattr->f_mode;
197 inode->i_nlink = fattr->f_nlink;
198 inode->i_uid = fattr->f_uid;
199 inode->i_gid = fattr->f_gid;
200 inode->i_ctime = fattr->f_ctime;
201 inode->i_blocks = fattr->f_blocks;
202 inode->i_size = fattr->f_size;
203 inode->i_mtime = fattr->f_mtime;
204 inode->i_atime = fattr->f_atime;
205 ei->attr = fattr->attr;
206
207 /*
208 * Update the "last time refreshed" field for revalidation.
209 */
210 ei->oldmtime = jiffies;
211
212 if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) {
213 VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n",
214 inode->i_ino,
215 (long) last_time, (long) inode->i_mtime.tv_sec,
216 (long) last_sz, (long) inode->i_size);
217
218 if (!S_ISDIR(inode->i_mode))
219 invalidate_remote_inode(inode);
220 }
221}
222
/*
 * Called when the connection has gone bad: shrink the dentry cache
 * of the server's super block and try to drop all of its inodes.
 */
void
smb_invalidate_inodes(struct smb_sb_info *server)
{
	struct super_block *sb = SB_of(server);

	VERBOSE("\n");
	shrink_dcache_sb(sb);
	invalidate_inodes(sb);
}
234
235/*
236 * This is called to update the inode attributes after
237 * we've made changes to a file or directory.
238 */
239static int
240smb_refresh_inode(struct dentry *dentry)
241{
242 struct inode *inode = dentry->d_inode;
243 int error;
244 struct smb_fattr fattr;
245
246 error = smb_proc_getattr(dentry, &fattr);
247 if (!error) {
248 smb_renew_times(dentry);
249 /*
250 * Check whether the type part of the mode changed,
251 * and don't update the attributes if it did.
252 *
253 * And don't dick with the root inode
254 */
255 if (inode->i_ino == 2)
256 return error;
257 if (S_ISLNK(inode->i_mode))
258 return error; /* VFS will deal with it */
259
260 if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) {
261 smb_set_inode_attr(inode, &fattr);
262 } else {
263 /*
264 * Big trouble! The inode has become a new object,
265 * so any operations attempted on it are invalid.
266 *
267 * To limit damage, mark the inode as bad so that
268 * subsequent lookup validations will fail.
269 */
270 PARANOIA("%s/%s changed mode, %07o to %07o\n",
271 DENTRY_PATH(dentry),
272 inode->i_mode, fattr.f_mode);
273
274 fattr.f_mode = inode->i_mode; /* save mode */
275 make_bad_inode(inode);
276 inode->i_mode = fattr.f_mode; /* restore mode */
277 /*
278 * No need to worry about unhashing the dentry: the
279 * lookup validation will see that the inode is bad.
280 * But we do want to invalidate the caches ...
281 */
282 if (!S_ISDIR(inode->i_mode))
283 invalidate_remote_inode(inode);
284 else
285 smb_invalid_dir_cache(inode);
286 error = -EIO;
287 }
288 }
289 return error;
290}
291
292/*
293 * This is called when we want to check whether the inode
294 * has changed on the server. If it has changed, we must
295 * invalidate our local caches.
296 */
297int
298smb_revalidate_inode(struct dentry *dentry)
299{
300 struct smb_sb_info *s = server_from_dentry(dentry);
301 struct inode *inode = dentry->d_inode;
302 int error = 0;
303
304 DEBUG1("smb_revalidate_inode\n");
305 lock_kernel();
306
307 /*
308 * Check whether we've recently refreshed the inode.
309 */
310 if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) {
311 VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n",
312 inode->i_ino, jiffies, SMB_I(inode)->oldmtime);
313 goto out;
314 }
315
316 error = smb_refresh_inode(dentry);
317out:
318 unlock_kernel();
319 return error;
320}
321
322/*
323 * This routine is called when i_nlink == 0 and i_count goes to 0.
324 * All blocking cleanup operations need to go here to avoid races.
325 */
326static void
327smb_evict_inode(struct inode *ino)
328{
329 DEBUG1("ino=%ld\n", ino->i_ino);
330 truncate_inode_pages(&ino->i_data, 0);
331 end_writeback(ino);
332 lock_kernel();
333 if (smb_close(ino))
334 PARANOIA("could not close inode %ld\n", ino->i_ino);
335 unlock_kernel();
336}
337
338static struct option opts[] = {
339 { "version", 0, 'v' },
340 { "win95", SMB_MOUNT_WIN95, 1 },
341 { "oldattr", SMB_MOUNT_OLDATTR, 1 },
342 { "dirattr", SMB_MOUNT_DIRATTR, 1 },
343 { "case", SMB_MOUNT_CASE, 1 },
344 { "uid", 0, 'u' },
345 { "gid", 0, 'g' },
346 { "file_mode", 0, 'f' },
347 { "dir_mode", 0, 'd' },
348 { "iocharset", 0, 'i' },
349 { "codepage", 0, 'c' },
350 { "ttl", 0, 't' },
351 { NULL, 0, 0}
352};
353
354static int
355parse_options(struct smb_mount_data_kernel *mnt, char *options)
356{
357 int c;
358 unsigned long flags;
359 unsigned long value;
360 char *optarg;
361 char *optopt;
362
363 flags = 0;
364 while ( (c = smb_getopt("smbfs", &options, opts,
365 &optopt, &optarg, &flags, &value)) > 0) {
366
367 VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
368 switch (c) {
369 case 1:
370 /* got a "flag" option */
371 break;
372 case 'v':
373 if (value != SMB_MOUNT_VERSION) {
374 printk ("smbfs: Bad mount version %ld, expected %d\n",
375 value, SMB_MOUNT_VERSION);
376 return 0;
377 }
378 mnt->version = value;
379 break;
380 case 'u':
381 mnt->uid = value;
382 flags |= SMB_MOUNT_UID;
383 break;
384 case 'g':
385 mnt->gid = value;
386 flags |= SMB_MOUNT_GID;
387 break;
388 case 'f':
389 mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
390 flags |= SMB_MOUNT_FMODE;
391 break;
392 case 'd':
393 mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
394 flags |= SMB_MOUNT_DMODE;
395 break;
396 case 'i':
397 strlcpy(mnt->codepage.local_name, optarg,
398 SMB_NLS_MAXNAMELEN);
399 break;
400 case 'c':
401 strlcpy(mnt->codepage.remote_name, optarg,
402 SMB_NLS_MAXNAMELEN);
403 break;
404 case 't':
405 mnt->ttl = value;
406 break;
407 default:
408 printk ("smbfs: Unrecognized mount option %s\n",
409 optopt);
410 return -1;
411 }
412 }
413 mnt->flags = flags;
414 return c;
415}
416
417/*
418 * smb_show_options() is for displaying mount options in /proc/mounts.
419 * It tries to avoid showing settings that were not changed from their
420 * defaults.
421 */
422static int
423smb_show_options(struct seq_file *s, struct vfsmount *m)
424{
425 struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt;
426 int i;
427
428 for (i = 0; opts[i].name != NULL; i++)
429 if (mnt->flags & opts[i].flag)
430 seq_printf(s, ",%s", opts[i].name);
431
432 if (mnt->flags & SMB_MOUNT_UID)
433 seq_printf(s, ",uid=%d", mnt->uid);
434 if (mnt->flags & SMB_MOUNT_GID)
435 seq_printf(s, ",gid=%d", mnt->gid);
436 if (mnt->mounted_uid != 0)
437 seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
438
439 /*
440 * Defaults for file_mode and dir_mode are unknown to us; they
441 * depend on the current umask of the user doing the mount.
442 */
443 if (mnt->flags & SMB_MOUNT_FMODE)
444 seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
445 if (mnt->flags & SMB_MOUNT_DMODE)
446 seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
447
448 if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
449 seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
450 if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE))
451 seq_printf(s, ",codepage=%s", mnt->codepage.remote_name);
452
453 if (mnt->ttl != SMB_TTL_DEFAULT)
454 seq_printf(s, ",ttl=%d", mnt->ttl);
455
456 return 0;
457}
458
/*
 * Drop both NLS tables of @server.  The pointers themselves are left
 * as they are; callers in this file free the server right afterwards.
 */
static void
smb_unload_nls(struct smb_sb_info *server)
{
	unload_nls(server->remote_nls);
	unload_nls(server->local_nls);
}
465
/*
 * ->put_super: tear down the per-mount state hanging off
 * sb->s_fs_info.  Runs under the big kernel lock.
 */
static void
smb_put_super(struct super_block *sb)
{
	struct smb_sb_info *server = SMB_SB(sb);

	lock_kernel();

	/* Mark the connection invalid and detach from smbiod first. */
	smb_lock_server(server);
	server->state = CONN_INVALID;
	smbiod_unregister_server(server);

	smb_close_socket(server);

	/* Tell the connection process, if one is recorded, to terminate. */
	if (server->conn_pid)
		kill_pid(server->conn_pid, SIGTERM, 1);

	bdi_destroy(&server->bdi);
	/*
	 * smb_fill_super() allocates ops and mnt as one chunk with ops
	 * at its start, so this frees server->mnt as well.
	 */
	kfree(server->ops);
	smb_unload_nls(server);
	sb->s_fs_info = NULL;
	smb_unlock_server(server);
	put_pid(server->conn_pid);
	kfree(server);

	unlock_kernel();
}
492
493static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
494{
495 struct smb_sb_info *server;
496 struct smb_mount_data_kernel *mnt;
497 struct smb_mount_data *oldmnt;
498 struct inode *root_inode;
499 struct smb_fattr root;
500 int ver;
501 void *mem;
502 static int warn_count;
503
504 if (warn_count < 5) {
505 warn_count++;
506 printk(KERN_EMERG "smbfs is deprecated and will be removed"
507 " from the 2.6.27 kernel. Please migrate to cifs\n");
508 }
509
510 if (!raw_data)
511 goto out_no_data;
512
513 oldmnt = (struct smb_mount_data *) raw_data;
514 ver = oldmnt->version;
515 if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII)
516 goto out_wrong_data;
517
518 sb->s_flags |= MS_NODIRATIME;
519 sb->s_blocksize = 1024; /* Eh... Is this correct? */
520 sb->s_blocksize_bits = 10;
521 sb->s_magic = SMB_SUPER_MAGIC;
522 sb->s_op = &smb_sops;
523 sb->s_time_gran = 100;
524
525 server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
526 if (!server)
527 goto out_no_server;
528 sb->s_fs_info = server;
529
530 if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
531 goto out_bdi;
532
533 sb->s_bdi = &server->bdi;
534
535 server->super_block = sb;
536 server->mnt = NULL;
537 server->sock_file = NULL;
538 init_waitqueue_head(&server->conn_wq);
539 init_MUTEX(&server->sem);
540 INIT_LIST_HEAD(&server->entry);
541 INIT_LIST_HEAD(&server->xmitq);
542 INIT_LIST_HEAD(&server->recvq);
543 server->conn_error = 0;
544 server->conn_pid = NULL;
545 server->state = CONN_INVALID; /* no connection yet */
546 server->generation = 0;
547
548 /* Allocate the global temp buffer and some superblock helper structs */
549 /* FIXME: move these to the smb_sb_info struct */
550 VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
551 sizeof(struct smb_mount_data_kernel));
552 mem = kmalloc(sizeof(struct smb_ops) +
553 sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
554 if (!mem)
555 goto out_no_mem;
556
557 server->ops = mem;
558 smb_install_null_ops(server->ops);
559 server->mnt = mem + sizeof(struct smb_ops);
560
561 /* Setup NLS stuff */
562 server->remote_nls = NULL;
563 server->local_nls = NULL;
564
565 mnt = server->mnt;
566
567 memset(mnt, 0, sizeof(struct smb_mount_data_kernel));
568 strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT,
569 SMB_NLS_MAXNAMELEN);
570 strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE,
571 SMB_NLS_MAXNAMELEN);
572
573 mnt->ttl = SMB_TTL_DEFAULT;
574 if (ver == SMB_MOUNT_OLDVERSION) {
575 mnt->version = oldmnt->version;
576
577 SET_UID(mnt->uid, oldmnt->uid);
578 SET_GID(mnt->gid, oldmnt->gid);
579
580 mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
581 mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
582
583 mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
584 SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
585 } else {
586 mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
587 S_IROTH | S_IXOTH | S_IFREG;
588 mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
589 S_IROTH | S_IXOTH | S_IFDIR;
590 if (parse_options(mnt, raw_data))
591 goto out_bad_option;
592 }
593 mnt->mounted_uid = current_uid();
594 smb_setcodepage(server, &mnt->codepage);
595
596 /*
597 * Display the enabled options
598 * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2)
599 */
600 if (mnt->flags & SMB_MOUNT_OLDATTR)
601 printk("SMBFS: Using core getattr (Win 95 speedup)\n");
602 else if (mnt->flags & SMB_MOUNT_DIRATTR)
603 printk("SMBFS: Using dir ff getattr\n");
604
605 if (smbiod_register_server(server) < 0) {
606 printk(KERN_ERR "smbfs: failed to start smbiod\n");
607 goto out_no_smbiod;
608 }
609
610 /*
611 * Keep the super block locked while we get the root inode.
612 */
613 smb_init_root_dirent(server, &root, sb);
614 root_inode = smb_iget(sb, &root);
615 if (!root_inode)
616 goto out_no_root;
617
618 sb->s_root = d_alloc_root(root_inode);
619 if (!sb->s_root)
620 goto out_no_root;
621
622 smb_new_dentry(sb->s_root);
623
624 return 0;
625
626out_no_root:
627 iput(root_inode);
628out_no_smbiod:
629 smb_unload_nls(server);
630out_bad_option:
631 kfree(mem);
632out_no_mem:
633 bdi_destroy(&server->bdi);
634out_bdi:
635 if (!server->mnt)
636 printk(KERN_ERR "smb_fill_super: allocation failure\n");
637 sb->s_fs_info = NULL;
638 kfree(server);
639 goto out_fail;
640out_wrong_data:
641 printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
642 goto out_fail;
643out_no_data:
644 printk(KERN_ERR "smb_fill_super: missing data argument\n");
645out_fail:
646 return -EINVAL;
647out_no_server:
648 printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
649 return -ENOMEM;
650}
651
652static int
653smb_statfs(struct dentry *dentry, struct kstatfs *buf)
654{
655 int result;
656
657 lock_kernel();
658
659 result = smb_proc_dskattr(dentry, buf);
660
661 unlock_kernel();
662
663 buf->f_type = SMB_SUPER_MAGIC;
664 buf->f_namelen = SMB_MAXPATHLEN;
665 return result;
666}
667
668int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
669{
670 int err = smb_revalidate_inode(dentry);
671 if (!err)
672 generic_fillattr(dentry->d_inode, stat);
673 return err;
674}
675
676int
677smb_notify_change(struct dentry *dentry, struct iattr *attr)
678{
679 struct inode *inode = dentry->d_inode;
680 struct smb_sb_info *server = server_from_dentry(dentry);
681 unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO);
682 int error, changed, refresh = 0;
683 struct smb_fattr fattr;
684
685 lock_kernel();
686
687 error = smb_revalidate_inode(dentry);
688 if (error)
689 goto out;
690
691 if ((error = inode_change_ok(inode, attr)) < 0)
692 goto out;
693
694 error = -EPERM;
695 if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid))
696 goto out;
697
698 if ((attr->ia_valid & ATTR_GID) && (attr->ia_uid != server->mnt->gid))
699 goto out;
700
701 if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask))
702 goto out;
703
704 if ((attr->ia_valid & ATTR_SIZE) != 0) {
705 VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n",
706 DENTRY_PATH(dentry),
707 (long) inode->i_size, (long) attr->ia_size);
708
709 filemap_write_and_wait(inode->i_mapping);
710
711 error = smb_open(dentry, O_WRONLY);
712 if (error)
713 goto out;
714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error)
716 goto out;
717 truncate_setsize(inode, attr->ia_size);
718 refresh = 1;
719 }
720
721 if (server->opt.capabilities & SMB_CAP_UNIX) {
722 /* For now we don't want to set the size with setattr_unix */
723 attr->ia_valid &= ~ATTR_SIZE;
724 /* FIXME: only call if we actually want to set something? */
725 error = smb_proc_setattr_unix(dentry, attr, 0, 0);
726 if (!error)
727 refresh = 1;
728
729 goto out;
730 }
731
732 /*
733 * Initialize the fattr and check for changed fields.
734 * Note: CTIME under SMB is creation time rather than
735 * change time, so we don't attempt to change it.
736 */
737 smb_get_inode_attr(inode, &fattr);
738
739 changed = 0;
740 if ((attr->ia_valid & ATTR_MTIME) != 0) {
741 fattr.f_mtime = attr->ia_mtime;
742 changed = 1;
743 }
744 if ((attr->ia_valid & ATTR_ATIME) != 0) {
745 fattr.f_atime = attr->ia_atime;
746 /* Earlier protocols don't have an access time */
747 if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2)
748 changed = 1;
749 }
750 if (changed) {
751 error = smb_proc_settime(dentry, &fattr);
752 if (error)
753 goto out;
754 refresh = 1;
755 }
756
757 /*
758 * Check for mode changes ... we're extremely limited in
759 * what can be set for SMB servers: just the read-only bit.
760 */
761 if ((attr->ia_valid & ATTR_MODE) != 0) {
762 VERBOSE("%s/%s mode change, old=%x, new=%x\n",
763 DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode);
764 changed = 0;
765 if (attr->ia_mode & S_IWUSR) {
766 if (fattr.attr & aRONLY) {
767 fattr.attr &= ~aRONLY;
768 changed = 1;
769 }
770 } else {
771 if (!(fattr.attr & aRONLY)) {
772 fattr.attr |= aRONLY;
773 changed = 1;
774 }
775 }
776 if (changed) {
777 error = smb_proc_setattr(dentry, &fattr);
778 if (error)
779 goto out;
780 refresh = 1;
781 }
782 }
783 error = 0;
784
785out:
786 if (refresh)
787 smb_refresh_inode(dentry);
788 unlock_kernel();
789 return error;
790}
791
/*
 * ->get_sb: smbfs has no backing block device, so hand the mount
 * over to get_sb_nodev() with smb_fill_super() as the fill callback.
 * @dev_name is not used.
 */
static int smb_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
}
797
798static struct file_system_type smb_fs_type = {
799 .owner = THIS_MODULE,
800 .name = "smbfs",
801 .get_sb = smb_get_sb,
802 .kill_sb = kill_anon_super,
803 .fs_flags = FS_BINARY_MOUNTDATA,
804};
805
806static int __init init_smb_fs(void)
807{
808 int err;
809 DEBUG1("registering ...\n");
810
811 err = init_inodecache();
812 if (err)
813 goto out_inode;
814 err = smb_init_request_cache();
815 if (err)
816 goto out_request;
817 err = register_filesystem(&smb_fs_type);
818 if (err)
819 goto out;
820 return 0;
821out:
822 smb_destroy_request_cache();
823out_request:
824 destroy_inodecache();
825out_inode:
826 return err;
827}
828
/*
 * Module exit: unregister the filesystem, then destroy the request
 * cache and the inode cache that init_smb_fs() set up.
 */
static void __exit exit_smb_fs(void)
{
	DEBUG1("unregistering ...\n");
	unregister_filesystem(&smb_fs_type);
	smb_destroy_request_cache();
	destroy_inodecache();
}
836
837module_init(init_smb_fs)
838module_exit(exit_smb_fs)
839MODULE_LICENSE("GPL");
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
deleted file mode 100644
index 07215312ad39..000000000000
--- a/fs/smbfs/ioctl.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/*
2 * ioctl.c
3 *
4 * Copyright (C) 1995, 1996 by Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/errno.h>
11#include <linux/fs.h>
12#include <linux/ioctl.h>
13#include <linux/time.h>
14#include <linux/mm.h>
15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
17#include <linux/net.h>
18
19#include <linux/smb_fs.h>
20#include <linux/smb_mount.h>
21
22#include <asm/uaccess.h>
23
24#include "proto.h"
25
26long
27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
28{
29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt;
31 int result = -EINVAL;
32
33 lock_kernel();
34 switch (cmd) {
35 uid16_t uid16;
36 uid_t uid32;
37 case SMB_IOC_GETMOUNTUID:
38 SET_UID(uid16, server->mnt->mounted_uid);
39 result = put_user(uid16, (uid16_t __user *) arg);
40 break;
41 case SMB_IOC_GETMOUNTUID32:
42 SET_UID(uid32, server->mnt->mounted_uid);
43 result = put_user(uid32, (uid_t __user *) arg);
44 break;
45
46 case SMB_IOC_NEWCONN:
47 /* arg is smb_conn_opt, or NULL if no connection was made */
48 if (!arg) {
49 result = 0;
50 smb_lock_server(server);
51 server->state = CONN_RETRIED;
52 printk(KERN_ERR "Connection attempt failed! [%d]\n",
53 server->conn_error);
54 smbiod_flush(server);
55 smb_unlock_server(server);
56 break;
57 }
58
59 result = -EFAULT;
60 if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt)))
61 result = smb_newconn(server, &opt);
62 break;
63 default:
64 break;
65 }
66 unlock_kernel();
67
68 return result;
69}
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
deleted file mode 100644
index 71c29b6670b4..000000000000
--- a/fs/smbfs/proc.c
+++ /dev/null
@@ -1,3507 +0,0 @@
1/*
2 * proc.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/types.h>
11#include <linux/capability.h>
12#include <linux/errno.h>
13#include <linux/slab.h>
14#include <linux/fs.h>
15#include <linux/file.h>
16#include <linux/stat.h>
17#include <linux/fcntl.h>
18#include <linux/dcache.h>
19#include <linux/nls.h>
20#include <linux/smp_lock.h>
21#include <linux/net.h>
22#include <linux/vfs.h>
23#include <linux/smb_fs.h>
24#include <linux/smbno.h>
25#include <linux/smb_mount.h>
26
27#include <net/sock.h>
28
29#include <asm/string.h>
30#include <asm/div64.h>
31
32#include "smb_debug.h"
33#include "proto.h"
34#include "request.h"
35
36
37/* Features. Undefine if they cause problems, this should perhaps be a
38 config option. */
39#define SMBFS_POSIX_UNLINK 1
40
41/* Allow smb_retry to be interrupted. */
42#define SMB_RETRY_INTR
43
44#define SMB_VWV(packet) ((packet) + SMB_HEADER_LEN)
45#define SMB_CMD(packet) (*(packet+8))
46#define SMB_WCT(packet) (*(packet+SMB_HEADER_LEN - 1))
47
48#define SMB_DIRINFO_SIZE 43
49#define SMB_STATUS_SIZE 21
50
51#define SMB_ST_BLKSIZE (PAGE_SIZE)
52#define SMB_ST_BLKSHIFT (PAGE_SHIFT)
53
54static struct smb_ops smb_ops_core;
55static struct smb_ops smb_ops_os2;
56static struct smb_ops smb_ops_win95;
57static struct smb_ops smb_ops_winNT;
58static struct smb_ops smb_ops_unix;
59static struct smb_ops smb_ops_null;
60
61static void
62smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
63static void
64smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
65static int
66smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
67 struct smb_fattr *fattr);
68static int
69smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
70 struct smb_fattr *fattr);
71static int
72smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
73 u16 attr);
74static int
75smb_proc_setattr_ext(struct smb_sb_info *server,
76 struct inode *inode, struct smb_fattr *fattr);
77static int
78smb_proc_query_cifsunix(struct smb_sb_info *server);
79static void
80install_ops(struct smb_ops *dst, struct smb_ops *src);
81
82
/*
 * Convert the ASCII lower case letters in the first @len bytes of
 * @name to upper case, in place.  All other bytes are left alone.
 */
static void
str_upper(char *name, int len)
{
	for (; len != 0; len--, name++) {
		if ('a' <= *name && *name <= 'z')
			*name = *name - 'a' + 'A';
	}
}
93
#if 0
/*
 * Convert the ASCII upper case letters in the first @len bytes of
 * @name to lower case, in place.  Currently compiled out.
 */
static void
str_lower(char *name, int len)
{
	for (; len != 0; len--, name++) {
		if ('A' <= *name && *name <= 'Z')
			*name = *name - 'A' + 'a';
	}
}
#endif
106
/*
 * Reverse the first @len bytes of @buf in place.
 * This is used by the dircache walking routines.
 */
static void reverse_string(char *buf, int len)
{
	int lo = 0;
	int hi = len - 1;

	while (lo < hi) {
		char tmp = buf[lo];

		buf[lo] = buf[hi];
		buf[hi] = tmp;
		lo++;
		hi--;
	}
}
119
/*
 * Conversion routine that does not convert anything: copy @ilen bytes
 * from @input to @output.  The nls tables are ignored.
 *
 * Returns @ilen, or -ENAMETOOLONG if the output buffer is too small.
 */
static int convert_memcpy(unsigned char *output, int olen,
			  const unsigned char *input, int ilen,
			  struct nls_table *nls_from,
			  struct nls_table *nls_to)
{
	if (ilen > olen)
		return -ENAMETOOLONG;

	memcpy(output, input, ilen);
	return ilen;
}
131
/*
 * Write the escape sequence ":xHH" for the byte @ch to @output.
 *
 * Exactly 4 bytes are stored and no terminating NUL is added, so an
 * @olen of 4 is really enough.  (Formatting straight into @output
 * with sprintf() stored a fifth byte, the NUL, past the checked size.)
 *
 * Returns 4, or -ENAMETOOLONG if @olen is smaller than that.
 */
static inline int write_char(unsigned char ch, char *output, int olen)
{
	char tmp[5];	/* ":xHH" plus the NUL snprintf() appends */

	if (olen < 4)
		return -ENAMETOOLONG;
	snprintf(tmp, sizeof(tmp), ":x%02x", ch);
	memcpy(output, tmp, 4);
	return 4;
}
139
/*
 * Write the escape sequence ":HHHH" for the 16 bit unicode value @ch
 * to @output.
 *
 * Exactly 5 bytes are stored and no terminating NUL is added, so an
 * @olen of 5 is really enough.  (Formatting straight into @output
 * with sprintf() stored a sixth byte, the NUL, past the checked size.)
 * The value is masked to 16 bits so that the sequence always has four
 * hex digits.
 *
 * Returns 5, or -ENAMETOOLONG if @olen is smaller than that.
 */
static inline int write_unichar(wchar_t ch, char *output, int olen)
{
	char tmp[6];	/* ":HHHH" plus the NUL snprintf() appends */

	if (olen < 5)
		return -ENAMETOOLONG;
	snprintf(tmp, sizeof(tmp), ":%04x", (unsigned int)ch & 0xffff);
	memcpy(output, tmp, 5);
	return 5;
}
147
148/* convert from one "codepage" to another (possibly being utf8). */
149static int convert_cp(unsigned char *output, int olen,
150 const unsigned char *input, int ilen,
151 struct nls_table *nls_from,
152 struct nls_table *nls_to)
153{
154 int len = 0;
155 int n;
156 wchar_t ch;
157
158 while (ilen > 0) {
159 /* convert by changing to unicode and back to the new cp */
160 n = nls_from->char2uni(input, ilen, &ch);
161 if (n == -EINVAL) {
162 ilen--;
163 n = write_char(*input++, output, olen);
164 if (n < 0)
165 goto fail;
166 output += n;
167 olen -= n;
168 len += n;
169 continue;
170 } else if (n < 0)
171 goto fail;
172 input += n;
173 ilen -= n;
174
175 n = nls_to->uni2char(ch, output, olen);
176 if (n == -EINVAL)
177 n = write_unichar(ch, output, olen);
178 if (n < 0)
179 goto fail;
180 output += n;
181 olen -= n;
182
183 len += n;
184 }
185 return len;
186fail:
187 return n;
188}
189
190/* ----------------------------------------------------------- */
191
192/*
193 * nls_unicode
194 *
195 * This encodes/decodes little endian unicode format
196 */
197
/*
 * Store @uni at @out as two bytes, low byte first.
 * Returns 2, or -EINVAL if fewer than two bytes are available.
 */
static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
	if (boundlen < 2)
		return -EINVAL;

	out[0] = uni & 0xff;
	out[1] = uni >> 8;
	return 2;
}
206
/*
 * Read one character stored as two bytes, low byte first, from
 * @rawstring into @uni.
 * Returns 2, or -EINVAL if fewer than two bytes are available.
 */
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
	if (boundlen < 2)
		return -EINVAL;

	*uni = rawstring[0] | (rawstring[1] << 8);
	return 2;
}
214
215static struct nls_table unicode_table = {
216 .charset = "unicode",
217 .uni2char = uni2char,
218 .char2uni = char2uni,
219};
220
221/* ----------------------------------------------------------- */
222
223static int setcodepage(struct nls_table **p, char *name)
224{
225 struct nls_table *nls;
226
227 if (!name || !*name) {
228 nls = NULL;
229 } else if ( (nls = load_nls(name)) == NULL) {
230 printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name);
231 return -EINVAL;
232 }
233
234 /* if already set, unload the previous one. */
235 if (*p && *p != &unicode_table)
236 unload_nls(*p);
237 *p = nls;
238
239 return 0;
240}
241
242/* Handles all changes to codepage settings. */
243int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp)
244{
245 int n = 0;
246
247 smb_lock_server(server);
248
249 /* Don't load any nls_* at all, if no remote is requested */
250 if (!*cp->remote_name)
251 goto out;
252
253 /* local */
254 n = setcodepage(&server->local_nls, cp->local_name);
255 if (n != 0)
256 goto out;
257
258 /* remote */
259 if (!strcmp(cp->remote_name, "unicode")) {
260 server->remote_nls = &unicode_table;
261 } else {
262 n = setcodepage(&server->remote_nls, cp->remote_name);
263 if (n != 0)
264 setcodepage(&server->local_nls, NULL);
265 }
266
267out:
268 if (server->local_nls != NULL && server->remote_nls != NULL)
269 server->ops->convert = convert_cp;
270 else
271 server->ops->convert = convert_memcpy;
272
273 smb_unlock_server(server);
274 return n;
275}
276
277
278/*****************************************************************************/
279/* */
280/* Encoding/Decoding section */
281/* */
282/*****************************************************************************/
283
284static __u8 *
285smb_encode_smb_length(__u8 * p, __u32 len)
286{
287 *p = 0;
288 *(p+1) = 0;
289 *(p+2) = (len & 0xFF00) >> 8;
290 *(p+3) = (len & 0xFF);
291 if (len > 0xFFFF)
292 {
293 *(p+1) = 1;
294 }
295 return p + 4;
296}
297
298/*
299 * smb_build_path: build the path to entry and name storing it in buf.
300 * The path returned will have the trailing '\0'.
301 */
302static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
303 int maxlen,
304 struct dentry *entry, struct qstr *name)
305{
306 unsigned char *path = buf;
307 int len;
308 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
309
310 if (maxlen < (2<<unicode))
311 return -ENAMETOOLONG;
312
313 if (maxlen > SMB_MAXPATHLEN + 1)
314 maxlen = SMB_MAXPATHLEN + 1;
315
316 if (entry == NULL)
317 goto test_name_and_out;
318
319 /*
320 * If IS_ROOT, we have to do no walking at all.
321 */
322 if (IS_ROOT(entry) && !name) {
323 *path++ = '\\';
324 if (unicode) *path++ = '\0';
325 *path++ = '\0';
326 if (unicode) *path++ = '\0';
327 return path-buf;
328 }
329
330 /*
331 * Build the path string walking the tree backward from end to ROOT
332 * and store it in reversed order [see reverse_string()]
333 */
334 dget(entry);
335 spin_lock(&entry->d_lock);
336 while (!IS_ROOT(entry)) {
337 struct dentry *parent;
338
339 if (maxlen < (3<<unicode)) {
340 spin_unlock(&entry->d_lock);
341 dput(entry);
342 return -ENAMETOOLONG;
343 }
344
345 len = server->ops->convert(path, maxlen-2,
346 entry->d_name.name, entry->d_name.len,
347 server->local_nls, server->remote_nls);
348 if (len < 0) {
349 spin_unlock(&entry->d_lock);
350 dput(entry);
351 return len;
352 }
353 reverse_string(path, len);
354 path += len;
355 if (unicode) {
356 /* Note: reverse order */
357 *path++ = '\0';
358 maxlen--;
359 }
360 *path++ = '\\';
361 maxlen -= len+1;
362
363 parent = entry->d_parent;
364 dget(parent);
365 spin_unlock(&entry->d_lock);
366 dput(entry);
367 entry = parent;
368 spin_lock(&entry->d_lock);
369 }
370 spin_unlock(&entry->d_lock);
371 dput(entry);
372 reverse_string(buf, path-buf);
373
374 /* maxlen has space for at least one char */
375test_name_and_out:
376 if (name) {
377 if (maxlen < (3<<unicode))
378 return -ENAMETOOLONG;
379 *path++ = '\\';
380 if (unicode) {
381 *path++ = '\0';
382 maxlen--;
383 }
384 len = server->ops->convert(path, maxlen-2,
385 name->name, name->len,
386 server->local_nls, server->remote_nls);
387 if (len < 0)
388 return len;
389 path += len;
390 maxlen -= len+1;
391 }
392 /* maxlen has space for at least one char */
393 *path++ = '\0';
394 if (unicode) *path++ = '\0';
395 return path-buf;
396}
397
398static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
399 struct dentry *dir, struct qstr *name)
400{
401 int result;
402
403 result = smb_build_path(server, buf, maxlen, dir, name);
404 if (result < 0)
405 goto out;
406 if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
407 str_upper(buf, result);
408out:
409 return result;
410}
411
412/* encode_path for non-trans2 request SMBs */
413static int smb_simple_encode_path(struct smb_request *req, char **p,
414 struct dentry * entry, struct qstr * name)
415{
416 struct smb_sb_info *server = req->rq_server;
417 char *s = *p;
418 int res;
419 int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
420 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
421
422 if (!maxlen)
423 return -ENAMETOOLONG;
424 *s++ = 4; /* ASCII data format */
425
426 /*
427 * SMB Unicode strings must be 16bit aligned relative the start of the
428 * packet. If they are not they must be padded with 0.
429 */
430 if (unicode) {
431 int align = s - (char *)req->rq_buffer;
432 if (!(align & 1)) {
433 *s++ = '\0';
434 maxlen--;
435 }
436 }
437
438 res = smb_encode_path(server, s, maxlen-1, entry, name);
439 if (res < 0)
440 return res;
441 *p = s + res;
442 return 0;
443}
444
445/* The following are taken directly from msdos-fs */
446
/*
 * Linear day numbers of the respective 1sts in non-leap years.
 * The table has 16 entries; the last four are zero.
 */
static int day_n[] = {
	  0,  31,  59,  90,	/* Jan Feb Mar Apr */
	120, 151, 181, 212,	/* May Jun Jul Aug */
	243, 273, 304, 334,	/* Sep Oct Nov Dec */
	  0,   0,   0,   0
};
452
453
454static time_t
455utc2local(struct smb_sb_info *server, time_t time)
456{
457 return time - server->opt.serverzone*60;
458}
459
460static time_t
461local2utc(struct smb_sb_info *server, time_t time)
462{
463 return time + server->opt.serverzone*60;
464}
465
466/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
467
468static time_t
469date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time)
470{
471 int month, year;
472 time_t secs;
473
474 /* first subtract and mask after that... Otherwise, if
475 date == 0, bad things happen */
476 month = ((date >> 5) - 1) & 15;
477 year = date >> 9;
478 secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 *
479 ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 &&
480 month < 2 ? 1 : 0) + 3653);
481 /* days since 1.1.70 plus 80's leap day */
482 return local2utc(server, secs);
483}
484
485
486/* Convert linear UNIX date to a MS-DOS time/date pair. */
487
488static void
489date_unix2dos(struct smb_sb_info *server,
490 int unix_date, __u16 *date, __u16 *time)
491{
492 int day, year, nl_day, month;
493
494 unix_date = utc2local(server, unix_date);
495 if (unix_date < 315532800)
496 unix_date = 315532800;
497
498 *time = (unix_date % 60) / 2 +
499 (((unix_date / 60) % 60) << 5) +
500 (((unix_date / 3600) % 24) << 11);
501
502 day = unix_date / 86400 - 3652;
503 year = day / 365;
504 if ((year + 3) / 4 + 365 * year > day)
505 year--;
506 day -= (year + 3) / 4 + 365 * year;
507 if (day == 59 && !(year & 3)) {
508 nl_day = day;
509 month = 2;
510 } else {
511 nl_day = (year & 3) || day <= 59 ? day : day - 1;
512 for (month = 1; month < 12; month++)
513 if (day_n[month] > nl_day)
514 break;
515 }
516 *date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9);
517}
518
519/* The following are taken from fs/ntfs/util.c */
520
/*
 * Offset between the two time bases used by the converters below,
 * expressed in 100ns units: (369*365 + 89) days.
 */
#define NTFS_TIME_OFFSET \
	((u64)(369*365 + 89) * 24 * 3600 * 10000000)
522
523/*
524 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
525 * into Unix UTC (based 1970-01-01, in seconds).
526 */
527static struct timespec
528smb_ntutc2unixutc(u64 ntutc)
529{
530 struct timespec ts;
531 /* FIXME: what about the timezone difference? */
532 /* Subtract the NTFS time offset, then convert to 1s intervals. */
533 u64 t = ntutc - NTFS_TIME_OFFSET;
534 ts.tv_nsec = do_div(t, 10000000) * 100;
535 ts.tv_sec = t;
536 return ts;
537}
538
539/* Convert the Unix UTC into NT time */
540static u64
541smb_unixutc2ntutc(struct timespec ts)
542{
543 /* Note: timezone conversion is probably wrong. */
544 /* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */
545 return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET;
546}
547
548#define MAX_FILE_MODE 6
549static mode_t file_mode[] = {
550 S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK
551};
552
553static int smb_filetype_to_mode(u32 filetype)
554{
555 if (filetype > MAX_FILE_MODE) {
556 PARANOIA("Filetype out of range: %d\n", filetype);
557 return S_IFREG;
558 }
559 return file_mode[filetype];
560}
561
562static u32 smb_filetype_from_mode(int mode)
563{
564 if (S_ISREG(mode))
565 return UNIX_TYPE_FILE;
566 if (S_ISDIR(mode))
567 return UNIX_TYPE_DIR;
568 if (S_ISLNK(mode))
569 return UNIX_TYPE_SYMLINK;
570 if (S_ISCHR(mode))
571 return UNIX_TYPE_CHARDEV;
572 if (S_ISBLK(mode))
573 return UNIX_TYPE_BLKDEV;
574 if (S_ISFIFO(mode))
575 return UNIX_TYPE_FIFO;
576 if (S_ISSOCK(mode))
577 return UNIX_TYPE_SOCKET;
578 return UNIX_TYPE_UNKNOWN;
579}
580
581
582/*****************************************************************************/
583/* */
584/* Support section. */
585/* */
586/*****************************************************************************/
587
588__u32
589smb_len(__u8 * p)
590{
591 return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3);
592}
593
594static __u16
595smb_bcc(__u8 * packet)
596{
597 int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16);
598 return WVAL(packet, pos);
599}
600
601/* smb_valid_packet: We check if packet fulfills the basic
602 requirements of a smb packet */
603
604static int
605smb_valid_packet(__u8 * packet)
606{
607 return (packet[4] == 0xff
608 && packet[5] == 'S'
609 && packet[6] == 'M'
610 && packet[7] == 'B'
611 && (smb_len(packet) + 4 == SMB_HEADER_LEN
612 + SMB_WCT(packet) * 2 + smb_bcc(packet)));
613}
614
615/* smb_verify: We check if we got the answer we expected, and if we
616 got enough data. If bcc == -1, we don't care. */
617
618static int
619smb_verify(__u8 * packet, int command, int wct, int bcc)
620{
621 if (SMB_CMD(packet) != command)
622 goto bad_command;
623 if (SMB_WCT(packet) < wct)
624 goto bad_wct;
625 if (bcc != -1 && smb_bcc(packet) < bcc)
626 goto bad_bcc;
627 return 0;
628
629bad_command:
630 printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n",
631 command, SMB_CMD(packet));
632 goto fail;
633bad_wct:
634 printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n",
635 command, wct, SMB_WCT(packet));
636 goto fail;
637bad_bcc:
638 printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n",
639 command, bcc, smb_bcc(packet));
640fail:
641 return -EIO;
642}
643
644/*
645 * Returns the maximum read or write size for the "payload". Making all of the
646 * packet fit within the negotiated max_xmit size.
647 *
648 * N.B. Since this value is usually computed before locking the server,
649 * the server's packet size must never be decreased!
650 */
651static inline int
652smb_get_xmitsize(struct smb_sb_info *server, int overhead)
653{
654 return server->opt.max_xmit - overhead;
655}
656
657/*
658 * Calculate the maximum read size
659 */
660int
661smb_get_rsize(struct smb_sb_info *server)
662{
663 /* readX has 12 parameters, read has 5 */
664 int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2;
665 int size = smb_get_xmitsize(server, overhead);
666
667 VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
668
669 return size;
670}
671
672/*
673 * Calculate the maximum write size
674 */
675int
676smb_get_wsize(struct smb_sb_info *server)
677{
678 /* writeX has 14 parameters, write has 5 */
679 int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2;
680 int size = smb_get_xmitsize(server, overhead);
681
682 VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
683
684 return size;
685}
686
687/*
688 * Convert SMB error codes to -E... errno values.
689 */
690int
691smb_errno(struct smb_request *req)
692{
693 int errcls = req->rq_rcls;
694 int error = req->rq_err;
695 char *class = "Unknown";
696
697 VERBOSE("errcls %d code %d from command 0x%x\n",
698 errcls, error, SMB_CMD(req->rq_header));
699
700 if (errcls == ERRDOS) {
701 switch (error) {
702 case ERRbadfunc:
703 return -EINVAL;
704 case ERRbadfile:
705 case ERRbadpath:
706 return -ENOENT;
707 case ERRnofids:
708 return -EMFILE;
709 case ERRnoaccess:
710 return -EACCES;
711 case ERRbadfid:
712 return -EBADF;
713 case ERRbadmcb:
714 return -EREMOTEIO;
715 case ERRnomem:
716 return -ENOMEM;
717 case ERRbadmem:
718 return -EFAULT;
719 case ERRbadenv:
720 case ERRbadformat:
721 return -EREMOTEIO;
722 case ERRbadaccess:
723 return -EACCES;
724 case ERRbaddata:
725 return -E2BIG;
726 case ERRbaddrive:
727 return -ENXIO;
728 case ERRremcd:
729 return -EREMOTEIO;
730 case ERRdiffdevice:
731 return -EXDEV;
732 case ERRnofiles:
733 return -ENOENT;
734 case ERRbadshare:
735 return -ETXTBSY;
736 case ERRlock:
737 return -EDEADLK;
738 case ERRfilexists:
739 return -EEXIST;
740 case ERROR_INVALID_PARAMETER:
741 return -EINVAL;
742 case ERROR_DISK_FULL:
743 return -ENOSPC;
744 case ERROR_INVALID_NAME:
745 return -ENOENT;
746 case ERROR_DIR_NOT_EMPTY:
747 return -ENOTEMPTY;
748 case ERROR_NOT_LOCKED:
749 return -ENOLCK;
750 case ERROR_ALREADY_EXISTS:
751 return -EEXIST;
752 default:
753 class = "ERRDOS";
754 goto err_unknown;
755 }
756 } else if (errcls == ERRSRV) {
757 switch (error) {
758 /* N.B. This is wrong ... EIO ? */
759 case ERRerror:
760 return -ENFILE;
761 case ERRbadpw:
762 return -EINVAL;
763 case ERRbadtype:
764 case ERRtimeout:
765 return -EIO;
766 case ERRaccess:
767 return -EACCES;
768 /*
769 * This is a fatal error, as it means the "tree ID"
770 * for this connection is no longer valid. We map
771 * to a special error code and get a new connection.
772 */
773 case ERRinvnid:
774 return -EBADSLT;
775 default:
776 class = "ERRSRV";
777 goto err_unknown;
778 }
779 } else if (errcls == ERRHRD) {
780 switch (error) {
781 case ERRnowrite:
782 return -EROFS;
783 case ERRbadunit:
784 return -ENODEV;
785 case ERRnotready:
786 return -EUCLEAN;
787 case ERRbadcmd:
788 case ERRdata:
789 return -EIO;
790 case ERRbadreq:
791 return -ERANGE;
792 case ERRbadshare:
793 return -ETXTBSY;
794 case ERRlock:
795 return -EDEADLK;
796 case ERRdiskfull:
797 return -ENOSPC;
798 default:
799 class = "ERRHRD";
800 goto err_unknown;
801 }
802 } else if (errcls == ERRCMD) {
803 class = "ERRCMD";
804 } else if (errcls == SUCCESS) {
805 return 0; /* This is the only valid 0 return */
806 }
807
808err_unknown:
809 printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n",
810 class, error, SMB_CMD(req->rq_header));
811 return -EIO;
812}
813
814/* smb_request_ok: We expect the server to be locked. Then we do the
815 request and check the answer completely. When smb_request_ok
816 returns 0, you can be quite sure that everything went well. When
817 the answer is <=0, the returned number is a valid unix errno. */
818
819static int
820smb_request_ok(struct smb_request *req, int command, int wct, int bcc)
821{
822 int result;
823
824 req->rq_resp_wct = wct;
825 req->rq_resp_bcc = bcc;
826
827 result = smb_add_request(req);
828 if (result != 0) {
829 DEBUG1("smb_request failed\n");
830 goto out;
831 }
832
833 if (smb_valid_packet(req->rq_header) != 0) {
834 PARANOIA("invalid packet!\n");
835 goto out;
836 }
837
838 result = smb_verify(req->rq_header, command, wct, bcc);
839
840out:
841 return result;
842}
843
844/*
845 * This implements the NEWCONN ioctl. It installs the server pid,
846 * sets server->state to CONN_VALID, and wakes up the waiting process.
847 */
848int
849smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
850{
851 struct file *filp;
852 struct sock *sk;
853 int error;
854
855 VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid);
856
857 smb_lock_server(server);
858
859 /*
860 * Make sure we don't already have a valid connection ...
861 */
862 error = -EINVAL;
863 if (server->state == CONN_VALID)
864 goto out;
865
866 error = -EACCES;
867 if (current_uid() != server->mnt->mounted_uid &&
868 !capable(CAP_SYS_ADMIN))
869 goto out;
870
871 error = -EBADF;
872 filp = fget(opt->fd);
873 if (!filp)
874 goto out;
875 if (!smb_valid_socket(filp->f_path.dentry->d_inode))
876 goto out_putf;
877
878 server->sock_file = filp;
879 server->conn_pid = get_pid(task_pid(current));
880 server->opt = *opt;
881 server->generation += 1;
882 server->state = CONN_VALID;
883 error = 0;
884
885 if (server->conn_error) {
886 /*
887 * conn_error is the returncode we originally decided to
888 * drop the old connection on. This message should be positive
889 * and not make people ask questions on why smbfs is printing
890 * error messages ...
891 */
892 printk(KERN_INFO "SMB connection re-established (%d)\n",
893 server->conn_error);
894 server->conn_error = 0;
895 }
896
897 /*
898 * Store the server in sock user_data (Only used by sunrpc)
899 */
900 sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk;
901 sk->sk_user_data = server;
902
903 /* chain into the data_ready callback */
904 server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready);
905
906 /* check if we have an old smbmount that uses seconds for the
907 serverzone */
908 if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60)
909 server->opt.serverzone /= 60;
910
911 /* now that we have an established connection we can detect the server
912 type and enable bug workarounds */
913 if (server->opt.protocol < SMB_PROTOCOL_LANMAN2)
914 install_ops(server->ops, &smb_ops_core);
915 else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2)
916 install_ops(server->ops, &smb_ops_os2);
917 else if (server->opt.protocol == SMB_PROTOCOL_NT1 &&
918 (server->opt.max_xmit < 0x1000) &&
919 !(server->opt.capabilities & SMB_CAP_NT_SMBS)) {
920 /* FIXME: can we kill the WIN95 flag now? */
921 server->mnt->flags |= SMB_MOUNT_WIN95;
922 VERBOSE("detected WIN95 server\n");
923 install_ops(server->ops, &smb_ops_win95);
924 } else {
925 /*
926 * Samba has max_xmit 65535
927 * NT4spX has max_xmit 4536 (or something like that)
928 * win2k has ...
929 */
930 VERBOSE("detected NT1 (Samba, NT4/5) server\n");
931 install_ops(server->ops, &smb_ops_winNT);
932 }
933
934 /* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */
935 if (server->mnt->flags & SMB_MOUNT_OLDATTR) {
936 server->ops->getattr = smb_proc_getattr_core;
937 } else if (server->mnt->flags & SMB_MOUNT_DIRATTR) {
938 server->ops->getattr = smb_proc_getattr_ff;
939 }
940
941 /* Decode server capabilities */
942 if (server->opt.capabilities & SMB_CAP_LARGE_FILES) {
943 /* Should be ok to set this now, as no one can access the
944 mount until the connection has been established. */
945 SB_of(server)->s_maxbytes = ~0ULL >> 1;
946 VERBOSE("LFS enabled\n");
947 }
948 if (server->opt.capabilities & SMB_CAP_UNICODE) {
949 server->mnt->flags |= SMB_MOUNT_UNICODE;
950 VERBOSE("Unicode enabled\n");
951 } else {
952 server->mnt->flags &= ~SMB_MOUNT_UNICODE;
953 }
954#if 0
955 /* flags we may test for other patches ... */
956 if (server->opt.capabilities & SMB_CAP_LARGE_READX) {
957 VERBOSE("Large reads enabled\n");
958 }
959 if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) {
960 VERBOSE("Large writes enabled\n");
961 }
962#endif
963 if (server->opt.capabilities & SMB_CAP_UNIX) {
964 struct inode *inode;
965 VERBOSE("Using UNIX CIFS extensions\n");
966 install_ops(server->ops, &smb_ops_unix);
967 inode = SB_of(server)->s_root->d_inode;
968 if (inode)
969 inode->i_op = &smb_dir_inode_operations_unix;
970 }
971
972 VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n",
973 server->opt.protocol, server->opt.max_xmit,
974 pid_nr(server->conn_pid), server->opt.capabilities);
975
976 /* FIXME: this really should be done by smbmount. */
977 if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) {
978 server->opt.max_xmit = SMB_MAX_PACKET_SIZE;
979 }
980
981 smb_unlock_server(server);
982 smbiod_wake_up();
983 if (server->opt.capabilities & SMB_CAP_UNIX)
984 smb_proc_query_cifsunix(server);
985
986 server->conn_complete++;
987 wake_up_interruptible_all(&server->conn_wq);
988 return error;
989
990out:
991 smb_unlock_server(server);
992 smbiod_wake_up();
993 return error;
994
995out_putf:
996 fput(filp);
997 goto out;
998}
999
1000/* smb_setup_header: We completely set up the packet. You only have to
1001 insert the command-specific fields */
1002
1003__u8 *
1004smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc)
1005{
1006 __u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2;
1007 __u8 *p = req->rq_header;
1008 struct smb_sb_info *server = req->rq_server;
1009
1010 p = smb_encode_smb_length(p, xmit_len - 4);
1011
1012 *p++ = 0xff;
1013 *p++ = 'S';
1014 *p++ = 'M';
1015 *p++ = 'B';
1016 *p++ = command;
1017
1018 memset(p, '\0', 19);
1019 p += 19;
1020 p += 8;
1021
1022 if (server->opt.protocol > SMB_PROTOCOL_CORE) {
1023 int flags = SMB_FLAGS_CASELESS_PATHNAMES;
1024 int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS |
1025 SMB_FLAGS2_EXTENDED_ATTRIBUTES; /* EA? not really ... */
1026
1027 *(req->rq_header + smb_flg) = flags;
1028 if (server->mnt->flags & SMB_MOUNT_UNICODE)
1029 flags2 |= SMB_FLAGS2_UNICODE_STRINGS;
1030 WSET(req->rq_header, smb_flg2, flags2);
1031 }
1032 *p++ = wct; /* wct */
1033 p += 2 * wct;
1034 WSET(p, 0, bcc);
1035
1036 /* Include the header in the data to send */
1037 req->rq_iovlen = 1;
1038 req->rq_iov[0].iov_base = req->rq_header;
1039 req->rq_iov[0].iov_len = xmit_len - bcc;
1040
1041 return req->rq_buffer;
1042}
1043
1044static void
1045smb_setup_bcc(struct smb_request *req, __u8 *p)
1046{
1047 u16 bcc = p - req->rq_buffer;
1048 u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header);
1049
1050 WSET(pbcc, 0, bcc);
1051
1052 smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN +
1053 2*SMB_WCT(req->rq_header) - 2 + bcc);
1054
1055 /* Include the "bytes" in the data to send */
1056 req->rq_iovlen = 2;
1057 req->rq_iov[1].iov_base = req->rq_buffer;
1058 req->rq_iov[1].iov_len = bcc;
1059}
1060
1061static int
1062smb_proc_seek(struct smb_sb_info *server, __u16 fileid,
1063 __u16 mode, off_t offset)
1064{
1065 int result;
1066 struct smb_request *req;
1067
1068 result = -ENOMEM;
1069 if (! (req = smb_alloc_request(server, 0)))
1070 goto out;
1071
1072 smb_setup_header(req, SMBlseek, 4, 0);
1073 WSET(req->rq_header, smb_vwv0, fileid);
1074 WSET(req->rq_header, smb_vwv1, mode);
1075 DSET(req->rq_header, smb_vwv2, offset);
1076 req->rq_flags |= SMB_REQ_NORETRY;
1077
1078 result = smb_request_ok(req, SMBlseek, 2, 0);
1079 if (result < 0) {
1080 result = 0;
1081 goto out_free;
1082 }
1083
1084 result = DVAL(req->rq_header, smb_vwv0);
1085out_free:
1086 smb_rput(req);
1087out:
1088 return result;
1089}
1090
1091static int
1092smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish)
1093{
1094 struct inode *ino = dentry->d_inode;
1095 struct smb_inode_info *ei = SMB_I(ino);
1096 int mode, read_write = 0x42, read_only = 0x40;
1097 int res;
1098 char *p;
1099 struct smb_request *req;
1100
1101 /*
1102 * Attempt to open r/w, unless there are no write privileges.
1103 */
1104 mode = read_write;
1105 if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH)))
1106 mode = read_only;
1107#if 0
1108 /* FIXME: why is this code not in? below we fix it so that a caller
1109 wanting RO doesn't get RW. smb_revalidate_inode does some
1110 optimization based on access mode. tail -f needs it to be correct.
1111
1112 We must open rw since we don't do the open if called a second time
1113 with different 'wish'. Is that not supported by smb servers? */
1114 if (!(wish & (O_WRONLY | O_RDWR)))
1115 mode = read_only;
1116#endif
1117
1118 res = -ENOMEM;
1119 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1120 goto out;
1121
1122 retry:
1123 p = smb_setup_header(req, SMBopen, 2, 0);
1124 WSET(req->rq_header, smb_vwv0, mode);
1125 WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR);
1126 res = smb_simple_encode_path(req, &p, dentry, NULL);
1127 if (res < 0)
1128 goto out_free;
1129 smb_setup_bcc(req, p);
1130
1131 res = smb_request_ok(req, SMBopen, 7, 0);
1132 if (res != 0) {
1133 if (mode == read_write &&
1134 (res == -EACCES || res == -ETXTBSY || res == -EROFS))
1135 {
1136 VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n",
1137 DENTRY_PATH(dentry), res);
1138 mode = read_only;
1139 req->rq_flags = 0;
1140 goto retry;
1141 }
1142 goto out_free;
1143 }
1144 /* We should now have data in vwv[0..6]. */
1145
1146 ei->fileid = WVAL(req->rq_header, smb_vwv0);
1147 ei->attr = WVAL(req->rq_header, smb_vwv1);
1148 /* smb_vwv2 has mtime */
1149 /* smb_vwv4 has size */
1150 ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK);
1151 ei->open = server->generation;
1152
1153out_free:
1154 smb_rput(req);
1155out:
1156 return res;
1157}
1158
1159/*
1160 * Make sure the file is open, and check that the access
1161 * is compatible with the desired access.
1162 */
1163int
1164smb_open(struct dentry *dentry, int wish)
1165{
1166 struct inode *inode = dentry->d_inode;
1167 int result;
1168 __u16 access;
1169
1170 result = -ENOENT;
1171 if (!inode) {
1172 printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n",
1173 DENTRY_PATH(dentry));
1174 goto out;
1175 }
1176
1177 if (!smb_is_open(inode)) {
1178 struct smb_sb_info *server = server_from_inode(inode);
1179 result = 0;
1180 if (!smb_is_open(inode))
1181 result = smb_proc_open(server, dentry, wish);
1182 if (result)
1183 goto out;
1184 /*
1185 * A successful open means the path is still valid ...
1186 */
1187 smb_renew_times(dentry);
1188 }
1189
1190 /*
1191 * Check whether the access is compatible with the desired mode.
1192 */
1193 result = 0;
1194 access = SMB_I(inode)->access;
1195 if (access != wish && access != SMB_O_RDWR) {
1196 PARANOIA("%s/%s access denied, access=%x, wish=%x\n",
1197 DENTRY_PATH(dentry), access, wish);
1198 result = -EACCES;
1199 }
1200out:
1201 return result;
1202}
1203
1204static int
1205smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime)
1206{
1207 struct smb_request *req;
1208 int result = -ENOMEM;
1209
1210 if (! (req = smb_alloc_request(server, 0)))
1211 goto out;
1212
1213 smb_setup_header(req, SMBclose, 3, 0);
1214 WSET(req->rq_header, smb_vwv0, fileid);
1215 DSET(req->rq_header, smb_vwv1, utc2local(server, mtime));
1216 req->rq_flags |= SMB_REQ_NORETRY;
1217 result = smb_request_ok(req, SMBclose, 0, 0);
1218
1219 smb_rput(req);
1220out:
1221 return result;
1222}
1223
1224/*
1225 * Win NT 4.0 has an apparent bug in that it fails to update the
1226 * modify time when writing to a file. As a workaround, we update
1227 * both modify and access time locally, and post the times to the
1228 * server when closing the file.
1229 */
1230static int
1231smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino)
1232{
1233 struct smb_inode_info *ei = SMB_I(ino);
1234 int result = 0;
1235 if (smb_is_open(ino))
1236 {
1237 /*
1238 * We clear the open flag in advance, in case another
1239 * process observes the value while we block below.
1240 */
1241 ei->open = 0;
1242
1243 /*
1244 * Kludge alert: SMB timestamps are accurate only to
1245 * two seconds ... round the times to avoid needless
1246 * cache invalidations!
1247 */
1248 if (ino->i_mtime.tv_sec & 1) {
1249 ino->i_mtime.tv_sec--;
1250 ino->i_mtime.tv_nsec = 0;
1251 }
1252 if (ino->i_atime.tv_sec & 1) {
1253 ino->i_atime.tv_sec--;
1254 ino->i_atime.tv_nsec = 0;
1255 }
1256 /*
1257 * If the file is open with write permissions,
1258 * update the time stamps to sync mtime and atime.
1259 */
1260 if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 &&
1261 (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) &&
1262 !(ei->access == SMB_O_RDONLY))
1263 {
1264 struct smb_fattr fattr;
1265 smb_get_inode_attr(ino, &fattr);
1266 smb_proc_setattr_ext(server, ino, &fattr);
1267 }
1268
1269 result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec);
1270 /*
1271 * Force a revalidation after closing ... some servers
1272 * don't post the size until the file has been closed.
1273 */
1274 if (server->opt.protocol < SMB_PROTOCOL_NT1)
1275 ei->oldmtime = 0;
1276 ei->closed = jiffies;
1277 }
1278 return result;
1279}
1280
/*
 * Close the file behind 'ino' if it is currently open.
 * Returns 0 when nothing had to be done, otherwise the result of
 * smb_proc_close_inode().
 */
int
smb_close(struct inode *ino)
{
	if (!smb_is_open(ino))
		return 0;

	return smb_proc_close_inode(server_from_inode(ino), ino);
}
1292
1293/*
1294 * This is used to close a file following a failed instantiate.
1295 * Since we don't have an inode, we can't use any of the above.
1296 */
1297int
1298smb_close_fileid(struct dentry *dentry, __u16 fileid)
1299{
1300 struct smb_sb_info *server = server_from_dentry(dentry);
1301 int result;
1302
1303 result = smb_proc_close(server, fileid, get_seconds());
1304 return result;
1305}
1306
1307/* In smb_proc_read and smb_proc_write we do not retry, because the
1308 file-id would not be valid after a reconnection. */
1309
1310static void
1311smb_proc_read_data(struct smb_request *req)
1312{
1313 req->rq_iov[0].iov_base = req->rq_buffer;
1314 req->rq_iov[0].iov_len = 3;
1315
1316 req->rq_iov[1].iov_base = req->rq_page;
1317 req->rq_iov[1].iov_len = req->rq_rsize;
1318 req->rq_iovlen = 2;
1319
1320 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
1321}
1322
1323static int
1324smb_proc_read(struct inode *inode, loff_t offset, int count, char *data)
1325{
1326 struct smb_sb_info *server = server_from_inode(inode);
1327 __u16 returned_count, data_len;
1328 unsigned char *buf;
1329 int result;
1330 struct smb_request *req;
1331 u8 rbuf[4];
1332
1333 result = -ENOMEM;
1334 if (! (req = smb_alloc_request(server, 0)))
1335 goto out;
1336
1337 smb_setup_header(req, SMBread, 5, 0);
1338 buf = req->rq_header;
1339 WSET(buf, smb_vwv0, SMB_I(inode)->fileid);
1340 WSET(buf, smb_vwv1, count);
1341 DSET(buf, smb_vwv2, offset);
1342 WSET(buf, smb_vwv4, 0);
1343
1344 req->rq_page = data;
1345 req->rq_rsize = count;
1346 req->rq_callback = smb_proc_read_data;
1347 req->rq_buffer = rbuf;
1348 req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC;
1349
1350 result = smb_request_ok(req, SMBread, 5, -1);
1351 if (result < 0)
1352 goto out_free;
1353 returned_count = WVAL(req->rq_header, smb_vwv0);
1354
1355 data_len = WVAL(rbuf, 1);
1356
1357 if (returned_count != data_len) {
1358 printk(KERN_NOTICE "smb_proc_read: returned != data_len\n");
1359 printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n",
1360 returned_count, data_len);
1361 }
1362 result = data_len;
1363
1364out_free:
1365 smb_rput(req);
1366out:
1367 VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
1368 inode->i_ino, SMB_I(inode)->fileid, count, result);
1369 return result;
1370}
1371
1372static int
1373smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data)
1374{
1375 struct smb_sb_info *server = server_from_inode(inode);
1376 int result;
1377 u16 fileid = SMB_I(inode)->fileid;
1378 u8 buf[4];
1379 struct smb_request *req;
1380
1381 result = -ENOMEM;
1382 if (! (req = smb_alloc_request(server, 0)))
1383 goto out;
1384
1385 VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
1386 inode->i_ino, fileid, count, offset);
1387
1388 smb_setup_header(req, SMBwrite, 5, count + 3);
1389 WSET(req->rq_header, smb_vwv0, fileid);
1390 WSET(req->rq_header, smb_vwv1, count);
1391 DSET(req->rq_header, smb_vwv2, offset);
1392 WSET(req->rq_header, smb_vwv4, 0);
1393
1394 buf[0] = 1;
1395 WSET(buf, 1, count); /* yes, again ... */
1396 req->rq_iov[1].iov_base = buf;
1397 req->rq_iov[1].iov_len = 3;
1398 req->rq_iov[2].iov_base = (char *) data;
1399 req->rq_iov[2].iov_len = count;
1400 req->rq_iovlen = 3;
1401 req->rq_flags |= SMB_REQ_NORETRY;
1402
1403 result = smb_request_ok(req, SMBwrite, 1, 0);
1404 if (result >= 0)
1405 result = WVAL(req->rq_header, smb_vwv0);
1406
1407 smb_rput(req);
1408out:
1409 return result;
1410}
1411
1412/*
1413 * In smb_proc_readX and smb_proc_writeX we do not retry, because the
1414 * file-id would not be valid after a reconnection.
1415 */
1416
1417#define SMB_READX_MAX_PAD 64
1418static void
1419smb_proc_readX_data(struct smb_request *req)
1420{
1421 /* header length, excluding the netbios length (-4) */
1422 int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
1423 int data_off = WVAL(req->rq_header, smb_vwv6);
1424
1425 /*
1426 * Some genius made the padding to the data bytes arbitrary.
1427 * So we must first calculate the amount of padding used by the server.
1428 */
1429 data_off -= hdrlen;
1430 if (data_off > SMB_READX_MAX_PAD || data_off < 0) {
1431 PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n");
1432 PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off);
1433 req->rq_rlen = req->rq_bufsize + 1;
1434 return;
1435 }
1436 req->rq_iov[0].iov_base = req->rq_buffer;
1437 req->rq_iov[0].iov_len = data_off;
1438
1439 req->rq_iov[1].iov_base = req->rq_page;
1440 req->rq_iov[1].iov_len = req->rq_rsize;
1441 req->rq_iovlen = 2;
1442
1443 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
1444}
1445
1446static int
1447smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data)
1448{
1449 struct smb_sb_info *server = server_from_inode(inode);
1450 unsigned char *buf;
1451 int result;
1452 struct smb_request *req;
1453 static char pad[SMB_READX_MAX_PAD];
1454
1455 result = -ENOMEM;
1456 if (! (req = smb_alloc_request(server, 0)))
1457 goto out;
1458
1459 smb_setup_header(req, SMBreadX, 12, 0);
1460 buf = req->rq_header;
1461 WSET(buf, smb_vwv0, 0x00ff);
1462 WSET(buf, smb_vwv1, 0);
1463 WSET(buf, smb_vwv2, SMB_I(inode)->fileid);
1464 DSET(buf, smb_vwv3, (u32)offset); /* low 32 bits */
1465 WSET(buf, smb_vwv5, count);
1466 WSET(buf, smb_vwv6, 0);
1467 DSET(buf, smb_vwv7, 0);
1468 WSET(buf, smb_vwv9, 0);
1469 DSET(buf, smb_vwv10, (u32)(offset >> 32)); /* high 32 bits */
1470 WSET(buf, smb_vwv11, 0);
1471
1472 req->rq_page = data;
1473 req->rq_rsize = count;
1474 req->rq_callback = smb_proc_readX_data;
1475 req->rq_buffer = pad;
1476 req->rq_bufsize = SMB_READX_MAX_PAD;
1477 req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY;
1478
1479 result = smb_request_ok(req, SMBreadX, 12, -1);
1480 if (result < 0)
1481 goto out_free;
1482 result = WVAL(req->rq_header, smb_vwv5);
1483
1484out_free:
1485 smb_rput(req);
1486out:
1487 VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
1488 inode->i_ino, SMB_I(inode)->fileid, count, result);
1489 return result;
1490}
1491
1492static int
1493smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data)
1494{
1495 struct smb_sb_info *server = server_from_inode(inode);
1496 int result;
1497 u8 *p;
1498 static u8 pad[4];
1499 struct smb_request *req;
1500
1501 result = -ENOMEM;
1502 if (! (req = smb_alloc_request(server, 0)))
1503 goto out;
1504
1505 VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
1506 inode->i_ino, SMB_I(inode)->fileid, count, offset);
1507
1508 p = smb_setup_header(req, SMBwriteX, 14, count + 1);
1509 WSET(req->rq_header, smb_vwv0, 0x00ff);
1510 WSET(req->rq_header, smb_vwv1, 0);
1511 WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid);
1512 DSET(req->rq_header, smb_vwv3, (u32)offset); /* low 32 bits */
1513 DSET(req->rq_header, smb_vwv5, 0);
1514 WSET(req->rq_header, smb_vwv7, 0); /* write mode */
1515 WSET(req->rq_header, smb_vwv8, 0);
1516 WSET(req->rq_header, smb_vwv9, 0);
1517 WSET(req->rq_header, smb_vwv10, count); /* data length */
1518 WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1);
1519 DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32));
1520
1521 req->rq_iov[1].iov_base = pad;
1522 req->rq_iov[1].iov_len = 1;
1523 req->rq_iov[2].iov_base = (char *) data;
1524 req->rq_iov[2].iov_len = count;
1525 req->rq_iovlen = 3;
1526 req->rq_flags |= SMB_REQ_NORETRY;
1527
1528 result = smb_request_ok(req, SMBwriteX, 6, 0);
1529 if (result >= 0)
1530 result = WVAL(req->rq_header, smb_vwv2);
1531
1532 smb_rput(req);
1533out:
1534 return result;
1535}
1536
1537int
1538smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid)
1539{
1540 struct smb_sb_info *server = server_from_dentry(dentry);
1541 char *p;
1542 int result;
1543 struct smb_request *req;
1544
1545 result = -ENOMEM;
1546 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1547 goto out;
1548
1549 p = smb_setup_header(req, SMBcreate, 3, 0);
1550 WSET(req->rq_header, smb_vwv0, attr);
1551 DSET(req->rq_header, smb_vwv1, utc2local(server, ctime));
1552 result = smb_simple_encode_path(req, &p, dentry, NULL);
1553 if (result < 0)
1554 goto out_free;
1555 smb_setup_bcc(req, p);
1556
1557 result = smb_request_ok(req, SMBcreate, 1, 0);
1558 if (result < 0)
1559 goto out_free;
1560
1561 *fileid = WVAL(req->rq_header, smb_vwv0);
1562 result = 0;
1563
1564out_free:
1565 smb_rput(req);
1566out:
1567 return result;
1568}
1569
1570int
1571smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry)
1572{
1573 struct smb_sb_info *server = server_from_dentry(old_dentry);
1574 char *p;
1575 int result;
1576 struct smb_request *req;
1577
1578 result = -ENOMEM;
1579 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1580 goto out;
1581
1582 p = smb_setup_header(req, SMBmv, 1, 0);
1583 WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR);
1584 result = smb_simple_encode_path(req, &p, old_dentry, NULL);
1585 if (result < 0)
1586 goto out_free;
1587 result = smb_simple_encode_path(req, &p, new_dentry, NULL);
1588 if (result < 0)
1589 goto out_free;
1590 smb_setup_bcc(req, p);
1591
1592 if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0)
1593 goto out_free;
1594 result = 0;
1595
1596out_free:
1597 smb_rput(req);
1598out:
1599 return result;
1600}
1601
1602/*
1603 * Code common to mkdir and rmdir.
1604 */
1605static int
1606smb_proc_generic_command(struct dentry *dentry, __u8 command)
1607{
1608 struct smb_sb_info *server = server_from_dentry(dentry);
1609 char *p;
1610 int result;
1611 struct smb_request *req;
1612
1613 result = -ENOMEM;
1614 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1615 goto out;
1616
1617 p = smb_setup_header(req, command, 0, 0);
1618 result = smb_simple_encode_path(req, &p, dentry, NULL);
1619 if (result < 0)
1620 goto out_free;
1621 smb_setup_bcc(req, p);
1622
1623 result = smb_request_ok(req, command, 0, 0);
1624 if (result < 0)
1625 goto out_free;
1626 result = 0;
1627
1628out_free:
1629 smb_rput(req);
1630out:
1631 return result;
1632}
1633
1634int
1635smb_proc_mkdir(struct dentry *dentry)
1636{
1637 return smb_proc_generic_command(dentry, SMBmkdir);
1638}
1639
1640int
1641smb_proc_rmdir(struct dentry *dentry)
1642{
1643 return smb_proc_generic_command(dentry, SMBrmdir);
1644}
1645
#if SMBFS_POSIX_UNLINK
/*
 * Clear the read-only attribute of a file so that unlink can behave the
 * posix way.
 *
 * Returns the (negative) getattr error, the getattr result unchanged when
 * the file is not read-only, or whatever smb_proc_setattr_core() returns.
 */
static int
smb_set_rw(struct dentry *dentry,struct smb_sb_info *server)
{
	struct smb_fattr fattr;
	int result;

	/* FIXME: cifsUE should allow removing a readonly file. */

	/* fetch the attributes the server currently reports */
	smb_init_dirent(server, &fattr);
	result = server->ops->getattr(server, dentry, &fattr);
	smb_finish_dirent(server, &fattr);
	if (result < 0)
		return result;

	/* nothing to do unless the read-only bit is set */
	if (!(fattr.attr & aRONLY))
		return result;

	fattr.attr &= ~aRONLY;
	return smb_proc_setattr_core(server, dentry, fattr.attr);
}
#endif
1674
1675int
1676smb_proc_unlink(struct dentry *dentry)
1677{
1678 struct smb_sb_info *server = server_from_dentry(dentry);
1679 int flag = 0;
1680 char *p;
1681 int result;
1682 struct smb_request *req;
1683
1684 result = -ENOMEM;
1685 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1686 goto out;
1687
1688 retry:
1689 p = smb_setup_header(req, SMBunlink, 1, 0);
1690 WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN);
1691 result = smb_simple_encode_path(req, &p, dentry, NULL);
1692 if (result < 0)
1693 goto out_free;
1694 smb_setup_bcc(req, p);
1695
1696 if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) {
1697#if SMBFS_POSIX_UNLINK
1698 if (result == -EACCES && !flag) {
1699 /* Posix semantics is for the read-only state
1700 of a file to be ignored in unlink(). In the
1701 SMB world a unlink() is refused on a
1702 read-only file. To make things easier for
1703 unix users we try to override the files
1704 permission if the unlink fails with the
1705 right error.
1706 This introduces a race condition that could
1707 lead to a file being written by someone who
1708 shouldn't have access, but as far as I can
1709 tell that is unavoidable */
1710
1711 /* remove RONLY attribute and try again */
1712 result = smb_set_rw(dentry,server);
1713 if (result == 0) {
1714 flag = 1;
1715 req->rq_flags = 0;
1716 goto retry;
1717 }
1718 }
1719#endif
1720 goto out_free;
1721 }
1722 result = 0;
1723
1724out_free:
1725 smb_rput(req);
1726out:
1727 return result;
1728}
1729
1730int
1731smb_proc_flush(struct smb_sb_info *server, __u16 fileid)
1732{
1733 int result;
1734 struct smb_request *req;
1735
1736 result = -ENOMEM;
1737 if (! (req = smb_alloc_request(server, 0)))
1738 goto out;
1739
1740 smb_setup_header(req, SMBflush, 1, 0);
1741 WSET(req->rq_header, smb_vwv0, fileid);
1742 req->rq_flags |= SMB_REQ_NORETRY;
1743 result = smb_request_ok(req, SMBflush, 0, 0);
1744
1745 smb_rput(req);
1746out:
1747 return result;
1748}
1749
1750static int
1751smb_proc_trunc32(struct inode *inode, loff_t length)
1752{
1753 /*
1754 * Writing 0bytes is old-SMB magic for truncating files.
1755 * MAX_NON_LFS should prevent this from being called with a too
1756 * large offset.
1757 */
1758 return smb_proc_write(inode, length, 0, NULL);
1759}
1760
1761static int
1762smb_proc_trunc64(struct inode *inode, loff_t length)
1763{
1764 struct smb_sb_info *server = server_from_inode(inode);
1765 int result;
1766 char *param;
1767 char *data;
1768 struct smb_request *req;
1769
1770 result = -ENOMEM;
1771 if (! (req = smb_alloc_request(server, 14)))
1772 goto out;
1773
1774 param = req->rq_buffer;
1775 data = req->rq_buffer + 6;
1776
1777 /* FIXME: must we also set allocation size? winNT seems to do that */
1778 WSET(param, 0, SMB_I(inode)->fileid);
1779 WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO);
1780 WSET(param, 4, 0);
1781 LSET(data, 0, length);
1782
1783 req->rq_trans2_command = TRANSACT2_SETFILEINFO;
1784 req->rq_ldata = 8;
1785 req->rq_data = data;
1786 req->rq_lparm = 6;
1787 req->rq_parm = param;
1788 req->rq_flags |= SMB_REQ_NORETRY;
1789 result = smb_add_request(req);
1790 if (result < 0)
1791 goto out_free;
1792
1793 result = 0;
1794 if (req->rq_rcls != 0)
1795 result = smb_errno(req);
1796
1797out_free:
1798 smb_rput(req);
1799out:
1800 return result;
1801}
1802
1803static int
1804smb_proc_trunc95(struct inode *inode, loff_t length)
1805{
1806 struct smb_sb_info *server = server_from_inode(inode);
1807 int result = smb_proc_trunc32(inode, length);
1808
1809 /*
1810 * win9x doesn't appear to update the size immediately.
1811 * It will return the old file size after the truncate,
1812 * confusing smbfs. So we force an update.
1813 *
1814 * FIXME: is this still necessary?
1815 */
1816 smb_proc_flush(server, SMB_I(inode)->fileid);
1817 return result;
1818}
1819
1820static void
1821smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
1822{
1823 memset(fattr, 0, sizeof(*fattr));
1824
1825 fattr->f_nlink = 1;
1826 fattr->f_uid = server->mnt->uid;
1827 fattr->f_gid = server->mnt->gid;
1828 fattr->f_unix = 0;
1829}
1830
1831static void
1832smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
1833{
1834 if (fattr->f_unix)
1835 return;
1836
1837 fattr->f_mode = server->mnt->file_mode;
1838 if (fattr->attr & aDIR) {
1839 fattr->f_mode = server->mnt->dir_mode;
1840 fattr->f_size = SMB_ST_BLKSIZE;
1841 }
1842 /* Check the read-only flag */
1843 if (fattr->attr & aRONLY)
1844 fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
1845
1846 /* How many 512 byte blocks do we need for this file? */
1847 fattr->f_blocks = 0;
1848 if (fattr->f_size != 0)
1849 fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9);
1850 return;
1851}
1852
1853void
1854smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
1855 struct super_block *sb)
1856{
1857 smb_init_dirent(server, fattr);
1858 fattr->attr = aDIR;
1859 fattr->f_ino = 2; /* traditional root inode number */
1860 fattr->f_mtime = current_fs_time(sb);
1861 smb_finish_dirent(server, fattr);
1862}
1863
1864/*
1865 * Decode a dirent for old protocols
1866 *
1867 * qname is filled with the decoded, and possibly translated, name.
1868 * fattr receives decoded attributes
1869 *
1870 * Bugs Noted:
1871 * (1) Pathworks servers may pad the name with extra spaces.
1872 */
1873static char *
1874smb_decode_short_dirent(struct smb_sb_info *server, char *p,
1875 struct qstr *qname, struct smb_fattr *fattr,
1876 unsigned char *name_buf)
1877{
1878 int len;
1879
1880 /*
1881 * SMB doesn't have a concept of inode numbers ...
1882 */
1883 smb_init_dirent(server, fattr);
1884 fattr->f_ino = 0; /* FIXME: do we need this? */
1885
1886 p += SMB_STATUS_SIZE; /* reserved (search_status) */
1887 fattr->attr = *p;
1888 fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1));
1889 fattr->f_mtime.tv_nsec = 0;
1890 fattr->f_size = DVAL(p, 5);
1891 fattr->f_ctime = fattr->f_mtime;
1892 fattr->f_atime = fattr->f_mtime;
1893 qname->name = p + 9;
1894 len = strnlen(qname->name, 12);
1895
1896 /*
1897 * Trim trailing blanks for Pathworks servers
1898 */
1899 while (len > 2 && qname->name[len-1] == ' ')
1900 len--;
1901
1902 smb_finish_dirent(server, fattr);
1903
1904#if 0
1905 /* FIXME: These only work for ascii chars, and recent smbmount doesn't
1906 allow the flag to be set anyway. It kills const. Remove? */
1907 switch (server->opt.case_handling) {
1908 case SMB_CASE_UPPER:
1909 str_upper(entry->name, len);
1910 break;
1911 case SMB_CASE_LOWER:
1912 str_lower(entry->name, len);
1913 break;
1914 default:
1915 break;
1916 }
1917#endif
1918
1919 qname->len = 0;
1920 len = server->ops->convert(name_buf, SMB_MAXNAMELEN,
1921 qname->name, len,
1922 server->remote_nls, server->local_nls);
1923 if (len > 0) {
1924 qname->len = len;
1925 qname->name = name_buf;
1926 DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name);
1927 }
1928
1929 return p + 22;
1930}
1931
1932/*
1933 * This routine is used to read in directory entries from the network.
1934 * Note that it is for short directory name seeks, i.e.: protocol <
1935 * SMB_PROTOCOL_LANMAN2
1936 */
1937static int
1938smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir,
1939 struct smb_cache_control *ctl)
1940{
1941 struct dentry *dir = filp->f_path.dentry;
1942 struct smb_sb_info *server = server_from_dentry(dir);
1943 struct qstr qname;
1944 struct smb_fattr fattr;
1945 char *p;
1946 int result;
1947 int i, first, entries_seen, entries;
1948 int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE;
1949 __u16 bcc;
1950 __u16 count;
1951 char status[SMB_STATUS_SIZE];
1952 static struct qstr mask = {
1953 .name = "*.*",
1954 .len = 3,
1955 };
1956 unsigned char *last_status;
1957 struct smb_request *req;
1958 unsigned char *name_buf;
1959
1960 VERBOSE("%s/%s\n", DENTRY_PATH(dir));
1961
1962 lock_kernel();
1963
1964 result = -ENOMEM;
1965 if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL)))
1966 goto out;
1967
1968 first = 1;
1969 entries = 0;
1970 entries_seen = 2; /* implicit . and .. */
1971
1972 result = -ENOMEM;
1973 if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
1974 goto out_name;
1975
1976 while (1) {
1977 p = smb_setup_header(req, SMBsearch, 2, 0);
1978 WSET(req->rq_header, smb_vwv0, entries_asked);
1979 WSET(req->rq_header, smb_vwv1, aDIR);
1980 if (first == 1) {
1981 result = smb_simple_encode_path(req, &p, dir, &mask);
1982 if (result < 0)
1983 goto out_free;
1984 if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) {
1985 result = -ENAMETOOLONG;
1986 goto out_free;
1987 }
1988 *p++ = 5;
1989 WSET(p, 0, 0);
1990 p += 2;
1991 first = 0;
1992 } else {
1993 if (p + 5 + SMB_STATUS_SIZE >
1994 (char *)req->rq_buffer + req->rq_bufsize) {
1995 result = -ENAMETOOLONG;
1996 goto out_free;
1997 }
1998
1999 *p++ = 4;
2000 *p++ = 0;
2001 *p++ = 5;
2002 WSET(p, 0, SMB_STATUS_SIZE);
2003 p += 2;
2004 memcpy(p, status, SMB_STATUS_SIZE);
2005 p += SMB_STATUS_SIZE;
2006 }
2007
2008 smb_setup_bcc(req, p);
2009
2010 result = smb_request_ok(req, SMBsearch, 1, -1);
2011 if (result < 0) {
2012 if ((req->rq_rcls == ERRDOS) &&
2013 (req->rq_err == ERRnofiles))
2014 break;
2015 goto out_free;
2016 }
2017 count = WVAL(req->rq_header, smb_vwv0);
2018 if (count <= 0)
2019 break;
2020
2021 result = -EIO;
2022 bcc = smb_bcc(req->rq_header);
2023 if (bcc != count * SMB_DIRINFO_SIZE + 3)
2024 goto out_free;
2025 p = req->rq_buffer + 3;
2026
2027
2028 /* Make sure the response fits in the buffer. Fixed sized
2029 entries means we don't have to check in the decode loop. */
2030
2031 last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE;
2032
2033 if (last_status + SMB_DIRINFO_SIZE >=
2034 req->rq_buffer + req->rq_bufsize) {
2035 printk(KERN_ERR "smb_proc_readdir_short: "
2036 "last dir entry outside buffer! "
2037 "%d@%p %d@%p\n", SMB_DIRINFO_SIZE, last_status,
2038 req->rq_bufsize, req->rq_buffer);
2039 goto out_free;
2040 }
2041
2042 /* Read the last entry into the status field. */
2043 memcpy(status, last_status, SMB_STATUS_SIZE);
2044
2045
2046 /* Now we are ready to parse smb directory entries. */
2047
2048 for (i = 0; i < count; i++) {
2049 p = smb_decode_short_dirent(server, p,
2050 &qname, &fattr, name_buf);
2051 if (qname.len == 0)
2052 continue;
2053
2054 if (entries_seen == 2 && qname.name[0] == '.') {
2055 if (qname.len == 1)
2056 continue;
2057 if (qname.name[1] == '.' && qname.len == 2)
2058 continue;
2059 }
2060 if (!smb_fill_cache(filp, dirent, filldir, ctl,
2061 &qname, &fattr))
2062 ; /* stop reading? */
2063 entries_seen++;
2064 }
2065 }
2066 result = entries;
2067
2068out_free:
2069 smb_rput(req);
2070out_name:
2071 kfree(name_buf);
2072out:
2073 unlock_kernel();
2074 return result;
2075}
2076
2077static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
2078{
2079 u64 size, disk_bytes;
2080
2081 /* FIXME: verify nls support. all is sent as utf8? */
2082
2083 fattr->f_unix = 1;
2084 fattr->f_mode = 0;
2085
2086 /* FIXME: use the uniqueID from the remote instead? */
2087 /* 0 L file size in bytes */
2088 /* 8 L file size on disk in bytes (block count) */
2089 /* 40 L uid */
2090 /* 48 L gid */
2091 /* 56 W file type */
2092 /* 60 L devmajor */
2093 /* 68 L devminor */
2094 /* 76 L unique ID (inode) */
2095 /* 84 L permissions */
2096 /* 92 L link count */
2097
2098 size = LVAL(p, 0);
2099 disk_bytes = LVAL(p, 8);
2100
2101 /*
2102 * Some samba versions round up on-disk byte usage
2103 * to 1MB boundaries, making it useless. When seeing
2104 * that, use the size instead.
2105 */
2106 if (!(disk_bytes & 0xfffff))
2107 disk_bytes = size+511;
2108
2109 fattr->f_size = size;
2110 fattr->f_blocks = disk_bytes >> 9;
2111 fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
2112 fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
2113 fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
2114
2115 if (server->mnt->flags & SMB_MOUNT_UID)
2116 fattr->f_uid = server->mnt->uid;
2117 else
2118 fattr->f_uid = LVAL(p, 40);
2119
2120 if (server->mnt->flags & SMB_MOUNT_GID)
2121 fattr->f_gid = server->mnt->gid;
2122 else
2123 fattr->f_gid = LVAL(p, 48);
2124
2125 fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
2126
2127 if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
2128 __u64 major = LVAL(p, 60);
2129 __u64 minor = LVAL(p, 68);
2130
2131 fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
2132 if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
2133 MINOR(fattr->f_rdev) != (minor & 0xffffffff))
2134 fattr->f_rdev = 0;
2135 }
2136
2137 fattr->f_mode |= LVAL(p, 84);
2138
2139 if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
2140 (S_ISDIR(fattr->f_mode)) )
2141 fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR;
2142 else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
2143 !(S_ISDIR(fattr->f_mode)) )
2144 fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) |
2145 (fattr->f_mode & S_IFMT);
2146
2147}
2148
2149/*
2150 * Interpret a long filename structure using the specified info level:
2151 * level 1 for anything below NT1 protocol
2152 * level 260 for NT1 protocol
2153 *
2154 * qname is filled with the decoded, and possibly translated, name
2155 * fattr receives decoded attributes.
2156 *
2157 * Bugs Noted:
2158 * (1) Win NT 4.0 appends a null byte to names and counts it in the length!
2159 */
2160static char *
2161smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
2162 struct qstr *qname, struct smb_fattr *fattr,
2163 unsigned char *name_buf)
2164{
2165 char *result;
2166 unsigned int len = 0;
2167 int n;
2168 __u16 date, time;
2169 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
2170
2171 /*
2172 * SMB doesn't have a concept of inode numbers ...
2173 */
2174 smb_init_dirent(server, fattr);
2175 fattr->f_ino = 0; /* FIXME: do we need this? */
2176
2177 switch (level) {
2178 case 1:
2179 len = *((unsigned char *) p + 22);
2180 qname->name = p + 23;
2181 result = p + 24 + len;
2182
2183 date = WVAL(p, 0);
2184 time = WVAL(p, 2);
2185 fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2186 fattr->f_ctime.tv_nsec = 0;
2187
2188 date = WVAL(p, 4);
2189 time = WVAL(p, 6);
2190 fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
2191 fattr->f_atime.tv_nsec = 0;
2192
2193 date = WVAL(p, 8);
2194 time = WVAL(p, 10);
2195 fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2196 fattr->f_mtime.tv_nsec = 0;
2197 fattr->f_size = DVAL(p, 12);
2198 /* ULONG allocation size */
2199 fattr->attr = WVAL(p, 20);
2200
2201 VERBOSE("info 1 at %p, len=%d, name=%.*s\n",
2202 p, len, len, qname->name);
2203 break;
2204 case 260:
2205 result = p + WVAL(p, 0);
2206 len = DVAL(p, 60);
2207 if (len > 255) len = 255;
2208 /* NT4 null terminates, unless we are using unicode ... */
2209 qname->name = p + 94;
2210 if (!unicode && len && qname->name[len-1] == '\0')
2211 len--;
2212
2213 fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8));
2214 fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16));
2215 fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24));
2216 /* change time (32) */
2217 fattr->f_size = LVAL(p, 40);
2218 /* alloc size (48) */
2219 fattr->attr = DVAL(p, 56);
2220
2221 VERBOSE("info 260 at %p, len=%d, name=%.*s\n",
2222 p, len, len, qname->name);
2223 break;
2224 case SMB_FIND_FILE_UNIX:
2225 result = p + WVAL(p, 0);
2226 qname->name = p + 108;
2227
2228 len = strlen(qname->name);
2229 /* FIXME: should we check the length?? */
2230
2231 p += 8;
2232 smb_decode_unix_basic(fattr, server, p);
2233 VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
2234 p, len, len, qname->name);
2235 break;
2236 default:
2237 PARANOIA("Unknown info level %d\n", level);
2238 result = p + WVAL(p, 0);
2239 goto out;
2240 }
2241
2242 smb_finish_dirent(server, fattr);
2243
2244#if 0
2245 /* FIXME: These only work for ascii chars, and recent smbmount doesn't
2246 allow the flag to be set anyway. Remove? */
2247 switch (server->opt.case_handling) {
2248 case SMB_CASE_UPPER:
2249 str_upper(qname->name, len);
2250 break;
2251 case SMB_CASE_LOWER:
2252 str_lower(qname->name, len);
2253 break;
2254 default:
2255 break;
2256 }
2257#endif
2258
2259 qname->len = 0;
2260 n = server->ops->convert(name_buf, SMB_MAXNAMELEN,
2261 qname->name, len,
2262 server->remote_nls, server->local_nls);
2263 if (n > 0) {
2264 qname->len = n;
2265 qname->name = name_buf;
2266 }
2267
2268out:
2269 return result;
2270}
2271
2272/* findfirst/findnext flags */
2273#define SMB_CLOSE_AFTER_FIRST (1<<0)
2274#define SMB_CLOSE_IF_END (1<<1)
2275#define SMB_REQUIRE_RESUME_KEY (1<<2)
2276#define SMB_CONTINUE_BIT (1<<3)
2277
2278/*
2279 * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in
2280 * source/libsmb/clilist.c. When looking for smb bugs in the readdir code,
2281 * go there for advise.
2282 *
2283 * Bugs Noted:
2284 * (1) When using Info Level 1 Win NT 4.0 truncates directory listings
2285 * for certain patterns of names and/or lengths. The breakage pattern
2286 * is completely reproducible and can be toggled by the creation of a
2287 * single file. (E.g. echo hi >foo breaks, rm -f foo works.)
2288 */
2289static int
2290smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
2291 struct smb_cache_control *ctl)
2292{
2293 struct dentry *dir = filp->f_path.dentry;
2294 struct smb_sb_info *server = server_from_dentry(dir);
2295 struct qstr qname;
2296 struct smb_fattr fattr;
2297
2298 unsigned char *p, *lastname;
2299 char *mask, *param;
2300 __u16 command;
2301 int first, entries_seen;
2302
2303 /* Both NT and OS/2 accept info level 1 (but see note below). */
2304 int info_level = 260;
2305 const int max_matches = 512;
2306
2307 unsigned int ff_searchcount = 0;
2308 unsigned int ff_eos = 0;
2309 unsigned int ff_lastname = 0;
2310 unsigned int ff_dir_handle = 0;
2311 unsigned int loop_count = 0;
2312 unsigned int mask_len, i;
2313 int result;
2314 struct smb_request *req;
2315 unsigned char *name_buf;
2316 static struct qstr star = {
2317 .name = "*",
2318 .len = 1,
2319 };
2320
2321 lock_kernel();
2322
2323 /*
2324 * We always prefer unix style. Use info level 1 for older
2325 * servers that don't do 260.
2326 */
2327 if (server->opt.capabilities & SMB_CAP_UNIX)
2328 info_level = SMB_FIND_FILE_UNIX;
2329 else if (server->opt.protocol < SMB_PROTOCOL_NT1)
2330 info_level = 1;
2331
2332 result = -ENOMEM;
2333 if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL)))
2334 goto out;
2335 if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
2336 goto out_name;
2337 param = req->rq_buffer;
2338
2339 /*
2340 * Encode the initial path
2341 */
2342 mask = param + 12;
2343
2344 result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star);
2345 if (result <= 0)
2346 goto out_free;
2347 mask_len = result - 1; /* mask_len is strlen, not #bytes */
2348 result = 0;
2349 first = 1;
2350 VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask);
2351
2352 entries_seen = 2;
2353 ff_eos = 0;
2354
2355 while (ff_eos == 0) {
2356 loop_count += 1;
2357 if (loop_count > 10) {
2358 printk(KERN_WARNING "smb_proc_readdir_long: "
2359 "Looping in FIND_NEXT??\n");
2360 result = -EIO;
2361 break;
2362 }
2363
2364 if (first != 0) {
2365 command = TRANSACT2_FINDFIRST;
2366 WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
2367 WSET(param, 2, max_matches); /* max count */
2368 WSET(param, 4, SMB_CLOSE_IF_END);
2369 WSET(param, 6, info_level);
2370 DSET(param, 8, 0);
2371 } else {
2372 command = TRANSACT2_FINDNEXT;
2373
2374 VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n",
2375 ff_dir_handle, ff_lastname, mask_len, mask);
2376
2377 WSET(param, 0, ff_dir_handle); /* search handle */
2378 WSET(param, 2, max_matches); /* max count */
2379 WSET(param, 4, info_level);
2380 DSET(param, 6, 0);
2381 WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END);
2382 }
2383
2384 req->rq_trans2_command = command;
2385 req->rq_ldata = 0;
2386 req->rq_data = NULL;
2387 req->rq_lparm = 12 + mask_len + 1;
2388 req->rq_parm = param;
2389 req->rq_flags = 0;
2390 result = smb_add_request(req);
2391 if (result < 0) {
2392 PARANOIA("error=%d, breaking\n", result);
2393 break;
2394 }
2395
2396 if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
2397 /* a damn Win95 bug - sometimes it clags if you
2398 ask it too fast */
2399 schedule_timeout_interruptible(msecs_to_jiffies(200));
2400 continue;
2401 }
2402
2403 if (req->rq_rcls != 0) {
2404 result = smb_errno(req);
2405 PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n",
2406 mask, result, req->rq_rcls, req->rq_err);
2407 break;
2408 }
2409
2410 /* parse out some important return info */
2411 if (first != 0) {
2412 ff_dir_handle = WVAL(req->rq_parm, 0);
2413 ff_searchcount = WVAL(req->rq_parm, 2);
2414 ff_eos = WVAL(req->rq_parm, 4);
2415 ff_lastname = WVAL(req->rq_parm, 8);
2416 } else {
2417 ff_searchcount = WVAL(req->rq_parm, 0);
2418 ff_eos = WVAL(req->rq_parm, 2);
2419 ff_lastname = WVAL(req->rq_parm, 6);
2420 }
2421
2422 if (ff_searchcount == 0)
2423 break;
2424
2425 /* Now we are ready to parse smb directory entries. */
2426
2427 /* point to the data bytes */
2428 p = req->rq_data;
2429 for (i = 0; i < ff_searchcount; i++) {
2430 /* make sure we stay within the buffer */
2431 if (p >= req->rq_data + req->rq_ldata) {
2432 printk(KERN_ERR "smb_proc_readdir_long: "
2433 "dirent pointer outside buffer! "
2434 "%p %d@%p\n",
2435 p, req->rq_ldata, req->rq_data);
2436 result = -EIO; /* always a comm. error? */
2437 goto out_free;
2438 }
2439
2440 p = smb_decode_long_dirent(server, p, info_level,
2441 &qname, &fattr, name_buf);
2442
2443 /* ignore . and .. from the server */
2444 if (entries_seen == 2 && qname.name[0] == '.') {
2445 if (qname.len == 1)
2446 continue;
2447 if (qname.name[1] == '.' && qname.len == 2)
2448 continue;
2449 }
2450
2451 if (!smb_fill_cache(filp, dirent, filldir, ctl,
2452 &qname, &fattr))
2453 ; /* stop reading? */
2454 entries_seen++;
2455 }
2456
2457 VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos);
2458
2459 /*
2460 * We might need the lastname for continuations.
2461 *
2462 * Note that some servers (win95?) point to the filename and
2463 * others (NT4, Samba using NT1) to the dir entry. We assume
2464 * here that those who do not point to a filename do not need
2465 * this info to continue the listing.
2466 *
2467 * OS/2 needs this and talks infolevel 1.
2468 * NetApps want lastname with infolevel 260.
2469 * win2k want lastname with infolevel 260, and points to
2470 * the record not to the name.
2471 * Samba+CifsUnixExt doesn't need lastname.
2472 *
2473 * Both are happy if we return the data they point to. So we do.
2474 * (FIXME: above is not true with win2k)
2475 */
2476 mask_len = 0;
2477 if (info_level != SMB_FIND_FILE_UNIX &&
2478 ff_lastname > 0 && ff_lastname < req->rq_ldata) {
2479 lastname = req->rq_data + ff_lastname;
2480
2481 switch (info_level) {
2482 case 260:
2483 mask_len = req->rq_ldata - ff_lastname;
2484 break;
2485 case 1:
2486 /* lastname points to a length byte */
2487 mask_len = *lastname++;
2488 if (ff_lastname + 1 + mask_len > req->rq_ldata)
2489 mask_len = req->rq_ldata - ff_lastname - 1;
2490 break;
2491 }
2492
2493 /*
2494 * Update the mask string for the next message.
2495 */
2496 if (mask_len > 255)
2497 mask_len = 255;
2498 if (mask_len)
2499 strncpy(mask, lastname, mask_len);
2500 }
2501 mask_len = strnlen(mask, mask_len);
2502 VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n",
2503 mask_len, ff_lastname, req->rq_ldata, mask_len, mask);
2504
2505 first = 0;
2506 loop_count = 0;
2507 }
2508
2509out_free:
2510 smb_rput(req);
2511out_name:
2512 kfree(name_buf);
2513out:
2514 unlock_kernel();
2515 return result;
2516}
2517
2518/*
2519 * This version uses the trans2 TRANSACT2_FINDFIRST message
2520 * to get the attribute data.
2521 *
2522 * Bugs Noted:
2523 */
2524static int
2525smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
2526 struct smb_fattr *fattr)
2527{
2528 char *param, *mask;
2529 __u16 date, time;
2530 int mask_len, result;
2531 struct smb_request *req;
2532
2533 result = -ENOMEM;
2534 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2535 goto out;
2536 param = req->rq_buffer;
2537 mask = param + 12;
2538
2539 mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL);
2540 if (mask_len < 0) {
2541 result = mask_len;
2542 goto out_free;
2543 }
2544 VERBOSE("name=%s, len=%d\n", mask, mask_len);
2545 WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
2546 WSET(param, 2, 1); /* max count */
2547 WSET(param, 4, 1); /* close after this call */
2548 WSET(param, 6, 1); /* info_level */
2549 DSET(param, 8, 0);
2550
2551 req->rq_trans2_command = TRANSACT2_FINDFIRST;
2552 req->rq_ldata = 0;
2553 req->rq_data = NULL;
2554 req->rq_lparm = 12 + mask_len;
2555 req->rq_parm = param;
2556 req->rq_flags = 0;
2557 result = smb_add_request(req);
2558 if (result < 0)
2559 goto out_free;
2560 if (req->rq_rcls != 0) {
2561 result = smb_errno(req);
2562#ifdef SMBFS_PARANOIA
2563 if (result != -ENOENT)
2564 PARANOIA("error for %s, rcls=%d, err=%d\n",
2565 mask, req->rq_rcls, req->rq_err);
2566#endif
2567 goto out_free;
2568 }
2569 /* Make sure we got enough data ... */
2570 result = -EINVAL;
2571 if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) {
2572 PARANOIA("bad result for %s, len=%d, count=%d\n",
2573 mask, req->rq_ldata, WVAL(req->rq_parm, 2));
2574 goto out_free;
2575 }
2576
2577 /*
2578 * Decode the response into the fattr ...
2579 */
2580 date = WVAL(req->rq_data, 0);
2581 time = WVAL(req->rq_data, 2);
2582 fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2583 fattr->f_ctime.tv_nsec = 0;
2584
2585 date = WVAL(req->rq_data, 4);
2586 time = WVAL(req->rq_data, 6);
2587 fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
2588 fattr->f_atime.tv_nsec = 0;
2589
2590 date = WVAL(req->rq_data, 8);
2591 time = WVAL(req->rq_data, 10);
2592 fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2593 fattr->f_mtime.tv_nsec = 0;
2594 VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
2595 mask, date, time, fattr->f_mtime.tv_sec);
2596 fattr->f_size = DVAL(req->rq_data, 12);
2597 /* ULONG allocation size */
2598 fattr->attr = WVAL(req->rq_data, 20);
2599 result = 0;
2600
2601out_free:
2602 smb_rput(req);
2603out:
2604 return result;
2605}
2606
2607static int
2608smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
2609 struct smb_fattr *fattr)
2610{
2611 int result;
2612 char *p;
2613 struct smb_request *req;
2614
2615 result = -ENOMEM;
2616 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2617 goto out;
2618
2619 p = smb_setup_header(req, SMBgetatr, 0, 0);
2620 result = smb_simple_encode_path(req, &p, dir, NULL);
2621 if (result < 0)
2622 goto out_free;
2623 smb_setup_bcc(req, p);
2624
2625 if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0)
2626 goto out_free;
2627 fattr->attr = WVAL(req->rq_header, smb_vwv0);
2628 fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1));
2629 fattr->f_mtime.tv_nsec = 0;
2630 fattr->f_size = DVAL(req->rq_header, smb_vwv3);
2631 fattr->f_ctime = fattr->f_mtime;
2632 fattr->f_atime = fattr->f_mtime;
2633#ifdef SMBFS_DEBUG_TIMESTAMP
2634 printk("getattr_core: %s/%s, mtime=%ld\n",
2635 DENTRY_PATH(dir), fattr->f_mtime);
2636#endif
2637 result = 0;
2638
2639out_free:
2640 smb_rput(req);
2641out:
2642 return result;
2643}
2644
2645/*
2646 * Bugs Noted:
2647 * (1) Win 95 swaps the date and time fields in the standard info level.
2648 */
2649static int
2650smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir,
2651 struct smb_request *req, int infolevel)
2652{
2653 char *p, *param;
2654 int result;
2655
2656 param = req->rq_buffer;
2657 WSET(param, 0, infolevel);
2658 DSET(param, 2, 0);
2659 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
2660 if (result < 0)
2661 goto out;
2662 p = param + 6 + result;
2663
2664 req->rq_trans2_command = TRANSACT2_QPATHINFO;
2665 req->rq_ldata = 0;
2666 req->rq_data = NULL;
2667 req->rq_lparm = p - param;
2668 req->rq_parm = param;
2669 req->rq_flags = 0;
2670 result = smb_add_request(req);
2671 if (result < 0)
2672 goto out;
2673 if (req->rq_rcls != 0) {
2674 VERBOSE("for %s: result=%d, rcls=%d, err=%d\n",
2675 &param[6], result, req->rq_rcls, req->rq_err);
2676 result = smb_errno(req);
2677 goto out;
2678 }
2679 result = -ENOENT;
2680 if (req->rq_ldata < 22) {
2681 PARANOIA("not enough data for %s, len=%d\n",
2682 &param[6], req->rq_ldata);
2683 goto out;
2684 }
2685
2686 result = 0;
2687out:
2688 return result;
2689}
2690
2691static int
2692smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir,
2693 struct smb_fattr *attr)
2694{
2695 u16 date, time;
2696 int off_date = 0, off_time = 2;
2697 int result;
2698 struct smb_request *req;
2699
2700 result = -ENOMEM;
2701 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2702 goto out;
2703
2704 result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD);
2705 if (result < 0)
2706 goto out_free;
2707
2708 /*
2709 * Kludge alert: Win 95 swaps the date and time field,
2710 * contrary to the CIFS docs and Win NT practice.
2711 */
2712 if (server->mnt->flags & SMB_MOUNT_WIN95) {
2713 off_date = 2;
2714 off_time = 0;
2715 }
2716 date = WVAL(req->rq_data, off_date);
2717 time = WVAL(req->rq_data, off_time);
2718 attr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2719 attr->f_ctime.tv_nsec = 0;
2720
2721 date = WVAL(req->rq_data, 4 + off_date);
2722 time = WVAL(req->rq_data, 4 + off_time);
2723 attr->f_atime.tv_sec = date_dos2unix(server, date, time);
2724 attr->f_atime.tv_nsec = 0;
2725
2726 date = WVAL(req->rq_data, 8 + off_date);
2727 time = WVAL(req->rq_data, 8 + off_time);
2728 attr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2729 attr->f_mtime.tv_nsec = 0;
2730#ifdef SMBFS_DEBUG_TIMESTAMP
2731 printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
2732 DENTRY_PATH(dir), date, time, attr->f_mtime);
2733#endif
2734 attr->f_size = DVAL(req->rq_data, 12);
2735 attr->attr = WVAL(req->rq_data, 20);
2736
2737out_free:
2738 smb_rput(req);
2739out:
2740 return result;
2741}
2742
2743static int
2744smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir,
2745 struct smb_fattr *attr)
2746{
2747 struct smb_request *req;
2748 int result;
2749
2750 result = -ENOMEM;
2751 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2752 goto out;
2753
2754 result = smb_proc_getattr_trans2(server, dir, req,
2755 SMB_QUERY_FILE_ALL_INFO);
2756 if (result < 0)
2757 goto out_free;
2758
2759 attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0));
2760 attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8));
2761 attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16));
2762 /* change (24) */
2763 attr->attr = WVAL(req->rq_data, 32);
2764 /* pad? (34) */
2765 /* allocated size (40) */
2766 attr->f_size = LVAL(req->rq_data, 48);
2767
2768out_free:
2769 smb_rput(req);
2770out:
2771 return result;
2772}
2773
2774static int
2775smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
2776 struct smb_fattr *attr)
2777{
2778 struct smb_request *req;
2779 int result;
2780
2781 result = -ENOMEM;
2782 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2783 goto out;
2784
2785 result = smb_proc_getattr_trans2(server, dir, req,
2786 SMB_QUERY_FILE_UNIX_BASIC);
2787 if (result < 0)
2788 goto out_free;
2789
2790 smb_decode_unix_basic(attr, server, req->rq_data);
2791
2792out_free:
2793 smb_rput(req);
2794out:
2795 return result;
2796}
2797
2798static int
2799smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir,
2800 struct smb_fattr *attr)
2801{
2802 struct inode *inode = dir->d_inode;
2803 int result;
2804
2805 /* FIXME: why not use the "all" version? */
2806 result = smb_proc_getattr_trans2_std(server, dir, attr);
2807 if (result < 0)
2808 goto out;
2809
2810 /*
2811 * None of the getattr versions here can make win9x return the right
2812 * filesize if there are changes made to an open file.
2813 * A seek-to-end does return the right size, but we only need to do
2814 * that on files we have written.
2815 */
2816 if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE &&
2817 smb_is_open(inode))
2818 {
2819 __u16 fileid = SMB_I(inode)->fileid;
2820 attr->f_size = smb_proc_seek(server, fileid, 2, 0);
2821 }
2822
2823out:
2824 return result;
2825}
2826
2827static int
2828smb_proc_ops_wait(struct smb_sb_info *server)
2829{
2830 int result;
2831
2832 result = wait_event_interruptible_timeout(server->conn_wq,
2833 server->conn_complete, 30*HZ);
2834
2835 if (!result || signal_pending(current))
2836 return -EIO;
2837
2838 return 0;
2839}
2840
2841static int
2842smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir,
2843 struct smb_fattr *fattr)
2844{
2845 int result;
2846
2847 if (smb_proc_ops_wait(server) < 0)
2848 return -EIO;
2849
2850 smb_init_dirent(server, fattr);
2851 result = server->ops->getattr(server, dir, fattr);
2852 smb_finish_dirent(server, fattr);
2853
2854 return result;
2855}
2856
2857static int
2858smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir,
2859 struct smb_cache_control *ctl)
2860{
2861 struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry);
2862
2863 if (smb_proc_ops_wait(server) < 0)
2864 return -EIO;
2865
2866 return server->ops->readdir(filp, dirent, filldir, ctl);
2867}
2868
2869int
2870smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr)
2871{
2872 struct smb_sb_info *server = server_from_dentry(dir);
2873 int result;
2874
2875 smb_init_dirent(server, fattr);
2876 result = server->ops->getattr(server, dir, fattr);
2877 smb_finish_dirent(server, fattr);
2878
2879 return result;
2880}
2881
2882
2883/*
2884 * Because of bugs in the core protocol, we use this only to set
2885 * attributes. See smb_proc_settime() below for timestamp handling.
2886 *
2887 * Bugs Noted:
2888 * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail
2889 * with an undocumented error (ERRDOS code 50). Setting
2890 * mtime to 0 allows the attributes to be set.
2891 * (2) The extra parameters following the name string aren't
2892 * in the CIFS docs, but seem to be necessary for operation.
2893 */
2894static int
2895smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
2896 __u16 attr)
2897{
2898 char *p;
2899 int result;
2900 struct smb_request *req;
2901
2902 result = -ENOMEM;
2903 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2904 goto out;
2905
2906 p = smb_setup_header(req, SMBsetatr, 8, 0);
2907 WSET(req->rq_header, smb_vwv0, attr);
2908 DSET(req->rq_header, smb_vwv1, 0); /* mtime */
2909 WSET(req->rq_header, smb_vwv3, 0); /* reserved values */
2910 WSET(req->rq_header, smb_vwv4, 0);
2911 WSET(req->rq_header, smb_vwv5, 0);
2912 WSET(req->rq_header, smb_vwv6, 0);
2913 WSET(req->rq_header, smb_vwv7, 0);
2914 result = smb_simple_encode_path(req, &p, dentry, NULL);
2915 if (result < 0)
2916 goto out_free;
2917 if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) {
2918 result = -ENAMETOOLONG;
2919 goto out_free;
2920 }
2921 *p++ = 4;
2922 *p++ = 0;
2923 smb_setup_bcc(req, p);
2924
2925 result = smb_request_ok(req, SMBsetatr, 0, 0);
2926 if (result < 0)
2927 goto out_free;
2928 result = 0;
2929
2930out_free:
2931 smb_rput(req);
2932out:
2933 return result;
2934}
2935
2936/*
2937 * Because of bugs in the trans2 setattr messages, we must set
2938 * attributes and timestamps separately. The core SMBsetatr
2939 * message seems to be the only reliable way to set attributes.
2940 */
2941int
2942smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr)
2943{
2944 struct smb_sb_info *server = server_from_dentry(dir);
2945 int result;
2946
2947 VERBOSE("setting %s/%s, open=%d\n",
2948 DENTRY_PATH(dir), smb_is_open(dir->d_inode));
2949 result = smb_proc_setattr_core(server, dir, fattr->attr);
2950 return result;
2951}
2952
2953/*
2954 * Sets the timestamps for an file open with write permissions.
2955 */
2956static int
2957smb_proc_setattr_ext(struct smb_sb_info *server,
2958 struct inode *inode, struct smb_fattr *fattr)
2959{
2960 __u16 date, time;
2961 int result;
2962 struct smb_request *req;
2963
2964 result = -ENOMEM;
2965 if (! (req = smb_alloc_request(server, 0)))
2966 goto out;
2967
2968 smb_setup_header(req, SMBsetattrE, 7, 0);
2969 WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
2970 /* We don't change the creation time */
2971 WSET(req->rq_header, smb_vwv1, 0);
2972 WSET(req->rq_header, smb_vwv2, 0);
2973 date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
2974 WSET(req->rq_header, smb_vwv3, date);
2975 WSET(req->rq_header, smb_vwv4, time);
2976 date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
2977 WSET(req->rq_header, smb_vwv5, date);
2978 WSET(req->rq_header, smb_vwv6, time);
2979#ifdef SMBFS_DEBUG_TIMESTAMP
2980 printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n",
2981 date, time, fattr->f_mtime);
2982#endif
2983
2984 req->rq_flags |= SMB_REQ_NORETRY;
2985 result = smb_request_ok(req, SMBsetattrE, 0, 0);
2986 if (result < 0)
2987 goto out_free;
2988 result = 0;
2989out_free:
2990 smb_rput(req);
2991out:
2992 return result;
2993}
2994
2995/*
2996 * Bugs Noted:
2997 * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't
2998 * set the file's attribute flags.
2999 */
3000static int
3001smb_proc_setattr_trans2(struct smb_sb_info *server,
3002 struct dentry *dir, struct smb_fattr *fattr)
3003{
3004 __u16 date, time;
3005 char *p, *param;
3006 int result;
3007 char data[26];
3008 struct smb_request *req;
3009
3010 result = -ENOMEM;
3011 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3012 goto out;
3013 param = req->rq_buffer;
3014
3015 WSET(param, 0, 1); /* Info level SMB_INFO_STANDARD */
3016 DSET(param, 2, 0);
3017 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
3018 if (result < 0)
3019 goto out_free;
3020 p = param + 6 + result;
3021
3022 WSET(data, 0, 0); /* creation time */
3023 WSET(data, 2, 0);
3024 date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
3025 WSET(data, 4, date);
3026 WSET(data, 6, time);
3027 date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
3028 WSET(data, 8, date);
3029 WSET(data, 10, time);
3030#ifdef SMBFS_DEBUG_TIMESTAMP
3031 printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
3032 DENTRY_PATH(dir), date, time, fattr->f_mtime);
3033#endif
3034 DSET(data, 12, 0); /* size */
3035 DSET(data, 16, 0); /* blksize */
3036 WSET(data, 20, 0); /* attr */
3037 DSET(data, 22, 0); /* ULONG EA size */
3038
3039 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3040 req->rq_ldata = 26;
3041 req->rq_data = data;
3042 req->rq_lparm = p - param;
3043 req->rq_parm = param;
3044 req->rq_flags = 0;
3045 result = smb_add_request(req);
3046 if (result < 0)
3047 goto out_free;
3048 result = 0;
3049 if (req->rq_rcls != 0)
3050 result = smb_errno(req);
3051
3052out_free:
3053 smb_rput(req);
3054out:
3055 return result;
3056}
3057
3058/*
3059 * ATTR_MODE 0x001
3060 * ATTR_UID 0x002
3061 * ATTR_GID 0x004
3062 * ATTR_SIZE 0x008
3063 * ATTR_ATIME 0x010
3064 * ATTR_MTIME 0x020
3065 * ATTR_CTIME 0x040
3066 * ATTR_ATIME_SET 0x080
3067 * ATTR_MTIME_SET 0x100
3068 * ATTR_FORCE 0x200
3069 * ATTR_ATTR_FLAG 0x400
3070 *
3071 * major/minor should only be set by mknod.
3072 */
3073int
3074smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
3075 unsigned int major, unsigned int minor)
3076{
3077 struct smb_sb_info *server = server_from_dentry(d);
3078 u64 nttime;
3079 char *p, *param;
3080 int result;
3081 char data[100];
3082 struct smb_request *req;
3083
3084 result = -ENOMEM;
3085 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3086 goto out;
3087 param = req->rq_buffer;
3088
3089 DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
3090
3091 WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
3092 DSET(param, 2, 0);
3093 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
3094 if (result < 0)
3095 goto out_free;
3096 p = param + 6 + result;
3097
3098 /* 0 L file size in bytes */
3099 /* 8 L file size on disk in bytes (block count) */
3100 /* 40 L uid */
3101 /* 48 L gid */
3102 /* 56 W file type enum */
3103 /* 60 L devmajor */
3104 /* 68 L devminor */
3105 /* 76 L unique ID (inode) */
3106 /* 84 L permissions */
3107 /* 92 L link count */
3108 LSET(data, 0, SMB_SIZE_NO_CHANGE);
3109 LSET(data, 8, SMB_SIZE_NO_CHANGE);
3110 LSET(data, 16, SMB_TIME_NO_CHANGE);
3111 LSET(data, 24, SMB_TIME_NO_CHANGE);
3112 LSET(data, 32, SMB_TIME_NO_CHANGE);
3113 LSET(data, 40, SMB_UID_NO_CHANGE);
3114 LSET(data, 48, SMB_GID_NO_CHANGE);
3115 DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
3116 LSET(data, 60, major);
3117 LSET(data, 68, minor);
3118 LSET(data, 76, 0);
3119 LSET(data, 84, SMB_MODE_NO_CHANGE);
3120 LSET(data, 92, 0);
3121
3122 if (attr->ia_valid & ATTR_SIZE) {
3123 LSET(data, 0, attr->ia_size);
3124 LSET(data, 8, 0); /* can't set anyway */
3125 }
3126
3127 /*
3128 * FIXME: check the conversion function it the correct one
3129 *
3130 * we can't set ctime but we might as well pass this to the server
3131 * and let it ignore it.
3132 */
3133 if (attr->ia_valid & ATTR_CTIME) {
3134 nttime = smb_unixutc2ntutc(attr->ia_ctime);
3135 LSET(data, 16, nttime);
3136 }
3137 if (attr->ia_valid & ATTR_ATIME) {
3138 nttime = smb_unixutc2ntutc(attr->ia_atime);
3139 LSET(data, 24, nttime);
3140 }
3141 if (attr->ia_valid & ATTR_MTIME) {
3142 nttime = smb_unixutc2ntutc(attr->ia_mtime);
3143 LSET(data, 32, nttime);
3144 }
3145
3146 if (attr->ia_valid & ATTR_UID) {
3147 LSET(data, 40, attr->ia_uid);
3148 }
3149 if (attr->ia_valid & ATTR_GID) {
3150 LSET(data, 48, attr->ia_gid);
3151 }
3152
3153 if (attr->ia_valid & ATTR_MODE) {
3154 LSET(data, 84, attr->ia_mode);
3155 }
3156
3157 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3158 req->rq_ldata = 100;
3159 req->rq_data = data;
3160 req->rq_lparm = p - param;
3161 req->rq_parm = param;
3162 req->rq_flags = 0;
3163 result = smb_add_request(req);
3164
3165out_free:
3166 smb_rput(req);
3167out:
3168 return result;
3169}
3170
3171
3172/*
3173 * Set the modify and access timestamps for a file.
3174 *
3175 * Incredibly enough, in all of SMB there is no message to allow
3176 * setting both attributes and timestamps at once.
3177 *
3178 * Bugs Noted:
3179 * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message
3180 * with info level 1 (INFO_STANDARD).
3181 * (2) Win 95 seems not to support setting directory timestamps.
3182 * (3) Under the core protocol apparently the only way to set the
3183 * timestamp is to open and close the file.
3184 */
3185int
3186smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
3187{
3188 struct smb_sb_info *server = server_from_dentry(dentry);
3189 struct inode *inode = dentry->d_inode;
3190 int result;
3191
3192 VERBOSE("setting %s/%s, open=%d\n",
3193 DENTRY_PATH(dentry), smb_is_open(inode));
3194
3195 /* setting the time on a Win95 server fails (tridge) */
3196 if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 &&
3197 !(server->mnt->flags & SMB_MOUNT_WIN95)) {
3198 if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
3199 result = smb_proc_setattr_ext(server, inode, fattr);
3200 else
3201 result = smb_proc_setattr_trans2(server, dentry, fattr);
3202 } else {
3203 /*
3204 * Fail silently on directories ... timestamp can't be set?
3205 */
3206 result = 0;
3207 if (S_ISREG(inode->i_mode)) {
3208 /*
3209 * Set the mtime by opening and closing the file.
3210 * Note that the file is opened read-only, but this
3211 * still allows us to set the date (tridge)
3212 */
3213 result = -EACCES;
3214 if (!smb_is_open(inode))
3215 smb_proc_open(server, dentry, SMB_O_RDONLY);
3216 if (smb_is_open(inode)) {
3217 inode->i_mtime = fattr->f_mtime;
3218 result = smb_proc_close_inode(server, inode);
3219 }
3220 }
3221 }
3222
3223 return result;
3224}
3225
3226int
3227smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
3228{
3229 struct smb_sb_info *server = SMB_SB(dentry->d_sb);
3230 int result;
3231 char *p;
3232 long unit;
3233 struct smb_request *req;
3234
3235 result = -ENOMEM;
3236 if (! (req = smb_alloc_request(server, 0)))
3237 goto out;
3238
3239 smb_setup_header(req, SMBdskattr, 0, 0);
3240 if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
3241 goto out_free;
3242 p = SMB_VWV(req->rq_header);
3243 unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
3244 attr->f_blocks = WVAL(p, 0) * unit;
3245 attr->f_bsize = SMB_ST_BLKSIZE;
3246 attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
3247 result = 0;
3248
3249out_free:
3250 smb_rput(req);
3251out:
3252 return result;
3253}
3254
3255int
3256smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
3257 char *buffer, int len)
3258{
3259 char *p, *param;
3260 int result;
3261 struct smb_request *req;
3262
3263 DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
3264
3265 result = -ENOMEM;
3266 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3267 goto out;
3268 param = req->rq_buffer;
3269
3270 WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
3271 DSET(param, 2, 0);
3272 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
3273 if (result < 0)
3274 goto out_free;
3275 p = param + 6 + result;
3276
3277 req->rq_trans2_command = TRANSACT2_QPATHINFO;
3278 req->rq_ldata = 0;
3279 req->rq_data = NULL;
3280 req->rq_lparm = p - param;
3281 req->rq_parm = param;
3282 req->rq_flags = 0;
3283 result = smb_add_request(req);
3284 if (result < 0)
3285 goto out_free;
3286 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3287 &param[6], result, req->rq_rcls, req->rq_err);
3288
3289 /* copy data up to the \0 or buffer length */
3290 result = len;
3291 if (req->rq_ldata < len)
3292 result = req->rq_ldata;
3293 strncpy(buffer, req->rq_data, result);
3294
3295out_free:
3296 smb_rput(req);
3297out:
3298 return result;
3299}
3300
3301
3302/*
3303 * Create a symlink object called dentry which points to oldpath.
3304 * Samba does not permit dangling links but returns a suitable error message.
3305 */
3306int
3307smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
3308 const char *oldpath)
3309{
3310 char *p, *param;
3311 int result;
3312 struct smb_request *req;
3313
3314 result = -ENOMEM;
3315 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3316 goto out;
3317 param = req->rq_buffer;
3318
3319 WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
3320 DSET(param, 2, 0);
3321 result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
3322 if (result < 0)
3323 goto out_free;
3324 p = param + 6 + result;
3325
3326 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3327 req->rq_ldata = strlen(oldpath) + 1;
3328 req->rq_data = (char *) oldpath;
3329 req->rq_lparm = p - param;
3330 req->rq_parm = param;
3331 req->rq_flags = 0;
3332 result = smb_add_request(req);
3333 if (result < 0)
3334 goto out_free;
3335
3336 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3337 &param[6], result, req->rq_rcls, req->rq_err);
3338 result = 0;
3339
3340out_free:
3341 smb_rput(req);
3342out:
3343 return result;
3344}
3345
3346/*
3347 * Create a hard link object called new_dentry which points to dentry.
3348 */
3349int
3350smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
3351 struct dentry *new_dentry)
3352{
3353 char *p, *param;
3354 int result;
3355 struct smb_request *req;
3356
3357 result = -ENOMEM;
3358 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3359 goto out;
3360 param = req->rq_buffer;
3361
3362 WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
3363 DSET(param, 2, 0);
3364 result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
3365 new_dentry, NULL);
3366 if (result < 0)
3367 goto out_free;
3368 p = param + 6 + result;
3369
3370 /* Grr, pointless separation of parameters and data ... */
3371 req->rq_data = p;
3372 req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
3373 dentry, NULL);
3374
3375 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3376 req->rq_lparm = p - param;
3377 req->rq_parm = param;
3378 req->rq_flags = 0;
3379 result = smb_add_request(req);
3380 if (result < 0)
3381 goto out_free;
3382
3383 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3384 &param[6], result, req->rq_rcls, req->rq_err);
3385 result = 0;
3386
3387out_free:
3388 smb_rput(req);
3389out:
3390 return result;
3391}
3392
3393static int
3394smb_proc_query_cifsunix(struct smb_sb_info *server)
3395{
3396 int result;
3397 int major, minor;
3398 u64 caps;
3399 char param[2];
3400 struct smb_request *req;
3401
3402 result = -ENOMEM;
3403 if (! (req = smb_alloc_request(server, 100)))
3404 goto out;
3405
3406 WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
3407
3408 req->rq_trans2_command = TRANSACT2_QFSINFO;
3409 req->rq_ldata = 0;
3410 req->rq_data = NULL;
3411 req->rq_lparm = 2;
3412 req->rq_parm = param;
3413 req->rq_flags = 0;
3414 result = smb_add_request(req);
3415 if (result < 0)
3416 goto out_free;
3417
3418 if (req->rq_ldata < 12) {
3419 PARANOIA("Not enough data\n");
3420 goto out_free;
3421 }
3422 major = WVAL(req->rq_data, 0);
3423 minor = WVAL(req->rq_data, 2);
3424
3425 DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
3426 major, minor);
3427 /* FIXME: verify that we are ok with this major/minor? */
3428
3429 caps = LVAL(req->rq_data, 4);
3430 DEBUG1("Server capabilities 0x%016llx\n", caps);
3431
3432out_free:
3433 smb_rput(req);
3434out:
3435 return result;
3436}
3437
3438
3439static void
3440install_ops(struct smb_ops *dst, struct smb_ops *src)
3441{
3442 memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
3443}
3444
3445/* < LANMAN2 */
3446static struct smb_ops smb_ops_core =
3447{
3448 .read = smb_proc_read,
3449 .write = smb_proc_write,
3450 .readdir = smb_proc_readdir_short,
3451 .getattr = smb_proc_getattr_core,
3452 .truncate = smb_proc_trunc32,
3453};
3454
3455/* LANMAN2, OS/2, others? */
3456static struct smb_ops smb_ops_os2 =
3457{
3458 .read = smb_proc_read,
3459 .write = smb_proc_write,
3460 .readdir = smb_proc_readdir_long,
3461 .getattr = smb_proc_getattr_trans2_std,
3462 .truncate = smb_proc_trunc32,
3463};
3464
3465/* Win95, and possibly some NetApp versions too */
3466static struct smb_ops smb_ops_win95 =
3467{
3468 .read = smb_proc_read, /* does not support 12word readX */
3469 .write = smb_proc_write,
3470 .readdir = smb_proc_readdir_long,
3471 .getattr = smb_proc_getattr_95,
3472 .truncate = smb_proc_trunc95,
3473};
3474
3475/* Samba, NT4 and NT5 */
3476static struct smb_ops smb_ops_winNT =
3477{
3478 .read = smb_proc_readX,
3479 .write = smb_proc_writeX,
3480 .readdir = smb_proc_readdir_long,
3481 .getattr = smb_proc_getattr_trans2_all,
3482 .truncate = smb_proc_trunc64,
3483};
3484
3485/* Samba w/ unix extensions. Others? */
3486static struct smb_ops smb_ops_unix =
3487{
3488 .read = smb_proc_readX,
3489 .write = smb_proc_writeX,
3490 .readdir = smb_proc_readdir_long,
3491 .getattr = smb_proc_getattr_unix,
3492 /* FIXME: core/ext/time setattr needs to be cleaned up! */
3493 /* .setattr = smb_proc_setattr_unix, */
3494 .truncate = smb_proc_trunc64,
3495};
3496
3497/* Place holder until real ops are in place */
3498static struct smb_ops smb_ops_null =
3499{
3500 .readdir = smb_proc_readdir_null,
3501 .getattr = smb_proc_getattr_null,
3502};
3503
3504void smb_install_null_ops(struct smb_ops *ops)
3505{
3506 install_ops(ops, &smb_ops_null);
3507}
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
deleted file mode 100644
index 05939a6f43e6..000000000000
--- a/fs/smbfs/proto.h
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * Autogenerated with cproto on: Sat Sep 13 17:18:51 CEST 2003
3 */
4
5struct smb_request;
6struct sock;
7struct statfs;
8
9/* proc.c */
10extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp);
11extern __u32 smb_len(__u8 *p);
12extern int smb_get_rsize(struct smb_sb_info *server);
13extern int smb_get_wsize(struct smb_sb_info *server);
14extern int smb_errno(struct smb_request *req);
15extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt);
16extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc);
17extern int smb_open(struct dentry *dentry, int wish);
18extern int smb_close(struct inode *ino);
19extern int smb_close_fileid(struct dentry *dentry, __u16 fileid);
20extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid);
21extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry);
22extern int smb_proc_mkdir(struct dentry *dentry);
23extern int smb_proc_rmdir(struct dentry *dentry);
24extern int smb_proc_unlink(struct dentry *dentry);
25extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
26extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
27 struct super_block *sb);
28extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
29extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
30extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
31extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
32extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
33extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
34extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
35extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
36extern void smb_install_null_ops(struct smb_ops *ops);
37/* dir.c */
38extern const struct file_operations smb_dir_operations;
39extern const struct inode_operations smb_dir_inode_operations;
40extern const struct inode_operations smb_dir_inode_operations_unix;
41extern void smb_new_dentry(struct dentry *dentry);
42extern void smb_renew_times(struct dentry *dentry);
43/* cache.c */
44extern void smb_invalid_dir_cache(struct inode *dir);
45extern void smb_invalidate_dircache_entries(struct dentry *parent);
46extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos);
47extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry);
48/* sock.c */
49extern void smb_data_ready(struct sock *sk, int len);
50extern int smb_valid_socket(struct inode *inode);
51extern void smb_close_socket(struct smb_sb_info *server);
52extern int smb_recv_available(struct smb_sb_info *server);
53extern int smb_receive_header(struct smb_sb_info *server);
54extern int smb_receive_drop(struct smb_sb_info *server);
55extern int smb_receive(struct smb_sb_info *server, struct smb_request *req);
56extern int smb_send_request(struct smb_request *req);
57/* inode.c */
58extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr);
59extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr);
60extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr);
61extern void smb_invalidate_inodes(struct smb_sb_info *server);
62extern int smb_revalidate_inode(struct dentry *dentry);
63extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
64extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
65/* file.c */
66extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */
70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */
72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server);
74extern void smbiod_unregister_server(struct smb_sb_info *server);
75extern void smbiod_flush(struct smb_sb_info *server);
76extern int smbiod_retry(struct smb_sb_info *server);
77/* request.c */
78extern int smb_init_request_cache(void);
79extern void smb_destroy_request_cache(void);
80extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize);
81extern void smb_rput(struct smb_request *req);
82extern int smb_add_request(struct smb_request *req);
83extern int smb_request_send_server(struct smb_sb_info *server);
84extern int smb_request_recv(struct smb_sb_info *server);
85/* symlink.c */
86extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname);
87extern const struct inode_operations smb_link_inode_operations;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
deleted file mode 100644
index 45f45933e862..000000000000
--- a/fs/smbfs/request.c
+++ /dev/null
@@ -1,818 +0,0 @@
1/*
2 * request.c
3 *
4 * Copyright (C) 2001 by Urban Widmark
5 *
6 * Please add a note about your changes to smbfs in the ChangeLog file.
7 */
8
9#include <linux/kernel.h>
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/slab.h>
13#include <linux/net.h>
14#include <linux/sched.h>
15
16#include <linux/smb_fs.h>
17#include <linux/smbno.h>
18#include <linux/smb_mount.h>
19
20#include "smb_debug.h"
21#include "request.h"
22#include "proto.h"
23
24/* #define SMB_SLAB_DEBUG (SLAB_RED_ZONE | SLAB_POISON) */
25#define SMB_SLAB_DEBUG 0
26
27/* cache for request structures */
28static struct kmem_cache *req_cachep;
29
30static int smb_request_send_req(struct smb_request *req);
31
32/*
33 /proc/slabinfo:
34 name, active, num, objsize, active_slabs, num_slabs, #pages
35*/
36
37
38int smb_init_request_cache(void)
39{
40 req_cachep = kmem_cache_create("smb_request",
41 sizeof(struct smb_request), 0,
42 SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
43 NULL);
44 if (req_cachep == NULL)
45 return -ENOMEM;
46
47 return 0;
48}
49
/* Destroy the request slab cache referenced by req_cachep. */
50void smb_destroy_request_cache(void)
51{
52 kmem_cache_destroy(req_cachep);
53}
54
55/*
56 * Allocate and initialise a request structure
57 */
58static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
59 int bufsize)
60{
61 struct smb_request *req;
62 unsigned char *buf = NULL;
63
64 req = kmem_cache_zalloc(req_cachep, GFP_KERNEL);
65 VERBOSE("allocating request: %p\n", req);
66 if (!req)
67 goto out;
68
69 if (bufsize > 0) {
70 buf = kmalloc(bufsize, GFP_NOFS);
71 if (!buf) {
72 kmem_cache_free(req_cachep, req);
73 return NULL;
74 }
75 }
76
77 req->rq_buffer = buf;
78 req->rq_bufsize = bufsize;
79 req->rq_server = server;
80 init_waitqueue_head(&req->rq_wait);
81 INIT_LIST_HEAD(&req->rq_queue);
82 atomic_set(&req->rq_count, 1);
83
84out:
85 return req;
86}
87
88struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
89{
90 struct smb_request *req = NULL;
91
92 for (;;) {
93 atomic_inc(&server->nr_requests);
94 if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) {
95 req = smb_do_alloc_request(server, bufsize);
96 if (req != NULL)
97 break;
98 }
99
100#if 0
101 /*
102 * Try to free up at least one request in order to stay
103 * below the hard limit
104 */
105 if (nfs_try_to_free_pages(server))
106 continue;
107
108 if (fatal_signal_pending(current))
109 return ERR_PTR(-ERESTARTSYS);
110 current->policy = SCHED_YIELD;
111 schedule();
112#else
113 /* FIXME: we want something like nfs does above, but that
114 requires changes to all callers and can wait. */
115 break;
116#endif
117 }
118 return req;
119}
120
121static void smb_free_request(struct smb_request *req)
122{
123 atomic_dec(&req->rq_server->nr_requests);
124 if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
125 kfree(req->rq_buffer);
126 kfree(req->rq_trans2buffer);
127 kmem_cache_free(req_cachep, req);
128}
129
130/*
131 * What prevents a rget to race with a rput? The count must never drop to zero
132 * while it is in use. Only rput if it is ok that it is free'd.
133 */
/* Take one more reference on req by incrementing rq_count. */
134static void smb_rget(struct smb_request *req)
135{
136 atomic_inc(&req->rq_count);
137}
138void smb_rput(struct smb_request *req)
139{
140 if (atomic_dec_and_test(&req->rq_count)) {
141 list_del_init(&req->rq_queue);
142 smb_free_request(req);
143 }
144}
145
146/* setup to receive the data part of the SMB */
147static int smb_setup_bcc(struct smb_request *req)
148{
149 int result = 0;
150 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
151
152 if (req->rq_rlen > req->rq_bufsize) {
153 PARANOIA("Packet too large %d > %d\n",
154 req->rq_rlen, req->rq_bufsize);
155 return -ENOBUFS;
156 }
157
158 req->rq_iov[0].iov_base = req->rq_buffer;
159 req->rq_iov[0].iov_len = req->rq_rlen;
160 req->rq_iovlen = 1;
161
162 return result;
163}
164
165/*
166 * Prepare a "normal" request structure.
167 */
168static int smb_setup_request(struct smb_request *req)
169{
170 int len = smb_len(req->rq_header) + 4;
171 req->rq_slen = len;
172
173 /* if we expect a data part in the reply we set the iov's to read it */
174 if (req->rq_resp_bcc)
175 req->rq_setup_read = smb_setup_bcc;
176
177 /* This tries to support re-using the same request */
178 req->rq_bytes_sent = 0;
179 req->rq_rcls = 0;
180 req->rq_err = 0;
181 req->rq_errno = 0;
182 req->rq_fragment = 0;
183 kfree(req->rq_trans2buffer);
184 req->rq_trans2buffer = NULL;
185
186 return 0;
187}
188
189/*
190 * Prepare a transaction2 request structure
191 */
192static int smb_setup_trans2request(struct smb_request *req)
193{
194 struct smb_sb_info *server = req->rq_server;
195 int mparam, mdata;
196 static unsigned char padding[4];
197
198 /* I know the following is very ugly, but I want to build the
199 smb packet as efficiently as possible. */
200
201 const int smb_parameters = 15;
202 const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
203 const int oparam = ALIGN(header + 3, sizeof(u32));
204 const int odata = ALIGN(oparam + req->rq_lparm, sizeof(u32));
205 const int bcc = (req->rq_data ? odata + req->rq_ldata :
206 oparam + req->rq_lparm) - header;
207
208 if ((bcc + oparam) > server->opt.max_xmit)
209 return -ENOMEM;
210 smb_setup_header(req, SMBtrans2, smb_parameters, bcc);
211
212 /*
213 * max parameters + max data + max setup == bufsize to make NT4 happy
214 * and not abort the transfer or split into multiple responses. It also
215 * makes smbfs happy as handling packets larger than the buffer size
216 * is extra work.
217 *
218 * OS/2 is probably going to hate me for this ...
219 */
220 mparam = SMB_TRANS2_MAX_PARAM;
221 mdata = req->rq_bufsize - mparam;
222
223 mdata = server->opt.max_xmit - mparam - 100;
224 if (mdata < 1024) {
225 mdata = 1024;
226 mparam = 20;
227 }
228
229#if 0
230 /* NT/win2k has ~4k max_xmit, so with this we request more than it wants
231 to return as one SMB. Useful for testing the fragmented trans2
232 handling. */
233 mdata = 8192;
234#endif
235
236 WSET(req->rq_header, smb_tpscnt, req->rq_lparm);
237 WSET(req->rq_header, smb_tdscnt, req->rq_ldata);
238 WSET(req->rq_header, smb_mprcnt, mparam);
239 WSET(req->rq_header, smb_mdrcnt, mdata);
240 WSET(req->rq_header, smb_msrcnt, 0); /* max setup always 0 ? */
241 WSET(req->rq_header, smb_flags, 0);
242 DSET(req->rq_header, smb_timeout, 0);
243 WSET(req->rq_header, smb_pscnt, req->rq_lparm);
244 WSET(req->rq_header, smb_psoff, oparam - 4);
245 WSET(req->rq_header, smb_dscnt, req->rq_ldata);
246 WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0);
247 *(req->rq_header + smb_suwcnt) = 0x01; /* setup count */
248 *(req->rq_header + smb_suwcnt + 1) = 0x00; /* reserved */
249 WSET(req->rq_header, smb_setup0, req->rq_trans2_command);
250
251 req->rq_iovlen = 2;
252 req->rq_iov[0].iov_base = (void *) req->rq_header;
253 req->rq_iov[0].iov_len = oparam;
254 req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm;
255 req->rq_iov[1].iov_len = req->rq_lparm;
256 req->rq_slen = oparam + req->rq_lparm;
257
258 if (req->rq_data) {
259 req->rq_iovlen += 2;
260 req->rq_iov[2].iov_base = padding;
261 req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm;
262 req->rq_iov[3].iov_base = req->rq_data;
263 req->rq_iov[3].iov_len = req->rq_ldata;
264 req->rq_slen = odata + req->rq_ldata;
265 }
266
267 /* always a data part for trans2 replies */
268 req->rq_setup_read = smb_setup_bcc;
269
270 return 0;
271}
272
273/*
274 * Add a request and tell smbiod to process it
275 */
/*
 * Assigns the request a multiplex id, tries to send it directly when the
 * connection is valid and nothing else is waiting on xmitq, and otherwise
 * leaves it on xmitq.  The caller then sleeps until SMB_REQ_RECEIVED is
 * set in rq_flags, a signal arrives, or 30*HZ jiffies have passed.
 *
 * Returns req->rq_errno as it stands at the end: the value of
 * smb_errno(req) when rq_rcls is non-zero (a timeout sets ERRSRV /
 * ERRtimeout), or -ERESTARTSYS if a signal is pending.
 * Returns early with -EIO for a trans2 request without a response buffer,
 * with the error from smb_setup_trans2request(), or with -EINTR if the
 * server lock could not be taken.
 */
276int smb_add_request(struct smb_request *req)
277{
278 long timeleft;
279 struct smb_sb_info *server = req->rq_server;
280 int result = 0;
281
282 smb_setup_request(req);
283 if (req->rq_trans2_command) {
284 if (req->rq_buffer == NULL) {
285 PARANOIA("trans2 attempted without response buffer!\n");
286 return -EIO;
287 }
288 result = smb_setup_trans2request(req);
289 }
290 if (result < 0)
291 return result;
292
293#ifdef SMB_DEBUG_PACKET_SIZE
294 add_xmit_stats(req);
295#endif
296
297 /* add 'req' to the queue of requests */
298 if (smb_lock_server_interruptible(server))
299 return -EINTR;
300
301 /*
302 * Try to send the request as the process. If that fails we queue the
303 * request and let smbiod send it later.
304 */
305
306 /* FIXME: each server has a number on the maximum number of parallel
307 requests. 10, 50 or so. We should not allow more requests to be
308 active. */
309 if (server->mid > 0xf000)
310 server->mid = 0;
311 req->rq_mid = server->mid++;
312 WSET(req->rq_header, smb_mid, req->rq_mid);
313
314 result = 0;
315 if (server->state == CONN_VALID) {
316 if (list_empty(&server->xmitq))
317 result = smb_request_send_req(req);
318 if (result < 0) {
319 /* Connection lost? */
320 server->conn_error = result;
321 server->state = CONN_INVALID;
322 }
323 }
 /*
  * smb_request_send_req() returns 1 only after it has moved the request
  * onto recvq; in every other case the request is parked on xmitq.
  */
324 if (result != 1)
325 list_add_tail(&req->rq_queue, &server->xmitq);
 /* extra reference for the queue; whoever dequeues the request puts it */
326 smb_rget(req);
327
328 if (server->state != CONN_VALID)
329 smbiod_retry(server);
330
331 smb_unlock_server(server);
332
333 smbiod_wake_up();
334
335 timeleft = wait_event_interruptible_timeout(req->rq_wait,
336 req->rq_flags & SMB_REQ_RECEIVED, 30*HZ);
337 if (!timeleft || signal_pending(current)) {
338 /*
339 * On timeout or on interrupt we want to try and remove the
340 * request from the recvq/xmitq.
341 * First check if the request is still part of a queue. (May
342 * have been removed by some error condition)
343 */
344 smb_lock_server(server);
345 if (!list_empty(&req->rq_queue)) {
346 list_del_init(&req->rq_queue);
347 smb_rput(req);
348 }
349 smb_unlock_server(server);
350 }
351
352 if (!timeleft) {
353 PARANOIA("request [%p, mid=%d] timed out!\n",
354 req, req->rq_mid);
355 VERBOSE("smb_com: %02x\n", *(req->rq_header + smb_com));
356 VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls));
357 VERBOSE("smb_flg: %02x\n", *(req->rq_header + smb_flg));
358 VERBOSE("smb_tid: %04x\n", WVAL(req->rq_header, smb_tid));
359 VERBOSE("smb_pid: %04x\n", WVAL(req->rq_header, smb_pid));
360 VERBOSE("smb_uid: %04x\n", WVAL(req->rq_header, smb_uid));
361 VERBOSE("smb_mid: %04x\n", WVAL(req->rq_header, smb_mid));
362 VERBOSE("smb_wct: %02x\n", *(req->rq_header + smb_wct));
363
364 req->rq_rcls = ERRSRV;
365 req->rq_err = ERRtimeout;
366
367 /* Just in case it was "stuck" */
368 smbiod_wake_up();
369 }
370 VERBOSE("woke up, rcls=%d\n", req->rq_rcls);
371
 /* a pending signal takes precedence over any SMB error */
372 if (req->rq_rcls != 0)
373 req->rq_errno = smb_errno(req);
374 if (signal_pending(current))
375 req->rq_errno = -ERESTARTSYS;
376 return req->rq_errno;
377}
378
379/*
380 * Send a request and place it on the recvq if successfully sent.
381 * Must be called with the server lock held.
382 */
383static int smb_request_send_req(struct smb_request *req)
384{
385 struct smb_sb_info *server = req->rq_server;
386 int result;
387
388 if (req->rq_bytes_sent == 0) {
389 WSET(req->rq_header, smb_tid, server->opt.tid);
390 WSET(req->rq_header, smb_pid, 1);
391 WSET(req->rq_header, smb_uid, server->opt.server_uid);
392 }
393
394 result = smb_send_request(req);
395 if (result < 0 && result != -EAGAIN)
396 goto out;
397
398 result = 0;
399 if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
400 goto out;
401
402 list_move_tail(&req->rq_queue, &server->recvq);
403 result = 1;
404out:
405 return result;
406}
407
408/*
409 * Sends one request for this server. (smbiod)
410 * Must be called with the server lock held.
411 * Returns: <0 on error
412 * 0 if no request could be completely sent
413 * 1 if all data for one request was sent
414 */
415int smb_request_send_server(struct smb_sb_info *server)
416{
417 struct list_head *head;
418 struct smb_request *req;
419 int result;
420
421 if (server->state != CONN_VALID)
422 return 0;
423
424 /* dequeue first request, if any */
425 req = NULL;
426 head = server->xmitq.next;
427 if (head != &server->xmitq) {
428 req = list_entry(head, struct smb_request, rq_queue);
429 }
430 if (!req)
431 return 0;
432
433 result = smb_request_send_req(req);
434 if (result < 0) {
435 server->conn_error = result;
436 list_move(&req->rq_queue, &server->xmitq);
437 result = -EIO;
438 goto out;
439 }
440
441out:
442 return result;
443}
444
445/*
446 * Try to find a request matching this "mid". Typically the first entry will
447 * be the matching one.
448 */
449static struct smb_request *find_request(struct smb_sb_info *server, int mid)
450{
451 struct list_head *tmp;
452 struct smb_request *req = NULL;
453
454 list_for_each(tmp, &server->recvq) {
455 req = list_entry(tmp, struct smb_request, rq_queue);
456 if (req->rq_mid == mid) {
457 break;
458 }
459 req = NULL;
460 }
461
462 if (!req) {
463 VERBOSE("received reply with mid %d but no request!\n",
464 WVAL(server->header, smb_mid));
465 server->rstate = SMB_RECV_DROP;
466 }
467
468 return req;
469}
470
471/*
472 * Called when we have read the smb header and believe this is a response.
473 */
474static int smb_init_request(struct smb_sb_info *server, struct smb_request *req)
475{
476 int hdrlen, wct;
477
478 memcpy(req->rq_header, server->header, SMB_HEADER_LEN);
479
480 wct = *(req->rq_header + smb_wct);
481 if (wct > 20) {
482 PARANOIA("wct too large, %d > 20\n", wct);
483 server->rstate = SMB_RECV_DROP;
484 return 0;
485 }
486
487 req->rq_resp_wct = wct;
488 hdrlen = SMB_HEADER_LEN + wct*2 + 2;
489 VERBOSE("header length: %d smb_wct: %2d\n", hdrlen, wct);
490
491 req->rq_bytes_recvd = SMB_HEADER_LEN;
492 req->rq_rlen = hdrlen;
493 req->rq_iov[0].iov_base = req->rq_header;
494 req->rq_iov[0].iov_len = hdrlen;
495 req->rq_iovlen = 1;
496 server->rstate = SMB_RECV_PARAM;
497
498#ifdef SMB_DEBUG_PACKET_SIZE
499 add_recv_stats(smb_len(server->header));
500#endif
501 return 0;
502}
503
504/*
505 * Reads the SMB parameters
506 */
507static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req)
508{
509 int result;
510
511 result = smb_receive(server, req);
512 if (result < 0)
513 return result;
514 if (req->rq_bytes_recvd < req->rq_rlen)
515 return 0;
516
517 VERBOSE("result: %d smb_bcc: %04x\n", result,
518 WVAL(req->rq_header, SMB_HEADER_LEN +
519 (*(req->rq_header + smb_wct) * 2)));
520
521 result = 0;
522 req->rq_iov[0].iov_base = NULL;
523 req->rq_rlen = 0;
524 if (req->rq_callback)
525 req->rq_callback(req);
526 else if (req->rq_setup_read)
527 result = req->rq_setup_read(req);
528 if (result < 0) {
529 server->rstate = SMB_RECV_DROP;
530 return result;
531 }
532
533 server->rstate = req->rq_rlen > 0 ? SMB_RECV_DATA : SMB_RECV_END;
534
535 req->rq_bytes_recvd = 0; // recvd out of the iov
536
537 VERBOSE("rlen: %d\n", req->rq_rlen);
538 if (req->rq_rlen < 0) {
539 PARANOIA("Parameters read beyond end of packet!\n");
540 server->rstate = SMB_RECV_END;
541 return -EIO;
542 }
543 return 0;
544}
545
546/*
547 * Reads the SMB data
548 */
549static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req)
550{
551 int result;
552
553 result = smb_receive(server, req);
554 if (result < 0)
555 goto out;
556 if (req->rq_bytes_recvd < req->rq_rlen)
557 goto out;
558 server->rstate = SMB_RECV_END;
559out:
560 VERBOSE("result: %d\n", result);
561 return result;
562}
563
564/*
565 * Receive a transaction2 response
566 * Return: 0 if the response has been fully read
567 * 1 if there are further "fragments" to read
568 * <0 if there is an error
569 */
570static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
571{
572 unsigned char *inbuf;
573 unsigned int parm_disp, parm_offset, parm_count, parm_tot;
574 unsigned int data_disp, data_offset, data_count, data_tot;
575 int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
576
577 VERBOSE("handling trans2\n");
578
579 inbuf = req->rq_header;
580 data_tot = WVAL(inbuf, smb_tdrcnt);
581 parm_tot = WVAL(inbuf, smb_tprcnt);
582 parm_disp = WVAL(inbuf, smb_prdisp);
583 parm_offset = WVAL(inbuf, smb_proff);
584 parm_count = WVAL(inbuf, smb_prcnt);
585 data_disp = WVAL(inbuf, smb_drdisp);
586 data_offset = WVAL(inbuf, smb_droff);
587 data_count = WVAL(inbuf, smb_drcnt);
588
589 /* Modify offset for the split header/buffer we use */
590 if (data_count || data_offset) {
591 if (unlikely(data_offset < hdrlen))
592 goto out_bad_data;
593 else
594 data_offset -= hdrlen;
595 }
596 if (parm_count || parm_offset) {
597 if (unlikely(parm_offset < hdrlen))
598 goto out_bad_parm;
599 else
600 parm_offset -= hdrlen;
601 }
602
603 if (parm_count == parm_tot && data_count == data_tot) {
604 /*
605 * This packet has all the trans2 data.
606 *
607 * We setup the request so that this will be the common
608 * case. It may be a server error to not return a
609 * response that fits.
610 */
611 VERBOSE("single trans2 response "
612 "dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
613 data_count, parm_count,
614 data_offset, parm_offset);
615 req->rq_ldata = data_count;
616 req->rq_lparm = parm_count;
617 req->rq_data = req->rq_buffer + data_offset;
618 req->rq_parm = req->rq_buffer + parm_offset;
619 if (unlikely(parm_offset + parm_count > req->rq_rlen))
620 goto out_bad_parm;
621 if (unlikely(data_offset + data_count > req->rq_rlen))
622 goto out_bad_data;
623 return 0;
624 }
625
626 VERBOSE("multi trans2 response "
627 "frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
628 req->rq_fragment,
629 data_count, parm_count,
630 data_offset, parm_offset);
631
632 if (!req->rq_fragment) {
633 int buf_len;
634
635 /* We got the first trans2 fragment */
636 req->rq_fragment = 1;
637 req->rq_total_data = data_tot;
638 req->rq_total_parm = parm_tot;
639 req->rq_ldata = 0;
640 req->rq_lparm = 0;
641
642 buf_len = data_tot + parm_tot;
643 if (buf_len > SMB_MAX_PACKET_SIZE)
644 goto out_too_long;
645
646 req->rq_trans2bufsize = buf_len;
647 req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
648 if (!req->rq_trans2buffer)
649 goto out_no_mem;
650
651 req->rq_parm = req->rq_trans2buffer;
652 req->rq_data = req->rq_trans2buffer + parm_tot;
653 } else if (unlikely(req->rq_total_data < data_tot ||
654 req->rq_total_parm < parm_tot))
655 goto out_data_grew;
656
657 if (unlikely(parm_disp + parm_count > req->rq_total_parm ||
658 parm_offset + parm_count > req->rq_rlen))
659 goto out_bad_parm;
660 if (unlikely(data_disp + data_count > req->rq_total_data ||
661 data_offset + data_count > req->rq_rlen))
662 goto out_bad_data;
663
664 inbuf = req->rq_buffer;
665 memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count);
666 memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count);
667
668 req->rq_ldata += data_count;
669 req->rq_lparm += parm_count;
670
671 /*
672 * Check whether we've received all of the data. Note that
673 * we use the packet totals -- total lengths might shrink!
674 */
675 if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) {
676 req->rq_ldata = data_tot;
677 req->rq_lparm = parm_tot;
678 return 0;
679 }
680 return 1;
681
682out_too_long:
683 printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n",
684 data_tot, parm_tot);
685 goto out_EIO;
686out_no_mem:
687 printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n",
688 req->rq_trans2bufsize);
689 req->rq_errno = -ENOMEM;
690 goto out;
691out_data_grew:
692 printk(KERN_ERR "smb_trans2: data/params grew!\n");
693 goto out_EIO;
694out_bad_parm:
695 printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
696 parm_disp, parm_count, parm_tot, parm_offset);
697 goto out_EIO;
698out_bad_data:
699 printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
700 data_disp, data_count, data_tot, data_offset);
701out_EIO:
702 req->rq_errno = -EIO;
703out:
704 return req->rq_errno;
705}
706
707/*
708 * State machine for receiving responses. We handle the fact that we can't
709 * read the full response in one try by having states telling us how much we
710 * have read.
711 *
712 * Must be called with the server lock held (only called from smbiod).
713 *
714 * Return: <0 on error
715 */
716int smb_request_recv(struct smb_sb_info *server)
717{
718 struct smb_request *req = NULL;
719 int result = 0;
720
 /* nothing to read on the socket: leave the state machine untouched */
721 if (smb_recv_available(server) <= 0)
722 return 0;
723
724 VERBOSE("state: %d\n", server->rstate);
 /*
  * Each case either breaks out (error, or its stage is not complete
  * yet) or falls through into the next stage of the same packet.
  */
725 switch (server->rstate) {
726 case SMB_RECV_DROP:
727 result = smb_receive_drop(server);
728 if (result < 0)
729 break;
730 if (server->rstate == SMB_RECV_DROP)
731 break;
732 server->rstate = SMB_RECV_START;
733 /* fallthrough */
734 case SMB_RECV_START:
735 server->smb_read = 0;
736 server->rstate = SMB_RECV_HEADER;
737 /* fallthrough */
738 case SMB_RECV_HEADER:
739 result = smb_receive_header(server);
740 if (result < 0)
741 break;
742 if (server->rstate == SMB_RECV_HEADER)
743 break;
 /* not flagged as a reply: hand it to the caller as a request */
744 if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) {
745 server->rstate = SMB_RECV_REQUEST;
746 break;
747 }
748 if (server->rstate != SMB_RECV_HCOMPLETE)
749 break;
750 /* fallthrough */
751 case SMB_RECV_HCOMPLETE:
752 req = find_request(server, WVAL(server->header, smb_mid));
753 if (!req)
754 break;
755 smb_init_request(server, req);
756 req->rq_rcls = *(req->rq_header + smb_rcls);
757 req->rq_err = WVAL(req->rq_header, smb_err);
758 if (server->rstate != SMB_RECV_PARAM)
759 break;
760 /* fallthrough */
761 case SMB_RECV_PARAM:
 /* req is still NULL when this call starts in this state */
762 if (!req)
763 req = find_request(server,WVAL(server->header,smb_mid));
764 if (!req)
765 break;
766 result = smb_recv_param(server, req);
767 if (result < 0)
768 break;
769 if (server->rstate != SMB_RECV_DATA)
770 break;
771 /* fallthrough */
772 case SMB_RECV_DATA:
773 if (!req)
774 req = find_request(server,WVAL(server->header,smb_mid));
775 if (!req)
776 break;
777 result = smb_recv_data(server, req);
778 if (result < 0)
779 break;
780 break;
781
782 /* We should never be called with any of these states */
783 case SMB_RECV_END:
784 case SMB_RECV_REQUEST:
785 BUG();
786 }
787
788 if (result < 0) {
789 /* We saw an error */
790 return result;
791 }
792
793 if (server->rstate != SMB_RECV_END)
794 return 0;
795
796 result = 0;
797 if (req->rq_trans2_command && req->rq_rcls == SUCCESS)
798 result = smb_recv_trans2(server, req);
799
800 /*
801 * Response completely read. Drop any extra bytes sent by the server.
802 * (Yes, servers sometimes add extra bytes to responses)
803 */
804 VERBOSE("smb_len: %d smb_read: %d\n",
805 server->smb_len, server->smb_read);
806 if (server->smb_read < server->smb_len)
807 smb_receive_drop(server);
808
809 server->rstate = SMB_RECV_START;
810
 /*
  * NOTE(review): when smb_recv_trans2() returns non-zero (1 for "more
  * fragments to come", or a negative error) the request stays on its
  * queue and the waiter is not woken here; this function still returns 0.
  */
811 if (!result) {
812 list_del_init(&req->rq_queue);
813 req->rq_flags |= SMB_REQ_RECEIVED;
814 smb_rput(req);
815 wake_up_interruptible(&req->rq_wait);
816 }
817 return 0;
818}
diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h
deleted file mode 100644
index efb21451e7c9..000000000000
--- a/fs/smbfs/request.h
+++ /dev/null
@@ -1,70 +0,0 @@
1#include <linux/list.h>
2#include <linux/types.h>
3#include <linux/uio.h>
4#include <linux/wait.h>
5
/*
 * State of a single SMB request and its reply.  rq_queue links the
 * request into a server's xmitq or recvq, rq_count is the reference
 * count, rq_wait is where the issuing process sleeps for the reply, and
 * rq_flags holds the SMB_REQ_* bits.
 */
6struct smb_request {
7 struct list_head rq_queue; /* recvq or xmitq for the server */
8
9 atomic_t rq_count;
10
11 wait_queue_head_t rq_wait;
12 int rq_flags;
13 int rq_mid; /* multiplex ID, set by request.c */
14
15 struct smb_sb_info *rq_server;
16
17 /* header + word count + parameter words + byte count */
18 unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2];
19
 /* rq_buffer holds rq_bufsize bytes; NULL when allocated with bufsize <= 0 */
20 int rq_bufsize;
21 unsigned char *rq_buffer;
22
23 /* FIXME: this is not good enough for merging IO requests. */
24 unsigned char *rq_page;
25 int rq_rsize;
26
27 int rq_resp_wct;
28 int rq_resp_bcc;
29
 /* receive progress: bytes wanted for the current stage / bytes received */
30 int rq_rlen;
31 int rq_bytes_recvd;
32
 /* send progress: total bytes to send / bytes sent */
33 int rq_slen;
34 int rq_bytes_sent;
35
36 int rq_iovlen;
37 struct kvec rq_iov[4];
38
 /* optional hooks invoked once the reply's parameter words have been read */
39 int (*rq_setup_read) (struct smb_request *);
40 void (*rq_callback) (struct smb_request *);
41
42 /* ------ trans2 stuff ------ */
43
44 u16 rq_trans2_command; /* 0 if not a trans2 request */
45 unsigned int rq_ldata;
46 unsigned char *rq_data;
47 unsigned int rq_lparm;
48 unsigned char *rq_parm;
49
 /* reassembly state for a trans2 reply that arrives in several fragments */
50 int rq_fragment;
51 u32 rq_total_data;
52 u32 rq_total_parm;
53 int rq_trans2bufsize;
54 unsigned char *rq_trans2buffer;
55
56 /* ------ response ------ */
57
58 unsigned short rq_rcls;
59 unsigned short rq_err;
60 int rq_errno;
61};
62
/* Bit values for smb_request.rq_flags. */
63#define SMB_REQ_STATIC 0x0001 /* rq_buffer is static */
64#define SMB_REQ_NORETRY 0x0002 /* request is invalid after retry */
65
66#define SMB_REQ_TRANSMITTED 0x4000 /* all data has been sent */
67#define SMB_REQ_RECEIVED 0x8000 /* reply received, smbiod is done */
68
/* NOTE(review): the x-prefixed names below look like disabled/unused flags - confirm */
69#define xSMB_REQ_NOREPLY 0x0004 /* we don't want the reply (if any) */
70#define xSMB_REQ_NORECEIVER 0x0008 /* caller doesn't wait for response */
diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h
deleted file mode 100644
index fc4b1a5dd755..000000000000
--- a/fs/smbfs/smb_debug.h
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * Defines some debug macros for smbfs.
3 */
4
5/* This makes a dentry parent/child name pair. Useful for debugging printk's */
/* Expands to TWO comma-separated arguments (parent name, own name), so the
   format string needs two matching conversions. */
6#define DENTRY_PATH(dentry) \
7 (dentry)->d_parent->d_name.name,(dentry)->d_name.name
8
/*
 * PARANOIA, VERBOSE and DEBUG1 all take a printk-style format plus
 * arguments and prefix the message with the name of the calling function.
 * Each one compiles to an empty statement unless its controlling symbol
 * (SMBFS_PARANOIA, SMBFS_DEBUG_VERBOSE, SMBFS_DEBUG) is defined.
 */
9/*
10 * safety checks that should never happen ???
11 * these are normally enabled.
12 */
13#ifdef SMBFS_PARANOIA
14# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a)
15#else
16# define PARANOIA(f, a...) do { ; } while(0)
17#endif
18
19/* lots of debug messages */
20#ifdef SMBFS_DEBUG_VERBOSE
21# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
22#else
23# define VERBOSE(f, a...) do { ; } while(0)
24#endif
25
26/*
27 * "normal" debug messages, but not with a normal DEBUG define ... way
28 * too common name.
29 */
30#ifdef SMBFS_DEBUG
31#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
32#else
33#define DEBUG1(f, a...) do { ; } while(0)
34#endif
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
deleted file mode 100644
index 0e39a924f10a..000000000000
--- a/fs/smbfs/smbiod.c
+++ /dev/null
@@ -1,344 +0,0 @@
1/*
2 * smbiod.c
3 *
4 * Copyright (C) 2000, Charles Loep / Corel Corp.
5 * Copyright (C) 2001, Urban Widmark
6 */
7
8
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/mm.h>
12#include <linux/string.h>
13#include <linux/stat.h>
14#include <linux/errno.h>
15#include <linux/init.h>
16#include <linux/file.h>
17#include <linux/dcache.h>
18#include <linux/module.h>
19#include <linux/net.h>
20#include <linux/kthread.h>
21#include <net/ip.h>
22
23#include <linux/smb_fs.h>
24#include <linux/smbno.h>
25#include <linux/smb_mount.h>
26
27#include <asm/system.h>
28#include <asm/uaccess.h>
29
30#include "smb_debug.h"
31#include "request.h"
32#include "proto.h"
33
/* Life cycle of the single smbiod kernel thread. */
34enum smbiod_state {
35 SMBIOD_DEAD,
36 SMBIOD_STARTING,
37 SMBIOD_RUNNING,
38};
39
40static enum smbiod_state smbiod_state = SMBIOD_DEAD;
41static struct task_struct *smbiod_thread;
/* smbiod sleeps here until SMBIOD_DATA_READY is set in smbiod_flags */
42static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
/* registered servers; the list is modified with servers_lock held */
43static LIST_HEAD(smb_servers);
44static DEFINE_SPINLOCK(servers_lock);
45
/*
 * NOTE(review): this value (1) is handed to set_bit/test_bit/clear_bit as a
 * bit NUMBER, so the bit actually used is bit 1, not bit 0.  All users in
 * this file do the same, so they agree with each other.
 */
46#define SMBIOD_DATA_READY (1<<0)
47static unsigned long smbiod_flags;
48
49static int smbiod(void *);
50static int smbiod_start(void);
51
52/*
53 * called when there's work for us to do
54 */
55void smbiod_wake_up(void)
56{
 /* no thread to wake; smbiod_state is read here without servers_lock */
57 if (smbiod_state == SMBIOD_DEAD)
58 return;
 /* flag the work first, then wake the thread that waits for the flag */
59 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
60 wake_up_interruptible(&smbiod_wait);
61}
62
63/*
64 * start smbiod if none is running
65 */
/*
 * Entered with servers_lock held (smbiod_register_server() takes it before
 * calling).  The lock is dropped around kthread_run() and re-taken before
 * returning, so the caller still holds it afterwards.
 *
 * A module reference is taken for the thread and dropped again if the
 * thread could not be created.  Returns 0 if a thread already exists or
 * was started, otherwise the error from kthread_run().
 */
66static int smbiod_start(void)
67{
68 struct task_struct *tsk;
69 int err = 0;
70
71 if (smbiod_state != SMBIOD_DEAD)
72 return 0;
73 smbiod_state = SMBIOD_STARTING;
74 __module_get(THIS_MODULE);
75 spin_unlock(&servers_lock);
76 tsk = kthread_run(smbiod, NULL, "smbiod");
77 if (IS_ERR(tsk)) {
78 err = PTR_ERR(tsk);
79 module_put(THIS_MODULE);
80 }
81
82 spin_lock(&servers_lock);
 /*
  * NOTE(review): the state is written unconditionally here; if the new
  * thread already ran and set SMBIOD_DEAD while the lock was dropped,
  * that would be overwritten with SMBIOD_RUNNING - confirm this cannot
  * happen in practice.
  */
83 if (err < 0) {
84 smbiod_state = SMBIOD_DEAD;
85 smbiod_thread = NULL;
86 } else {
87 smbiod_state = SMBIOD_RUNNING;
88 smbiod_thread = tsk;
89 }
90 return err;
91}
92
93/*
94 * register a server & start smbiod if necessary
95 */
96int smbiod_register_server(struct smb_sb_info *server)
97{
98 int ret;
99 spin_lock(&servers_lock);
100 list_add(&server->entry, &smb_servers);
101 VERBOSE("%p\n", server);
102 ret = smbiod_start();
103 spin_unlock(&servers_lock);
104 return ret;
105}
106
107/*
108 * Unregister a server
109 * Must be called with the server lock held.
110 */
111void smbiod_unregister_server(struct smb_sb_info *server)
112{
 /* take the server off the list that smbiod walks */
113 spin_lock(&servers_lock);
114 list_del_init(&server->entry);
115 VERBOSE("%p\n", server);
116 spin_unlock(&servers_lock);
117
 /* let smbiod look at the list again, then fail whatever is still queued */
118 smbiod_wake_up();
119 smbiod_flush(server);
120}
121
122void smbiod_flush(struct smb_sb_info *server)
123{
124 struct list_head *tmp, *n;
125 struct smb_request *req;
126
127 list_for_each_safe(tmp, n, &server->xmitq) {
128 req = list_entry(tmp, struct smb_request, rq_queue);
129 req->rq_errno = -EIO;
130 list_del_init(&req->rq_queue);
131 smb_rput(req);
132 wake_up_interruptible(&req->rq_wait);
133 }
134 list_for_each_safe(tmp, n, &server->recvq) {
135 req = list_entry(tmp, struct smb_request, rq_queue);
136 req->rq_errno = -EIO;
137 list_del_init(&req->rq_queue);
138 smb_rput(req);
139 wake_up_interruptible(&req->rq_wait);
140 }
141}
142
143/*
144 * Wake up smbmount and make it reconnect to the server.
145 * This must be called with the server locked.
146 *
147 * FIXME: add smbconnect version to this
148 */
/*
 * Returns 0, or the error from kill_pid() if the connection process could
 * not be signalled.  Does nothing when the connection is already valid or
 * a retry is already in progress.
 */
149int smbiod_retry(struct smb_sb_info *server)
150{
151 struct list_head *head;
152 struct smb_request *req;
 /* reference on the connection process' pid; released at "out" */
153 struct pid *pid = get_pid(server->conn_pid);
154 int result = 0;
155
156 VERBOSE("state: %d\n", server->state);
157 if (server->state == CONN_VALID || server->state == CONN_RETRYING)
158 goto out;
159
160 smb_invalidate_inodes(server);
161
162 /*
163 * Some requests are meaningless after a retry, so we abort them.
164 * One example are all requests using 'fileid' since the files are
165 * closed on retry.
166 */
167 head = server->xmitq.next;
168 while (head != &server->xmitq) {
169 req = list_entry(head, struct smb_request, rq_queue);
 /* advance before req may be unlinked below */
170 head = head->next;
171
 /* whatever stays on xmitq is sent again from the beginning */
172 req->rq_bytes_sent = 0;
173 if (req->rq_flags & SMB_REQ_NORETRY) {
174 VERBOSE("aborting request %p on xmitq\n", req);
175 req->rq_errno = -EIO;
176 list_del_init(&req->rq_queue);
177 smb_rput(req);
178 wake_up_interruptible(&req->rq_wait);
179 }
180 }
181
182 /*
183 * FIXME: test the code for retrying request we already sent
184 */
 /* with the #if 0 block disabled, every request on recvq is aborted */
185 head = server->recvq.next;
186 while (head != &server->recvq) {
187 req = list_entry(head, struct smb_request, rq_queue);
188 head = head->next;
189#if 0
190 if (req->rq_flags & SMB_REQ_RETRY) {
191 /* must move the request to the xmitq */
192 VERBOSE("retrying request %p on recvq\n", req);
193 list_move(&req->rq_queue, &server->xmitq);
194 continue;
195 }
196#endif
197
198 VERBOSE("aborting request %p on recvq\n", req);
199 /* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */
200 req->rq_errno = -EIO;
201 list_del_init(&req->rq_queue);
202 smb_rput(req);
203 wake_up_interruptible(&req->rq_wait);
204 }
205
206 smb_close_socket(server);
207
208 if (!pid) {
209 /* FIXME: this is fatal, umount? */
210 printk(KERN_ERR "smb_retry: no connection process\n");
211 server->state = CONN_RETRIED;
212 goto out;
213 }
214
215 /*
216 * Change state so that only one retry per server will be started.
217 */
218 server->state = CONN_RETRYING;
219
220 /*
221 * Note: use the "priv" flag, as a user process may need to reconnect.
222 */
223 result = kill_pid(pid, SIGUSR1, 1);
224 if (result) {
225 /* FIXME: this is most likely fatal, umount? */
226 printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
227 goto out;
228 }
229 VERBOSE("signalled pid %d\n", pid_nr(pid));
230
231 /* FIXME: The retried requests should perhaps get a "time boost". */
232
233out:
234 put_pid(pid);
235 return result;
236}
237
238/*
239 * Currently handles lockingX packets.
240 */
/* Requests sent by the server are not processed: the receive state is set
   to SMB_RECV_DROP so that the packet is discarded. */
241static void smbiod_handle_request(struct smb_sb_info *server)
242{
243 PARANOIA("smbiod got a request ... and we don't implement oplocks!\n");
244 server->rstate = SMB_RECV_DROP;
245}
246
247/*
248 * Do some IO for one server.
249 */
250static void smbiod_doio(struct smb_sb_info *server)
251{
252 int result;
253 int maxwork = 7;
254
255 if (server->state != CONN_VALID)
256 goto out;
257
258 do {
259 result = smb_request_recv(server);
260 if (result < 0) {
261 server->state = CONN_INVALID;
262 smbiod_retry(server);
263 goto out; /* reconnecting is slow */
264 } else if (server->rstate == SMB_RECV_REQUEST)
265 smbiod_handle_request(server);
266 } while (result > 0 && maxwork-- > 0);
267
268 /*
269 * If there is more to read then we want to be sure to wake up again.
270 */
271 if (server->state != CONN_VALID)
272 goto out;
273 if (smb_recv_available(server) > 0)
274 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
275
276 do {
277 result = smb_request_send_server(server);
278 if (result < 0) {
279 server->state = CONN_INVALID;
280 smbiod_retry(server);
281 goto out; /* reconnecting is slow */
282 }
283 } while (result > 0);
284
285 /*
286 * If the last request was not sent out we want to wake up again.
287 */
288 if (!list_empty(&server->xmitq))
289 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
290
291out:
292 return;
293}
294
295/*
296 * smbiod kernel thread
297 */
/*
 * Thread body: sleep until SMBIOD_DATA_READY is set, then do IO for every
 * registered server whose connection is valid.  The thread marks itself
 * SMBIOD_DEAD and exits when a signal is pending or no servers are left,
 * dropping a module reference on the way out.
 */
298static int smbiod(void *unused)
299{
300 VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
301
302 for (;;) {
303 struct smb_sb_info *server;
304 struct list_head *pos, *n;
305
306 /* FIXME: Use poll? */
307 wait_event_interruptible(smbiod_wait,
308 test_bit(SMBIOD_DATA_READY, &smbiod_flags));
309 if (signal_pending(current)) {
310 spin_lock(&servers_lock);
311 smbiod_state = SMBIOD_DEAD;
312 spin_unlock(&servers_lock);
313 break;
314 }
315
 /* cleared before the scan, so work flagged during the scan causes another pass */
316 clear_bit(SMBIOD_DATA_READY, &smbiod_flags);
317
318 spin_lock(&servers_lock);
319 if (list_empty(&smb_servers)) {
320 smbiod_state = SMBIOD_DEAD;
321 spin_unlock(&servers_lock);
322 break;
323 }
324
325 list_for_each_safe(pos, n, &smb_servers) {
326 server = list_entry(pos, struct smb_sb_info, entry);
327 VERBOSE("checking server %p\n", server);
328
329 if (server->state == CONN_VALID) {
 /*
  * The spinlock is dropped while IO is done under the server lock.
  * NOTE(review): 'n' was sampled before the lock was dropped - confirm
  * the list cannot change in a way that invalidates it.
  */
330 spin_unlock(&servers_lock);
331
332 smb_lock_server(server);
333 smbiod_doio(server);
334 smb_unlock_server(server);
335
336 spin_lock(&servers_lock);
337 }
338 }
339 spin_unlock(&servers_lock);
340 }
341
342 VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid);
343 module_put_and_exit(0);
344}
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
deleted file mode 100644
index e37fe4deebd0..000000000000
--- a/fs/smbfs/sock.c
+++ /dev/null
@@ -1,386 +0,0 @@
1/*
2 * sock.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/errno.h>
13#include <linux/socket.h>
14#include <linux/fcntl.h>
15#include <linux/file.h>
16#include <linux/in.h>
17#include <linux/net.h>
18#include <linux/mm.h>
19#include <linux/netdevice.h>
20#include <linux/workqueue.h>
21#include <net/scm.h>
22#include <net/tcp_states.h>
23#include <net/ip.h>
24
25#include <linux/smb_fs.h>
26#include <linux/smb.h>
27#include <linux/smbno.h>
28
29#include <asm/uaccess.h>
30#include <asm/ioctls.h>
31
32#include "smb_debug.h"
33#include "proto.h"
34#include "request.h"
35
36
37static int
38_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags)
39{
40 struct kvec iov = {ubuf, size};
41 struct msghdr msg = {.msg_flags = flags};
42 msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL;
43 return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags);
44}
45
46/*
47 * Return the server this socket belongs to
48 */
49static struct smb_sb_info *
50server_from_socket(struct socket *socket)
51{
52 return socket->sk->sk_user_data;
53}
54
55/*
56 * Called when there is data on the socket.
57 */
58void
59smb_data_ready(struct sock *sk, int len)
60{
61 struct smb_sb_info *server = server_from_socket(sk->sk_socket);
62 void (*data_ready)(struct sock *, int) = server->data_ready;
63
64 data_ready(sk, len);
65 VERBOSE("(%p, %d)\n", sk, len);
66 smbiod_wake_up();
67}
68
69int
70smb_valid_socket(struct inode * inode)
71{
72 return (inode && S_ISSOCK(inode->i_mode) &&
73 SOCKET_I(inode)->type == SOCK_STREAM);
74}
75
76static struct socket *
77server_sock(struct smb_sb_info *server)
78{
79 struct file *file;
80
81 if (server && (file = server->sock_file))
82 {
83#ifdef SMBFS_PARANOIA
84 if (!smb_valid_socket(file->f_path.dentry->d_inode))
85 PARANOIA("bad socket!\n");
86#endif
87 return SOCKET_I(file->f_path.dentry->d_inode);
88 }
89 return NULL;
90}
91
92void
93smb_close_socket(struct smb_sb_info *server)
94{
95 struct file * file = server->sock_file;
96
97 if (file) {
98 struct socket *sock = server_sock(server);
99
100 VERBOSE("closing socket %p\n", sock);
101 sock->sk->sk_data_ready = server->data_ready;
102 server->sock_file = NULL;
103 fput(file);
104 }
105}
106
107static int
108smb_get_length(struct socket *socket, unsigned char *header)
109{
110 int result;
111
112 result = _recvfrom(socket, header, 4, MSG_PEEK);
113 if (result == -EAGAIN)
114 return -ENODATA;
115 if (result < 0) {
116 PARANOIA("recv error = %d\n", -result);
117 return result;
118 }
119 if (result < 4)
120 return -ENODATA;
121
122 switch (header[0]) {
123 case 0x00:
124 case 0x82:
125 break;
126
127 case 0x85:
128 DEBUG1("Got SESSION KEEP ALIVE\n");
129 _recvfrom(socket, header, 4, 0); /* read away */
130 return -ENODATA;
131
132 default:
133 PARANOIA("Invalid NBT packet, code=%x\n", header[0]);
134 return -EIO;
135 }
136
137 /* The length in the RFC NB header is the raw data length */
138 return smb_len(header);
139}
140
141int
142smb_recv_available(struct smb_sb_info *server)
143{
144 mm_segment_t oldfs;
145 int avail, err;
146 struct socket *sock = server_sock(server);
147
148 oldfs = get_fs();
149 set_fs(get_ds());
150 err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail);
151 set_fs(oldfs);
152 return (err >= 0) ? avail : err;
153}
154
155/*
156 * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc)
157 */
158static int
159smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount)
160{
161 struct kvec *iv = *data;
162 int i;
163 int len;
164
165 /*
166 * Eat any sent kvecs
167 */
168 while (iv->iov_len <= amount) {
169 amount -= iv->iov_len;
170 iv++;
171 (*num)--;
172 }
173
174 /*
175 * And chew down the partial one
176 */
177 vec[0].iov_len = iv->iov_len-amount;
178 vec[0].iov_base =((unsigned char *)iv->iov_base)+amount;
179 iv++;
180
181 len = vec[0].iov_len;
182
183 /*
184 * And copy any others
185 */
186 for (i = 1; i < *num; i++) {
187 vec[i] = *iv++;
188 len += vec[i].iov_len;
189 }
190
191 *data = vec;
192 return len;
193}
194
195/*
196 * smb_receive_header
197 * Only called by the smbiod thread.
198 */
199int
200smb_receive_header(struct smb_sb_info *server)
201{
202 struct socket *sock;
203 int result = 0;
204 unsigned char peek_buf[4];
205
206 result = -EIO;
207 sock = server_sock(server);
208 if (!sock)
209 goto out;
210 if (sock->sk->sk_state != TCP_ESTABLISHED)
211 goto out;
212
213 if (!server->smb_read) {
214 result = smb_get_length(sock, peek_buf);
215 if (result < 0) {
216 if (result == -ENODATA)
217 result = 0;
218 goto out;
219 }
220 server->smb_len = result + 4;
221
222 if (server->smb_len < SMB_HEADER_LEN) {
223 PARANOIA("short packet: %d\n", result);
224 server->rstate = SMB_RECV_DROP;
225 result = -EIO;
226 goto out;
227 }
228 if (server->smb_len > SMB_MAX_PACKET_SIZE) {
229 PARANOIA("long packet: %d\n", result);
230 server->rstate = SMB_RECV_DROP;
231 result = -EIO;
232 goto out;
233 }
234 }
235
236 result = _recvfrom(sock, server->header + server->smb_read,
237 SMB_HEADER_LEN - server->smb_read, 0);
238 VERBOSE("_recvfrom: %d\n", result);
239 if (result < 0) {
240 VERBOSE("receive error: %d\n", result);
241 goto out;
242 }
243 server->smb_read += result;
244
245 if (server->smb_read == SMB_HEADER_LEN)
246 server->rstate = SMB_RECV_HCOMPLETE;
247out:
248 return result;
249}
250
251static char drop_buffer[PAGE_SIZE];
252
253/*
254 * smb_receive_drop - read and throw away the data
255 * Only called by the smbiod thread.
256 *
257 * FIXME: we are in the kernel, could we just tell the socket that we want
258 * to drop stuff from the buffer?
259 */
260int
261smb_receive_drop(struct smb_sb_info *server)
262{
263 struct socket *sock;
264 unsigned int flags;
265 struct kvec iov;
266 struct msghdr msg;
267 int rlen = smb_len(server->header) - server->smb_read + 4;
268 int result = -EIO;
269
270 if (rlen > PAGE_SIZE)
271 rlen = PAGE_SIZE;
272
273 sock = server_sock(server);
274 if (!sock)
275 goto out;
276 if (sock->sk->sk_state != TCP_ESTABLISHED)
277 goto out;
278
279 flags = MSG_DONTWAIT | MSG_NOSIGNAL;
280 iov.iov_base = drop_buffer;
281 iov.iov_len = PAGE_SIZE;
282 msg.msg_flags = flags;
283 msg.msg_name = NULL;
284 msg.msg_namelen = 0;
285 msg.msg_control = NULL;
286
287 result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags);
288
289 VERBOSE("read: %d\n", result);
290 if (result < 0) {
291 VERBOSE("receive error: %d\n", result);
292 goto out;
293 }
294 server->smb_read += result;
295
296 if (server->smb_read >= server->smb_len)
297 server->rstate = SMB_RECV_END;
298
299out:
300 return result;
301}
302
303/*
304 * smb_receive
305 * Only called by the smbiod thread.
306 */
307int
308smb_receive(struct smb_sb_info *server, struct smb_request *req)
309{
310 struct socket *sock;
311 unsigned int flags;
312 struct kvec iov[4];
313 struct kvec *p = req->rq_iov;
314 size_t num = req->rq_iovlen;
315 struct msghdr msg;
316 int rlen;
317 int result = -EIO;
318
319 sock = server_sock(server);
320 if (!sock)
321 goto out;
322 if (sock->sk->sk_state != TCP_ESTABLISHED)
323 goto out;
324
325 flags = MSG_DONTWAIT | MSG_NOSIGNAL;
326 msg.msg_flags = flags;
327 msg.msg_name = NULL;
328 msg.msg_namelen = 0;
329 msg.msg_control = NULL;
330
331 /* Dont repeat bytes and count available bufferspace */
332 rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd),
333 (req->rq_rlen - req->rq_bytes_recvd));
334
335 result = kernel_recvmsg(sock, &msg, p, num, rlen, flags);
336
337 VERBOSE("read: %d\n", result);
338 if (result < 0) {
339 VERBOSE("receive error: %d\n", result);
340 goto out;
341 }
342 req->rq_bytes_recvd += result;
343 server->smb_read += result;
344
345out:
346 return result;
347}
348
349/*
350 * Try to send a SMB request. This may return after sending only parts of the
351 * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent.
352 *
353 * Parts of this was taken from xprt_sendmsg from net/sunrpc/xprt.c
354 */
355int
356smb_send_request(struct smb_request *req)
357{
358 struct smb_sb_info *server = req->rq_server;
359 struct socket *sock;
360 struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
361 int slen = req->rq_slen - req->rq_bytes_sent;
362 int result = -EIO;
363 struct kvec iov[4];
364 struct kvec *p = req->rq_iov;
365 size_t num = req->rq_iovlen;
366
367 sock = server_sock(server);
368 if (!sock)
369 goto out;
370 if (sock->sk->sk_state != TCP_ESTABLISHED)
371 goto out;
372
373 /* Dont repeat bytes */
374 if (req->rq_bytes_sent)
375 smb_move_iov(&p, &num, iov, req->rq_bytes_sent);
376
377 result = kernel_sendmsg(sock, &msg, p, num, slen);
378
379 if (result >= 0) {
380 req->rq_bytes_sent += result;
381 if (req->rq_bytes_sent >= req->rq_slen)
382 req->rq_flags |= SMB_REQ_TRANSMITTED;
383 }
384out:
385 return result;
386}
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
deleted file mode 100644
index 00b2909bd469..000000000000
--- a/fs/smbfs/symlink.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * symlink.c
3 *
4 * Copyright (C) 2002 by John Newbigin
5 *
6 * Please add a note about your changes to smbfs in the ChangeLog file.
7 */
8
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/fcntl.h>
12#include <linux/stat.h>
13#include <linux/mm.h>
14#include <linux/slab.h>
15#include <linux/pagemap.h>
16#include <linux/net.h>
17#include <linux/namei.h>
18
19#include <asm/uaccess.h>
20#include <asm/system.h>
21
22#include <linux/smbno.h>
23#include <linux/smb_fs.h>
24
25#include "smb_debug.h"
26#include "proto.h"
27
28int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname)
29{
30 DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry));
31
32 return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname);
33}
34
35static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd)
36{
37 char *link = __getname();
38 DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry));
39
40 if (!link) {
41 link = ERR_PTR(-ENOMEM);
42 } else {
43 int len = smb_proc_read_link(server_from_dentry(dentry),
44 dentry, link, PATH_MAX - 1);
45 if (len < 0) {
46 __putname(link);
47 link = ERR_PTR(len);
48 } else {
49 link[len] = 0;
50 }
51 }
52 nd_set_link(nd, link);
53 return NULL;
54}
55
56static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
57{
58 char *s = nd_get_link(nd);
59 if (!IS_ERR(s))
60 __putname(s);
61}
62
63const struct inode_operations smb_link_inode_operations =
64{
65 .readlink = generic_readlink,
66 .follow_link = smb_follow_link,
67 .put_link = smb_put_link,
68};
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 12b933ac6585..0dc340aa2be9 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -230,5 +230,6 @@ failed_read:
230 230
231const struct file_operations squashfs_dir_ops = { 231const struct file_operations squashfs_dir_ops = {
232 .read = generic_read_dir, 232 .read = generic_read_dir,
233 .readdir = squashfs_readdir 233 .readdir = squashfs_readdir,
234 .llseek = default_llseek,
234}; 235};
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 88b4f8606652..24de30ba34c1 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,7 +30,6 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/mutex.h> 33#include <linux/mutex.h>
35#include <linux/pagemap.h> 34#include <linux/pagemap.h>
36#include <linux/init.h> 35#include <linux/init.h>
@@ -354,8 +353,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
354 353
355static void squashfs_put_super(struct super_block *sb) 354static void squashfs_put_super(struct super_block *sb)
356{ 355{
357 lock_kernel();
358
359 if (sb->s_fs_info) { 356 if (sb->s_fs_info) {
360 struct squashfs_sb_info *sbi = sb->s_fs_info; 357 struct squashfs_sb_info *sbi = sb->s_fs_info;
361 squashfs_cache_delete(sbi->block_cache); 358 squashfs_cache_delete(sbi->block_cache);
@@ -370,17 +367,13 @@ static void squashfs_put_super(struct super_block *sb)
370 kfree(sb->s_fs_info); 367 kfree(sb->s_fs_info);
371 sb->s_fs_info = NULL; 368 sb->s_fs_info = NULL;
372 } 369 }
373
374 unlock_kernel();
375} 370}
376 371
377 372
378static int squashfs_get_sb(struct file_system_type *fs_type, int flags, 373static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags,
379 const char *dev_name, void *data, 374 const char *dev_name, void *data)
380 struct vfsmount *mnt)
381{ 375{
382 return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super, 376 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
383 mnt);
384} 377}
385 378
386 379
@@ -456,7 +449,7 @@ static void squashfs_destroy_inode(struct inode *inode)
456static struct file_system_type squashfs_fs_type = { 449static struct file_system_type squashfs_fs_type = {
457 .owner = THIS_MODULE, 450 .owner = THIS_MODULE,
458 .name = "squashfs", 451 .name = "squashfs",
459 .get_sb = squashfs_get_sb, 452 .mount = squashfs_mount,
460 .kill_sb = kill_block_super, 453 .kill_sb = kill_block_super,
461 .fs_flags = FS_REQUIRES_DEV 454 .fs_flags = FS_REQUIRES_DEV
462}; 455};
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 652b8541f9c6..3876c36699a1 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index,
158 strncmp(target, name, name_size) == 0) { 158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */ 159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) { 160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr; 161 __le64 xattr_val;
162 u64 xattr;
162 /* val is a reference to the real location */ 163 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start, 164 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val)); 165 &offset, sizeof(val));
165 if (err < 0) 166 if (err < 0)
166 goto failed; 167 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start, 168 err = squashfs_read_metadata(sb, &xattr_val,
168 &offset, sizeof(xattr)); 169 &start, &offset, sizeof(xattr_val));
169 if (err < 0) 170 if (err < 0)
170 goto failed; 171 goto failed;
171 xattr = le64_to_cpu(xattr); 172 xattr = le64_to_cpu(xattr_val);
172 start = SQUASHFS_XATTR_BLK(xattr) + 173 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table; 174 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr); 175 offset = SQUASHFS_XATTR_OFFSET(xattr);
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 49fe0d719fbf..b634efce4bde 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -25,7 +25,7 @@
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, 25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *); 26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, 27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 int *, unsigned long long *); 28 unsigned int *, unsigned long long *);
29#else 29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, 30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids) 31 u64 start, u64 *xattr_table_start, int *xattr_ids)
@@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
35} 35}
36 36
37static inline int squashfs_xattr_lookup(struct super_block *sb, 37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, int *size, 38 unsigned int index, int *count, unsigned int *size,
39 unsigned long long *xattr) 39 unsigned long long *xattr)
40{ 40{
41 return 0; 41 return 0;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index cfb41106098f..d33be5dd6c32 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -34,6 +34,7 @@
34#include "squashfs_fs_sb.h" 34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h" 35#include "squashfs_fs_i.h"
36#include "squashfs.h" 36#include "squashfs.h"
37#include "xattr.h"
37 38
38/* 39/*
39 * Map xattr id using the xattr id look up table 40 * Map xattr id using the xattr id look up table
diff --git a/fs/super.c b/fs/super.c
index 8819e3a7ff20..ca696155cd9a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -273,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb)
273 get_fs_excl(); 273 get_fs_excl();
274 sb->s_flags &= ~MS_ACTIVE; 274 sb->s_flags &= ~MS_ACTIVE;
275 275
276 /* bad name - it should be evict_inodes() */ 276 fsnotify_unmount_inodes(&sb->s_inodes);
277 invalidate_inodes(sb); 277
278 evict_inodes(sb);
278 279
279 if (sop->put_super) 280 if (sop->put_super)
280 sop->put_super(sb); 281 sop->put_super(sb);
281 282
282 /* Forget any remaining inodes */ 283 if (!list_empty(&sb->s_inodes)) {
283 if (invalidate_inodes(sb)) {
284 printk("VFS: Busy inodes after unmount of %s. " 284 printk("VFS: Busy inodes after unmount of %s. "
285 "Self-destruct in 5 seconds. Have a nice day...\n", 285 "Self-destruct in 5 seconds. Have a nice day...\n",
286 sb->s_id); 286 sb->s_id);
@@ -715,15 +715,14 @@ static int ns_set_super(struct super_block *sb, void *data)
715 return set_anon_super(sb, NULL); 715 return set_anon_super(sb, NULL);
716} 716}
717 717
718int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, 718struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
719 int (*fill_super)(struct super_block *, void *, int), 719 void *data, int (*fill_super)(struct super_block *, void *, int))
720 struct vfsmount *mnt)
721{ 720{
722 struct super_block *sb; 721 struct super_block *sb;
723 722
724 sb = sget(fs_type, ns_test_super, ns_set_super, data); 723 sb = sget(fs_type, ns_test_super, ns_set_super, data);
725 if (IS_ERR(sb)) 724 if (IS_ERR(sb))
726 return PTR_ERR(sb); 725 return ERR_CAST(sb);
727 726
728 if (!sb->s_root) { 727 if (!sb->s_root) {
729 int err; 728 int err;
@@ -731,17 +730,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
731 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); 730 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
732 if (err) { 731 if (err) {
733 deactivate_locked_super(sb); 732 deactivate_locked_super(sb);
734 return err; 733 return ERR_PTR(err);
735 } 734 }
736 735
737 sb->s_flags |= MS_ACTIVE; 736 sb->s_flags |= MS_ACTIVE;
738 } 737 }
739 738
740 simple_set_mnt(mnt, sb); 739 return dget(sb->s_root);
741 return 0;
742} 740}
743 741
744EXPORT_SYMBOL(get_sb_ns); 742EXPORT_SYMBOL(mount_ns);
745 743
746#ifdef CONFIG_BLOCK 744#ifdef CONFIG_BLOCK
747static int set_bdev_super(struct super_block *s, void *data) 745static int set_bdev_super(struct super_block *s, void *data)
@@ -762,10 +760,9 @@ static int test_bdev_super(struct super_block *s, void *data)
762 return (void *)s->s_bdev == data; 760 return (void *)s->s_bdev == data;
763} 761}
764 762
765int get_sb_bdev(struct file_system_type *fs_type, 763struct dentry *mount_bdev(struct file_system_type *fs_type,
766 int flags, const char *dev_name, void *data, 764 int flags, const char *dev_name, void *data,
767 int (*fill_super)(struct super_block *, void *, int), 765 int (*fill_super)(struct super_block *, void *, int))
768 struct vfsmount *mnt)
769{ 766{
770 struct block_device *bdev; 767 struct block_device *bdev;
771 struct super_block *s; 768 struct super_block *s;
@@ -777,7 +774,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
777 774
778 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 775 bdev = open_bdev_exclusive(dev_name, mode, fs_type);
779 if (IS_ERR(bdev)) 776 if (IS_ERR(bdev))
780 return PTR_ERR(bdev); 777 return ERR_CAST(bdev);
781 778
782 /* 779 /*
783 * once the super is inserted into the list by sget, s_umount 780 * once the super is inserted into the list by sget, s_umount
@@ -829,15 +826,30 @@ int get_sb_bdev(struct file_system_type *fs_type,
829 bdev->bd_super = s; 826 bdev->bd_super = s;
830 } 827 }
831 828
832 simple_set_mnt(mnt, s); 829 return dget(s->s_root);
833 return 0;
834 830
835error_s: 831error_s:
836 error = PTR_ERR(s); 832 error = PTR_ERR(s);
837error_bdev: 833error_bdev:
838 close_bdev_exclusive(bdev, mode); 834 close_bdev_exclusive(bdev, mode);
839error: 835error:
840 return error; 836 return ERR_PTR(error);
837}
838EXPORT_SYMBOL(mount_bdev);
839
840int get_sb_bdev(struct file_system_type *fs_type,
841 int flags, const char *dev_name, void *data,
842 int (*fill_super)(struct super_block *, void *, int),
843 struct vfsmount *mnt)
844{
845 struct dentry *root;
846
847 root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
848 if (IS_ERR(root))
849 return PTR_ERR(root);
850 mnt->mnt_root = root;
851 mnt->mnt_sb = root->d_sb;
852 return 0;
841} 853}
842 854
843EXPORT_SYMBOL(get_sb_bdev); 855EXPORT_SYMBOL(get_sb_bdev);
@@ -856,29 +868,42 @@ void kill_block_super(struct super_block *sb)
856EXPORT_SYMBOL(kill_block_super); 868EXPORT_SYMBOL(kill_block_super);
857#endif 869#endif
858 870
859int get_sb_nodev(struct file_system_type *fs_type, 871struct dentry *mount_nodev(struct file_system_type *fs_type,
860 int flags, void *data, 872 int flags, void *data,
861 int (*fill_super)(struct super_block *, void *, int), 873 int (*fill_super)(struct super_block *, void *, int))
862 struct vfsmount *mnt)
863{ 874{
864 int error; 875 int error;
865 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 876 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
866 877
867 if (IS_ERR(s)) 878 if (IS_ERR(s))
868 return PTR_ERR(s); 879 return ERR_CAST(s);
869 880
870 s->s_flags = flags; 881 s->s_flags = flags;
871 882
872 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 883 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
873 if (error) { 884 if (error) {
874 deactivate_locked_super(s); 885 deactivate_locked_super(s);
875 return error; 886 return ERR_PTR(error);
876 } 887 }
877 s->s_flags |= MS_ACTIVE; 888 s->s_flags |= MS_ACTIVE;
878 simple_set_mnt(mnt, s); 889 return dget(s->s_root);
879 return 0;
880} 890}
891EXPORT_SYMBOL(mount_nodev);
892
893int get_sb_nodev(struct file_system_type *fs_type,
894 int flags, void *data,
895 int (*fill_super)(struct super_block *, void *, int),
896 struct vfsmount *mnt)
897{
898 struct dentry *root;
881 899
900 root = mount_nodev(fs_type, flags, data, fill_super);
901 if (IS_ERR(root))
902 return PTR_ERR(root);
903 mnt->mnt_root = root;
904 mnt->mnt_sb = root->d_sb;
905 return 0;
906}
882EXPORT_SYMBOL(get_sb_nodev); 907EXPORT_SYMBOL(get_sb_nodev);
883 908
884static int compare_single(struct super_block *s, void *p) 909static int compare_single(struct super_block *s, void *p)
@@ -886,29 +911,42 @@ static int compare_single(struct super_block *s, void *p)
886 return 1; 911 return 1;
887} 912}
888 913
889int get_sb_single(struct file_system_type *fs_type, 914struct dentry *mount_single(struct file_system_type *fs_type,
890 int flags, void *data, 915 int flags, void *data,
891 int (*fill_super)(struct super_block *, void *, int), 916 int (*fill_super)(struct super_block *, void *, int))
892 struct vfsmount *mnt)
893{ 917{
894 struct super_block *s; 918 struct super_block *s;
895 int error; 919 int error;
896 920
897 s = sget(fs_type, compare_single, set_anon_super, NULL); 921 s = sget(fs_type, compare_single, set_anon_super, NULL);
898 if (IS_ERR(s)) 922 if (IS_ERR(s))
899 return PTR_ERR(s); 923 return ERR_CAST(s);
900 if (!s->s_root) { 924 if (!s->s_root) {
901 s->s_flags = flags; 925 s->s_flags = flags;
902 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 926 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
903 if (error) { 927 if (error) {
904 deactivate_locked_super(s); 928 deactivate_locked_super(s);
905 return error; 929 return ERR_PTR(error);
906 } 930 }
907 s->s_flags |= MS_ACTIVE; 931 s->s_flags |= MS_ACTIVE;
908 } else { 932 } else {
909 do_remount_sb(s, flags, data, 0); 933 do_remount_sb(s, flags, data, 0);
910 } 934 }
911 simple_set_mnt(mnt, s); 935 return dget(s->s_root);
936}
937EXPORT_SYMBOL(mount_single);
938
939int get_sb_single(struct file_system_type *fs_type,
940 int flags, void *data,
941 int (*fill_super)(struct super_block *, void *, int),
942 struct vfsmount *mnt)
943{
944 struct dentry *root;
945 root = mount_single(fs_type, flags, data, fill_super);
946 if (IS_ERR(root))
947 return PTR_ERR(root);
948 mnt->mnt_root = root;
949 mnt->mnt_sb = root->d_sb;
912 return 0; 950 return 0;
913} 951}
914 952
@@ -918,6 +956,7 @@ struct vfsmount *
918vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) 956vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
919{ 957{
920 struct vfsmount *mnt; 958 struct vfsmount *mnt;
959 struct dentry *root;
921 char *secdata = NULL; 960 char *secdata = NULL;
922 int error; 961 int error;
923 962
@@ -942,9 +981,19 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
942 goto out_free_secdata; 981 goto out_free_secdata;
943 } 982 }
944 983
945 error = type->get_sb(type, flags, name, data, mnt); 984 if (type->mount) {
946 if (error < 0) 985 root = type->mount(type, flags, name, data);
947 goto out_free_secdata; 986 if (IS_ERR(root)) {
987 error = PTR_ERR(root);
988 goto out_free_secdata;
989 }
990 mnt->mnt_root = root;
991 mnt->mnt_sb = root->d_sb;
992 } else {
993 error = type->get_sb(type, flags, name, data, mnt);
994 if (error < 0)
995 goto out_free_secdata;
996 }
948 BUG_ON(!mnt->mnt_sb); 997 BUG_ON(!mnt->mnt_sb);
949 WARN_ON(!mnt->mnt_sb->s_bdi); 998 WARN_ON(!mnt->mnt_sb->s_bdi);
950 mnt->mnt_sb->s_flags |= MS_BORN; 999 mnt->mnt_sb->s_flags |= MS_BORN;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 4e321f7353fa..a4759833d62d 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -179,30 +179,14 @@ static void bin_vma_open(struct vm_area_struct *vma)
179 struct bin_buffer *bb = file->private_data; 179 struct bin_buffer *bb = file->private_data;
180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
181 181
182 if (!bb->vm_ops || !bb->vm_ops->open) 182 if (!bb->vm_ops)
183 return;
184
185 if (!sysfs_get_active(attr_sd))
186 return;
187
188 bb->vm_ops->open(vma);
189
190 sysfs_put_active(attr_sd);
191}
192
193static void bin_vma_close(struct vm_area_struct *vma)
194{
195 struct file *file = vma->vm_file;
196 struct bin_buffer *bb = file->private_data;
197 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
198
199 if (!bb->vm_ops || !bb->vm_ops->close)
200 return; 183 return;
201 184
202 if (!sysfs_get_active(attr_sd)) 185 if (!sysfs_get_active(attr_sd))
203 return; 186 return;
204 187
205 bb->vm_ops->close(vma); 188 if (bb->vm_ops->open)
189 bb->vm_ops->open(vma);
206 190
207 sysfs_put_active(attr_sd); 191 sysfs_put_active(attr_sd);
208} 192}
@@ -214,13 +198,15 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
214 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 198 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
215 int ret; 199 int ret;
216 200
217 if (!bb->vm_ops || !bb->vm_ops->fault) 201 if (!bb->vm_ops)
218 return VM_FAULT_SIGBUS; 202 return VM_FAULT_SIGBUS;
219 203
220 if (!sysfs_get_active(attr_sd)) 204 if (!sysfs_get_active(attr_sd))
221 return VM_FAULT_SIGBUS; 205 return VM_FAULT_SIGBUS;
222 206
223 ret = bb->vm_ops->fault(vma, vmf); 207 ret = VM_FAULT_SIGBUS;
208 if (bb->vm_ops->fault)
209 ret = bb->vm_ops->fault(vma, vmf);
224 210
225 sysfs_put_active(attr_sd); 211 sysfs_put_active(attr_sd);
226 return ret; 212 return ret;
@@ -236,13 +222,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
236 if (!bb->vm_ops) 222 if (!bb->vm_ops)
237 return VM_FAULT_SIGBUS; 223 return VM_FAULT_SIGBUS;
238 224
239 if (!bb->vm_ops->page_mkwrite)
240 return 0;
241
242 if (!sysfs_get_active(attr_sd)) 225 if (!sysfs_get_active(attr_sd))
243 return VM_FAULT_SIGBUS; 226 return VM_FAULT_SIGBUS;
244 227
245 ret = bb->vm_ops->page_mkwrite(vma, vmf); 228 ret = 0;
229 if (bb->vm_ops->page_mkwrite)
230 ret = bb->vm_ops->page_mkwrite(vma, vmf);
246 231
247 sysfs_put_active(attr_sd); 232 sysfs_put_active(attr_sd);
248 return ret; 233 return ret;
@@ -256,13 +241,15 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
256 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 241 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
257 int ret; 242 int ret;
258 243
259 if (!bb->vm_ops || !bb->vm_ops->access) 244 if (!bb->vm_ops)
260 return -EINVAL; 245 return -EINVAL;
261 246
262 if (!sysfs_get_active(attr_sd)) 247 if (!sysfs_get_active(attr_sd))
263 return -EINVAL; 248 return -EINVAL;
264 249
265 ret = bb->vm_ops->access(vma, addr, buf, len, write); 250 ret = -EINVAL;
251 if (bb->vm_ops->access)
252 ret = bb->vm_ops->access(vma, addr, buf, len, write);
266 253
267 sysfs_put_active(attr_sd); 254 sysfs_put_active(attr_sd);
268 return ret; 255 return ret;
@@ -276,13 +263,15 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
276 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 263 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
277 int ret; 264 int ret;
278 265
279 if (!bb->vm_ops || !bb->vm_ops->set_policy) 266 if (!bb->vm_ops)
280 return 0; 267 return 0;
281 268
282 if (!sysfs_get_active(attr_sd)) 269 if (!sysfs_get_active(attr_sd))
283 return -EINVAL; 270 return -EINVAL;
284 271
285 ret = bb->vm_ops->set_policy(vma, new); 272 ret = 0;
273 if (bb->vm_ops->set_policy)
274 ret = bb->vm_ops->set_policy(vma, new);
286 275
287 sysfs_put_active(attr_sd); 276 sysfs_put_active(attr_sd);
288 return ret; 277 return ret;
@@ -296,13 +285,15 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
296 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 285 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
297 struct mempolicy *pol; 286 struct mempolicy *pol;
298 287
299 if (!bb->vm_ops || !bb->vm_ops->get_policy) 288 if (!bb->vm_ops)
300 return vma->vm_policy; 289 return vma->vm_policy;
301 290
302 if (!sysfs_get_active(attr_sd)) 291 if (!sysfs_get_active(attr_sd))
303 return vma->vm_policy; 292 return vma->vm_policy;
304 293
305 pol = bb->vm_ops->get_policy(vma, addr); 294 pol = vma->vm_policy;
295 if (bb->vm_ops->get_policy)
296 pol = bb->vm_ops->get_policy(vma, addr);
306 297
307 sysfs_put_active(attr_sd); 298 sysfs_put_active(attr_sd);
308 return pol; 299 return pol;
@@ -316,13 +307,15 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
316 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 307 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
317 int ret; 308 int ret;
318 309
319 if (!bb->vm_ops || !bb->vm_ops->migrate) 310 if (!bb->vm_ops)
320 return 0; 311 return 0;
321 312
322 if (!sysfs_get_active(attr_sd)) 313 if (!sysfs_get_active(attr_sd))
323 return 0; 314 return 0;
324 315
325 ret = bb->vm_ops->migrate(vma, from, to, flags); 316 ret = 0;
317 if (bb->vm_ops->migrate)
318 ret = bb->vm_ops->migrate(vma, from, to, flags);
326 319
327 sysfs_put_active(attr_sd); 320 sysfs_put_active(attr_sd);
328 return ret; 321 return ret;
@@ -331,7 +324,6 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
331 324
332static const struct vm_operations_struct bin_vm_ops = { 325static const struct vm_operations_struct bin_vm_ops = {
333 .open = bin_vma_open, 326 .open = bin_vma_open,
334 .close = bin_vma_close,
335 .fault = bin_fault, 327 .fault = bin_fault,
336 .page_mkwrite = bin_page_mkwrite, 328 .page_mkwrite = bin_page_mkwrite,
337 .access = bin_access, 329 .access = bin_access,
@@ -377,6 +369,14 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
377 if (bb->mmapped && bb->vm_ops != vma->vm_ops) 369 if (bb->mmapped && bb->vm_ops != vma->vm_ops)
378 goto out_put; 370 goto out_put;
379 371
372 /*
373 * It is not possible to successfully wrap close.
374 * So error if someone is trying to use close.
375 */
376 rc = -EINVAL;
377 if (vma->vm_ops && vma->vm_ops->close)
378 goto out_put;
379
380 rc = 0; 380 rc = 0;
381 bb->mmapped = 1; 381 bb->mmapped = 1;
382 bb->vm_ops = vma->vm_ops; 382 bb->vm_ops = vma->vm_ops;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f2af22574c50..266895783b47 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,7 +23,7 @@
23#include "sysfs.h" 23#include "sysfs.h"
24 24
25 25
26static struct vfsmount *sysfs_mount; 26static struct vfsmount *sysfs_mnt;
27struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
28 28
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
@@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data)
95 return error; 95 return error;
96} 96}
97 97
98static int sysfs_get_sb(struct file_system_type *fs_type, 98static struct dentry *sysfs_mount(struct file_system_type *fs_type,
99 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data)
100{ 100{
101 struct sysfs_super_info *info; 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type; 102 enum kobj_ns_type type;
103 struct super_block *sb; 103 struct super_block *sb;
104 int error; 104 int error;
105 105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL); 106 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info) 107 if (!info)
109 goto out; 108 return ERR_PTR(-ENOMEM);
110 109
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) 110 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type); 111 info->ns[type] = kobj_ns_current(type);
@@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); 113 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info) 114 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info); 115 kfree(info);
117 if (IS_ERR(sb)) { 116 if (IS_ERR(sb))
118 error = PTR_ERR(sb); 117 return ERR_CAST(sb);
119 goto out;
120 }
121 if (!sb->s_root) { 118 if (!sb->s_root) {
122 sb->s_flags = flags; 119 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); 120 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) { 121 if (error) {
125 deactivate_locked_super(sb); 122 deactivate_locked_super(sb);
126 goto out; 123 return ERR_PTR(error);
127 } 124 }
128 sb->s_flags |= MS_ACTIVE; 125 sb->s_flags |= MS_ACTIVE;
129 } 126 }
130 127
131 simple_set_mnt(mnt, sb); 128 return dget(sb->s_root);
132 error = 0;
133out:
134 return error;
135} 129}
136 130
137static void sysfs_kill_sb(struct super_block *sb) 131static void sysfs_kill_sb(struct super_block *sb)
@@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb)
147 141
148static struct file_system_type sysfs_fs_type = { 142static struct file_system_type sysfs_fs_type = {
149 .name = "sysfs", 143 .name = "sysfs",
150 .get_sb = sysfs_get_sb, 144 .mount = sysfs_mount,
151 .kill_sb = sysfs_kill_sb, 145 .kill_sb = sysfs_kill_sb,
152}; 146};
153 147
@@ -189,11 +183,11 @@ int __init sysfs_init(void)
189 183
190 err = register_filesystem(&sysfs_fs_type); 184 err = register_filesystem(&sysfs_fs_type);
191 if (!err) { 185 if (!err) {
192 sysfs_mount = kern_mount(&sysfs_fs_type); 186 sysfs_mnt = kern_mount(&sysfs_fs_type);
193 if (IS_ERR(sysfs_mount)) { 187 if (IS_ERR(sysfs_mnt)) {
194 printk(KERN_ERR "sysfs: could not mount!\n"); 188 printk(KERN_ERR "sysfs: could not mount!\n");
195 err = PTR_ERR(sysfs_mount); 189 err = PTR_ERR(sysfs_mnt);
196 sysfs_mount = NULL; 190 sysfs_mnt = NULL;
197 unregister_filesystem(&sysfs_fs_type); 191 unregister_filesystem(&sysfs_fs_type);
198 goto out_err; 192 goto out_err;
199 } 193 }
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8d..11e7f7d11cd0 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
126 126
127 inode->i_ctime = CURRENT_TIME_SEC; 127 inode->i_ctime = CURRENT_TIME_SEC;
128 inode_inc_link_count(inode); 128 inode_inc_link_count(inode);
129 atomic_inc(&inode->i_count); 129 ihold(inode);
130 130
131 return add_nondir(dentry, inode); 131 return add_nondir(dentry, inode);
132} 132}
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index a0b0cda6927e..3d9c62be0c10 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -526,23 +526,22 @@ failed:
526 526
527/* Every kernel module contains stuff like this. */ 527/* Every kernel module contains stuff like this. */
528 528
529static int sysv_get_sb(struct file_system_type *fs_type, 529static struct dentry *sysv_mount(struct file_system_type *fs_type,
530 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 530 int flags, const char *dev_name, void *data)
531{ 531{
532 return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super, 532 return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
533 mnt);
534} 533}
535 534
536static int v7_get_sb(struct file_system_type *fs_type, 535static struct dentry *v7_mount(struct file_system_type *fs_type,
537 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 536 int flags, const char *dev_name, void *data)
538{ 537{
539 return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt); 538 return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
540} 539}
541 540
542static struct file_system_type sysv_fs_type = { 541static struct file_system_type sysv_fs_type = {
543 .owner = THIS_MODULE, 542 .owner = THIS_MODULE,
544 .name = "sysv", 543 .name = "sysv",
545 .get_sb = sysv_get_sb, 544 .mount = sysv_mount,
546 .kill_sb = kill_block_super, 545 .kill_sb = kill_block_super,
547 .fs_flags = FS_REQUIRES_DEV, 546 .fs_flags = FS_REQUIRES_DEV,
548}; 547};
@@ -550,7 +549,7 @@ static struct file_system_type sysv_fs_type = {
550static struct file_system_type v7_fs_type = { 549static struct file_system_type v7_fs_type = {
551 .owner = THIS_MODULE, 550 .owner = THIS_MODULE,
552 .name = "v7", 551 .name = "v7",
553 .get_sb = v7_get_sb, 552 .mount = v7_mount,
554 .kill_sb = kill_block_super, 553 .kill_sb = kill_block_super,
555 .fs_flags = FS_REQUIRES_DEV, 554 .fs_flags = FS_REQUIRES_DEV,
556}; 555};
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b86ab8eff79a..8c4fc1425b3e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -144,6 +144,7 @@ static const struct file_operations timerfd_fops = {
144 .release = timerfd_release, 144 .release = timerfd_release,
145 .poll = timerfd_poll, 145 .poll = timerfd_poll,
146 .read = timerfd_read, 146 .read = timerfd_read,
147 .llseek = noop_llseek,
147}; 148};
148 149
149static struct file *timerfd_fget(int fd) 150static struct file *timerfd_fget(int fd)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 37fa7ed062d8..02429d81ca33 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -63,7 +63,9 @@ static int do_commit(struct ubifs_info *c)
63 struct ubifs_lp_stats lst; 63 struct ubifs_lp_stats lst;
64 64
65 dbg_cmt("start"); 65 dbg_cmt("start");
66 if (c->ro_media) { 66 ubifs_assert(!c->ro_media && !c->ro_mount);
67
68 if (c->ro_error) {
67 err = -EROFS; 69 err = -EROFS;
68 goto out_up; 70 goto out_up;
69 } 71 }
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68baa782f..0bee4dbffc31 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2239,6 +2239,162 @@ out_free:
2239 return err; 2239 return err;
2240} 2240}
2241 2241
2242/**
2243 * dbg_check_data_nodes_order - check that list of data nodes is sorted.
2244 * @c: UBIFS file-system description object
2245 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2246 *
2247 * This function returns zero if the list of data nodes is sorted correctly,
2248 * and %-EINVAL if not.
2249 */
2250int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
2251{
2252 struct list_head *cur;
2253 struct ubifs_scan_node *sa, *sb;
2254
2255 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2256 return 0;
2257
2258 for (cur = head->next; cur->next != head; cur = cur->next) {
2259 ino_t inuma, inumb;
2260 uint32_t blka, blkb;
2261
2262 cond_resched();
2263 sa = container_of(cur, struct ubifs_scan_node, list);
2264 sb = container_of(cur->next, struct ubifs_scan_node, list);
2265
2266 if (sa->type != UBIFS_DATA_NODE) {
2267 ubifs_err("bad node type %d", sa->type);
2268 dbg_dump_node(c, sa->node);
2269 return -EINVAL;
2270 }
2271 if (sb->type != UBIFS_DATA_NODE) {
2272 ubifs_err("bad node type %d", sb->type);
2273 dbg_dump_node(c, sb->node);
2274 return -EINVAL;
2275 }
2276
2277 inuma = key_inum(c, &sa->key);
2278 inumb = key_inum(c, &sb->key);
2279
2280 if (inuma < inumb)
2281 continue;
2282 if (inuma > inumb) {
2283 ubifs_err("larger inum %lu goes before inum %lu",
2284 (unsigned long)inuma, (unsigned long)inumb);
2285 goto error_dump;
2286 }
2287
2288 blka = key_block(c, &sa->key);
2289 blkb = key_block(c, &sb->key);
2290
2291 if (blka > blkb) {
2292 ubifs_err("larger block %u goes before %u", blka, blkb);
2293 goto error_dump;
2294 }
2295 if (blka == blkb) {
2296 ubifs_err("two data nodes for the same block");
2297 goto error_dump;
2298 }
2299 }
2300
2301 return 0;
2302
2303error_dump:
2304 dbg_dump_node(c, sa->node);
2305 dbg_dump_node(c, sb->node);
2306 return -EINVAL;
2307}
2308
2309/**
2310 * dbg_check_nondata_nodes_order - check that list of data nodes is sorted.
2311 * @c: UBIFS file-system description object
2312 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2313 *
2314 * This function returns zero if the list of non-data nodes is sorted correctly,
2315 * and %-EINVAL if not.
2316 */
2317int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
2318{
2319 struct list_head *cur;
2320 struct ubifs_scan_node *sa, *sb;
2321
2322 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2323 return 0;
2324
2325 for (cur = head->next; cur->next != head; cur = cur->next) {
2326 ino_t inuma, inumb;
2327 uint32_t hasha, hashb;
2328
2329 cond_resched();
2330 sa = container_of(cur, struct ubifs_scan_node, list);
2331 sb = container_of(cur->next, struct ubifs_scan_node, list);
2332
2333 if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
2334 sa->type != UBIFS_XENT_NODE) {
2335 ubifs_err("bad node type %d", sa->type);
2336 dbg_dump_node(c, sa->node);
2337 return -EINVAL;
2338 }
2339 if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
2340 sa->type != UBIFS_XENT_NODE) {
2341 ubifs_err("bad node type %d", sb->type);
2342 dbg_dump_node(c, sb->node);
2343 return -EINVAL;
2344 }
2345
2346 if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2347 ubifs_err("non-inode node goes before inode node");
2348 goto error_dump;
2349 }
2350
2351 if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
2352 continue;
2353
2354 if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2355 /* Inode nodes are sorted in descending size order */
2356 if (sa->len < sb->len) {
2357 ubifs_err("smaller inode node goes first");
2358 goto error_dump;
2359 }
2360 continue;
2361 }
2362
2363 /*
2364 * This is either a dentry or xentry, which should be sorted in
2365 * ascending (parent ino, hash) order.
2366 */
2367 inuma = key_inum(c, &sa->key);
2368 inumb = key_inum(c, &sb->key);
2369
2370 if (inuma < inumb)
2371 continue;
2372 if (inuma > inumb) {
2373 ubifs_err("larger inum %lu goes before inum %lu",
2374 (unsigned long)inuma, (unsigned long)inumb);
2375 goto error_dump;
2376 }
2377
2378 hasha = key_block(c, &sa->key);
2379 hashb = key_block(c, &sb->key);
2380
2381 if (hasha > hashb) {
2382 ubifs_err("larger hash %u goes before %u", hasha, hashb);
2383 goto error_dump;
2384 }
2385 }
2386
2387 return 0;
2388
2389error_dump:
2390 ubifs_msg("dumping first node");
2391 dbg_dump_node(c, sa->node);
2392 ubifs_msg("dumping second node");
2393 dbg_dump_node(c, sb->node);
2394 return -EINVAL;
2395 return 0;
2396}
2397
2242static int invocation_cnt; 2398static int invocation_cnt;
2243 2399
2244int dbg_force_in_the_gaps(void) 2400int dbg_force_in_the_gaps(void)
@@ -2625,6 +2781,7 @@ static const struct file_operations dfs_fops = {
2625 .open = open_debugfs_file, 2781 .open = open_debugfs_file,
2626 .write = write_debugfs_file, 2782 .write = write_debugfs_file,
2627 .owner = THIS_MODULE, 2783 .owner = THIS_MODULE,
2784 .llseek = default_llseek,
2628}; 2785};
2629 2786
2630/** 2787/**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 29d960101ea6..69ebe4729151 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -324,6 +324,8 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
324 int row, int col); 324 int row, int col);
325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, 325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
326 loff_t size); 326 loff_t size);
327int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
328int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
327 329
328/* Force the use of in-the-gaps method for testing */ 330/* Force the use of in-the-gaps method for testing */
329 331
@@ -465,6 +467,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
465#define dbg_check_lprops(c) 0 467#define dbg_check_lprops(c) 0
466#define dbg_check_lpt_nodes(c, cnode, row, col) 0 468#define dbg_check_lpt_nodes(c, cnode, row, col) 0
467#define dbg_check_inode_size(c, inode, size) 0 469#define dbg_check_inode_size(c, inode, size) 0
470#define dbg_check_data_nodes_order(c, head) 0
471#define dbg_check_nondata_nodes_order(c, head) 0
468#define dbg_force_in_the_gaps_enabled 0 472#define dbg_force_in_the_gaps_enabled 0
469#define dbg_force_in_the_gaps() 0 473#define dbg_force_in_the_gaps() 0
470#define dbg_failure_mode 0 474#define dbg_failure_mode 0
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce72213..14f64b689d7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
550 550
551 lock_2_inodes(dir, inode); 551 lock_2_inodes(dir, inode);
552 inc_nlink(inode); 552 inc_nlink(inode);
553 atomic_inc(&inode->i_count); 553 ihold(inode);
554 inode->i_ctime = ubifs_current_time(inode); 554 inode->i_ctime = ubifs_current_time(inode);
555 dir->i_size += sz_change; 555 dir->i_size += sz_change;
556 dir_ui->ui_size = dir->i_size; 556 dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 03ae894c45de..d77db7e36484 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -433,8 +433,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
433 struct page *page; 433 struct page *page;
434 434
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
436 ubifs_assert(!c->ro_media && !c->ro_mount);
436 437
437 if (unlikely(c->ro_media)) 438 if (unlikely(c->ro_error))
438 return -EROFS; 439 return -EROFS;
439 440
440 /* Try out the fast-path part first */ 441 /* Try out the fast-path part first */
@@ -1439,9 +1440,9 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vm
1439 1440
1440 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, 1441 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
1441 i_size_read(inode)); 1442 i_size_read(inode));
1442 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1443 ubifs_assert(!c->ro_media && !c->ro_mount);
1443 1444
1444 if (unlikely(c->ro_media)) 1445 if (unlikely(c->ro_error))
1445 return VM_FAULT_SIGBUS; /* -EROFS */ 1446 return VM_FAULT_SIGBUS; /* -EROFS */
1446 1447
1447 /* 1448 /*
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 918d1582ca05..151f10882820 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -125,10 +125,16 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
125 struct ubifs_scan_node *sa, *sb; 125 struct ubifs_scan_node *sa, *sb;
126 126
127 cond_resched(); 127 cond_resched();
128 if (a == b)
129 return 0;
130
128 sa = list_entry(a, struct ubifs_scan_node, list); 131 sa = list_entry(a, struct ubifs_scan_node, list);
129 sb = list_entry(b, struct ubifs_scan_node, list); 132 sb = list_entry(b, struct ubifs_scan_node, list);
133
130 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); 134 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
131 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); 135 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
136 ubifs_assert(sa->type == UBIFS_DATA_NODE);
137 ubifs_assert(sb->type == UBIFS_DATA_NODE);
132 138
133 inuma = key_inum(c, &sa->key); 139 inuma = key_inum(c, &sa->key);
134 inumb = key_inum(c, &sb->key); 140 inumb = key_inum(c, &sb->key);
@@ -157,28 +163,40 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
157 */ 163 */
158int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 164int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
159{ 165{
160 int typea, typeb;
161 ino_t inuma, inumb; 166 ino_t inuma, inumb;
162 struct ubifs_info *c = priv; 167 struct ubifs_info *c = priv;
163 struct ubifs_scan_node *sa, *sb; 168 struct ubifs_scan_node *sa, *sb;
164 169
165 cond_resched(); 170 cond_resched();
171 if (a == b)
172 return 0;
173
166 sa = list_entry(a, struct ubifs_scan_node, list); 174 sa = list_entry(a, struct ubifs_scan_node, list);
167 sb = list_entry(b, struct ubifs_scan_node, list); 175 sb = list_entry(b, struct ubifs_scan_node, list);
168 typea = key_type(c, &sa->key); 176
169 typeb = key_type(c, &sb->key); 177 ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY &&
170 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); 178 key_type(c, &sb->key) != UBIFS_DATA_KEY);
179 ubifs_assert(sa->type != UBIFS_DATA_NODE &&
180 sb->type != UBIFS_DATA_NODE);
171 181
172 /* Inodes go before directory entries */ 182 /* Inodes go before directory entries */
173 if (typea == UBIFS_INO_KEY) { 183 if (sa->type == UBIFS_INO_NODE) {
174 if (typeb == UBIFS_INO_KEY) 184 if (sb->type == UBIFS_INO_NODE)
175 return sb->len - sa->len; 185 return sb->len - sa->len;
176 return -1; 186 return -1;
177 } 187 }
178 if (typeb == UBIFS_INO_KEY) 188 if (sb->type == UBIFS_INO_NODE)
179 return 1; 189 return 1;
180 190
181 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); 191 ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY ||
192 key_type(c, &sa->key) == UBIFS_XENT_KEY);
193 ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY ||
194 key_type(c, &sb->key) == UBIFS_XENT_KEY);
195 ubifs_assert(sa->type == UBIFS_DENT_NODE ||
196 sa->type == UBIFS_XENT_NODE);
197 ubifs_assert(sb->type == UBIFS_DENT_NODE ||
198 sb->type == UBIFS_XENT_NODE);
199
182 inuma = key_inum(c, &sa->key); 200 inuma = key_inum(c, &sa->key);
183 inumb = key_inum(c, &sb->key); 201 inumb = key_inum(c, &sb->key);
184 202
@@ -224,17 +242,33 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
224static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, 242static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
225 struct list_head *nondata, int *min) 243 struct list_head *nondata, int *min)
226{ 244{
245 int err;
227 struct ubifs_scan_node *snod, *tmp; 246 struct ubifs_scan_node *snod, *tmp;
228 247
229 *min = INT_MAX; 248 *min = INT_MAX;
230 249
231 /* Separate data nodes and non-data nodes */ 250 /* Separate data nodes and non-data nodes */
232 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 251 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
233 int err; 252 ubifs_assert(snod->type == UBIFS_INO_NODE ||
253 snod->type == UBIFS_DATA_NODE ||
254 snod->type == UBIFS_DENT_NODE ||
255 snod->type == UBIFS_XENT_NODE ||
256 snod->type == UBIFS_TRUN_NODE);
257
258 if (snod->type != UBIFS_INO_NODE &&
259 snod->type != UBIFS_DATA_NODE &&
260 snod->type != UBIFS_DENT_NODE &&
261 snod->type != UBIFS_XENT_NODE) {
262 /* Probably truncation node, zap it */
263 list_del(&snod->list);
264 kfree(snod);
265 continue;
266 }
234 267
235 ubifs_assert(snod->type != UBIFS_IDX_NODE); 268 ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY ||
236 ubifs_assert(snod->type != UBIFS_REF_NODE); 269 key_type(c, &snod->key) == UBIFS_INO_KEY ||
237 ubifs_assert(snod->type != UBIFS_CS_NODE); 270 key_type(c, &snod->key) == UBIFS_DENT_KEY ||
271 key_type(c, &snod->key) == UBIFS_XENT_KEY);
238 272
239 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 273 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
240 snod->offs, 0); 274 snod->offs, 0);
@@ -258,6 +292,13 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
258 /* Sort data and non-data nodes */ 292 /* Sort data and non-data nodes */
259 list_sort(c, &sleb->nodes, &data_nodes_cmp); 293 list_sort(c, &sleb->nodes, &data_nodes_cmp);
260 list_sort(c, nondata, &nondata_nodes_cmp); 294 list_sort(c, nondata, &nondata_nodes_cmp);
295
296 err = dbg_check_data_nodes_order(c, &sleb->nodes);
297 if (err)
298 return err;
299 err = dbg_check_nondata_nodes_order(c, nondata);
300 if (err)
301 return err;
261 return 0; 302 return 0;
262} 303}
263 304
@@ -575,13 +616,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
575 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 616 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
576 617
577 ubifs_assert_cmt_locked(c); 618 ubifs_assert_cmt_locked(c);
619 ubifs_assert(!c->ro_media && !c->ro_mount);
578 620
579 if (ubifs_gc_should_commit(c)) 621 if (ubifs_gc_should_commit(c))
580 return -EAGAIN; 622 return -EAGAIN;
581 623
582 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 624 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
583 625
584 if (c->ro_media) { 626 if (c->ro_error) {
585 ret = -EROFS; 627 ret = -EROFS;
586 goto out_unlock; 628 goto out_unlock;
587 } 629 }
@@ -677,14 +719,12 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
677 719
678 ret = ubifs_garbage_collect_leb(c, &lp); 720 ret = ubifs_garbage_collect_leb(c, &lp);
679 if (ret < 0) { 721 if (ret < 0) {
680 if (ret == -EAGAIN || ret == -ENOSPC) { 722 if (ret == -EAGAIN) {
681 /* 723 /*
682 * These codes are not errors, so we have to 724 * This is not error, so we have to return the
683 * return the LEB to lprops. But if the 725 * LEB to lprops. But if 'ubifs_return_leb()'
684 * 'ubifs_return_leb()' function fails, its 726 * fails, its failure code is propagated to the
685 * failure code is propagated to the caller 727 * caller instead of the original '-EAGAIN'.
686 * instead of the original '-EAGAIN' or
687 * '-ENOSPC'.
688 */ 728 */
689 err = ubifs_return_leb(c, lp.lnum); 729 err = ubifs_return_leb(c, lp.lnum);
690 if (err) 730 if (err)
@@ -774,8 +814,8 @@ out_unlock:
774out: 814out:
775 ubifs_assert(ret < 0); 815 ubifs_assert(ret < 0);
776 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); 816 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
777 ubifs_ro_mode(c, ret);
778 ubifs_wbuf_sync_nolock(wbuf); 817 ubifs_wbuf_sync_nolock(wbuf);
818 ubifs_ro_mode(c, ret);
779 mutex_unlock(&wbuf->io_mutex); 819 mutex_unlock(&wbuf->io_mutex);
780 ubifs_return_leb(c, lp.lnum); 820 ubifs_return_leb(c, lp.lnum);
781 return ret; 821 return ret;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index bcf5a16f30bb..d82173182eeb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -61,8 +61,8 @@
61 */ 61 */
62void ubifs_ro_mode(struct ubifs_info *c, int err) 62void ubifs_ro_mode(struct ubifs_info *c, int err)
63{ 63{
64 if (!c->ro_media) { 64 if (!c->ro_error) {
65 c->ro_media = 1; 65 c->ro_error = 1;
66 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY; 67 c->vfs_sb->s_flags |= MS_RDONLY;
68 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
@@ -356,11 +356,11 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
356 356
357 dbg_io("LEB %d:%d, %d bytes, jhead %s", 357 dbg_io("LEB %d:%d, %d bytes, jhead %s",
358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); 358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
359 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
360 ubifs_assert(!(wbuf->avail & 7)); 359 ubifs_assert(!(wbuf->avail & 7));
361 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 360 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
361 ubifs_assert(!c->ro_media && !c->ro_mount);
362 362
363 if (c->ro_media) 363 if (c->ro_error)
364 return -EROFS; 364 return -EROFS;
365 365
366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); 366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
@@ -440,11 +440,12 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
440{ 440{
441 int err, i; 441 int err, i;
442 442
443 ubifs_assert(!c->ro_media && !c->ro_mount);
443 if (!c->need_wbuf_sync) 444 if (!c->need_wbuf_sync)
444 return 0; 445 return 0;
445 c->need_wbuf_sync = 0; 446 c->need_wbuf_sync = 0;
446 447
447 if (c->ro_media) { 448 if (c->ro_error) {
448 err = -EROFS; 449 err = -EROFS;
449 goto out_timers; 450 goto out_timers;
450 } 451 }
@@ -519,6 +520,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
519 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 520 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
520 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); 521 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
521 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 522 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
523 ubifs_assert(!c->ro_media && !c->ro_mount);
522 524
523 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 525 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
524 err = -ENOSPC; 526 err = -ENOSPC;
@@ -527,7 +529,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
527 529
528 cancel_wbuf_timer_nolock(wbuf); 530 cancel_wbuf_timer_nolock(wbuf);
529 531
530 if (c->ro_media) 532 if (c->ro_error)
531 return -EROFS; 533 return -EROFS;
532 534
533 if (aligned_len <= wbuf->avail) { 535 if (aligned_len <= wbuf->avail) {
@@ -663,8 +665,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
663 buf_len); 665 buf_len);
664 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 666 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
665 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); 667 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
668 ubifs_assert(!c->ro_media && !c->ro_mount);
666 669
667 if (c->ro_media) 670 if (c->ro_error)
668 return -EROFS; 671 return -EROFS;
669 672
670 ubifs_prepare_node(c, buf, len, 1); 673 ubifs_prepare_node(c, buf, len, 1);
@@ -815,7 +818,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
815 return 0; 818 return 0;
816 819
817out: 820out:
818 ubifs_err("bad node at LEB %d:%d", lnum, offs); 821 ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
822 ubi_is_mapped(c->ubi, lnum));
819 dbg_dump_node(c, buf); 823 dbg_dump_node(c, buf);
820 dbg_dump_stack(); 824 dbg_dump_stack();
821 return -EINVAL; 825 return -EINVAL;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d321baeca68d..914f1bd89e57 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -122,11 +122,12 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len)
122 * better to try to allocate space at the ends of eraseblocks. This is 122 * better to try to allocate space at the ends of eraseblocks. This is
123 * what the squeeze parameter does. 123 * what the squeeze parameter does.
124 */ 124 */
125 ubifs_assert(!c->ro_media && !c->ro_mount);
125 squeeze = (jhead == BASEHD); 126 squeeze = (jhead == BASEHD);
126again: 127again:
127 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 128 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
128 129
129 if (c->ro_media) { 130 if (c->ro_error) {
130 err = -EROFS; 131 err = -EROFS;
131 goto out_unlock; 132 goto out_unlock;
132 } 133 }
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 0f530c684f0b..92a8491a8f8c 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -306,6 +306,20 @@ static inline void trun_key_init(const struct ubifs_info *c,
306} 306}
307 307
308/** 308/**
309 * invalid_key_init - initialize invalid node key.
310 * @c: UBIFS file-system description object
311 * @key: key to initialize
312 *
313 * This is a helper function which marks a @key object as invalid.
314 */
315static inline void invalid_key_init(const struct ubifs_info *c,
316 union ubifs_key *key)
317{
318 key->u32[0] = 0xDEADBEAF;
319 key->u32[1] = UBIFS_INVALID_KEY;
320}
321
322/**
309 * key_type - get key type. 323 * key_type - get key type.
310 * @c: UBIFS file-system description object 324 * @c: UBIFS file-system description object
311 * @key: key to get type of 325 * @key: key to get type of
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c345e125f42c..4d0cb1241460 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -159,7 +159,7 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
159 jhead = &c->jheads[bud->jhead]; 159 jhead = &c->jheads[bud->jhead];
160 list_add_tail(&bud->list, &jhead->buds_list); 160 list_add_tail(&bud->list, &jhead->buds_list);
161 } else 161 } else
162 ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY)); 162 ubifs_assert(c->replaying && c->ro_mount);
163 163
164 /* 164 /*
165 * Note, although this is a new bud, we anyway account this space now, 165 * Note, although this is a new bud, we anyway account this space now,
@@ -223,8 +223,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
223 } 223 }
224 224
225 mutex_lock(&c->log_mutex); 225 mutex_lock(&c->log_mutex);
226 226 ubifs_assert(!c->ro_media && !c->ro_mount);
227 if (c->ro_media) { 227 if (c->ro_error) {
228 err = -EROFS; 228 err = -EROFS;
229 goto out_unlock; 229 goto out_unlock;
230 } 230 }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 0084a33c4c69..72775d35b99e 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1363,6 +1363,7 @@ static int read_lsave(struct ubifs_info *c)
1363 goto out; 1363 goto out;
1364 for (i = 0; i < c->lsave_cnt; i++) { 1364 for (i = 0; i < c->lsave_cnt; i++) {
1365 int lnum = c->lsave[i]; 1365 int lnum = c->lsave[i];
1366 struct ubifs_lprops *lprops;
1366 1367
1367 /* 1368 /*
1368 * Due to automatic resizing, the values in the lsave table 1369 * Due to automatic resizing, the values in the lsave table
@@ -1370,7 +1371,11 @@ static int read_lsave(struct ubifs_info *c)
1370 */ 1371 */
1371 if (lnum >= c->leb_cnt) 1372 if (lnum >= c->leb_cnt)
1372 continue; 1373 continue;
1373 ubifs_lpt_lookup(c, lnum); 1374 lprops = ubifs_lpt_lookup(c, lnum);
1375 if (IS_ERR(lprops)) {
1376 err = PTR_ERR(lprops);
1377 goto out;
1378 }
1374 } 1379 }
1375out: 1380out:
1376 vfree(buf); 1381 vfree(buf);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index d12535b7fc78..5c90dec5db0b 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -705,6 +705,9 @@ static int make_tree_dirty(struct ubifs_info *c)
705 struct ubifs_pnode *pnode; 705 struct ubifs_pnode *pnode;
706 706
707 pnode = pnode_lookup(c, 0); 707 pnode = pnode_lookup(c, 0);
708 if (IS_ERR(pnode))
709 return PTR_ERR(pnode);
710
708 while (pnode) { 711 while (pnode) {
709 do_make_pnode_dirty(c, pnode); 712 do_make_pnode_dirty(c, pnode);
710 pnode = next_pnode_to_dirty(c, pnode); 713 pnode = next_pnode_to_dirty(c, pnode);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 28beaeedadc0..21f47afdacff 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -361,7 +361,8 @@ int ubifs_write_master(struct ubifs_info *c)
361{ 361{
362 int err, lnum, offs, len; 362 int err, lnum, offs, len;
363 363
364 if (c->ro_media) 364 ubifs_assert(!c->ro_media && !c->ro_mount);
365 if (c->ro_error)
365 return -EROFS; 366 return -EROFS;
366 367
367 lnum = UBIFS_MST_LNUM; 368 lnum = UBIFS_MST_LNUM;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4fa81d867e41..c3de04dc952a 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -132,7 +132,8 @@ static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
132{ 132{
133 int err; 133 int err;
134 134
135 if (c->ro_media) 135 ubifs_assert(!c->ro_media && !c->ro_mount);
136 if (c->ro_error)
136 return -EROFS; 137 return -EROFS;
137 err = ubi_leb_unmap(c->ubi, lnum); 138 err = ubi_leb_unmap(c->ubi, lnum);
138 if (err) { 139 if (err) {
@@ -159,7 +160,8 @@ static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
159{ 160{
160 int err; 161 int err;
161 162
162 if (c->ro_media) 163 ubifs_assert(!c->ro_media && !c->ro_mount);
164 if (c->ro_error)
163 return -EROFS; 165 return -EROFS;
164 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); 166 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
165 if (err) { 167 if (err) {
@@ -186,7 +188,8 @@ static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
186{ 188{
187 int err; 189 int err;
188 190
189 if (c->ro_media) 191 ubifs_assert(!c->ro_media && !c->ro_mount);
192 if (c->ro_error)
190 return -EROFS; 193 return -EROFS;
191 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); 194 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
192 if (err) { 195 if (err) {
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index daae9e1f5382..77e9b874b6c2 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -292,7 +292,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
292 292
293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); 293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
294 294
295 if ((c->vfs_sb->s_flags & MS_RDONLY)) { 295 if (c->ro_mount) {
296 /* Read-only mode. Keep a copy for switching to rw mode */ 296 /* Read-only mode. Keep a copy for switching to rw mode */
297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); 297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
298 if (!c->rcvrd_mst_node) { 298 if (!c->rcvrd_mst_node) {
@@ -469,7 +469,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
469 endpt = snod->offs + snod->len; 469 endpt = snod->offs + snod->len;
470 } 470 }
471 471
472 if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) { 472 if (c->ro_mount && !c->remounting_rw) {
473 /* Add to recovery list */ 473 /* Add to recovery list */
474 struct ubifs_unclean_leb *ucleb; 474 struct ubifs_unclean_leb *ucleb;
475 475
@@ -772,7 +772,8 @@ out_free:
772 * @sbuf: LEB-sized buffer to use 772 * @sbuf: LEB-sized buffer to use
773 * 773 *
774 * This function does a scan of a LEB, but caters for errors that might have 774 * This function does a scan of a LEB, but caters for errors that might have
775 * been caused by the unclean unmount from which we are attempting to recover. 775 * been caused by unclean reboots from which we are attempting to recover
776 * (assume that only the last log LEB can be corrupted by an unclean reboot).
776 * 777 *
777 * This function returns %0 on success and a negative error code on failure. 778 * This function returns %0 on success and a negative error code on failure.
778 */ 779 */
@@ -883,7 +884,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
883{ 884{
884 int err; 885 int err;
885 886
886 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); 887 ubifs_assert(!c->ro_mount || c->remounting_rw);
887 888
888 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); 889 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
889 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); 890 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
@@ -1461,7 +1462,7 @@ int ubifs_recover_size(struct ubifs_info *c)
1461 } 1462 }
1462 } 1463 }
1463 if (e->exists && e->i_size < e->d_size) { 1464 if (e->exists && e->i_size < e->d_size) {
1464 if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) { 1465 if (!e->inode && c->ro_mount) {
1465 /* Fix the inode size and pin it in memory */ 1466 /* Fix the inode size and pin it in memory */
1466 struct inode *inode; 1467 struct inode *inode;
1467 1468
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5c2d6d759a3e..eed0fcff8d73 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -627,8 +627,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
627 ubifs_assert(sleb->endpt - offs >= used); 627 ubifs_assert(sleb->endpt - offs >= used);
628 ubifs_assert(sleb->endpt % c->min_io_size == 0); 628 ubifs_assert(sleb->endpt % c->min_io_size == 0);
629 629
630 if (sleb->endpt + c->min_io_size <= c->leb_size && 630 if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount)
631 !(c->vfs_sb->s_flags & MS_RDONLY))
632 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, 631 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
633 sleb->endpt, UBI_SHORTTERM); 632 sleb->endpt, UBI_SHORTTERM);
634 633
@@ -840,6 +839,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
840 if (IS_ERR(sleb)) { 839 if (IS_ERR(sleb)) {
841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) 840 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 return PTR_ERR(sleb); 841 return PTR_ERR(sleb);
842 /*
843 * Note, the below function will recover this log LEB only if
844 * it is the last, because unclean reboots can possibly corrupt
845 * only the tail of the log.
846 */
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 847 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
844 if (IS_ERR(sleb)) 848 if (IS_ERR(sleb))
845 return PTR_ERR(sleb); 849 return PTR_ERR(sleb);
@@ -851,7 +855,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
851 } 855 }
852 856
853 node = sleb->buf; 857 node = sleb->buf;
854
855 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); 858 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
856 if (c->cs_sqnum == 0) { 859 if (c->cs_sqnum == 0) {
857 /* 860 /*
@@ -898,7 +901,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
898 } 901 }
899 902
900 list_for_each_entry(snod, &sleb->nodes, list) { 903 list_for_each_entry(snod, &sleb->nodes, list) {
901
902 cond_resched(); 904 cond_resched();
903 905
904 if (snod->sqnum >= SQNUM_WATERMARK) { 906 if (snod->sqnum >= SQNUM_WATERMARK) {
@@ -1011,7 +1013,6 @@ out:
1011int ubifs_replay_journal(struct ubifs_info *c) 1013int ubifs_replay_journal(struct ubifs_info *c)
1012{ 1014{
1013 int err, i, lnum, offs, free; 1015 int err, i, lnum, offs, free;
1014 void *sbuf = NULL;
1015 1016
1016 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); 1017 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
1017 1018
@@ -1026,14 +1027,8 @@ int ubifs_replay_journal(struct ubifs_info *c)
1026 return -EINVAL; 1027 return -EINVAL;
1027 } 1028 }
1028 1029
1029 sbuf = vmalloc(c->leb_size);
1030 if (!sbuf)
1031 return -ENOMEM;
1032
1033 dbg_mnt("start replaying the journal"); 1030 dbg_mnt("start replaying the journal");
1034
1035 c->replaying = 1; 1031 c->replaying = 1;
1036
1037 lnum = c->ltail_lnum = c->lhead_lnum; 1032 lnum = c->ltail_lnum = c->lhead_lnum;
1038 offs = c->lhead_offs; 1033 offs = c->lhead_offs;
1039 1034
@@ -1046,7 +1041,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
1046 lnum = UBIFS_LOG_LNUM; 1041 lnum = UBIFS_LOG_LNUM;
1047 offs = 0; 1042 offs = 0;
1048 } 1043 }
1049 err = replay_log_leb(c, lnum, offs, sbuf); 1044 err = replay_log_leb(c, lnum, offs, c->sbuf);
1050 if (err == 1) 1045 if (err == 1)
1051 /* We hit the end of the log */ 1046 /* We hit the end of the log */
1052 break; 1047 break;
@@ -1079,7 +1074,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
1079out: 1074out:
1080 destroy_replay_tree(c); 1075 destroy_replay_tree(c);
1081 destroy_bud_list(c); 1076 destroy_bud_list(c);
1082 vfree(sbuf);
1083 c->replaying = 0; 1077 c->replaying = 0;
1084 return err; 1078 return err;
1085} 1079}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 96cb62c8a9dd..bf31b4729e51 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -542,11 +542,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
542 * due to the unavailability of time-travelling equipment. 542 * due to the unavailability of time-travelling equipment.
543 */ 543 */
544 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 544 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
545 struct super_block *sb = c->vfs_sb; 545 ubifs_assert(!c->ro_media || c->ro_mount);
546 int mounting_ro = sb->s_flags & MS_RDONLY; 546 if (!c->ro_mount ||
547
548 ubifs_assert(!c->ro_media || mounting_ro);
549 if (!mounting_ro ||
550 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { 547 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
551 ubifs_err("on-flash format version is w%d/r%d, but " 548 ubifs_err("on-flash format version is w%d/r%d, but "
552 "software only supports up to version " 549 "software only supports up to version "
@@ -624,7 +621,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
624 c->old_leb_cnt = c->leb_cnt; 621 c->old_leb_cnt = c->leb_cnt;
625 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { 622 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
626 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); 623 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
627 if (c->vfs_sb->s_flags & MS_RDONLY) 624 if (c->ro_mount)
628 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", 625 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
629 c->old_leb_cnt, c->leb_cnt); 626 c->old_leb_cnt, c->leb_cnt);
630 else { 627 else {
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 96c525384191..3e1ee57dbeaa 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -197,7 +197,7 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
197 struct ubifs_ino_node *ino = buf; 197 struct ubifs_ino_node *ino = buf;
198 struct ubifs_scan_node *snod; 198 struct ubifs_scan_node *snod;
199 199
200 snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); 200 snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
201 if (!snod) 201 if (!snod)
202 return -ENOMEM; 202 return -ENOMEM;
203 203
@@ -212,13 +212,15 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
212 case UBIFS_DENT_NODE: 212 case UBIFS_DENT_NODE:
213 case UBIFS_XENT_NODE: 213 case UBIFS_XENT_NODE:
214 case UBIFS_DATA_NODE: 214 case UBIFS_DATA_NODE:
215 case UBIFS_TRUN_NODE:
216 /* 215 /*
217 * The key is in the same place in all keyed 216 * The key is in the same place in all keyed
218 * nodes. 217 * nodes.
219 */ 218 */
220 key_read(c, &ino->key, &snod->key); 219 key_read(c, &ino->key, &snod->key);
221 break; 220 break;
221 default:
222 invalid_key_init(c, &snod->key);
223 break;
222 } 224 }
223 list_add_tail(&snod->list, &sleb->nodes); 225 list_add_tail(&snod->list, &sleb->nodes);
224 sleb->nodes_cnt += 1; 226 sleb->nodes_cnt += 1;
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 0b201114a5ad..46961c003236 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -250,7 +250,7 @@ static int kick_a_thread(void)
250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); 250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
251 251
252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || 252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
253 c->ro_media) { 253 c->ro_mount || c->ro_error) {
254 mutex_unlock(&c->umount_mutex); 254 mutex_unlock(&c->umount_mutex);
255 continue; 255 continue;
256 } 256 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index cd5900b85d38..91fac54c70e3 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1137,11 +1137,11 @@ static int check_free_space(struct ubifs_info *c)
1137 */ 1137 */
1138static int mount_ubifs(struct ubifs_info *c) 1138static int mount_ubifs(struct ubifs_info *c)
1139{ 1139{
1140 struct super_block *sb = c->vfs_sb; 1140 int err;
1141 int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
1142 long long x; 1141 long long x;
1143 size_t sz; 1142 size_t sz;
1144 1143
1144 c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
1145 err = init_constants_early(c); 1145 err = init_constants_early(c);
1146 if (err) 1146 if (err)
1147 return err; 1147 return err;
@@ -1154,7 +1154,7 @@ static int mount_ubifs(struct ubifs_info *c)
1154 if (err) 1154 if (err)
1155 goto out_free; 1155 goto out_free;
1156 1156
1157 if (c->empty && (mounted_read_only || c->ro_media)) { 1157 if (c->empty && (c->ro_mount || c->ro_media)) {
1158 /* 1158 /*
1159 * This UBI volume is empty, and read-only, or the file system 1159 * This UBI volume is empty, and read-only, or the file system
1160 * is mounted read-only - we cannot format it. 1160 * is mounted read-only - we cannot format it.
@@ -1165,7 +1165,7 @@ static int mount_ubifs(struct ubifs_info *c)
1165 goto out_free; 1165 goto out_free;
1166 } 1166 }
1167 1167
1168 if (c->ro_media && !mounted_read_only) { 1168 if (c->ro_media && !c->ro_mount) {
1169 ubifs_err("cannot mount read-write - read-only media"); 1169 ubifs_err("cannot mount read-write - read-only media");
1170 err = -EROFS; 1170 err = -EROFS;
1171 goto out_free; 1171 goto out_free;
@@ -1185,7 +1185,7 @@ static int mount_ubifs(struct ubifs_info *c)
1185 if (!c->sbuf) 1185 if (!c->sbuf)
1186 goto out_free; 1186 goto out_free;
1187 1187
1188 if (!mounted_read_only) { 1188 if (!c->ro_mount) {
1189 c->ileb_buf = vmalloc(c->leb_size); 1189 c->ileb_buf = vmalloc(c->leb_size);
1190 if (!c->ileb_buf) 1190 if (!c->ileb_buf)
1191 goto out_free; 1191 goto out_free;
@@ -1228,7 +1228,7 @@ static int mount_ubifs(struct ubifs_info *c)
1228 } 1228 }
1229 1229
1230 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); 1230 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
1231 if (!mounted_read_only) { 1231 if (!c->ro_mount) {
1232 err = alloc_wbufs(c); 1232 err = alloc_wbufs(c);
1233 if (err) 1233 if (err)
1234 goto out_cbuf; 1234 goto out_cbuf;
@@ -1254,12 +1254,12 @@ static int mount_ubifs(struct ubifs_info *c)
1254 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1254 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1255 ubifs_msg("recovery needed"); 1255 ubifs_msg("recovery needed");
1256 c->need_recovery = 1; 1256 c->need_recovery = 1;
1257 if (!mounted_read_only) { 1257 if (!c->ro_mount) {
1258 err = ubifs_recover_inl_heads(c, c->sbuf); 1258 err = ubifs_recover_inl_heads(c, c->sbuf);
1259 if (err) 1259 if (err)
1260 goto out_master; 1260 goto out_master;
1261 } 1261 }
1262 } else if (!mounted_read_only) { 1262 } else if (!c->ro_mount) {
1263 /* 1263 /*
1264 * Set the "dirty" flag so that if we reboot uncleanly we 1264 * Set the "dirty" flag so that if we reboot uncleanly we
1265 * will notice this immediately on the next mount. 1265 * will notice this immediately on the next mount.
@@ -1270,7 +1270,7 @@ static int mount_ubifs(struct ubifs_info *c)
1270 goto out_master; 1270 goto out_master;
1271 } 1271 }
1272 1272
1273 err = ubifs_lpt_init(c, 1, !mounted_read_only); 1273 err = ubifs_lpt_init(c, 1, !c->ro_mount);
1274 if (err) 1274 if (err)
1275 goto out_lpt; 1275 goto out_lpt;
1276 1276
@@ -1285,11 +1285,11 @@ static int mount_ubifs(struct ubifs_info *c)
1285 /* Calculate 'min_idx_lebs' after journal replay */ 1285 /* Calculate 'min_idx_lebs' after journal replay */
1286 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 1286 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1287 1287
1288 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); 1288 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
1289 if (err) 1289 if (err)
1290 goto out_orphans; 1290 goto out_orphans;
1291 1291
1292 if (!mounted_read_only) { 1292 if (!c->ro_mount) {
1293 int lnum; 1293 int lnum;
1294 1294
1295 err = check_free_space(c); 1295 err = check_free_space(c);
@@ -1351,7 +1351,7 @@ static int mount_ubifs(struct ubifs_info *c)
1351 spin_unlock(&ubifs_infos_lock); 1351 spin_unlock(&ubifs_infos_lock);
1352 1352
1353 if (c->need_recovery) { 1353 if (c->need_recovery) {
1354 if (mounted_read_only) 1354 if (c->ro_mount)
1355 ubifs_msg("recovery deferred"); 1355 ubifs_msg("recovery deferred");
1356 else { 1356 else {
1357 c->need_recovery = 0; 1357 c->need_recovery = 0;
@@ -1378,7 +1378,7 @@ static int mount_ubifs(struct ubifs_info *c)
1378 1378
1379 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1379 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1380 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1380 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
1381 if (mounted_read_only) 1381 if (c->ro_mount)
1382 ubifs_msg("mounted read-only"); 1382 ubifs_msg("mounted read-only");
1383 x = (long long)c->main_lebs * c->leb_size; 1383 x = (long long)c->main_lebs * c->leb_size;
1384 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " 1384 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d "
@@ -1640,7 +1640,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1640 } 1640 }
1641 1641
1642 dbg_gen("re-mounted read-write"); 1642 dbg_gen("re-mounted read-write");
1643 c->vfs_sb->s_flags &= ~MS_RDONLY; 1643 c->ro_mount = 0;
1644 c->remounting_rw = 0; 1644 c->remounting_rw = 0;
1645 c->always_chk_crc = 0; 1645 c->always_chk_crc = 0;
1646 err = dbg_check_space_info(c); 1646 err = dbg_check_space_info(c);
@@ -1676,7 +1676,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1676 int i, err; 1676 int i, err;
1677 1677
1678 ubifs_assert(!c->need_recovery); 1678 ubifs_assert(!c->need_recovery);
1679 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); 1679 ubifs_assert(!c->ro_mount);
1680 1680
1681 mutex_lock(&c->umount_mutex); 1681 mutex_lock(&c->umount_mutex);
1682 if (c->bgt) { 1682 if (c->bgt) {
@@ -1686,10 +1686,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1686 1686
1687 dbg_save_space_info(c); 1687 dbg_save_space_info(c);
1688 1688
1689 for (i = 0; i < c->jhead_cnt; i++) { 1689 for (i = 0; i < c->jhead_cnt; i++)
1690 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1690 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1691 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1692 }
1693 1691
1694 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1692 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1695 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1693 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
@@ -1704,6 +1702,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1704 vfree(c->ileb_buf); 1702 vfree(c->ileb_buf);
1705 c->ileb_buf = NULL; 1703 c->ileb_buf = NULL;
1706 ubifs_lpt_free(c, 1); 1704 ubifs_lpt_free(c, 1);
1705 c->ro_mount = 1;
1707 err = dbg_check_space_info(c); 1706 err = dbg_check_space_info(c);
1708 if (err) 1707 if (err)
1709 ubifs_ro_mode(c, err); 1708 ubifs_ro_mode(c, err);
@@ -1735,7 +1734,7 @@ static void ubifs_put_super(struct super_block *sb)
1735 * the mutex is locked. 1734 * the mutex is locked.
1736 */ 1735 */
1737 mutex_lock(&c->umount_mutex); 1736 mutex_lock(&c->umount_mutex);
1738 if (!(c->vfs_sb->s_flags & MS_RDONLY)) { 1737 if (!c->ro_mount) {
1739 /* 1738 /*
1740 * First of all kill the background thread to make sure it does 1739 * First of all kill the background thread to make sure it does
1741 * not interfere with un-mounting and freeing resources. 1740 * not interfere with un-mounting and freeing resources.
@@ -1745,23 +1744,22 @@ static void ubifs_put_super(struct super_block *sb)
1745 c->bgt = NULL; 1744 c->bgt = NULL;
1746 } 1745 }
1747 1746
1748 /* Synchronize write-buffers */
1749 if (c->jheads)
1750 for (i = 0; i < c->jhead_cnt; i++)
1751 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1752
1753 /* 1747 /*
1754 * On fatal errors c->ro_media is set to 1, in which case we do 1748 * On fatal errors c->ro_error is set to 1, in which case we do
1755 * not write the master node. 1749 * not write the master node.
1756 */ 1750 */
1757 if (!c->ro_media) { 1751 if (!c->ro_error) {
1752 int err;
1753
1754 /* Synchronize write-buffers */
1755 for (i = 0; i < c->jhead_cnt; i++)
1756 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1757
1758 /* 1758 /*
1759 * We are being cleanly unmounted which means the 1759 * We are being cleanly unmounted which means the
1760 * orphans were killed - indicate this in the master 1760 * orphans were killed - indicate this in the master
1761 * node. Also save the reserved GC LEB number. 1761 * node. Also save the reserved GC LEB number.
1762 */ 1762 */
1763 int err;
1764
1765 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1763 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1766 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1764 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1767 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); 1765 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
@@ -1774,6 +1772,10 @@ static void ubifs_put_super(struct super_block *sb)
1774 */ 1772 */
1775 ubifs_err("failed to write master node, " 1773 ubifs_err("failed to write master node, "
1776 "error %d", err); 1774 "error %d", err);
1775 } else {
1776 for (i = 0; i < c->jhead_cnt; i++)
1777 /* Make sure write-buffer timers are canceled */
1778 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1777 } 1779 }
1778 } 1780 }
1779 1781
@@ -1797,17 +1799,21 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1797 return err; 1799 return err;
1798 } 1800 }
1799 1801
1800 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1802 if (c->ro_mount && !(*flags & MS_RDONLY)) {
1803 if (c->ro_error) {
1804 ubifs_msg("cannot re-mount R/W due to prior errors");
1805 return -EROFS;
1806 }
1801 if (c->ro_media) { 1807 if (c->ro_media) {
1802 ubifs_msg("cannot re-mount due to prior errors"); 1808 ubifs_msg("cannot re-mount R/W - UBI volume is R/O");
1803 return -EROFS; 1809 return -EROFS;
1804 } 1810 }
1805 err = ubifs_remount_rw(c); 1811 err = ubifs_remount_rw(c);
1806 if (err) 1812 if (err)
1807 return err; 1813 return err;
1808 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1814 } else if (!c->ro_mount && (*flags & MS_RDONLY)) {
1809 if (c->ro_media) { 1815 if (c->ro_error) {
1810 ubifs_msg("cannot re-mount due to prior errors"); 1816 ubifs_msg("cannot re-mount R/O due to prior errors");
1811 return -EROFS; 1817 return -EROFS;
1812 } 1818 }
1813 ubifs_remount_ro(c); 1819 ubifs_remount_ro(c);
@@ -2032,8 +2038,8 @@ static int sb_test(struct super_block *sb, void *data)
2032 return c->vi.cdev == *dev; 2038 return c->vi.cdev == *dev;
2033} 2039}
2034 2040
2035static int ubifs_get_sb(struct file_system_type *fs_type, int flags, 2041static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
2036 const char *name, void *data, struct vfsmount *mnt) 2042 const char *name, void *data)
2037{ 2043{
2038 struct ubi_volume_desc *ubi; 2044 struct ubi_volume_desc *ubi;
2039 struct ubi_volume_info vi; 2045 struct ubi_volume_info vi;
@@ -2049,9 +2055,9 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2049 */ 2055 */
2050 ubi = open_ubi(name, UBI_READONLY); 2056 ubi = open_ubi(name, UBI_READONLY);
2051 if (IS_ERR(ubi)) { 2057 if (IS_ERR(ubi)) {
2052 ubifs_err("cannot open \"%s\", error %d", 2058 dbg_err("cannot open \"%s\", error %d",
2053 name, (int)PTR_ERR(ubi)); 2059 name, (int)PTR_ERR(ubi));
2054 return PTR_ERR(ubi); 2060 return ERR_CAST(ubi);
2055 } 2061 }
2056 ubi_get_volume_info(ubi, &vi); 2062 ubi_get_volume_info(ubi, &vi);
2057 2063
@@ -2064,9 +2070,11 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2064 } 2070 }
2065 2071
2066 if (sb->s_root) { 2072 if (sb->s_root) {
2073 struct ubifs_info *c1 = sb->s_fs_info;
2074
2067 /* A new mount point for already mounted UBIFS */ 2075 /* A new mount point for already mounted UBIFS */
2068 dbg_gen("this ubi volume is already mounted"); 2076 dbg_gen("this ubi volume is already mounted");
2069 if ((flags ^ sb->s_flags) & MS_RDONLY) { 2077 if (!!(flags & MS_RDONLY) != c1->ro_mount) {
2070 err = -EBUSY; 2078 err = -EBUSY;
2071 goto out_deact; 2079 goto out_deact;
2072 } 2080 }
@@ -2087,20 +2095,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2087 /* 'fill_super()' opens ubi again so we must close it here */ 2095 /* 'fill_super()' opens ubi again so we must close it here */
2088 ubi_close_volume(ubi); 2096 ubi_close_volume(ubi);
2089 2097
2090 simple_set_mnt(mnt, sb); 2098 return dget(sb->s_root);
2091 return 0;
2092 2099
2093out_deact: 2100out_deact:
2094 deactivate_locked_super(sb); 2101 deactivate_locked_super(sb);
2095out_close: 2102out_close:
2096 ubi_close_volume(ubi); 2103 ubi_close_volume(ubi);
2097 return err; 2104 return ERR_PTR(err);
2098} 2105}
2099 2106
2100static struct file_system_type ubifs_fs_type = { 2107static struct file_system_type ubifs_fs_type = {
2101 .name = "ubifs", 2108 .name = "ubifs",
2102 .owner = THIS_MODULE, 2109 .owner = THIS_MODULE,
2103 .get_sb = ubifs_get_sb, 2110 .mount = ubifs_mount,
2104 .kill_sb = kill_anon_super, 2111 .kill_sb = kill_anon_super,
2105}; 2112};
2106 2113
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2194915220e5..ad9cf0133622 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1177,6 +1177,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1177 unsigned long time = get_seconds(); 1177 unsigned long time = get_seconds();
1178 1178
1179 dbg_tnc("search key %s", DBGKEY(key)); 1179 dbg_tnc("search key %s", DBGKEY(key));
1180 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
1180 1181
1181 znode = c->zroot.znode; 1182 znode = c->zroot.znode;
1182 if (unlikely(!znode)) { 1183 if (unlikely(!znode)) {
@@ -2966,7 +2967,7 @@ static struct ubifs_znode *right_znode(struct ubifs_info *c,
2966 * 2967 *
2967 * This function searches an indexing node by its first key @key and its 2968 * This function searches an indexing node by its first key @key and its
2968 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing 2969 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
2969 * nodes it traverses to TNC. This function is called fro indexing nodes which 2970 * nodes it traverses to TNC. This function is called for indexing nodes which
2970 * were found on the media by scanning, for example when garbage-collecting or 2971 * were found on the media by scanning, for example when garbage-collecting or
2971 * when doing in-the-gaps commit. This means that the indexing node which is 2972 * when doing in-the-gaps commit. This means that the indexing node which is
2972 * looked for does not have to have exactly the same leftmost key @key, because 2973 * looked for does not have to have exactly the same leftmost key @key, because
@@ -2988,6 +2989,8 @@ static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
2988 struct ubifs_znode *znode, *zn; 2989 struct ubifs_znode *znode, *zn;
2989 int n, nn; 2990 int n, nn;
2990 2991
2992 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
2993
2991 /* 2994 /*
2992 * The arguments have probably been read off flash, so don't assume 2995 * The arguments have probably been read off flash, so don't assume
2993 * they are valid. 2996 * they are valid.
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0c9876b396dd..381d6b207a52 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -119,8 +119,12 @@
119 * in TNC. However, when replaying, it is handy to introduce fake "truncation" 119 * in TNC. However, when replaying, it is handy to introduce fake "truncation"
120 * keys for truncation nodes because the code becomes simpler. So we define 120 * keys for truncation nodes because the code becomes simpler. So we define
121 * %UBIFS_TRUN_KEY type. 121 * %UBIFS_TRUN_KEY type.
122 *
123 * But otherwise, out of the journal reply scope, the truncation keys are
124 * invalid.
122 */ 125 */
123#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT 126#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
127#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT
124 128
125/* 129/*
126 * How much a directory entry/extended attribute entry adds to the parent/host 130 * How much a directory entry/extended attribute entry adds to the parent/host
@@ -1028,6 +1032,8 @@ struct ubifs_debug_info;
1028 * @max_leb_cnt: maximum count of logical eraseblocks 1032 * @max_leb_cnt: maximum count of logical eraseblocks
1029 * @old_leb_cnt: count of logical eraseblocks before re-size 1033 * @old_leb_cnt: count of logical eraseblocks before re-size
1030 * @ro_media: the underlying UBI volume is read-only 1034 * @ro_media: the underlying UBI volume is read-only
1035 * @ro_mount: the file-system was mounted as read-only
1036 * @ro_error: UBIFS switched to R/O mode because an error happened
1031 * 1037 *
1032 * @dirty_pg_cnt: number of dirty pages (not used) 1038 * @dirty_pg_cnt: number of dirty pages (not used)
1033 * @dirty_zn_cnt: number of dirty znodes 1039 * @dirty_zn_cnt: number of dirty znodes
@@ -1168,11 +1174,14 @@ struct ubifs_debug_info;
1168 * @replay_sqnum: sequence number of node currently being replayed 1174 * @replay_sqnum: sequence number of node currently being replayed
1169 * @need_recovery: file-system needs recovery 1175 * @need_recovery: file-system needs recovery
1170 * @replaying: set to %1 during journal replay 1176 * @replaying: set to %1 during journal replay
1171 * @unclean_leb_list: LEBs to recover when mounting ro to rw 1177 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
1172 * @rcvrd_mst_node: recovered master node to write when mounting ro to rw 1178 * mode
1179 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
1180 * FS to R/W mode
1173 * @size_tree: inode size information for recovery 1181 * @size_tree: inode size information for recovery
1174 * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) 1182 * @remounting_rw: set while re-mounting from R/O mode to R/W mode
1175 * @always_chk_crc: always check CRCs (while mounting and remounting rw) 1183 * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
1184 * mode)
1176 * @mount_opts: UBIFS-specific mount options 1185 * @mount_opts: UBIFS-specific mount options
1177 * 1186 *
1178 * @dbg: debugging-related information 1187 * @dbg: debugging-related information
@@ -1268,7 +1277,9 @@ struct ubifs_info {
1268 int leb_cnt; 1277 int leb_cnt;
1269 int max_leb_cnt; 1278 int max_leb_cnt;
1270 int old_leb_cnt; 1279 int old_leb_cnt;
1271 int ro_media; 1280 unsigned int ro_media:1;
1281 unsigned int ro_mount:1;
1282 unsigned int ro_error:1;
1272 1283
1273 atomic_long_t dirty_pg_cnt; 1284 atomic_long_t dirty_pg_cnt;
1274 atomic_long_t dirty_zn_cnt; 1285 atomic_long_t dirty_zn_cnt;
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..f8def3c8ea4c 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,5 +1,6 @@
1config UDF_FS 1config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 depends on BKL # needs serious work to remove
3 select CRC_ITU_T 4 select CRC_ITU_T
4 help 5 help
5 This is the new file system used on some CD-ROMs and DVDs. Say Y if 6 This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193c..6d8dc02baebb 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1101 inc_nlink(inode); 1101 inc_nlink(inode);
1102 inode->i_ctime = current_fs_time(inode->i_sb); 1102 inode->i_ctime = current_fs_time(inode->i_sb);
1103 mark_inode_dirty(inode); 1103 mark_inode_dirty(inode);
1104 atomic_inc(&inode->i_count); 1104 ihold(inode);
1105 d_instantiate(dentry, inode); 1105 d_instantiate(dentry, inode);
1106 unlock_kernel(); 1106 unlock_kernel();
1107 1107
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 65412d84a45d..4a5c7c61836a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -107,17 +107,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
107} 107}
108 108
109/* UDF filesystem type */ 109/* UDF filesystem type */
110static int udf_get_sb(struct file_system_type *fs_type, 110static struct dentry *udf_mount(struct file_system_type *fs_type,
111 int flags, const char *dev_name, void *data, 111 int flags, const char *dev_name, void *data)
112 struct vfsmount *mnt)
113{ 112{
114 return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt); 113 return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
115} 114}
116 115
117static struct file_system_type udf_fstype = { 116static struct file_system_type udf_fstype = {
118 .owner = THIS_MODULE, 117 .owner = THIS_MODULE,
119 .name = "udf", 118 .name = "udf",
120 .get_sb = udf_get_sb, 119 .mount = udf_mount,
121 .kill_sb = kill_block_super, 120 .kill_sb = kill_block_super,
122 .fs_flags = FS_REQUIRES_DEV, 121 .fs_flags = FS_REQUIRES_DEV,
123}; 122};
@@ -1880,6 +1879,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1880 struct kernel_lb_addr rootdir, fileset; 1879 struct kernel_lb_addr rootdir, fileset;
1881 struct udf_sb_info *sbi; 1880 struct udf_sb_info *sbi;
1882 1881
1882 lock_kernel();
1883
1883 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1884 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1884 uopt.uid = -1; 1885 uopt.uid = -1;
1885 uopt.gid = -1; 1886 uopt.gid = -1;
@@ -1888,8 +1889,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1888 uopt.dmode = UDF_INVALID_MODE; 1889 uopt.dmode = UDF_INVALID_MODE;
1889 1890
1890 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1891 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1891 if (!sbi) 1892 if (!sbi) {
1893 unlock_kernel();
1892 return -ENOMEM; 1894 return -ENOMEM;
1895 }
1893 1896
1894 sb->s_fs_info = sbi; 1897 sb->s_fs_info = sbi;
1895 1898
@@ -2035,6 +2038,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2035 goto error_out; 2038 goto error_out;
2036 } 2039 }
2037 sb->s_maxbytes = MAX_LFS_FILESIZE; 2040 sb->s_maxbytes = MAX_LFS_FILESIZE;
2041 unlock_kernel();
2038 return 0; 2042 return 0;
2039 2043
2040error_out: 2044error_out:
@@ -2055,6 +2059,7 @@ error_out:
2055 kfree(sbi); 2059 kfree(sbi);
2056 sb->s_fs_info = NULL; 2060 sb->s_fs_info = NULL;
2057 2061
2062 unlock_kernel();
2058 return -EINVAL; 2063 return -EINVAL;
2059} 2064}
2060 2065
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768a..30c8f223253d 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
4 help 5 help
5 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, 6 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
6 OpenBSD and NeXTstep) use a file system called UFS. Some System V 7 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb3..12f39b9e4437 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
180 180
181 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
182 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
183 atomic_inc(&inode->i_count); 183 ihold(inode);
184 184
185 error = ufs_add_nondir(dentry, inode); 185 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 186 unlock_kernel();
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index d510c1b91817..2c47daed56da 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -696,6 +696,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
696 unsigned maxsymlen; 696 unsigned maxsymlen;
697 int ret = -EINVAL; 697 int ret = -EINVAL;
698 698
699 lock_kernel();
700
699 uspi = NULL; 701 uspi = NULL;
700 ubh = NULL; 702 ubh = NULL;
701 flags = 0; 703 flags = 0;
@@ -1163,6 +1165,7 @@ magic_found:
1163 goto failed; 1165 goto failed;
1164 1166
1165 UFSD("EXIT\n"); 1167 UFSD("EXIT\n");
1168 unlock_kernel();
1166 return 0; 1169 return 0;
1167 1170
1168dalloc_failed: 1171dalloc_failed:
@@ -1174,10 +1177,12 @@ failed:
1174 kfree(sbi); 1177 kfree(sbi);
1175 sb->s_fs_info = NULL; 1178 sb->s_fs_info = NULL;
1176 UFSD("EXIT (FAILED)\n"); 1179 UFSD("EXIT (FAILED)\n");
1180 unlock_kernel();
1177 return ret; 1181 return ret;
1178 1182
1179failed_nomem: 1183failed_nomem:
1180 UFSD("EXIT (NOMEM)\n"); 1184 UFSD("EXIT (NOMEM)\n");
1185 unlock_kernel();
1181 return -ENOMEM; 1186 return -ENOMEM;
1182} 1187}
1183 1188
@@ -1449,16 +1454,16 @@ static const struct super_operations ufs_super_ops = {
1449 .show_options = ufs_show_options, 1454 .show_options = ufs_show_options,
1450}; 1455};
1451 1456
1452static int ufs_get_sb(struct file_system_type *fs_type, 1457static struct dentry *ufs_mount(struct file_system_type *fs_type,
1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1458 int flags, const char *dev_name, void *data)
1454{ 1459{
1455 return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt); 1460 return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
1456} 1461}
1457 1462
1458static struct file_system_type ufs_fs_type = { 1463static struct file_system_type ufs_fs_type = {
1459 .owner = THIS_MODULE, 1464 .owner = THIS_MODULE,
1460 .name = "ufs", 1465 .name = "ufs",
1461 .get_sb = ufs_get_sb, 1466 .mount = ufs_mount,
1462 .kill_sb = kill_block_super, 1467 .kill_sb = kill_block_super,
1463 .fs_flags = FS_REQUIRES_DEV, 1468 .fs_flags = FS_REQUIRES_DEV,
1464}; 1469};
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f09..6100ec0fa1d4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
22config XFS_QUOTA 22config XFS_QUOTA
23 bool "XFS Quota support" 23 bool "XFS Quota support"
24 depends on XFS_FS 24 depends on XFS_FS
25 select QUOTACTL
25 help 26 help
26 If you say Y here, you will be able to set limits for disk usage on 27 If you say Y here, you will be able to set limits for disk usage on
27 a per user and/or a per group basis under XFS. XFS considers quota 28 a per user and/or a per group basis under XFS. XFS considers quota
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..7d287afccde5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1111,11 +1111,12 @@ xfs_vm_writepage(
1111 uptodate = 0; 1111 uptodate = 0;
1112 1112
1113 /* 1113 /*
1114 * A hole may still be marked uptodate because discard_buffer 1114 * set_page_dirty dirties all buffers in a page, independent
1115 * leaves the flag set. 1115 * of their state. The dirty state however is entirely
1116 * meaningless for holes (!mapped && uptodate), so skip
1117 * buffers covering holes here.
1116 */ 1118 */
1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) { 1119 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0; 1120 imap_valid = 0;
1120 continue; 1121 continue;
1121 } 1122 }
@@ -1139,8 +1140,7 @@ xfs_vm_writepage(
1139 type = IO_DELAY; 1140 type = IO_DELAY;
1140 flags = BMAPI_ALLOCATE; 1141 flags = BMAPI_ALLOCATE;
1141 1142
1142 if (wbc->sync_mode == WB_SYNC_NONE && 1143 if (wbc->sync_mode == WB_SYNC_NONE)
1143 wbc->nonblocking)
1144 flags |= BMAPI_TRYLOCK; 1144 flags |= BMAPI_TRYLOCK;
1145 } 1145 }
1146 1146
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..aa1d353def29 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,8 +188,8 @@ _xfs_buf_initialize(
188 atomic_set(&bp->b_hold, 1); 188 atomic_set(&bp->b_hold, 1);
189 init_completion(&bp->b_iowait); 189 init_completion(&bp->b_iowait);
190 INIT_LIST_HEAD(&bp->b_list); 190 INIT_LIST_HEAD(&bp->b_list);
191 INIT_LIST_HEAD(&bp->b_hash_list); 191 RB_CLEAR_NODE(&bp->b_rbnode);
192 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 192 sema_init(&bp->b_sema, 0); /* held, no waiters */
193 XB_SET_OWNER(bp); 193 XB_SET_OWNER(bp);
194 bp->b_target = target; 194 bp->b_target = target;
195 bp->b_file_offset = range_base; 195 bp->b_file_offset = range_base;
@@ -262,8 +262,6 @@ xfs_buf_free(
262{ 262{
263 trace_xfs_buf_free(bp, _RET_IP_); 263 trace_xfs_buf_free(bp, _RET_IP_);
264 264
265 ASSERT(list_empty(&bp->b_hash_list));
266
267 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
268 uint i; 266 uint i;
269 267
@@ -422,8 +420,10 @@ _xfs_buf_find(
422{ 420{
423 xfs_off_t range_base; 421 xfs_off_t range_base;
424 size_t range_length; 422 size_t range_length;
425 xfs_bufhash_t *hash; 423 struct xfs_perag *pag;
426 xfs_buf_t *bp, *n; 424 struct rb_node **rbp;
425 struct rb_node *parent;
426 xfs_buf_t *bp;
427 427
428 range_base = (ioff << BBSHIFT); 428 range_base = (ioff << BBSHIFT);
429 range_length = (isize << BBSHIFT); 429 range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
432 ASSERT(!(range_length < (1 << btp->bt_sshift))); 432 ASSERT(!(range_length < (1 << btp->bt_sshift)));
433 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 433 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
434 434
435 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 435 /* get tree root */
436 436 pag = xfs_perag_get(btp->bt_mount,
437 spin_lock(&hash->bh_lock); 437 xfs_daddr_to_agno(btp->bt_mount, ioff));
438 438
439 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 439 /* walk tree */
440 ASSERT(btp == bp->b_target); 440 spin_lock(&pag->pag_buf_lock);
441 if (bp->b_file_offset == range_base && 441 rbp = &pag->pag_buf_tree.rb_node;
442 bp->b_buffer_length == range_length) { 442 parent = NULL;
443 bp = NULL;
444 while (*rbp) {
445 parent = *rbp;
446 bp = rb_entry(parent, struct xfs_buf, b_rbnode);
447
448 if (range_base < bp->b_file_offset)
449 rbp = &(*rbp)->rb_left;
450 else if (range_base > bp->b_file_offset)
451 rbp = &(*rbp)->rb_right;
452 else {
453 /*
454 * found a block offset match. If the range doesn't
455 * match, the only way this is allowed is if the buffer
456 * in the cache is stale and the transaction that made
457 * it stale has not yet committed. i.e. we are
458 * reallocating a busy extent. Skip this buffer and
459 * continue searching to the right for an exact match.
460 */
461 if (bp->b_buffer_length != range_length) {
462 ASSERT(bp->b_flags & XBF_STALE);
463 rbp = &(*rbp)->rb_right;
464 continue;
465 }
443 atomic_inc(&bp->b_hold); 466 atomic_inc(&bp->b_hold);
444 goto found; 467 goto found;
445 } 468 }
@@ -449,17 +472,21 @@ _xfs_buf_find(
449 if (new_bp) { 472 if (new_bp) {
450 _xfs_buf_initialize(new_bp, btp, range_base, 473 _xfs_buf_initialize(new_bp, btp, range_base,
451 range_length, flags); 474 range_length, flags);
452 new_bp->b_hash = hash; 475 rb_link_node(&new_bp->b_rbnode, parent, rbp);
453 list_add(&new_bp->b_hash_list, &hash->bh_list); 476 rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
477 /* the buffer keeps the perag reference until it is freed */
478 new_bp->b_pag = pag;
479 spin_unlock(&pag->pag_buf_lock);
454 } else { 480 } else {
455 XFS_STATS_INC(xb_miss_locked); 481 XFS_STATS_INC(xb_miss_locked);
482 spin_unlock(&pag->pag_buf_lock);
483 xfs_perag_put(pag);
456 } 484 }
457
458 spin_unlock(&hash->bh_lock);
459 return new_bp; 485 return new_bp;
460 486
461found: 487found:
462 spin_unlock(&hash->bh_lock); 488 spin_unlock(&pag->pag_buf_lock);
489 xfs_perag_put(pag);
463 490
464 /* Attempt to get the semaphore without sleeping, 491 /* Attempt to get the semaphore without sleeping,
465 * if this does not work then we need to drop the 492 * if this does not work then we need to drop the
@@ -625,8 +652,7 @@ void
625xfs_buf_readahead( 652xfs_buf_readahead(
626 xfs_buftarg_t *target, 653 xfs_buftarg_t *target,
627 xfs_off_t ioff, 654 xfs_off_t ioff,
628 size_t isize, 655 size_t isize)
629 xfs_buf_flags_t flags)
630{ 656{
631 struct backing_dev_info *bdi; 657 struct backing_dev_info *bdi;
632 658
@@ -634,8 +660,42 @@ xfs_buf_readahead(
634 if (bdi_read_congested(bdi)) 660 if (bdi_read_congested(bdi))
635 return; 661 return;
636 662
637 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 663 xfs_buf_read(target, ioff, isize,
638 xfs_buf_read(target, ioff, isize, flags); 664 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
665}
666
667/*
668 * Read an uncached buffer from disk. Allocates and returns a locked
669 * buffer containing the disk contents or nothing.
670 */
671struct xfs_buf *
672xfs_buf_read_uncached(
673 struct xfs_mount *mp,
674 struct xfs_buftarg *target,
675 xfs_daddr_t daddr,
676 size_t length,
677 int flags)
678{
679 xfs_buf_t *bp;
680 int error;
681
682 bp = xfs_buf_get_uncached(target, length, flags);
683 if (!bp)
684 return NULL;
685
686 /* set up the buffer for a read IO */
687 xfs_buf_lock(bp);
688 XFS_BUF_SET_ADDR(bp, daddr);
689 XFS_BUF_READ(bp);
690 XFS_BUF_BUSY(bp);
691
692 xfsbdstrat(mp, bp);
693 error = xfs_buf_iowait(bp);
694 if (error || bp->b_error) {
695 xfs_buf_relse(bp);
696 return NULL;
697 }
698 return bp;
639} 699}
640 700
641xfs_buf_t * 701xfs_buf_t *
@@ -707,9 +767,10 @@ xfs_buf_associate_memory(
707} 767}
708 768
709xfs_buf_t * 769xfs_buf_t *
710xfs_buf_get_noaddr( 770xfs_buf_get_uncached(
771 struct xfs_buftarg *target,
711 size_t len, 772 size_t len,
712 xfs_buftarg_t *target) 773 int flags)
713{ 774{
714 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 775 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
715 int error, i; 776 int error, i;
@@ -725,7 +786,7 @@ xfs_buf_get_noaddr(
725 goto fail_free_buf; 786 goto fail_free_buf;
726 787
727 for (i = 0; i < page_count; i++) { 788 for (i = 0; i < page_count; i++) {
728 bp->b_pages[i] = alloc_page(GFP_KERNEL); 789 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
729 if (!bp->b_pages[i]) 790 if (!bp->b_pages[i])
730 goto fail_free_mem; 791 goto fail_free_mem;
731 } 792 }
@@ -740,7 +801,7 @@ xfs_buf_get_noaddr(
740 801
741 xfs_buf_unlock(bp); 802 xfs_buf_unlock(bp);
742 803
743 trace_xfs_buf_get_noaddr(bp, _RET_IP_); 804 trace_xfs_buf_get_uncached(bp, _RET_IP_);
744 return bp; 805 return bp;
745 806
746 fail_free_mem: 807 fail_free_mem:
@@ -774,29 +835,30 @@ void
774xfs_buf_rele( 835xfs_buf_rele(
775 xfs_buf_t *bp) 836 xfs_buf_t *bp)
776{ 837{
777 xfs_bufhash_t *hash = bp->b_hash; 838 struct xfs_perag *pag = bp->b_pag;
778 839
779 trace_xfs_buf_rele(bp, _RET_IP_); 840 trace_xfs_buf_rele(bp, _RET_IP_);
780 841
781 if (unlikely(!hash)) { 842 if (!pag) {
782 ASSERT(!bp->b_relse); 843 ASSERT(!bp->b_relse);
844 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
783 if (atomic_dec_and_test(&bp->b_hold)) 845 if (atomic_dec_and_test(&bp->b_hold))
784 xfs_buf_free(bp); 846 xfs_buf_free(bp);
785 return; 847 return;
786 } 848 }
787 849
850 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
788 ASSERT(atomic_read(&bp->b_hold) > 0); 851 ASSERT(atomic_read(&bp->b_hold) > 0);
789 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 852 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
790 if (bp->b_relse) { 853 if (bp->b_relse) {
791 atomic_inc(&bp->b_hold); 854 atomic_inc(&bp->b_hold);
792 spin_unlock(&hash->bh_lock); 855 spin_unlock(&pag->pag_buf_lock);
793 (*(bp->b_relse)) (bp); 856 bp->b_relse(bp);
794 } else if (bp->b_flags & XBF_FS_MANAGED) {
795 spin_unlock(&hash->bh_lock);
796 } else { 857 } else {
797 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 858 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
798 list_del_init(&bp->b_hash_list); 859 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
799 spin_unlock(&hash->bh_lock); 860 spin_unlock(&pag->pag_buf_lock);
861 xfs_perag_put(pag);
800 xfs_buf_free(bp); 862 xfs_buf_free(bp);
801 } 863 }
802 } 864 }
@@ -859,7 +921,7 @@ xfs_buf_lock(
859 trace_xfs_buf_lock(bp, _RET_IP_); 921 trace_xfs_buf_lock(bp, _RET_IP_);
860 922
861 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 923 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
862 xfs_log_force(bp->b_mount, 0); 924 xfs_log_force(bp->b_target->bt_mount, 0);
863 if (atomic_read(&bp->b_io_remaining)) 925 if (atomic_read(&bp->b_io_remaining))
864 blk_run_address_space(bp->b_target->bt_mapping); 926 blk_run_address_space(bp->b_target->bt_mapping);
865 down(&bp->b_sema); 927 down(&bp->b_sema);
@@ -924,19 +986,7 @@ xfs_buf_iodone_work(
924 xfs_buf_t *bp = 986 xfs_buf_t *bp =
925 container_of(work, xfs_buf_t, b_iodone_work); 987 container_of(work, xfs_buf_t, b_iodone_work);
926 988
927 /* 989 if (bp->b_iodone)
928 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
929 * ordered flag and reissue them. Because we can't tell the higher
930 * layers directly that they should not issue ordered I/O anymore, they
931 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
932 */
933 if ((bp->b_error == EOPNOTSUPP) &&
934 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
935 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
936 bp->b_flags &= ~XBF_ORDERED;
937 bp->b_flags |= _XFS_BARRIER_FAILED;
938 xfs_buf_iorequest(bp);
939 } else if (bp->b_iodone)
940 (*(bp->b_iodone))(bp); 990 (*(bp->b_iodone))(bp);
941 else if (bp->b_flags & XBF_ASYNC) 991 else if (bp->b_flags & XBF_ASYNC)
942 xfs_buf_relse(bp); 992 xfs_buf_relse(bp);
@@ -982,7 +1032,6 @@ xfs_bwrite(
982{ 1032{
983 int error; 1033 int error;
984 1034
985 bp->b_mount = mp;
986 bp->b_flags |= XBF_WRITE; 1035 bp->b_flags |= XBF_WRITE;
987 bp->b_flags &= ~(XBF_ASYNC | XBF_READ); 1036 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
988 1037
@@ -1003,8 +1052,6 @@ xfs_bdwrite(
1003{ 1052{
1004 trace_xfs_buf_bdwrite(bp, _RET_IP_); 1053 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1005 1054
1006 bp->b_mount = mp;
1007
1008 bp->b_flags &= ~XBF_READ; 1055 bp->b_flags &= ~XBF_READ;
1009 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); 1056 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1010 1057
@@ -1013,7 +1060,7 @@ xfs_bdwrite(
1013 1060
1014/* 1061/*
1015 * Called when we want to stop a buffer from getting written or read. 1062 * Called when we want to stop a buffer from getting written or read.
1016 * We attach the EIO error, muck with its flags, and call biodone 1063 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1017 * so that the proper iodone callbacks get called. 1064 * so that the proper iodone callbacks get called.
1018 */ 1065 */
1019STATIC int 1066STATIC int
@@ -1030,21 +1077,21 @@ xfs_bioerror(
1030 XFS_BUF_ERROR(bp, EIO); 1077 XFS_BUF_ERROR(bp, EIO);
1031 1078
1032 /* 1079 /*
1033 * We're calling biodone, so delete XBF_DONE flag. 1080 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1034 */ 1081 */
1035 XFS_BUF_UNREAD(bp); 1082 XFS_BUF_UNREAD(bp);
1036 XFS_BUF_UNDELAYWRITE(bp); 1083 XFS_BUF_UNDELAYWRITE(bp);
1037 XFS_BUF_UNDONE(bp); 1084 XFS_BUF_UNDONE(bp);
1038 XFS_BUF_STALE(bp); 1085 XFS_BUF_STALE(bp);
1039 1086
1040 xfs_biodone(bp); 1087 xfs_buf_ioend(bp, 0);
1041 1088
1042 return EIO; 1089 return EIO;
1043} 1090}
1044 1091
1045/* 1092/*
1046 * Same as xfs_bioerror, except that we are releasing the buffer 1093 * Same as xfs_bioerror, except that we are releasing the buffer
1047 * here ourselves, and avoiding the biodone call. 1094 * here ourselves, and avoiding the xfs_buf_ioend call.
1048 * This is meant for userdata errors; metadata bufs come with 1095 * This is meant for userdata errors; metadata bufs come with
1049 * iodone functions attached, so that we can track down errors. 1096 * iodone functions attached, so that we can track down errors.
1050 */ 1097 */
@@ -1093,7 +1140,7 @@ int
1093xfs_bdstrat_cb( 1140xfs_bdstrat_cb(
1094 struct xfs_buf *bp) 1141 struct xfs_buf *bp)
1095{ 1142{
1096 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { 1143 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1097 trace_xfs_bdstrat_shut(bp, _RET_IP_); 1144 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1098 /* 1145 /*
1099 * Metadata write that didn't get logged but 1146 * Metadata write that didn't get logged but
@@ -1195,7 +1242,7 @@ _xfs_buf_ioapply(
1195 1242
1196 if (bp->b_flags & XBF_ORDERED) { 1243 if (bp->b_flags & XBF_ORDERED) {
1197 ASSERT(!(bp->b_flags & XBF_READ)); 1244 ASSERT(!(bp->b_flags & XBF_READ));
1198 rw = WRITE_BARRIER; 1245 rw = WRITE_FLUSH_FUA;
1199 } else if (bp->b_flags & XBF_LOG_BUFFER) { 1246 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1200 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1247 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1201 bp->b_flags &= ~_XBF_RUN_QUEUES; 1248 bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1399,62 +1446,24 @@ xfs_buf_iomove(
1399 */ 1446 */
1400void 1447void
1401xfs_wait_buftarg( 1448xfs_wait_buftarg(
1402 xfs_buftarg_t *btp) 1449 struct xfs_buftarg *btp)
1403{
1404 xfs_buf_t *bp, *n;
1405 xfs_bufhash_t *hash;
1406 uint i;
1407
1408 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1409 hash = &btp->bt_hash[i];
1410again:
1411 spin_lock(&hash->bh_lock);
1412 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
1413 ASSERT(btp == bp->b_target);
1414 if (!(bp->b_flags & XBF_FS_MANAGED)) {
1415 spin_unlock(&hash->bh_lock);
1416 /*
1417 * Catch superblock reference count leaks
1418 * immediately
1419 */
1420 BUG_ON(bp->b_bn == 0);
1421 delay(100);
1422 goto again;
1423 }
1424 }
1425 spin_unlock(&hash->bh_lock);
1426 }
1427}
1428
1429/*
1430 * Allocate buffer hash table for a given target.
1431 * For devices containing metadata (i.e. not the log/realtime devices)
1432 * we need to allocate a much larger hash table.
1433 */
1434STATIC void
1435xfs_alloc_bufhash(
1436 xfs_buftarg_t *btp,
1437 int external)
1438{ 1450{
1439 unsigned int i; 1451 struct xfs_perag *pag;
1452 uint i;
1440 1453
1441 btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ 1454 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
1442 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1455 pag = xfs_perag_get(btp->bt_mount, i);
1443 sizeof(xfs_bufhash_t)); 1456 spin_lock(&pag->pag_buf_lock);
1444 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1457 while (rb_first(&pag->pag_buf_tree)) {
1445 spin_lock_init(&btp->bt_hash[i].bh_lock); 1458 spin_unlock(&pag->pag_buf_lock);
1446 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1459 delay(100);
1460 spin_lock(&pag->pag_buf_lock);
1461 }
1462 spin_unlock(&pag->pag_buf_lock);
1463 xfs_perag_put(pag);
1447 } 1464 }
1448} 1465}
1449 1466
1450STATIC void
1451xfs_free_bufhash(
1452 xfs_buftarg_t *btp)
1453{
1454 kmem_free_large(btp->bt_hash);
1455 btp->bt_hash = NULL;
1456}
1457
1458/* 1467/*
1459 * buftarg list for delwrite queue processing 1468 * buftarg list for delwrite queue processing
1460 */ 1469 */
@@ -1487,7 +1496,6 @@ xfs_free_buftarg(
1487 xfs_flush_buftarg(btp, 1); 1496 xfs_flush_buftarg(btp, 1);
1488 if (mp->m_flags & XFS_MOUNT_BARRIER) 1497 if (mp->m_flags & XFS_MOUNT_BARRIER)
1489 xfs_blkdev_issue_flush(btp); 1498 xfs_blkdev_issue_flush(btp);
1490 xfs_free_bufhash(btp);
1491 iput(btp->bt_mapping->host); 1499 iput(btp->bt_mapping->host);
1492 1500
1493 /* Unregister the buftarg first so that we don't get a 1501 /* Unregister the buftarg first so that we don't get a
@@ -1572,6 +1580,7 @@ xfs_mapping_buftarg(
1572 XFS_BUFTARG_NAME(btp)); 1580 XFS_BUFTARG_NAME(btp));
1573 return ENOMEM; 1581 return ENOMEM;
1574 } 1582 }
1583 inode->i_ino = get_next_ino();
1575 inode->i_mode = S_IFBLK; 1584 inode->i_mode = S_IFBLK;
1576 inode->i_bdev = bdev; 1585 inode->i_bdev = bdev;
1577 inode->i_rdev = bdev->bd_dev; 1586 inode->i_rdev = bdev->bd_dev;
@@ -1609,6 +1618,7 @@ out_error:
1609 1618
1610xfs_buftarg_t * 1619xfs_buftarg_t *
1611xfs_alloc_buftarg( 1620xfs_alloc_buftarg(
1621 struct xfs_mount *mp,
1612 struct block_device *bdev, 1622 struct block_device *bdev,
1613 int external, 1623 int external,
1614 const char *fsname) 1624 const char *fsname)
@@ -1617,6 +1627,7 @@ xfs_alloc_buftarg(
1617 1627
1618 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1628 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1619 1629
1630 btp->bt_mount = mp;
1620 btp->bt_dev = bdev->bd_dev; 1631 btp->bt_dev = bdev->bd_dev;
1621 btp->bt_bdev = bdev; 1632 btp->bt_bdev = bdev;
1622 if (xfs_setsize_buftarg_early(btp, bdev)) 1633 if (xfs_setsize_buftarg_early(btp, bdev))
@@ -1625,7 +1636,6 @@ xfs_alloc_buftarg(
1625 goto error; 1636 goto error;
1626 if (xfs_alloc_delwrite_queue(btp, fsname)) 1637 if (xfs_alloc_delwrite_queue(btp, fsname))
1627 goto error; 1638 goto error;
1628 xfs_alloc_bufhash(btp, external);
1629 return btp; 1639 return btp;
1630 1640
1631error: 1641error:
@@ -1771,7 +1781,6 @@ xfs_buf_delwri_split(
1771 INIT_LIST_HEAD(list); 1781 INIT_LIST_HEAD(list);
1772 spin_lock(dwlk); 1782 spin_lock(dwlk);
1773 list_for_each_entry_safe(bp, n, dwq, b_list) { 1783 list_for_each_entry_safe(bp, n, dwq, b_list) {
1774 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1775 ASSERT(bp->b_flags & XBF_DELWRI); 1784 ASSERT(bp->b_flags & XBF_DELWRI);
1776 1785
1777 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { 1786 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1785,6 +1794,7 @@ xfs_buf_delwri_split(
1785 _XBF_RUN_QUEUES); 1794 _XBF_RUN_QUEUES);
1786 bp->b_flags |= XBF_WRITE; 1795 bp->b_flags |= XBF_WRITE;
1787 list_move_tail(&bp->b_list, list); 1796 list_move_tail(&bp->b_list, list);
1797 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1788 } else 1798 } else
1789 skipped++; 1799 skipped++;
1790 } 1800 }
@@ -1916,7 +1926,7 @@ xfs_flush_buftarg(
1916 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1926 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1917 1927
1918 list_del_init(&bp->b_list); 1928 list_del_init(&bp->b_list);
1919 xfs_iowait(bp); 1929 xfs_buf_iowait(bp);
1920 xfs_buf_relse(bp); 1930 xfs_buf_relse(bp);
1921 } 1931 }
1922 } 1932 }
@@ -1933,7 +1943,7 @@ xfs_buf_init(void)
1933 goto out; 1943 goto out;
1934 1944
1935 xfslogd_workqueue = alloc_workqueue("xfslogd", 1945 xfslogd_workqueue = alloc_workqueue("xfslogd",
1936 WQ_RESCUER | WQ_HIGHPRI, 1); 1946 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1937 if (!xfslogd_workqueue) 1947 if (!xfslogd_workqueue)
1938 goto out_free_buf_zone; 1948 goto out_free_buf_zone;
1939 1949
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614f0b92..383a3f37cf98 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ 52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ 53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
55#define XBF_ORDERED (1 << 11)/* use ordered writes */ 54#define XBF_ORDERED (1 << 11)/* use ordered writes */
56#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ 55#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
57#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ 56#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
@@ -86,14 +85,6 @@ typedef enum {
86 */ 85 */
87#define _XBF_PAGE_LOCKED (1 << 22) 86#define _XBF_PAGE_LOCKED (1 << 22)
88 87
89/*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95#define _XFS_BARRIER_FAILED (1 << 23)
96
97typedef unsigned int xfs_buf_flags_t; 88typedef unsigned int xfs_buf_flags_t;
98 89
99#define XFS_BUF_FLAGS \ 90#define XFS_BUF_FLAGS \
@@ -104,7 +95,6 @@ typedef unsigned int xfs_buf_flags_t;
104 { XBF_DONE, "DONE" }, \ 95 { XBF_DONE, "DONE" }, \
105 { XBF_DELWRI, "DELWRI" }, \ 96 { XBF_DELWRI, "DELWRI" }, \
106 { XBF_STALE, "STALE" }, \ 97 { XBF_STALE, "STALE" }, \
107 { XBF_FS_MANAGED, "FS_MANAGED" }, \
108 { XBF_ORDERED, "ORDERED" }, \ 98 { XBF_ORDERED, "ORDERED" }, \
109 { XBF_READ_AHEAD, "READ_AHEAD" }, \ 99 { XBF_READ_AHEAD, "READ_AHEAD" }, \
110 { XBF_LOCK, "LOCK" }, /* should never be set */\ 100 { XBF_LOCK, "LOCK" }, /* should never be set */\
@@ -114,8 +104,7 @@ typedef unsigned int xfs_buf_flags_t;
114 { _XBF_PAGES, "PAGES" }, \ 104 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ 107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119 108
120 109
121typedef enum { 110typedef enum {
@@ -132,14 +121,11 @@ typedef struct xfs_buftarg {
132 dev_t bt_dev; 121 dev_t bt_dev;
133 struct block_device *bt_bdev; 122 struct block_device *bt_bdev;
134 struct address_space *bt_mapping; 123 struct address_space *bt_mapping;
124 struct xfs_mount *bt_mount;
135 unsigned int bt_bsize; 125 unsigned int bt_bsize;
136 unsigned int bt_sshift; 126 unsigned int bt_sshift;
137 size_t bt_smask; 127 size_t bt_smask;
138 128
139 /* per device buffer hash table */
140 uint bt_hashshift;
141 xfs_bufhash_t *bt_hash;
142
143 /* per device delwri queue */ 129 /* per device delwri queue */
144 struct task_struct *bt_task; 130 struct task_struct *bt_task;
145 struct list_head bt_list; 131 struct list_head bt_list;
@@ -167,34 +153,41 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
167#define XB_PAGES 2 153#define XB_PAGES 2
168 154
169typedef struct xfs_buf { 155typedef struct xfs_buf {
156 /*
157 * first cacheline holds all the fields needed for an uncontended cache
158 * hit to be fully processed. The semaphore straddles the cacheline
159 * boundary, but the counter and lock sits on the first cacheline,
160 * which is the only bit that is touched if we hit the semaphore
161 * fast-path on locking.
162 */
163 struct rb_node b_rbnode; /* rbtree node */
164 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */
167 xfs_buf_flags_t b_flags; /* status flags */
170 struct semaphore b_sema; /* semaphore for lockables */ 168 struct semaphore b_sema; /* semaphore for lockables */
171 unsigned long b_queuetime; /* time buffer was queued */ 169
172 atomic_t b_pin_count; /* pin count */
173 wait_queue_head_t b_waiters; /* unpin waiters */ 170 wait_queue_head_t b_waiters; /* unpin waiters */
174 struct list_head b_list; 171 struct list_head b_list;
175 xfs_buf_flags_t b_flags; /* status flags */ 172 struct xfs_perag *b_pag; /* contains rbtree root */
176 struct list_head b_hash_list; /* hash table list */
177 xfs_bufhash_t *b_hash; /* hash table list start */
178 xfs_buftarg_t *b_target; /* buffer target (device) */ 173 xfs_buftarg_t *b_target; /* buffer target (device) */
179 atomic_t b_hold; /* reference count */
180 xfs_daddr_t b_bn; /* block number for I/O */ 174 xfs_daddr_t b_bn; /* block number for I/O */
181 xfs_off_t b_file_offset; /* offset in file */
182 size_t b_buffer_length;/* size of buffer in bytes */
183 size_t b_count_desired;/* desired transfer size */ 175 size_t b_count_desired;/* desired transfer size */
184 void *b_addr; /* virtual address of buffer */ 176 void *b_addr; /* virtual address of buffer */
185 struct work_struct b_iodone_work; 177 struct work_struct b_iodone_work;
186 atomic_t b_io_remaining; /* #outstanding I/O requests */
187 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 178 xfs_buf_iodone_t b_iodone; /* I/O completion function */
188 xfs_buf_relse_t b_relse; /* releasing function */ 179 xfs_buf_relse_t b_relse; /* releasing function */
189 struct completion b_iowait; /* queue for I/O waiters */ 180 struct completion b_iowait; /* queue for I/O waiters */
190 void *b_fspriv; 181 void *b_fspriv;
191 void *b_fspriv2; 182 void *b_fspriv2;
192 struct xfs_mount *b_mount;
193 unsigned short b_error; /* error code on I/O */
194 unsigned int b_page_count; /* size of page array */
195 unsigned int b_offset; /* page offset in first page */
196 struct page **b_pages; /* array of page pointers */ 183 struct page **b_pages; /* array of page pointers */
197 struct page *b_page_array[XB_PAGES]; /* inline pages */ 184 struct page *b_page_array[XB_PAGES]; /* inline pages */
185 unsigned long b_queuetime; /* time buffer was queued */
186 atomic_t b_pin_count; /* pin count */
187 atomic_t b_io_remaining; /* #outstanding I/O requests */
188 unsigned int b_page_count; /* size of page array */
189 unsigned int b_offset; /* page offset in first page */
190 unsigned short b_error; /* error code on I/O */
198#ifdef XFS_BUF_LOCK_TRACKING 191#ifdef XFS_BUF_LOCK_TRACKING
199 int b_last_holder; 192 int b_last_holder;
200#endif 193#endif
@@ -213,11 +206,13 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
213 xfs_buf_flags_t); 206 xfs_buf_flags_t);
214 207
215extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 208extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
216extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 209extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
217extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 210extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
218extern void xfs_buf_hold(xfs_buf_t *); 211extern void xfs_buf_hold(xfs_buf_t *);
219extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, 212extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
220 xfs_buf_flags_t); 213struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
214 struct xfs_buftarg *target,
215 xfs_daddr_t daddr, size_t length, int flags);
221 216
222/* Releasing Buffers */ 217/* Releasing Buffers */
223extern void xfs_buf_free(xfs_buf_t *); 218extern void xfs_buf_free(xfs_buf_t *);
@@ -242,6 +237,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
242extern int xfs_buf_iowait(xfs_buf_t *); 237extern int xfs_buf_iowait(xfs_buf_t *);
243extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 238extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
244 xfs_buf_rw_t); 239 xfs_buf_rw_t);
240#define xfs_buf_zero(bp, off, len) \
241 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
245 242
246static inline int xfs_buf_geterror(xfs_buf_t *bp) 243static inline int xfs_buf_geterror(xfs_buf_t *bp)
247{ 244{
@@ -276,8 +273,6 @@ extern void xfs_buf_terminate(void);
276 XFS_BUF_DONE(bp); \ 273 XFS_BUF_DONE(bp); \
277 } while (0) 274 } while (0)
278 275
279#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
280
281#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 276#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
282#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 277#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
283#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 278#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
@@ -356,25 +351,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
356 xfs_buf_rele(bp); 351 xfs_buf_rele(bp);
357} 352}
358 353
359#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
360
361#define xfs_biomove(bp, off, len, data, rw) \
362 xfs_buf_iomove((bp), (off), (len), (data), \
363 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
364
365#define xfs_biozero(bp, off, len) \
366 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
367
368#define xfs_iowait(bp) xfs_buf_iowait(bp)
369
370#define xfs_baread(target, rablkno, ralen) \
371 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
372
373
374/* 354/*
375 * Handling of buftargs. 355 * Handling of buftargs.
376 */ 356 */
377extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); 357extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
358 struct block_device *, int, const char *);
378extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 359extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
379extern void xfs_wait_buftarg(xfs_buftarg_t *); 360extern void xfs_wait_buftarg(xfs_buftarg_t *);
380extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 361extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b6091..000000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CRED_H__
19#define __XFS_CRED_H__
20
21#include <linux/capability.h>
22
23/*
24 * Credentials
25 */
26typedef const struct cred cred_t;
27
28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1f279b012f94..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -32,10 +32,9 @@ xfs_tosspages(
32 xfs_off_t last, 32 xfs_off_t last,
33 int fiopt) 33 int fiopt)
34{ 34{
35 struct address_space *mapping = VFS_I(ip)->i_mapping; 35 /* can't toss partial tail pages, so mask them out */
36 36 last &= ~(PAGE_SIZE - 1);
37 if (mapping->nrpages) 37 truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
38 truncate_inode_pages(mapping, first);
39} 38}
40 39
41int 40int
@@ -50,12 +49,11 @@ xfs_flushinval_pages(
50 49
51 trace_xfs_pagecache_inval(ip, first, last); 50 trace_xfs_pagecache_inval(ip, first, last);
52 51
53 if (mapping->nrpages) { 52 xfs_iflags_clear(ip, XFS_ITRUNCATED);
54 xfs_iflags_clear(ip, XFS_ITRUNCATED); 53 ret = filemap_write_and_wait_range(mapping, first,
55 ret = filemap_write_and_wait(mapping); 54 last == -1 ? LLONG_MAX : last);
56 if (!ret) 55 if (!ret)
57 truncate_inode_pages(mapping, first); 56 truncate_inode_pages_range(mapping, first, last);
58 }
59 return -ret; 57 return -ret;
60} 58}
61 59
@@ -71,10 +69,9 @@ xfs_flush_pages(
71 int ret = 0; 69 int ret = 0;
72 int ret2; 70 int ret2;
73 71
74 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
75 xfs_iflags_clear(ip, XFS_ITRUNCATED); 73 ret = -filemap_fdatawrite_range(mapping, first,
76 ret = -filemap_fdatawrite(mapping); 74 last == -1 ? LLONG_MAX : last);
77 }
78 if (flags & XBF_ASYNC) 75 if (flags & XBF_ASYNC)
79 return ret; 76 return ret;
80 ret2 = xfs_wait_on_pages(ip, first, last); 77 ret2 = xfs_wait_on_pages(ip, first, last);
@@ -91,7 +88,9 @@ xfs_wait_on_pages(
91{ 88{
92 struct address_space *mapping = VFS_I(ip)->i_mapping; 89 struct address_space *mapping = VFS_I(ip)->i_mapping;
93 90
94 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
95 return -filemap_fdatawait(mapping); 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last);
94 }
96 return 0; 95 return 0;
97} 96}
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02e..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_cred.h"
20#include "xfs_sysctl.h" 19#include "xfs_sysctl.h"
21 20
22/* 21/*
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061c..000000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_GLOBALS_H__
19#define __XFS_GLOBALS_H__
20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22
23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 3b9e626f7cd1..ad442d9e392e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -416,7 +416,7 @@ xfs_attrlist_by_handle(
416 if (IS_ERR(dentry)) 416 if (IS_ERR(dentry))
417 return PTR_ERR(dentry); 417 return PTR_ERR(dentry);
418 418
419 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 419 kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
420 if (!kbuf) 420 if (!kbuf)
421 goto out_dput; 421 goto out_dput;
422 422
@@ -790,7 +790,7 @@ xfs_ioc_fsgetxattr(
790 xfs_ilock(ip, XFS_ILOCK_SHARED); 790 xfs_ilock(ip, XFS_ILOCK_SHARED);
791 fa.fsx_xflags = xfs_ip2xflags(ip); 791 fa.fsx_xflags = xfs_ip2xflags(ip);
792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
793 fa.fsx_projid = ip->i_d.di_projid; 793 fa.fsx_projid = xfs_get_projid(ip);
794 794
795 if (attr) { 795 if (attr) {
796 if (ip->i_afp) { 796 if (ip->i_afp) {
@@ -909,10 +909,10 @@ xfs_ioctl_setattr(
909 return XFS_ERROR(EIO); 909 return XFS_ERROR(EIO);
910 910
911 /* 911 /*
912 * Disallow 32bit project ids because on-disk structure 912 * Disallow 32bit project ids when projid32bit feature is not enabled.
913 * is 16bit only.
914 */ 913 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) 914 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
915 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
916 return XFS_ERROR(EINVAL); 916 return XFS_ERROR(EINVAL);
917 917
918 /* 918 /*
@@ -961,7 +961,7 @@ xfs_ioctl_setattr(
961 if (mask & FSX_PROJID) { 961 if (mask & FSX_PROJID) {
962 if (XFS_IS_QUOTA_RUNNING(mp) && 962 if (XFS_IS_QUOTA_RUNNING(mp) &&
963 XFS_IS_PQUOTA_ON(mp) && 963 XFS_IS_PQUOTA_ON(mp) &&
964 ip->i_d.di_projid != fa->fsx_projid) { 964 xfs_get_projid(ip) != fa->fsx_projid) {
965 ASSERT(tp); 965 ASSERT(tp);
966 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 966 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
967 capable(CAP_FOWNER) ? 967 capable(CAP_FOWNER) ?
@@ -1063,12 +1063,12 @@ xfs_ioctl_setattr(
1063 * Change the ownerships and register quota modifications 1063 * Change the ownerships and register quota modifications
1064 * in the transaction. 1064 * in the transaction.
1065 */ 1065 */
1066 if (ip->i_d.di_projid != fa->fsx_projid) { 1066 if (xfs_get_projid(ip) != fa->fsx_projid) {
1067 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { 1067 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1068 olddquot = xfs_qm_vop_chown(tp, ip, 1068 olddquot = xfs_qm_vop_chown(tp, ip,
1069 &ip->i_gdquot, gdqp); 1069 &ip->i_gdquot, gdqp);
1070 } 1070 }
1071 ip->i_d.di_projid = fa->fsx_projid; 1071 xfs_set_projid(ip, fa->fsx_projid);
1072 1072
1073 /* 1073 /*
1074 * We may have to rev the inode as well as 1074 * We may have to rev the inode as well as
@@ -1088,8 +1088,8 @@ xfs_ioctl_setattr(
1088 xfs_diflags_to_linux(ip); 1088 xfs_diflags_to_linux(ip);
1089 } 1089 }
1090 1090
1091 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1091 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1092 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1092 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1093 1093
1094 XFS_STATS_INC(xs_ig_attrchg); 1094 XFS_STATS_INC(xs_ig_attrchg);
1095 1095
@@ -1301,7 +1301,8 @@ xfs_file_ioctl(
1301 case XFS_IOC_ALLOCSP64: 1301 case XFS_IOC_ALLOCSP64:
1302 case XFS_IOC_FREESP64: 1302 case XFS_IOC_FREESP64:
1303 case XFS_IOC_RESVSP64: 1303 case XFS_IOC_RESVSP64:
1304 case XFS_IOC_UNRESVSP64: { 1304 case XFS_IOC_UNRESVSP64:
1305 case XFS_IOC_ZERO_RANGE: {
1305 xfs_flock64_t bf; 1306 xfs_flock64_t bf;
1306 1307
1307 if (copy_from_user(&bf, arg, sizeof(bf))) 1308 if (copy_from_user(&bf, arg, sizeof(bf)))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 6c83f7f62dc9..b3486dfa5520 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) || 164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
165 get_user(bstat->bs_extents, &bstat32->bs_extents) || 165 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
166 get_user(bstat->bs_gen, &bstat32->bs_gen) || 166 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
167 get_user(bstat->bs_projid, &bstat32->bs_projid) || 167 get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
168 get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
168 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || 169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
169 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || 170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
170 get_user(bstat->bs_aextents, &bstat32->bs_aextents)) 171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
218 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
219 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
220 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
222 put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
221 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 223 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
222 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 224 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
223 put_user(buffer->bs_aextents, &p32->bs_aextents)) 225 put_user(buffer->bs_aextents, &p32->bs_aextents))
@@ -574,6 +576,7 @@ xfs_file_compat_ioctl(
574 case XFS_IOC_FSGEOMETRY_V1: 576 case XFS_IOC_FSGEOMETRY_V1:
575 case XFS_IOC_FSGROWFSDATA: 577 case XFS_IOC_FSGROWFSDATA:
576 case XFS_IOC_FSGROWFSRT: 578 case XFS_IOC_FSGROWFSRT:
579 case XFS_IOC_ZERO_RANGE:
577 return xfs_file_ioctl(filp, cmd, p); 580 return xfs_file_ioctl(filp, cmd, p);
578#else 581#else
579 case XFS_IOC_ALLOCSP_32: 582 case XFS_IOC_ALLOCSP_32:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0d..08b605792a99 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
65 __s32 bs_extsize; /* extent size */ 65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */ 66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */ 67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */ 68 __u16 bs_projid_lo; /* lower part of project id */
69 unsigned char bs_pad[14]; /* pad space, unused */ 69#define bs_projid bs_projid_lo /* (previously just bs_projid) */
70 __u16 bs_projid_hi; /* high part of project id */
71 unsigned char bs_pad[12]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */ 72 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */ 73 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */ 74 __u16 bs_aextents; /* attribute number of extents */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b1fc2a6bfe83..94d5fd6a2973 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -95,41 +95,6 @@ xfs_mark_inode_dirty(
95} 95}
96 96
97/* 97/*
98 * Change the requested timestamp in the given inode.
99 * We don't lock across timestamp updates, and we don't log them but
100 * we do record the fact that there is dirty information in core.
101 */
102void
103xfs_ichgtime(
104 xfs_inode_t *ip,
105 int flags)
106{
107 struct inode *inode = VFS_I(ip);
108 timespec_t tv;
109 int sync_it = 0;
110
111 tv = current_fs_time(inode->i_sb);
112
113 if ((flags & XFS_ICHGTIME_MOD) &&
114 !timespec_equal(&inode->i_mtime, &tv)) {
115 inode->i_mtime = tv;
116 sync_it = 1;
117 }
118 if ((flags & XFS_ICHGTIME_CHG) &&
119 !timespec_equal(&inode->i_ctime, &tv)) {
120 inode->i_ctime = tv;
121 sync_it = 1;
122 }
123
124 /*
125 * Update complete - now make sure everyone knows that the inode
126 * is dirty.
127 */
128 if (sync_it)
129 xfs_mark_inode_dirty_sync(ip);
130}
131
132/*
133 * Hook in SELinux. This is not quite correct yet, what we really need 98 * Hook in SELinux. This is not quite correct yet, what we really need
134 * here (as we do for default ACLs) is a mechanism by which creation of 99 * here (as we do for default ACLs) is a mechanism by which creation of
135 * these attrs can be journalled at inode creation time (along with the 100 * these attrs can be journalled at inode creation time (along with the
@@ -224,7 +189,7 @@ xfs_vn_mknod(
224 } 189 }
225 190
226 xfs_dentry_to_name(&name, dentry); 191 xfs_dentry_to_name(&name, dentry);
227 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); 192 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
228 if (unlikely(error)) 193 if (unlikely(error))
229 goto out_free_acl; 194 goto out_free_acl;
230 195
@@ -352,7 +317,7 @@ xfs_vn_link(
352 if (unlikely(error)) 317 if (unlikely(error))
353 return -error; 318 return -error;
354 319
355 atomic_inc(&inode->i_count); 320 ihold(inode);
356 d_instantiate(dentry, inode); 321 d_instantiate(dentry, inode);
357 return 0; 322 return 0;
358} 323}
@@ -397,7 +362,7 @@ xfs_vn_symlink(
397 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); 362 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
398 xfs_dentry_to_name(&name, dentry); 363 xfs_dentry_to_name(&name, dentry);
399 364
400 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); 365 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
401 if (unlikely(error)) 366 if (unlikely(error))
402 goto out; 367 goto out;
403 368
@@ -795,7 +760,10 @@ xfs_setup_inode(
795 760
796 inode->i_ino = ip->i_ino; 761 inode->i_ino = ip->i_ino;
797 inode->i_state = I_NEW; 762 inode->i_state = I_NEW;
798 inode_add_to_lists(ip->i_mount->m_super, inode); 763
764 inode_sb_list_add(inode);
765 /* make the inode look hashed for the writeback code */
766 hlist_add_fake(&inode->i_hash);
799 767
800 inode->i_mode = ip->i_d.di_mode; 768 inode->i_mode = ip->i_d.di_mode;
801 inode->i_nlink = ip->i_d.di_nlink; 769 inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 2fa0bd9ebc7f..214ddd71ff79 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -71,6 +71,7 @@
71#include <linux/random.h> 71#include <linux/random.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/writeback.h> 73#include <linux/writeback.h>
74#include <linux/capability.h>
74 75
75#include <asm/page.h> 76#include <asm/page.h>
76#include <asm/div64.h> 77#include <asm/div64.h>
@@ -79,14 +80,12 @@
79#include <asm/byteorder.h> 80#include <asm/byteorder.h>
80#include <asm/unaligned.h> 81#include <asm/unaligned.h>
81 82
82#include <xfs_cred.h>
83#include <xfs_vnode.h> 83#include <xfs_vnode.h>
84#include <xfs_stats.h> 84#include <xfs_stats.h>
85#include <xfs_sysctl.h> 85#include <xfs_sysctl.h>
86#include <xfs_iops.h> 86#include <xfs_iops.h>
87#include <xfs_aops.h> 87#include <xfs_aops.h>
88#include <xfs_super.h> 88#include <xfs_super.h>
89#include <xfs_globals.h>
90#include <xfs_buf.h> 89#include <xfs_buf.h>
91 90
92/* 91/*
@@ -144,7 +143,7 @@
144#define SYNCHRONIZE() barrier() 143#define SYNCHRONIZE() barrier()
145#define __return_address __builtin_return_address(0) 144#define __return_address __builtin_return_address(0)
146 145
147#define dfltprid 0 146#define XFS_PROJID_DEFAULT 0
148#define MAXPATHLEN 1024 147#define MAXPATHLEN 1024
149 148
150#define MIN(a,b) (min(a,b)) 149#define MIN(a,b) (min(a,b))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e07974955b..064f964d4f3c 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -44,7 +44,6 @@
44#include "xfs_buf_item.h" 44#include "xfs_buf_item.h"
45#include "xfs_utils.h" 45#include "xfs_utils.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_version.h"
48#include "xfs_log_priv.h" 47#include "xfs_log_priv.h"
49#include "xfs_trans_priv.h" 48#include "xfs_trans_priv.h"
50#include "xfs_filestream.h" 49#include "xfs_filestream.h"
@@ -354,9 +353,6 @@ xfs_parseargs(
354 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 353 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
355 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 354 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
356 mp->m_flags |= XFS_MOUNT_DELAYLOG; 355 mp->m_flags |= XFS_MOUNT_DELAYLOG;
357 cmn_err(CE_WARN,
358 "Enabling EXPERIMENTAL delayed logging feature "
359 "- use at your own risk.\n");
360 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
361 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
362 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
@@ -577,7 +573,7 @@ xfs_max_file_offset(
577 573
578 /* Figure out maximum filesize, on Linux this can depend on 574 /* Figure out maximum filesize, on Linux this can depend on
579 * the filesystem blocksize (on 32 bit platforms). 575 * the filesystem blocksize (on 32 bit platforms).
580 * __block_prepare_write does this in an [unsigned] long... 576 * __block_write_begin does this in an [unsigned] long...
581 * page->index << (PAGE_CACHE_SHIFT - bbits) 577 * page->index << (PAGE_CACHE_SHIFT - bbits)
582 * So, for page sized blocks (4K on 32 bit platforms), 578 * So, for page sized blocks (4K on 32 bit platforms),
583 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is 579 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -645,7 +641,7 @@ xfs_barrier_test(
645 XFS_BUF_ORDERED(sbp); 641 XFS_BUF_ORDERED(sbp);
646 642
647 xfsbdstrat(mp, sbp); 643 xfsbdstrat(mp, sbp);
648 error = xfs_iowait(sbp); 644 error = xfs_buf_iowait(sbp);
649 645
650 /* 646 /*
651 * Clear all the flags we set and possible error state in the 647 * Clear all the flags we set and possible error state in the
@@ -693,8 +689,7 @@ void
693xfs_blkdev_issue_flush( 689xfs_blkdev_issue_flush(
694 xfs_buftarg_t *buftarg) 690 xfs_buftarg_t *buftarg)
695{ 691{
696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, 692 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
697 BLKDEV_IFL_WAIT);
698} 693}
699 694
700STATIC void 695STATIC void
@@ -758,18 +753,20 @@ xfs_open_devices(
758 * Setup xfs_mount buffer target pointers 753 * Setup xfs_mount buffer target pointers
759 */ 754 */
760 error = ENOMEM; 755 error = ENOMEM;
761 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); 756 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
762 if (!mp->m_ddev_targp) 757 if (!mp->m_ddev_targp)
763 goto out_close_rtdev; 758 goto out_close_rtdev;
764 759
765 if (rtdev) { 760 if (rtdev) {
766 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); 761 mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
762 mp->m_fsname);
767 if (!mp->m_rtdev_targp) 763 if (!mp->m_rtdev_targp)
768 goto out_free_ddev_targ; 764 goto out_free_ddev_targ;
769 } 765 }
770 766
771 if (logdev && logdev != ddev) { 767 if (logdev && logdev != ddev) {
772 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); 768 mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
769 mp->m_fsname);
773 if (!mp->m_logdev_targp) 770 if (!mp->m_logdev_targp)
774 goto out_free_rtdev_targ; 771 goto out_free_rtdev_targ;
775 } else { 772 } else {
@@ -972,12 +969,7 @@ xfs_fs_inode_init_once(
972 969
973/* 970/*
974 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that 971 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
975 * we catch unlogged VFS level updates to the inode. Care must be taken 972 * we catch unlogged VFS level updates to the inode.
976 * here - the transaction code calls mark_inode_dirty_sync() to mark the
977 * VFS inode dirty in a transaction and clears the i_update_core field;
978 * it must clear the field after calling mark_inode_dirty_sync() to
979 * correctly indicate that the dirty state has been propagated into the
980 * inode log item.
981 * 973 *
982 * We need the barrier() to maintain correct ordering between unlogged 974 * We need the barrier() to maintain correct ordering between unlogged
983 * updates and the transaction commit code that clears the i_update_core 975 * updates and the transaction commit code that clears the i_update_core
@@ -1521,8 +1513,9 @@ xfs_fs_fill_super(
1521 if (error) 1513 if (error)
1522 goto out_free_fsname; 1514 goto out_free_fsname;
1523 1515
1524 if (xfs_icsb_init_counters(mp)) 1516 error = xfs_icsb_init_counters(mp);
1525 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1517 if (error)
1518 goto out_close_devices;
1526 1519
1527 error = xfs_readsb(mp, flags); 1520 error = xfs_readsb(mp, flags);
1528 if (error) 1521 if (error)
@@ -1583,6 +1576,7 @@ xfs_fs_fill_super(
1583 xfs_freesb(mp); 1576 xfs_freesb(mp);
1584 out_destroy_counters: 1577 out_destroy_counters:
1585 xfs_icsb_destroy_counters(mp); 1578 xfs_icsb_destroy_counters(mp);
1579 out_close_devices:
1586 xfs_close_devices(mp); 1580 xfs_close_devices(mp);
1587 out_free_fsname: 1581 out_free_fsname:
1588 xfs_free_fsname(mp); 1582 xfs_free_fsname(mp);
@@ -1612,16 +1606,14 @@ xfs_fs_fill_super(
1612 goto out_free_sb; 1606 goto out_free_sb;
1613} 1607}
1614 1608
1615STATIC int 1609STATIC struct dentry *
1616xfs_fs_get_sb( 1610xfs_fs_mount(
1617 struct file_system_type *fs_type, 1611 struct file_system_type *fs_type,
1618 int flags, 1612 int flags,
1619 const char *dev_name, 1613 const char *dev_name,
1620 void *data, 1614 void *data)
1621 struct vfsmount *mnt)
1622{ 1615{
1623 return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, 1616 return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
1624 mnt);
1625} 1617}
1626 1618
1627static const struct super_operations xfs_super_operations = { 1619static const struct super_operations xfs_super_operations = {
@@ -1642,7 +1634,7 @@ static const struct super_operations xfs_super_operations = {
1642static struct file_system_type xfs_fs_type = { 1634static struct file_system_type xfs_fs_type = {
1643 .owner = THIS_MODULE, 1635 .owner = THIS_MODULE,
1644 .name = "xfs", 1636 .name = "xfs",
1645 .get_sb = xfs_fs_get_sb, 1637 .mount = xfs_fs_mount,
1646 .kill_sb = kill_block_super, 1638 .kill_sb = kill_block_super,
1647 .fs_flags = FS_REQUIRES_DEV, 1639 .fs_flags = FS_REQUIRES_DEV,
1648}; 1640};
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 1ef4a4d2d997..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -62,6 +62,7 @@ extern void xfs_qm_exit(void);
62# define XFS_DBG_STRING "no debug" 62# define XFS_DBG_STRING "no debug"
63#endif 63#endif
64 64
65#define XFS_VERSION_STRING "SGI XFS"
65#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 66#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
66 XFS_SECURITY_STRING \ 67 XFS_SECURITY_STRING \
67 XFS_REALTIME_STRING \ 68 XFS_REALTIME_STRING \
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 81976ffed7d6..afb0d7cfad1c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -39,42 +39,39 @@
39#include <linux/kthread.h> 39#include <linux/kthread.h>
40#include <linux/freezer.h> 40#include <linux/freezer.h>
41 41
42/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between
45 * lookup reduction and stack usage. This is in the reclaim path, so we can't
46 * be too greedy.
47 */
48#define XFS_LOOKUP_BATCH 32
42 49
43STATIC xfs_inode_t * 50STATIC int
44xfs_inode_ag_lookup( 51xfs_inode_ag_walk_grab(
45 struct xfs_mount *mp, 52 struct xfs_inode *ip)
46 struct xfs_perag *pag,
47 uint32_t *first_index,
48 int tag)
49{ 53{
50 int nr_found; 54 struct inode *inode = VFS_I(ip);
51 struct xfs_inode *ip;
52 55
53 /* 56 /* nothing to sync during shutdown */
54 * use a gang lookup to find the next inode in the tree 57 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
55 * as the tree is sparse and a gang lookup walks to find 58 return EFSCORRUPTED;
56 * the number of objects requested. 59
57 */ 60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
58 if (tag == XFS_ICI_NO_TAG) { 61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
59 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 62 return ENOENT;
60 (void **)&ip, *first_index, 1); 63
61 } else { 64 /* If we can't grab the inode, it must on it's way to reclaim. */
62 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 65 if (!igrab(inode))
63 (void **)&ip, *first_index, 1, tag); 66 return ENOENT;
67
68 if (is_bad_inode(inode)) {
69 IRELE(ip);
70 return ENOENT;
64 } 71 }
65 if (!nr_found)
66 return NULL;
67 72
68 /* 73 /* inode is valid */
69 * Update the index for the next lookup. Catch overflows 74 return 0;
70 * into the next AG range which can occur if we have inodes
71 * in the last block of the AG and we are currently
72 * pointing to the last inode.
73 */
74 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
75 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
76 return NULL;
77 return ip;
78} 75}
79 76
80STATIC int 77STATIC int
@@ -83,49 +80,75 @@ xfs_inode_ag_walk(
83 struct xfs_perag *pag, 80 struct xfs_perag *pag,
84 int (*execute)(struct xfs_inode *ip, 81 int (*execute)(struct xfs_inode *ip,
85 struct xfs_perag *pag, int flags), 82 struct xfs_perag *pag, int flags),
86 int flags, 83 int flags)
87 int tag,
88 int exclusive,
89 int *nr_to_scan)
90{ 84{
91 uint32_t first_index; 85 uint32_t first_index;
92 int last_error = 0; 86 int last_error = 0;
93 int skipped; 87 int skipped;
88 int done;
89 int nr_found;
94 90
95restart: 91restart:
92 done = 0;
96 skipped = 0; 93 skipped = 0;
97 first_index = 0; 94 first_index = 0;
95 nr_found = 0;
98 do { 96 do {
97 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
99 int error = 0; 98 int error = 0;
100 xfs_inode_t *ip; 99 int i;
101 100
102 if (exclusive) 101 read_lock(&pag->pag_ici_lock);
103 write_lock(&pag->pag_ici_lock); 102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
104 else 103 (void **)batch, first_index,
105 read_lock(&pag->pag_ici_lock); 104 XFS_LOOKUP_BATCH);
106 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 105 if (!nr_found) {
107 if (!ip) { 106 read_unlock(&pag->pag_ici_lock);
108 if (exclusive)
109 write_unlock(&pag->pag_ici_lock);
110 else
111 read_unlock(&pag->pag_ici_lock);
112 break; 107 break;
113 } 108 }
114 109
115 /* execute releases pag->pag_ici_lock */ 110 /*
116 error = execute(ip, pag, flags); 111 * Grab the inodes before we drop the lock. if we found
117 if (error == EAGAIN) { 112 * nothing, nr == 0 and the loop will be skipped.
118 skipped++; 113 */
119 continue; 114 for (i = 0; i < nr_found; i++) {
115 struct xfs_inode *ip = batch[i];
116
117 if (done || xfs_inode_ag_walk_grab(ip))
118 batch[i] = NULL;
119
120 /*
121 * Update the index for the next lookup. Catch overflows
122 * into the next AG range which can occur if we have inodes
123 * in the last block of the AG and we are currently
124 * pointing to the last inode.
125 */
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1;
129 }
130
131 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock);
133
134 for (i = 0; i < nr_found; i++) {
135 if (!batch[i])
136 continue;
137 error = execute(batch[i], pag, flags);
138 IRELE(batch[i]);
139 if (error == EAGAIN) {
140 skipped++;
141 continue;
142 }
143 if (error && last_error != EFSCORRUPTED)
144 last_error = error;
120 } 145 }
121 if (error)
122 last_error = error;
123 146
124 /* bail out if the filesystem is corrupted. */ 147 /* bail out if the filesystem is corrupted. */
125 if (error == EFSCORRUPTED) 148 if (error == EFSCORRUPTED)
126 break; 149 break;
127 150
128 } while ((*nr_to_scan)--); 151 } while (nr_found && !done);
129 152
130 if (skipped) { 153 if (skipped) {
131 delay(1); 154 delay(1);
@@ -134,110 +157,32 @@ restart:
134 return last_error; 157 return last_error;
135} 158}
136 159
137/*
138 * Select the next per-ag structure to iterate during the walk. The reclaim
139 * walk is optimised only to walk AGs with reclaimable inodes in them.
140 */
141static struct xfs_perag *
142xfs_inode_ag_iter_next_pag(
143 struct xfs_mount *mp,
144 xfs_agnumber_t *first,
145 int tag)
146{
147 struct xfs_perag *pag = NULL;
148
149 if (tag == XFS_ICI_RECLAIM_TAG) {
150 int found;
151 int ref;
152
153 spin_lock(&mp->m_perag_lock);
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, *first, 1, tag);
156 if (found <= 0) {
157 spin_unlock(&mp->m_perag_lock);
158 return NULL;
159 }
160 *first = pag->pag_agno + 1;
161 /* open coded pag reference increment */
162 ref = atomic_inc_return(&pag->pag_ref);
163 spin_unlock(&mp->m_perag_lock);
164 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
165 } else {
166 pag = xfs_perag_get(mp, *first);
167 (*first)++;
168 }
169 return pag;
170}
171
172int 160int
173xfs_inode_ag_iterator( 161xfs_inode_ag_iterator(
174 struct xfs_mount *mp, 162 struct xfs_mount *mp,
175 int (*execute)(struct xfs_inode *ip, 163 int (*execute)(struct xfs_inode *ip,
176 struct xfs_perag *pag, int flags), 164 struct xfs_perag *pag, int flags),
177 int flags, 165 int flags)
178 int tag,
179 int exclusive,
180 int *nr_to_scan)
181{ 166{
182 struct xfs_perag *pag; 167 struct xfs_perag *pag;
183 int error = 0; 168 int error = 0;
184 int last_error = 0; 169 int last_error = 0;
185 xfs_agnumber_t ag; 170 xfs_agnumber_t ag;
186 int nr;
187 171
188 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
189 ag = 0; 172 ag = 0;
190 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { 173 while ((pag = xfs_perag_get(mp, ag))) {
191 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 174 ag = pag->pag_agno + 1;
192 exclusive, &nr); 175 error = xfs_inode_ag_walk(mp, pag, execute, flags);
193 xfs_perag_put(pag); 176 xfs_perag_put(pag);
194 if (error) { 177 if (error) {
195 last_error = error; 178 last_error = error;
196 if (error == EFSCORRUPTED) 179 if (error == EFSCORRUPTED)
197 break; 180 break;
198 } 181 }
199 if (nr <= 0)
200 break;
201 } 182 }
202 if (nr_to_scan)
203 *nr_to_scan = nr;
204 return XFS_ERROR(last_error); 183 return XFS_ERROR(last_error);
205} 184}
206 185
207/* must be called with pag_ici_lock held and releases it */
208int
209xfs_sync_inode_valid(
210 struct xfs_inode *ip,
211 struct xfs_perag *pag)
212{
213 struct inode *inode = VFS_I(ip);
214 int error = EFSCORRUPTED;
215
216 /* nothing to sync during shutdown */
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 goto out_unlock;
219
220 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
221 error = ENOENT;
222 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
223 goto out_unlock;
224
225 /* If we can't grab the inode, it must on it's way to reclaim. */
226 if (!igrab(inode))
227 goto out_unlock;
228
229 if (is_bad_inode(inode)) {
230 IRELE(ip);
231 goto out_unlock;
232 }
233
234 /* inode is valid */
235 error = 0;
236out_unlock:
237 read_unlock(&pag->pag_ici_lock);
238 return error;
239}
240
241STATIC int 186STATIC int
242xfs_sync_inode_data( 187xfs_sync_inode_data(
243 struct xfs_inode *ip, 188 struct xfs_inode *ip,
@@ -248,10 +193,6 @@ xfs_sync_inode_data(
248 struct address_space *mapping = inode->i_mapping; 193 struct address_space *mapping = inode->i_mapping;
249 int error = 0; 194 int error = 0;
250 195
251 error = xfs_sync_inode_valid(ip, pag);
252 if (error)
253 return error;
254
255 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 196 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
256 goto out_wait; 197 goto out_wait;
257 198
@@ -268,7 +209,6 @@ xfs_sync_inode_data(
268 out_wait: 209 out_wait:
269 if (flags & SYNC_WAIT) 210 if (flags & SYNC_WAIT)
270 xfs_ioend_wait(ip); 211 xfs_ioend_wait(ip);
271 IRELE(ip);
272 return error; 212 return error;
273} 213}
274 214
@@ -280,10 +220,6 @@ xfs_sync_inode_attr(
280{ 220{
281 int error = 0; 221 int error = 0;
282 222
283 error = xfs_sync_inode_valid(ip, pag);
284 if (error)
285 return error;
286
287 xfs_ilock(ip, XFS_ILOCK_SHARED); 223 xfs_ilock(ip, XFS_ILOCK_SHARED);
288 if (xfs_inode_clean(ip)) 224 if (xfs_inode_clean(ip))
289 goto out_unlock; 225 goto out_unlock;
@@ -302,7 +238,6 @@ xfs_sync_inode_attr(
302 238
303 out_unlock: 239 out_unlock:
304 xfs_iunlock(ip, XFS_ILOCK_SHARED); 240 xfs_iunlock(ip, XFS_ILOCK_SHARED);
305 IRELE(ip);
306 return error; 241 return error;
307} 242}
308 243
@@ -318,8 +253,7 @@ xfs_sync_data(
318 253
319 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 254 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
320 255
321 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 256 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
322 XFS_ICI_NO_TAG, 0, NULL);
323 if (error) 257 if (error)
324 return XFS_ERROR(error); 258 return XFS_ERROR(error);
325 259
@@ -337,8 +271,7 @@ xfs_sync_attr(
337{ 271{
338 ASSERT((flags & ~SYNC_WAIT) == 0); 272 ASSERT((flags & ~SYNC_WAIT) == 0);
339 273
340 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 274 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
341 XFS_ICI_NO_TAG, 0, NULL);
342} 275}
343 276
344STATIC int 277STATIC int
@@ -698,6 +631,43 @@ __xfs_inode_clear_reclaim_tag(
698} 631}
699 632
700/* 633/*
634 * Grab the inode for reclaim exclusively.
635 * Return 0 if we grabbed it, non-zero otherwise.
636 */
637STATIC int
638xfs_reclaim_inode_grab(
639 struct xfs_inode *ip,
640 int flags)
641{
642
643 /*
644 * do some unlocked checks first to avoid unnecceary lock traffic.
645 * The first is a flush lock check, the second is a already in reclaim
646 * check. Only do these checks if we are not going to block on locks.
647 */
648 if ((flags & SYNC_TRYLOCK) &&
649 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
650 return 1;
651 }
652
653 /*
654 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us.
657 */
658 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */
662 spin_unlock(&ip->i_flags_lock);
663 return 1;
664 }
665 __xfs_iflags_set(ip, XFS_IRECLAIM);
666 spin_unlock(&ip->i_flags_lock);
667 return 0;
668}
669
670/*
701 * Inodes in different states need to be treated differently, and the return 671 * Inodes in different states need to be treated differently, and the return
702 * value of xfs_iflush is not sufficient to get this right. The following table 672 * value of xfs_iflush is not sufficient to get this right. The following table
703 * lists the inode states and the reclaim actions necessary for non-blocking 673 * lists the inode states and the reclaim actions necessary for non-blocking
@@ -755,23 +725,6 @@ xfs_reclaim_inode(
755{ 725{
756 int error = 0; 726 int error = 0;
757 727
758 /*
759 * The radix tree lock here protects a thread in xfs_iget from racing
760 * with us starting reclaim on the inode. Once we have the
761 * XFS_IRECLAIM flag set it will not touch us.
762 */
763 spin_lock(&ip->i_flags_lock);
764 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
765 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
766 /* ignore as it is already under reclaim */
767 spin_unlock(&ip->i_flags_lock);
768 write_unlock(&pag->pag_ici_lock);
769 return 0;
770 }
771 __xfs_iflags_set(ip, XFS_IRECLAIM);
772 spin_unlock(&ip->i_flags_lock);
773 write_unlock(&pag->pag_ici_lock);
774
775 xfs_ilock(ip, XFS_ILOCK_EXCL); 728 xfs_ilock(ip, XFS_ILOCK_EXCL);
776 if (!xfs_iflock_nowait(ip)) { 729 if (!xfs_iflock_nowait(ip)) {
777 if (!(sync_mode & SYNC_WAIT)) 730 if (!(sync_mode & SYNC_WAIT))
@@ -868,13 +821,127 @@ reclaim:
868 821
869} 822}
870 823
824/*
825 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
826 * corrupted, we still want to try to reclaim all the inodes. If we don't,
827 * then a shut down during filesystem unmount reclaim walk leak all the
828 * unreclaimed inodes.
829 */
830int
831xfs_reclaim_inodes_ag(
832 struct xfs_mount *mp,
833 int flags,
834 int *nr_to_scan)
835{
836 struct xfs_perag *pag;
837 int error = 0;
838 int last_error = 0;
839 xfs_agnumber_t ag;
840 int trylock = flags & SYNC_TRYLOCK;
841 int skipped;
842
843restart:
844 ag = 0;
845 skipped = 0;
846 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
847 unsigned long first_index = 0;
848 int done = 0;
849 int nr_found = 0;
850
851 ag = pag->pag_agno + 1;
852
853 if (trylock) {
854 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
855 skipped++;
856 xfs_perag_put(pag);
857 continue;
858 }
859 first_index = pag->pag_ici_reclaim_cursor;
860 } else
861 mutex_lock(&pag->pag_ici_reclaim_lock);
862
863 do {
864 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
865 int i;
866
867 write_lock(&pag->pag_ici_lock);
868 nr_found = radix_tree_gang_lookup_tag(
869 &pag->pag_ici_root,
870 (void **)batch, first_index,
871 XFS_LOOKUP_BATCH,
872 XFS_ICI_RECLAIM_TAG);
873 if (!nr_found) {
874 write_unlock(&pag->pag_ici_lock);
875 break;
876 }
877
878 /*
879 * Grab the inodes before we drop the lock. if we found
880 * nothing, nr == 0 and the loop will be skipped.
881 */
882 for (i = 0; i < nr_found; i++) {
883 struct xfs_inode *ip = batch[i];
884
885 if (done || xfs_reclaim_inode_grab(ip, flags))
886 batch[i] = NULL;
887
888 /*
889 * Update the index for the next lookup. Catch
890 * overflows into the next AG range which can
891 * occur if we have inodes in the last block of
892 * the AG and we are currently pointing to the
893 * last inode.
894 */
895 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
896 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
897 done = 1;
898 }
899
900 /* unlock now we've grabbed the inodes. */
901 write_unlock(&pag->pag_ici_lock);
902
903 for (i = 0; i < nr_found; i++) {
904 if (!batch[i])
905 continue;
906 error = xfs_reclaim_inode(batch[i], pag, flags);
907 if (error && last_error != EFSCORRUPTED)
908 last_error = error;
909 }
910
911 *nr_to_scan -= XFS_LOOKUP_BATCH;
912
913 } while (nr_found && !done && *nr_to_scan > 0);
914
915 if (trylock && !done)
916 pag->pag_ici_reclaim_cursor = first_index;
917 else
918 pag->pag_ici_reclaim_cursor = 0;
919 mutex_unlock(&pag->pag_ici_reclaim_lock);
920 xfs_perag_put(pag);
921 }
922
923 /*
924 * if we skipped any AG, and we still have scan count remaining, do
925 * another pass this time using blocking reclaim semantics (i.e
926 * waiting on the reclaim locks and ignoring the reclaim cursors). This
927 * ensure that when we get more reclaimers than AGs we block rather
928 * than spin trying to execute reclaim.
929 */
930 if (trylock && skipped && *nr_to_scan > 0) {
931 trylock = 0;
932 goto restart;
933 }
934 return XFS_ERROR(last_error);
935}
936
871int 937int
872xfs_reclaim_inodes( 938xfs_reclaim_inodes(
873 xfs_mount_t *mp, 939 xfs_mount_t *mp,
874 int mode) 940 int mode)
875{ 941{
876 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 942 int nr_to_scan = INT_MAX;
877 XFS_ICI_RECLAIM_TAG, 1, NULL); 943
944 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
878} 945}
879 946
880/* 947/*
@@ -896,17 +963,16 @@ xfs_reclaim_inode_shrink(
896 if (!(gfp_mask & __GFP_FS)) 963 if (!(gfp_mask & __GFP_FS))
897 return -1; 964 return -1;
898 965
899 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, 966 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
900 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 967 /* terminate if we don't exhaust the scan */
901 /* if we don't exhaust the scan, don't bother coming back */
902 if (nr_to_scan > 0) 968 if (nr_to_scan > 0)
903 return -1; 969 return -1;
904 } 970 }
905 971
906 reclaimable = 0; 972 reclaimable = 0;
907 ag = 0; 973 ag = 0;
908 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, 974 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
909 XFS_ICI_RECLAIM_TAG))) { 975 ag = pag->pag_agno + 1;
910 reclaimable += pag->pag_ici_reclaimable; 976 reclaimable += pag->pag_ici_reclaimable;
911 xfs_perag_put(pag); 977 xfs_perag_put(pag);
912 } 978 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index fe78726196f8..32ba6628290c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -47,10 +47,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
48 struct xfs_inode *ip); 48 struct xfs_inode *ip);
49 49
50int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 50int xfs_sync_inode_grab(struct xfs_inode *ip);
51int xfs_inode_ag_iterator(struct xfs_mount *mp, 51int xfs_inode_ag_iterator(struct xfs_mount *mp,
52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
53 int flags, int tag, int write_lock, int *nr_to_scan); 53 int flags);
54 54
55void xfs_inode_shrinker_register(struct xfs_mount *mp); 55void xfs_inode_shrinker_register(struct xfs_mount *mp);
56void xfs_inode_shrinker_unregister(struct xfs_mount *mp); 56void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..acef2e98c594 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \
124 unsigned long caller_ip), \ 124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip)) 125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get); 126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); 127DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
128DEFINE_PERAG_REF_EVENT(xfs_perag_put); 128DEFINE_PERAG_REF_EVENT(xfs_perag_put);
129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
@@ -325,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
325DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
326DEFINE_BUF_EVENT(xfs_buf_cond_lock); 326DEFINE_BUF_EVENT(xfs_buf_cond_lock);
327DEFINE_BUF_EVENT(xfs_buf_unlock); 327DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
329DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
330DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
331DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
332DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
333DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
334DEFINE_BUF_EVENT(xfs_buf_get_noaddr); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
335DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
336DEFINE_BUF_EVENT(xfs_buf_item_relse); 335DEFINE_BUF_EVENT(xfs_buf_item_relse);
337DEFINE_BUF_EVENT(xfs_buf_item_iodone); 336DEFINE_BUF_EVENT(xfs_buf_item_iodone);
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563a..000000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VERSION_H__
19#define __XFS_VERSION_H__
20
21/*
22 * Dummy file that can contain a timestamp to put into the
23 * XFS init string, to help users keep track of what they're
24 * running
25 */
26
27#define XFS_VERSION_STRING "SGI XFS"
28
29#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e1a2f6800e01..faf8e1a83a12 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -463,87 +463,68 @@ xfs_qm_dqtobp(
463 uint flags) 463 uint flags)
464{ 464{
465 xfs_bmbt_irec_t map; 465 xfs_bmbt_irec_t map;
466 int nmaps, error; 466 int nmaps = 1, error;
467 xfs_buf_t *bp; 467 xfs_buf_t *bp;
468 xfs_inode_t *quotip; 468 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
469 xfs_mount_t *mp; 469 xfs_mount_t *mp = dqp->q_mount;
470 xfs_disk_dquot_t *ddq; 470 xfs_disk_dquot_t *ddq;
471 xfs_dqid_t id; 471 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
472 boolean_t newdquot;
473 xfs_trans_t *tp = (tpp ? *tpp : NULL); 472 xfs_trans_t *tp = (tpp ? *tpp : NULL);
474 473
475 mp = dqp->q_mount; 474 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
476 id = be32_to_cpu(dqp->q_core.d_id);
477 nmaps = 1;
478 newdquot = B_FALSE;
479 475
480 /* 476 xfs_ilock(quotip, XFS_ILOCK_SHARED);
481 * If we don't know where the dquot lives, find out. 477 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
482 */
483 if (dqp->q_blkno == (xfs_daddr_t) 0) {
484 /* We use the id as an index */
485 dqp->q_fileoffset = (xfs_fileoff_t)id /
486 mp->m_quotainfo->qi_dqperchunk;
487 nmaps = 1;
488 quotip = XFS_DQ_TO_QIP(dqp);
489 xfs_ilock(quotip, XFS_ILOCK_SHARED);
490 /* 478 /*
491 * Return if this type of quotas is turned off while we didn't 479 * Return if this type of quotas is turned off while we
492 * have an inode lock 480 * didn't have the quota inode lock.
493 */ 481 */
494 if (XFS_IS_THIS_QUOTA_OFF(dqp)) { 482 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
495 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 483 return ESRCH;
496 return (ESRCH); 484 }
497 } 485
486 /*
487 * Find the block map; no allocations yet
488 */
489 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
490 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
491 NULL, 0, &map, &nmaps, NULL);
492
493 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
494 if (error)
495 return error;
496
497 ASSERT(nmaps == 1);
498 ASSERT(map.br_blockcount == 1);
499
500 /*
501 * Offset of dquot in the (fixed sized) dquot chunk.
502 */
503 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
504 sizeof(xfs_dqblk_t);
505
506 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
507 if (map.br_startblock == HOLESTARTBLOCK) {
498 /* 508 /*
499 * Find the block map; no allocations yet 509 * We don't allocate unless we're asked to
500 */ 510 */
501 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 511 if (!(flags & XFS_QMOPT_DQALLOC))
502 XFS_DQUOT_CLUSTER_SIZE_FSB, 512 return ENOENT;
503 XFS_BMAPI_METADATA,
504 NULL, 0, &map, &nmaps, NULL);
505 513
506 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 514 ASSERT(tp);
515 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
516 dqp->q_fileoffset, &bp);
507 if (error) 517 if (error)
508 return (error); 518 return error;
509 ASSERT(nmaps == 1); 519 tp = *tpp;
510 ASSERT(map.br_blockcount == 1); 520 } else {
521 trace_xfs_dqtobp_read(dqp);
511 522
512 /* 523 /*
513 * offset of dquot in the (fixed sized) dquot chunk. 524 * store the blkno etc so that we don't have to do the
525 * mapping all the time
514 */ 526 */
515 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * 527 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
516 sizeof(xfs_dqblk_t);
517 if (map.br_startblock == HOLESTARTBLOCK) {
518 /*
519 * We don't allocate unless we're asked to
520 */
521 if (!(flags & XFS_QMOPT_DQALLOC))
522 return (ENOENT);
523
524 ASSERT(tp);
525 if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
526 dqp->q_fileoffset, &bp)))
527 return (error);
528 tp = *tpp;
529 newdquot = B_TRUE;
530 } else {
531 /*
532 * store the blkno etc so that we don't have to do the
533 * mapping all the time
534 */
535 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
536 }
537 }
538 ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
539 ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
540
541 /*
542 * Read in the buffer, unless we've just done the allocation
543 * (in which case we already have the buf).
544 */
545 if (!newdquot) {
546 trace_xfs_dqtobp_read(dqp);
547 528
548 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 529 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
549 dqp->q_blkno, 530 dqp->q_blkno,
@@ -552,13 +533,14 @@ xfs_qm_dqtobp(
552 if (error || !bp) 533 if (error || !bp)
553 return XFS_ERROR(error); 534 return XFS_ERROR(error);
554 } 535 }
536
555 ASSERT(XFS_BUF_ISBUSY(bp)); 537 ASSERT(XFS_BUF_ISBUSY(bp));
556 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 538 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
557 539
558 /* 540 /*
559 * calculate the location of the dquot inside the buffer. 541 * calculate the location of the dquot inside the buffer.
560 */ 542 */
561 ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); 543 ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
562 544
563 /* 545 /*
564 * A simple sanity check in case we got a corrupted dquot... 546 * A simple sanity check in case we got a corrupted dquot...
@@ -1176,18 +1158,18 @@ xfs_qm_dqflush(
1176 xfs_dquot_t *dqp, 1158 xfs_dquot_t *dqp,
1177 uint flags) 1159 uint flags)
1178{ 1160{
1179 xfs_mount_t *mp; 1161 struct xfs_mount *mp = dqp->q_mount;
1180 xfs_buf_t *bp; 1162 struct xfs_buf *bp;
1181 xfs_disk_dquot_t *ddqp; 1163 struct xfs_disk_dquot *ddqp;
1182 int error; 1164 int error;
1183 1165
1184 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1166 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1185 ASSERT(!completion_done(&dqp->q_flush)); 1167 ASSERT(!completion_done(&dqp->q_flush));
1168
1186 trace_xfs_dqflush(dqp); 1169 trace_xfs_dqflush(dqp);
1187 1170
1188 /* 1171 /*
1189 * If not dirty, or it's pinned and we are not supposed to 1172 * If not dirty, or it's pinned and we are not supposed to block, nada.
1190 * block, nada.
1191 */ 1173 */
1192 if (!XFS_DQ_IS_DIRTY(dqp) || 1174 if (!XFS_DQ_IS_DIRTY(dqp) ||
1193 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { 1175 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
@@ -1201,40 +1183,46 @@ xfs_qm_dqflush(
1201 * down forcibly. If that's the case we must not write this dquot 1183 * down forcibly. If that's the case we must not write this dquot
1202 * to disk, because the log record didn't make it to disk! 1184 * to disk, because the log record didn't make it to disk!
1203 */ 1185 */
1204 if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { 1186 if (XFS_FORCED_SHUTDOWN(mp)) {
1205 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1187 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1206 xfs_dqfunlock(dqp); 1188 xfs_dqfunlock(dqp);
1207 return XFS_ERROR(EIO); 1189 return XFS_ERROR(EIO);
1208 } 1190 }
1209 1191
1210 /* 1192 /*
1211 * Get the buffer containing the on-disk dquot 1193 * Get the buffer containing the on-disk dquot
1212 * We don't need a transaction envelope because we know that the
1213 * the ondisk-dquot has already been allocated for.
1214 */ 1194 */
1215 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { 1195 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
1196 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1197 if (error) {
1216 ASSERT(error != ENOENT); 1198 ASSERT(error != ENOENT);
1217 /*
1218 * Quotas could have gotten turned off (ESRCH)
1219 */
1220 xfs_dqfunlock(dqp); 1199 xfs_dqfunlock(dqp);
1221 return (error); 1200 return error;
1222 } 1201 }
1223 1202
1224 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 1203 /*
1225 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1204 * Calculate the location of the dquot inside the buffer.
1226 xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE); 1205 */
1206 ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
1207
1208 /*
1209 * A simple sanity check in case we got a corrupted dquot..
1210 */
1211 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
1213 xfs_buf_relse(bp);
1214 xfs_dqfunlock(dqp);
1215 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1227 return XFS_ERROR(EIO); 1216 return XFS_ERROR(EIO);
1228 } 1217 }
1229 1218
1230 /* This is the only portion of data that needs to persist */ 1219 /* This is the only portion of data that needs to persist */
1231 memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); 1220 memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
1232 1221
1233 /* 1222 /*
1234 * Clear the dirty field and remember the flush lsn for later use. 1223 * Clear the dirty field and remember the flush lsn for later use.
1235 */ 1224 */
1236 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1225 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1237 mp = dqp->q_mount;
1238 1226
1239 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, 1227 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1240 &dqp->q_logitem.qli_item.li_lsn); 1228 &dqp->q_logitem.qli_item.li_lsn);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9a92407109a1..f8e854b4fde8 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,8 +55,6 @@ uint ndquot;
55kmem_zone_t *qm_dqzone; 55kmem_zone_t *qm_dqzone;
56kmem_zone_t *qm_dqtrxzone; 56kmem_zone_t *qm_dqtrxzone;
57 57
58static cred_t xfs_zerocr;
59
60STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 58STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
61STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 59STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
62 60
@@ -837,7 +835,7 @@ xfs_qm_dqattach_locked(
837 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, 835 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
838 flags & XFS_QMOPT_DQALLOC, 836 flags & XFS_QMOPT_DQALLOC,
839 ip->i_udquot, &ip->i_gdquot) : 837 ip->i_udquot, &ip->i_gdquot) :
840 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, 838 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
841 flags & XFS_QMOPT_DQALLOC, 839 flags & XFS_QMOPT_DQALLOC,
842 ip->i_udquot, &ip->i_gdquot); 840 ip->i_udquot, &ip->i_gdquot);
843 /* 841 /*
@@ -1199,87 +1197,6 @@ xfs_qm_list_destroy(
1199 mutex_destroy(&(list->qh_lock)); 1197 mutex_destroy(&(list->qh_lock));
1200} 1198}
1201 1199
1202
1203/*
1204 * Stripped down version of dqattach. This doesn't attach, or even look at the
1205 * dquots attached to the inode. The rationale is that there won't be any
1206 * attached at the time this is called from quotacheck.
1207 */
1208STATIC int
1209xfs_qm_dqget_noattach(
1210 xfs_inode_t *ip,
1211 xfs_dquot_t **O_udqpp,
1212 xfs_dquot_t **O_gdqpp)
1213{
1214 int error;
1215 xfs_mount_t *mp;
1216 xfs_dquot_t *udqp, *gdqp;
1217
1218 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1219 mp = ip->i_mount;
1220 udqp = NULL;
1221 gdqp = NULL;
1222
1223 if (XFS_IS_UQUOTA_ON(mp)) {
1224 ASSERT(ip->i_udquot == NULL);
1225 /*
1226 * We want the dquot allocated if it doesn't exist.
1227 */
1228 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
1229 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
1230 &udqp))) {
1231 /*
1232 * Shouldn't be able to turn off quotas here.
1233 */
1234 ASSERT(error != ESRCH);
1235 ASSERT(error != ENOENT);
1236 return error;
1237 }
1238 ASSERT(udqp);
1239 }
1240
1241 if (XFS_IS_OQUOTA_ON(mp)) {
1242 ASSERT(ip->i_gdquot == NULL);
1243 if (udqp)
1244 xfs_dqunlock(udqp);
1245 error = XFS_IS_GQUOTA_ON(mp) ?
1246 xfs_qm_dqget(mp, ip,
1247 ip->i_d.di_gid, XFS_DQ_GROUP,
1248 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1249 &gdqp) :
1250 xfs_qm_dqget(mp, ip,
1251 ip->i_d.di_projid, XFS_DQ_PROJ,
1252 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1253 &gdqp);
1254 if (error) {
1255 if (udqp)
1256 xfs_qm_dqrele(udqp);
1257 ASSERT(error != ESRCH);
1258 ASSERT(error != ENOENT);
1259 return error;
1260 }
1261 ASSERT(gdqp);
1262
1263 /* Reacquire the locks in the right order */
1264 if (udqp) {
1265 if (! xfs_qm_dqlock_nowait(udqp)) {
1266 xfs_dqunlock(gdqp);
1267 xfs_dqlock(udqp);
1268 xfs_dqlock(gdqp);
1269 }
1270 }
1271 }
1272
1273 *O_udqpp = udqp;
1274 *O_gdqpp = gdqp;
1275
1276#ifdef QUOTADEBUG
1277 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
1278 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
1279#endif
1280 return 0;
1281}
1282
1283/* 1200/*
1284 * Create an inode and return with a reference already taken, but unlocked 1201 * Create an inode and return with a reference already taken, but unlocked
1285 * This is how we create quota inodes 1202 * This is how we create quota inodes
@@ -1305,8 +1222,8 @@ xfs_qm_qino_alloc(
1305 return error; 1222 return error;
1306 } 1223 }
1307 1224
1308 if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 1225 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
1309 &xfs_zerocr, 0, 1, ip, &committed))) { 1226 if (error) {
1310 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1227 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1311 XFS_TRANS_ABORT); 1228 XFS_TRANS_ABORT);
1312 return error; 1229 return error;
@@ -1516,7 +1433,7 @@ xfs_qm_dqiterate(
1516 rablkcnt = map[i+1].br_blockcount; 1433 rablkcnt = map[i+1].br_blockcount;
1517 rablkno = map[i+1].br_startblock; 1434 rablkno = map[i+1].br_startblock;
1518 while (rablkcnt--) { 1435 while (rablkcnt--) {
1519 xfs_baread(mp->m_ddev_targp, 1436 xfs_buf_readahead(mp->m_ddev_targp,
1520 XFS_FSB_TO_DADDR(mp, rablkno), 1437 XFS_FSB_TO_DADDR(mp, rablkno),
1521 mp->m_quotainfo->qi_dqchunklen); 1438 mp->m_quotainfo->qi_dqchunklen);
1522 rablkno++; 1439 rablkno++;
@@ -1546,18 +1463,34 @@ xfs_qm_dqiterate(
1546 1463
1547/* 1464/*
1548 * Called by dqusage_adjust in doing a quotacheck. 1465 * Called by dqusage_adjust in doing a quotacheck.
1549 * Given the inode, and a dquot (either USR or GRP, doesn't matter), 1466 *
1550 * this updates its incore copy as well as the buffer copy. This is 1467 * Given the inode, and a dquot id this updates both the incore dqout as well
1551 * so that once the quotacheck is done, we can just log all the buffers, 1468 * as the buffer copy. This is so that once the quotacheck is done, we can
1552 * as opposed to logging numerous updates to individual dquots. 1469 * just log all the buffers, as opposed to logging numerous updates to
1470 * individual dquots.
1553 */ 1471 */
1554STATIC void 1472STATIC int
1555xfs_qm_quotacheck_dqadjust( 1473xfs_qm_quotacheck_dqadjust(
1556 xfs_dquot_t *dqp, 1474 struct xfs_inode *ip,
1475 xfs_dqid_t id,
1476 uint type,
1557 xfs_qcnt_t nblks, 1477 xfs_qcnt_t nblks,
1558 xfs_qcnt_t rtblks) 1478 xfs_qcnt_t rtblks)
1559{ 1479{
1560 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1480 struct xfs_mount *mp = ip->i_mount;
1481 struct xfs_dquot *dqp;
1482 int error;
1483
1484 error = xfs_qm_dqget(mp, ip, id, type,
1485 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
1486 if (error) {
1487 /*
1488 * Shouldn't be able to turn off quotas here.
1489 */
1490 ASSERT(error != ESRCH);
1491 ASSERT(error != ENOENT);
1492 return error;
1493 }
1561 1494
1562 trace_xfs_dqadjust(dqp); 1495 trace_xfs_dqadjust(dqp);
1563 1496
@@ -1582,11 +1515,13 @@ xfs_qm_quotacheck_dqadjust(
1582 * There are no timers for the default values set in the root dquot. 1515 * There are no timers for the default values set in the root dquot.
1583 */ 1516 */
1584 if (dqp->q_core.d_id) { 1517 if (dqp->q_core.d_id) {
1585 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1518 xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
1586 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1519 xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
1587 } 1520 }
1588 1521
1589 dqp->dq_flags |= XFS_DQ_DIRTY; 1522 dqp->dq_flags |= XFS_DQ_DIRTY;
1523 xfs_qm_dqput(dqp);
1524 return 0;
1590} 1525}
1591 1526
1592STATIC int 1527STATIC int
@@ -1629,8 +1564,7 @@ xfs_qm_dqusage_adjust(
1629 int *res) /* result code value */ 1564 int *res) /* result code value */
1630{ 1565{
1631 xfs_inode_t *ip; 1566 xfs_inode_t *ip;
1632 xfs_dquot_t *udqp, *gdqp; 1567 xfs_qcnt_t nblks, rtblks = 0;
1633 xfs_qcnt_t nblks, rtblks;
1634 int error; 1568 int error;
1635 1569
1636 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1570 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1650,51 +1584,24 @@ xfs_qm_dqusage_adjust(
1650 * the case in all other instances. It's OK that we do this because 1584 * the case in all other instances. It's OK that we do this because
1651 * quotacheck is done only at mount time. 1585 * quotacheck is done only at mount time.
1652 */ 1586 */
1653 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { 1587 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
1588 if (error) {
1654 *res = BULKSTAT_RV_NOTHING; 1589 *res = BULKSTAT_RV_NOTHING;
1655 return error; 1590 return error;
1656 } 1591 }
1657 1592
1658 /* 1593 ASSERT(ip->i_delayed_blks == 0);
1659 * Obtain the locked dquots. In case of an error (eg. allocation
1660 * fails for ENOSPC), we return the negative of the error number
1661 * to bulkstat, so that it can get propagated to quotacheck() and
1662 * making us disable quotas for the file system.
1663 */
1664 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1665 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1666 IRELE(ip);
1667 *res = BULKSTAT_RV_GIVEUP;
1668 return error;
1669 }
1670 1594
1671 rtblks = 0; 1595 if (XFS_IS_REALTIME_INODE(ip)) {
1672 if (! XFS_IS_REALTIME_INODE(ip)) {
1673 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
1674 } else {
1675 /* 1596 /*
1676 * Walk thru the extent list and count the realtime blocks. 1597 * Walk thru the extent list and count the realtime blocks.
1677 */ 1598 */
1678 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { 1599 error = xfs_qm_get_rtblks(ip, &rtblks);
1679 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1600 if (error)
1680 IRELE(ip); 1601 goto error0;
1681 if (udqp)
1682 xfs_qm_dqput(udqp);
1683 if (gdqp)
1684 xfs_qm_dqput(gdqp);
1685 *res = BULKSTAT_RV_GIVEUP;
1686 return error;
1687 }
1688 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1689 } 1602 }
1690 ASSERT(ip->i_delayed_blks == 0);
1691 1603
1692 /* 1604 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1693 * We can't release the inode while holding its dquot locks.
1694 * The inode can go into inactive and might try to acquire the dquotlocks.
1695 * So, just unlock here and do a vn_rele at the end.
1696 */
1697 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1698 1605
1699 /* 1606 /*
1700 * Add the (disk blocks and inode) resources occupied by this 1607 * Add the (disk blocks and inode) resources occupied by this
@@ -1709,26 +1616,36 @@ xfs_qm_dqusage_adjust(
1709 * and quotaoffs don't race. (Quotachecks happen at mount time only). 1616 * and quotaoffs don't race. (Quotachecks happen at mount time only).
1710 */ 1617 */
1711 if (XFS_IS_UQUOTA_ON(mp)) { 1618 if (XFS_IS_UQUOTA_ON(mp)) {
1712 ASSERT(udqp); 1619 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
1713 xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); 1620 XFS_DQ_USER, nblks, rtblks);
1714 xfs_qm_dqput(udqp); 1621 if (error)
1622 goto error0;
1715 } 1623 }
1716 if (XFS_IS_OQUOTA_ON(mp)) { 1624
1717 ASSERT(gdqp); 1625 if (XFS_IS_GQUOTA_ON(mp)) {
1718 xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); 1626 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
1719 xfs_qm_dqput(gdqp); 1627 XFS_DQ_GROUP, nblks, rtblks);
1628 if (error)
1629 goto error0;
1720 } 1630 }
1721 /*
1722 * Now release the inode. This will send it to 'inactive', and
1723 * possibly even free blocks.
1724 */
1725 IRELE(ip);
1726 1631
1727 /* 1632 if (XFS_IS_PQUOTA_ON(mp)) {
1728 * Goto next inode. 1633 error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
1729 */ 1634 XFS_DQ_PROJ, nblks, rtblks);
1635 if (error)
1636 goto error0;
1637 }
1638
1639 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1640 IRELE(ip);
1730 *res = BULKSTAT_RV_DIDONE; 1641 *res = BULKSTAT_RV_DIDONE;
1731 return 0; 1642 return 0;
1643
1644error0:
1645 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1646 IRELE(ip);
1647 *res = BULKSTAT_RV_GIVEUP;
1648 return error;
1732} 1649}
1733 1650
1734/* 1651/*
@@ -2224,7 +2141,7 @@ xfs_qm_write_sb_changes(
2224 2141
2225 2142
2226/* 2143/*
2227 * Given an inode, a uid and gid (from cred_t) make sure that we have 2144 * Given an inode, a uid, gid and prid make sure that we have
2228 * allocated relevant dquot(s) on disk, and that we won't exceed inode 2145 * allocated relevant dquot(s) on disk, and that we won't exceed inode
2229 * quotas by creating this file. 2146 * quotas by creating this file.
2230 * This also attaches dquot(s) to the given inode after locking it, 2147 * This also attaches dquot(s) to the given inode after locking it,
@@ -2332,7 +2249,7 @@ xfs_qm_vop_dqalloc(
2332 xfs_dqunlock(gq); 2249 xfs_dqunlock(gq);
2333 } 2250 }
2334 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 2251 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
2335 if (ip->i_d.di_projid != prid) { 2252 if (xfs_get_projid(ip) != prid) {
2336 xfs_iunlock(ip, lockflags); 2253 xfs_iunlock(ip, lockflags);
2337 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 2254 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
2338 XFS_DQ_PROJ, 2255 XFS_DQ_PROJ,
@@ -2454,7 +2371,7 @@ xfs_qm_vop_chown_reserve(
2454 } 2371 }
2455 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 2372 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
2456 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 2373 if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
2457 ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) 2374 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
2458 prjflags = XFS_QMOPT_ENOSPC; 2375 prjflags = XFS_QMOPT_ENOSPC;
2459 2376
2460 if (prjflags || 2377 if (prjflags ||
@@ -2558,7 +2475,7 @@ xfs_qm_vop_create_dqattach(
2558 ip->i_gdquot = gdqp; 2475 ip->i_gdquot = gdqp;
2559 ASSERT(XFS_IS_OQUOTA_ON(mp)); 2476 ASSERT(XFS_IS_OQUOTA_ON(mp));
2560 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 2477 ASSERT((XFS_IS_GQUOTA_ON(mp) ?
2561 ip->i_d.di_gid : ip->i_d.di_projid) == 2478 ip->i_d.di_gid : xfs_get_projid(ip)) ==
2562 be32_to_cpu(gdqp->q_core.d_id)); 2479 be32_to_cpu(gdqp->q_core.d_id));
2563 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2480 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
2564 } 2481 }
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bea02d786c5d..45b5cb1788ab 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -81,7 +81,7 @@ xfs_qm_statvfs(
81 xfs_mount_t *mp = ip->i_mount; 81 xfs_mount_t *mp = ip->i_mount;
82 xfs_dquot_t *dqp; 82 xfs_dquot_t *dqp;
83 83
84 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { 84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); 85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
86 xfs_qm_dqput(dqp); 86 xfs_qm_dqput(dqp);
87 } 87 }
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 45e5849df238..bdebc183223e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -276,7 +276,7 @@ xfs_qm_scall_trunc_qfile(
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
279 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 279 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
281 281
282out_unlock: 282out_unlock:
@@ -875,21 +875,14 @@ xfs_dqrele_inode(
875 struct xfs_perag *pag, 875 struct xfs_perag *pag,
876 int flags) 876 int flags)
877{ 877{
878 int error;
879
880 /* skip quota inodes */ 878 /* skip quota inodes */
881 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 879 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
882 ip == ip->i_mount->m_quotainfo->qi_gquotaip) { 880 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
883 ASSERT(ip->i_udquot == NULL); 881 ASSERT(ip->i_udquot == NULL);
884 ASSERT(ip->i_gdquot == NULL); 882 ASSERT(ip->i_gdquot == NULL);
885 read_unlock(&pag->pag_ici_lock);
886 return 0; 883 return 0;
887 } 884 }
888 885
889 error = xfs_sync_inode_valid(ip, pag);
890 if (error)
891 return error;
892
893 xfs_ilock(ip, XFS_ILOCK_EXCL); 886 xfs_ilock(ip, XFS_ILOCK_EXCL);
894 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 887 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
895 xfs_qm_dqrele(ip->i_udquot); 888 xfs_qm_dqrele(ip->i_udquot);
@@ -900,8 +893,6 @@ xfs_dqrele_inode(
900 ip->i_gdquot = NULL; 893 ip->i_gdquot = NULL;
901 } 894 }
902 xfs_iunlock(ip, XFS_ILOCK_EXCL); 895 xfs_iunlock(ip, XFS_ILOCK_EXCL);
903
904 IRELE(ip);
905 return 0; 896 return 0;
906} 897}
907 898
@@ -918,8 +909,7 @@ xfs_qm_dqrele_all_inodes(
918 uint flags) 909 uint flags)
919{ 910{
920 ASSERT(mp->m_quotainfo); 911 ASSERT(mp->m_quotainfo);
921 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, 912 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
922 XFS_ICI_NO_TAG, 0, NULL);
923} 913}
924 914
925/*------------------------------------------------------------------------*/ 915/*------------------------------------------------------------------------*/
@@ -1175,7 +1165,7 @@ xfs_qm_internalqcheck_adjust(
1175 } 1165 }
1176 xfs_qm_internalqcheck_get_dquots(mp, 1166 xfs_qm_internalqcheck_get_dquots(mp,
1177 (xfs_dqid_t) ip->i_d.di_uid, 1167 (xfs_dqid_t) ip->i_d.di_uid,
1178 (xfs_dqid_t) ip->i_d.di_projid, 1168 (xfs_dqid_t) xfs_get_projid(ip),
1179 (xfs_dqid_t) ip->i_d.di_gid, 1169 (xfs_dqid_t) ip->i_d.di_gid,
1180 &ud, &gd); 1170 &ud, &gd);
1181 if (XFS_IS_UQUOTA_ON(mp)) { 1171 if (XFS_IS_UQUOTA_ON(mp)) {
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4eed4ed..63c7a1a6c022 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,15 @@ typedef struct xfs_perag {
230 rwlock_t pag_ici_lock; /* incore inode lock */ 230 rwlock_t pag_ici_lock; /* incore inode lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
233 struct mutex pag_ici_reclaim_lock; /* serialisation point */
234 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
235
236 /* buffer cache index */
237 spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
238 struct rb_root pag_buf_tree; /* ordered tree of active buffers */
239
240 /* for rcu-safe freeing */
241 struct rcu_head rcu_head;
233#endif 242#endif
234 int pagb_count; /* pagb slots in use */ 243 int pagb_count; /* pagb slots in use */
235} xfs_perag_t; 244} xfs_perag_t;
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index af168faccc7a..112abc439ca5 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -675,7 +675,7 @@ xfs_alloc_ag_vextent_near(
675 xfs_agblock_t gtbnoa; /* aligned ... */ 675 xfs_agblock_t gtbnoa; /* aligned ... */
676 xfs_extlen_t gtdiff; /* difference to right side entry */ 676 xfs_extlen_t gtdiff; /* difference to right side entry */
677 xfs_extlen_t gtlen; /* length of right side entry */ 677 xfs_extlen_t gtlen; /* length of right side entry */
678 xfs_extlen_t gtlena; /* aligned ... */ 678 xfs_extlen_t gtlena = 0; /* aligned ... */
679 xfs_agblock_t gtnew; /* useful start bno of right side */ 679 xfs_agblock_t gtnew; /* useful start bno of right side */
680 int error; /* error code */ 680 int error; /* error code */
681 int i; /* result code, temporary */ 681 int i; /* result code, temporary */
@@ -684,7 +684,7 @@ xfs_alloc_ag_vextent_near(
684 xfs_agblock_t ltbnoa; /* aligned ... */ 684 xfs_agblock_t ltbnoa; /* aligned ... */
685 xfs_extlen_t ltdiff; /* difference to left side entry */ 685 xfs_extlen_t ltdiff; /* difference to left side entry */
686 xfs_extlen_t ltlen; /* length of left side entry */ 686 xfs_extlen_t ltlen; /* length of left side entry */
687 xfs_extlen_t ltlena; /* aligned ... */ 687 xfs_extlen_t ltlena = 0; /* aligned ... */
688 xfs_agblock_t ltnew; /* useful start bno of left side */ 688 xfs_agblock_t ltnew; /* useful start bno of left side */
689 xfs_extlen_t rlen; /* length of returned extent */ 689 xfs_extlen_t rlen; /* length of returned extent */
690#if defined(DEBUG) && defined(__KERNEL__) 690#if defined(DEBUG) && defined(__KERNEL__)
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 97f7328967fd..3916925e2584 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -280,38 +280,6 @@ xfs_allocbt_key_diff(
280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
281} 281}
282 282
283STATIC int
284xfs_allocbt_kill_root(
285 struct xfs_btree_cur *cur,
286 struct xfs_buf *bp,
287 int level,
288 union xfs_btree_ptr *newroot)
289{
290 int error;
291
292 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
293 XFS_BTREE_STATS_INC(cur, killroot);
294
295 /*
296 * Update the root pointer, decreasing the level by 1 and then
297 * free the old root.
298 */
299 xfs_allocbt_set_root(cur, newroot, -1);
300 error = xfs_allocbt_free_block(cur, bp);
301 if (error) {
302 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
303 return error;
304 }
305
306 XFS_BTREE_STATS_INC(cur, free);
307
308 xfs_btree_setbuf(cur, level, NULL);
309 cur->bc_nlevels--;
310
311 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
312 return 0;
313}
314
315#ifdef DEBUG 283#ifdef DEBUG
316STATIC int 284STATIC int
317xfs_allocbt_keys_inorder( 285xfs_allocbt_keys_inorder(
@@ -423,7 +391,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
423 391
424 .dup_cursor = xfs_allocbt_dup_cursor, 392 .dup_cursor = xfs_allocbt_dup_cursor,
425 .set_root = xfs_allocbt_set_root, 393 .set_root = xfs_allocbt_set_root,
426 .kill_root = xfs_allocbt_kill_root,
427 .alloc_block = xfs_allocbt_alloc_block, 394 .alloc_block = xfs_allocbt_alloc_block,
428 .free_block = xfs_allocbt_free_block, 395 .free_block = xfs_allocbt_free_block,
429 .update_lastrec = xfs_allocbt_update_lastrec, 396 .update_lastrec = xfs_allocbt_update_lastrec,
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index c2568242a901..c86375378810 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -355,16 +355,15 @@ xfs_attr_set_int(
355 if (mp->m_flags & XFS_MOUNT_WSYNC) { 355 if (mp->m_flags & XFS_MOUNT_WSYNC) {
356 xfs_trans_set_sync(args.trans); 356 xfs_trans_set_sync(args.trans);
357 } 357 }
358
359 if (!error && (flags & ATTR_KERNOTIME) == 0) {
360 xfs_trans_ichgtime(args.trans, dp,
361 XFS_ICHGTIME_CHG);
362 }
358 err2 = xfs_trans_commit(args.trans, 363 err2 = xfs_trans_commit(args.trans,
359 XFS_TRANS_RELEASE_LOG_RES); 364 XFS_TRANS_RELEASE_LOG_RES);
360 xfs_iunlock(dp, XFS_ILOCK_EXCL); 365 xfs_iunlock(dp, XFS_ILOCK_EXCL);
361 366
362 /*
363 * Hit the inode change time.
364 */
365 if (!error && (flags & ATTR_KERNOTIME) == 0) {
366 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
367 }
368 return(error == 0 ? err2 : error); 367 return(error == 0 ? err2 : error);
369 } 368 }
370 369
@@ -420,6 +419,9 @@ xfs_attr_set_int(
420 xfs_trans_set_sync(args.trans); 419 xfs_trans_set_sync(args.trans);
421 } 420 }
422 421
422 if ((flags & ATTR_KERNOTIME) == 0)
423 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
424
423 /* 425 /*
424 * Commit the last in the sequence of transactions. 426 * Commit the last in the sequence of transactions.
425 */ 427 */
@@ -427,13 +429,6 @@ xfs_attr_set_int(
427 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 429 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
428 xfs_iunlock(dp, XFS_ILOCK_EXCL); 430 xfs_iunlock(dp, XFS_ILOCK_EXCL);
429 431
430 /*
431 * Hit the inode change time.
432 */
433 if (!error && (flags & ATTR_KERNOTIME) == 0) {
434 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
435 }
436
437 return(error); 432 return(error);
438 433
439out: 434out:
@@ -567,6 +562,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
567 xfs_trans_set_sync(args.trans); 562 xfs_trans_set_sync(args.trans);
568 } 563 }
569 564
565 if ((flags & ATTR_KERNOTIME) == 0)
566 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
567
570 /* 568 /*
571 * Commit the last in the sequence of transactions. 569 * Commit the last in the sequence of transactions.
572 */ 570 */
@@ -574,13 +572,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
574 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 572 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
575 xfs_iunlock(dp, XFS_ILOCK_EXCL); 573 xfs_iunlock(dp, XFS_ILOCK_EXCL);
576 574
577 /*
578 * Hit the inode change time.
579 */
580 if (!error && (flags & ATTR_KERNOTIME) == 0) {
581 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
582 }
583
584 return(error); 575 return(error);
585 576
586out: 577out:
@@ -1995,7 +1986,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1995 1986
1996 tmp = (valuelen < XFS_BUF_SIZE(bp)) 1987 tmp = (valuelen < XFS_BUF_SIZE(bp))
1997 ? valuelen : XFS_BUF_SIZE(bp); 1988 ? valuelen : XFS_BUF_SIZE(bp);
1998 xfs_biomove(bp, 0, tmp, dst, XBF_READ); 1989 xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
1999 xfs_buf_relse(bp); 1990 xfs_buf_relse(bp);
2000 dst += tmp; 1991 dst += tmp;
2001 valuelen -= tmp; 1992 valuelen -= tmp;
@@ -2125,9 +2116,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2125 2116
2126 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2117 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2127 XFS_BUF_SIZE(bp); 2118 XFS_BUF_SIZE(bp);
2128 xfs_biomove(bp, 0, tmp, src, XBF_WRITE); 2119 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
2129 if (tmp < XFS_BUF_SIZE(bp)) 2120 if (tmp < XFS_BUF_SIZE(bp))
2130 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2121 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2131 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2122 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
2132 return (error); 2123 return (error);
2133 } 2124 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f90dadd5a968..8abd12e32e13 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -614,7 +614,7 @@ xfs_bmap_add_extent(
614 nblks += cur->bc_private.b.allocated; 614 nblks += cur->bc_private.b.allocated;
615 ASSERT(nblks <= da_old); 615 ASSERT(nblks <= da_old);
616 if (nblks < da_old) 616 if (nblks < da_old)
617 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 617 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
618 (int64_t)(da_old - nblks), rsvd); 618 (int64_t)(da_old - nblks), rsvd);
619 } 619 }
620 /* 620 /*
@@ -1079,7 +1079,8 @@ xfs_bmap_add_extent_delay_real(
1079 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 1079 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
1080 (cur ? cur->bc_private.b.allocated : 0)); 1080 (cur ? cur->bc_private.b.allocated : 0));
1081 if (diff > 0 && 1081 if (diff > 0 &&
1082 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { 1082 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1083 -((int64_t)diff), rsvd)) {
1083 /* 1084 /*
1084 * Ick gross gag me with a spoon. 1085 * Ick gross gag me with a spoon.
1085 */ 1086 */
@@ -1089,16 +1090,18 @@ xfs_bmap_add_extent_delay_real(
1089 temp--; 1090 temp--;
1090 diff--; 1091 diff--;
1091 if (!diff || 1092 if (!diff ||
1092 !xfs_mod_incore_sb(ip->i_mount, 1093 !xfs_icsb_modify_counters(ip->i_mount,
1093 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1094 XFS_SBS_FDBLOCKS,
1095 -((int64_t)diff), rsvd))
1094 break; 1096 break;
1095 } 1097 }
1096 if (temp2) { 1098 if (temp2) {
1097 temp2--; 1099 temp2--;
1098 diff--; 1100 diff--;
1099 if (!diff || 1101 if (!diff ||
1100 !xfs_mod_incore_sb(ip->i_mount, 1102 !xfs_icsb_modify_counters(ip->i_mount,
1101 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1103 XFS_SBS_FDBLOCKS,
1104 -((int64_t)diff), rsvd))
1102 break; 1105 break;
1103 } 1106 }
1104 } 1107 }
@@ -1766,7 +1769,7 @@ xfs_bmap_add_extent_hole_delay(
1766 } 1769 }
1767 if (oldlen != newlen) { 1770 if (oldlen != newlen) {
1768 ASSERT(oldlen > newlen); 1771 ASSERT(oldlen > newlen);
1769 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 1772 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1770 (int64_t)(oldlen - newlen), rsvd); 1773 (int64_t)(oldlen - newlen), rsvd);
1771 /* 1774 /*
1772 * Nothing to do for disk quota accounting here. 1775 * Nothing to do for disk quota accounting here.
@@ -3111,9 +3114,10 @@ xfs_bmap_del_extent(
3111 * Nothing to do for disk quota accounting here. 3114 * Nothing to do for disk quota accounting here.
3112 */ 3115 */
3113 ASSERT(da_old >= da_new); 3116 ASSERT(da_old >= da_new);
3114 if (da_old > da_new) 3117 if (da_old > da_new) {
3115 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), 3118 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3116 rsvd); 3119 (int64_t)(da_old - da_new), rsvd);
3120 }
3117done: 3121done:
3118 *logflagsp = flags; 3122 *logflagsp = flags;
3119 return error; 3123 return error;
@@ -4526,13 +4530,13 @@ xfs_bmapi(
4526 -((int64_t)extsz), (flags & 4530 -((int64_t)extsz), (flags &
4527 XFS_BMAPI_RSVBLOCKS)); 4531 XFS_BMAPI_RSVBLOCKS));
4528 } else { 4532 } else {
4529 error = xfs_mod_incore_sb(mp, 4533 error = xfs_icsb_modify_counters(mp,
4530 XFS_SBS_FDBLOCKS, 4534 XFS_SBS_FDBLOCKS,
4531 -((int64_t)alen), (flags & 4535 -((int64_t)alen), (flags &
4532 XFS_BMAPI_RSVBLOCKS)); 4536 XFS_BMAPI_RSVBLOCKS));
4533 } 4537 }
4534 if (!error) { 4538 if (!error) {
4535 error = xfs_mod_incore_sb(mp, 4539 error = xfs_icsb_modify_counters(mp,
4536 XFS_SBS_FDBLOCKS, 4540 XFS_SBS_FDBLOCKS,
4537 -((int64_t)indlen), (flags & 4541 -((int64_t)indlen), (flags &
4538 XFS_BMAPI_RSVBLOCKS)); 4542 XFS_BMAPI_RSVBLOCKS));
@@ -4542,7 +4546,7 @@ xfs_bmapi(
4542 (int64_t)extsz, (flags & 4546 (int64_t)extsz, (flags &
4543 XFS_BMAPI_RSVBLOCKS)); 4547 XFS_BMAPI_RSVBLOCKS));
4544 else if (error) 4548 else if (error)
4545 xfs_mod_incore_sb(mp, 4549 xfs_icsb_modify_counters(mp,
4546 XFS_SBS_FDBLOCKS, 4550 XFS_SBS_FDBLOCKS,
4547 (int64_t)alen, (flags & 4551 (int64_t)alen, (flags &
4548 XFS_BMAPI_RSVBLOCKS)); 4552 XFS_BMAPI_RSVBLOCKS));
@@ -4744,8 +4748,12 @@ xfs_bmapi(
4744 * Check if writing previously allocated but 4748 * Check if writing previously allocated but
4745 * unwritten extents. 4749 * unwritten extents.
4746 */ 4750 */
4747 if (wr && mval->br_state == XFS_EXT_UNWRITTEN && 4751 if (wr &&
4748 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { 4752 ((mval->br_state == XFS_EXT_UNWRITTEN &&
4753 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
4754 (mval->br_state == XFS_EXT_NORM &&
4755 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
4756 (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
4749 /* 4757 /*
4750 * Modify (by adding) the state flag, if writing. 4758 * Modify (by adding) the state flag, if writing.
4751 */ 4759 */
@@ -4757,7 +4765,9 @@ xfs_bmapi(
4757 *firstblock; 4765 *firstblock;
4758 cur->bc_private.b.flist = flist; 4766 cur->bc_private.b.flist = flist;
4759 } 4767 }
4760 mval->br_state = XFS_EXT_NORM; 4768 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4769 ? XFS_EXT_NORM
4770 : XFS_EXT_UNWRITTEN;
4761 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4771 error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
4762 firstblock, flist, &tmp_logflags, 4772 firstblock, flist, &tmp_logflags,
4763 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4773 whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
@@ -5200,7 +5210,7 @@ xfs_bunmapi(
5200 ip, -((long)del.br_blockcount), 0, 5210 ip, -((long)del.br_blockcount), 0,
5201 XFS_QMOPT_RES_RTBLKS); 5211 XFS_QMOPT_RES_RTBLKS);
5202 } else { 5212 } else {
5203 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, 5213 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5204 (int64_t)del.br_blockcount, rsvd); 5214 (int64_t)del.br_blockcount, rsvd);
5205 (void)xfs_trans_reserve_quota_nblks(NULL, 5215 (void)xfs_trans_reserve_quota_nblks(NULL,
5206 ip, -((long)del.br_blockcount), 0, 5216 ip, -((long)del.br_blockcount), 0,
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index b13569a6179b..71ec9b6ecdfc 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -74,9 +74,12 @@ typedef struct xfs_bmap_free
74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ 74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
75 /* combine contig. space */ 75 /* combine contig. space */
76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ 76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
77#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */ 77/*
78 /* need write cache flushing and no */ 78 * unwritten extent conversion - this needs write cache flushing and no additional
79 /* additional allocation alignments */ 79 * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
80 * from written to unwritten, otherwise convert from unwritten to written.
81 */
82#define XFS_BMAPI_CONVERT 0x200
80 83
81#define XFS_BMAPI_FLAGS \ 84#define XFS_BMAPI_FLAGS \
82 { XFS_BMAPI_WRITE, "WRITE" }, \ 85 { XFS_BMAPI_WRITE, "WRITE" }, \
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 829af92f0fba..04f9cca8da7e 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -217,7 +217,7 @@ xfs_btree_del_cursor(
217 */ 217 */
218 for (i = 0; i < cur->bc_nlevels; i++) { 218 for (i = 0; i < cur->bc_nlevels; i++) {
219 if (cur->bc_bufs[i]) 219 if (cur->bc_bufs[i])
220 xfs_btree_setbuf(cur, i, NULL); 220 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
221 else if (!error) 221 else if (!error)
222 break; 222 break;
223 } 223 }
@@ -656,7 +656,7 @@ xfs_btree_reada_bufl(
656 656
657 ASSERT(fsbno != NULLFSBLOCK); 657 ASSERT(fsbno != NULLFSBLOCK);
658 d = XFS_FSB_TO_DADDR(mp, fsbno); 658 d = XFS_FSB_TO_DADDR(mp, fsbno);
659 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 659 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
660} 660}
661 661
662/* 662/*
@@ -676,7 +676,7 @@ xfs_btree_reada_bufs(
676 ASSERT(agno != NULLAGNUMBER); 676 ASSERT(agno != NULLAGNUMBER);
677 ASSERT(agbno != NULLAGBLOCK); 677 ASSERT(agbno != NULLAGBLOCK);
678 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 678 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
679 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 679 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
680} 680}
681 681
682STATIC int 682STATIC int
@@ -763,22 +763,19 @@ xfs_btree_readahead(
763 * Set the buffer for level "lev" in the cursor to bp, releasing 763 * Set the buffer for level "lev" in the cursor to bp, releasing
764 * any previous buffer. 764 * any previous buffer.
765 */ 765 */
766void 766STATIC void
767xfs_btree_setbuf( 767xfs_btree_setbuf(
768 xfs_btree_cur_t *cur, /* btree cursor */ 768 xfs_btree_cur_t *cur, /* btree cursor */
769 int lev, /* level in btree */ 769 int lev, /* level in btree */
770 xfs_buf_t *bp) /* new buffer to set */ 770 xfs_buf_t *bp) /* new buffer to set */
771{ 771{
772 struct xfs_btree_block *b; /* btree block */ 772 struct xfs_btree_block *b; /* btree block */
773 xfs_buf_t *obp; /* old buffer pointer */
774 773
775 obp = cur->bc_bufs[lev]; 774 if (cur->bc_bufs[lev])
776 if (obp) 775 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
777 xfs_trans_brelse(cur->bc_tp, obp);
778 cur->bc_bufs[lev] = bp; 776 cur->bc_bufs[lev] = bp;
779 cur->bc_ra[lev] = 0; 777 cur->bc_ra[lev] = 0;
780 if (!bp) 778
781 return;
782 b = XFS_BUF_TO_BLOCK(bp); 779 b = XFS_BUF_TO_BLOCK(bp);
783 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 780 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
784 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) 781 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
@@ -3011,6 +3008,43 @@ out0:
3011 return 0; 3008 return 0;
3012} 3009}
3013 3010
3011/*
3012 * Kill the current root node, and replace it with its only child node.
3013 */
3014STATIC int
3015xfs_btree_kill_root(
3016 struct xfs_btree_cur *cur,
3017 struct xfs_buf *bp,
3018 int level,
3019 union xfs_btree_ptr *newroot)
3020{
3021 int error;
3022
3023 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3024 XFS_BTREE_STATS_INC(cur, killroot);
3025
3026 /*
3027 * Update the root pointer, decreasing the level by 1 and then
3028 * free the old root.
3029 */
3030 cur->bc_ops->set_root(cur, newroot, -1);
3031
3032 error = cur->bc_ops->free_block(cur, bp);
3033 if (error) {
3034 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3035 return error;
3036 }
3037
3038 XFS_BTREE_STATS_INC(cur, free);
3039
3040 cur->bc_bufs[level] = NULL;
3041 cur->bc_ra[level] = 0;
3042 cur->bc_nlevels--;
3043
3044 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3045 return 0;
3046}
3047
3014STATIC int 3048STATIC int
3015xfs_btree_dec_cursor( 3049xfs_btree_dec_cursor(
3016 struct xfs_btree_cur *cur, 3050 struct xfs_btree_cur *cur,
@@ -3195,7 +3229,7 @@ xfs_btree_delrec(
3195 * Make it the new root of the btree. 3229 * Make it the new root of the btree.
3196 */ 3230 */
3197 pp = xfs_btree_ptr_addr(cur, 1, block); 3231 pp = xfs_btree_ptr_addr(cur, 1, block);
3198 error = cur->bc_ops->kill_root(cur, bp, level, pp); 3232 error = xfs_btree_kill_root(cur, bp, level, pp);
3199 if (error) 3233 if (error)
3200 goto error0; 3234 goto error0;
3201 } else if (level > 0) { 3235 } else if (level > 0) {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7fa07062bdda..82fafc66bd1f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -152,9 +152,7 @@ struct xfs_btree_ops {
152 152
153 /* update btree root pointer */ 153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur, 154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change); 155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158 156
159 /* block allocation / freeing */ 157 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur, 158 int (*alloc_block)(struct xfs_btree_cur *cur,
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs(
399 xfs_agblock_t agbno, /* allocation group block number */ 397 xfs_agblock_t agbno, /* allocation group block number */
400 xfs_extlen_t count); /* count of filesystem blocks */ 398 xfs_extlen_t count); /* count of filesystem blocks */
401 399
402/*
403 * Set the buffer for level "lev" in the cursor to bp, releasing
404 * any previous buffer.
405 */
406void
407xfs_btree_setbuf(
408 xfs_btree_cur_t *cur, /* btree cursor */
409 int lev, /* level in btree */
410 struct xfs_buf *bp); /* new buffer to set */
411
412 400
413/* 401/*
414 * Common btree core entry points. 402 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1b09d7a280df..2686d0d54c5b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -692,8 +692,7 @@ xfs_buf_item_init(
692 * the first. If we do already have one, there is 692 * the first. If we do already have one, there is
693 * nothing to do here so return. 693 * nothing to do here so return.
694 */ 694 */
695 if (bp->b_mount != mp) 695 ASSERT(bp->b_target->bt_mount == mp);
696 bp->b_mount = mp;
697 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 696 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
698 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 697 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
699 if (lip->li_type == XFS_LI_BUF) { 698 if (lip->li_type == XFS_LI_BUF) {
@@ -974,7 +973,7 @@ xfs_buf_iodone_callbacks(
974 xfs_buf_do_callbacks(bp, lip); 973 xfs_buf_do_callbacks(bp, lip);
975 XFS_BUF_SET_FSPRIVATE(bp, NULL); 974 XFS_BUF_SET_FSPRIVATE(bp, NULL);
976 XFS_BUF_CLR_IODONE_FUNC(bp); 975 XFS_BUF_CLR_IODONE_FUNC(bp);
977 xfs_biodone(bp); 976 xfs_buf_ioend(bp, 0);
978 return; 977 return;
979 } 978 }
980 979
@@ -1033,7 +1032,7 @@ xfs_buf_iodone_callbacks(
1033 xfs_buf_do_callbacks(bp, lip); 1032 xfs_buf_do_callbacks(bp, lip);
1034 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1033 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1035 XFS_BUF_CLR_IODONE_FUNC(bp); 1034 XFS_BUF_CLR_IODONE_FUNC(bp);
1036 xfs_biodone(bp); 1035 xfs_buf_ioend(bp, 0);
1037} 1036}
1038 1037
1039/* 1038/*
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 30fa0e206fba..1c00bedb3175 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2042,7 +2042,7 @@ xfs_da_do_buf(
2042 mappedbno, nmapped, 0, &bp); 2042 mappedbno, nmapped, 0, &bp);
2043 break; 2043 break;
2044 case 3: 2044 case 3:
2045 xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); 2045 xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
2046 error = 0; 2046 error = 0;
2047 bp = NULL; 2047 bp = NULL;
2048 break; 2048 break;
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5b153b2e6a3..dffba9ba0db6 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -49,8 +49,9 @@ typedef struct xfs_dinode {
49 __be32 di_uid; /* owner's user id */ 49 __be32 di_uid; /* owner's user id */
50 __be32 di_gid; /* owner's group id */ 50 __be32 di_gid; /* owner's group id */
51 __be32 di_nlink; /* number of links to file */ 51 __be32 di_nlink; /* number of links to file */
52 __be16 di_projid; /* owner's project id */ 52 __be16 di_projid_lo; /* lower part of owner's project id */
53 __u8 di_pad[8]; /* unused, zeroed space */ 53 __be16 di_projid_hi; /* higher part of owner's project id */
54 __u8 di_pad[6]; /* unused, zeroed space */
54 __be16 di_flushiter; /* incremented on flush */ 55 __be16 di_flushiter; /* incremented on flush */
55 xfs_timestamp_t di_atime; /* time last accessed */ 56 xfs_timestamp_t di_atime; /* time last accessed */
56 xfs_timestamp_t di_mtime; /* time last modified */ 57 xfs_timestamp_t di_mtime; /* time last modified */
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 504be8640e91..ae891223be90 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -961,7 +961,7 @@ xfs_dir2_leaf_getdents(
961 if (i > ra_current && 961 if (i > ra_current &&
962 map[ra_index].br_blockcount >= 962 map[ra_index].br_blockcount >=
963 mp->m_dirblkfsbs) { 963 mp->m_dirblkfsbs) {
964 xfs_baread(mp->m_ddev_targp, 964 xfs_buf_readahead(mp->m_ddev_targp,
965 XFS_FSB_TO_DADDR(mp, 965 XFS_FSB_TO_DADDR(mp,
966 map[ra_index].br_startblock + 966 map[ra_index].br_startblock +
967 ra_offset), 967 ra_offset),
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce5699..9124425b7f2f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
744 * If the file's parent directory is known, take its iolock in exclusive 744 * If the file's parent directory is known, take its iolock in exclusive
745 * mode to prevent two sibling files from racing each other to migrate 745 * mode to prevent two sibling files from racing each other to migrate
746 * themselves and their parent to different AGs. 746 * themselves and their parent to different AGs.
747 *
748 * Note that we lock the parent directory iolock inside the child
749 * iolock here. That's fine as we never hold both parent and child
750 * iolock in any other place. This is different from the ilock,
751 * which requires locking of the child after the parent for namespace
752 * operations.
747 */ 753 */
748 if (pip) 754 if (pip)
749 xfs_ilock(pip, XFS_IOLOCK_EXCL); 755 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
750 756
751 /* 757 /*
752 * A new AG needs to be found for the file. If the file's parent 758 * A new AG needs to be found for the file. If the file's parent
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 87c2e9d02288..8f6fc1a96386 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -293,9 +293,11 @@ typedef struct xfs_bstat {
293 __s32 bs_extsize; /* extent size */ 293 __s32 bs_extsize; /* extent size */
294 __s32 bs_extents; /* number of extents */ 294 __s32 bs_extents; /* number of extents */
295 __u32 bs_gen; /* generation count */ 295 __u32 bs_gen; /* generation count */
296 __u16 bs_projid; /* project id */ 296 __u16 bs_projid_lo; /* lower part of project id */
297#define bs_projid bs_projid_lo /* (previously just bs_projid) */
297 __u16 bs_forkoff; /* inode fork offset in bytes */ 298 __u16 bs_forkoff; /* inode fork offset in bytes */
298 unsigned char bs_pad[12]; /* pad space, unused */ 299 __u16 bs_projid_hi; /* higher part of project id */
300 unsigned char bs_pad[10]; /* pad space, unused */
299 __u32 bs_dmevmask; /* DMIG event mask */ 301 __u32 bs_dmevmask; /* DMIG event mask */
300 __u16 bs_dmstate; /* DMIG state info */ 302 __u16 bs_dmstate; /* DMIG state info */
301 __u16 bs_aextents; /* attribute number of extents */ 303 __u16 bs_aextents; /* attribute number of extents */
@@ -448,6 +450,7 @@ typedef struct xfs_handle {
448/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ 450/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
449/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ 451/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
450#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) 452#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
453#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
451 454
452/* 455/*
453 * ioctl commands that replace IRIX syssgi()'s 456 * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 43b1d5699335..a7c116e814af 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -144,12 +144,11 @@ xfs_growfs_data_private(
144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) 144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
145 return error; 145 return error;
146 dpct = pct - mp->m_sb.sb_imax_pct; 146 dpct = pct - mp->m_sb.sb_imax_pct;
147 error = xfs_read_buf(mp, mp->m_ddev_targp, 147 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
149 XFS_FSS_TO_BB(mp, 1), 0, &bp); 149 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
150 if (error) 150 if (!bp)
151 return error; 151 return EIO;
152 ASSERT(bp);
153 xfs_buf_relse(bp); 152 xfs_buf_relse(bp);
154 153
155 new = nb; /* use new as a temporary here */ 154 new = nb; /* use new as a temporary here */
@@ -597,7 +596,8 @@ out:
597 * the extra reserve blocks from the reserve..... 596 * the extra reserve blocks from the reserve.....
598 */ 597 */
599 int error; 598 int error;
600 error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); 599 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
600 fdblks_delta, 0);
601 if (error == ENOSPC) 601 if (error == ENOSPC)
602 goto retry; 602 goto retry;
603 } 603 }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5371d2dc360e..0626a32c3447 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -212,7 +212,7 @@ xfs_ialloc_inode_init(
212 * to log a whole cluster of inodes instead of all the 212 * to log a whole cluster of inodes instead of all the
213 * individual transactions causing a lot of log traffic. 213 * individual transactions causing a lot of log traffic.
214 */ 214 */
215 xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); 215 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
216 for (i = 0; i < ninodes; i++) { 216 for (i = 0; i < ninodes; i++) {
217 int ioffset = i << mp->m_sb.sb_inodelog; 217 int ioffset = i << mp->m_sb.sb_inodelog;
218 uint isize = sizeof(struct xfs_dinode); 218 uint isize = sizeof(struct xfs_dinode);
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index d352862cefa0..16921f55c542 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -183,38 +183,6 @@ xfs_inobt_key_diff(
183 cur->bc_rec.i.ir_startino; 183 cur->bc_rec.i.ir_startino;
184} 184}
185 185
186STATIC int
187xfs_inobt_kill_root(
188 struct xfs_btree_cur *cur,
189 struct xfs_buf *bp,
190 int level,
191 union xfs_btree_ptr *newroot)
192{
193 int error;
194
195 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
196 XFS_BTREE_STATS_INC(cur, killroot);
197
198 /*
199 * Update the root pointer, decreasing the level by 1 and then
200 * free the old root.
201 */
202 xfs_inobt_set_root(cur, newroot, -1);
203 error = xfs_inobt_free_block(cur, bp);
204 if (error) {
205 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
206 return error;
207 }
208
209 XFS_BTREE_STATS_INC(cur, free);
210
211 cur->bc_bufs[level] = NULL;
212 cur->bc_nlevels--;
213
214 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
215 return 0;
216}
217
218#ifdef DEBUG 186#ifdef DEBUG
219STATIC int 187STATIC int
220xfs_inobt_keys_inorder( 188xfs_inobt_keys_inorder(
@@ -309,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
309 277
310 .dup_cursor = xfs_inobt_dup_cursor, 278 .dup_cursor = xfs_inobt_dup_cursor,
311 .set_root = xfs_inobt_set_root, 279 .set_root = xfs_inobt_set_root,
312 .kill_root = xfs_inobt_kill_root,
313 .alloc_block = xfs_inobt_alloc_block, 280 .alloc_block = xfs_inobt_alloc_block,
314 .free_block = xfs_inobt_free_block, 281 .free_block = xfs_inobt_free_block,
315 .get_minrecs = xfs_inobt_get_minrecs, 282 .get_minrecs = xfs_inobt_get_minrecs,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b1ecc6f97ade..0cdd26932d8e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -365,8 +365,8 @@ xfs_iget(
365 xfs_perag_t *pag; 365 xfs_perag_t *pag;
366 xfs_agino_t agino; 366 xfs_agino_t agino;
367 367
368 /* the radix tree exists only in inode capable AGs */ 368 /* reject inode numbers outside existing AGs */
369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) 369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
370 return EINVAL; 370 return EINVAL;
371 371
372 /* get the perag structure and ensure that it's inode capable */ 372 /* get the perag structure and ensure that it's inode capable */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34798f391c49..108c7a085f94 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -660,7 +660,8 @@ xfs_dinode_from_disk(
660 to->di_uid = be32_to_cpu(from->di_uid); 660 to->di_uid = be32_to_cpu(from->di_uid);
661 to->di_gid = be32_to_cpu(from->di_gid); 661 to->di_gid = be32_to_cpu(from->di_gid);
662 to->di_nlink = be32_to_cpu(from->di_nlink); 662 to->di_nlink = be32_to_cpu(from->di_nlink);
663 to->di_projid = be16_to_cpu(from->di_projid); 663 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
664 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
664 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 665 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
665 to->di_flushiter = be16_to_cpu(from->di_flushiter); 666 to->di_flushiter = be16_to_cpu(from->di_flushiter);
666 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); 667 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
@@ -695,7 +696,8 @@ xfs_dinode_to_disk(
695 to->di_uid = cpu_to_be32(from->di_uid); 696 to->di_uid = cpu_to_be32(from->di_uid);
696 to->di_gid = cpu_to_be32(from->di_gid); 697 to->di_gid = cpu_to_be32(from->di_gid);
697 to->di_nlink = cpu_to_be32(from->di_nlink); 698 to->di_nlink = cpu_to_be32(from->di_nlink);
698 to->di_projid = cpu_to_be16(from->di_projid); 699 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
700 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
699 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 701 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
700 to->di_flushiter = cpu_to_be16(from->di_flushiter); 702 to->di_flushiter = cpu_to_be16(from->di_flushiter);
701 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 703 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
@@ -874,7 +876,7 @@ xfs_iread(
874 if (ip->i_d.di_version == 1) { 876 if (ip->i_d.di_version == 1) {
875 ip->i_d.di_nlink = ip->i_d.di_onlink; 877 ip->i_d.di_nlink = ip->i_d.di_onlink;
876 ip->i_d.di_onlink = 0; 878 ip->i_d.di_onlink = 0;
877 ip->i_d.di_projid = 0; 879 xfs_set_projid(ip, 0);
878 } 880 }
879 881
880 ip->i_delayed_blks = 0; 882 ip->i_delayed_blks = 0;
@@ -982,8 +984,7 @@ xfs_ialloc(
982 mode_t mode, 984 mode_t mode,
983 xfs_nlink_t nlink, 985 xfs_nlink_t nlink,
984 xfs_dev_t rdev, 986 xfs_dev_t rdev,
985 cred_t *cr, 987 prid_t prid,
986 xfs_prid_t prid,
987 int okalloc, 988 int okalloc,
988 xfs_buf_t **ialloc_context, 989 xfs_buf_t **ialloc_context,
989 boolean_t *call_again, 990 boolean_t *call_again,
@@ -1027,7 +1028,7 @@ xfs_ialloc(
1027 ASSERT(ip->i_d.di_nlink == nlink); 1028 ASSERT(ip->i_d.di_nlink == nlink);
1028 ip->i_d.di_uid = current_fsuid(); 1029 ip->i_d.di_uid = current_fsuid();
1029 ip->i_d.di_gid = current_fsgid(); 1030 ip->i_d.di_gid = current_fsgid();
1030 ip->i_d.di_projid = prid; 1031 xfs_set_projid(ip, prid);
1031 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1032 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1032 1033
1033 /* 1034 /*
@@ -2725,7 +2726,7 @@ cluster_corrupt_out:
2725 XFS_BUF_UNDONE(bp); 2726 XFS_BUF_UNDONE(bp);
2726 XFS_BUF_STALE(bp); 2727 XFS_BUF_STALE(bp);
2727 XFS_BUF_ERROR(bp,EIO); 2728 XFS_BUF_ERROR(bp,EIO);
2728 xfs_biodone(bp); 2729 xfs_buf_ioend(bp, 0);
2729 } else { 2730 } else {
2730 XFS_BUF_STALE(bp); 2731 XFS_BUF_STALE(bp);
2731 xfs_buf_relse(bp); 2732 xfs_buf_relse(bp);
@@ -3008,7 +3009,7 @@ xfs_iflush_int(
3008 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3009 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3009 memset(&(dip->di_pad[0]), 0, 3010 memset(&(dip->di_pad[0]), 0,
3010 sizeof(dip->di_pad)); 3011 sizeof(dip->di_pad));
3011 ASSERT(ip->i_d.di_projid == 0); 3012 ASSERT(xfs_get_projid(ip) == 0);
3012 } 3013 }
3013 } 3014 }
3014 3015
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0898c5417d12..fb2ca2e4cdc9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -134,8 +134,9 @@ typedef struct xfs_icdinode {
134 __uint32_t di_uid; /* owner's user id */ 134 __uint32_t di_uid; /* owner's user id */
135 __uint32_t di_gid; /* owner's group id */ 135 __uint32_t di_gid; /* owner's group id */
136 __uint32_t di_nlink; /* number of links to file */ 136 __uint32_t di_nlink; /* number of links to file */
137 __uint16_t di_projid; /* owner's project id */ 137 __uint16_t di_projid_lo; /* lower part of owner's project id */
138 __uint8_t di_pad[8]; /* unused, zeroed space */ 138 __uint16_t di_projid_hi; /* higher part of owner's project id */
139 __uint8_t di_pad[6]; /* unused, zeroed space */
139 __uint16_t di_flushiter; /* incremented on flush */ 140 __uint16_t di_flushiter; /* incremented on flush */
140 xfs_ictimestamp_t di_atime; /* time last accessed */ 141 xfs_ictimestamp_t di_atime; /* time last accessed */
141 xfs_ictimestamp_t di_mtime; /* time last modified */ 142 xfs_ictimestamp_t di_mtime; /* time last modified */
@@ -212,7 +213,6 @@ typedef struct xfs_icdinode {
212#ifdef __KERNEL__ 213#ifdef __KERNEL__
213 214
214struct bhv_desc; 215struct bhv_desc;
215struct cred;
216struct xfs_buf; 216struct xfs_buf;
217struct xfs_bmap_free; 217struct xfs_bmap_free;
218struct xfs_bmbt_irec; 218struct xfs_bmbt_irec;
@@ -335,6 +335,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
335} 335}
336 336
337/* 337/*
338 * Project quota id helpers (previously projid was 16bit only
339 * and using two 16bit values to hold new 32bit projid was choosen
340 * to retain compatibility with "old" filesystems).
341 */
342static inline prid_t
343xfs_get_projid(struct xfs_inode *ip)
344{
345 return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
346}
347
348static inline void
349xfs_set_projid(struct xfs_inode *ip,
350 prid_t projid)
351{
352 ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
353 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
354}
355
356/*
338 * Manage the i_flush queue embedded in the inode. This completion 357 * Manage the i_flush queue embedded in the inode. This completion
339 * queue synchronizes processes attempting to flush the in-core 358 * queue synchronizes processes attempting to flush the in-core
340 * inode back to disk. 359 * inode back to disk.
@@ -456,8 +475,8 @@ void xfs_inode_free(struct xfs_inode *ip);
456 * xfs_inode.c prototypes. 475 * xfs_inode.c prototypes.
457 */ 476 */
458int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 477int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
459 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, 478 xfs_nlink_t, xfs_dev_t, prid_t, int,
460 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 479 struct xfs_buf **, boolean_t *, xfs_inode_t **);
461 480
462uint xfs_ip2xflags(struct xfs_inode *); 481uint xfs_ip2xflags(struct xfs_inode *);
463uint xfs_dic2xflags(struct xfs_dinode *); 482uint xfs_dic2xflags(struct xfs_dinode *);
@@ -471,7 +490,6 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
471void xfs_iext_realloc(xfs_inode_t *, int, int); 490void xfs_iext_realloc(xfs_inode_t *, int, int);
472void xfs_iunpin_wait(xfs_inode_t *); 491void xfs_iunpin_wait(xfs_inode_t *);
473int xfs_iflush(xfs_inode_t *, uint); 492int xfs_iflush(xfs_inode_t *, uint);
474void xfs_ichgtime(xfs_inode_t *, int);
475void xfs_lock_inodes(xfs_inode_t **, int, uint); 493void xfs_lock_inodes(xfs_inode_t **, int, uint);
476void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 494void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
477 495
@@ -482,7 +500,7 @@ void xfs_mark_inode_dirty_sync(xfs_inode_t *);
482#define IHOLD(ip) \ 500#define IHOLD(ip) \
483do { \ 501do { \
484 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 502 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
485 atomic_inc(&(VFS_I(ip)->i_count)); \ 503 ihold(VFS_I(ip)); \
486 trace_xfs_ihold(ip, _THIS_IP_); \ 504 trace_xfs_ihold(ip, _THIS_IP_); \
487} while (0) 505} while (0)
488 506
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fe00777e2796..c7ac020705df 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -223,15 +223,6 @@ xfs_inode_item_format(
223 nvecs = 1; 223 nvecs = 1;
224 224
225 /* 225 /*
226 * Make sure the linux inode is dirty. We do this before
227 * clearing i_update_core as the VFS will call back into
228 * XFS here and set i_update_core, so we need to dirty the
229 * inode first so that the ordering of i_update_core and
230 * unlogged modifications still works as described below.
231 */
232 xfs_mark_inode_dirty_sync(ip);
233
234 /*
235 * Clear i_update_core if the timestamps (or any other 226 * Clear i_update_core if the timestamps (or any other
236 * non-transactional modification) need flushing/logging 227 * non-transactional modification) need flushing/logging
237 * and we're about to log them with the rest of the core. 228 * and we're about to log them with the rest of the core.
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7e3626e5925c..dc1882adaf54 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -92,7 +92,8 @@ xfs_bulkstat_one_int(
92 * further change. 92 * further change.
93 */ 93 */
94 buf->bs_nlink = dic->di_nlink; 94 buf->bs_nlink = dic->di_nlink;
95 buf->bs_projid = dic->di_projid; 95 buf->bs_projid_lo = dic->di_projid_lo;
96 buf->bs_projid_hi = dic->di_projid_hi;
96 buf->bs_ino = ino; 97 buf->bs_ino = ino;
97 buf->bs_mode = dic->di_mode; 98 buf->bs_mode = dic->di_mode;
98 buf->bs_uid = dic->di_uid; 99 buf->bs_uid = dic->di_uid;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 33f718f92a48..cee4ab9f8a9e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -917,19 +917,6 @@ xlog_iodone(xfs_buf_t *bp)
917 l = iclog->ic_log; 917 l = iclog->ic_log;
918 918
919 /* 919 /*
920 * If the _XFS_BARRIER_FAILED flag was set by a lower
921 * layer, it means the underlying device no longer supports
922 * barrier I/O. Warn loudly and turn off barriers.
923 */
924 if (bp->b_flags & _XFS_BARRIER_FAILED) {
925 bp->b_flags &= ~_XFS_BARRIER_FAILED;
926 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
927 xfs_fs_cmn_err(CE_WARN, l->l_mp,
928 "xlog_iodone: Barriers are no longer supported"
929 " by device. Disabling barriers\n");
930 }
931
932 /*
933 * Race to shutdown the filesystem if we see an error. 920 * Race to shutdown the filesystem if we see an error.
934 */ 921 */
935 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, 922 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
@@ -1131,7 +1118,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1131 iclog->ic_prev = prev_iclog; 1118 iclog->ic_prev = prev_iclog;
1132 prev_iclog = iclog; 1119 prev_iclog = iclog;
1133 1120
1134 bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); 1121 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1122 log->l_iclog_size, 0);
1135 if (!bp) 1123 if (!bp)
1136 goto out_free_iclog; 1124 goto out_free_iclog;
1137 if (!XFS_BUF_CPSEMA(bp)) 1125 if (!XFS_BUF_CPSEMA(bp))
@@ -1309,7 +1297,7 @@ xlog_bdstrat(
1309 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1297 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1310 XFS_BUF_ERROR(bp, EIO); 1298 XFS_BUF_ERROR(bp, EIO);
1311 XFS_BUF_STALE(bp); 1299 XFS_BUF_STALE(bp);
1312 xfs_biodone(bp); 1300 xfs_buf_ioend(bp, 0);
1313 /* 1301 /*
1314 * It would seem logical to return EIO here, but we rely on 1302 * It would seem logical to return EIO here, but we rely on
1315 * the log state machine to propagate I/O errors instead of 1303 * the log state machine to propagate I/O errors instead of
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7e206fc1fa36..23d6ceb5e97b 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -146,102 +146,6 @@ xlog_cil_init_post_recovery(
146} 146}
147 147
148/* 148/*
149 * Insert the log item into the CIL and calculate the difference in space
150 * consumed by the item. Add the space to the checkpoint ticket and calculate
151 * if the change requires additional log metadata. If it does, take that space
152 * as well. Remove the amount of space we added to the checkpoint ticket from
153 * the current transaction ticket so that the accounting works out correctly.
154 *
155 * If this is the first time the item is being placed into the CIL in this
156 * context, pin it so it can't be written to disk until the CIL is flushed to
157 * the iclog and the iclog written to disk.
158 */
159static void
160xlog_cil_insert(
161 struct log *log,
162 struct xlog_ticket *ticket,
163 struct xfs_log_item *item,
164 struct xfs_log_vec *lv)
165{
166 struct xfs_cil *cil = log->l_cilp;
167 struct xfs_log_vec *old = lv->lv_item->li_lv;
168 struct xfs_cil_ctx *ctx = cil->xc_ctx;
169 int len;
170 int diff_iovecs;
171 int iclog_space;
172
173 if (old) {
174 /* existing lv on log item, space used is a delta */
175 ASSERT(!list_empty(&item->li_cil));
176 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
177
178 len = lv->lv_buf_len - old->lv_buf_len;
179 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
180 kmem_free(old->lv_buf);
181 kmem_free(old);
182 } else {
183 /* new lv, must pin the log item */
184 ASSERT(!lv->lv_item->li_lv);
185 ASSERT(list_empty(&item->li_cil));
186
187 len = lv->lv_buf_len;
188 diff_iovecs = lv->lv_niovecs;
189 IOP_PIN(lv->lv_item);
190
191 }
192 len += diff_iovecs * sizeof(xlog_op_header_t);
193
194 /* attach new log vector to log item */
195 lv->lv_item->li_lv = lv;
196
197 spin_lock(&cil->xc_cil_lock);
198 list_move_tail(&item->li_cil, &cil->xc_cil);
199 ctx->nvecs += diff_iovecs;
200
201 /*
202 * If this is the first time the item is being committed to the CIL,
203 * store the sequence number on the log item so we can tell
204 * in future commits whether this is the first checkpoint the item is
205 * being committed into.
206 */
207 if (!item->li_seq)
208 item->li_seq = ctx->sequence;
209
210 /*
211 * Now transfer enough transaction reservation to the context ticket
212 * for the checkpoint. The context ticket is special - the unit
213 * reservation has to grow as well as the current reservation as we
214 * steal from tickets so we can correctly determine the space used
215 * during the transaction commit.
216 */
217 if (ctx->ticket->t_curr_res == 0) {
218 /* first commit in checkpoint, steal the header reservation */
219 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
220 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
221 ticket->t_curr_res -= ctx->ticket->t_unit_res;
222 }
223
224 /* do we need space for more log record headers? */
225 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
226 if (len > 0 && (ctx->space_used / iclog_space !=
227 (ctx->space_used + len) / iclog_space)) {
228 int hdrs;
229
230 hdrs = (len + iclog_space - 1) / iclog_space;
231 /* need to take into account split region headers, too */
232 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
233 ctx->ticket->t_unit_res += hdrs;
234 ctx->ticket->t_curr_res += hdrs;
235 ticket->t_curr_res -= hdrs;
236 ASSERT(ticket->t_curr_res >= len);
237 }
238 ticket->t_curr_res -= len;
239 ctx->space_used += len;
240
241 spin_unlock(&cil->xc_cil_lock);
242}
243
244/*
245 * Format log item into a flat buffer 149 * Format log item into a flat buffer
246 * 150 *
247 * For delayed logging, we need to hold a formatted buffer containing all the 151 * For delayed logging, we need to hold a formatted buffer containing all the
@@ -286,7 +190,7 @@ xlog_cil_format_items(
286 len += lv->lv_iovecp[index].i_len; 190 len += lv->lv_iovecp[index].i_len;
287 191
288 lv->lv_buf_len = len; 192 lv->lv_buf_len = len;
289 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); 193 lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
290 ptr = lv->lv_buf; 194 ptr = lv->lv_buf;
291 195
292 for (index = 0; index < lv->lv_niovecs; index++) { 196 for (index = 0; index < lv->lv_niovecs; index++) {
@@ -300,21 +204,136 @@ xlog_cil_format_items(
300 } 204 }
301} 205}
302 206
207/*
208 * Prepare the log item for insertion into the CIL. Calculate the difference in
209 * log space and vectors it will consume, and if it is a new item pin it as
210 * well.
211 */
212STATIC void
213xfs_cil_prepare_item(
214 struct log *log,
215 struct xfs_log_vec *lv,
216 int *len,
217 int *diff_iovecs)
218{
219 struct xfs_log_vec *old = lv->lv_item->li_lv;
220
221 if (old) {
222 /* existing lv on log item, space used is a delta */
223 ASSERT(!list_empty(&lv->lv_item->li_cil));
224 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
225
226 *len += lv->lv_buf_len - old->lv_buf_len;
227 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
228 kmem_free(old->lv_buf);
229 kmem_free(old);
230 } else {
231 /* new lv, must pin the log item */
232 ASSERT(!lv->lv_item->li_lv);
233 ASSERT(list_empty(&lv->lv_item->li_cil));
234
235 *len += lv->lv_buf_len;
236 *diff_iovecs += lv->lv_niovecs;
237 IOP_PIN(lv->lv_item);
238
239 }
240
241 /* attach new log vector to log item */
242 lv->lv_item->li_lv = lv;
243
244 /*
245 * If this is the first time the item is being committed to the
246 * CIL, store the sequence number on the log item so we can
247 * tell in future commits whether this is the first checkpoint
248 * the item is being committed into.
249 */
250 if (!lv->lv_item->li_seq)
251 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
252}
253
254/*
255 * Insert the log items into the CIL and calculate the difference in space
256 * consumed by the item. Add the space to the checkpoint ticket and calculate
257 * if the change requires additional log metadata. If it does, take that space
258 * as well. Remove the amount of space we added to the checkpoint ticket from
259 * the current transaction ticket so that the accounting works out correctly.
260 */
303static void 261static void
304xlog_cil_insert_items( 262xlog_cil_insert_items(
305 struct log *log, 263 struct log *log,
306 struct xfs_log_vec *log_vector, 264 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket, 265 struct xlog_ticket *ticket)
308 xfs_lsn_t *start_lsn)
309{ 266{
310 struct xfs_log_vec *lv; 267 struct xfs_cil *cil = log->l_cilp;
311 268 struct xfs_cil_ctx *ctx = cil->xc_ctx;
312 if (start_lsn) 269 struct xfs_log_vec *lv;
313 *start_lsn = log->l_cilp->xc_ctx->sequence; 270 int len = 0;
271 int diff_iovecs = 0;
272 int iclog_space;
314 273
315 ASSERT(log_vector); 274 ASSERT(log_vector);
275
276 /*
277 * Do all the accounting aggregation and switching of log vectors
278 * around in a separate loop to the insertion of items into the CIL.
279 * Then we can do a separate loop to update the CIL within a single
280 * lock/unlock pair. This reduces the number of round trips on the CIL
281 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
282 * hold time for the transaction commit.
283 *
284 * If this is the first time the item is being placed into the CIL in
285 * this context, pin it so it can't be written to disk until the CIL is
286 * flushed to the iclog and the iclog written to disk.
287 *
288 * We can do this safely because the context can't checkpoint until we
289 * are done so it doesn't matter exactly how we update the CIL.
290 */
291 for (lv = log_vector; lv; lv = lv->lv_next)
292 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
293
294 /* account for space used by new iovec headers */
295 len += diff_iovecs * sizeof(xlog_op_header_t);
296
297 spin_lock(&cil->xc_cil_lock);
298
299 /* move the items to the tail of the CIL */
316 for (lv = log_vector; lv; lv = lv->lv_next) 300 for (lv = log_vector; lv; lv = lv->lv_next)
317 xlog_cil_insert(log, ticket, lv->lv_item, lv); 301 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
302
303 ctx->nvecs += diff_iovecs;
304
305 /*
306 * Now transfer enough transaction reservation to the context ticket
307 * for the checkpoint. The context ticket is special - the unit
308 * reservation has to grow as well as the current reservation as we
309 * steal from tickets so we can correctly determine the space used
310 * during the transaction commit.
311 */
312 if (ctx->ticket->t_curr_res == 0) {
313 /* first commit in checkpoint, steal the header reservation */
314 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
315 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
316 ticket->t_curr_res -= ctx->ticket->t_unit_res;
317 }
318
319 /* do we need space for more log record headers? */
320 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
321 if (len > 0 && (ctx->space_used / iclog_space !=
322 (ctx->space_used + len) / iclog_space)) {
323 int hdrs;
324
325 hdrs = (len + iclog_space - 1) / iclog_space;
326 /* need to take into account split region headers, too */
327 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
328 ctx->ticket->t_unit_res += hdrs;
329 ctx->ticket->t_curr_res += hdrs;
330 ticket->t_curr_res -= hdrs;
331 ASSERT(ticket->t_curr_res >= len);
332 }
333 ticket->t_curr_res -= len;
334 ctx->space_used += len;
335
336 spin_unlock(&cil->xc_cil_lock);
318} 337}
319 338
320static void 339static void
@@ -638,7 +657,10 @@ xfs_log_commit_cil(
638 657
639 /* lock out background commit */ 658 /* lock out background commit */
640 down_read(&log->l_cilp->xc_ctx_lock); 659 down_read(&log->l_cilp->xc_ctx_lock);
641 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); 660 if (commit_lsn)
661 *commit_lsn = log->l_cilp->xc_ctx->sequence;
662
663 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
642 664
643 /* check we didn't blow the reservation */ 665 /* check we didn't blow the reservation */
644 if (tp->t_ticket->t_curr_res < 0) 666 if (tp->t_ticket->t_curr_res < 0)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6f3f5fa37acf..966d3f97458c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -107,7 +107,8 @@ xlog_get_bp(
107 nbblks += log->l_sectBBsize; 107 nbblks += log->l_sectBBsize;
108 nbblks = round_up(nbblks, log->l_sectBBsize); 108 nbblks = round_up(nbblks, log->l_sectBBsize);
109 109
110 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 110 return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
111 BBTOB(nbblks), 0);
111} 112}
112 113
113STATIC void 114STATIC void
@@ -167,7 +168,7 @@ xlog_bread_noalign(
167 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 168 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
168 169
169 xfsbdstrat(log->l_mp, bp); 170 xfsbdstrat(log->l_mp, bp);
170 error = xfs_iowait(bp); 171 error = xfs_buf_iowait(bp);
171 if (error) 172 if (error)
172 xfs_ioerror_alert("xlog_bread", log->l_mp, 173 xfs_ioerror_alert("xlog_bread", log->l_mp,
173 bp, XFS_BUF_ADDR(bp)); 174 bp, XFS_BUF_ADDR(bp));
@@ -321,12 +322,13 @@ xlog_recover_iodone(
321 * this during recovery. One strike! 322 * this during recovery. One strike!
322 */ 323 */
323 xfs_ioerror_alert("xlog_recover_iodone", 324 xfs_ioerror_alert("xlog_recover_iodone",
324 bp->b_mount, bp, XFS_BUF_ADDR(bp)); 325 bp->b_target->bt_mount, bp,
325 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 326 XFS_BUF_ADDR(bp));
327 xfs_force_shutdown(bp->b_target->bt_mount,
328 SHUTDOWN_META_IO_ERROR);
326 } 329 }
327 bp->b_mount = NULL;
328 XFS_BUF_CLR_IODONE_FUNC(bp); 330 XFS_BUF_CLR_IODONE_FUNC(bp);
329 xfs_biodone(bp); 331 xfs_buf_ioend(bp, 0);
330} 332}
331 333
332/* 334/*
@@ -2275,8 +2277,7 @@ xlog_recover_do_buffer_trans(
2275 XFS_BUF_STALE(bp); 2277 XFS_BUF_STALE(bp);
2276 error = xfs_bwrite(mp, bp); 2278 error = xfs_bwrite(mp, bp);
2277 } else { 2279 } else {
2278 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2280 ASSERT(bp->b_target->bt_mount == mp);
2279 bp->b_mount = mp;
2280 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2281 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2281 xfs_bdwrite(mp, bp); 2282 xfs_bdwrite(mp, bp);
2282 } 2283 }
@@ -2540,8 +2541,7 @@ xlog_recover_do_inode_trans(
2540 } 2541 }
2541 2542
2542write_inode_buffer: 2543write_inode_buffer:
2543 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2544 ASSERT(bp->b_target->bt_mount == mp);
2544 bp->b_mount = mp;
2545 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2545 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2546 xfs_bdwrite(mp, bp); 2546 xfs_bdwrite(mp, bp);
2547error: 2547error:
@@ -2678,8 +2678,7 @@ xlog_recover_do_dquot_trans(
2678 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2678 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2679 2679
2680 ASSERT(dq_f->qlf_size == 2); 2680 ASSERT(dq_f->qlf_size == 2);
2681 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2681 ASSERT(bp->b_target->bt_mount == mp);
2682 bp->b_mount = mp;
2683 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2682 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2684 xfs_bdwrite(mp, bp); 2683 xfs_bdwrite(mp, bp);
2685 2684
@@ -3817,7 +3816,7 @@ xlog_do_recover(
3817 XFS_BUF_READ(bp); 3816 XFS_BUF_READ(bp);
3818 XFS_BUF_UNASYNC(bp); 3817 XFS_BUF_UNASYNC(bp);
3819 xfsbdstrat(log->l_mp, bp); 3818 xfsbdstrat(log->l_mp, bp);
3820 error = xfs_iowait(bp); 3819 error = xfs_buf_iowait(bp);
3821 if (error) { 3820 if (error) {
3822 xfs_ioerror_alert("xlog_do_recover", 3821 xfs_ioerror_alert("xlog_do_recover",
3823 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3822 log->l_mp, bp, XFS_BUF_ADDR(bp));
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aeb9d72ebf6e..19e9dfa1c254 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -52,16 +52,11 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
52 int); 52 int);
53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, 53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
54 int); 54 int);
55STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
56 int64_t, int);
57STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); 55STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
58
59#else 56#else
60 57
61#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) 58#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
62#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) 59#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
63#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0)
64
65#endif 60#endif
66 61
67static const struct { 62static const struct {
@@ -199,6 +194,8 @@ xfs_uuid_unmount(
199 194
200/* 195/*
201 * Reference counting access wrappers to the perag structures. 196 * Reference counting access wrappers to the perag structures.
197 * Because we never free per-ag structures, the only thing we
198 * have to protect against changes is the tree structure itself.
202 */ 199 */
203struct xfs_perag * 200struct xfs_perag *
204xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) 201xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -206,19 +203,43 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
206 struct xfs_perag *pag; 203 struct xfs_perag *pag;
207 int ref = 0; 204 int ref = 0;
208 205
209 spin_lock(&mp->m_perag_lock); 206 rcu_read_lock();
210 pag = radix_tree_lookup(&mp->m_perag_tree, agno); 207 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
211 if (pag) { 208 if (pag) {
212 ASSERT(atomic_read(&pag->pag_ref) >= 0); 209 ASSERT(atomic_read(&pag->pag_ref) >= 0);
213 /* catch leaks in the positive direction during testing */
214 ASSERT(atomic_read(&pag->pag_ref) < 1000);
215 ref = atomic_inc_return(&pag->pag_ref); 210 ref = atomic_inc_return(&pag->pag_ref);
216 } 211 }
217 spin_unlock(&mp->m_perag_lock); 212 rcu_read_unlock();
218 trace_xfs_perag_get(mp, agno, ref, _RET_IP_); 213 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
219 return pag; 214 return pag;
220} 215}
221 216
217/*
218 * search from @first to find the next perag with the given tag set.
219 */
220struct xfs_perag *
221xfs_perag_get_tag(
222 struct xfs_mount *mp,
223 xfs_agnumber_t first,
224 int tag)
225{
226 struct xfs_perag *pag;
227 int found;
228 int ref;
229
230 rcu_read_lock();
231 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
232 (void **)&pag, first, 1, tag);
233 if (found <= 0) {
234 rcu_read_unlock();
235 return NULL;
236 }
237 ref = atomic_inc_return(&pag->pag_ref);
238 rcu_read_unlock();
239 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
240 return pag;
241}
242
222void 243void
223xfs_perag_put(struct xfs_perag *pag) 244xfs_perag_put(struct xfs_perag *pag)
224{ 245{
@@ -229,10 +250,18 @@ xfs_perag_put(struct xfs_perag *pag)
229 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); 250 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
230} 251}
231 252
253STATIC void
254__xfs_free_perag(
255 struct rcu_head *head)
256{
257 struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
258
259 ASSERT(atomic_read(&pag->pag_ref) == 0);
260 kmem_free(pag);
261}
262
232/* 263/*
233 * Free up the resources associated with a mount structure. Assume that 264 * Free up the per-ag resources associated with the mount structure.
234 * the structure was initially zeroed, so we can tell which fields got
235 * initialized.
236 */ 265 */
237STATIC void 266STATIC void
238xfs_free_perag( 267xfs_free_perag(
@@ -244,10 +273,10 @@ xfs_free_perag(
244 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 273 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
245 spin_lock(&mp->m_perag_lock); 274 spin_lock(&mp->m_perag_lock);
246 pag = radix_tree_delete(&mp->m_perag_tree, agno); 275 pag = radix_tree_delete(&mp->m_perag_tree, agno);
276 spin_unlock(&mp->m_perag_lock);
247 ASSERT(pag); 277 ASSERT(pag);
248 ASSERT(atomic_read(&pag->pag_ref) == 0); 278 ASSERT(atomic_read(&pag->pag_ref) == 0);
249 spin_unlock(&mp->m_perag_lock); 279 call_rcu(&pag->rcu_head, __xfs_free_perag);
250 kmem_free(pag);
251 } 280 }
252} 281}
253 282
@@ -444,7 +473,10 @@ xfs_initialize_perag(
444 pag->pag_agno = index; 473 pag->pag_agno = index;
445 pag->pag_mount = mp; 474 pag->pag_mount = mp;
446 rwlock_init(&pag->pag_ici_lock); 475 rwlock_init(&pag->pag_ici_lock);
476 mutex_init(&pag->pag_ici_reclaim_lock);
447 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
478 spin_lock_init(&pag->pag_buf_lock);
479 pag->pag_buf_tree = RB_ROOT;
448 480
449 if (radix_tree_preload(GFP_NOFS)) 481 if (radix_tree_preload(GFP_NOFS))
450 goto out_unwind; 482 goto out_unwind;
@@ -639,7 +671,6 @@ int
639xfs_readsb(xfs_mount_t *mp, int flags) 671xfs_readsb(xfs_mount_t *mp, int flags)
640{ 672{
641 unsigned int sector_size; 673 unsigned int sector_size;
642 unsigned int extra_flags;
643 xfs_buf_t *bp; 674 xfs_buf_t *bp;
644 int error; 675 int error;
645 676
@@ -652,28 +683,24 @@ xfs_readsb(xfs_mount_t *mp, int flags)
652 * access to the superblock. 683 * access to the superblock.
653 */ 684 */
654 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 685 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
655 extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
656 686
657 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 687reread:
658 extra_flags); 688 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
659 if (!bp || XFS_BUF_ISERROR(bp)) { 689 XFS_SB_DADDR, sector_size, 0);
660 xfs_fs_mount_cmn_err(flags, "SB read failed"); 690 if (!bp) {
661 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; 691 xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
662 goto fail; 692 return EIO;
663 } 693 }
664 ASSERT(XFS_BUF_ISBUSY(bp));
665 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
666 694
667 /* 695 /*
668 * Initialize the mount structure from the superblock. 696 * Initialize the mount structure from the superblock.
669 * But first do some basic consistency checking. 697 * But first do some basic consistency checking.
670 */ 698 */
671 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 699 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
672
673 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 700 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
674 if (error) { 701 if (error) {
675 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 702 xfs_fs_mount_cmn_err(flags, "SB validate failed");
676 goto fail; 703 goto release_buf;
677 } 704 }
678 705
679 /* 706 /*
@@ -684,7 +711,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
684 "device supports only %u byte sectors (not %u)", 711 "device supports only %u byte sectors (not %u)",
685 sector_size, mp->m_sb.sb_sectsize); 712 sector_size, mp->m_sb.sb_sectsize);
686 error = ENOSYS; 713 error = ENOSYS;
687 goto fail; 714 goto release_buf;
688 } 715 }
689 716
690 /* 717 /*
@@ -692,33 +719,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
692 * re-read the superblock so the buffer is correctly sized. 719 * re-read the superblock so the buffer is correctly sized.
693 */ 720 */
694 if (sector_size < mp->m_sb.sb_sectsize) { 721 if (sector_size < mp->m_sb.sb_sectsize) {
695 XFS_BUF_UNMANAGE(bp);
696 xfs_buf_relse(bp); 722 xfs_buf_relse(bp);
697 sector_size = mp->m_sb.sb_sectsize; 723 sector_size = mp->m_sb.sb_sectsize;
698 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, 724 goto reread;
699 BTOBB(sector_size), extra_flags);
700 if (!bp || XFS_BUF_ISERROR(bp)) {
701 xfs_fs_mount_cmn_err(flags, "SB re-read failed");
702 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
703 goto fail;
704 }
705 ASSERT(XFS_BUF_ISBUSY(bp));
706 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
707 } 725 }
708 726
709 /* Initialize per-cpu counters */ 727 /* Initialize per-cpu counters */
710 xfs_icsb_reinit_counters(mp); 728 xfs_icsb_reinit_counters(mp);
711 729
712 mp->m_sb_bp = bp; 730 mp->m_sb_bp = bp;
713 xfs_buf_relse(bp); 731 xfs_buf_unlock(bp);
714 ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
715 return 0; 732 return 0;
716 733
717 fail: 734release_buf:
718 if (bp) { 735 xfs_buf_relse(bp);
719 XFS_BUF_UNMANAGE(bp);
720 xfs_buf_relse(bp);
721 }
722 return error; 736 return error;
723} 737}
724 738
@@ -991,42 +1005,35 @@ xfs_check_sizes(xfs_mount_t *mp)
991{ 1005{
992 xfs_buf_t *bp; 1006 xfs_buf_t *bp;
993 xfs_daddr_t d; 1007 xfs_daddr_t d;
994 int error;
995 1008
996 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1009 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
997 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1010 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
998 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1011 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
999 return XFS_ERROR(EFBIG); 1012 return XFS_ERROR(EFBIG);
1000 } 1013 }
1001 error = xfs_read_buf(mp, mp->m_ddev_targp, 1014 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1002 d - XFS_FSS_TO_BB(mp, 1), 1015 d - XFS_FSS_TO_BB(mp, 1),
1003 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1016 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1004 if (!error) { 1017 if (!bp) {
1005 xfs_buf_relse(bp); 1018 cmn_err(CE_WARN, "XFS: last sector read failed");
1006 } else { 1019 return EIO;
1007 cmn_err(CE_WARN, "XFS: size check 2 failed");
1008 if (error == ENOSPC)
1009 error = XFS_ERROR(EFBIG);
1010 return error;
1011 } 1020 }
1021 xfs_buf_relse(bp);
1012 1022
1013 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1023 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1014 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1024 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1015 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1025 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1016 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1026 cmn_err(CE_WARN, "XFS: log size mismatch detected");
1017 return XFS_ERROR(EFBIG); 1027 return XFS_ERROR(EFBIG);
1018 } 1028 }
1019 error = xfs_read_buf(mp, mp->m_logdev_targp, 1029 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1020 d - XFS_FSB_TO_BB(mp, 1), 1030 d - XFS_FSB_TO_BB(mp, 1),
1021 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1031 XFS_FSB_TO_B(mp, 1), 0);
1022 if (!error) { 1032 if (!bp) {
1023 xfs_buf_relse(bp); 1033 cmn_err(CE_WARN, "XFS: log device read failed");
1024 } else { 1034 return EIO;
1025 cmn_err(CE_WARN, "XFS: size check 3 failed");
1026 if (error == ENOSPC)
1027 error = XFS_ERROR(EFBIG);
1028 return error;
1029 } 1035 }
1036 xfs_buf_relse(bp);
1030 } 1037 }
1031 return 0; 1038 return 0;
1032} 1039}
@@ -1601,7 +1608,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1601 XFS_BUF_UNASYNC(sbp); 1608 XFS_BUF_UNASYNC(sbp);
1602 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); 1609 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1603 xfsbdstrat(mp, sbp); 1610 xfsbdstrat(mp, sbp);
1604 error = xfs_iowait(sbp); 1611 error = xfs_buf_iowait(sbp);
1605 if (error) 1612 if (error)
1606 xfs_ioerror_alert("xfs_unmountfs_writesb", 1613 xfs_ioerror_alert("xfs_unmountfs_writesb",
1607 mp, sbp, XFS_BUF_ADDR(sbp)); 1614 mp, sbp, XFS_BUF_ADDR(sbp));
@@ -1832,135 +1839,72 @@ xfs_mod_incore_sb_unlocked(
1832 */ 1839 */
1833int 1840int
1834xfs_mod_incore_sb( 1841xfs_mod_incore_sb(
1835 xfs_mount_t *mp, 1842 struct xfs_mount *mp,
1836 xfs_sb_field_t field, 1843 xfs_sb_field_t field,
1837 int64_t delta, 1844 int64_t delta,
1838 int rsvd) 1845 int rsvd)
1839{ 1846{
1840 int status; 1847 int status;
1841 1848
1842 /* check for per-cpu counters */
1843 switch (field) {
1844#ifdef HAVE_PERCPU_SB 1849#ifdef HAVE_PERCPU_SB
1845 case XFS_SBS_ICOUNT: 1850 ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
1846 case XFS_SBS_IFREE:
1847 case XFS_SBS_FDBLOCKS:
1848 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1849 status = xfs_icsb_modify_counters(mp, field,
1850 delta, rsvd);
1851 break;
1852 }
1853 /* FALLTHROUGH */
1854#endif 1851#endif
1855 default: 1852 spin_lock(&mp->m_sb_lock);
1856 spin_lock(&mp->m_sb_lock); 1853 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1857 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1854 spin_unlock(&mp->m_sb_lock);
1858 spin_unlock(&mp->m_sb_lock);
1859 break;
1860 }
1861 1855
1862 return status; 1856 return status;
1863} 1857}
1864 1858
1865/* 1859/*
1866 * xfs_mod_incore_sb_batch() is used to change more than one field 1860 * Change more than one field in the in-core superblock structure at a time.
1867 * in the in-core superblock structure at a time. This modification
1868 * is protected by a lock internal to this module. The fields and
1869 * changes to those fields are specified in the array of xfs_mod_sb
1870 * structures passed in.
1871 * 1861 *
1872 * Either all of the specified deltas will be applied or none of 1862 * The fields and changes to those fields are specified in the array of
1873 * them will. If any modified field dips below 0, then all modifications 1863 * xfs_mod_sb structures passed in. Either all of the specified deltas
1874 * will be backed out and EINVAL will be returned. 1864 * will be applied or none of them will. If any modified field dips below 0,
1865 * then all modifications will be backed out and EINVAL will be returned.
1866 *
1867 * Note that this function may not be used for the superblock values that
1868 * are tracked with the in-memory per-cpu counters - a direct call to
1869 * xfs_icsb_modify_counters is required for these.
1875 */ 1870 */
1876int 1871int
1877xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) 1872xfs_mod_incore_sb_batch(
1873 struct xfs_mount *mp,
1874 xfs_mod_sb_t *msb,
1875 uint nmsb,
1876 int rsvd)
1878{ 1877{
1879 int status=0; 1878 xfs_mod_sb_t *msbp = &msb[0];
1880 xfs_mod_sb_t *msbp; 1879 int error = 0;
1881 1880
1882 /* 1881 /*
1883 * Loop through the array of mod structures and apply each 1882 * Loop through the array of mod structures and apply each individually.
1884 * individually. If any fail, then back out all those 1883 * If any fail, then back out all those which have already been applied.
1885 * which have already been applied. Do all of this within 1884 * Do all of this within the scope of the m_sb_lock so that all of the
1886 * the scope of the m_sb_lock so that all of the changes will 1885 * changes will be atomic.
1887 * be atomic.
1888 */ 1886 */
1889 spin_lock(&mp->m_sb_lock); 1887 spin_lock(&mp->m_sb_lock);
1890 msbp = &msb[0];
1891 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { 1888 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
1892 /* 1889 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1893 * Apply the delta at index n. If it fails, break 1890 msbp->msb_field > XFS_SBS_FDBLOCKS);
1894 * from the loop so we'll fall into the undo loop
1895 * below.
1896 */
1897 switch (msbp->msb_field) {
1898#ifdef HAVE_PERCPU_SB
1899 case XFS_SBS_ICOUNT:
1900 case XFS_SBS_IFREE:
1901 case XFS_SBS_FDBLOCKS:
1902 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1903 spin_unlock(&mp->m_sb_lock);
1904 status = xfs_icsb_modify_counters(mp,
1905 msbp->msb_field,
1906 msbp->msb_delta, rsvd);
1907 spin_lock(&mp->m_sb_lock);
1908 break;
1909 }
1910 /* FALLTHROUGH */
1911#endif
1912 default:
1913 status = xfs_mod_incore_sb_unlocked(mp,
1914 msbp->msb_field,
1915 msbp->msb_delta, rsvd);
1916 break;
1917 }
1918 1891
1919 if (status != 0) { 1892 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1920 break; 1893 msbp->msb_delta, rsvd);
1921 } 1894 if (error)
1895 goto unwind;
1922 } 1896 }
1897 spin_unlock(&mp->m_sb_lock);
1898 return 0;
1923 1899
1924 /* 1900unwind:
1925 * If we didn't complete the loop above, then back out 1901 while (--msbp >= msb) {
1926 * any changes made to the superblock. If you add code 1902 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1927 * between the loop above and here, make sure that you 1903 -msbp->msb_delta, rsvd);
1928 * preserve the value of status. Loop back until 1904 ASSERT(error == 0);
1929 * we step below the beginning of the array. Make sure
1930 * we don't touch anything back there.
1931 */
1932 if (status != 0) {
1933 msbp--;
1934 while (msbp >= msb) {
1935 switch (msbp->msb_field) {
1936#ifdef HAVE_PERCPU_SB
1937 case XFS_SBS_ICOUNT:
1938 case XFS_SBS_IFREE:
1939 case XFS_SBS_FDBLOCKS:
1940 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1941 spin_unlock(&mp->m_sb_lock);
1942 status = xfs_icsb_modify_counters(mp,
1943 msbp->msb_field,
1944 -(msbp->msb_delta),
1945 rsvd);
1946 spin_lock(&mp->m_sb_lock);
1947 break;
1948 }
1949 /* FALLTHROUGH */
1950#endif
1951 default:
1952 status = xfs_mod_incore_sb_unlocked(mp,
1953 msbp->msb_field,
1954 -(msbp->msb_delta),
1955 rsvd);
1956 break;
1957 }
1958 ASSERT(status == 0);
1959 msbp--;
1960 }
1961 } 1905 }
1962 spin_unlock(&mp->m_sb_lock); 1906 spin_unlock(&mp->m_sb_lock);
1963 return status; 1907 return error;
1964} 1908}
1965 1909
1966/* 1910/*
@@ -1998,18 +1942,13 @@ xfs_getsb(
1998 */ 1942 */
1999void 1943void
2000xfs_freesb( 1944xfs_freesb(
2001 xfs_mount_t *mp) 1945 struct xfs_mount *mp)
2002{ 1946{
2003 xfs_buf_t *bp; 1947 struct xfs_buf *bp = mp->m_sb_bp;
2004 1948
2005 /* 1949 xfs_buf_lock(bp);
2006 * Use xfs_getsb() so that the buffer will be locked
2007 * when we call xfs_buf_relse().
2008 */
2009 bp = xfs_getsb(mp, 0);
2010 XFS_BUF_UNMANAGE(bp);
2011 xfs_buf_relse(bp);
2012 mp->m_sb_bp = NULL; 1950 mp->m_sb_bp = NULL;
1951 xfs_buf_relse(bp);
2013} 1952}
2014 1953
2015/* 1954/*
@@ -2496,7 +2435,7 @@ xfs_icsb_balance_counter(
2496 spin_unlock(&mp->m_sb_lock); 2435 spin_unlock(&mp->m_sb_lock);
2497} 2436}
2498 2437
2499STATIC int 2438int
2500xfs_icsb_modify_counters( 2439xfs_icsb_modify_counters(
2501 xfs_mount_t *mp, 2440 xfs_mount_t *mp,
2502 xfs_sb_field_t field, 2441 xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 622da2179a57..5861b4980740 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations {
53 53
54#include "xfs_sync.h" 54#include "xfs_sync.h"
55 55
56struct cred;
57struct log; 56struct log;
58struct xfs_mount_args; 57struct xfs_mount_args;
59struct xfs_inode; 58struct xfs_inode;
@@ -91,6 +90,8 @@ extern void xfs_icsb_reinit_counters(struct xfs_mount *);
91extern void xfs_icsb_destroy_counters(struct xfs_mount *); 90extern void xfs_icsb_destroy_counters(struct xfs_mount *);
92extern void xfs_icsb_sync_counters(struct xfs_mount *, int); 91extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
93extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); 92extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
93extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
94 int64_t, int);
94 95
95#else 96#else
96#define xfs_icsb_init_counters(mp) (0) 97#define xfs_icsb_init_counters(mp) (0)
@@ -98,6 +99,8 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
98#define xfs_icsb_reinit_counters(mp) do { } while (0) 99#define xfs_icsb_reinit_counters(mp) do { } while (0)
99#define xfs_icsb_sync_counters(mp, flags) do { } while (0) 100#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
100#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 101#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
102#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
103 xfs_mod_incore_sb(mp, field, delta, rsvd)
101#endif 104#endif
102 105
103typedef struct xfs_mount { 106typedef struct xfs_mount {
@@ -232,8 +235,6 @@ typedef struct xfs_mount {
232#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ 235#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
233#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred 236#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
234 * I/O size in stat() */ 237 * I/O size in stat() */
235#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock
236 counters */
237#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams 238#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
238 allocator */ 239 allocator */
239#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ 240#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
@@ -327,6 +328,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
327 * perag get/put wrappers for ref counting 328 * perag get/put wrappers for ref counting
328 */ 329 */
329struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); 330struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
331struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
332 int tag);
330void xfs_perag_put(struct xfs_perag *pag); 333void xfs_perag_put(struct xfs_perag *pag);
331 334
332/* 335/*
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd6..9bb6eda4cd21 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) 346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
347#define xfs_trans_apply_dquot_deltas(tp) 347#define xfs_trans_apply_dquot_deltas(tp)
348#define xfs_trans_unreserve_and_mod_dquots(tp) 348#define xfs_trans_unreserve_and_mod_dquots(tp)
349#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) 349static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
350#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) 350 struct xfs_inode *ip, long nblks, long ninos, uint flags)
351{
352 return 0;
353}
354static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
355 struct xfs_mount *mp, struct xfs_dquot *udqp,
356 struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
357{
358 return 0;
359}
351#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 360#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
352#define xfs_qm_vop_rename_dqattach(it) (0) 361#define xfs_qm_vop_rename_dqattach(it) (0)
353#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 362#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
357#define xfs_qm_dqdetach(ip) 366#define xfs_qm_dqdetach(ip)
358#define xfs_qm_dqrele(d) 367#define xfs_qm_dqrele(d)
359#define xfs_qm_statvfs(ip, s) 368#define xfs_qm_statvfs(ip, s)
360#define xfs_qm_sync(mp, fl) (0) 369static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
370{
371 return 0;
372}
361#define xfs_qm_newmount(mp, a, b) (0) 373#define xfs_qm_newmount(mp, a, b) (0)
362#define xfs_qm_mount_quotas(mp) 374#define xfs_qm_mount_quotas(mp)
363#define xfs_qm_unmount(mp) 375#define xfs_qm_unmount(mp)
364#define xfs_qm_unmount_quotas(mp) (0) 376#define xfs_qm_unmount_quotas(mp)
365#endif /* CONFIG_XFS_QUOTA */ 377#endif /* CONFIG_XFS_QUOTA */
366 378
367#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 379#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
deleted file mode 100644
index 2dec79edb510..000000000000
--- a/fs/xfs/xfs_refcache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_REFCACHE_H__
19#define __XFS_REFCACHE_H__
20
21#ifdef HAVE_REFCACHE
22/*
23 * Maximum size (in inodes) for the NFS reference cache
24 */
25#define XFS_REFCACHE_SIZE_MAX 512
26
27struct xfs_inode;
28struct xfs_mount;
29
30extern void xfs_refcache_insert(struct xfs_inode *);
31extern void xfs_refcache_purge_ip(struct xfs_inode *);
32extern void xfs_refcache_purge_mp(struct xfs_mount *);
33extern void xfs_refcache_purge_some(struct xfs_mount *);
34extern void xfs_refcache_resize(int);
35extern void xfs_refcache_destroy(void);
36
37extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
38
39#else
40
41#define xfs_refcache_insert(ip) do { } while (0)
42#define xfs_refcache_purge_ip(ip) do { } while (0)
43#define xfs_refcache_purge_mp(mp) do { } while (0)
44#define xfs_refcache_purge_some(mp) do { } while (0)
45#define xfs_refcache_resize(size) do { } while (0)
46#define xfs_refcache_destroy() do { } while (0)
47
48#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags)
49
50#endif
51
52#endif /* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 8fca957200df..d2af0a8381a6 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -183,7 +183,7 @@ xfs_rename(
183 * tree quota mechanism would be circumvented. 183 * tree quota mechanism would be circumvented.
184 */ 184 */
185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
186 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { 186 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
187 error = XFS_ERROR(EXDEV); 187 error = XFS_ERROR(EXDEV);
188 goto error_return; 188 goto error_return;
189 } 189 }
@@ -211,7 +211,9 @@ xfs_rename(
211 goto error_return; 211 goto error_return;
212 if (error) 212 if (error)
213 goto abort_return; 213 goto abort_return;
214 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 214
215 xfs_trans_ichgtime(tp, target_dp,
216 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
215 217
216 if (new_parent && src_is_directory) { 218 if (new_parent && src_is_directory) {
217 error = xfs_bumplink(tp, target_dp); 219 error = xfs_bumplink(tp, target_dp);
@@ -249,7 +251,9 @@ xfs_rename(
249 &first_block, &free_list, spaceres); 251 &first_block, &free_list, spaceres);
250 if (error) 252 if (error)
251 goto abort_return; 253 goto abort_return;
252 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 254
255 xfs_trans_ichgtime(tp, target_dp,
256 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
253 257
254 /* 258 /*
255 * Decrement the link count on the target since the target 259 * Decrement the link count on the target since the target
@@ -292,7 +296,7 @@ xfs_rename(
292 * inode isn't really being changed, but old unix file systems did 296 * inode isn't really being changed, but old unix file systems did
293 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
294 */ 298 */
295 xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
296 300
297 /* 301 /*
298 * Adjust the link count on src_dp. This is necessary when 302 * Adjust the link count on src_dp. This is necessary when
@@ -315,7 +319,7 @@ xfs_rename(
315 if (error) 319 if (error)
316 goto abort_return; 320 goto abort_return;
317 321
318 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 322 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
319 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 323 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
320 if (new_parent) 324 if (new_parent)
321 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 325 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 891260fea11e..12a191385310 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -39,6 +39,7 @@
39#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_buf.h"
42 43
43 44
44/* 45/*
@@ -1883,13 +1884,13 @@ xfs_growfs_rt(
1883 /* 1884 /*
1884 * Read in the last block of the device, make sure it exists. 1885 * Read in the last block of the device, make sure it exists.
1885 */ 1886 */
1886 error = xfs_read_buf(mp, mp->m_rtdev_targp, 1887 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
1887 XFS_FSB_TO_BB(mp, nrblocks - 1), 1888 XFS_FSB_TO_BB(mp, nrblocks - 1),
1888 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1889 XFS_FSB_TO_B(mp, 1), 0);
1889 if (error) 1890 if (!bp)
1890 return error; 1891 return EIO;
1891 ASSERT(bp);
1892 xfs_buf_relse(bp); 1892 xfs_buf_relse(bp);
1893
1893 /* 1894 /*
1894 * Calculate new parameters. These are the final values to be reached. 1895 * Calculate new parameters. These are the final values to be reached.
1895 */ 1896 */
@@ -2215,7 +2216,6 @@ xfs_rtmount_init(
2215{ 2216{
2216 xfs_buf_t *bp; /* buffer for last block of subvolume */ 2217 xfs_buf_t *bp; /* buffer for last block of subvolume */
2217 xfs_daddr_t d; /* address of last block of subvolume */ 2218 xfs_daddr_t d; /* address of last block of subvolume */
2218 int error; /* error return value */
2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */ 2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */
2220 2220
2221 sbp = &mp->m_sb; 2221 sbp = &mp->m_sb;
@@ -2242,15 +2242,12 @@ xfs_rtmount_init(
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2242 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2243 return XFS_ERROR(EFBIG);
2244 } 2244 }
2245 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2245 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
2246 d - XFS_FSB_TO_BB(mp, 1), 2246 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_BB(mp, 1), 0, &bp); 2247 XFS_FSB_TO_B(mp, 1), 0);
2248 if (error) { 2248 if (!bp) {
2249 cmn_err(CE_WARN, 2249 cmn_err(CE_WARN, "XFS: realtime device size check failed");
2250 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2250 return EIO;
2251 if (error == ENOSPC)
2252 return XFS_ERROR(EFBIG);
2253 return error;
2254 } 2251 }
2255 xfs_buf_relse(bp); 2252 xfs_buf_relse(bp);
2256 return 0; 2253 return 0;
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1b017c657494..1eb2ba586814 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -80,10 +80,12 @@ struct xfs_mount;
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
83 84
84#define XFS_SB_VERSION2_OKREALFBITS \ 85#define XFS_SB_VERSION2_OKREALFBITS \
85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
86 XFS_SB_VERSION2_ATTR2BIT) 87 XFS_SB_VERSION2_ATTR2BIT | \
88 XFS_SB_VERSION2_PROJID32BIT)
87#define XFS_SB_VERSION2_OKSASHFBITS \ 89#define XFS_SB_VERSION2_OKSASHFBITS \
88 (0) 90 (0)
89#define XFS_SB_VERSION2_OKREALBITS \ 91#define XFS_SB_VERSION2_OKREALBITS \
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
495 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; 497 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
496} 498}
497 499
500static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
501{
502 return xfs_sb_version_hasmorebits(sbp) &&
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504}
505
498/* 506/*
499 * end of superblock version macros 507 * end of superblock version macros
500 */ 508 */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1c47edaea0d2..f6d956b7711e 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -696,7 +696,7 @@ xfs_trans_reserve(
696 * fail if the count would go below zero. 696 * fail if the count would go below zero.
697 */ 697 */
698 if (blocks > 0) { 698 if (blocks > 0) {
699 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 699 error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
700 -((int64_t)blocks), rsvd); 700 -((int64_t)blocks), rsvd);
701 if (error != 0) { 701 if (error != 0) {
702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -767,7 +767,7 @@ undo_log:
767 767
768undo_blocks: 768undo_blocks:
769 if (blocks > 0) { 769 if (blocks > 0) {
770 (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 770 xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
771 (int64_t)blocks, rsvd); 771 (int64_t)blocks, rsvd);
772 tp->t_blk_res = 0; 772 tp->t_blk_res = 0;
773 } 773 }
@@ -1009,7 +1009,7 @@ void
1009xfs_trans_unreserve_and_mod_sb( 1009xfs_trans_unreserve_and_mod_sb(
1010 xfs_trans_t *tp) 1010 xfs_trans_t *tp)
1011{ 1011{
1012 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ 1012 xfs_mod_sb_t msb[9]; /* If you add cases, add entries */
1013 xfs_mod_sb_t *msbp; 1013 xfs_mod_sb_t *msbp;
1014 xfs_mount_t *mp = tp->t_mountp; 1014 xfs_mount_t *mp = tp->t_mountp;
1015 /* REFERENCED */ 1015 /* REFERENCED */
@@ -1017,55 +1017,61 @@ xfs_trans_unreserve_and_mod_sb(
1017 int rsvd; 1017 int rsvd;
1018 int64_t blkdelta = 0; 1018 int64_t blkdelta = 0;
1019 int64_t rtxdelta = 0; 1019 int64_t rtxdelta = 0;
1020 int64_t idelta = 0;
1021 int64_t ifreedelta = 0;
1020 1022
1021 msbp = msb; 1023 msbp = msb;
1022 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 1024 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
1023 1025
1024 /* calculate free blocks delta */ 1026 /* calculate deltas */
1025 if (tp->t_blk_res > 0) 1027 if (tp->t_blk_res > 0)
1026 blkdelta = tp->t_blk_res; 1028 blkdelta = tp->t_blk_res;
1027
1028 if ((tp->t_fdblocks_delta != 0) && 1029 if ((tp->t_fdblocks_delta != 0) &&
1029 (xfs_sb_version_haslazysbcount(&mp->m_sb) || 1030 (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1030 (tp->t_flags & XFS_TRANS_SB_DIRTY))) 1031 (tp->t_flags & XFS_TRANS_SB_DIRTY)))
1031 blkdelta += tp->t_fdblocks_delta; 1032 blkdelta += tp->t_fdblocks_delta;
1032 1033
1033 if (blkdelta != 0) {
1034 msbp->msb_field = XFS_SBS_FDBLOCKS;
1035 msbp->msb_delta = blkdelta;
1036 msbp++;
1037 }
1038
1039 /* calculate free realtime extents delta */
1040 if (tp->t_rtx_res > 0) 1034 if (tp->t_rtx_res > 0)
1041 rtxdelta = tp->t_rtx_res; 1035 rtxdelta = tp->t_rtx_res;
1042
1043 if ((tp->t_frextents_delta != 0) && 1036 if ((tp->t_frextents_delta != 0) &&
1044 (tp->t_flags & XFS_TRANS_SB_DIRTY)) 1037 (tp->t_flags & XFS_TRANS_SB_DIRTY))
1045 rtxdelta += tp->t_frextents_delta; 1038 rtxdelta += tp->t_frextents_delta;
1046 1039
1040 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1041 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1042 idelta = tp->t_icount_delta;
1043 ifreedelta = tp->t_ifree_delta;
1044 }
1045
1046 /* apply the per-cpu counters */
1047 if (blkdelta) {
1048 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
1049 blkdelta, rsvd);
1050 if (error)
1051 goto out;
1052 }
1053
1054 if (idelta) {
1055 error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
1056 idelta, rsvd);
1057 if (error)
1058 goto out_undo_fdblocks;
1059 }
1060
1061 if (ifreedelta) {
1062 error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
1063 ifreedelta, rsvd);
1064 if (error)
1065 goto out_undo_icount;
1066 }
1067
1068 /* apply remaining deltas */
1047 if (rtxdelta != 0) { 1069 if (rtxdelta != 0) {
1048 msbp->msb_field = XFS_SBS_FREXTENTS; 1070 msbp->msb_field = XFS_SBS_FREXTENTS;
1049 msbp->msb_delta = rtxdelta; 1071 msbp->msb_delta = rtxdelta;
1050 msbp++; 1072 msbp++;
1051 } 1073 }
1052 1074
1053 /* apply remaining deltas */
1054
1055 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1056 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1057 if (tp->t_icount_delta != 0) {
1058 msbp->msb_field = XFS_SBS_ICOUNT;
1059 msbp->msb_delta = tp->t_icount_delta;
1060 msbp++;
1061 }
1062 if (tp->t_ifree_delta != 0) {
1063 msbp->msb_field = XFS_SBS_IFREE;
1064 msbp->msb_delta = tp->t_ifree_delta;
1065 msbp++;
1066 }
1067 }
1068
1069 if (tp->t_flags & XFS_TRANS_SB_DIRTY) { 1075 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
1070 if (tp->t_dblocks_delta != 0) { 1076 if (tp->t_dblocks_delta != 0) {
1071 msbp->msb_field = XFS_SBS_DBLOCKS; 1077 msbp->msb_field = XFS_SBS_DBLOCKS;
@@ -1115,8 +1121,24 @@ xfs_trans_unreserve_and_mod_sb(
1115 if (msbp > msb) { 1121 if (msbp > msb) {
1116 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, 1122 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
1117 (uint)(msbp - msb), rsvd); 1123 (uint)(msbp - msb), rsvd);
1118 ASSERT(error == 0); 1124 if (error)
1125 goto out_undo_ifreecount;
1119 } 1126 }
1127
1128 return;
1129
1130out_undo_ifreecount:
1131 if (ifreedelta)
1132 xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
1133out_undo_icount:
1134 if (idelta)
1135 xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
1136out_undo_fdblocks:
1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out:
1140 ASSERT(error == 0);
1141 return;
1120} 1142}
1121 1143
1122/* 1144/*
@@ -1389,15 +1411,12 @@ xfs_trans_item_committed(
1389 */ 1411 */
1390STATIC void 1412STATIC void
1391xfs_trans_committed( 1413xfs_trans_committed(
1392 struct xfs_trans *tp, 1414 void *arg,
1393 int abortflag) 1415 int abortflag)
1394{ 1416{
1417 struct xfs_trans *tp = arg;
1395 struct xfs_log_item_desc *lidp, *next; 1418 struct xfs_log_item_desc *lidp, *next;
1396 1419
1397 /* Call the transaction's completion callback if there is one. */
1398 if (tp->t_callback != NULL)
1399 tp->t_callback(tp, tp->t_callarg);
1400
1401 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { 1420 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1402 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); 1421 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
1403 xfs_trans_free_item_desc(lidp); 1422 xfs_trans_free_item_desc(lidp);
@@ -1525,7 +1544,7 @@ xfs_trans_commit_iclog(
1525 * running in simulation mode (the log is explicitly turned 1544 * running in simulation mode (the log is explicitly turned
1526 * off). 1545 * off).
1527 */ 1546 */
1528 tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; 1547 tp->t_logcb.cb_func = xfs_trans_committed;
1529 tp->t_logcb.cb_arg = tp; 1548 tp->t_logcb.cb_arg = tp;
1530 1549
1531 /* 1550 /*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c13c0f97b494..246286b77a86 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -399,8 +399,6 @@ typedef struct xfs_trans {
399 * transaction. */ 399 * transaction. */
400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */ 400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ 401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
402 xfs_trans_callback_t t_callback; /* transaction callback */
403 void *t_callarg; /* callback arg */
404 unsigned int t_flags; /* misc flags */ 402 unsigned int t_flags; /* misc flags */
405 int64_t t_icount_delta; /* superblock icount change */ 403 int64_t t_icount_delta; /* superblock icount change */
406 int64_t t_ifree_delta; /* superblock ifree change */ 404 int64_t t_ifree_delta; /* superblock ifree change */
@@ -473,6 +471,7 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
473void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
474int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, 472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
475 xfs_ino_t , uint, uint, struct xfs_inode **); 473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
476void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
477void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
478void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 477void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 90af025e6839..c47918c302a5 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -336,7 +336,7 @@ xfs_trans_read_buf(
336 ASSERT(!XFS_BUF_ISASYNC(bp)); 336 ASSERT(!XFS_BUF_ISASYNC(bp));
337 XFS_BUF_READ(bp); 337 XFS_BUF_READ(bp);
338 xfsbdstrat(tp->t_mountp, bp); 338 xfsbdstrat(tp->t_mountp, bp);
339 error = xfs_iowait(bp); 339 error = xfs_buf_iowait(bp);
340 if (error) { 340 if (error) {
341 xfs_ioerror_alert("xfs_trans_read_buf", mp, 341 xfs_ioerror_alert("xfs_trans_read_buf", mp,
342 bp, blkno); 342 bp, blkno);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdc53a1050c5..ccb34532768b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -118,6 +118,36 @@ xfs_trans_ijoin_ref(
118} 118}
119 119
120/* 120/*
121 * Transactional inode timestamp update. Requires the inode to be locked and
122 * joined to the transaction supplied. Relies on the transaction subsystem to
123 * track dirty state and update/writeback the inode accordingly.
124 */
125void
126xfs_trans_ichgtime(
127 struct xfs_trans *tp,
128 struct xfs_inode *ip,
129 int flags)
130{
131 struct inode *inode = VFS_I(ip);
132 timespec_t tv;
133
134 ASSERT(tp);
135 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
136 ASSERT(ip->i_transp == tp);
137
138 tv = current_fs_time(inode->i_sb);
139
140 if ((flags & XFS_ICHGTIME_MOD) &&
141 !timespec_equal(&inode->i_mtime, &tv)) {
142 inode->i_mtime = tv;
143 }
144 if ((flags & XFS_ICHGTIME_CHG) &&
145 !timespec_equal(&inode->i_ctime, &tv)) {
146 inode->i_ctime = tv;
147 }
148}
149
150/*
121 * This is called to mark the fields indicated in fieldmask as needing 151 * This is called to mark the fields indicated in fieldmask as needing
122 * to be logged when the transaction is committed. The inode must 152 * to be logged when the transaction is committed. The inode must
123 * already be associated with the given transaction. 153 * already be associated with the given transaction.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 320775295e32..26d1867d8156 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ 73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ 74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */ 76typedef __uint32_t xlog_tid_t; /* transaction ID type */
79 77
80/* 78/*
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index b7d5769d2df0..8b32d1a4c5a1 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -56,7 +56,6 @@ xfs_dir_ialloc(
56 mode_t mode, 56 mode_t mode,
57 xfs_nlink_t nlink, 57 xfs_nlink_t nlink,
58 xfs_dev_t rdev, 58 xfs_dev_t rdev,
59 cred_t *credp,
60 prid_t prid, /* project id */ 59 prid_t prid, /* project id */
61 int okalloc, /* ok to allocate new space */ 60 int okalloc, /* ok to allocate new space */
62 xfs_inode_t **ipp, /* pointer to inode; it will be 61 xfs_inode_t **ipp, /* pointer to inode; it will be
@@ -93,7 +92,7 @@ xfs_dir_ialloc(
93 * transaction commit so that no other process can steal 92 * transaction commit so that no other process can steal
94 * the inode(s) that we've just allocated. 93 * the inode(s) that we've just allocated.
95 */ 94 */
96 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc, 95 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
97 &ialloc_context, &call_again, &ip); 96 &ialloc_context, &call_again, &ip);
98 97
99 /* 98 /*
@@ -197,7 +196,7 @@ xfs_dir_ialloc(
197 * other allocations in this allocation group, 196 * other allocations in this allocation group,
198 * this call should always succeed. 197 * this call should always succeed.
199 */ 198 */
200 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, 199 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
201 okalloc, &ialloc_context, &call_again, &ip); 200 okalloc, &ialloc_context, &call_again, &ip);
202 201
203 /* 202 /*
@@ -235,7 +234,7 @@ xfs_droplink(
235{ 234{
236 int error; 235 int error;
237 236
238 xfs_ichgtime(ip, XFS_ICHGTIME_CHG); 237 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
239 238
240 ASSERT (ip->i_d.di_nlink > 0); 239 ASSERT (ip->i_d.di_nlink > 0);
241 ip->i_d.di_nlink--; 240 ip->i_d.di_nlink--;
@@ -299,7 +298,7 @@ xfs_bumplink(
299{ 298{
300 if (ip->i_d.di_nlink >= XFS_MAXLINK) 299 if (ip->i_d.di_nlink >= XFS_MAXLINK)
301 return XFS_ERROR(EMLINK); 300 return XFS_ERROR(EMLINK);
302 xfs_ichgtime(ip, XFS_ICHGTIME_CHG); 301 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
303 302
304 ASSERT(ip->i_d.di_nlink > 0); 303 ASSERT(ip->i_d.di_nlink > 0);
305 ip->i_d.di_nlink++; 304 ip->i_d.di_nlink++;
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f55b9678264f..456fca314933 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -19,8 +19,7 @@
19#define __XFS_UTILS_H__ 19#define __XFS_UTILS_H__
20 20
21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, 21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
22 xfs_dev_t, cred_t *, prid_t, int, 22 xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
23 xfs_inode_t **, int *);
24extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); 23extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
25extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); 24extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
26extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); 25extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4c7c7bfb2b2f..8e4a63c4151a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -114,7 +114,7 @@ xfs_setattr(
114 */ 114 */
115 ASSERT(udqp == NULL); 115 ASSERT(udqp == NULL);
116 ASSERT(gdqp == NULL); 116 ASSERT(gdqp == NULL);
117 code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, 117 code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
118 qflags, &udqp, &gdqp); 118 qflags, &udqp, &gdqp);
119 if (code) 119 if (code)
120 return code; 120 return code;
@@ -184,8 +184,11 @@ xfs_setattr(
184 ip->i_size == 0 && ip->i_d.di_nextents == 0) { 184 ip->i_size == 0 && ip->i_d.di_nextents == 0) {
185 xfs_iunlock(ip, XFS_ILOCK_EXCL); 185 xfs_iunlock(ip, XFS_ILOCK_EXCL);
186 lock_flags &= ~XFS_ILOCK_EXCL; 186 lock_flags &= ~XFS_ILOCK_EXCL;
187 if (mask & ATTR_CTIME) 187 if (mask & ATTR_CTIME) {
188 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 188 inode->i_mtime = inode->i_ctime =
189 current_fs_time(inode->i_sb);
190 xfs_mark_inode_dirty_sync(ip);
191 }
189 code = 0; 192 code = 0;
190 goto error_return; 193 goto error_return;
191 } 194 }
@@ -1253,8 +1256,7 @@ xfs_create(
1253 struct xfs_name *name, 1256 struct xfs_name *name,
1254 mode_t mode, 1257 mode_t mode,
1255 xfs_dev_t rdev, 1258 xfs_dev_t rdev,
1256 xfs_inode_t **ipp, 1259 xfs_inode_t **ipp)
1257 cred_t *credp)
1258{ 1260{
1259 int is_dir = S_ISDIR(mode); 1261 int is_dir = S_ISDIR(mode);
1260 struct xfs_mount *mp = dp->i_mount; 1262 struct xfs_mount *mp = dp->i_mount;
@@ -1266,7 +1268,7 @@ xfs_create(
1266 boolean_t unlock_dp_on_error = B_FALSE; 1268 boolean_t unlock_dp_on_error = B_FALSE;
1267 uint cancel_flags; 1269 uint cancel_flags;
1268 int committed; 1270 int committed;
1269 xfs_prid_t prid; 1271 prid_t prid;
1270 struct xfs_dquot *udqp = NULL; 1272 struct xfs_dquot *udqp = NULL;
1271 struct xfs_dquot *gdqp = NULL; 1273 struct xfs_dquot *gdqp = NULL;
1272 uint resblks; 1274 uint resblks;
@@ -1279,9 +1281,9 @@ xfs_create(
1279 return XFS_ERROR(EIO); 1281 return XFS_ERROR(EIO);
1280 1282
1281 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1283 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1282 prid = dp->i_d.di_projid; 1284 prid = xfs_get_projid(dp);
1283 else 1285 else
1284 prid = dfltprid; 1286 prid = XFS_PROJID_DEFAULT;
1285 1287
1286 /* 1288 /*
1287 * Make sure that we have allocated dquot(s) on disk. 1289 * Make sure that we have allocated dquot(s) on disk.
@@ -1360,7 +1362,7 @@ xfs_create(
1360 * entry pointing to them, but a directory also the "." entry 1362 * entry pointing to them, but a directory also the "." entry
1361 * pointing to itself. 1363 * pointing to itself.
1362 */ 1364 */
1363 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp, 1365 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
1364 prid, resblks > 0, &ip, &committed); 1366 prid, resblks > 0, &ip, &committed);
1365 if (error) { 1367 if (error) {
1366 if (error == ENOSPC) 1368 if (error == ENOSPC)
@@ -1391,7 +1393,7 @@ xfs_create(
1391 ASSERT(error != ENOSPC); 1393 ASSERT(error != ENOSPC);
1392 goto out_trans_abort; 1394 goto out_trans_abort;
1393 } 1395 }
1394 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1396 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1395 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1397 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1396 1398
1397 if (is_dir) { 1399 if (is_dir) {
@@ -1742,7 +1744,7 @@ xfs_remove(
1742 ASSERT(error != ENOENT); 1744 ASSERT(error != ENOENT);
1743 goto out_bmap_cancel; 1745 goto out_bmap_cancel;
1744 } 1746 }
1745 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1747 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1746 1748
1747 if (is_dir) { 1749 if (is_dir) {
1748 /* 1750 /*
@@ -1880,7 +1882,7 @@ xfs_link(
1880 * the tree quota mechanism could be circumvented. 1882 * the tree quota mechanism could be circumvented.
1881 */ 1883 */
1882 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1884 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1883 (tdp->i_d.di_projid != sip->i_d.di_projid))) { 1885 (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1884 error = XFS_ERROR(EXDEV); 1886 error = XFS_ERROR(EXDEV);
1885 goto error_return; 1887 goto error_return;
1886 } 1888 }
@@ -1895,7 +1897,7 @@ xfs_link(
1895 &first_block, &free_list, resblks); 1897 &first_block, &free_list, resblks);
1896 if (error) 1898 if (error)
1897 goto abort_return; 1899 goto abort_return;
1898 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1900 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1899 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); 1901 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1900 1902
1901 error = xfs_bumplink(tp, sip); 1903 error = xfs_bumplink(tp, sip);
@@ -1933,8 +1935,7 @@ xfs_symlink(
1933 struct xfs_name *link_name, 1935 struct xfs_name *link_name,
1934 const char *target_path, 1936 const char *target_path,
1935 mode_t mode, 1937 mode_t mode,
1936 xfs_inode_t **ipp, 1938 xfs_inode_t **ipp)
1937 cred_t *credp)
1938{ 1939{
1939 xfs_mount_t *mp = dp->i_mount; 1940 xfs_mount_t *mp = dp->i_mount;
1940 xfs_trans_t *tp; 1941 xfs_trans_t *tp;
@@ -1955,7 +1956,7 @@ xfs_symlink(
1955 int byte_cnt; 1956 int byte_cnt;
1956 int n; 1957 int n;
1957 xfs_buf_t *bp; 1958 xfs_buf_t *bp;
1958 xfs_prid_t prid; 1959 prid_t prid;
1959 struct xfs_dquot *udqp, *gdqp; 1960 struct xfs_dquot *udqp, *gdqp;
1960 uint resblks; 1961 uint resblks;
1961 1962
@@ -1978,9 +1979,9 @@ xfs_symlink(
1978 1979
1979 udqp = gdqp = NULL; 1980 udqp = gdqp = NULL;
1980 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1981 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1981 prid = dp->i_d.di_projid; 1982 prid = xfs_get_projid(dp);
1982 else 1983 else
1983 prid = (xfs_prid_t)dfltprid; 1984 prid = XFS_PROJID_DEFAULT;
1984 1985
1985 /* 1986 /*
1986 * Make sure that we have allocated dquot(s) on disk. 1987 * Make sure that we have allocated dquot(s) on disk.
@@ -2046,8 +2047,8 @@ xfs_symlink(
2046 /* 2047 /*
2047 * Allocate an inode for the symlink. 2048 * Allocate an inode for the symlink.
2048 */ 2049 */
2049 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 2050 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
2050 1, 0, credp, prid, resblks > 0, &ip, NULL); 2051 prid, resblks > 0, &ip, NULL);
2051 if (error) { 2052 if (error) {
2052 if (error == ENOSPC) 2053 if (error == ENOSPC)
2053 goto error_return; 2054 goto error_return;
@@ -2129,7 +2130,7 @@ xfs_symlink(
2129 &first_block, &free_list, resblks); 2130 &first_block, &free_list, resblks);
2130 if (error) 2131 if (error)
2131 goto error1; 2132 goto error1;
2132 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2133 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2133 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2134 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2134 2135
2135 /* 2136 /*
@@ -2272,7 +2273,7 @@ xfs_alloc_file_space(
2272 count = len; 2273 count = len;
2273 imapp = &imaps[0]; 2274 imapp = &imaps[0];
2274 nimaps = 1; 2275 nimaps = 1;
2275 bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); 2276 bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
2276 startoffset_fsb = XFS_B_TO_FSBT(mp, offset); 2277 startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
2277 allocatesize_fsb = XFS_B_TO_FSB(mp, count); 2278 allocatesize_fsb = XFS_B_TO_FSB(mp, count);
2278 2279
@@ -2431,9 +2432,9 @@ xfs_zero_remaining_bytes(
2431 if (endoff > ip->i_size) 2432 if (endoff > ip->i_size)
2432 endoff = ip->i_size; 2433 endoff = ip->i_size;
2433 2434
2434 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 2435 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
2435 XFS_IS_REALTIME_INODE(ip) ? 2436 mp->m_rtdev_targp : mp->m_ddev_targp,
2436 mp->m_rtdev_targp : mp->m_ddev_targp); 2437 mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
2437 if (!bp) 2438 if (!bp)
2438 return XFS_ERROR(ENOMEM); 2439 return XFS_ERROR(ENOMEM);
2439 2440
@@ -2459,7 +2460,7 @@ xfs_zero_remaining_bytes(
2459 XFS_BUF_READ(bp); 2460 XFS_BUF_READ(bp);
2460 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); 2461 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
2461 xfsbdstrat(mp, bp); 2462 xfsbdstrat(mp, bp);
2462 error = xfs_iowait(bp); 2463 error = xfs_buf_iowait(bp);
2463 if (error) { 2464 if (error) {
2464 xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", 2465 xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
2465 mp, bp, XFS_BUF_ADDR(bp)); 2466 mp, bp, XFS_BUF_ADDR(bp));
@@ -2472,7 +2473,7 @@ xfs_zero_remaining_bytes(
2472 XFS_BUF_UNREAD(bp); 2473 XFS_BUF_UNREAD(bp);
2473 XFS_BUF_WRITE(bp); 2474 XFS_BUF_WRITE(bp);
2474 xfsbdstrat(mp, bp); 2475 xfsbdstrat(mp, bp);
2475 error = xfs_iowait(bp); 2476 error = xfs_buf_iowait(bp);
2476 if (error) { 2477 if (error) {
2477 xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", 2478 xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
2478 mp, bp, XFS_BUF_ADDR(bp)); 2479 mp, bp, XFS_BUF_ADDR(bp));
@@ -2711,6 +2712,7 @@ xfs_change_file_space(
2711 xfs_off_t llen; 2712 xfs_off_t llen;
2712 xfs_trans_t *tp; 2713 xfs_trans_t *tp;
2713 struct iattr iattr; 2714 struct iattr iattr;
2715 int prealloc_type;
2714 2716
2715 if (!S_ISREG(ip->i_d.di_mode)) 2717 if (!S_ISREG(ip->i_d.di_mode))
2716 return XFS_ERROR(EINVAL); 2718 return XFS_ERROR(EINVAL);
@@ -2753,12 +2755,17 @@ xfs_change_file_space(
2753 * size to be changed. 2755 * size to be changed.
2754 */ 2756 */
2755 setprealloc = clrprealloc = 0; 2757 setprealloc = clrprealloc = 0;
2758 prealloc_type = XFS_BMAPI_PREALLOC;
2756 2759
2757 switch (cmd) { 2760 switch (cmd) {
2761 case XFS_IOC_ZERO_RANGE:
2762 prealloc_type |= XFS_BMAPI_CONVERT;
2763 xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
2764 /* FALLTHRU */
2758 case XFS_IOC_RESVSP: 2765 case XFS_IOC_RESVSP:
2759 case XFS_IOC_RESVSP64: 2766 case XFS_IOC_RESVSP64:
2760 error = xfs_alloc_file_space(ip, startoffset, bf->l_len, 2767 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
2761 1, attr_flags); 2768 prealloc_type, attr_flags);
2762 if (error) 2769 if (error)
2763 return error; 2770 return error;
2764 setprealloc = 1; 2771 setprealloc = 1;
@@ -2827,7 +2834,7 @@ xfs_change_file_space(
2827 if (ip->i_d.di_mode & S_IXGRP) 2834 if (ip->i_d.di_mode & S_IXGRP)
2828 ip->i_d.di_mode &= ~S_ISGID; 2835 ip->i_d.di_mode &= ~S_ISGID;
2829 2836
2830 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2837 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2831 } 2838 }
2832 if (setprealloc) 2839 if (setprealloc)
2833 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; 2840 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d8dfa8d0dadd..f6702927eee4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,7 +2,6 @@
2#define _XFS_VNODEOPS_H 1 2#define _XFS_VNODEOPS_H 1
3 3
4struct attrlist_cursor_kern; 4struct attrlist_cursor_kern;
5struct cred;
6struct file; 5struct file;
7struct iattr; 6struct iattr;
8struct inode; 7struct inode;
@@ -26,7 +25,7 @@ int xfs_inactive(struct xfs_inode *ip);
26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 25int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
27 struct xfs_inode **ipp, struct xfs_name *ci_name); 26 struct xfs_inode **ipp, struct xfs_name *ci_name);
28int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, 27int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
29 xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp); 28 xfs_dev_t rdev, struct xfs_inode **ipp);
30int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, 29int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
31 struct xfs_inode *ip); 30 struct xfs_inode *ip);
32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, 31int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -34,8 +33,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
34int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, 33int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
35 xfs_off_t *offset, filldir_t filldir); 34 xfs_off_t *offset, filldir_t filldir);
36int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 35int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
37 const char *target_path, mode_t mode, struct xfs_inode **ipp, 36 const char *target_path, mode_t mode, struct xfs_inode **ipp);
38 cred_t *credp);
39int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 37int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
40int xfs_change_file_space(struct xfs_inode *ip, int cmd, 38int xfs_change_file_space(struct xfs_inode *ip, int cmd,
41 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); 39 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);