Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 16
-rw-r--r--  fs/9p/Makefile | 2
-rw-r--r--  fs/9p/acl.c | 395
-rw-r--r--  fs/9p/acl.h | 49
-rw-r--r--  fs/9p/fid.c | 1
-rw-r--r--  fs/9p/v9fs.c | 22
-rw-r--r--  fs/9p/v9fs.h | 50
-rw-r--r--  fs/9p/v9fs_vfs.h | 5
-rw-r--r--  fs/9p/vfs_addr.c | 30
-rw-r--r--  fs/9p/vfs_dentry.c | 6
-rw-r--r--  fs/9p/vfs_dir.c | 4
-rw-r--r--  fs/9p/vfs_file.c | 265
-rw-r--r--  fs/9p/vfs_inode.c | 779
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 824
-rw-r--r--  fs/9p/vfs_super.c | 44
-rw-r--r--  fs/9p/xattr.c | 54
-rw-r--r--  fs/9p/xattr.h | 6
-rw-r--r--  fs/Kconfig | 27
-rw-r--r--  fs/Kconfig.binfmt | 4
-rw-r--r--  fs/Makefile | 7
-rw-r--r--  fs/adfs/Kconfig | 1
-rw-r--r--  fs/adfs/dir.c | 12
-rw-r--r--  fs/adfs/super.c | 30
-rw-r--r--  fs/affs/affs.h | 1
-rw-r--r--  fs/affs/amigaffs.c | 4
-rw-r--r--  fs/affs/file.c | 4
-rw-r--r--  fs/affs/inode.c | 2
-rw-r--r--  fs/affs/namei.c | 69
-rw-r--r--  fs/affs/super.c | 42
-rw-r--r--  fs/afs/cmservice.c | 12
-rw-r--r--  fs/afs/dir.c | 15
-rw-r--r--  fs/afs/flock.c | 5
-rw-r--r--  fs/afs/inode.c | 3
-rw-r--r--  fs/afs/internal.h | 5
-rw-r--r--  fs/afs/main.c | 13
-rw-r--r--  fs/afs/mntpt.c | 64
-rw-r--r--  fs/afs/rxrpc.c | 2
-rw-r--r--  fs/afs/security.c | 7
-rw-r--r--  fs/afs/server.c | 13
-rw-r--r--  fs/afs/super.c | 35
-rw-r--r--  fs/afs/vlocation.c | 14
-rw-r--r--  fs/afs/write.c | 19
-rw-r--r--  fs/aio.c | 45
-rw-r--r--  fs/anon_inodes.c | 39
-rw-r--r--  fs/autofs/Kconfig | 21
-rw-r--r--  fs/autofs/Makefile | 7
-rw-r--r--  fs/autofs/autofs_i.h | 165
-rw-r--r--  fs/autofs/dirhash.c | 250
-rw-r--r--  fs/autofs/init.c | 52
-rw-r--r--  fs/autofs/inode.c | 288
-rw-r--r--  fs/autofs/root.c | 643
-rw-r--r--  fs/autofs/symlink.c | 26
-rw-r--r--  fs/autofs/waitq.c | 205
-rw-r--r--  fs/autofs4/autofs_i.h | 134
-rw-r--r--  fs/autofs4/dev-ioctl.c | 3
-rw-r--r--  fs/autofs4/expire.c | 170
-rw-r--r--  fs/autofs4/init.c | 8
-rw-r--r--  fs/autofs4/inode.c | 115
-rw-r--r--  fs/autofs4/root.c | 790
-rw-r--r--  fs/autofs4/symlink.c | 3
-rw-r--r--  fs/autofs4/waitq.c | 40
-rw-r--r--  fs/bad_inode.c | 5
-rw-r--r--  fs/befs/endian.h | 16
-rw-r--r--  fs/befs/linuxvfs.c | 23
-rw-r--r--  fs/bfs/dir.c | 2
-rw-r--r--  fs/bfs/inode.c | 22
-rw-r--r--  fs/binfmt_elf.c | 25
-rw-r--r--  fs/binfmt_misc.c | 12
-rw-r--r--  fs/bio-integrity.c | 7
-rw-r--r--  fs/bio.c | 23
-rw-r--r--  fs/block_dev.c | 820
-rw-r--r--  fs/btrfs/acl.c | 21
-rw-r--r--  fs/btrfs/ctree.h | 2
-rw-r--r--  fs/btrfs/disk-io.c | 19
-rw-r--r--  fs/btrfs/export.c | 12
-rw-r--r--  fs/btrfs/extent-tree.c | 3
-rw-r--r--  fs/btrfs/extent_io.c | 1
-rw-r--r--  fs/btrfs/file.c | 113
-rw-r--r--  fs/btrfs/inode.c | 128
-rw-r--r--  fs/btrfs/super.c | 17
-rw-r--r--  fs/btrfs/volumes.c | 32
-rw-r--r--  fs/btrfs/volumes.h | 3
-rw-r--r--  fs/buffer.c | 73
-rw-r--r--  fs/cachefiles/daemon.c | 1
-rw-r--r--  fs/ceph/Kconfig | 14
-rw-r--r--  fs/ceph/Makefile | 34
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 80
-rw-r--r--  fs/ceph/armor.c | 103
-rw-r--r--  fs/ceph/auth.c | 259
-rw-r--r--  fs/ceph/auth.h | 92
-rw-r--r--  fs/ceph/auth_none.c | 131
-rw-r--r--  fs/ceph/auth_none.h | 30
-rw-r--r--  fs/ceph/auth_x.c | 687
-rw-r--r--  fs/ceph/auth_x.h | 49
-rw-r--r--  fs/ceph/auth_x_protocol.h | 90
-rw-r--r--  fs/ceph/buffer.c | 65
-rw-r--r--  fs/ceph/buffer.h | 39
-rw-r--r--  fs/ceph/caps.c | 110
-rw-r--r--  fs/ceph/ceph_debug.h | 37
-rw-r--r--  fs/ceph/ceph_frag.c | 3
-rw-r--r--  fs/ceph/ceph_frag.h | 109
-rw-r--r--  fs/ceph/ceph_fs.c | 72
-rw-r--r--  fs/ceph/ceph_fs.h | 728
-rw-r--r--  fs/ceph/ceph_hash.c | 118
-rw-r--r--  fs/ceph/ceph_hash.h | 13
-rw-r--r--  fs/ceph/crush/crush.c | 151
-rw-r--r--  fs/ceph/crush/crush.h | 180
-rw-r--r--  fs/ceph/crush/hash.c | 149
-rw-r--r--  fs/ceph/crush/hash.h | 17
-rw-r--r--  fs/ceph/crush/mapper.c | 609
-rw-r--r--  fs/ceph/crush/mapper.h | 20
-rw-r--r--  fs/ceph/crypto.c | 412
-rw-r--r--  fs/ceph/crypto.h | 48
-rw-r--r--  fs/ceph/debugfs.c | 415
-rw-r--r--  fs/ceph/decode.h | 196
-rw-r--r--  fs/ceph/dir.c | 166
-rw-r--r--  fs/ceph/export.c | 7
-rw-r--r--  fs/ceph/file.c | 264
-rw-r--r--  fs/ceph/inode.c | 119
-rw-r--r--  fs/ceph/ioctl.c | 77
-rw-r--r--  fs/ceph/ioctl.h | 2
-rw-r--r--  fs/ceph/locks.c | 117
-rw-r--r--  fs/ceph/mds_client.c | 236
-rw-r--r--  fs/ceph/mds_client.h | 55
-rw-r--r--  fs/ceph/mdsmap.c | 11
-rw-r--r--  fs/ceph/mdsmap.h | 62
-rw-r--r--  fs/ceph/messenger.c | 2277
-rw-r--r--  fs/ceph/messenger.h | 253
-rw-r--r--  fs/ceph/mon_client.c | 1018
-rw-r--r--  fs/ceph/mon_client.h | 121
-rw-r--r--  fs/ceph/msgpool.c | 64
-rw-r--r--  fs/ceph/msgpool.h | 25
-rw-r--r--  fs/ceph/msgr.h | 175
-rw-r--r--  fs/ceph/osd_client.c | 1539
-rw-r--r--  fs/ceph/osd_client.h | 167
-rw-r--r--  fs/ceph/osdmap.c | 1110
-rw-r--r--  fs/ceph/osdmap.h | 128
-rw-r--r--  fs/ceph/pagelist.c | 63
-rw-r--r--  fs/ceph/pagelist.h | 54
-rw-r--r--  fs/ceph/rados.h | 405
-rw-r--r--  fs/ceph/snap.c | 10
-rw-r--r--  fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c) | 82
-rw-r--r--  fs/ceph/super.c | 1197
-rw-r--r--  fs/ceph/super.h | 408
-rw-r--r--  fs/ceph/types.h | 29
-rw-r--r--  fs/ceph/xattr.c | 21
-rw-r--r--  fs/char_dev.c | 16
-rw-r--r--  fs/cifs/Kconfig | 12
-rw-r--r--  fs/cifs/Makefile | 6
-rw-r--r--  fs/cifs/README | 19
-rw-r--r--  fs/cifs/TODO | 2
-rw-r--r--  fs/cifs/cache.c | 16
-rw-r--r--  fs/cifs/cifs_debug.c | 44
-rw-r--r--  fs/cifs/cifs_debug.h | 2
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 138
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 15
-rw-r--r--  fs/cifs/cifs_spnego.c | 10
-rw-r--r--  fs/cifs/cifs_unicode.c | 127
-rw-r--r--  fs/cifs/cifsacl.c | 106
-rw-r--r--  fs/cifs/cifsacl.h | 4
-rw-r--r--  fs/cifs/cifsencrypt.c | 583
-rw-r--r--  fs/cifs/cifsencrypt.h | 33
-rw-r--r--  fs/cifs/cifsfs.c | 223
-rw-r--r--  fs/cifs/cifsfs.h | 33
-rw-r--r--  fs/cifs/cifsglob.h | 255
-rw-r--r--  fs/cifs/cifspdu.h | 76
-rw-r--r--  fs/cifs/cifsproto.h | 60
-rw-r--r--  fs/cifs/cifssmb.c | 339
-rw-r--r--  fs/cifs/cn_cifs.h | 37
-rw-r--r--  fs/cifs/connect.c | 1274
-rw-r--r--  fs/cifs/dir.c | 306
-rw-r--r--  fs/cifs/dns_resolve.c | 2
-rw-r--r--  fs/cifs/file.c | 1495
-rw-r--r--  fs/cifs/fscache.c | 21
-rw-r--r--  fs/cifs/inode.c | 344
-rw-r--r--  fs/cifs/ioctl.c | 27
-rw-r--r--  fs/cifs/link.c | 417
-rw-r--r--  fs/cifs/md4.c | 205
-rw-r--r--  fs/cifs/md5.c | 366
-rw-r--r--  fs/cifs/md5.h | 38
-rw-r--r--  fs/cifs/misc.c | 246
-rw-r--r--  fs/cifs/netmisc.c | 8
-rw-r--r--  fs/cifs/ntlmssp.h | 15
-rw-r--r--  fs/cifs/readdir.c | 105
-rw-r--r--  fs/cifs/sess.c | 343
-rw-r--r--  fs/cifs/smbdes.c | 1
-rw-r--r--  fs/cifs/smbencrypt.c | 92
-rw-r--r--  fs/cifs/transport.c | 486
-rw-r--r--  fs/cifs/xattr.c | 115
-rw-r--r--  fs/coda/cache.c | 26
-rw-r--r--  fs/coda/cnode.c | 22
-rw-r--r--  fs/coda/coda_cache.h | 22
-rw-r--r--  fs/coda/coda_fs_i.h | 58
-rw-r--r--  fs/coda/coda_linux.c | 3
-rw-r--r--  fs/coda/coda_linux.h | 101
-rw-r--r--  fs/coda/dir.c | 184
-rw-r--r--  fs/coda/file.c | 34
-rw-r--r--  fs/coda/inode.c | 82
-rw-r--r--  fs/coda/pioctl.c | 31
-rw-r--r--  fs/coda/psdev.c | 46
-rw-r--r--  fs/coda/symlink.c | 7
-rw-r--r--  fs/coda/upcall.c | 94
-rw-r--r--  fs/compat.c | 84
-rw-r--r--  fs/compat_ioctl.c | 97
-rw-r--r--  fs/configfs/Kconfig | 4
-rw-r--r--  fs/configfs/configfs_internal.h | 5
-rw-r--r--  fs/configfs/dir.c | 22
-rw-r--r--  fs/configfs/inode.c | 9
-rw-r--r--  fs/configfs/mount.c | 9
-rw-r--r--  fs/cramfs/inode.c | 119
-rw-r--r--  fs/dcache.c | 1530
-rw-r--r--  fs/debugfs/file.c | 3
-rw-r--r--  fs/debugfs/inode.c | 9
-rw-r--r--  fs/devpts/inode.c | 32
-rw-r--r--  fs/direct-io.c | 12
-rw-r--r--  fs/dlm/Kconfig | 3
-rw-r--r--  fs/dlm/debug_fs.c | 3
-rw-r--r--  fs/dlm/lock.c | 3
-rw-r--r--  fs/dlm/lowcomms.c | 63
-rw-r--r--  fs/dlm/plock.c | 3
-rw-r--r--  fs/dlm/user.c | 3
-rw-r--r--  fs/ecryptfs/crypto.c | 30
-rw-r--r--  fs/ecryptfs/dentry.c | 9
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 5
-rw-r--r--  fs/ecryptfs/file.c | 32
-rw-r--r--  fs/ecryptfs/inode.c | 54
-rw-r--r--  fs/ecryptfs/keystore.c | 71
-rw-r--r--  fs/ecryptfs/main.c | 185
-rw-r--r--  fs/ecryptfs/miscdev.c | 1
-rw-r--r--  fs/ecryptfs/mmap.c | 35
-rw-r--r--  fs/ecryptfs/super.c | 15
-rw-r--r--  fs/efs/super.c | 17
-rw-r--r--  fs/eventfd.c | 1
-rw-r--r--  fs/eventpoll.c | 66
-rw-r--r--  fs/exec.c | 218
-rw-r--r--  fs/exofs/dir.c | 4
-rw-r--r--  fs/exofs/file.c | 6
-rw-r--r--  fs/exofs/inode.c | 76
-rw-r--r--  fs/exofs/ios.c | 10
-rw-r--r--  fs/exofs/namei.c | 2
-rw-r--r--  fs/exofs/super.c | 19
-rw-r--r--  fs/exportfs/expfs.c | 31
-rw-r--r--  fs/ext2/acl.c | 11
-rw-r--r--  fs/ext2/acl.h | 2
-rw-r--r--  fs/ext2/balloc.c | 3
-rw-r--r--  fs/ext2/dir.c | 21
-rw-r--r--  fs/ext2/ext2.h | 1
-rw-r--r--  fs/ext2/inode.c | 15
-rw-r--r--  fs/ext2/namei.c | 4
-rw-r--r--  fs/ext2/super.c | 52
-rw-r--r--  fs/ext2/xattr.c | 12
-rw-r--r--  fs/ext3/acl.c | 11
-rw-r--r--  fs/ext3/acl.h | 2
-rw-r--r--  fs/ext3/balloc.c | 283
-rw-r--r--  fs/ext3/dir.c | 15
-rw-r--r--  fs/ext3/fsync.c | 3
-rw-r--r--  fs/ext3/ialloc.c | 11
-rw-r--r--  fs/ext3/inode.c | 30
-rw-r--r--  fs/ext3/ioctl.c | 22
-rw-r--r--  fs/ext3/namei.c | 140
-rw-r--r--  fs/ext3/resize.c | 78
-rw-r--r--  fs/ext3/super.c | 173
-rw-r--r--  fs/ext3/xattr.c | 2
-rw-r--r--  fs/ext4/Makefile | 2
-rw-r--r--  fs/ext4/acl.c | 11
-rw-r--r--  fs/ext4/acl.h | 2
-rw-r--r--  fs/ext4/balloc.c | 8
-rw-r--r--  fs/ext4/block_validity.c | 7
-rw-r--r--  fs/ext4/dir.c | 58
-rw-r--r--  fs/ext4/ext4.h | 208
-rw-r--r--  fs/ext4/ext4_extents.h | 73
-rw-r--r--  fs/ext4/ext4_jbd2.h | 2
-rw-r--r--  fs/ext4/extents.c | 455
-rw-r--r--  fs/ext4/file.c | 68
-rw-r--r--  fs/ext4/fsync.c | 90
-rw-r--r--  fs/ext4/ialloc.c | 137
-rw-r--r--  fs/ext4/inode.c | 683
-rw-r--r--  fs/ext4/ioctl.c | 24
-rw-r--r--  fs/ext4/mballoc.c | 598
-rw-r--r--  fs/ext4/migrate.c | 4
-rw-r--r--  fs/ext4/move_extent.c | 22
-rw-r--r--  fs/ext4/namei.c | 134
-rw-r--r--  fs/ext4/page-io.c | 428
-rw-r--r--  fs/ext4/resize.c | 119
-rw-r--r--  fs/ext4/super.c | 971
-rw-r--r--  fs/ext4/xattr.c | 32
-rw-r--r--  fs/ext4/xattr.h | 10
-rw-r--r--  fs/fat/fat.h | 3
-rw-r--r--  fs/fat/fatent.c | 3
-rw-r--r--  fs/fat/inode.c | 27
-rw-r--r--  fs/fat/misc.c | 5
-rw-r--r--  fs/fat/namei_msdos.c | 49
-rw-r--r--  fs/fat/namei_vfat.c | 79
-rw-r--r--  fs/fcntl.c | 64
-rw-r--r--  fs/fifo.c | 1
-rw-r--r--  fs/file_table.c | 21
-rw-r--r--  fs/filesystems.c | 3
-rw-r--r--  fs/freevxfs/vxfs_inode.c | 10
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 14
-rw-r--r--  fs/freevxfs/vxfs_super.c | 16
-rw-r--r--  fs/fs-writeback.c | 197
-rw-r--r--  fs/fs_struct.c | 49
-rw-r--r--  fs/fscache/operation.c | 2
-rw-r--r--  fs/fuse/control.c | 15
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/fuse/dev.c | 175
-rw-r--r--  fs/fuse/dir.c | 70
-rw-r--r--  fs/fuse/file.c | 140
-rw-r--r--  fs/fuse/fuse_i.h | 27
-rw-r--r--  fs/fuse/inode.c | 66
-rw-r--r--  fs/generic_acl.c | 20
-rw-r--r--  fs/gfs2/Kconfig | 2
-rw-r--r--  fs/gfs2/acl.c | 5
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/aops.c | 27
-rw-r--r--  fs/gfs2/bmap.c | 266
-rw-r--r--  fs/gfs2/bmap.h | 20
-rw-r--r--  fs/gfs2/dentry.c | 24
-rw-r--r--  fs/gfs2/dir.c | 31
-rw-r--r--  fs/gfs2/dir.h | 34
-rw-r--r--  fs/gfs2/export.c | 66
-rw-r--r--  fs/gfs2/file.c | 270
-rw-r--r--  fs/gfs2/glock.c | 109
-rw-r--r--  fs/gfs2/glock.h | 30
-rw-r--r--  fs/gfs2/glops.c | 7
-rw-r--r--  fs/gfs2/incore.h | 21
-rw-r--r--  fs/gfs2/inode.c | 234
-rw-r--r--  fs/gfs2/inode.h | 22
-rw-r--r--  fs/gfs2/lock_dlm.c | 19
-rw-r--r--  fs/gfs2/log.c | 19
-rw-r--r--  fs/gfs2/main.c | 6
-rw-r--r--  fs/gfs2/meta_io.c | 2
-rw-r--r--  fs/gfs2/ops_fstype.c | 139
-rw-r--r--  fs/gfs2/ops_inode.c | 112
-rw-r--r--  fs/gfs2/quota.c | 44
-rw-r--r--  fs/gfs2/recovery.c | 15
-rw-r--r--  fs/gfs2/rgrp.c | 184
-rw-r--r--  fs/gfs2/rgrp.h | 9
-rw-r--r--  fs/gfs2/super.c | 37
-rw-r--r--  fs/gfs2/sys.c | 22
-rw-r--r--  fs/gfs2/trace_gfs2.h | 3
-rw-r--r--  fs/gfs2/trans.h | 9
-rw-r--r--  fs/gfs2/xattr.c | 25
-rw-r--r--  fs/hfs/bfind.c | 4
-rw-r--r--  fs/hfs/btree.c | 2
-rw-r--r--  fs/hfs/btree.h | 2
-rw-r--r--  fs/hfs/dir.c | 2
-rw-r--r--  fs/hfs/hfs_fs.h | 21
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfs/mdb.c | 4
-rw-r--r--  fs/hfs/string.c | 17
-rw-r--r--  fs/hfs/super.c | 28
-rw-r--r--  fs/hfs/sysdep.c | 7
-rw-r--r--  fs/hfsplus/bfind.c | 23
-rw-r--r--  fs/hfsplus/bitmap.c | 23
-rw-r--r--  fs/hfsplus/bnode.c | 70
-rw-r--r--  fs/hfsplus/brec.c | 57
-rw-r--r--  fs/hfsplus/btree.c | 100
-rw-r--r--  fs/hfsplus/catalog.c | 127
-rw-r--r--  fs/hfsplus/dir.c | 237
-rw-r--r--  fs/hfsplus/extents.c | 275
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 207
-rw-r--r--  fs/hfsplus/hfsplus_raw.h | 6
-rw-r--r--  fs/hfsplus/inode.c | 272
-rw-r--r--  fs/hfsplus/ioctl.c | 157
-rw-r--r--  fs/hfsplus/options.c | 54
-rw-r--r--  fs/hfsplus/part_tbl.c | 130
-rw-r--r--  fs/hfsplus/super.c | 508
-rw-r--r--  fs/hfsplus/unicode.c | 72
-rw-r--r--  fs/hfsplus/wrapper.c | 196
-rw-r--r--  fs/hostfs/hostfs.h | 10
-rw-r--r--  fs/hostfs/hostfs_kern.c | 54
-rw-r--r--  fs/hostfs/hostfs_user.c | 14
-rw-r--r--  fs/hpfs/Kconfig | 1
-rw-r--r--  fs/hpfs/buffer.c | 4
-rw-r--r--  fs/hpfs/dentry.c | 32
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/hpfs_fn.h | 4
-rw-r--r--  fs/hpfs/inode.c | 2
-rw-r--r--  fs/hpfs/namei.c | 2
-rw-r--r--  fs/hpfs/super.c | 30
-rw-r--r--  fs/hppfs/hppfs.c | 18
-rw-r--r--  fs/hugetlbfs/inode.c | 37
-rw-r--r--  fs/inode.c | 543
-rw-r--r--  fs/internal.h | 12
-rw-r--r--  fs/ioctl.c | 18
-rw-r--r--  fs/ioprio.c | 13
-rw-r--r--  fs/isofs/dir.c | 6
-rw-r--r--  fs/isofs/inode.c | 217
-rw-r--r--  fs/isofs/isofs.h | 1
-rw-r--r--  fs/isofs/namei.c | 13
-rw-r--r--  fs/isofs/rock.c | 10
-rw-r--r--  fs/jbd/checkpoint.c | 4
-rw-r--r--  fs/jbd/commit.c | 40
-rw-r--r--  fs/jbd/journal.c | 44
-rw-r--r--  fs/jbd/recovery.c | 2
-rw-r--r--  fs/jbd/transaction.c | 8
-rw-r--r--  fs/jbd2/checkpoint.c | 13
-rw-r--r--  fs/jbd2/commit.c | 88
-rw-r--r--  fs/jbd2/journal.c | 68
-rw-r--r--  fs/jbd2/recovery.c | 2
-rw-r--r--  fs/jbd2/transaction.c | 9
-rw-r--r--  fs/jffs2/acl.c | 5
-rw-r--r--  fs/jffs2/acl.h | 2
-rw-r--r--  fs/jffs2/build.c | 7
-rw-r--r--  fs/jffs2/compr.c | 6
-rw-r--r--  fs/jffs2/compr.h | 4
-rw-r--r--  fs/jffs2/compr_lzo.c | 4
-rw-r--r--  fs/jffs2/compr_rtime.c | 6
-rw-r--r--  fs/jffs2/compr_rubin.c | 11
-rw-r--r--  fs/jffs2/compr_zlib.c | 6
-rw-r--r--  fs/jffs2/dir.c | 7
-rw-r--r--  fs/jffs2/erase.c | 2
-rw-r--r--  fs/jffs2/fs.c | 26
-rw-r--r--  fs/jffs2/gc.c | 7
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 3
-rw-r--r--  fs/jffs2/nodelist.c | 8
-rw-r--r--  fs/jffs2/nodelist.h | 3
-rw-r--r--  fs/jffs2/scan.c | 12
-rw-r--r--  fs/jffs2/super.c | 27
-rw-r--r--  fs/jffs2/xattr.c | 12
-rw-r--r--  fs/jfs/acl.c | 8
-rw-r--r--  fs/jfs/jfs_acl.h | 2
-rw-r--r--  fs/jfs/jfs_imap.c | 2
-rw-r--r--  fs/jfs/jfs_logmgr.c | 23
-rw-r--r--  fs/jfs/jfs_mount.c | 4
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 2
-rw-r--r--  fs/jfs/namei.c | 71
-rw-r--r--  fs/jfs/super.c | 48
-rw-r--r--  fs/libfs.c | 116
-rw-r--r--  fs/lockd/Makefile | 6
-rw-r--r--  fs/lockd/clnt4xdr.c | 605
-rw-r--r--  fs/lockd/clntlock.c | 20
-rw-r--r--  fs/lockd/clntproc.c | 32
-rw-r--r--  fs/lockd/clntxdr.c | 627
-rw-r--r--  fs/lockd/host.c | 418
-rw-r--r--  fs/lockd/mon.c | 111
-rw-r--r--  fs/lockd/svc.c | 13
-rw-r--r--  fs/lockd/svc4proc.c | 23
-rw-r--r--  fs/lockd/svclock.c | 72
-rw-r--r--  fs/lockd/svcproc.c | 31
-rw-r--r--  fs/lockd/svcsubs.c | 9
-rw-r--r--  fs/lockd/xdr.c | 287
-rw-r--r--  fs/lockd/xdr4.c | 255
-rw-r--r--  fs/locks.c | 261
-rw-r--r--  fs/logfs/dev_bdev.c | 20
-rw-r--r--  fs/logfs/dev_mtd.c | 18
-rw-r--r--  fs/logfs/dir.c | 9
-rw-r--r--  fs/logfs/inode.c | 9
-rw-r--r--  fs/logfs/journal.c | 2
-rw-r--r--  fs/logfs/logfs.h | 22
-rw-r--r--  fs/logfs/readwrite.c | 3
-rw-r--r--  fs/logfs/super.c | 77
-rw-r--r--  fs/mbcache.c | 12
-rw-r--r--  fs/minix/inode.c | 18
-rw-r--r--  fs/minix/namei.c | 4
-rw-r--r--  fs/mpage.c | 49
-rw-r--r--  fs/namei.c | 1158
-rw-r--r--  fs/namespace.c | 367
-rw-r--r--  fs/ncpfs/dir.c | 253
-rw-r--r--  fs/ncpfs/file.c | 29
-rw-r--r--  fs/ncpfs/inode.c | 87
-rw-r--r--  fs/ncpfs/ioctl.c | 475
-rw-r--r--  fs/ncpfs/mmap.c | 4
-rw-r--r--  fs/ncpfs/ncp_fs.h | 98
-rw-r--r--  fs/ncpfs/ncp_fs_i.h | 29
-rw-r--r--  fs/ncpfs/ncp_fs_sb.h | 176
-rw-r--r--  fs/ncpfs/ncplib_kernel.c | 103
-rw-r--r--  fs/ncpfs/ncplib_kernel.h | 33
-rw-r--r--  fs/ncpfs/ncpsign_kernel.c | 11
-rw-r--r--  fs/ncpfs/ncpsign_kernel.h | 2
-rw-r--r--  fs/ncpfs/sock.c | 3
-rw-r--r--  fs/ncpfs/symlink.c | 4
-rw-r--r--  fs/nfs/Kconfig | 19
-rw-r--r--  fs/nfs/Makefile | 4
-rw-r--r--  fs/nfs/callback.c | 97
-rw-r--r--  fs/nfs/callback.h | 61
-rw-r--r--  fs/nfs/callback_proc.c | 332
-rw-r--r--  fs/nfs/callback_xdr.c | 142
-rw-r--r--  fs/nfs/client.c | 325
-rw-r--r--  fs/nfs/delegation.c | 379
-rw-r--r--  fs/nfs/delegation.h | 1
-rw-r--r--  fs/nfs/dir.c | 1117
-rw-r--r--  fs/nfs/direct.c | 38
-rw-r--r--  fs/nfs/dns_resolve.c | 6
-rw-r--r--  fs/nfs/file.c | 89
-rw-r--r--  fs/nfs/getroot.c | 15
-rw-r--r--  fs/nfs/idmap.c | 211
-rw-r--r--  fs/nfs/inode.c | 80
-rw-r--r--  fs/nfs/internal.h | 34
-rw-r--r--  fs/nfs/mount_clnt.c | 91
-rw-r--r--  fs/nfs/namespace.c | 94
-rw-r--r--  fs/nfs/nfs2xdr.c | 1339
-rw-r--r--  fs/nfs/nfs3acl.c | 4
-rw-r--r--  fs/nfs/nfs3proc.c | 62
-rw-r--r--  fs/nfs/nfs3xdr.c | 2870
-rw-r--r--  fs/nfs/nfs4_fs.h | 15
-rw-r--r--  fs/nfs/nfs4filelayout.c | 280
-rw-r--r--  fs/nfs/nfs4filelayout.h | 94
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 453
-rw-r--r--  fs/nfs/nfs4proc.c | 704
-rw-r--r--  fs/nfs/nfs4renewd.c | 11
-rw-r--r--  fs/nfs/nfs4state.c | 335
-rw-r--r--  fs/nfs/nfs4xdr.c | 2033
-rw-r--r--  fs/nfs/nfsroot.c | 568
-rw-r--r--  fs/nfs/pagelist.c | 19
-rw-r--r--  fs/nfs/pnfs.c | 965
-rw-r--r--  fs/nfs/pnfs.h | 235
-rw-r--r--  fs/nfs/proc.c | 40
-rw-r--r--  fs/nfs/read.c | 5
-rw-r--r--  fs/nfs/super.c | 200
-rw-r--r--  fs/nfs/sysctl.c | 2
-rw-r--r--  fs/nfs/unlink.c | 259
-rw-r--r--  fs/nfs/write.c | 27
-rw-r--r--  fs/nfs_common/nfsacl.c | 54
-rw-r--r--  fs/nfsd/Kconfig | 12
-rw-r--r--  fs/nfsd/acl.h | 59
-rw-r--r--  fs/nfsd/export.c | 77
-rw-r--r--  fs/nfsd/idmap.h | 62
-rw-r--r--  fs/nfsd/nfs3proc.c | 8
-rw-r--r--  fs/nfsd/nfs3xdr.c | 6
-rw-r--r--  fs/nfsd/nfs4acl.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 1036
-rw-r--r--  fs/nfsd/nfs4idmap.c | 120
-rw-r--r--  fs/nfsd/nfs4proc.c | 66
-rw-r--r--  fs/nfsd/nfs4recover.c | 1
-rw-r--r--  fs/nfsd/nfs4state.c | 776
-rw-r--r--  fs/nfsd/nfs4xdr.c | 133
-rw-r--r--  fs/nfsd/nfsctl.c | 39
-rw-r--r--  fs/nfsd/nfsd.h | 3
-rw-r--r--  fs/nfsd/nfsproc.c | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 7
-rw-r--r--  fs/nfsd/state.h | 68
-rw-r--r--  fs/nfsd/vfs.c | 112
-rw-r--r--  fs/nfsd/xdr4.h | 30
-rw-r--r--  fs/nilfs2/Makefile | 2
-rw-r--r--  fs/nilfs2/bmap.c | 69
-rw-r--r--  fs/nilfs2/bmap.h | 10
-rw-r--r--  fs/nilfs2/btnode.c | 20
-rw-r--r--  fs/nilfs2/cpfile.c | 72
-rw-r--r--  fs/nilfs2/cpfile.h | 4
-rw-r--r--  fs/nilfs2/dat.c | 92
-rw-r--r--  fs/nilfs2/dat.h | 4
-rw-r--r--  fs/nilfs2/dir.c | 3
-rw-r--r--  fs/nilfs2/export.h | 17
-rw-r--r--  fs/nilfs2/file.c | 1
-rw-r--r--  fs/nilfs2/gcdat.c | 87
-rw-r--r--  fs/nilfs2/gcinode.c | 127
-rw-r--r--  fs/nilfs2/ifile.c | 62
-rw-r--r--  fs/nilfs2/ifile.h | 4
-rw-r--r--  fs/nilfs2/inode.c | 351
-rw-r--r--  fs/nilfs2/ioctl.c | 52
-rw-r--r--  fs/nilfs2/mdt.c | 317
-rw-r--r--  fs/nilfs2/mdt.h | 32
-rw-r--r--  fs/nilfs2/namei.c | 142
-rw-r--r--  fs/nilfs2/nilfs.h | 49
-rw-r--r--  fs/nilfs2/page.c | 139
-rw-r--r--  fs/nilfs2/page.h | 9
-rw-r--r--  fs/nilfs2/recovery.c | 21
-rw-r--r--  fs/nilfs2/sb.h | 18
-rw-r--r--  fs/nilfs2/segbuf.c | 3
-rw-r--r--  fs/nilfs2/segment.c | 147
-rw-r--r--  fs/nilfs2/segment.h | 10
-rw-r--r--  fs/nilfs2/sufile.c | 77
-rw-r--r--  fs/nilfs2/sufile.h | 6
-rw-r--r--  fs/nilfs2/super.c | 680
-rw-r--r--  fs/nilfs2/the_nilfs.c | 352
-rw-r--r--  fs/nilfs2/the_nilfs.h | 104
-rw-r--r--  fs/no-block.c | 1
-rw-r--r--  fs/notify/Kconfig | 2
-rw-r--r--  fs/notify/fanotify/Kconfig | 2
-rw-r--r--  fs/notify/fanotify/fanotify.c | 33
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 176
-rw-r--r--  fs/notify/fsnotify.c | 76
-rw-r--r--  fs/notify/inode_mark.c | 11
-rw-r--r--  fs/notify/inotify/inotify_user.c | 4
-rw-r--r--  fs/notify/vfsmount_mark.c | 6
-rw-r--r--  fs/ntfs/Makefile | 2
-rw-r--r--  fs/ntfs/file.c | 35
-rw-r--r--  fs/ntfs/inode.c | 9
-rw-r--r--  fs/ntfs/mft.c | 11
-rw-r--r--  fs/ntfs/super.c | 58
-rw-r--r--  fs/ocfs2/Kconfig | 5
-rw-r--r--  fs/ocfs2/acl.c | 8
-rw-r--r--  fs/ocfs2/acl.h | 2
-rw-r--r--  fs/ocfs2/alloc.c | 77
-rw-r--r--  fs/ocfs2/alloc.h | 4
-rw-r--r--  fs/ocfs2/aops.c | 94
-rw-r--r--  fs/ocfs2/aops.h | 29
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 787
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 4
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 3
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 14
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 286
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 5
-rw-r--r--  fs/ocfs2/cluster/ocfs2_nodemanager.h | 6
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 4
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 150
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 35
-rw-r--r--  fs/ocfs2/dcache.c | 52
-rw-r--r--  fs/ocfs2/dcache.h | 1
-rw-r--r--  fs/ocfs2/dir.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 76
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 109
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 212
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.h | 5
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 410
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 40
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 132
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 20
-rw-r--r--  fs/ocfs2/dlmglue.c | 8
-rw-r--r--  fs/ocfs2/export.c | 6
-rw-r--r--  fs/ocfs2/file.c | 125
-rw-r--r--  fs/ocfs2/file.h | 2
-rw-r--r--  fs/ocfs2/inode.c | 3
-rw-r--r--  fs/ocfs2/inode.h | 12
-rw-r--r--  fs/ocfs2/ioctl.c | 356
-rw-r--r--  fs/ocfs2/journal.c | 9
-rw-r--r--  fs/ocfs2/journal.h | 3
-rw-r--r--  fs/ocfs2/mmap.c | 7
-rw-r--r--  fs/ocfs2/namei.c | 15
-rw-r--r--  fs/ocfs2/ocfs2.h | 68
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 46
-rw-r--r--  fs/ocfs2/ocfs2_ioctl.h | 95
-rw-r--r--  fs/ocfs2/refcounttree.c | 43
-rw-r--r--  fs/ocfs2/refcounttree.h | 7
-rw-r--r--  fs/ocfs2/slot_map.c | 2
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 2
-rw-r--r--  fs/ocfs2/stack_user.c | 6
-rw-r--r--  fs/ocfs2/suballoc.c | 18
-rw-r--r--  fs/ocfs2/super.c | 197
-rw-r--r--  fs/ocfs2/sysfile.c | 60
-rw-r--r--  fs/ocfs2/xattr.c | 2
-rw-r--r--  fs/omfs/inode.c | 9
-rw-r--r--  fs/open.c | 17
-rw-r--r--  fs/openpromfs/inode.c | 17
-rw-r--r--  fs/partitions/check.c | 130
-rw-r--r--  fs/partitions/check.h | 3
-rw-r--r--  fs/partitions/efi.c | 25
-rw-r--r--  fs/partitions/ldm.c | 2
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/pipe.c | 45
-rw-r--r--  fs/pnode.c | 4
-rw-r--r--  fs/posix_acl.c | 17
-rw-r--r--  fs/proc/Kconfig | 10
-rw-r--r--  fs/proc/Makefile | 1
-rw-r--r--  fs/proc/array.c | 28
-rw-r--r--  fs/proc/base.c | 302
-rw-r--r--  fs/proc/consoles.c | 114
-rw-r--r--  fs/proc/devices.c | 4
-rw-r--r--  fs/proc/generic.c | 21
-rw-r--r--  fs/proc/inode.c | 17
-rw-r--r--  fs/proc/internal.h | 5
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/meminfo.c | 14
-rw-r--r--  fs/proc/page.c | 16
-rw-r--r--  fs/proc/proc_sysctl.c | 34
-rw-r--r--  fs/proc/proc_tty.c | 26
-rw-r--r--  fs/proc/root.c | 17
-rw-r--r--  fs/proc/softirqs.c | 8
-rw-r--r--  fs/proc/stat.c | 16
-rw-r--r--  fs/proc/task_mmu.c | 22
-rw-r--r--  fs/proc/task_nommu.c | 7
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/dir.c | 4
-rw-r--r--  fs/qnx4/inode.c | 24
-rw-r--r--  fs/qnx4/namei.c | 4
-rw-r--r--  fs/quota/Kconfig | 4
-rw-r--r--  fs/quota/dquot.c | 66
-rw-r--r--  fs/quota/quota.c | 41
-rw-r--r--  fs/quota/quota_tree.c | 9
-rw-r--r--  fs/ramfs/inode.c | 18
-rw-r--r--  fs/read_write.c | 91
-rw-r--r--  fs/reiserfs/Kconfig | 6
-rw-r--r--  fs/reiserfs/README | 2
-rw-r--r--  fs/reiserfs/file.c | 3
-rw-r--r--  fs/reiserfs/inode.c | 27
-rw-r--r--  fs/reiserfs/ioctl.c | 14
-rw-r--r--  fs/reiserfs/journal.c | 128
-rw-r--r--  fs/reiserfs/namei.c | 2
-rw-r--r--  fs/reiserfs/prints.c | 4
-rw-r--r--  fs/reiserfs/super.c | 36
-rw-r--r--  fs/reiserfs/xattr.c | 25
-rw-r--r--  fs/reiserfs/xattr_acl.c | 6
-rw-r--r--  fs/romfs/super.c | 27
-rw-r--r--  fs/select.c | 8
-rw-r--r--  fs/seq_file.c | 8
-rw-r--r--  fs/signalfd.c | 11
-rw-r--r--  fs/smbfs/Kconfig | 55
-rw-r--r--  fs/smbfs/Makefile | 18
-rw-r--r--  fs/smbfs/cache.c | 208
-rw-r--r--  fs/smbfs/dir.c | 702
-rw-r--r--  fs/smbfs/file.c | 454
-rw-r--r--  fs/smbfs/getopt.c | 64
-rw-r--r--  fs/smbfs/getopt.h | 14
-rw-r--r--  fs/smbfs/inode.c | 839
-rw-r--r--  fs/smbfs/ioctl.c | 69
-rw-r--r--  fs/smbfs/proc.c | 3507
-rw-r--r--  fs/smbfs/proto.h | 87
-rw-r--r--  fs/smbfs/request.c | 818
-rw-r--r--  fs/smbfs/request.h | 70
-rw-r--r--  fs/smbfs/smb_debug.h | 34
-rw-r--r--  fs/smbfs/smbiod.c | 344
-rw-r--r--  fs/smbfs/sock.c | 386
-rw-r--r--  fs/smbfs/symlink.c | 68
-rw-r--r--  fs/splice.c | 67
-rw-r--r--  fs/squashfs/Kconfig | 18
-rw-r--r--  fs/squashfs/Makefile | 1
-rw-r--r--  fs/squashfs/block.c | 9
-rw-r--r--  fs/squashfs/cache.c | 1
-rw-r--r--  fs/squashfs/decompressor.c | 16
-rw-r--r--  fs/squashfs/decompressor.h | 9
-rw-r--r--  fs/squashfs/dir.c | 3
-rw-r--r--  fs/squashfs/fragment.c | 1
-rw-r--r--  fs/squashfs/id.c | 1
-rw-r--r--  fs/squashfs/lzo_wrapper.c | 1
-rw-r--r--  fs/squashfs/squashfs.h | 8
-rw-r--r--  fs/squashfs/squashfs_fs.h | 1
-rw-r--r--  fs/squashfs/squashfs_fs_i.h | 6
-rw-r--r--  fs/squashfs/super.c | 24
-rw-r--r--  fs/squashfs/xattr.c | 9
-rw-r--r--  fs/squashfs/xattr.h | 4
-rw-r--r--  fs/squashfs/xattr_id.c | 2
-rw-r--r--  fs/squashfs/xz_wrapper.c | 147
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 21
-rw-r--r--  fs/stat.c | 4
-rw-r--r--  fs/super.c | 141
-rw-r--r--  fs/sysfs/Kconfig | 2
-rw-r--r--  fs/sysfs/bin.c | 68
-rw-r--r--  fs/sysfs/dir.c | 10
-rw-r--r--  fs/sysfs/group.c | 53
-rw-r--r--  fs/sysfs/inode.c | 12
-rw-r--r--  fs/sysfs/mount.c | 32
-rw-r--r--  fs/sysfs/sysfs.h | 3
-rw-r--r--  fs/sysv/inode.c | 9
-rw-r--r--  fs/sysv/namei.c | 6
-rw-r--r--  fs/sysv/super.c | 25
-rw-r--r--  fs/timerfd.c | 1
-rw-r--r--  fs/ubifs/commit.c | 4
-rw-r--r--  fs/ubifs/debug.c | 157
-rw-r--r--  fs/ubifs/debug.h | 4
-rw-r--r--  fs/ubifs/dir.c | 2
-rw-r--r--  fs/ubifs/file.c | 7
-rw-r--r--  fs/ubifs/gc.c | 82
-rw-r--r--  fs/ubifs/io.c | 20
-rw-r--r--  fs/ubifs/journal.c | 3
-rw-r--r--  fs/ubifs/key.h | 14
-rw-r--r--  fs/ubifs/log.c | 6
-rw-r--r--  fs/ubifs/lpt.c | 7
-rw-r--r--  fs/ubifs/lpt_commit.c | 3
-rw-r--r--  fs/ubifs/master.c | 3
-rw-r--r--  fs/ubifs/misc.h | 9
-rw-r--r--  fs/ubifs/recovery.c | 11
-rw-r--r--  fs/ubifs/replay.c | 20
-rw-r--r--  fs/ubifs/sb.c | 9
-rw-r--r--  fs/ubifs/scan.c | 6
-rw-r--r--  fs/ubifs/shrinker.c | 2
-rw-r--r--  fs/ubifs/super.c | 103
-rw-r--r--  fs/ubifs/tnc.c | 5
-rw-r--r--  fs/ubifs/ubifs.h | 23
-rw-r--r--  fs/udf/balloc.c | 3
-rw-r--r--  fs/udf/dir.c | 5
-rw-r--r--  fs/udf/file.c | 11
-rw-r--r--  fs/udf/ialloc.c | 21
-rw-r--r--  fs/udf/inode.c | 51
-rw-r--r--  fs/udf/namei.c | 109
-rw-r--r--  fs/udf/partition.c | 27
-rw-r--r--  fs/udf/super.c | 77
-rw-r--r--  fs/udf/symlink.c | 12
-rw-r--r--  fs/udf/udf_i.h | 13
-rw-r--r--  fs/udf/udf_sb.h | 22
-rw-r--r--  fs/udf/udfdecl.h | 4
-rw-r--r--  fs/ufs/Kconfig | 1
-rw-r--r--  fs/ufs/namei.c | 2
-rw-r--r--  fs/ufs/super.c | 22
-rw-r--r--  fs/xfs/Kconfig | 1
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/sv.h | 59
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 11
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 527
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 481
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 110
-rw-r--r--  fs/xfs/linux-2.6/xfs_cred.h | 28
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c | 191
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h | 8
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 587
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c | 31
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.h | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 44
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.h | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 100
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 75
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 477
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 97
-rw-r--r--  fs/xfs/linux-2.6/xfs_version.h | 29
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 165
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 267
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 16
-rw-r--r--  fs/xfs/support/debug.c | 112
-rw-r--r--  fs/xfs/support/debug.h | 25
-rw-r--r--  fs/xfs/xfs_acl.h | 2
-rw-r--r--  fs/xfs/xfs_ag.h | 11
-rw-r--r--  fs/xfs/xfs_alloc.c | 365
-rw-r--r--  fs/xfs/xfs_alloc.h | 41
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 33
-rw-r--r--  fs/xfs/xfs_attr.c | 37
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 4
-rw-r--r--  fs/xfs/xfs_bmap.c | 190
-rw-r--r--  fs/xfs/xfs_bmap.h | 14
-rw-r--r--  fs/xfs/xfs_btree.c | 65
-rw-r--r--  fs/xfs/xfs_btree.h | 14
-rw-r--r--  fs/xfs/xfs_buf_item.c | 194
-rw-r--r--  fs/xfs/xfs_buf_item.h | 11
-rw-r--r--  fs/xfs/xfs_da_btree.c | 2
-rw-r--r--  fs/xfs/xfs_dfrag.c | 13
-rw-r--r--  fs/xfs/xfs_dinode.h | 5
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 2
-rw-r--r--  fs/xfs/xfs_error.c | 34
-rw-r--r--  fs/xfs/xfs_error.h | 23
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 96
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 11
-rw-r--r--  fs/xfs/xfs_filestream.c | 8
-rw-r--r--  fs/xfs/xfs_fs.h | 7
-rw-r--r--  fs/xfs/xfs_fsops.c | 25
-rw-r--r--  fs/xfs/xfs_fsops.h | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 33
-rw-r--r--  fs/xfs/xfs_iget.c | 92
-rw-r--r--  fs/xfs/xfs_inode.c | 71
-rw-r--r--  fs/xfs/xfs_inode.h | 47
-rw-r--r--  fs/xfs/xfs_inode_item.c | 130
-rw-r--r--  fs/xfs/xfs_iomap.c | 238
-rw-r--r--  fs/xfs/xfs_iomap.h | 27
-rw-r--r--  fs/xfs/xfs_itable.c | 3
-rw-r--r--  fs/xfs/xfs_log.c | 759
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_log_cil.c | 264
-rw-r--r--  fs/xfs/xfs_log_priv.h | 127
-rw-r--r--  fs/xfs/xfs_log_recover.c | 647
-rw-r--r--  fs/xfs/xfs_mount.c | 330
-rw-r--r--  fs/xfs/xfs_mount.h | 23
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 2
-rw-r--r--  fs/xfs/xfs_quota.h | 20
-rw-r--r--  fs/xfs/xfs_refcache.h | 52
-rw-r--r--  fs/xfs/xfs_rename.c | 15
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 29
-rw-r--r--  fs/xfs/xfs_sb.h | 10
-rw-r--r--  fs/xfs/xfs_trans.c | 211
-rw-r--r--  fs/xfs/xfs_trans.h | 5
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 232
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 2
-rw-r--r--  fs/xfs/xfs_trans_extfree.c | 8
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 30
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 35
-rw-r--r--  fs/xfs/xfs_types.h | 2
-rw-r--r--  fs/xfs/xfs_utils.c | 9
-rw-r--r--  fs/xfs/xfs_utils.h | 3
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 126
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 6
870 files changed, 45428 insertions(+), 49732 deletions(-)
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 795233702a4e..814ac4e213a8 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -9,6 +9,8 @@ config 9P_FS
 
 	  If unsure, say N.
 
+if 9P_FS
+
 config 9P_FSCACHE
 	bool "Enable 9P client caching support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
@@ -17,3 +19,17 @@ config 9P_FSCACHE
 	  Choose Y here to enable persistent, read-only local
 	  caching support for 9p clients using FS-Cache
 
+
+config 9P_FS_POSIX_ACL
+	bool "9P POSIX Access Control Lists"
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+endif
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 91fba025fcbe..ab8c12780634 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
 9p-objs := \
 	vfs_super.o \
 	vfs_inode.o \
+	vfs_inode_dotl.o \
 	vfs_addr.o \
 	vfs_file.o \
 	vfs_dir.o \
@@ -13,3 +14,4 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	xattr_user.o
 
 9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 000000000000..02a2cf616318
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/posix_acl_xattr.h>
+#include "xattr.h"
+#include "acl.h"
+#include "v9fs_vfs.h"
+#include "v9fs.h"
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+	ssize_t size;
+	void *value = NULL;
+	struct posix_acl *acl = NULL;
+
+	size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = v9fs_fid_xattr_get(fid, name, value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				goto err_out;
+		}
+	} else if (size == -ENODATA || size == 0 ||
+		   size == -ENOSYS || size == -EOPNOTSUPP) {
+		acl = NULL;
+	} else
+		acl = ERR_PTR(-EIO);
+
+err_out:
+	kfree(value);
+	return acl;
+}
+
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	int retval = 0;
+	struct posix_acl *pacl, *dacl;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+		return 0;
+	}
+	/* get the default/access acl values and cache them */
+	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+
+	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+		posix_acl_release(dacl);
+		posix_acl_release(pacl);
+	} else
+		retval = -EIO;
+
+	return retval;
+}
+
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	/*
+	 * 9p always caches the acl value when
+	 * instantiating the inode (v9fs_inode_from_fid)
+	 */
+	acl = get_cached_acl(inode, type);
+	BUG_ON(acl == ACL_NOT_CACHED);
+	return acl;
+}
+
+int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
+{
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+		/*
+		 * Not in access = client mode; the server
+		 * does the acl checks for us
+		 */
+		return 0;
+	}
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+	return -EAGAIN;
+}
+
+static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+{
+	int retval;
+	char *name;
+	size_t size;
+	void *buffer;
+	struct inode *inode = dentry->d_inode;
+
+	set_cached_acl(inode, type, acl);
+	/* Send a setxattr request to the server */
+	size = posix_acl_xattr_size(acl->a_count);
+	buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+	retval = posix_acl_to_xattr(acl, buffer, size);
+	if (retval < 0)
+		goto err_free_out;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+err_free_out:
+	kfree(buffer);
+	return retval;
+}
+
+int v9fs_acl_chmod(struct dentry *dentry)
+{
+	int retval = 0;
+	struct posix_acl *acl, *clone;
+	struct inode *inode = dentry->d_inode;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+	if (acl) {
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+		retval = posix_acl_chmod_masq(clone, inode->i_mode);
+		if (!retval)
+			retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone);
+		posix_acl_release(clone);
+	}
+	return retval;
+}
+
+int v9fs_set_create_acl(struct dentry *dentry,
+			struct posix_acl *dpacl, struct posix_acl *pacl)
+{
+	if (dpacl)
+		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
+	if (pacl)
+		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
+	posix_acl_release(dpacl);
+	posix_acl_release(pacl);
+	return 0;
+}
+
+int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+		  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+	int retval = 0;
+	mode_t mode = *modep;
+	struct posix_acl *acl = NULL;
+
+	if (!S_ISLNK(mode)) {
+		acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			mode &= ~current_umask();
+	}
+	if (acl) {
+		struct posix_acl *clone;
+
+		if (S_ISDIR(mode))
+			*dpacl = acl;
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		retval = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		retval = posix_acl_create_masq(clone, &mode);
+		if (retval < 0) {
+			posix_acl_release(clone);
+			goto cleanup;
+		}
+		if (retval > 0)
+			*pacl = clone;
+	}
+	*modep = mode;
+	return 0;
+cleanup:
+	posix_acl_release(acl);
+	return retval;
+
+}
+
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+			       void *buffer, size_t size, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+			      void *buffer, size_t size, int type)
+{
+	struct v9fs_session_info *v9ses;
+	struct posix_acl *acl;
+	int error;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * We allow set/get/list of acl when access=client is not specified
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+
+	acl = v9fs_get_cached_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+			       const void *value, size_t size,
+			       int flags, int type)
+{
+	char *full_name;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		full_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		full_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+
+static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
+			      const void *value, size_t size,
+			      int flags, int type)
+{
+	int retval;
+	struct posix_acl *acl;
+	struct v9fs_session_info *v9ses;
+	struct inode *inode = dentry->d_inode;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	/*
+	 * Set the attribute on the remote, without even looking at the
+	 * xattr value. We leave it to the server to validate it.
+	 */
+	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+		return v9fs_remote_set_acl(dentry, name,
+					   value, size, flags, type);
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+	if (value) {
+		/* update the cached acl value */
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			retval = posix_acl_valid(acl);
+			if (retval)
+				goto err_out;
+		}
+	} else
+		acl = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			retval = posix_acl_equiv_mode(acl, &mode);
+			if (retval < 0)
+				goto err_out;
+			else {
+				struct iattr iattr;
+				if (retval == 0) {
+					/*
+					 * The ACL can be represented
+					 * by the mode bits, so don't
+					 * update the ACL.
+					 */
+					acl = NULL;
+					value = NULL;
+					size = 0;
+				}
+				/* Update the mode bits */
+				iattr.ia_mode = ((mode & S_IALLUGO) |
+						 (inode->i_mode & ~S_IALLUGO));
+				iattr.ia_valid = ATTR_MODE;
+				/* FIXME: should we update ctime?
+				 * What if the following setxattr updates
+				 * the mode?
+				 */
+				v9fs_vfs_setattr_dotl(dentry, &iattr);
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		if (!S_ISDIR(inode->i_mode)) {
+			retval = acl ? -EINVAL : 0;
+			goto err_out;
+		}
+		break;
+	default:
+		BUG();
+	}
+	retval = v9fs_xattr_set(dentry, name, value, size, flags);
+	if (!retval)
+		set_cached_acl(inode, type, acl);
+err_out:
+	posix_acl_release(acl);
+	return retval;
+}
+
+const struct xattr_handler v9fs_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
+
+const struct xattr_handler v9fs_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
+	.get	= v9fs_xattr_get_acl,
+	.set	= v9fs_xattr_set_acl,
+};
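
An aside on the two-step fetch in __v9fs_get_acl() above: asking for the
xattr with a NULL buffer first makes the reply carry only the value's size,
so the client can size its allocation before fetching for real. The same
protocol is visible from userspace. A minimal sketch of the equivalent
getxattr(2) round trip against the "system.posix_acl_access" name that
POSIX_ACL_XATTR_ACCESS expands to (illustration only, not part of this
diff; error handling kept minimal):

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	/* step 1: a NULL buffer asks only for the value's size */
	ssize_t size = getxattr(path, "system.posix_acl_access", NULL, 0);

	if (size < 0) {
		perror("getxattr");	/* ENODATA: no ACL beyond mode bits */
		return 1;
	}
	void *value = malloc(size);
	if (!value)
		return 1;
	/* step 2: fetch the value into a buffer of the reported size */
	size = getxattr(path, "system.posix_acl_access", value, size);
	printf("ACL xattr is %zd bytes\n", size);
	free(value);
	return 0;
}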
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
new file mode 100644
index 000000000000..7ef3ac9f6d95
--- /dev/null
+++ b/fs/9p/acl.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_ACL_H
+#define FS_9P_ACL_H
+
+#ifdef CONFIG_9P_FS_POSIX_ACL
+extern int v9fs_get_acl(struct inode *, struct p9_fid *);
+extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags);
+extern int v9fs_acl_chmod(struct dentry *);
+extern int v9fs_set_create_acl(struct dentry *,
+			       struct posix_acl *, struct posix_acl *);
+extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+			 struct posix_acl **dpacl, struct posix_acl **pacl);
+#else
+#define v9fs_check_acl NULL
+static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+	return 0;
+}
+static inline int v9fs_acl_chmod(struct dentry *dentry)
+{
+	return 0;
+}
+static inline int v9fs_set_create_acl(struct dentry *dentry,
+				      struct posix_acl *dpacl,
+				      struct posix_acl *pacl)
+{
+	return 0;
+}
+static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+				struct posix_acl **dpacl,
+				struct posix_acl **pacl)
+{
+	return 0;
+}
+
+#endif
+#endif /* FS_9P_ACL_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 6406f896bf95..b00223c99d70 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -149,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	switch (access) {
 	case V9FS_ACCESS_SINGLE:
 	case V9FS_ACCESS_USER:
+	case V9FS_ACCESS_CLIENT:
 		uid = current_fsuid();
 		any = 0;
 		break;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 38dc0e067599..2f77cd33ba83 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 				v9ses->flags |= V9FS_ACCESS_USER;
 			else if (strcmp(s, "any") == 0)
 				v9ses->flags |= V9FS_ACCESS_ANY;
-			else {
+			else if (strcmp(s, "client") == 0) {
+#ifdef CONFIG_9P_FS_POSIX_ACL
+				v9ses->flags |= V9FS_ACCESS_CLIENT;
+#else
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "access=client option not supported\n");
+				kfree(s);
+				ret = -EINVAL;
+				goto free_and_return;
+#endif
+			} else {
 				v9ses->flags |= V9FS_ACCESS_SINGLE;
 				v9ses->uid = simple_strtoul(s, &e, 10);
 				if (*e != '\0')
@@ -278,6 +288,16 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
+	if (!v9fs_proto_dotl(v9ses) &&
+	    ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+		/*
+		 * We support ACCESS_CLIENT only for dotl.
+		 * Fall back to ACCESS_USER
+		 */
+		v9ses->flags &= ~V9FS_ACCESS_MASK;
+		v9ses->flags |= V9FS_ACCESS_USER;
+	}
+	/*FIXME !! */
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
 	if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
 	    ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
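
An aside: the access=client string parsed above arrives via the opaque data
argument of mount(2), like any other 9p option, and per the fallback just
added it is silently downgraded to access=user if the session did not
negotiate 9P2000.L. A minimal sketch of a mount call that would exercise
this path (illustration only, not part of the diff; the server address and
mount point are made up):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* hypothetical 9P2000.L server reachable over TCP */
	if (mount("10.0.0.1", "/mnt/9p", "9p", 0,
		  "trans=tcp,version=9p2000.L,access=client") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}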
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 4c963c9fc41f..c4b5d8864f0d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -33,13 +33,17 @@
  *
  * Session flags reflect options selected by users at mount time
  */
+#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
+			 V9FS_ACCESS_USER |   \
+			 V9FS_ACCESS_CLIENT)
+#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
+
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
 	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
-	V9FS_ACCESS_ANY		= 0x0C,
-	V9FS_ACCESS_MASK	= 0x0C,
+	V9FS_ACCESS_CLIENT	= 0x10
 };
 
 /* possible values of ->cache */
@@ -109,11 +113,27 @@ struct v9fs_session_info {
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 				 char *);
-void v9fs_session_close(struct v9fs_session_info *v9ses);
-void v9fs_session_cancel(struct v9fs_session_info *v9ses);
-void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_close(struct v9fs_session_info *v9ses);
+extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
+extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+				      struct nameidata *nameidata);
+extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
+extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			   struct inode *new_dir, struct dentry *new_dentry);
+extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			      void *p);
+extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
+				struct p9_fid *fid,
+				struct super_block *sb);
 
-#define V9FS_MAGIC 0x01021997
+extern const struct inode_operations v9fs_dir_inode_operations_dotl;
+extern const struct inode_operations v9fs_file_inode_operations_dotl;
+extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
+				     struct p9_fid *fid,
+				     struct super_block *sb);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -136,3 +156,21 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
 	return v9ses->flags & V9FS_PROTO_2000L;
 }
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing an attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+		    struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_dotl(v9ses, fid, sb);
+	else
+		return v9fs_inode(v9ses, fid, sb);
+}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 88418c419ea7..b789f8e597ec 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -59,8 +59,11 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
-void v9fs_dentry_release(struct dentry *);
 int v9fs_uflags2omode(int uflags, int extended);
 
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
+int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_file_fsync_dotl(struct file *filp, int datasync);
+
+#define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 90e38449f4b3..b7f2a8e3863e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -154,10 +154,40 @@ static int v9fs_launder_page(struct page *page)
 	return 0;
 }
 
+/**
+ * v9fs_direct_IO - 9P address space operation for direct I/O
+ * @rw: direction (read or write)
+ * @iocb: target I/O control block
+ * @iov: array of vectors that define I/O buffer
+ * @pos: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * The presence of v9fs_direct_IO() in the address space ops vector
+ * allows open() with the O_DIRECT flag, which would have failed otherwise.
+ *
+ * In the non-cached mode, we shunt off direct read and write requests before
+ * the VFS gets them, so this method should never be called.
+ *
+ * Direct IO is not yet supported in the cached mode. Hence, when
+ * this routine is called through generic_file_aio_read(), the read/write
+ * fails with an error.
+ *
+ */
+ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+		       loff_t pos, unsigned long nr_segs)
+{
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
+		   "off/no(%lld/%lu) EINVAL\n",
+		   iocb->ki_filp->f_path.dentry->d_name.name,
+		   (long long)pos, nr_segs);
+
+	return -EINVAL;
+}
 const struct address_space_operations v9fs_addr_operations = {
 	.readpage = v9fs_vfs_readpage,
 	.readpages = v9fs_vfs_readpages,
 	.releasepage = v9fs_release_page,
 	.invalidatepage = v9fs_invalidate_page,
 	.launder_page = v9fs_launder_page,
+	.direct_IO = v9fs_direct_IO,
 };
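
An aside: the effect of the v9fs_direct_IO() stub is observable from
userspace. On a cached v9fs mount, open() with O_DIRECT now succeeds, and
the subsequent read is expected to fail with EINVAL when it reaches the
stub through generic_file_aio_read(). A minimal probe (illustration only,
not part of the diff; the path is made up):

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	char *buf;
	int fd = open("/mnt/9p/file", O_RDONLY | O_DIRECT);

	if (fd < 0) {		/* would fail outright without .direct_IO */
		perror("open");
		return 1;
	}
	if (posix_memalign((void **)&buf, 4096, 4096))
		return 1;
	if (read(fd, buf, 4096) < 0)
		perror("read");	/* expected: EINVAL in cached mode */
	free(buf);
	close(fd);
	return 0;
}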
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index cbf4e50f3933..233b7d4ffe5e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -51,7 +51,7 @@
  *
  */
 
-static int v9fs_dentry_delete(struct dentry *dentry)
+static int v9fs_dentry_delete(const struct dentry *dentry)
 {
 	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
 		   dentry);
@@ -68,7 +68,7 @@ static int v9fs_dentry_delete(struct dentry *dentry)
  *
  */
 
-static int v9fs_cached_dentry_delete(struct dentry *dentry)
+static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
@@ -86,7 +86,7 @@ static int v9fs_cached_dentry_delete(struct dentry *dentry)
  *
  */
 
-void v9fs_dentry_release(struct dentry *dentry)
+static void v9fs_dentry_release(struct dentry *dentry)
 {
 	struct v9fs_dentry *dent;
 	struct p9_fid *temp, *current_fid;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 899f168fd19c..b84ebe8cefed 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -242,7 +242,8 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 	while (rdir->head < rdir->tail) {
 
 		err = p9dirent_read(rdir->buf + rdir->head,
-				    buflen - rdir->head, &curdirent,
+				    rdir->tail - rdir->head,
+				    &curdirent,
 				    fid->clnt->proto_version);
 		if (err < 0) {
 			P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
@@ -314,4 +315,5 @@ const struct file_operations v9fs_dir_operations_dotl = {
 	.readdir = v9fs_dir_readdir_dotl,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
+	.fsync = v9fs_file_fsync_dotl,
 };
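
An aside on the one-line fix above: it bounds p9dirent_read() by the bytes
actually buffered (rdir->tail - rdir->head) rather than the remaining buffer
capacity (buflen - rdir->head), which could let the parser run past the
valid data. The invariant, in a self-contained sketch (illustration only;
parse_one() is a hypothetical stand-in for p9dirent_read()):

#include <stddef.h>

struct ring {
	char buf[8192];		/* capacity: the old, wrong bound */
	size_t head;		/* next unparsed byte */
	size_t tail;		/* one past the last valid byte */
};

/* records are a 1-byte length plus payload; never read past len */
static int parse_one(const char *p, size_t len)
{
	if (len < 1 || (size_t)(unsigned char)p[0] + 1 > len)
		return -1;	/* short or invalid record */
	return (unsigned char)p[0] + 1;	/* bytes consumed */
}

static int drain(struct ring *r)
{
	while (r->head < r->tail) {
		/* right: bound by valid data, not by sizeof(r->buf) */
		int n = parse_one(r->buf + r->head, r->tail - r->head);
		if (n < 0)
			return -1;
		r->head += n;
	}
	return 0;
}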
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index e97c92bd6f16..240c30674396 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -33,6 +33,7 @@
 #include <linux/inet.h>
 #include <linux/list.h>
 #include <linux/pagemap.h>
+#include <linux/utsname.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -44,6 +45,7 @@
 #include "cache.h"
 
 static const struct file_operations v9fs_cached_file_operations;
+static const struct file_operations v9fs_cached_file_operations_dotl;
 
 /**
  * v9fs_file_open - open a file (or directory)
@@ -92,6 +94,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		/* enable cached file options */
 		if (file->f_op == &v9fs_file_operations)
 			file->f_op = &v9fs_cached_file_operations;
+		else if (file->f_op == &v9fs_file_operations_dotl)
+			file->f_op = &v9fs_cached_file_operations_dotl;
 
 #ifdef CONFIG_9P_FSCACHE
 		v9fs_cache_inode_set_cookie(inode, file);
@@ -130,6 +134,206 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
130 return res; 134 return res;
131} 135}
132 136
137static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
138{
139 struct p9_flock flock;
140 struct p9_fid *fid;
141 uint8_t status;
142 int res = 0;
143 unsigned char fl_type;
144
145 fid = filp->private_data;
146 BUG_ON(fid == NULL);
147
148 if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
149 BUG();
150
151 res = posix_lock_file_wait(filp, fl);
152 if (res < 0)
153 goto out;
154
155 /* convert posix lock to p9 tlock args */
156 memset(&flock, 0, sizeof(flock));
157 flock.type = fl->fl_type;
158 flock.start = fl->fl_start;
159 if (fl->fl_end == OFFSET_MAX)
160 flock.length = 0;
161 else
162 flock.length = fl->fl_end - fl->fl_start + 1;
163 flock.proc_id = fl->fl_pid;
164 flock.client_id = utsname()->nodename;
165 if (IS_SETLKW(cmd))
166 flock.flags = P9_LOCK_FLAGS_BLOCK;
167
168 /*
169 * if its a blocked request and we get P9_LOCK_BLOCKED as the status
170 * for lock request, keep on trying
171 */
172 for (;;) {
173 res = p9_client_lock_dotl(fid, &flock, &status);
174 if (res < 0)
175 break;
176
177 if (status != P9_LOCK_BLOCKED)
178 break;
179 if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
180 break;
181 schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
182 }
183
184 /* map 9p status to VFS status */
185 switch (status) {
186 case P9_LOCK_SUCCESS:
187 res = 0;
188 break;
189 case P9_LOCK_BLOCKED:
190 res = -EAGAIN;
191 break;
192 case P9_LOCK_ERROR:
193 case P9_LOCK_GRACE:
194 res = -ENOLCK;
195 break;
196 default:
197 BUG();
198 }
199
200 /*
201 * incase server returned error for lock request, revert
202 * it locally
203 */
204 if (res < 0 && fl->fl_type != F_UNLCK) {
205 fl_type = fl->fl_type;
206 fl->fl_type = F_UNLCK;
207 res = posix_lock_file_wait(filp, fl);
208 fl->fl_type = fl_type;
209 }
210out:
211 return res;
212}
213
214static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
215{
216 struct p9_getlock glock;
217 struct p9_fid *fid;
218 int res = 0;
219
220 fid = filp->private_data;
221 BUG_ON(fid == NULL);
222
223 posix_test_lock(filp, fl);
224 /*
225 * if we have a conflicting lock locally, no need to validate
226 * with server
227 */
228 if (fl->fl_type != F_UNLCK)
229 return res;
230
231 /* convert posix lock to p9 tgetlock args */
232 memset(&glock, 0, sizeof(glock));
233 glock.type = fl->fl_type;
234 glock.start = fl->fl_start;
235 if (fl->fl_end == OFFSET_MAX)
236 glock.length = 0;
237 else
238 glock.length = fl->fl_end - fl->fl_start + 1;
239 glock.proc_id = fl->fl_pid;
240 glock.client_id = utsname()->nodename;
241
242 res = p9_client_getlock_dotl(fid, &glock);
243 if (res < 0)
244 return res;
245 if (glock.type != F_UNLCK) {
246 fl->fl_type = glock.type;
247 fl->fl_start = glock.start;
248 if (glock.length == 0)
249 fl->fl_end = OFFSET_MAX;
250 else
251 fl->fl_end = glock.start + glock.length - 1;
252 fl->fl_pid = glock.proc_id;
253 } else
254 fl->fl_type = F_UNLCK;
255
256 return res;
257}
258
259/**
260 * v9fs_file_lock_dotl - lock a file (or directory)
261 * @filp: file to be locked
262 * @cmd: lock command
263 * @fl: file lock structure
264 *
265 */
266
267static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
268{
269 struct inode *inode = filp->f_path.dentry->d_inode;
270 int ret = -ENOLCK;
271
272 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
273 cmd, fl, filp->f_path.dentry->d_name.name);
274
275 /* No mandatory locks */
276 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
277 goto out_err;
278
279 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
280 filemap_write_and_wait(inode->i_mapping);
281 invalidate_mapping_pages(&inode->i_data, 0, -1);
282 }
283
284 if (IS_SETLK(cmd) || IS_SETLKW(cmd))
285 ret = v9fs_file_do_lock(filp, cmd, fl);
286 else if (IS_GETLK(cmd))
287 ret = v9fs_file_getlock(filp, fl);
288 else
289 ret = -EINVAL;
290out_err:
291 return ret;
292}
293
294/**
295 * v9fs_file_flock_dotl - lock a file
296 * @filp: file to be locked
297 * @cmd: lock command
298 * @fl: file lock structure
299 *
300 */
301
302static int v9fs_file_flock_dotl(struct file *filp, int cmd,
303 struct file_lock *fl)
304{
305 struct inode *inode = filp->f_path.dentry->d_inode;
306 int ret = -ENOLCK;
307
308 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
309 cmd, fl, filp->f_path.dentry->d_name.name);
310
311 /* No mandatory locks */
312 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
313 goto out_err;
314
315 if (!(fl->fl_flags & FL_FLOCK))
316 goto out_err;
317
318 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
319 filemap_write_and_wait(inode->i_mapping);
320 invalidate_mapping_pages(&inode->i_data, 0, -1);
321 }
322 /* Convert flock to posix lock */
323 fl->fl_owner = (fl_owner_t)filp;
324 fl->fl_start = 0;
325 fl->fl_end = OFFSET_MAX;
326 fl->fl_flags |= FL_POSIX;
327 fl->fl_flags ^= FL_FLOCK;
328
329	if (IS_SETLK(cmd) || IS_SETLKW(cmd))
330 ret = v9fs_file_do_lock(filp, cmd, fl);
331 else
332 ret = -EINVAL;
333out_err:
334 return ret;
335}
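
Again purely as an illustration: flock(2) requests arrive here with FL_FLOCK set and are rewritten above into whole-file POSIX locks before going out over 9P. A sketch, path hypothetical:

	#include <sys/file.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/9p/data", O_RDWR);

		if (fd < 0)
			return 1;
		/* becomes fl_start = 0, fl_end = OFFSET_MAX, owner = filp */
		if (flock(fd, LOCK_EX) == 0)
			flock(fd, LOCK_UN);
		close(fd);
		return 0;
	}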
336
133/** 337/**
134 * v9fs_file_readn - read from a file 338 * v9fs_file_readn - read from a file
135 * @filp: file pointer to read 339 * @filp: file pointer to read
@@ -219,7 +423,9 @@ static ssize_t
219v9fs_file_write(struct file *filp, const char __user * data, 423v9fs_file_write(struct file *filp, const char __user * data,
220 size_t count, loff_t * offset) 424 size_t count, loff_t * offset)
221{ 425{
222 int n, rsize, total = 0; 426 ssize_t retval;
427 size_t total = 0;
428 int n;
223 struct p9_fid *fid; 429 struct p9_fid *fid;
224 struct p9_client *clnt; 430 struct p9_client *clnt;
225 struct inode *inode = filp->f_path.dentry->d_inode; 431 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -232,14 +438,19 @@ v9fs_file_write(struct file *filp, const char __user * data,
232 fid = filp->private_data; 438 fid = filp->private_data;
233 clnt = fid->clnt; 439 clnt = fid->clnt;
234 440
235 rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ; 441 retval = generic_write_checks(filp, &origin, &count, 0);
442 if (retval)
443 goto out;
236 444
237 do { 445 retval = -EINVAL;
238 if (count < rsize) 446 if ((ssize_t) count < 0)
239 rsize = count; 447 goto out;
448 retval = 0;
449 if (!count)
450 goto out;
240 451
241 n = p9_client_write(fid, NULL, data+total, origin+total, 452 do {
242 rsize); 453 n = p9_client_write(fid, NULL, data+total, origin+total, count);
243 if (n <= 0) 454 if (n <= 0)
244 break; 455 break;
245 count -= n; 456 count -= n;
@@ -258,9 +469,11 @@ v9fs_file_write(struct file *filp, const char __user * data,
258 } 469 }
259 470
260 if (n < 0) 471 if (n < 0)
261 return n; 472 retval = n;
262 473 else
263 return total; 474 retval = total;
475out:
476 return retval;
264} 477}
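
The rewritten v9fs_file_write loops on short writes from p9_client_write, accumulating progress in total and reporting a partial count when possible. The same pattern in user-space form, purely illustrative:

	#include <unistd.h>

	static ssize_t write_all(int fd, const char *buf, size_t count, off_t off)
	{
		size_t total = 0;

		while (total < count) {
			ssize_t n = pwrite(fd, buf + total, count - total, off + total);

			if (n <= 0)	/* error or no progress: report what we have */
				return total ? (ssize_t)total : n;
			total += n;
		}
		return total;
	}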
265 478
266static int v9fs_file_fsync(struct file *filp, int datasync) 479static int v9fs_file_fsync(struct file *filp, int datasync)
@@ -278,6 +491,20 @@ static int v9fs_file_fsync(struct file *filp, int datasync)
278 return retval; 491 return retval;
279} 492}
280 493
494int v9fs_file_fsync_dotl(struct file *filp, int datasync)
495{
496 struct p9_fid *fid;
497 int retval;
498
499 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
500 filp, datasync);
501
502 fid = filp->private_data;
503
504 retval = p9_client_fsync(fid, datasync);
505 return retval;
506}
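
v9fs_file_fsync_dotl forwards the datasync flag straight to the server via p9_client_fsync; from user space the distinction is fsync(2) versus fdatasync(2). A sketch, path hypothetical:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/9p/log", O_WRONLY | O_APPEND);

		if (fd < 0)
			return 1;
		write(fd, "x", 1);
		fdatasync(fd);	/* datasync != 0: data only, metadata may be deferred */
		fsync(fd);	/* datasync == 0: data and metadata */
		close(fd);
		return 0;
	}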
507
281static const struct file_operations v9fs_cached_file_operations = { 508static const struct file_operations v9fs_cached_file_operations = {
282 .llseek = generic_file_llseek, 509 .llseek = generic_file_llseek,
283 .read = do_sync_read, 510 .read = do_sync_read,
@@ -290,6 +517,19 @@ static const struct file_operations v9fs_cached_file_operations = {
290 .fsync = v9fs_file_fsync, 517 .fsync = v9fs_file_fsync,
291}; 518};
292 519
520static const struct file_operations v9fs_cached_file_operations_dotl = {
521 .llseek = generic_file_llseek,
522 .read = do_sync_read,
523 .aio_read = generic_file_aio_read,
524 .write = v9fs_file_write,
525 .open = v9fs_file_open,
526 .release = v9fs_dir_release,
527 .lock = v9fs_file_lock_dotl,
528 .flock = v9fs_file_flock_dotl,
529 .mmap = generic_file_readonly_mmap,
530 .fsync = v9fs_file_fsync_dotl,
531};
532
293const struct file_operations v9fs_file_operations = { 533const struct file_operations v9fs_file_operations = {
294 .llseek = generic_file_llseek, 534 .llseek = generic_file_llseek,
295 .read = v9fs_file_read, 535 .read = v9fs_file_read,
@@ -307,7 +547,8 @@ const struct file_operations v9fs_file_operations_dotl = {
307 .write = v9fs_file_write, 547 .write = v9fs_file_write,
308 .open = v9fs_file_open, 548 .open = v9fs_file_open,
309 .release = v9fs_dir_release, 549 .release = v9fs_dir_release,
310 .lock = v9fs_file_lock, 550 .lock = v9fs_file_lock_dotl,
551 .flock = v9fs_file_flock_dotl,
311 .mmap = generic_file_readonly_mmap, 552 .mmap = generic_file_readonly_mmap,
312 .fsync = v9fs_file_fsync, 553 .fsync = v9fs_file_fsync_dotl,
313}; 554};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9e670d527646..b76a40bdf4c2 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -36,6 +36,7 @@
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/xattr.h> 38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
39#include <net/9p/9p.h> 40#include <net/9p/9p.h>
40#include <net/9p/client.h> 41#include <net/9p/client.h>
41 42
@@ -44,14 +45,12 @@
44#include "fid.h" 45#include "fid.h"
45#include "cache.h" 46#include "cache.h"
46#include "xattr.h" 47#include "xattr.h"
48#include "acl.h"
47 49
48static const struct inode_operations v9fs_dir_inode_operations; 50static const struct inode_operations v9fs_dir_inode_operations;
49static const struct inode_operations v9fs_dir_inode_operations_dotu; 51static const struct inode_operations v9fs_dir_inode_operations_dotu;
50static const struct inode_operations v9fs_dir_inode_operations_dotl;
51static const struct inode_operations v9fs_file_inode_operations; 52static const struct inode_operations v9fs_file_inode_operations;
52static const struct inode_operations v9fs_file_inode_operations_dotl;
53static const struct inode_operations v9fs_symlink_inode_operations; 53static const struct inode_operations v9fs_symlink_inode_operations;
54static const struct inode_operations v9fs_symlink_inode_operations_dotl;
55 54
56/** 55/**
57 * unixmode2p9mode - convert unix mode bits to plan 9 56 * unixmode2p9mode - convert unix mode bits to plan 9
@@ -231,46 +230,18 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
231 * 230 *
232 */ 231 */
233 232
234void v9fs_destroy_inode(struct inode *inode) 233static void v9fs_i_callback(struct rcu_head *head)
235{ 234{
235 struct inode *inode = container_of(head, struct inode, i_rcu);
236 INIT_LIST_HEAD(&inode->i_dentry);
236 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); 237 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
237} 238}
238#endif
239
240/**
241 * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
242 * new file system object. This checks the S_ISGID to determine the owning
243 * group of the new file system object.
244 */
245
246static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
247{
248 BUG_ON(dir_inode == NULL);
249
250 if (dir_inode->i_mode & S_ISGID) {
251 /* set_gid bit is set.*/
252 return dir_inode->i_gid;
253 }
254 return current_fsgid();
255}
256
257/**
258 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
259 * dir inode.
260 *
261 */
262 239
263static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) 240void v9fs_destroy_inode(struct inode *inode)
264{ 241{
265 struct dentry *dentry; 242 call_rcu(&inode->i_rcu, v9fs_i_callback);
266
267 spin_lock(&dcache_lock);
268 /* Directory should have only one entry. */
269 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
270 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
271 spin_unlock(&dcache_lock);
272 return dentry;
273} 243}
244#endif
274 245
275/** 246/**
276 * v9fs_get_inode - helper function to setup an inode 247 * v9fs_get_inode - helper function to setup an inode
@@ -441,7 +412,7 @@ void v9fs_evict_inode(struct inode *inode)
441#endif 412#endif
442} 413}
443 414
444static struct inode * 415struct inode *
445v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, 416v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
446 struct super_block *sb) 417 struct super_block *sb)
447{ 418{
@@ -476,55 +447,6 @@ error:
476 return ERR_PTR(err); 447 return ERR_PTR(err);
477} 448}
478 449
479static struct inode *
480v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
481 struct super_block *sb)
482{
483 struct inode *ret = NULL;
484 int err;
485 struct p9_stat_dotl *st;
486
487 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
488 if (IS_ERR(st))
489 return ERR_CAST(st);
490
491 ret = v9fs_get_inode(sb, st->st_mode);
492 if (IS_ERR(ret)) {
493 err = PTR_ERR(ret);
494 goto error;
495 }
496
497 v9fs_stat2inode_dotl(st, ret);
498 ret->i_ino = v9fs_qid2ino(&st->qid);
499#ifdef CONFIG_9P_FSCACHE
500 v9fs_vcookie_set_qid(ret, &st->qid);
501 v9fs_cache_inode_get_cookie(ret);
502#endif
503 kfree(st);
504 return ret;
505error:
506 kfree(st);
507 return ERR_PTR(err);
508}
509
510/**
511 * v9fs_inode_from_fid - Helper routine to populate an inode by
512 * issuing a attribute request
513 * @v9ses: session information
514 * @fid: fid to issue attribute request for
515 * @sb: superblock on which to create inode
516 *
517 */
518static inline struct inode *
519v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
520 struct super_block *sb)
521{
522 if (v9fs_proto_dotl(v9ses))
523 return v9fs_inode_dotl(v9ses, fid, sb);
524 else
525 return v9fs_inode(v9ses, fid, sb);
526}
527
528/** 450/**
529 * v9fs_remove - helper function to remove files and directories 451 * v9fs_remove - helper function to remove files and directories
530 * @dir: directory inode that is being deleted 452 * @dir: directory inode that is being deleted
@@ -553,13 +475,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
553 return retval; 475 return retval;
554} 476}
555 477
556static int
557v9fs_open_created(struct inode *inode, struct file *file)
558{
559 return 0;
560}
561
562
563/** 478/**
564 * v9fs_create - Create a file 479 * v9fs_create - Create a file
565 * @v9ses: session information 480 * @v9ses: session information
@@ -622,12 +537,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
622 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 537 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
623 goto error; 538 goto error;
624 } 539 }
625
626 if (v9ses->cache)
627 dentry->d_op = &v9fs_cached_dentry_operations;
628 else
629 dentry->d_op = &v9fs_dentry_operations;
630
631 d_instantiate(dentry, inode); 540 d_instantiate(dentry, inode);
632 err = v9fs_fid_add(dentry, fid); 541 err = v9fs_fid_add(dentry, fid);
633 if (err < 0) 542 if (err < 0)
@@ -646,121 +555,6 @@ error:
646} 555}
647 556
648/** 557/**
649 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
650 * @dir: directory inode that is being created
651 * @dentry: dentry that is being deleted
652 * @mode: create permissions
653 * @nd: path information
654 *
655 */
656
657static int
658v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
659 struct nameidata *nd)
660{
661 int err = 0;
662 char *name = NULL;
663 gid_t gid;
664 int flags;
665 struct v9fs_session_info *v9ses;
666 struct p9_fid *fid = NULL;
667 struct p9_fid *dfid, *ofid;
668 struct file *filp;
669 struct p9_qid qid;
670 struct inode *inode;
671
672 v9ses = v9fs_inode2v9ses(dir);
673 if (nd && nd->flags & LOOKUP_OPEN)
674 flags = nd->intent.open.flags - 1;
675 else
676 flags = O_RDWR;
677
678 name = (char *) dentry->d_name.name;
679 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
680 "mode:0x%x\n", name, flags, mode);
681
682 dfid = v9fs_fid_lookup(dentry->d_parent);
683 if (IS_ERR(dfid)) {
684 err = PTR_ERR(dfid);
685 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
686 return err;
687 }
688
689 /* clone a fid to use for creation */
690 ofid = p9_client_walk(dfid, 0, NULL, 1);
691 if (IS_ERR(ofid)) {
692 err = PTR_ERR(ofid);
693 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
694 return err;
695 }
696
697 gid = v9fs_get_fsgid_for_create(dir);
698 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
699 if (err < 0) {
700 P9_DPRINTK(P9_DEBUG_VFS,
701 "p9_client_open_dotl failed in creat %d\n",
702 err);
703 goto error;
704 }
705
706 /* No need to populate the inode if we are not opening the file AND
707 * not in cached mode.
708 */
709 if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) {
710 /* Not in cached mode. No need to populate inode with stat */
711 dentry->d_op = &v9fs_dentry_operations;
712 p9_client_clunk(ofid);
713 d_instantiate(dentry, NULL);
714 return 0;
715 }
716
717 /* Now walk from the parent so we can get an unopened fid. */
718 fid = p9_client_walk(dfid, 1, &name, 1);
719 if (IS_ERR(fid)) {
720 err = PTR_ERR(fid);
721 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
722 fid = NULL;
723 goto error;
724 }
725
726 /* instantiate inode and assign the unopened fid to dentry */
727 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
728 if (IS_ERR(inode)) {
729 err = PTR_ERR(inode);
730 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
731 goto error;
732 }
733 if (v9ses->cache)
734 dentry->d_op = &v9fs_cached_dentry_operations;
735 else
736 dentry->d_op = &v9fs_dentry_operations;
737 d_instantiate(dentry, inode);
738 err = v9fs_fid_add(dentry, fid);
739 if (err < 0)
740 goto error;
741
742 /* if we are opening a file, assign the open fid to the file */
743 if (nd && nd->flags & LOOKUP_OPEN) {
744 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
745 if (IS_ERR(filp)) {
746 p9_client_clunk(ofid);
747 return PTR_ERR(filp);
748 }
749 filp->private_data = ofid;
750 } else
751 p9_client_clunk(ofid);
752
753 return 0;
754
755error:
756 if (ofid)
757 p9_client_clunk(ofid);
758 if (fid)
759 p9_client_clunk(fid);
760 return err;
761}
762
763/**
764 * v9fs_vfs_create - VFS hook to create files 558 * v9fs_vfs_create - VFS hook to create files
765 * @dir: directory inode that is being created 559 * @dir: directory inode that is being created
766 * @dentry: dentry that is being deleted 560 * @dentry: dentry that is being deleted
@@ -800,7 +594,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
800 594
801 /* if we are opening a file, assign the open fid to the file */ 595 /* if we are opening a file, assign the open fid to the file */
802 if (nd && nd->flags & LOOKUP_OPEN) { 596 if (nd && nd->flags & LOOKUP_OPEN) {
803 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); 597 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
804 if (IS_ERR(filp)) { 598 if (IS_ERR(filp)) {
805 err = PTR_ERR(filp); 599 err = PTR_ERR(filp);
806 goto error; 600 goto error;
@@ -850,83 +644,6 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
850 return err; 644 return err;
851} 645}
852 646
853
854/**
855 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
856 * @dir: inode that is being unlinked
857 * @dentry: dentry that is being unlinked
858 * @mode: mode for new directory
859 *
860 */
861
862static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
863 int mode)
864{
865 int err;
866 struct v9fs_session_info *v9ses;
867 struct p9_fid *fid = NULL, *dfid = NULL;
868 gid_t gid;
869 char *name;
870 struct inode *inode;
871 struct p9_qid qid;
872 struct dentry *dir_dentry;
873
874 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
875 err = 0;
876 v9ses = v9fs_inode2v9ses(dir);
877
878 mode |= S_IFDIR;
879 dir_dentry = v9fs_dentry_from_dir_inode(dir);
880 dfid = v9fs_fid_lookup(dir_dentry);
881 if (IS_ERR(dfid)) {
882 err = PTR_ERR(dfid);
883 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
884 dfid = NULL;
885 goto error;
886 }
887
888 gid = v9fs_get_fsgid_for_create(dir);
889 if (gid < 0) {
890 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
891 goto error;
892 }
893
894 name = (char *) dentry->d_name.name;
895 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
896 if (err < 0)
897 goto error;
898
899 /* instantiate inode and assign the unopened fid to the dentry */
900 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
901 fid = p9_client_walk(dfid, 1, &name, 1);
902 if (IS_ERR(fid)) {
903 err = PTR_ERR(fid);
904 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
905 err);
906 fid = NULL;
907 goto error;
908 }
909
910 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
911 if (IS_ERR(inode)) {
912 err = PTR_ERR(inode);
913 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
914 err);
915 goto error;
916 }
917 dentry->d_op = &v9fs_cached_dentry_operations;
918 d_instantiate(dentry, inode);
919 err = v9fs_fid_add(dentry, fid);
920 if (err < 0)
921 goto error;
922 fid = NULL;
923 }
924error:
925 if (fid)
926 p9_client_clunk(fid);
927 return err;
928}
929
930/** 647/**
931 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode 648 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
932 * @dir: inode that is being walked from 649 * @dir: inode that is being walked from
@@ -935,7 +652,7 @@ error:
935 * 652 *
936 */ 653 */
937 654
938static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, 655struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
939 struct nameidata *nameidata) 656 struct nameidata *nameidata)
940{ 657{
941 struct super_block *sb; 658 struct super_block *sb;
@@ -979,17 +696,14 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
979 696
980 result = v9fs_fid_add(dentry, fid); 697 result = v9fs_fid_add(dentry, fid);
981 if (result < 0) 698 if (result < 0)
982 goto error; 699 goto error_iput;
983 700
984inst_out: 701inst_out:
985 if (v9ses->cache)
986 dentry->d_op = &v9fs_cached_dentry_operations;
987 else
988 dentry->d_op = &v9fs_dentry_operations;
989
990 d_add(dentry, inode); 702 d_add(dentry, inode);
991 return NULL; 703 return NULL;
992 704
705error_iput:
706 iput(inode);
993error: 707error:
994 p9_client_clunk(fid); 708 p9_client_clunk(fid);
995 709
@@ -1003,7 +717,7 @@ error:
1003 * 717 *
1004 */ 718 */
1005 719
1006static int v9fs_vfs_unlink(struct inode *i, struct dentry *d) 720int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
1007{ 721{
1008 return v9fs_remove(i, d, 0); 722 return v9fs_remove(i, d, 0);
1009} 723}
@@ -1015,7 +729,7 @@ static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
1015 * 729 *
1016 */ 730 */
1017 731
1018static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) 732int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
1019{ 733{
1020 return v9fs_remove(i, d, 1); 734 return v9fs_remove(i, d, 1);
1021} 735}
@@ -1029,7 +743,7 @@ static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
1029 * 743 *
1030 */ 744 */
1031 745
1032static int 746int
1033v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 747v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1034 struct inode *new_dir, struct dentry *new_dentry) 748 struct inode *new_dir, struct dentry *new_dentry)
1035{ 749{
@@ -1136,42 +850,6 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1136 return 0; 850 return 0;
1137} 851}
1138 852
1139static int
1140v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
1141 struct kstat *stat)
1142{
1143 int err;
1144 struct v9fs_session_info *v9ses;
1145 struct p9_fid *fid;
1146 struct p9_stat_dotl *st;
1147
1148 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
1149 err = -EPERM;
1150 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1151 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
1152 return simple_getattr(mnt, dentry, stat);
1153
1154 fid = v9fs_fid_lookup(dentry);
1155 if (IS_ERR(fid))
1156 return PTR_ERR(fid);
1157
1158 /* Ask for all the fields in stat structure. Server will return
1159 * whatever it supports
1160 */
1161
1162 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
1163 if (IS_ERR(st))
1164 return PTR_ERR(st);
1165
1166 v9fs_stat2inode_dotl(st, dentry->d_inode);
1167 generic_fillattr(dentry->d_inode, stat);
1168 /* Change block size to what the server returned */
1169 stat->blksize = st->st_blksize;
1170
1171 kfree(st);
1172 return 0;
1173}
1174
1175/** 853/**
1176 * v9fs_vfs_setattr - set file metadata 854 * v9fs_vfs_setattr - set file metadata
1177 * @dentry: file whose metadata to set 855 * @dentry: file whose metadata to set
@@ -1231,58 +909,6 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
1231} 909}
1232 910
1233/** 911/**
1234 * v9fs_vfs_setattr_dotl - set file metadata
1235 * @dentry: file whose metadata to set
1236 * @iattr: metadata assignment structure
1237 *
1238 */
1239
1240static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
1241{
1242 int retval;
1243 struct v9fs_session_info *v9ses;
1244 struct p9_fid *fid;
1245 struct p9_iattr_dotl p9attr;
1246
1247 P9_DPRINTK(P9_DEBUG_VFS, "\n");
1248
1249 retval = inode_change_ok(dentry->d_inode, iattr);
1250 if (retval)
1251 return retval;
1252
1253 p9attr.valid = iattr->ia_valid;
1254 p9attr.mode = iattr->ia_mode;
1255 p9attr.uid = iattr->ia_uid;
1256 p9attr.gid = iattr->ia_gid;
1257 p9attr.size = iattr->ia_size;
1258 p9attr.atime_sec = iattr->ia_atime.tv_sec;
1259 p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
1260 p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
1261 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
1262
1263 retval = -EPERM;
1264 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1265 fid = v9fs_fid_lookup(dentry);
1266 if (IS_ERR(fid))
1267 return PTR_ERR(fid);
1268
1269 retval = p9_client_setattr(fid, &p9attr);
1270 if (retval < 0)
1271 return retval;
1272
1273 if ((iattr->ia_valid & ATTR_SIZE) &&
1274 iattr->ia_size != i_size_read(dentry->d_inode)) {
1275 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
1276 if (retval)
1277 return retval;
1278 }
1279
1280 setattr_copy(dentry->d_inode, iattr);
1281 mark_inode_dirty(dentry->d_inode);
1282 return 0;
1283}
1284
1285/**
1286 * v9fs_stat2inode - populate an inode structure with mistat info 912 * v9fs_stat2inode - populate an inode structure with mistat info
1287 * @stat: Plan 9 metadata (mistat) structure 913 * @stat: Plan 9 metadata (mistat) structure
1288 * @inode: inode to populate 914 * @inode: inode to populate
@@ -1360,77 +986,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
1360} 986}
1361 987
1362/** 988/**
1363 * v9fs_stat2inode_dotl - populate an inode structure with stat info
1364 * @stat: stat structure
1365 * @inode: inode to populate
1366 * @sb: superblock of filesystem
1367 *
1368 */
1369
1370void
1371v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
1372{
1373
1374 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
1375 inode->i_atime.tv_sec = stat->st_atime_sec;
1376 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1377 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1378 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1379 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1380 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1381 inode->i_uid = stat->st_uid;
1382 inode->i_gid = stat->st_gid;
1383 inode->i_nlink = stat->st_nlink;
1384 inode->i_mode = stat->st_mode;
1385 inode->i_rdev = new_decode_dev(stat->st_rdev);
1386
1387 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
1388 init_special_inode(inode, inode->i_mode, inode->i_rdev);
1389
1390 i_size_write(inode, stat->st_size);
1391 inode->i_blocks = stat->st_blocks;
1392 } else {
1393 if (stat->st_result_mask & P9_STATS_ATIME) {
1394 inode->i_atime.tv_sec = stat->st_atime_sec;
1395 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1396 }
1397 if (stat->st_result_mask & P9_STATS_MTIME) {
1398 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1399 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1400 }
1401 if (stat->st_result_mask & P9_STATS_CTIME) {
1402 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1403 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1404 }
1405 if (stat->st_result_mask & P9_STATS_UID)
1406 inode->i_uid = stat->st_uid;
1407 if (stat->st_result_mask & P9_STATS_GID)
1408 inode->i_gid = stat->st_gid;
1409 if (stat->st_result_mask & P9_STATS_NLINK)
1410 inode->i_nlink = stat->st_nlink;
1411 if (stat->st_result_mask & P9_STATS_MODE) {
1412 inode->i_mode = stat->st_mode;
1413 if ((S_ISBLK(inode->i_mode)) ||
1414 (S_ISCHR(inode->i_mode)))
1415 init_special_inode(inode, inode->i_mode,
1416 inode->i_rdev);
1417 }
1418 if (stat->st_result_mask & P9_STATS_RDEV)
1419 inode->i_rdev = new_decode_dev(stat->st_rdev);
1420 if (stat->st_result_mask & P9_STATS_SIZE)
1421 i_size_write(inode, stat->st_size);
1422 if (stat->st_result_mask & P9_STATS_BLOCKS)
1423 inode->i_blocks = stat->st_blocks;
1424 }
1425 if (stat->st_result_mask & P9_STATS_GEN)
1426 inode->i_generation = stat->st_gen;
1427
1428 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
1429 * because the inode structure does not have fields for them.
1430 */
1431}
1432
1433/**
1434 * v9fs_qid2ino - convert qid into inode number 989 * v9fs_qid2ino - convert qid into inode number
1435 * @qid: qid to hash 990 * @qid: qid to hash
1436 * 991 *
@@ -1473,7 +1028,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1473 if (IS_ERR(fid)) 1028 if (IS_ERR(fid))
1474 return PTR_ERR(fid); 1029 return PTR_ERR(fid);
1475 1030
1476 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) 1031 if (!v9fs_proto_dotu(v9ses))
1477 return -EBADF; 1032 return -EBADF;
1478 1033
1479 st = p9_client_stat(fid); 1034 st = p9_client_stat(fid);
@@ -1536,7 +1091,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
1536 * 1091 *
1537 */ 1092 */
1538 1093
1539static void 1094void
1540v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) 1095v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1541{ 1096{
1542 char *s = nd_get_link(nd); 1097 char *s = nd_get_link(nd);
@@ -1580,99 +1135,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1580} 1135}
1581 1136
1582/** 1137/**
1583 * v9fs_vfs_symlink_dotl - helper function to create symlinks
1584 * @dir: directory inode containing symlink
1585 * @dentry: dentry for symlink
1586 * @symname: symlink data
1587 *
1588 * See Also: 9P2000.L RFC for more information
1589 *
1590 */
1591
1592static int
1593v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
1594 const char *symname)
1595{
1596 struct v9fs_session_info *v9ses;
1597 struct p9_fid *dfid;
1598 struct p9_fid *fid = NULL;
1599 struct inode *inode;
1600 struct p9_qid qid;
1601 char *name;
1602 int err;
1603 gid_t gid;
1604
1605 name = (char *) dentry->d_name.name;
1606 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
1607 dir->i_ino, name, symname);
1608 v9ses = v9fs_inode2v9ses(dir);
1609
1610 dfid = v9fs_fid_lookup(dentry->d_parent);
1611 if (IS_ERR(dfid)) {
1612 err = PTR_ERR(dfid);
1613 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1614 return err;
1615 }
1616
1617 gid = v9fs_get_fsgid_for_create(dir);
1618
1619 if (gid < 0) {
1620 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid);
1621 goto error;
1622 }
1623
1624 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
1625 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
1626
1627 if (err < 0) {
1628 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
1629 goto error;
1630 }
1631
1632 if (v9ses->cache) {
1633 /* Now walk from the parent so we can get an unopened fid. */
1634 fid = p9_client_walk(dfid, 1, &name, 1);
1635 if (IS_ERR(fid)) {
1636 err = PTR_ERR(fid);
1637 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1638 err);
1639 fid = NULL;
1640 goto error;
1641 }
1642
1643 /* instantiate inode and assign the unopened fid to dentry */
1644 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1645 if (IS_ERR(inode)) {
1646 err = PTR_ERR(inode);
1647 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1648 err);
1649 goto error;
1650 }
1651 dentry->d_op = &v9fs_cached_dentry_operations;
1652 d_instantiate(dentry, inode);
1653 err = v9fs_fid_add(dentry, fid);
1654 if (err < 0)
1655 goto error;
1656 fid = NULL;
1657 } else {
1658 /* Not in cached mode. No need to populate inode with stat */
1659 inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
1660 if (IS_ERR(inode)) {
1661 err = PTR_ERR(inode);
1662 goto error;
1663 }
1664 dentry->d_op = &v9fs_dentry_operations;
1665 d_instantiate(dentry, inode);
1666 }
1667
1668error:
1669 if (fid)
1670 p9_client_clunk(fid);
1671
1672 return err;
1673}
1674
1675/**
1676 * v9fs_vfs_symlink - helper function to create symlinks 1138 * v9fs_vfs_symlink - helper function to create symlinks
1677 * @dir: directory inode containing symlink 1139 * @dir: directory inode containing symlink
1678 * @dentry: dentry for symlink 1140 * @dentry: dentry for symlink
@@ -1731,76 +1193,6 @@ clunk_fid:
1731} 1193}
1732 1194
1733/** 1195/**
1734 * v9fs_vfs_link_dotl - create a hardlink for dotl
1735 * @old_dentry: dentry for file to link to
1736 * @dir: inode destination for new link
1737 * @dentry: dentry for link
1738 *
1739 */
1740
1741static int
1742v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
1743 struct dentry *dentry)
1744{
1745 int err;
1746 struct p9_fid *dfid, *oldfid;
1747 char *name;
1748 struct v9fs_session_info *v9ses;
1749 struct dentry *dir_dentry;
1750
1751 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
1752 dir->i_ino, old_dentry->d_name.name,
1753 dentry->d_name.name);
1754
1755 v9ses = v9fs_inode2v9ses(dir);
1756 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1757 dfid = v9fs_fid_lookup(dir_dentry);
1758 if (IS_ERR(dfid))
1759 return PTR_ERR(dfid);
1760
1761 oldfid = v9fs_fid_lookup(old_dentry);
1762 if (IS_ERR(oldfid))
1763 return PTR_ERR(oldfid);
1764
1765 name = (char *) dentry->d_name.name;
1766
1767 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
1768
1769 if (err < 0) {
1770 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
1771 return err;
1772 }
1773
1774 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1775 /* Get the latest stat info from server. */
1776 struct p9_fid *fid;
1777 struct p9_stat_dotl *st;
1778
1779 fid = v9fs_fid_lookup(old_dentry);
1780 if (IS_ERR(fid))
1781 return PTR_ERR(fid);
1782
1783 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
1784 if (IS_ERR(st))
1785 return PTR_ERR(st);
1786
1787 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
1788
1789 kfree(st);
1790 } else {
1791 /* Caching disabled. No need to get upto date stat info.
1792 * This dentry will be released immediately. So, just i_count++
1793 */
1794 atomic_inc(&old_dentry->d_inode->i_count);
1795 }
1796
1797 dentry->d_op = old_dentry->d_op;
1798 d_instantiate(dentry, old_dentry->d_inode);
1799
1800 return err;
1801}
1802
1803/**
1804 * v9fs_vfs_mknod - create a special file 1196 * v9fs_vfs_mknod - create a special file
1805 * @dir: inode destination for new link 1197 * @dir: inode destination for new link
1806 * @dentry: dentry for file 1198 * @dentry: dentry for file
@@ -1845,100 +1237,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1845 return retval; 1237 return retval;
1846} 1238}
1847 1239
1848/**
1849 * v9fs_vfs_mknod_dotl - create a special file
1850 * @dir: inode destination for new link
1851 * @dentry: dentry for file
1852 * @mode: mode for creation
1853 * @rdev: device associated with special file
1854 *
1855 */
1856static int
1857v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
1858 dev_t rdev)
1859{
1860 int err;
1861 char *name;
1862 struct v9fs_session_info *v9ses;
1863 struct p9_fid *fid = NULL, *dfid = NULL;
1864 struct inode *inode;
1865 gid_t gid;
1866 struct p9_qid qid;
1867 struct dentry *dir_dentry;
1868
1869 P9_DPRINTK(P9_DEBUG_VFS,
1870 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
1871 dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
1872
1873 if (!new_valid_dev(rdev))
1874 return -EINVAL;
1875
1876 v9ses = v9fs_inode2v9ses(dir);
1877 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1878 dfid = v9fs_fid_lookup(dir_dentry);
1879 if (IS_ERR(dfid)) {
1880 err = PTR_ERR(dfid);
1881 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1882 dfid = NULL;
1883 goto error;
1884 }
1885
1886 gid = v9fs_get_fsgid_for_create(dir);
1887 if (gid < 0) {
1888 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
1889 goto error;
1890 }
1891
1892 name = (char *) dentry->d_name.name;
1893
1894 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
1895 if (err < 0)
1896 goto error;
1897
1898 /* instantiate inode and assign the unopened fid to the dentry */
1899 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1900 fid = p9_client_walk(dfid, 1, &name, 1);
1901 if (IS_ERR(fid)) {
1902 err = PTR_ERR(fid);
1903 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1904 err);
1905 fid = NULL;
1906 goto error;
1907 }
1908
1909 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1910 if (IS_ERR(inode)) {
1911 err = PTR_ERR(inode);
1912 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1913 err);
1914 goto error;
1915 }
1916 dentry->d_op = &v9fs_cached_dentry_operations;
1917 d_instantiate(dentry, inode);
1918 err = v9fs_fid_add(dentry, fid);
1919 if (err < 0)
1920 goto error;
1921 fid = NULL;
1922 } else {
1923 /*
1924 * Not in cached mode. No need to populate inode with stat.
1925 * socket syscall returns a fd, so we need instantiate
1926 */
1927 inode = v9fs_get_inode(dir->i_sb, mode);
1928 if (IS_ERR(inode)) {
1929 err = PTR_ERR(inode);
1930 goto error;
1931 }
1932 dentry->d_op = &v9fs_dentry_operations;
1933 d_instantiate(dentry, inode);
1934 }
1935
1936error:
1937 if (fid)
1938 p9_client_clunk(fid);
1939 return err;
1940}
1941
1942static const struct inode_operations v9fs_dir_inode_operations_dotu = { 1240static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1943 .create = v9fs_vfs_create, 1241 .create = v9fs_vfs_create,
1944 .lookup = v9fs_vfs_lookup, 1242 .lookup = v9fs_vfs_lookup,
@@ -1953,25 +1251,6 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1953 .setattr = v9fs_vfs_setattr, 1251 .setattr = v9fs_vfs_setattr,
1954}; 1252};
1955 1253
1956static const struct inode_operations v9fs_dir_inode_operations_dotl = {
1957 .create = v9fs_vfs_create_dotl,
1958 .lookup = v9fs_vfs_lookup,
1959 .link = v9fs_vfs_link_dotl,
1960 .symlink = v9fs_vfs_symlink_dotl,
1961 .unlink = v9fs_vfs_unlink,
1962 .mkdir = v9fs_vfs_mkdir_dotl,
1963 .rmdir = v9fs_vfs_rmdir,
1964 .mknod = v9fs_vfs_mknod_dotl,
1965 .rename = v9fs_vfs_rename,
1966 .getattr = v9fs_vfs_getattr_dotl,
1967 .setattr = v9fs_vfs_setattr_dotl,
1968 .setxattr = generic_setxattr,
1969 .getxattr = generic_getxattr,
1970 .removexattr = generic_removexattr,
1971 .listxattr = v9fs_listxattr,
1972
1973};
1974
1975static const struct inode_operations v9fs_dir_inode_operations = { 1254static const struct inode_operations v9fs_dir_inode_operations = {
1976 .create = v9fs_vfs_create, 1255 .create = v9fs_vfs_create,
1977 .lookup = v9fs_vfs_lookup, 1256 .lookup = v9fs_vfs_lookup,
@@ -1989,15 +1268,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
1989 .setattr = v9fs_vfs_setattr, 1268 .setattr = v9fs_vfs_setattr,
1990}; 1269};
1991 1270
1992static const struct inode_operations v9fs_file_inode_operations_dotl = {
1993 .getattr = v9fs_vfs_getattr_dotl,
1994 .setattr = v9fs_vfs_setattr_dotl,
1995 .setxattr = generic_setxattr,
1996 .getxattr = generic_getxattr,
1997 .removexattr = generic_removexattr,
1998 .listxattr = v9fs_listxattr,
1999};
2000
2001static const struct inode_operations v9fs_symlink_inode_operations = { 1271static const struct inode_operations v9fs_symlink_inode_operations = {
2002 .readlink = generic_readlink, 1272 .readlink = generic_readlink,
2003 .follow_link = v9fs_vfs_follow_link, 1273 .follow_link = v9fs_vfs_follow_link,
@@ -2006,14 +1276,3 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
2006 .setattr = v9fs_vfs_setattr, 1276 .setattr = v9fs_vfs_setattr,
2007}; 1277};
2008 1278
2009static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
2010 .readlink = generic_readlink,
2011 .follow_link = v9fs_vfs_follow_link,
2012 .put_link = v9fs_vfs_put_link,
2013 .getattr = v9fs_vfs_getattr_dotl,
2014 .setattr = v9fs_vfs_setattr_dotl,
2015 .setxattr = generic_setxattr,
2016 .getxattr = generic_getxattr,
2017 .removexattr = generic_removexattr,
2018 .listxattr = v9fs_listxattr,
2019};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
new file mode 100644
index 000000000000..fe3ffa9aace4
--- /dev/null
+++ b/fs/9p/vfs_inode_dotl.c
@@ -0,0 +1,824 @@
1/*
2 * linux/fs/9p/vfs_inode_dotl.c
3 *
4 * This file contains vfs inode ops for the 9P2000.L protocol.
5 *
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to:
20 * Free Software Foundation
21 * 51 Franklin Street, Fifth Floor
22 * Boston, MA 02111-1301 USA
23 *
24 */
25
26#include <linux/module.h>
27#include <linux/errno.h>
28#include <linux/fs.h>
29#include <linux/file.h>
30#include <linux/pagemap.h>
31#include <linux/stat.h>
32#include <linux/string.h>
33#include <linux/inet.h>
34#include <linux/namei.h>
35#include <linux/idr.h>
36#include <linux/sched.h>
37#include <linux/slab.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <net/9p/9p.h>
41#include <net/9p/client.h>
42
43#include "v9fs.h"
44#include "v9fs_vfs.h"
45#include "fid.h"
46#include "cache.h"
47#include "xattr.h"
48#include "acl.h"
49
50static int
51v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
52 dev_t rdev);
53
54/**
55 * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
56 * new file system object. This checks S_ISGID on the parent directory to
57 * determine the owning group of the new file system object.
58 */
59
60static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
61{
62 BUG_ON(dir_inode == NULL);
63
64 if (dir_inode->i_mode & S_ISGID) {
65		/* set-gid bit is set. */
66 return dir_inode->i_gid;
67 }
68 return current_fsgid();
69}
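
The helper mirrors the usual BSD group-inheritance rule: if the parent directory is setgid, the new object takes the directory's gid, otherwise the caller's fsgid. A user-space sketch of the observable effect (paths hypothetical):

	#include <sys/stat.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		struct stat dir_st, file_st;
		int fd;

		if (stat("/mnt/9p/shared", &dir_st) < 0)
			return 1;
		fd = open("/mnt/9p/shared/new", O_CREAT | O_WRONLY, 0644);
		if (fd < 0 || fstat(fd, &file_st) < 0)
			return 1;
		close(fd);
		/* on a setgid dir, expect the file to inherit the dir's gid */
		if (dir_st.st_mode & S_ISGID)
			return file_st.st_gid == dir_st.st_gid ? 0 : 1;
		return 0;
	}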
70
71/**
72 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
73 * dir inode.
74 *
75 */
76
77static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
78{
79 struct dentry *dentry;
80
81 spin_lock(&inode->i_lock);
82 /* Directory should have only one entry. */
83 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
84 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
85 spin_unlock(&inode->i_lock);
86 return dentry;
87}
88
89struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb)
92{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st;
96
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st))
99 return ERR_CAST(st);
100
101 ret = v9fs_get_inode(sb, st->st_mode);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st);
119 return ret;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123}
124
125/**
126 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
127 * @dir: directory inode in which the new file is created
128 * @dentry: dentry of the file being created
129 * @omode: create permissions
130 * @nd: path information
131 *
132 */
133
134static int
135v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
136 struct nameidata *nd)
137{
138 int err = 0;
139 char *name = NULL;
140 gid_t gid;
141 int flags;
142 mode_t mode;
143 struct v9fs_session_info *v9ses;
144 struct p9_fid *fid = NULL;
145 struct p9_fid *dfid, *ofid;
146 struct file *filp;
147 struct p9_qid qid;
148 struct inode *inode;
149 struct posix_acl *pacl = NULL, *dacl = NULL;
150
151 v9ses = v9fs_inode2v9ses(dir);
152 if (nd && nd->flags & LOOKUP_OPEN)
153 flags = nd->intent.open.flags - 1;
154 else {
155 /*
156		 * A create call without LOOKUP_OPEN comes from
157		 * mknod of regular files, so use the mknod
158		 * operation.
159 */
160 return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
161 }
162
163 name = (char *) dentry->d_name.name;
164 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
165 "mode:0x%x\n", name, flags, omode);
166
167 dfid = v9fs_fid_lookup(dentry->d_parent);
168 if (IS_ERR(dfid)) {
169 err = PTR_ERR(dfid);
170 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
171 return err;
172 }
173
174 /* clone a fid to use for creation */
175 ofid = p9_client_walk(dfid, 0, NULL, 1);
176 if (IS_ERR(ofid)) {
177 err = PTR_ERR(ofid);
178 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
179 return err;
180 }
181
182 gid = v9fs_get_fsgid_for_create(dir);
183
184 mode = omode;
185 /* Update mode based on ACL value */
186 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
187 if (err) {
188 P9_DPRINTK(P9_DEBUG_VFS,
189 "Failed to get acl values in creat %d\n", err);
190 goto error;
191 }
192 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
193 if (err < 0) {
194 P9_DPRINTK(P9_DEBUG_VFS,
195 "p9_client_open_dotl failed in creat %d\n",
196 err);
197 goto error;
198 }
199
200 /* instantiate inode and assign the unopened fid to the dentry */
201 fid = p9_client_walk(dfid, 1, &name, 1);
202 if (IS_ERR(fid)) {
203 err = PTR_ERR(fid);
204 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
205 fid = NULL;
206 goto error;
207 }
208 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
209 if (IS_ERR(inode)) {
210 err = PTR_ERR(inode);
211 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
212 goto error;
213 }
214 d_instantiate(dentry, inode);
215 err = v9fs_fid_add(dentry, fid);
216 if (err < 0)
217 goto error;
218
219 /* Now set the ACL based on the default value */
220 v9fs_set_create_acl(dentry, dacl, pacl);
221
222 /* Since we are opening a file, assign the open fid to the file */
223 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
224 if (IS_ERR(filp)) {
225 p9_client_clunk(ofid);
226 return PTR_ERR(filp);
227 }
228 filp->private_data = ofid;
229 return 0;
230
231error:
232 if (ofid)
233 p9_client_clunk(ofid);
234 if (fid)
235 p9_client_clunk(fid);
236 return err;
237}
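
For reference, the flags that drive this hook: an open(2) with O_CREAT arrives with LOOKUP_OPEN set, so create and open happen atomically over one TLCREATE and the open fid lands in filp->private_data. A sketch, path hypothetical:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* O_CREAT via open(2): LOOKUP_OPEN set, one round trip
		 * creates and opens the file */
		int fd = open("/mnt/9p/newfile", O_CREAT | O_EXCL | O_RDWR, 0640);

		if (fd < 0)
			return 1;
		close(fd);
		return 0;
	}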
238
239/**
240 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
241 * @dir: inode of the parent directory
242 * @dentry: dentry of the new directory
243 * @omode: mode for new directory
244 *
245 */
246
247static int v9fs_vfs_mkdir_dotl(struct inode *dir,
248 struct dentry *dentry, int omode)
249{
250 int err;
251 struct v9fs_session_info *v9ses;
252 struct p9_fid *fid = NULL, *dfid = NULL;
253 gid_t gid;
254 char *name;
255 mode_t mode;
256 struct inode *inode;
257 struct p9_qid qid;
258 struct dentry *dir_dentry;
259 struct posix_acl *dacl = NULL, *pacl = NULL;
260
261 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
262 err = 0;
263 v9ses = v9fs_inode2v9ses(dir);
264
265 omode |= S_IFDIR;
266 if (dir->i_mode & S_ISGID)
267 omode |= S_ISGID;
268
269 dir_dentry = v9fs_dentry_from_dir_inode(dir);
270 dfid = v9fs_fid_lookup(dir_dentry);
271 if (IS_ERR(dfid)) {
272 err = PTR_ERR(dfid);
273 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
274 dfid = NULL;
275 goto error;
276 }
277
278 gid = v9fs_get_fsgid_for_create(dir);
279 mode = omode;
280 /* Update mode based on ACL value */
281 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
282 if (err) {
283 P9_DPRINTK(P9_DEBUG_VFS,
284 "Failed to get acl values in mkdir %d\n", err);
285 goto error;
286 }
287 name = (char *) dentry->d_name.name;
288 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
289 if (err < 0)
290 goto error;
291
292 /* instantiate inode and assign the unopened fid to the dentry */
293 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
294 fid = p9_client_walk(dfid, 1, &name, 1);
295 if (IS_ERR(fid)) {
296 err = PTR_ERR(fid);
297 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
298 err);
299 fid = NULL;
300 goto error;
301 }
302
303 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
304 if (IS_ERR(inode)) {
305 err = PTR_ERR(inode);
306 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
307 err);
308 goto error;
309 }
310 d_instantiate(dentry, inode);
311 err = v9fs_fid_add(dentry, fid);
312 if (err < 0)
313 goto error;
314 fid = NULL;
315 } else {
316 /*
317		 * Not in cached mode. No need to populate the
318		 * inode with stat; we still need an inode so
319		 * that we can set the ACL on the dentry.
320 */
321 inode = v9fs_get_inode(dir->i_sb, mode);
322 if (IS_ERR(inode)) {
323 err = PTR_ERR(inode);
324 goto error;
325 }
326 d_instantiate(dentry, inode);
327 }
328 /* Now set the ACL based on the default value */
329 v9fs_set_create_acl(dentry, dacl, pacl);
330
331error:
332 if (fid)
333 p9_client_clunk(fid);
334 return err;
335}
336
337static int
338v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
339 struct kstat *stat)
340{
341 int err;
342 struct v9fs_session_info *v9ses;
343 struct p9_fid *fid;
344 struct p9_stat_dotl *st;
345
346 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
347 err = -EPERM;
348 v9ses = v9fs_inode2v9ses(dentry->d_inode);
349 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
350 return simple_getattr(mnt, dentry, stat);
351
352 fid = v9fs_fid_lookup(dentry);
353 if (IS_ERR(fid))
354 return PTR_ERR(fid);
355
356	/* Ask for all the fields in the stat structure. The server will
357	 * return whatever it supports.
358 */
359
360 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
361 if (IS_ERR(st))
362 return PTR_ERR(st);
363
364 v9fs_stat2inode_dotl(st, dentry->d_inode);
365 generic_fillattr(dentry->d_inode, stat);
366 /* Change block size to what the server returned */
367 stat->blksize = st->st_blksize;
368
369 kfree(st);
370 return 0;
371}
372
373/**
374 * v9fs_vfs_setattr_dotl - set file metadata
375 * @dentry: file whose metadata to set
376 * @iattr: metadata assignment structure
377 *
378 */
379
380int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
381{
382 int retval;
383 struct v9fs_session_info *v9ses;
384 struct p9_fid *fid;
385 struct p9_iattr_dotl p9attr;
386
387 P9_DPRINTK(P9_DEBUG_VFS, "\n");
388
389 retval = inode_change_ok(dentry->d_inode, iattr);
390 if (retval)
391 return retval;
392
393 p9attr.valid = iattr->ia_valid;
394 p9attr.mode = iattr->ia_mode;
395 p9attr.uid = iattr->ia_uid;
396 p9attr.gid = iattr->ia_gid;
397 p9attr.size = iattr->ia_size;
398 p9attr.atime_sec = iattr->ia_atime.tv_sec;
399 p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
400 p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
401 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
402
403 retval = -EPERM;
404 v9ses = v9fs_inode2v9ses(dentry->d_inode);
405 fid = v9fs_fid_lookup(dentry);
406 if (IS_ERR(fid))
407 return PTR_ERR(fid);
408
409 retval = p9_client_setattr(fid, &p9attr);
410 if (retval < 0)
411 return retval;
412
413 if ((iattr->ia_valid & ATTR_SIZE) &&
414 iattr->ia_size != i_size_read(dentry->d_inode)) {
415 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
416 if (retval)
417 return retval;
418 }
419
420 setattr_copy(dentry->d_inode, iattr);
421 mark_inode_dirty(dentry->d_inode);
422 if (iattr->ia_valid & ATTR_MODE) {
423 /* We also want to update ACL when we update mode bits */
424 retval = v9fs_acl_chmod(dentry);
425 if (retval < 0)
426 return retval;
427 }
428 return 0;
429}
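
Each of these user-space calls lands in v9fs_vfs_setattr_dotl as one TSETATTR; a mode change additionally refreshes the ACL via v9fs_acl_chmod. Illustrative only, path hypothetical:

	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		truncate("/mnt/9p/file", 0);	/* ATTR_SIZE: setattr, then vmtruncate */
		chmod("/mnt/9p/file", 0600);	/* ATTR_MODE: setattr, then v9fs_acl_chmod */
		return 0;
	}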
430
431/**
432 * v9fs_stat2inode_dotl - populate an inode structure with stat info
433 * @stat: stat structure
434 * @inode: inode to populate
436 *
437 */
438
439void
440v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
441{
442
443 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
444 inode->i_atime.tv_sec = stat->st_atime_sec;
445 inode->i_atime.tv_nsec = stat->st_atime_nsec;
446 inode->i_mtime.tv_sec = stat->st_mtime_sec;
447 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
448 inode->i_ctime.tv_sec = stat->st_ctime_sec;
449 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
450 inode->i_uid = stat->st_uid;
451 inode->i_gid = stat->st_gid;
452 inode->i_nlink = stat->st_nlink;
453 inode->i_mode = stat->st_mode;
454 inode->i_rdev = new_decode_dev(stat->st_rdev);
455
456 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
457 init_special_inode(inode, inode->i_mode, inode->i_rdev);
458
459 i_size_write(inode, stat->st_size);
460 inode->i_blocks = stat->st_blocks;
461 } else {
462 if (stat->st_result_mask & P9_STATS_ATIME) {
463 inode->i_atime.tv_sec = stat->st_atime_sec;
464 inode->i_atime.tv_nsec = stat->st_atime_nsec;
465 }
466 if (stat->st_result_mask & P9_STATS_MTIME) {
467 inode->i_mtime.tv_sec = stat->st_mtime_sec;
468 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
469 }
470 if (stat->st_result_mask & P9_STATS_CTIME) {
471 inode->i_ctime.tv_sec = stat->st_ctime_sec;
472 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
473 }
474 if (stat->st_result_mask & P9_STATS_UID)
475 inode->i_uid = stat->st_uid;
476 if (stat->st_result_mask & P9_STATS_GID)
477 inode->i_gid = stat->st_gid;
478 if (stat->st_result_mask & P9_STATS_NLINK)
479 inode->i_nlink = stat->st_nlink;
480 if (stat->st_result_mask & P9_STATS_MODE) {
481 inode->i_mode = stat->st_mode;
482 if ((S_ISBLK(inode->i_mode)) ||
483 (S_ISCHR(inode->i_mode)))
484 init_special_inode(inode, inode->i_mode,
485 inode->i_rdev);
486 }
487 if (stat->st_result_mask & P9_STATS_RDEV)
488 inode->i_rdev = new_decode_dev(stat->st_rdev);
489 if (stat->st_result_mask & P9_STATS_SIZE)
490 i_size_write(inode, stat->st_size);
491 if (stat->st_result_mask & P9_STATS_BLOCKS)
492 inode->i_blocks = stat->st_blocks;
493 }
494 if (stat->st_result_mask & P9_STATS_GEN)
495 inode->i_generation = stat->st_gen;
496
497 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
498 * because the inode structure does not have fields for them.
499 */
500}
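
As a hedged sketch of a caller (assumed, not taken from this patch; fid and inode are presumed in scope), the usual pattern is to request only the fields needed and let this helper apply whatever subset the server acknowledges in st_result_mask:

	struct p9_stat_dotl *st;

	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
	if (IS_ERR(st))
		return PTR_ERR(st);
	v9fs_stat2inode_dotl(st, inode);	/* applies only reported fields */
	kfree(st);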
501
502static int
503v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
504 const char *symname)
505{
506 struct v9fs_session_info *v9ses;
507 struct p9_fid *dfid;
508 struct p9_fid *fid = NULL;
509 struct inode *inode;
510 struct p9_qid qid;
511 char *name;
512 int err;
513 gid_t gid;
514
515 name = (char *) dentry->d_name.name;
516 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
517 dir->i_ino, name, symname);
518 v9ses = v9fs_inode2v9ses(dir);
519
520 dfid = v9fs_fid_lookup(dentry->d_parent);
521 if (IS_ERR(dfid)) {
522 err = PTR_ERR(dfid);
523 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
524 return err;
525 }
526
527 gid = v9fs_get_fsgid_for_create(dir);
528
529 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
530 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
531
532 if (err < 0) {
533 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
534 goto error;
535 }
536
537 if (v9ses->cache) {
538 /* Now walk from the parent so we can get an unopened fid. */
539 fid = p9_client_walk(dfid, 1, &name, 1);
540 if (IS_ERR(fid)) {
541 err = PTR_ERR(fid);
542 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
543 err);
544 fid = NULL;
545 goto error;
546 }
547
548 /* instantiate inode and assign the unopened fid to dentry */
549 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
550 if (IS_ERR(inode)) {
551 err = PTR_ERR(inode);
552 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
553 err);
554 goto error;
555 }
556 d_instantiate(dentry, inode);
557 err = v9fs_fid_add(dentry, fid);
558 if (err < 0)
559 goto error;
560 fid = NULL;
561 } else {
562 /* Not in cached mode. No need to populate inode with stat */
563 inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
564 if (IS_ERR(inode)) {
565 err = PTR_ERR(inode);
566 goto error;
567 }
568 d_instantiate(dentry, inode);
569 }
570
571error:
572 if (fid)
573 p9_client_clunk(fid);
574
575 return err;
576}
577
578/**
579 * v9fs_vfs_link_dotl - create a hardlink for dotl
580 * @old_dentry: dentry for file to link to
581 * @dir: inode destination for new link
582 * @dentry: dentry for link
583 *
584 */
585
586static int
587v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
588 struct dentry *dentry)
589{
590 int err;
591 struct p9_fid *dfid, *oldfid;
592 char *name;
593 struct v9fs_session_info *v9ses;
594 struct dentry *dir_dentry;
595
596 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
597 dir->i_ino, old_dentry->d_name.name,
598 dentry->d_name.name);
599
600 v9ses = v9fs_inode2v9ses(dir);
601 dir_dentry = v9fs_dentry_from_dir_inode(dir);
602 dfid = v9fs_fid_lookup(dir_dentry);
603 if (IS_ERR(dfid))
604 return PTR_ERR(dfid);
605
606 oldfid = v9fs_fid_lookup(old_dentry);
607 if (IS_ERR(oldfid))
608 return PTR_ERR(oldfid);
609
610 name = (char *) dentry->d_name.name;
611
612 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
613
614 if (err < 0) {
615 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
616 return err;
617 }
618
619 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
620 /* Get the latest stat info from server. */
621 struct p9_fid *fid;
622 struct p9_stat_dotl *st;
623
624 fid = v9fs_fid_lookup(old_dentry);
625 if (IS_ERR(fid))
626 return PTR_ERR(fid);
627
628 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
629 if (IS_ERR(st))
630 return PTR_ERR(st);
631
632 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
633
634 kfree(st);
635 } else {
636		/* Caching disabled. No need to get up to date stat info.
637		 * This dentry will be released immediately, so just hold
638		 * the inode.
639 */
640 ihold(old_dentry->d_inode);
641 }
642 d_instantiate(dentry, old_dentry->d_inode);
643
644 return err;
645}
646
647/**
648 * v9fs_vfs_mknod_dotl - create a special file
649 * @dir: inode of the parent directory
650 * @dentry: dentry for the new special file
651 * @omode: mode for creation
652 * @rdev: device associated with special file
653 *
654 */
655static int
656v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
657 dev_t rdev)
658{
659 int err;
660 char *name;
661 mode_t mode;
662 struct v9fs_session_info *v9ses;
663 struct p9_fid *fid = NULL, *dfid = NULL;
664 struct inode *inode;
665 gid_t gid;
666 struct p9_qid qid;
667 struct dentry *dir_dentry;
668 struct posix_acl *dacl = NULL, *pacl = NULL;
669
670 P9_DPRINTK(P9_DEBUG_VFS,
671 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
672 dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
673
674 if (!new_valid_dev(rdev))
675 return -EINVAL;
676
677 v9ses = v9fs_inode2v9ses(dir);
678 dir_dentry = v9fs_dentry_from_dir_inode(dir);
679 dfid = v9fs_fid_lookup(dir_dentry);
680 if (IS_ERR(dfid)) {
681 err = PTR_ERR(dfid);
682 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
683 dfid = NULL;
684 goto error;
685 }
686
687 gid = v9fs_get_fsgid_for_create(dir);
688 mode = omode;
689 /* Update mode based on ACL value */
690 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
691 if (err) {
692 P9_DPRINTK(P9_DEBUG_VFS,
693 "Failed to get acl values in mknod %d\n", err);
694 goto error;
695 }
696 name = (char *) dentry->d_name.name;
697
698 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
699 if (err < 0)
700 goto error;
701
702 /* instantiate inode and assign the unopened fid to the dentry */
703 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
704 fid = p9_client_walk(dfid, 1, &name, 1);
705 if (IS_ERR(fid)) {
706 err = PTR_ERR(fid);
707 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
708 err);
709 fid = NULL;
710 goto error;
711 }
712
713 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
714 if (IS_ERR(inode)) {
715 err = PTR_ERR(inode);
716 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
717 err);
718 goto error;
719 }
720 d_instantiate(dentry, inode);
721 err = v9fs_fid_add(dentry, fid);
722 if (err < 0)
723 goto error;
724 fid = NULL;
725 } else {
726 /*
727		 * Not in cached mode; no need to populate the inode with stat.
728		 * The socket(2) syscall returns an fd, so we must instantiate.
729 */
730 inode = v9fs_get_inode(dir->i_sb, mode);
731 if (IS_ERR(inode)) {
732 err = PTR_ERR(inode);
733 goto error;
734 }
735 d_instantiate(dentry, inode);
736 }
737 /* Now set the ACL based on the default value */
738 v9fs_set_create_acl(dentry, dacl, pacl);
739error:
740 if (fid)
741 p9_client_clunk(fid);
742 return err;
743}
744
745/**
746 * v9fs_vfs_follow_link_dotl - follow a symlink path
747 * @dentry: dentry for symlink
748 * @nd: nameidata
749 *
750 */
751
752static void *
753v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
754{
755 int retval;
756 struct p9_fid *fid;
757 char *link = __getname();
758 char *target;
759
760 P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
761
762 if (!link) {
763 link = ERR_PTR(-ENOMEM);
764 goto ndset;
765 }
766 fid = v9fs_fid_lookup(dentry);
767 if (IS_ERR(fid)) {
768 __putname(link);
769		link = ERR_CAST(fid);
770 goto ndset;
771 }
772 retval = p9_client_readlink(fid, &target);
773 if (!retval) {
774		strlcpy(link, target, PATH_MAX);
775 kfree(target);
776 goto ndset;
777 }
778 __putname(link);
779 link = ERR_PTR(retval);
780ndset:
781 nd_set_link(nd, link);
782 return NULL;
783}
784
785const struct inode_operations v9fs_dir_inode_operations_dotl = {
786 .create = v9fs_vfs_create_dotl,
787 .lookup = v9fs_vfs_lookup,
788 .link = v9fs_vfs_link_dotl,
789 .symlink = v9fs_vfs_symlink_dotl,
790 .unlink = v9fs_vfs_unlink,
791 .mkdir = v9fs_vfs_mkdir_dotl,
792 .rmdir = v9fs_vfs_rmdir,
793 .mknod = v9fs_vfs_mknod_dotl,
794 .rename = v9fs_vfs_rename,
795 .getattr = v9fs_vfs_getattr_dotl,
796 .setattr = v9fs_vfs_setattr_dotl,
797 .setxattr = generic_setxattr,
798 .getxattr = generic_getxattr,
799 .removexattr = generic_removexattr,
800 .listxattr = v9fs_listxattr,
801 .check_acl = v9fs_check_acl,
802};
803
804const struct inode_operations v9fs_file_inode_operations_dotl = {
805 .getattr = v9fs_vfs_getattr_dotl,
806 .setattr = v9fs_vfs_setattr_dotl,
807 .setxattr = generic_setxattr,
808 .getxattr = generic_getxattr,
809 .removexattr = generic_removexattr,
810 .listxattr = v9fs_listxattr,
811 .check_acl = v9fs_check_acl,
812};
813
814const struct inode_operations v9fs_symlink_inode_operations_dotl = {
815 .readlink = generic_readlink,
816 .follow_link = v9fs_vfs_follow_link_dotl,
817 .put_link = v9fs_vfs_put_link,
818 .getattr = v9fs_vfs_getattr_dotl,
819 .setattr = v9fs_vfs_setattr_dotl,
820 .setxattr = generic_setxattr,
821 .getxattr = generic_getxattr,
822 .removexattr = generic_removexattr,
823 .listxattr = v9fs_listxattr,
824};
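
For readers who have not met the pre-3.x symlink API that v9fs_vfs_follow_link_dotl implements above, here is a minimal sketch of the ->follow_link/->put_link contract (v9fs pairs the function above with v9fs_vfs_put_link in the ops table). The example_* names are hypothetical; nd_set_link(), nd_get_link(), __getname() and __putname() are the real interfaces of this era:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/string.h>

static void *example_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	char *buf = __getname();	/* PATH_MAX-sized buffer */

	if (!buf) {
		/* an ERR_PTR stored here aborts the walk with that error */
		nd_set_link(nd, ERR_PTR(-ENOMEM));
		return NULL;
	}
	strlcpy(buf, "/some/target", PATH_MAX);
	nd_set_link(nd, buf);		/* the VFS walks this string next */
	return NULL;			/* cookie handed back to ->put_link */
}

static void example_put_link(struct dentry *dentry, struct nameidata *nd,
			     void *cookie)
{
	char *buf = nd_get_link(nd);

	if (!IS_ERR(buf))
		__putname(buf);		/* free what ->follow_link allocated */
}
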
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 1d12ba0ed3db..dbaabe3b8131 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -39,6 +39,7 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/statfs.h> 41#include <linux/statfs.h>
42#include <linux/magic.h>
42#include <net/9p/9p.h> 43#include <net/9p/9p.h>
43#include <net/9p/client.h> 44#include <net/9p/client.h>
44 45
@@ -46,6 +47,7 @@
46#include "v9fs_vfs.h" 47#include "v9fs_vfs.h"
47#include "fid.h" 48#include "fid.h"
48#include "xattr.h" 49#include "xattr.h"
50#include "acl.h"
49 51
50static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; 52static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
51 53
@@ -66,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
66 * v9fs_fill_super - populate superblock with info 68 * v9fs_fill_super - populate superblock with info
67 * @sb: superblock 69 * @sb: superblock
68 * @v9ses: session information 70 * @v9ses: session information
69 * @flags: flags propagated from v9fs_get_sb() 71 * @flags: flags propagated from v9fs_mount()
70 * 72 *
71 */ 73 */
72 74
@@ -88,22 +90,25 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
88 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 90 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
89 MS_NOATIME; 91 MS_NOATIME;
90 92
93#ifdef CONFIG_9P_FS_POSIX_ACL
94 if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
95 sb->s_flags |= MS_POSIXACL;
96#endif
97
91 save_mount_options(sb, data); 98 save_mount_options(sb, data);
92} 99}
93 100
94/** 101/**
95 * v9fs_get_sb - mount a superblock 102 * v9fs_mount - mount a superblock
96 * @fs_type: file system type 103 * @fs_type: file system type
97 * @flags: mount flags 104 * @flags: mount flags
98 * @dev_name: device name that was mounted 105 * @dev_name: device name that was mounted
99 * @data: mount options 106 * @data: mount options
100 * @mnt: mountpoint record to be instantiated
101 * 107 *
102 */ 108 */
103 109
104static int v9fs_get_sb(struct file_system_type *fs_type, int flags, 110static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
105 const char *dev_name, void *data, 111 const char *dev_name, void *data)
106 struct vfsmount *mnt)
107{ 112{
108 struct super_block *sb = NULL; 113 struct super_block *sb = NULL;
109 struct inode *inode = NULL; 114 struct inode *inode = NULL;
@@ -117,7 +122,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
117 122
118 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); 123 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
119 if (!v9ses) 124 if (!v9ses)
120 return -ENOMEM; 125 return ERR_PTR(-ENOMEM);
121 126
122 fid = v9fs_session_init(v9ses, dev_name, data); 127 fid = v9fs_session_init(v9ses, dev_name, data);
123 if (IS_ERR(fid)) { 128 if (IS_ERR(fid)) {
@@ -136,6 +141,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
136 } 141 }
137 v9fs_fill_super(sb, v9ses, flags, data); 142 v9fs_fill_super(sb, v9ses, flags, data);
138 143
144 if (v9ses->cache)
145 sb->s_d_op = &v9fs_cached_dentry_operations;
146 else
147 sb->s_d_op = &v9fs_dentry_operations;
148
139 inode = v9fs_get_inode(sb, S_IFDIR | mode); 149 inode = v9fs_get_inode(sb, S_IFDIR | mode);
140 if (IS_ERR(inode)) { 150 if (IS_ERR(inode)) {
141 retval = PTR_ERR(inode); 151 retval = PTR_ERR(inode);
@@ -149,7 +159,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
149 goto release_sb; 159 goto release_sb;
150 } 160 }
151 sb->s_root = root; 161 sb->s_root = root;
152
153 if (v9fs_proto_dotl(v9ses)) { 162 if (v9fs_proto_dotl(v9ses)) {
154 struct p9_stat_dotl *st = NULL; 163 struct p9_stat_dotl *st = NULL;
155 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 164 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
@@ -174,19 +183,21 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
174 p9stat_free(st); 183 p9stat_free(st);
175 kfree(st); 184 kfree(st);
176 } 185 }
177 186 retval = v9fs_get_acl(inode, fid);
187 if (retval)
188 goto release_sb;
178 v9fs_fid_add(root, fid); 189 v9fs_fid_add(root, fid);
179 190
180 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 191 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
181 simple_set_mnt(mnt, sb); 192 return dget(sb->s_root);
182 return 0;
183 193
184clunk_fid: 194clunk_fid:
185 p9_client_clunk(fid); 195 p9_client_clunk(fid);
186close_session: 196close_session:
187 v9fs_session_close(v9ses); 197 v9fs_session_close(v9ses);
188 kfree(v9ses); 198 kfree(v9ses);
189 return retval; 199 return ERR_PTR(retval);
200
190release_sb: 201release_sb:
191 /* 202 /*
192 * we will do the session_close and root dentry release 203 * we will do the session_close and root dentry release
@@ -196,7 +207,7 @@ release_sb:
196 */ 207 */
197 p9_client_clunk(fid); 208 p9_client_clunk(fid);
198 deactivate_locked_super(sb); 209 deactivate_locked_super(sb);
199 return retval; 210 return ERR_PTR(retval);
200} 211}
201 212
202/** 213/**
@@ -211,9 +222,6 @@ static void v9fs_kill_super(struct super_block *s)
211 222
212 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 223 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
213 224
214 if (s->s_root)
215 v9fs_dentry_release(s->s_root); /* clunk root */
216
217 kill_anon_super(s); 225 kill_anon_super(s);
218 226
219 v9fs_session_cancel(v9ses); 227 v9fs_session_cancel(v9ses);
@@ -249,7 +257,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
249 if (v9fs_proto_dotl(v9ses)) { 257 if (v9fs_proto_dotl(v9ses)) {
250 res = p9_client_statfs(fid, &rs); 258 res = p9_client_statfs(fid, &rs);
251 if (res == 0) { 259 if (res == 0) {
252 buf->f_type = rs.type; 260 buf->f_type = V9FS_MAGIC;
253 buf->f_bsize = rs.bsize; 261 buf->f_bsize = rs.bsize;
254 buf->f_blocks = rs.blocks; 262 buf->f_blocks = rs.blocks;
255 buf->f_bfree = rs.bfree; 263 buf->f_bfree = rs.bfree;
@@ -292,7 +300,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
292 300
293struct file_system_type v9fs_fs_type = { 301struct file_system_type v9fs_fs_type = {
294 .name = "9p", 302 .name = "9p",
295 .get_sb = v9fs_get_sb, 303 .mount = v9fs_mount,
296 .kill_sb = v9fs_kill_super, 304 .kill_sb = v9fs_kill_super,
297 .owner = THIS_MODULE, 305 .owner = THIS_MODULE,
298 .fs_flags = FS_RENAME_DOES_D_MOVE, 306 .fs_flags = FS_RENAME_DOES_D_MOVE,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index f88e5c2dc873..d288773871b3 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -21,30 +21,13 @@
21#include "fid.h" 21#include "fid.h"
22#include "xattr.h" 22#include "xattr.h"
23 23
24/* 24ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
25 * v9fs_xattr_get() 25 void *buffer, size_t buffer_size)
26 *
27 * Copy an extended attribute into the buffer
28 * provided, or compute the buffer size required.
29 * Buffer is NULL to compute the size of the buffer required.
30 *
31 * Returns a negative error number on failure, or the number of bytes
32 * used / required on success.
33 */
34ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
35 void *buffer, size_t buffer_size)
36{ 26{
37 ssize_t retval; 27 ssize_t retval;
38 int msize, read_count; 28 int msize, read_count;
39 u64 offset = 0, attr_size; 29 u64 offset = 0, attr_size;
40 struct p9_fid *fid, *attr_fid; 30 struct p9_fid *attr_fid;
41
42 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
43 __func__, name, buffer_size);
44
45 fid = v9fs_fid_lookup(dentry);
46 if (IS_ERR(fid))
47 return PTR_ERR(fid);
48 31
49 attr_fid = p9_client_xattrwalk(fid, name, &attr_size); 32 attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
50 if (IS_ERR(attr_fid)) { 33 if (IS_ERR(attr_fid)) {
@@ -88,6 +71,31 @@ error:
88 71
89} 72}
90 73
74
75/*
76 * v9fs_xattr_get()
77 *
78 * Copy an extended attribute into the buffer
79 * provided, or compute the buffer size required.
80 * Buffer is NULL to compute the size of the buffer required.
81 *
82 * Returns a negative error number on failure, or the number of bytes
83 * used / required on success.
84 */
85ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
86 void *buffer, size_t buffer_size)
87{
88 struct p9_fid *fid;
89
90 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
91 __func__, name, buffer_size);
92 fid = v9fs_fid_lookup(dentry);
93 if (IS_ERR(fid))
94 return PTR_ERR(fid);
95
96 return v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
97}
98
91/* 99/*
92 * v9fs_xattr_set() 100 * v9fs_xattr_set()
93 * 101 *
@@ -125,7 +133,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
125 "p9_client_xattrcreate failed %d\n", retval); 133 "p9_client_xattrcreate failed %d\n", retval);
126 goto error; 134 goto error;
127 } 135 }
128 msize = fid->clnt->msize;; 136 msize = fid->clnt->msize;
129 while (value_len) { 137 while (value_len) {
130 if (value_len > (msize - P9_IOHDRSZ)) 138 if (value_len > (msize - P9_IOHDRSZ))
131 write_count = msize - P9_IOHDRSZ; 139 write_count = msize - P9_IOHDRSZ;
@@ -156,5 +164,9 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
156 164
157const struct xattr_handler *v9fs_xattr_handlers[] = { 165const struct xattr_handler *v9fs_xattr_handlers[] = {
158 &v9fs_xattr_user_handler, 166 &v9fs_xattr_user_handler,
167#ifdef CONFIG_9P_FS_POSIX_ACL
168 &v9fs_xattr_acl_access_handler,
169 &v9fs_xattr_acl_default_handler,
170#endif
159 NULL 171 NULL
160}; 172};
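
Splitting v9fs_fid_xattr_get() out of v9fs_xattr_get() above lets callers that hold only a fid (notably the new POSIX ACL code) size and fetch an attribute without going through a dentry. A hedged sketch of the usual probe-then-fetch idiom; example_fetch_xattr is hypothetical, while v9fs_fid_xattr_get() is the interface added by this patch:

#include <linux/err.h>
#include <linux/slab.h>
#include "xattr.h"

static void *example_fetch_xattr(struct p9_fid *fid, const char *name)
{
	ssize_t size;
	void *value;

	size = v9fs_fid_xattr_get(fid, name, NULL, 0);	/* probe the size */
	if (size <= 0)
		return ERR_PTR(size ? size : -ENODATA);

	value = kzalloc(size, GFP_NOFS);
	if (!value)
		return ERR_PTR(-ENOMEM);

	size = v9fs_fid_xattr_get(fid, name, value, size);
	if (size < 0) {
		kfree(value);
		return ERR_PTR(size);
	}
	return value;
}
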
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index 9ddf672ae5c4..eaa837c53bd5 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -15,10 +15,16 @@
15#define FS_9P_XATTR_H 15#define FS_9P_XATTR_H
16 16
17#include <linux/xattr.h> 17#include <linux/xattr.h>
18#include <net/9p/9p.h>
19#include <net/9p/client.h>
18 20
19extern const struct xattr_handler *v9fs_xattr_handlers[]; 21extern const struct xattr_handler *v9fs_xattr_handlers[];
20extern struct xattr_handler v9fs_xattr_user_handler; 22extern struct xattr_handler v9fs_xattr_user_handler;
23extern const struct xattr_handler v9fs_xattr_acl_access_handler;
24extern const struct xattr_handler v9fs_xattr_acl_default_handler;
21 25
26extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
27 void *, size_t);
22extern ssize_t v9fs_xattr_get(struct dentry *, const char *, 28extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
23 void *, size_t); 29 void *, size_t);
24extern int v9fs_xattr_set(struct dentry *, const char *, 30extern int v9fs_xattr_set(struct dentry *, const char *,
diff --git a/fs/Kconfig b/fs/Kconfig
index 3d185308ec88..3db9caa57edc 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -30,15 +30,6 @@ config FS_MBCACHE
30source "fs/reiserfs/Kconfig" 30source "fs/reiserfs/Kconfig"
31source "fs/jfs/Kconfig" 31source "fs/jfs/Kconfig"
32 32
33config FS_POSIX_ACL
34# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
35#
36# NOTE: you can implement Posix ACLs without these helpers (XFS does).
37# Never use this symbol for ifdefs.
38#
39 bool
40 default n
41
42source "fs/xfs/Kconfig" 33source "fs/xfs/Kconfig"
43source "fs/gfs2/Kconfig" 34source "fs/gfs2/Kconfig"
44source "fs/ocfs2/Kconfig" 35source "fs/ocfs2/Kconfig"
@@ -47,8 +38,19 @@ source "fs/nilfs2/Kconfig"
47 38
48endif # BLOCK 39endif # BLOCK
49 40
41# Posix ACL utility routines
42#
43# Note: Posix ACLs can be implemented without these helpers. Never use
44# this symbol for ifdefs in core code.
45#
46config FS_POSIX_ACL
47 def_bool n
48
49config EXPORTFS
50 tristate
51
50config FILE_LOCKING 52config FILE_LOCKING
51 bool "Enable POSIX file locking API" if EMBEDDED 53 bool "Enable POSIX file locking API" if EXPERT
52 default y 54 default y
53 help 55 help
54 This option enables standard file locking support, required 56 This option enables standard file locking support, required
@@ -59,7 +61,6 @@ source "fs/notify/Kconfig"
59 61
60source "fs/quota/Kconfig" 62source "fs/quota/Kconfig"
61 63
62source "fs/autofs/Kconfig"
63source "fs/autofs4/Kconfig" 64source "fs/autofs4/Kconfig"
64source "fs/fuse/Kconfig" 65source "fs/fuse/Kconfig"
65 66
@@ -221,9 +222,6 @@ config LOCKD_V4
221 depends on FILE_LOCKING 222 depends on FILE_LOCKING
222 default y 223 default y
223 224
224config EXPORTFS
225 tristate
226
227config NFS_ACL_SUPPORT 225config NFS_ACL_SUPPORT
228 tristate 226 tristate
229 select FS_POSIX_ACL 227 select FS_POSIX_ACL
@@ -234,7 +232,6 @@ config NFS_COMMON
234 default y 232 default y
235 233
236source "net/sunrpc/Kconfig" 234source "net/sunrpc/Kconfig"
237source "fs/smbfs/Kconfig"
238source "fs/ceph/Kconfig" 235source "fs/ceph/Kconfig"
239source "fs/cifs/Kconfig" 236source "fs/cifs/Kconfig"
240source "fs/ncpfs/Kconfig" 237source "fs/ncpfs/Kconfig"
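
With FS_POSIX_ACL now a plain def_bool n, each filesystem that wants the helpers selects it from its own Kconfig entry. Roughly what the 9p entry added by this series looks like (the prompt and help text here are paraphrased, not quoted):

config 9P_FS_POSIX_ACL
	bool "9P POSIX Access Control Lists"
	depends on 9P_FS
	select FS_POSIX_ACL
	help
	  POSIX Access Control Lists (ACLs) support permissions for users
	  and groups beyond the owner/group/world scheme.

	  If you don't know what Access Control Lists are, say N.
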
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bb4cc5b8abc8..79e2ca7973b7 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
42 42
43config CORE_DUMP_DEFAULT_ELF_HEADERS 43config CORE_DUMP_DEFAULT_ELF_HEADERS
44 bool "Write ELF core dumps with partial segments" 44 bool "Write ELF core dumps with partial segments"
45 default n 45 default y
46 depends on BINFMT_ELF && ELF_CORE 46 depends on BINFMT_ELF && ELF_CORE
47 help 47 help
48 ELF core dump files describe each memory mapping of the crashed 48 ELF core dump files describe each memory mapping of the crashed
@@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
60 inherited. See Documentation/filesystems/proc.txt for details. 60 inherited. See Documentation/filesystems/proc.txt for details.
61 61
62 This config option changes the default setting of coredump_filter 62 This config option changes the default setting of coredump_filter
63 seen at boot time. If unsure, say N. 63 seen at boot time. If unsure, say Y.
64 64
65config BINFMT_FLAT 65config BINFMT_FLAT
66 bool "Kernel support for flat binaries" 66 bool "Kernel support for flat binaries"
diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1d..a7f7cef0c0c8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
29obj-$(CONFIG_AIO) += aio.o 29obj-$(CONFIG_AIO) += aio.o
30obj-$(CONFIG_FILE_LOCKING) += locks.o 30obj-$(CONFIG_FILE_LOCKING) += locks.o
31obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o 31obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
32 32obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o
33nfsd-$(CONFIG_NFSD) := nfsctl.o
34obj-y += $(nfsd-y) $(nfsd-m)
35
36obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o 33obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
37obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o 34obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
38obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o 35obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
@@ -91,7 +88,6 @@ obj-$(CONFIG_NFSD) += nfsd/
91obj-$(CONFIG_LOCKD) += lockd/ 88obj-$(CONFIG_LOCKD) += lockd/
92obj-$(CONFIG_NLS) += nls/ 89obj-$(CONFIG_NLS) += nls/
93obj-$(CONFIG_SYSV_FS) += sysv/ 90obj-$(CONFIG_SYSV_FS) += sysv/
94obj-$(CONFIG_SMB_FS) += smbfs/
95obj-$(CONFIG_CIFS) += cifs/ 91obj-$(CONFIG_CIFS) += cifs/
96obj-$(CONFIG_NCP_FS) += ncpfs/ 92obj-$(CONFIG_NCP_FS) += ncpfs/
97obj-$(CONFIG_HPFS_FS) += hpfs/ 93obj-$(CONFIG_HPFS_FS) += hpfs/
@@ -104,7 +100,6 @@ obj-$(CONFIG_UBIFS_FS) += ubifs/
104obj-$(CONFIG_AFFS_FS) += affs/ 100obj-$(CONFIG_AFFS_FS) += affs/
105obj-$(CONFIG_ROMFS_FS) += romfs/ 101obj-$(CONFIG_ROMFS_FS) += romfs/
106obj-$(CONFIG_QNX4FS_FS) += qnx4/ 102obj-$(CONFIG_QNX4FS_FS) += qnx4/
107obj-$(CONFIG_AUTOFS_FS) += autofs/
108obj-$(CONFIG_AUTOFS4_FS) += autofs4/ 103obj-$(CONFIG_AUTOFS4_FS) += autofs4/
109obj-$(CONFIG_ADFS_FS) += adfs/ 104obj-$(CONFIG_ADFS_FS) += adfs/
110obj-$(CONFIG_FUSE_FS) += fuse/ 105obj-$(CONFIG_FUSE_FS) += fuse/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..1dd5f34b3cf2 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
4 help 5 help
5 The Acorn Disc Filing System is the standard file system of the 6 The Acorn Disc Filing System is the standard file system of the
6 RiscOS operating system which runs on Acorn's ARM-based Risc PC 7 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index f4287e4de744..3b4a764ed780 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = {
201}; 201};
202 202
203static int 203static int
204adfs_hash(struct dentry *parent, struct qstr *qstr) 204adfs_hash(const struct dentry *parent, const struct inode *inode,
205 struct qstr *qstr)
205{ 206{
206 const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; 207 const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
207 const unsigned char *name; 208 const unsigned char *name;
@@ -237,17 +238,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr)
237 * requirements of the underlying filesystem. 238 * requirements of the underlying filesystem.
238 */ 239 */
239static int 240static int
240adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name) 241adfs_compare(const struct dentry *parent, const struct inode *pinode,
242 const struct dentry *dentry, const struct inode *inode,
243 unsigned int len, const char *str, const struct qstr *name)
241{ 244{
242 int i; 245 int i;
243 246
244 if (entry->len != name->len) 247 if (len != name->len)
245 return 1; 248 return 1;
246 249
247 for (i = 0; i < name->len; i++) { 250 for (i = 0; i < name->len; i++) {
248 char a, b; 251 char a, b;
249 252
250 a = entry->name[i]; 253 a = str[i];
251 b = name->name[i]; 254 b = name->name[i];
252 255
253 if (a >= 'A' && a <= 'Z') 256 if (a >= 'A' && a <= 'Z')
@@ -273,7 +276,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
273 struct object_info obj; 276 struct object_info obj;
274 int error; 277 int error;
275 278
276 dentry->d_op = &adfs_dentry_operations;
277 lock_kernel(); 279 lock_kernel();
278 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
279 if (error == 0) { 281 if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4a3af7075c1d..2d7954049fbe 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb)
240 return &ei->vfs_inode; 240 return &ei->vfs_inode;
241} 241}
242 242
243static void adfs_destroy_inode(struct inode *inode) 243static void adfs_i_callback(struct rcu_head *head)
244{ 244{
245 struct inode *inode = container_of(head, struct inode, i_rcu);
246 INIT_LIST_HEAD(&inode->i_dentry);
245 kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); 247 kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
246} 248}
247 249
250static void adfs_destroy_inode(struct inode *inode)
251{
252 call_rcu(&inode->i_rcu, adfs_i_callback);
253}
254
248static void init_once(void *foo) 255static void init_once(void *foo)
249{ 256{
250 struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; 257 struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
@@ -352,11 +359,15 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
352 struct adfs_sb_info *asb; 359 struct adfs_sb_info *asb;
353 struct inode *root; 360 struct inode *root;
354 361
362 lock_kernel();
363
355 sb->s_flags |= MS_NODIRATIME; 364 sb->s_flags |= MS_NODIRATIME;
356 365
357 asb = kzalloc(sizeof(*asb), GFP_KERNEL); 366 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
358 if (!asb) 367 if (!asb) {
368 unlock_kernel();
359 return -ENOMEM; 369 return -ENOMEM;
370 }
360 sb->s_fs_info = asb; 371 sb->s_fs_info = asb;
361 372
362 /* set default options */ 373 /* set default options */
@@ -462,6 +473,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
462 asb->s_namelen = ADFS_F_NAME_LEN; 473 asb->s_namelen = ADFS_F_NAME_LEN;
463 } 474 }
464 475
476 sb->s_d_op = &adfs_dentry_operations;
465 root = adfs_iget(sb, &root_obj); 477 root = adfs_iget(sb, &root_obj);
466 sb->s_root = d_alloc_root(root); 478 sb->s_root = d_alloc_root(root);
467 if (!sb->s_root) { 479 if (!sb->s_root) {
@@ -472,8 +484,8 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
472 kfree(asb->s_map); 484 kfree(asb->s_map);
473 adfs_error(sb, "get root inode failed\n"); 485 adfs_error(sb, "get root inode failed\n");
474 goto error; 486 goto error;
475 } else 487 }
476 sb->s_root->d_op = &adfs_dentry_operations; 488 unlock_kernel();
477 return 0; 489 return 0;
478 490
479error_free_bh: 491error_free_bh:
@@ -481,20 +493,20 @@ error_free_bh:
481error: 493error:
482 sb->s_fs_info = NULL; 494 sb->s_fs_info = NULL;
483 kfree(asb); 495 kfree(asb);
496 unlock_kernel();
484 return -EINVAL; 497 return -EINVAL;
485} 498}
486 499
487static int adfs_get_sb(struct file_system_type *fs_type, 500static struct dentry *adfs_mount(struct file_system_type *fs_type,
488 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 501 int flags, const char *dev_name, void *data)
489{ 502{
490 return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super, 503 return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
491 mnt);
492} 504}
493 505
494static struct file_system_type adfs_fs_type = { 506static struct file_system_type adfs_fs_type = {
495 .owner = THIS_MODULE, 507 .owner = THIS_MODULE,
496 .name = "adfs", 508 .name = "adfs",
497 .get_sb = adfs_get_sb, 509 .mount = adfs_mount,
498 .kill_sb = kill_block_super, 510 .kill_sb = kill_block_super,
499 .fs_flags = FS_REQUIRES_DEV, 511 .fs_flags = FS_REQUIRES_DEV,
500}; 512};
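
The adfs hunks above show the new per-superblock default: setting sb->s_d_op once in fill_super replaces assigning dentry->d_op in every ->lookup(), and the d_hash/d_compare methods gain const-qualified, RCU-walk-safe signatures. A minimal sketch with hypothetical example_* names:

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/string.h>

static int example_hash(const struct dentry *dentry, const struct inode *inode,
			struct qstr *qstr)
{
	return 0;	/* keep the hash the VFS already computed */
}

static int example_compare(const struct dentry *parent,
			   const struct inode *pinode,
			   const struct dentry *dentry,
			   const struct inode *inode,
			   unsigned int len, const char *str,
			   const struct qstr *name)
{
	/* 0 means match; may run under RCU-walk, so no blocking here */
	return len != name->len || memcmp(str, name->name, len);
}

static const struct dentry_operations example_dentry_ops = {
	.d_hash		= example_hash,
	.d_compare	= example_compare,
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_d_op = &example_dentry_ops;	/* inherited by all dentries */
	return 0;
}
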
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a8cbdeb34025..0e95f73a7023 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -201,6 +201,7 @@ extern const struct address_space_operations affs_aops;
201extern const struct address_space_operations affs_aops_ofs; 201extern const struct address_space_operations affs_aops_ofs;
202 202
203extern const struct dentry_operations affs_dentry_operations; 203extern const struct dentry_operations affs_dentry_operations;
204extern const struct dentry_operations affs_intl_dentry_operations;
204 205
205static inline void 206static inline void
206affs_set_blocksize(struct super_block *sb, int size) 207affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7d0f0a30f7a3..3a4557e8325c 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -128,7 +128,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
128 void *data = dentry->d_fsdata; 128 void *data = dentry->d_fsdata;
129 struct list_head *head, *next; 129 struct list_head *head, *next;
130 130
131 spin_lock(&dcache_lock); 131 spin_lock(&inode->i_lock);
132 head = &inode->i_dentry; 132 head = &inode->i_dentry;
133 next = head->next; 133 next = head->next;
134 while (next != head) { 134 while (next != head) {
@@ -139,7 +139,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
139 } 139 }
140 next = next->next; 140 next = next->next;
141 } 141 }
142 spin_unlock(&dcache_lock); 142 spin_unlock(&inode->i_lock);
143} 143}
144 144
145 145
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c4a9875bd1a6..0a90dcd46de2 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -894,9 +894,9 @@ affs_truncate(struct inode *inode)
894 if (AFFS_SB(sb)->s_flags & SF_OFS) { 894 if (AFFS_SB(sb)->s_flags & SF_OFS) {
895 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); 895 struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
896 u32 tmp; 896 u32 tmp;
897 if (IS_ERR(ext_bh)) { 897 if (IS_ERR(bh)) {
898 affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", 898 affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
899 ext, PTR_ERR(ext_bh)); 899 ext, PTR_ERR(bh));
900 return; 900 return;
901 } 901 }
902 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); 902 tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3a0fdec175ba..5d828903ac69 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); 388 affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
389 mark_buffer_dirty_inode(inode_bh, inode); 389 mark_buffer_dirty_inode(inode_bh, inode);
390 inode->i_nlink = 2; 390 inode->i_nlink = 2;
391 atomic_inc(&inode->i_count); 391 ihold(inode);
392 } 392 }
393 affs_fix_checksum(sb, bh); 393 affs_fix_checksum(sb, bh);
394 mark_buffer_dirty_inode(bh, inode); 394 mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 914d1c0bc07a..e3e9efc1fdd8 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,26 @@
13typedef int (*toupper_t)(int); 13typedef int (*toupper_t)(int);
14 14
15static int affs_toupper(int ch); 15static int affs_toupper(int ch);
16static int affs_hash_dentry(struct dentry *, struct qstr *); 16static int affs_hash_dentry(const struct dentry *,
17static int affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); 17 const struct inode *, struct qstr *);
18static int affs_compare_dentry(const struct dentry *parent,
19 const struct inode *pinode,
20 const struct dentry *dentry, const struct inode *inode,
21 unsigned int len, const char *str, const struct qstr *name);
18static int affs_intl_toupper(int ch); 22static int affs_intl_toupper(int ch);
19static int affs_intl_hash_dentry(struct dentry *, struct qstr *); 23static int affs_intl_hash_dentry(const struct dentry *,
20static int affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *); 24 const struct inode *, struct qstr *);
25static int affs_intl_compare_dentry(const struct dentry *parent,
26 const struct inode *pinode,
27 const struct dentry *dentry, const struct inode *inode,
28 unsigned int len, const char *str, const struct qstr *name);
21 29
22const struct dentry_operations affs_dentry_operations = { 30const struct dentry_operations affs_dentry_operations = {
23 .d_hash = affs_hash_dentry, 31 .d_hash = affs_hash_dentry,
24 .d_compare = affs_compare_dentry, 32 .d_compare = affs_compare_dentry,
25}; 33};
26 34
27static const struct dentry_operations affs_intl_dentry_operations = { 35const struct dentry_operations affs_intl_dentry_operations = {
28 .d_hash = affs_intl_hash_dentry, 36 .d_hash = affs_intl_hash_dentry,
29 .d_compare = affs_intl_compare_dentry, 37 .d_compare = affs_intl_compare_dentry,
30}; 38};
@@ -58,13 +66,13 @@ affs_get_toupper(struct super_block *sb)
58 * Note: the dentry argument is the parent dentry. 66 * Note: the dentry argument is the parent dentry.
59 */ 67 */
60static inline int 68static inline int
61__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper) 69__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
62{ 70{
63 const u8 *name = qstr->name; 71 const u8 *name = qstr->name;
64 unsigned long hash; 72 unsigned long hash;
65 int i; 73 int i;
66 74
67 i = affs_check_name(qstr->name,qstr->len); 75 i = affs_check_name(qstr->name, qstr->len);
68 if (i) 76 if (i)
69 return i; 77 return i;
70 78
@@ -78,39 +86,41 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
78} 86}
79 87
80static int 88static int
81affs_hash_dentry(struct dentry *dentry, struct qstr *qstr) 89affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
90 struct qstr *qstr)
82{ 91{
83 return __affs_hash_dentry(dentry, qstr, affs_toupper); 92 return __affs_hash_dentry(qstr, affs_toupper);
84} 93}
85static int 94static int
86affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr) 95affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
96 struct qstr *qstr)
87{ 97{
88 return __affs_hash_dentry(dentry, qstr, affs_intl_toupper); 98 return __affs_hash_dentry(qstr, affs_intl_toupper);
89} 99}
90 100
91static inline int 101static inline int __affs_compare_dentry(unsigned int len,
92__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper) 102 const char *str, const struct qstr *name, toupper_t toupper)
93{ 103{
94 const u8 *aname = a->name; 104 const u8 *aname = str;
95 const u8 *bname = b->name; 105 const u8 *bname = name->name;
96 int len;
97 106
98 /* 'a' is the qstr of an already existing dentry, so the name 107 /*
99 * must be valid. 'b' must be validated first. 108 * 'str' is the name of an already existing dentry, so the name
109 * must be valid. 'name' must be validated first.
100 */ 110 */
101 111
102 if (affs_check_name(b->name,b->len)) 112 if (affs_check_name(name->name, name->len))
103 return 1; 113 return 1;
104 114
105 /* If the names are longer than the allowed 30 chars, 115 /*
116 * If the names are longer than the allowed 30 chars,
106 * the excess is ignored, so their length may differ. 117 * the excess is ignored, so their length may differ.
107 */ 118 */
108 len = a->len;
109 if (len >= 30) { 119 if (len >= 30) {
110 if (b->len < 30) 120 if (name->len < 30)
111 return 1; 121 return 1;
112 len = 30; 122 len = 30;
113 } else if (len != b->len) 123 } else if (len != name->len)
114 return 1; 124 return 1;
115 125
116 for (; len > 0; len--) 126 for (; len > 0; len--)
@@ -121,14 +131,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou
121} 131}
122 132
123static int 133static int
124affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 134affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
135 const struct dentry *dentry, const struct inode *inode,
136 unsigned int len, const char *str, const struct qstr *name)
125{ 137{
126 return __affs_compare_dentry(dentry, a, b, affs_toupper); 138 return __affs_compare_dentry(len, str, name, affs_toupper);
127} 139}
128static int 140static int
129affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 141affs_intl_compare_dentry(const struct dentry *parent, const struct inode *pinode,
142 const struct dentry *dentry, const struct inode *inode,
143 unsigned int len, const char *str, const struct qstr *name)
130{ 144{
131 return __affs_compare_dentry(dentry, a, b, affs_intl_toupper); 145 return __affs_compare_dentry(len, str, name, affs_intl_toupper);
132} 146}
133 147
134/* 148/*
@@ -226,7 +240,6 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
226 if (IS_ERR(inode)) 240 if (IS_ERR(inode))
227 return ERR_CAST(inode); 241 return ERR_CAST(inode);
228 } 242 }
229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
230 d_add(dentry, inode); 243 d_add(dentry, inode);
231 return NULL; 244 return NULL;
232} 245}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 33c4e7eef470..b31507d0f9b9 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,7 +16,6 @@
16#include <linux/parser.h> 16#include <linux/parser.h>
17#include <linux/magic.h> 17#include <linux/magic.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/smp_lock.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include "affs.h" 20#include "affs.h"
22 21
@@ -46,8 +45,6 @@ affs_put_super(struct super_block *sb)
46 struct affs_sb_info *sbi = AFFS_SB(sb); 45 struct affs_sb_info *sbi = AFFS_SB(sb);
47 pr_debug("AFFS: put_super()\n"); 46 pr_debug("AFFS: put_super()\n");
48 47
49 lock_kernel();
50
51 if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt) 48 if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
52 affs_commit_super(sb, 1, 1); 49 affs_commit_super(sb, 1, 1);
53 50
@@ -56,8 +53,6 @@ affs_put_super(struct super_block *sb)
56 affs_brelse(sbi->s_root_bh); 53 affs_brelse(sbi->s_root_bh);
57 kfree(sbi); 54 kfree(sbi);
58 sb->s_fs_info = NULL; 55 sb->s_fs_info = NULL;
59
60 unlock_kernel();
61} 56}
62 57
63static void 58static void
@@ -100,17 +95,24 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
100 return &i->vfs_inode; 95 return &i->vfs_inode;
101} 96}
102 97
103static void affs_destroy_inode(struct inode *inode) 98static void affs_i_callback(struct rcu_head *head)
104{ 99{
100 struct inode *inode = container_of(head, struct inode, i_rcu);
101 INIT_LIST_HEAD(&inode->i_dentry);
105 kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); 102 kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
106} 103}
107 104
105static void affs_destroy_inode(struct inode *inode)
106{
107 call_rcu(&inode->i_rcu, affs_i_callback);
108}
109
108static void init_once(void *foo) 110static void init_once(void *foo)
109{ 111{
110 struct affs_inode_info *ei = (struct affs_inode_info *) foo; 112 struct affs_inode_info *ei = (struct affs_inode_info *) foo;
111 113
112 init_MUTEX(&ei->i_link_lock); 114 sema_init(&ei->i_link_lock, 1);
113 init_MUTEX(&ei->i_ext_lock); 115 sema_init(&ei->i_ext_lock, 1);
114 inode_init_once(&ei->vfs_inode); 116 inode_init_once(&ei->vfs_inode);
115} 117}
116 118
@@ -302,6 +304,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
302 sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); 304 sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
303 if (!sbi) 305 if (!sbi)
304 return -ENOMEM; 306 return -ENOMEM;
307
305 sb->s_fs_info = sbi; 308 sb->s_fs_info = sbi;
306 mutex_init(&sbi->s_bmlock); 309 mutex_init(&sbi->s_bmlock);
307 spin_lock_init(&sbi->symlink_lock); 310 spin_lock_init(&sbi->symlink_lock);
@@ -474,12 +477,16 @@ got_root:
474 goto out_error_noinode; 477 goto out_error_noinode;
475 } 478 }
476 479
480 if (AFFS_SB(sb)->s_flags & SF_INTL)
481 sb->s_d_op = &affs_intl_dentry_operations;
482 else
483 sb->s_d_op = &affs_dentry_operations;
484
477 sb->s_root = d_alloc_root(root_inode); 485 sb->s_root = d_alloc_root(root_inode);
478 if (!sb->s_root) { 486 if (!sb->s_root) {
479 printk(KERN_ERR "AFFS: Get root inode failed\n"); 487 printk(KERN_ERR "AFFS: Get root inode failed\n");
480 goto out_error; 488 goto out_error;
481 } 489 }
482 sb->s_root->d_op = &affs_dentry_operations;
483 490
484 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); 491 pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
485 return 0; 492 return 0;
@@ -527,7 +534,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
527 kfree(new_opts); 534 kfree(new_opts);
528 return -EINVAL; 535 return -EINVAL;
529 } 536 }
530 lock_kernel(); 537
531 replace_mount_options(sb, new_opts); 538 replace_mount_options(sb, new_opts);
532 539
533 sbi->s_flags = mount_flags; 540 sbi->s_flags = mount_flags;
@@ -543,17 +550,15 @@ affs_remount(struct super_block *sb, int *flags, char *data)
543 memcpy(sbi->s_volume, volume, 32); 550 memcpy(sbi->s_volume, volume, 32);
544 spin_unlock(&sbi->symlink_lock); 551 spin_unlock(&sbi->symlink_lock);
545 552
546 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 553 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
547 unlock_kernel();
548 return 0; 554 return 0;
549 } 555
550 if (*flags & MS_RDONLY) { 556 if (*flags & MS_RDONLY) {
551 affs_write_super(sb); 557 affs_write_super(sb);
552 affs_free_bitmap(sb); 558 affs_free_bitmap(sb);
553 } else 559 } else
554 res = affs_init_bitmap(sb, flags); 560 res = affs_init_bitmap(sb, flags);
555 561
556 unlock_kernel();
557 return res; 562 return res;
558} 563}
559 564
@@ -579,17 +584,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
579 return 0; 584 return 0;
580} 585}
581 586
582static int affs_get_sb(struct file_system_type *fs_type, 587static struct dentry *affs_mount(struct file_system_type *fs_type,
583 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 588 int flags, const char *dev_name, void *data)
584{ 589{
585 return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super, 590 return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
586 mnt);
587} 591}
588 592
589static struct file_system_type affs_fs_type = { 593static struct file_system_type affs_fs_type = {
590 .owner = THIS_MODULE, 594 .owner = THIS_MODULE,
591 .name = "affs", 595 .name = "affs",
592 .get_sb = affs_get_sb, 596 .mount = affs_mount,
593 .kill_sb = kill_block_super, 597 .kill_sb = kill_block_super,
594 .fs_flags = FS_REQUIRES_DEV, 598 .fs_flags = FS_REQUIRES_DEV,
595}; 599};
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3bcec75c54a..1c8c6cc6de30 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
289 call->server = server; 289 call->server = server;
290 290
291 INIT_WORK(&call->work, SRXAFSCB_CallBack); 291 INIT_WORK(&call->work, SRXAFSCB_CallBack);
292 schedule_work(&call->work); 292 queue_work(afs_wq, &call->work);
293 return 0; 293 return 0;
294} 294}
295 295
@@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
336 call->server = server; 336 call->server = server;
337 337
338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 338 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
339 schedule_work(&call->work); 339 queue_work(afs_wq, &call->work);
340 return 0; 340 return 0;
341} 341}
342 342
@@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
367 call->server = server; 367 call->server = server;
368 368
369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); 369 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
370 schedule_work(&call->work); 370 queue_work(afs_wq, &call->work);
371 return 0; 371 return 0;
372} 372}
373 373
@@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
400 call->state = AFS_CALL_REPLYING; 400 call->state = AFS_CALL_REPLYING;
401 401
402 INIT_WORK(&call->work, SRXAFSCB_Probe); 402 INIT_WORK(&call->work, SRXAFSCB_Probe);
403 schedule_work(&call->work); 403 queue_work(afs_wq, &call->work);
404 return 0; 404 return 0;
405} 405}
406 406
@@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
496 call->state = AFS_CALL_REPLYING; 496 call->state = AFS_CALL_REPLYING;
497 497
498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); 498 INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
499 schedule_work(&call->work); 499 queue_work(afs_wq, &call->work);
500 return 0; 500 return 0;
501} 501}
502 502
@@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
580 call->state = AFS_CALL_REPLYING; 580 call->state = AFS_CALL_REPLYING;
581 581
582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); 582 INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
583 schedule_work(&call->work); 583 queue_work(afs_wq, &call->work);
584 return 0; 584 return 0;
585} 585}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0d38c09bd55e..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/namei.h>
16#include <linux/pagemap.h> 17#include <linux/pagemap.h>
17#include <linux/ctype.h> 18#include <linux/ctype.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
@@ -23,7 +24,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
23static int afs_dir_open(struct inode *inode, struct file *file); 24static int afs_dir_open(struct inode *inode, struct file *file);
24static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); 25static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
25static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); 26static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
26static int afs_d_delete(struct dentry *dentry); 27static int afs_d_delete(const struct dentry *dentry);
27static void afs_d_release(struct dentry *dentry); 28static void afs_d_release(struct dentry *dentry);
28static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, 29static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
29 loff_t fpos, u64 ino, unsigned dtype); 30 loff_t fpos, u64 ino, unsigned dtype);
@@ -61,10 +62,11 @@ const struct inode_operations afs_dir_inode_operations = {
61 .setattr = afs_setattr, 62 .setattr = afs_setattr,
62}; 63};
63 64
64static const struct dentry_operations afs_fs_dentry_operations = { 65const struct dentry_operations afs_fs_dentry_operations = {
65 .d_revalidate = afs_d_revalidate, 66 .d_revalidate = afs_d_revalidate,
66 .d_delete = afs_d_delete, 67 .d_delete = afs_d_delete,
67 .d_release = afs_d_release, 68 .d_release = afs_d_release,
69 .d_automount = afs_d_automount,
68}; 70};
69 71
70#define AFS_DIR_HASHTBL_SIZE 128 72#define AFS_DIR_HASHTBL_SIZE 128
@@ -581,8 +583,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
581 } 583 }
582 584
583success: 585success:
584 dentry->d_op = &afs_fs_dentry_operations;
585
586 d_add(dentry, inode); 586 d_add(dentry, inode);
587 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", 587 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
588 fid.vnode, 588 fid.vnode,
@@ -607,6 +607,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
607 void *dir_version; 607 void *dir_version;
608 int ret; 608 int ret;
609 609
610 if (nd->flags & LOOKUP_RCU)
611 return -ECHILD;
612
610 vnode = AFS_FS_I(dentry->d_inode); 613 vnode = AFS_FS_I(dentry->d_inode);
611 614
612 if (dentry->d_inode) 615 if (dentry->d_inode)
@@ -730,7 +733,7 @@ out_bad:
730 * - called from dput() when d_count is going to 0. 733 * - called from dput() when d_count is going to 0.
731 * - return 1 to request dentry be unhashed, 0 otherwise 734 * - return 1 to request dentry be unhashed, 0 otherwise
732 */ 735 */
733static int afs_d_delete(struct dentry *dentry) 736static int afs_d_delete(const struct dentry *dentry)
734{ 737{
735 _enter("%s", dentry->d_name.name); 738 _enter("%s", dentry->d_name.name);
736 739
@@ -1045,7 +1048,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
1045 if (ret < 0) 1048 if (ret < 0)
1046 goto link_error; 1049 goto link_error;
1047 1050
1048 atomic_inc(&vnode->vfs_inode.i_count); 1051 ihold(&vnode->vfs_inode);
1049 d_instantiate(dentry, &vnode->vfs_inode); 1052 d_instantiate(dentry, &vnode->vfs_inode);
1050 key_put(key); 1053 key_put(key);
1051 _leave(" = 0"); 1054 _leave(" = 0");
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 0931bc1325eb..757d664575dd 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/smp_lock.h>
13#include "internal.h" 12#include "internal.h"
14 13
15#define AFS_LOCK_GRANTED 0 14#define AFS_LOCK_GRANTED 0
@@ -274,7 +273,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
274 273
275 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; 274 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
276 275
277 lock_kernel(); 276 lock_flocks();
278 277
279 /* make sure we've got a callback on this file and that our view of the 278 /* make sure we've got a callback on this file and that our view of the
280 * data version is up to date */ 279 * data version is up to date */
@@ -421,7 +420,7 @@ given_lock:
421 afs_vnode_fetch_status(vnode, NULL, key); 420 afs_vnode_fetch_status(vnode, NULL, key);
422 421
423error: 422error:
424 unlock_kernel(); 423 unlock_flocks();
425 _leave(" = %d", ret); 424 _leave(" = %d", ret);
426 return ret; 425 return ret;
427 426
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0747339011c3..db66c5201474 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
184 inode->i_generation = 0; 184 inode->i_generation = 0;
185 185
186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); 186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
187 inode->i_flags |= S_NOATIME; 187 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
188 inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
188 unlock_new_inode(inode); 189 unlock_new_inode(inode);
189 _leave(" = %p", inode); 190 _leave(" = %p", inode);
190 return inode; 191 return inode;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index cca8eef736fc..5a9b6843bac1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -486,6 +486,7 @@ extern bool afs_cm_incoming_call(struct afs_call *);
486 * dir.c 486 * dir.c
487 */ 487 */
488extern const struct inode_operations afs_dir_inode_operations; 488extern const struct inode_operations afs_dir_inode_operations;
489extern const struct dentry_operations afs_fs_dentry_operations;
489extern const struct file_operations afs_dir_file_operations; 490extern const struct file_operations afs_dir_file_operations;
490 491
491/* 492/*
@@ -576,6 +577,7 @@ extern int afs_drop_inode(struct inode *);
576/* 577/*
577 * main.c 578 * main.c
578 */ 579 */
580extern struct workqueue_struct *afs_wq;
579extern struct afs_uuid afs_uuid; 581extern struct afs_uuid afs_uuid;
580 582
581/* 583/*
@@ -590,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations;
590extern const struct inode_operations afs_autocell_inode_operations; 592extern const struct inode_operations afs_autocell_inode_operations;
591extern const struct file_operations afs_mntpt_file_operations; 593extern const struct file_operations afs_mntpt_file_operations;
592 594
595extern struct vfsmount *afs_d_automount(struct path *);
593extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); 596extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
594extern void afs_mntpt_kill_timer(void); 597extern void afs_mntpt_kill_timer(void);
595 598
@@ -624,7 +627,7 @@ extern void afs_clear_permits(struct afs_vnode *);
624extern void afs_cache_permit(struct afs_vnode *, struct key *, long); 627extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
625extern void afs_zap_permits(struct rcu_head *); 628extern void afs_zap_permits(struct rcu_head *);
626extern struct key *afs_request_key(struct afs_cell *); 629extern struct key *afs_request_key(struct afs_cell *);
627extern int afs_permission(struct inode *, int); 630extern int afs_permission(struct inode *, int, unsigned int);
628 631
629/* 632/*
630 * server.c 633 * server.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cfd1cbe25b22..42dd2e499ed8 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -30,6 +30,7 @@ module_param(rootcell, charp, 0);
30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); 30MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
31 31
32struct afs_uuid afs_uuid; 32struct afs_uuid afs_uuid;
33struct workqueue_struct *afs_wq;
33 34
34/* 35/*
35 * get a client UUID 36 * get a client UUID
@@ -87,10 +88,16 @@ static int __init afs_init(void)
87 if (ret < 0) 88 if (ret < 0)
88 return ret; 89 return ret;
89 90
91 /* create workqueue */
92 ret = -ENOMEM;
93 afs_wq = alloc_workqueue("afs", 0, 0);
94 if (!afs_wq)
95 return ret;
96
90 /* register the /proc stuff */ 97 /* register the /proc stuff */
91 ret = afs_proc_init(); 98 ret = afs_proc_init();
92 if (ret < 0) 99 if (ret < 0)
93 return ret; 100 goto error_proc;
94 101
95#ifdef CONFIG_AFS_FSCACHE 102#ifdef CONFIG_AFS_FSCACHE
96 /* we want to be able to cache */ 103 /* we want to be able to cache */
@@ -140,6 +147,8 @@ error_cell_init:
140error_cache: 147error_cache:
141#endif 148#endif
142 afs_proc_cleanup(); 149 afs_proc_cleanup();
150error_proc:
151 destroy_workqueue(afs_wq);
143 rcu_barrier(); 152 rcu_barrier();
144 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 153 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
145 return ret; 154 return ret;
@@ -163,7 +172,7 @@ static void __exit afs_exit(void)
163 afs_purge_servers(); 172 afs_purge_servers();
164 afs_callback_update_kill(); 173 afs_callback_update_kill();
165 afs_vlocation_purge(); 174 afs_vlocation_purge();
166 flush_scheduled_work(); 175 destroy_workqueue(afs_wq);
167 afs_cell_purge(); 176 afs_cell_purge();
168#ifdef CONFIG_AFS_FSCACHE 177#ifdef CONFIG_AFS_FSCACHE
169 fscache_unregister_netfs(&afs_cache_netfs); 178 fscache_unregister_netfs(&afs_cache_netfs);
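
The schedule_work()/schedule_delayed_work() conversions throughout this series all follow the same recipe: give the subsystem its own workqueue so its work items no longer ride the shared kernel queue, and so destroy_workqueue() can flush them at module exit. A hedged sketch with hypothetical example_* names:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static void example_worker(struct work_struct *work)
{
	/* deferred work goes here */
}
static DECLARE_WORK(example_work, example_worker);

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example", 0, 0);
	if (!example_wq)
		return -ENOMEM;
	queue_work(example_wq, &example_work);	/* not schedule_work() */
	return 0;
}

static void __exit example_exit(void)
{
	destroy_workqueue(example_wq);	/* waits for queued work to finish */
}
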
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6d552686c498..aa59184151d0 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -24,22 +24,20 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
24 struct dentry *dentry, 24 struct dentry *dentry,
25 struct nameidata *nd); 25 struct nameidata *nd);
26static int afs_mntpt_open(struct inode *inode, struct file *file); 26static int afs_mntpt_open(struct inode *inode, struct file *file);
27static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
28static void afs_mntpt_expiry_timed_out(struct work_struct *work); 27static void afs_mntpt_expiry_timed_out(struct work_struct *work);
29 28
30const struct file_operations afs_mntpt_file_operations = { 29const struct file_operations afs_mntpt_file_operations = {
31 .open = afs_mntpt_open, 30 .open = afs_mntpt_open,
31 .llseek = noop_llseek,
32}; 32};
33 33
34const struct inode_operations afs_mntpt_inode_operations = { 34const struct inode_operations afs_mntpt_inode_operations = {
35 .lookup = afs_mntpt_lookup, 35 .lookup = afs_mntpt_lookup,
36 .follow_link = afs_mntpt_follow_link,
37 .readlink = page_readlink, 36 .readlink = page_readlink,
38 .getattr = afs_getattr, 37 .getattr = afs_getattr,
39}; 38};
40 39
41const struct inode_operations afs_autocell_inode_operations = { 40const struct inode_operations afs_autocell_inode_operations = {
42 .follow_link = afs_mntpt_follow_link,
43 .getattr = afs_getattr, 41 .getattr = afs_getattr,
44}; 42};
45 43
@@ -87,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
87 _debug("symlink is a mountpoint"); 85 _debug("symlink is a mountpoint");
88 spin_lock(&vnode->lock); 86 spin_lock(&vnode->lock);
89 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); 87 set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
88 vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
90 spin_unlock(&vnode->lock); 89 spin_unlock(&vnode->lock);
91 } 90 }
92 91
@@ -237,52 +236,24 @@ error_no_devname:
237} 236}
238 237
239/* 238/*
240 * follow a link from a mountpoint directory, thus causing it to be mounted 239 * handle an automount point
241 */ 240 */
242static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) 241struct vfsmount *afs_d_automount(struct path *path)
243{ 242{
244 struct vfsmount *newmnt; 243 struct vfsmount *newmnt;
245 int err;
246 244
247 _enter("%p{%s},{%s:%p{%s},}", 245 _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
248 dentry,
249 dentry->d_name.name,
250 nd->path.mnt->mnt_devname,
251 dentry,
252 nd->path.dentry->d_name.name);
253
254 dput(nd->path.dentry);
255 nd->path.dentry = dget(dentry);
256 246
257 newmnt = afs_mntpt_do_automount(nd->path.dentry); 247 newmnt = afs_mntpt_do_automount(path->dentry);
258 if (IS_ERR(newmnt)) { 248 if (IS_ERR(newmnt))
259 path_put(&nd->path); 249 return newmnt;
260 return (void *)newmnt;
261 }
262
263 mntget(newmnt);
264 err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
265 switch (err) {
266 case 0:
267 path_put(&nd->path);
268 nd->path.mnt = newmnt;
269 nd->path.dentry = dget(newmnt->mnt_root);
270 schedule_delayed_work(&afs_mntpt_expiry_timer,
271 afs_mntpt_expiry_timeout * HZ);
272 break;
273 case -EBUSY:
274 /* someone else made a mount here whilst we were busy */
275 while (d_mountpoint(nd->path.dentry) &&
276 follow_down(&nd->path))
277 ;
278 err = 0;
279 default:
280 mntput(newmnt);
281 break;
282 }
283 250
284 _leave(" = %d", err); 251 mntget(newmnt); /* prevent immediate expiration */
285 return ERR_PTR(err); 252 mnt_set_expiry(newmnt, &afs_vfsmounts);
253 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
254 afs_mntpt_expiry_timeout * HZ);
255 _leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
256 return newmnt;
286} 257}
287 258
288/* 259/*
@@ -294,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work)
294 265
295 if (!list_empty(&afs_vfsmounts)) { 266 if (!list_empty(&afs_vfsmounts)) {
296 mark_mounts_for_expiry(&afs_vfsmounts); 267 mark_mounts_for_expiry(&afs_vfsmounts);
297 schedule_delayed_work(&afs_mntpt_expiry_timer, 268 queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
298 afs_mntpt_expiry_timeout * HZ); 269 afs_mntpt_expiry_timeout * HZ);
299 } 270 }
300 271
301 _leave(""); 272 _leave("");
@@ -309,6 +280,5 @@ void afs_mntpt_kill_timer(void)
309 _enter(""); 280 _enter("");
310 281
311 ASSERT(list_empty(&afs_vfsmounts)); 282 ASSERT(list_empty(&afs_vfsmounts));
312 cancel_delayed_work(&afs_mntpt_expiry_timer); 283 cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
313 flush_scheduled_work();
314} 284}
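
The mntpt.c rewrite above moves AFS onto the new automount machinery: the inode is flagged S_AUTOMOUNT, and when a path walk trips over it the VFS calls ->d_automount and splices the returned vfsmount in itself, so the old follow_link/do_add_mount dance disappears. A rough sketch of the contract; apart from d_automount, S_AUTOMOUNT, mntget() and mnt_set_expiry(), the names are hypothetical:

#include <linux/err.h>
#include <linux/list.h>
#include <linux/mount.h>
#include <linux/path.h>

static LIST_HEAD(example_expiry_list);

static struct vfsmount *example_build_mount(struct dentry *mountpoint)
{
	/* a real implementation would vfs_kern_mount() something here */
	return ERR_PTR(-ENOENT);
}

static struct vfsmount *example_d_automount(struct path *path)
{
	struct vfsmount *mnt = example_build_mount(path->dentry);

	if (IS_ERR(mnt))
		return mnt;
	mntget(mnt);			/* hold off immediate expiry */
	mnt_set_expiry(mnt, &example_expiry_list);
	return mnt;
}
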
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 654d8fdbf01f..e45a323aebb4 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
410 if (!call) { 410 if (!call) {
411 /* its an incoming call for our callback service */ 411 /* its an incoming call for our callback service */
412 skb_queue_tail(&afs_incoming_calls, skb); 412 skb_queue_tail(&afs_incoming_calls, skb);
413 schedule_work(&afs_collect_incoming_call_work); 413 queue_work(afs_wq, &afs_collect_incoming_call_work);
414 } else { 414 } else {
415 /* route the messages directly to the appropriate call */ 415 /* route the messages directly to the appropriate call */
416 skb_queue_tail(&call->rx_queue, skb); 416 skb_queue_tail(&call->rx_queue, skb);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index bb4ed144d0e4..f44b9d355377 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
  * - AFS ACLs are attached to directories only, and a file is controlled by its
  *   parent directory's ACL
  */
-int afs_permission(struct inode *inode, int mask)
+int afs_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_access_t uninitialized_var(access);
 	struct key *key;
 	int ret;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	_enter("{{%x:%u},%lx},%x,",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
 
@@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask)
 	}
 
 	key_put(key);
-	ret = generic_permission(inode, mask, NULL);
+	ret = generic_permission(inode, mask, flags, NULL);
 	_leave(" = %d", ret);
 	return ret;
 
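The IPERM_FLAG_RCU check added above follows the RCU path-walk contract: ->permission() may now be called under rcu_read_lock(), where AFS cannot block on keys or the network, so it must return -ECHILD to make the VFS drop out of RCU mode and retry the lookup in ref-walk mode. A sketch of the convention with a hypothetical myfs name (the four-argument generic_permission() form is exactly as used in the hunk):

static int myfs_permission(struct inode *inode, int mask, unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;	/* cannot sleep under rcu_read_lock(); retry in ref-walk */

	/* ... possibly-blocking checks (keys, server ACLs) go here ... */

	return generic_permission(inode, mask, flags, NULL);
}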
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 9fdc7fe3a7bc..d59b7516e943 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server)
 	if (atomic_read(&server->usage) == 0) {
 		list_move_tail(&server->grave, &afs_server_graveyard);
 		server->time_of_death = get_seconds();
-		schedule_delayed_work(&afs_server_reaper,
-				      afs_server_timeout * HZ);
+		queue_delayed_work(afs_wq, &afs_server_reaper,
+				   afs_server_timeout * HZ);
 	}
 	spin_unlock(&afs_server_graveyard_lock);
 	_leave(" [dead]");
@@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work)
 		expiry = server->time_of_death + afs_server_timeout;
 		if (expiry > now) {
 			delay = (expiry - now) * HZ;
-			if (!schedule_delayed_work(&afs_server_reaper, delay)) {
+			if (!queue_delayed_work(afs_wq, &afs_server_reaper,
+						delay)) {
 				cancel_delayed_work(&afs_server_reaper);
-				schedule_delayed_work(&afs_server_reaper,
-						      delay);
+				queue_delayed_work(afs_wq, &afs_server_reaper,
+						   delay);
 			}
 			break;
 		}
@@ -323,5 +324,5 @@ void __exit afs_purge_servers(void)
 {
 	afs_server_timeout = 0;
 	cancel_delayed_work(&afs_server_reaper);
-	schedule_delayed_work(&afs_server_reaper, 0);
+	queue_delayed_work(afs_wq, &afs_server_reaper, 0);
 }
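These hunks, like the rest of the series, move AFS's deferred work off the shared kernel workqueue (schedule_delayed_work()) onto a private afs_wq, so module teardown can flush exactly its own work instead of calling flush_scheduled_work(). A sketch of the setup this assumes; afs_wq is presumably created at module init elsewhere in the series, and the wiring below is illustrative only:

static struct workqueue_struct *example_wq;	/* stands in for afs_wq */

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example", 0, 0);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	/*
	 * Pairs with cancel_delayed_work_sync() on each work item;
	 * together they replace the old cancel_delayed_work() +
	 * flush_scheduled_work() pattern.
	 */
	destroy_workqueue(example_wq);
}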
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 77e1e5a61154..fb240e8766d6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -19,7 +19,6 @@
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
@@ -30,9 +29,8 @@
 #define AFS_FS_MAGIC	0x6B414653 /* 'kAFS' */
 
 static void afs_i_init_once(void *foo);
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name,
-		      void *data, struct vfsmount *mnt);
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data);
 static struct inode *afs_alloc_inode(struct super_block *sb);
 static void afs_put_super(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
@@ -41,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
-	.get_sb		= afs_get_sb,
+	.mount		= afs_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= 0,
 };
@@ -338,6 +336,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	if (!root)
 		goto error;
 
+	sb->s_d_op = &afs_fs_dentry_operations;
 	sb->s_root = root;
 
 	_leave(" = 0");
@@ -360,11 +359,8 @@ error:
 /*
  * get an AFS superblock
  */
-static int afs_get_sb(struct file_system_type *fs_type,
-		      int flags,
-		      const char *dev_name,
-		      void *options,
-		      struct vfsmount *mnt)
+static struct dentry *afs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *options)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
@@ -428,12 +424,11 @@ static int afs_get_sb(struct file_system_type *fs_type,
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
 	}
 
-	simple_set_mnt(mnt, sb);
 	afs_put_volume(params.volume);
 	afs_put_cell(params.cell);
 	kfree(new_opts);
 	_leave(" = 0 [%p]", sb);
-	return 0;
+	return dget(sb->s_root);
 
 error:
 	afs_put_volume(params.volume);
@@ -441,7 +436,7 @@ error:
 	key_put(params.key);
 	kfree(new_opts);
 	_leave(" = %d", ret);
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /*
@@ -453,12 +448,8 @@ static void afs_put_super(struct super_block *sb)
 
 	_enter("");
 
-	lock_kernel();
-
 	afs_put_volume(as->volume);
 
-	unlock_kernel();
-
 	_leave("");
 }
 
@@ -508,6 +499,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	return &vnode->vfs_inode;
 }
 
+static void afs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(afs_inode_cachep, vnode);
+}
+
 /*
  * destroy an AFS inode struct
  */
@@ -521,7 +520,7 @@ static void afs_destroy_inode(struct inode *inode)
 
 	ASSERTCMP(vnode->server, ==, NULL);
 
-	kmem_cache_free(afs_inode_cachep, vnode);
+	call_rcu(&inode->i_rcu, afs_i_callback);
 	atomic_dec(&afs_count_active_inodes);
 }
 
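The last two hunks above defer freeing of the inode until after an RCU grace period, which is required once lockless (RCU-walk) path lookup can still be examining the dentry and inode when ->destroy_inode() runs. The pattern, reduced to its essentials with hypothetical myfs_* names (MYFS_I() and myfs_inode_cachep stand in for the filesystem's own container accessor and slab cache):

static void myfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(myfs_inode_cachep, MYFS_I(inode));
}

static void myfs_destroy_inode(struct inode *inode)
{
	/* the actual free happens only after concurrent RCU readers finish */
	call_rcu(&inode->i_rcu, myfs_i_callback);
}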
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 9ac260d1361d..431984d2e372 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl)
507 _debug("buried"); 507 _debug("buried");
508 list_move_tail(&vl->grave, &afs_vlocation_graveyard); 508 list_move_tail(&vl->grave, &afs_vlocation_graveyard);
509 vl->time_of_death = get_seconds(); 509 vl->time_of_death = get_seconds();
510 schedule_delayed_work(&afs_vlocation_reap, 510 queue_delayed_work(afs_wq, &afs_vlocation_reap,
511 afs_vlocation_timeout * HZ); 511 afs_vlocation_timeout * HZ);
512 512
513 /* suspend updates on this record */ 513 /* suspend updates on this record */
514 if (!list_empty(&vl->update)) { 514 if (!list_empty(&vl->update)) {
@@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work)
561 if (expiry > now) { 561 if (expiry > now) {
562 delay = (expiry - now) * HZ; 562 delay = (expiry - now) * HZ;
563 _debug("delay %lu", delay); 563 _debug("delay %lu", delay);
564 if (!schedule_delayed_work(&afs_vlocation_reap, 564 if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
565 delay)) { 565 delay)) {
566 cancel_delayed_work(&afs_vlocation_reap); 566 cancel_delayed_work(&afs_vlocation_reap);
567 schedule_delayed_work(&afs_vlocation_reap, 567 queue_delayed_work(afs_wq, &afs_vlocation_reap,
568 delay); 568 delay);
569 } 569 }
570 break; 570 break;
571 } 571 }
@@ -620,7 +620,7 @@ void afs_vlocation_purge(void)
620 destroy_workqueue(afs_vlocation_update_worker); 620 destroy_workqueue(afs_vlocation_update_worker);
621 621
622 cancel_delayed_work(&afs_vlocation_reap); 622 cancel_delayed_work(&afs_vlocation_reap);
623 schedule_delayed_work(&afs_vlocation_reap, 0); 623 queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
624} 624}
625 625
626/* 626/*
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 722743b152d8..15690bb1d3b5 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -438,7 +438,6 @@ no_more:
  */
 int afs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = page->mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	int ret;
 
@@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	wbc->nr_to_write -= ret;
-	if (wbc->nonblocking && bdi_write_congested(bdi))
-		wbc->encountered_congestion = 1;
 
 	_leave(" = 0");
 	return 0;
@@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping,
 				 struct writeback_control *wbc,
 				 pgoff_t index, pgoff_t end, pgoff_t *_next)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	struct page *page;
 	int ret, n;
@@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping,
 
 		wbc->nr_to_write -= ret;
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
-			wbc->encountered_congestion = 1;
-			break;
-		}
-
 		cond_resched();
 	} while (index < end && wbc->nr_to_write > 0);
 
@@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping,
 int afs_writepages(struct address_space *mapping,
 		   struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	pgoff_t start, end, next;
 	int ret;
 
 	_enter("");
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		_leave(" = 0 [congest]");
-		return 0;
-	}
-
 	if (wbc->range_cyclic) {
 		start = mapping->writeback_index;
 		end = -1;
 		ret = afs_writepages_region(mapping, wbc, start, end, &next);
-		if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
-		    !(wbc->nonblocking && wbc->encountered_congestion))
+		if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
 			ret = afs_writepages_region(mapping, wbc, 0, start,
 						    &next);
 		mapping->writeback_index = next;
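These hunks track the core writeback change that retired wbc->nonblocking and wbc->encountered_congestion; with the congestion checks gone, what remains is the standard range_cyclic sweep: write from the remembered index to the end, then wrap around once. A sketch of that shape with a hypothetical myfs_writepages_region() (the hunk only shows the range_cyclic branch; the sketch mirrors it):

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	pgoff_t start = mapping->writeback_index, next = 0;
	int ret;

	/* first pass: remembered position to end of file */
	ret = myfs_writepages_region(mapping, wbc, start, (pgoff_t)-1, &next);
	if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
		/* second pass: wrap to cover pages before the start index */
		ret = myfs_writepages_region(mapping, wbc, 0, start, &next);
	mapping->writeback_index = next;	/* resume here next time */
	return ret;
}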
diff --git a/fs/aio.c b/fs/aio.c
index 250b0a73c8a8..fc557a3be0a9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -87,7 +87,7 @@ static int __init aio_setup(void)
 
 	aio_wq = create_workqueue("aio");
 	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
-	BUG_ON(!abe_pool);
+	BUG_ON(!aio_wq || !abe_pool);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx)
 	queue_delayed_work(aio_wq, &ctx->wq, timeout);
 }
 
-
 /*
- * aio_run_iocbs:
- * 	Process all pending retries queued on the ioctx
- * 	run list.
- * 	Assumes it is operating within the aio issuer's mm
- * 	context.
- */
-static inline void aio_run_iocbs(struct kioctx *ctx)
-{
-	int requeue;
-
-	spin_lock_irq(&ctx->ctx_lock);
-
-	requeue = __aio_run_iocbs(ctx);
-	spin_unlock_irq(&ctx->ctx_lock);
-	if (requeue)
-		aio_queue_work(ctx);
-}
-
-/*
- * just like aio_run_iocbs, but keeps running them until
- * the list stays empty
+ * aio_run_all_iocbs:
+ *	Process all pending retries queued on the ioctx
+ *	run list, and keep running them until the list
+ *	stays empty.
+ *	Assumes it is operating within the aio issuer's mm context.
  */
 static inline void aio_run_all_iocbs(struct kioctx *ctx)
 {
@@ -1543,7 +1526,19 @@ static void aio_batch_add(struct address_space *mapping,
 	}
 
 	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-	BUG_ON(!igrab(mapping->host));
+
+	/*
+	 * we should be using igrab here, but
+	 * we don't want to hammer on the global
+	 * inode spinlock just to take an extra
+	 * reference on a file that we must already
+	 * have a reference to.
+	 *
+	 * When we're called, we always have a reference
+	 * on the file, so we must always have a reference
+	 * on the inode, so ihold() is safe here.
+	 */
+	ihold(mapping->host);
 	abe->mapping = mapping;
 	hlist_add_head(&abe->list, &batch_hash[bucket]);
 	return;
@@ -1827,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	long ret = -EINVAL;
 
 	if (likely(ioctx)) {
-		if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
+		if (likely(min_nr <= nr && min_nr >= 0))
 			ret = read_events(ioctx, min_nr, nr, events, timeout);
 		put_ioctx(ioctx);
 	}
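The comment added in the aio_batch_add() hunk captures the igrab() vs ihold() distinction: igrab() serializes on the global inode lock and can fail (returning NULL) if the inode is being evicted, while ihold() is a plain reference bump that is only legal when the caller already holds a reference keeping the inode alive. An illustrative use, not from the patch:

static void pin_inode_of_open_file(struct file *filp)
{
	struct inode *inode = filp->f_mapping->host;

	/*
	 * filp already pins the inode, so a bare ihold() cannot race
	 * with eviction; drop the extra reference with iput() later.
	 */
	ihold(inode);
}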
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..c5567cb78432 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,14 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
 static const struct file_operations anon_inode_fops;
 
-static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
-			       const char *dev_name, void *data,
-			       struct vfsmount *mnt)
-{
-	return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
-			     mnt);
-}
-
 /*
  * anon_inodefs_dname() is called from d_path().
  */
@@ -43,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 			     dentry->d_name.name);
 }
 
+static const struct dentry_operations anon_inodefs_dentry_operations = {
+	.d_dname	= anon_inodefs_dname,
+};
+
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
+{
+	return mount_pseudo(fs_type, "anon_inode:", NULL,
+			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+}
+
 static struct file_system_type anon_inode_fs_type = {
 	.name		= "anon_inodefs",
-	.get_sb		= anon_inodefs_get_sb,
+	.mount		= anon_inodefs_mount,
 	.kill_sb	= kill_anon_super,
 };
-static const struct dentry_operations anon_inodefs_dentry_operations = {
-	.d_dname	= anon_inodefs_dname,
-};
 
 /*
  * nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -66,9 +66,9 @@ static const struct address_space_operations anon_aops = {
 };
 
 /**
- * anon_inode_getfd - creates a new file instance by hooking it up to an
- *		      anonymous inode, and a dentry that describe the "class"
- *		      of the file
+ * anon_inode_getfile - creates a new file instance by hooking it up to an
+ *			anonymous inode, and a dentry that describe the "class"
+ *			of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
@@ -104,19 +104,17 @@ struct file *anon_inode_getfile(const char *name,
 	this.name = name;
 	this.len = strlen(name);
 	this.hash = 0;
-	path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
 	if (!path.dentry)
 		goto err_module;
 
 	path.mnt = mntget(anon_inode_mnt);
 	/*
 	 * We know the anon_inode inode count is always greater than zero,
-	 * so we can avoid doing an igrab() and we can use an open-coded
-	 * atomic_inc().
+	 * so ihold() is safe.
 	 */
-	atomic_inc(&anon_inode_inode->i_count);
+	ihold(anon_inode_inode);
 
-	path.dentry->d_op = &anon_inodefs_dentry_operations;
 	d_instantiate(path.dentry, anon_inode_inode);
 
 	error = -ENFILE;
@@ -194,6 +192,7 @@ static struct inode *anon_inode_mkinode(void)
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	inode->i_ino = get_next_ino();
 	inode->i_fop = &anon_inode_fops;
 
 	inode->i_mapping->a_ops = &anon_aops;
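Taken together, the anon_inodefs hunks show the new pseudo-filesystem boilerplate: ->mount returns the root dentry directly, mount_pseudo() installs the dentry_operations on the superblock so dentries from d_alloc_pseudo() inherit them (no more manual d_op assignment), and inode numbers come from get_next_ino(). A condensed sketch with hypothetical examplefs names and a made-up magic number:

static const struct dentry_operations examplefs_dops = {
	/* e.g. .d_dname for d_path() output on these dentries */
};

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	/* 0x6578616d ("exam") is a made-up magic number */
	return mount_pseudo(fs_type, "example:", NULL, &examplefs_dops,
			    0x6578616d);
}

static struct file_system_type examplefs_type = {
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_anon_super,
};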
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
deleted file mode 100644
index 5f3bea90911e..000000000000
--- a/fs/autofs/Kconfig
+++ /dev/null
@@ -1,21 +0,0 @@
1config AUTOFS_FS
2 tristate "Kernel automounter support"
3 help
4 The automounter is a tool to automatically mount remote file systems
5 on demand. This implementation is partially kernel-based to reduce
6 overhead in the already-mounted case; this is unlike the BSD
7 automounter (amd), which is a pure user space daemon.
8
9 To use the automounter you need the user-space tools from the autofs
10 package; you can find the location in <file:Documentation/Changes>.
11 You also want to answer Y to "NFS file system support", below.
12
13 If you want to use the newer version of the automounter with more
14 features, say N here and say Y to "Kernel automounter v4 support",
15 below.
16
17 To compile this support as a module, choose M here: the module will be
18 called autofs.
19
20 If you are not a part of a fairly large, distributed network, you
21 probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
deleted file mode 100644
index 453a60f46d05..000000000000
--- a/fs/autofs/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
1#
2# Makefile for the linux autofs-filesystem routines.
3#
4
5obj-$(CONFIG_AUTOFS_FS) += autofs.o
6
7autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
deleted file mode 100644
index 901a3e67ec45..000000000000
--- a/fs/autofs/autofs_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * linux/fs/autofs/autofs_i.h
4 *
5 * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/* Internal header file for autofs */
14
15#include <linux/auto_fs.h>
16
17/* This is the range of ioctl() numbers we claim as ours */
18#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
19#define AUTOFS_IOC_COUNT 32
20
21#include <linux/kernel.h>
22#include <linux/slab.h>
23#include <linux/time.h>
24#include <linux/string.h>
25#include <linux/wait.h>
26#include <linux/dcache.h>
27#include <linux/namei.h>
28#include <linux/mount.h>
29#include <linux/sched.h>
30
31#include <asm/current.h>
32#include <asm/uaccess.h>
33
34#ifdef DEBUG
35#define DPRINTK(D) (printk D)
36#else
37#define DPRINTK(D) ((void)0)
38#endif
39
40/*
41 * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
42 * kernel will keep the negative response cached for up to the time given
43 * here, although the time can be shorter if the kernel throws the dcache
44 * entry away. This probably should be settable from user space.
45 */
46#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ) /* 1 minute */
47
48/* Structures associated with the root directory hash table */
49
50#define AUTOFS_HASH_SIZE 67
51
52struct autofs_dir_ent {
53 int hash;
54 char *name;
55 int len;
56 ino_t ino;
57 struct dentry *dentry;
58 /* Linked list of entries */
59 struct autofs_dir_ent *next;
60 struct autofs_dir_ent **back;
61 /* The following entries are for the expiry system */
62 unsigned long last_usage;
63 struct list_head exp;
64};
65
66struct autofs_dirhash {
67 struct autofs_dir_ent *h[AUTOFS_HASH_SIZE];
68 struct list_head expiry_head;
69};
70
71struct autofs_wait_queue {
72 wait_queue_head_t queue;
73 struct autofs_wait_queue *next;
74 autofs_wqt_t wait_queue_token;
75 /* We use the following to see what we are waiting for */
76 int hash;
77 int len;
78 char *name;
79 /* This is for status reporting upon return */
80 int status;
81 int wait_ctr;
82};
83
84struct autofs_symlink {
85 char *data;
86 int len;
87 time_t mtime;
88};
89
90#define AUTOFS_MAX_SYMLINKS 256
91
92#define AUTOFS_ROOT_INO 1
93#define AUTOFS_FIRST_SYMLINK 2
94#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS)
95
96#define AUTOFS_SYMLINK_BITMAP_LEN \
97 ((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8))
98
99#define AUTOFS_SBI_MAGIC 0x6d4a556d
100
101struct autofs_sb_info {
102 u32 magic;
103 struct file *pipe;
104 struct pid *oz_pgrp;
105 int catatonic;
106 struct super_block *sb;
107 unsigned long exp_timeout;
108 ino_t next_dir_ino;
109 struct autofs_wait_queue *queues; /* Wait queue pointer */
110 struct autofs_dirhash dirhash; /* Root directory hash */
111 struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS];
112 unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN];
113};
114
115static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
116{
117 return (struct autofs_sb_info *)(sb->s_fs_info);
118}
119
120/* autofs_oz_mode(): do we see the man behind the curtain? (The
121 processes which do manipulations for us in user space sees the raw
122 filesystem without "magic".) */
123
124static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
125 return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
126}
127
128/* Hash operations */
129
130void autofs_initialize_hash(struct autofs_dirhash *);
131struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *);
132void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
133void autofs_hash_delete(struct autofs_dir_ent *);
134struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
135void autofs_hash_dputall(struct autofs_dirhash *);
136void autofs_hash_nuke(struct autofs_sb_info *);
137
138/* Expiration-handling functions */
139
140void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *);
141struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt);
142
143/* Operations structures */
144
145extern const struct inode_operations autofs_root_inode_operations;
146extern const struct inode_operations autofs_symlink_inode_operations;
147extern const struct file_operations autofs_root_operations;
148
149/* Initializing function */
150
151int autofs_fill_super(struct super_block *, void *, int);
152void autofs_kill_sb(struct super_block *sb);
153struct inode *autofs_iget(struct super_block *, unsigned long);
154
155/* Queue management functions */
156
157int autofs_wait(struct autofs_sb_info *,struct qstr *);
158int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
159void autofs_catatonic_mode(struct autofs_sb_info *);
160
161#ifdef DEBUG
162void autofs_say(const char *name, int len);
163#else
164#define autofs_say(n,l) ((void)0)
165#endif
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
deleted file mode 100644
index e947915109e5..000000000000
--- a/fs/autofs/dirhash.c
+++ /dev/null
@@ -1,250 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/dirhash.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include "autofs_i.h"
14
15/* Functions for maintenance of expiry queue */
16
17static void autofs_init_usage(struct autofs_dirhash *dh,
18 struct autofs_dir_ent *ent)
19{
20 list_add_tail(&ent->exp, &dh->expiry_head);
21 ent->last_usage = jiffies;
22}
23
24static void autofs_delete_usage(struct autofs_dir_ent *ent)
25{
26 list_del(&ent->exp);
27}
28
29void autofs_update_usage(struct autofs_dirhash *dh,
30 struct autofs_dir_ent *ent)
31{
32 autofs_delete_usage(ent); /* Unlink from current position */
33 autofs_init_usage(dh,ent); /* Relink at queue tail */
34}
35
36struct autofs_dir_ent *autofs_expire(struct super_block *sb,
37 struct autofs_sb_info *sbi,
38 struct vfsmount *mnt)
39{
40 struct autofs_dirhash *dh = &sbi->dirhash;
41 struct autofs_dir_ent *ent;
42 unsigned long timeout = sbi->exp_timeout;
43
44 while (1) {
45 struct path path;
46 int umount_ok;
47
48 if ( list_empty(&dh->expiry_head) || sbi->catatonic )
49 return NULL; /* No entries */
50 /* We keep the list sorted by last_usage and want old stuff */
51 ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp);
52 if (jiffies - ent->last_usage < timeout)
53 break;
54 /* Move to end of list in case expiry isn't desirable */
55 autofs_update_usage(dh, ent);
56
57 /* Check to see that entry is expirable */
58 if ( ent->ino < AUTOFS_FIRST_DIR_INO )
59 return ent; /* Symlinks are always expirable */
60
61 /* Get the dentry for the autofs subdirectory */
62 path.dentry = ent->dentry;
63
64 if (!path.dentry) {
65 /* Should only happen in catatonic mode */
66 printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
67 autofs_delete_usage(ent);
68 continue;
69 }
70
71 if (!path.dentry->d_inode) {
72 dput(path.dentry);
73 printk("autofs: negative dentry on expiry queue: %s\n",
74 ent->name);
75 autofs_delete_usage(ent);
76 continue;
77 }
78
79 /* Make sure entry is mounted and unused; note that dentry will
80 point to the mounted-on-top root. */
81 if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
82 !d_mountpoint(path.dentry)) {
83 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
84 continue;
85 }
86 path.mnt = mnt;
87 path_get(&path);
88 if (!follow_down(&path)) {
89 path_put(&path);
90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
91 continue;
92 }
93 while (d_mountpoint(path.dentry) && follow_down(&path))
94 ;
95 umount_ok = may_umount(path.mnt);
96 path_put(&path);
97
98 if (umount_ok) {
99 DPRINTK(("autofs: signaling expire on %s\n", ent->name));
100 return ent; /* Expirable! */
101 }
102 DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
103 }
104 return NULL; /* No expirable entries */
105}
106
107void autofs_initialize_hash(struct autofs_dirhash *dh) {
108 memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *));
109 INIT_LIST_HEAD(&dh->expiry_head);
110}
111
112struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name)
113{
114 struct autofs_dir_ent *dhn;
115
116 DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash));
117 autofs_say(name->name,name->len);
118
119 for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) {
120 if ( name->hash == dhn->hash &&
121 name->len == dhn->len &&
122 !memcmp(name->name, dhn->name, name->len) )
123 break;
124 }
125
126 return dhn;
127}
128
129void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent)
130{
131 struct autofs_dir_ent **dhnp;
132
133 DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash));
134 autofs_say(ent->name,ent->len);
135
136 autofs_init_usage(dh,ent);
137 if (ent->dentry)
138 dget(ent->dentry);
139
140 dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE];
141 ent->next = *dhnp;
142 ent->back = dhnp;
143 *dhnp = ent;
144 if ( ent->next )
145 ent->next->back = &(ent->next);
146}
147
148void autofs_hash_delete(struct autofs_dir_ent *ent)
149{
150 *(ent->back) = ent->next;
151 if ( ent->next )
152 ent->next->back = ent->back;
153
154 autofs_delete_usage(ent);
155
156 if ( ent->dentry )
157 dput(ent->dentry);
158 kfree(ent->name);
159 kfree(ent);
160}
161
162/*
163 * Used by readdir(). We must validate "ptr", so we can't simply make it
164 * a pointer. Values below 0xffff are reserved; calling with any value
165 * <= 0x10000 will return the first entry found.
166 *
167 * "last" can be NULL or the value returned by the last search *if* we
168 * want the next sequential entry.
169 */
170struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh,
171 off_t *ptr, struct autofs_dir_ent *last)
172{
173 int bucket, ecount, i;
174 struct autofs_dir_ent *ent;
175
176 bucket = (*ptr >> 16) - 1;
177 ecount = *ptr & 0xffff;
178
179 if ( bucket < 0 ) {
180 bucket = ecount = 0;
181 }
182
183 DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount));
184
185 ent = last ? last->next : NULL;
186
187 if ( ent ) {
188 ecount++;
189 } else {
190 while ( bucket < AUTOFS_HASH_SIZE ) {
191 ent = dh->h[bucket];
192 for ( i = ecount ; ent && i ; i-- )
193 ent = ent->next;
194
195 if (ent) {
196 ecount++; /* Point to *next* entry */
197 break;
198 }
199
200 bucket++; ecount = 0;
201 }
202 }
203
204#ifdef DEBUG
205 if ( !ent )
206 printk("autofs_hash_enum: nothing found\n");
207 else {
208 printk("autofs_hash_enum: found hash %08x, name", ent->hash);
209 autofs_say(ent->name,ent->len);
210 }
211#endif
212
213 *ptr = ((bucket+1) << 16) + ecount;
214 return ent;
215}
216
217/* Iterate over all the ents, and remove all dentry pointers. Used on
218 entering catatonic mode, in order to make the filesystem unmountable. */
219void autofs_hash_dputall(struct autofs_dirhash *dh)
220{
221 int i;
222 struct autofs_dir_ent *ent;
223
224 for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
225 for ( ent = dh->h[i] ; ent ; ent = ent->next ) {
226 if ( ent->dentry ) {
227 dput(ent->dentry);
228 ent->dentry = NULL;
229 }
230 }
231 }
232}
233
234/* Delete everything. This is used on filesystem destruction, so we
235 make no attempt to keep the pointers valid */
236void autofs_hash_nuke(struct autofs_sb_info *sbi)
237{
238 int i;
239 struct autofs_dir_ent *ent, *nent;
240
241 for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
242 for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
243 nent = ent->next;
244 if ( ent->dentry )
245 dput(ent->dentry);
246 kfree(ent->name);
247 kfree(ent);
248 }
249 }
250}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
deleted file mode 100644
index cea5219b4f37..000000000000
--- a/fs/autofs/init.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/init.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include "autofs_i.h"
16
17static int autofs_get_sb(struct file_system_type *fs_type,
18 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
19{
20 return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
21}
22
23static struct file_system_type autofs_fs_type = {
24 .owner = THIS_MODULE,
25 .name = "autofs",
26 .get_sb = autofs_get_sb,
27 .kill_sb = autofs_kill_sb,
28};
29
30static int __init init_autofs_fs(void)
31{
32 return register_filesystem(&autofs_fs_type);
33}
34
35static void __exit exit_autofs_fs(void)
36{
37 unregister_filesystem(&autofs_fs_type);
38}
39
40module_init(init_autofs_fs);
41module_exit(exit_autofs_fs);
42
43#ifdef DEBUG
44void autofs_say(const char *name, int len)
45{
46 printk("(%d: ", len);
47 while ( len-- )
48 printk("%c", *name++);
49 printk(")\n");
50}
51#endif
52MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
deleted file mode 100644
index e1734f2d6e26..000000000000
--- a/fs/autofs/inode.c
+++ /dev/null
@@ -1,288 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/inode.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/kernel.h>
14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/file.h>
17#include <linux/parser.h>
18#include <linux/bitops.h>
19#include <linux/magic.h>
20#include "autofs_i.h"
21#include <linux/module.h>
22
23void autofs_kill_sb(struct super_block *sb)
24{
25 struct autofs_sb_info *sbi = autofs_sbi(sb);
26 unsigned int n;
27
28 /*
29 * In the event of a failure in get_sb_nodev the superblock
30 * info is not present so nothing else has been setup, so
31 * just call kill_anon_super when we are called from
32 * deactivate_super.
33 */
34 if (!sbi)
35 goto out_kill_sb;
36
37 if (!sbi->catatonic)
38 autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
39
40 put_pid(sbi->oz_pgrp);
41
42 autofs_hash_nuke(sbi);
43 for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
44 if (test_bit(n, sbi->symlink_bitmap))
45 kfree(sbi->symlink[n].data);
46 }
47
48 kfree(sb->s_fs_info);
49
50out_kill_sb:
51 DPRINTK(("autofs: shutting down\n"));
52 kill_anon_super(sb);
53}
54
55static const struct super_operations autofs_sops = {
56 .statfs = simple_statfs,
57 .show_options = generic_show_options,
58};
59
60enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
61
62static const match_table_t autofs_tokens = {
63 {Opt_fd, "fd=%u"},
64 {Opt_uid, "uid=%u"},
65 {Opt_gid, "gid=%u"},
66 {Opt_pgrp, "pgrp=%u"},
67 {Opt_minproto, "minproto=%u"},
68 {Opt_maxproto, "maxproto=%u"},
69 {Opt_err, NULL}
70};
71
72static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
73 pid_t *pgrp, int *minproto, int *maxproto)
74{
75 char *p;
76 substring_t args[MAX_OPT_ARGS];
77 int option;
78
79 *uid = current_uid();
80 *gid = current_gid();
81 *pgrp = task_pgrp_nr(current);
82
83 *minproto = *maxproto = AUTOFS_PROTO_VERSION;
84
85 *pipefd = -1;
86
87 if (!options)
88 return 1;
89
90 while ((p = strsep(&options, ",")) != NULL) {
91 int token;
92 if (!*p)
93 continue;
94
95 token = match_token(p, autofs_tokens, args);
96 switch (token) {
97 case Opt_fd:
98 if (match_int(&args[0], &option))
99 return 1;
100 *pipefd = option;
101 break;
102 case Opt_uid:
103 if (match_int(&args[0], &option))
104 return 1;
105 *uid = option;
106 break;
107 case Opt_gid:
108 if (match_int(&args[0], &option))
109 return 1;
110 *gid = option;
111 break;
112 case Opt_pgrp:
113 if (match_int(&args[0], &option))
114 return 1;
115 *pgrp = option;
116 break;
117 case Opt_minproto:
118 if (match_int(&args[0], &option))
119 return 1;
120 *minproto = option;
121 break;
122 case Opt_maxproto:
123 if (match_int(&args[0], &option))
124 return 1;
125 *maxproto = option;
126 break;
127 default:
128 return 1;
129 }
130 }
131 return (*pipefd < 0);
132}
133
134int autofs_fill_super(struct super_block *s, void *data, int silent)
135{
136 struct inode * root_inode;
137 struct dentry * root;
138 struct file * pipe;
139 int pipefd;
140 struct autofs_sb_info *sbi;
141 int minproto, maxproto;
142 pid_t pgid;
143
144 save_mount_options(s, data);
145
146 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
147 if (!sbi)
148 goto fail_unlock;
149 DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
150
151 s->s_fs_info = sbi;
152 sbi->magic = AUTOFS_SBI_MAGIC;
153 sbi->pipe = NULL;
154 sbi->catatonic = 1;
155 sbi->exp_timeout = 0;
156 autofs_initialize_hash(&sbi->dirhash);
157 sbi->queues = NULL;
158 memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
159 sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO;
160 s->s_blocksize = 1024;
161 s->s_blocksize_bits = 10;
162 s->s_magic = AUTOFS_SUPER_MAGIC;
163 s->s_op = &autofs_sops;
164 s->s_time_gran = 1;
165 sbi->sb = s;
166
167 root_inode = autofs_iget(s, AUTOFS_ROOT_INO);
168 if (IS_ERR(root_inode))
169 goto fail_free;
170 root = d_alloc_root(root_inode);
171 pipe = NULL;
172
173 if (!root)
174 goto fail_iput;
175
176 /* Can this call block? - WTF cares? s is locked. */
177 if (parse_options(data, &pipefd, &root_inode->i_uid,
178 &root_inode->i_gid, &pgid, &minproto,
179 &maxproto)) {
180 printk("autofs: called with bogus options\n");
181 goto fail_dput;
182 }
183
184 /* Couldn't this be tested earlier? */
185 if (minproto > AUTOFS_PROTO_VERSION ||
186 maxproto < AUTOFS_PROTO_VERSION) {
187 printk("autofs: kernel does not match daemon version\n");
188 goto fail_dput;
189 }
190
191 DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
192 sbi->oz_pgrp = find_get_pid(pgid);
193
194 if (!sbi->oz_pgrp) {
195 printk("autofs: could not find process group %d\n", pgid);
196 goto fail_dput;
197 }
198
199 pipe = fget(pipefd);
200
201 if (!pipe) {
202 printk("autofs: could not open pipe file descriptor\n");
203 goto fail_put_pid;
204 }
205
206 if (!pipe->f_op || !pipe->f_op->write)
207 goto fail_fput;
208 sbi->pipe = pipe;
209 sbi->catatonic = 0;
210
211 /*
212 * Success! Install the root dentry now to indicate completion.
213 */
214 s->s_root = root;
215 return 0;
216
217fail_fput:
218 printk("autofs: pipe file descriptor does not contain proper ops\n");
219 fput(pipe);
220fail_put_pid:
221 put_pid(sbi->oz_pgrp);
222fail_dput:
223 dput(root);
224 goto fail_free;
225fail_iput:
226 printk("autofs: get root dentry failed\n");
227 iput(root_inode);
228fail_free:
229 kfree(sbi);
230 s->s_fs_info = NULL;
231fail_unlock:
232 return -EINVAL;
233}
234
235struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
236{
237 unsigned int n;
238 struct autofs_sb_info *sbi = autofs_sbi(sb);
239 struct inode *inode;
240
241 inode = iget_locked(sb, ino);
242 if (!inode)
243 return ERR_PTR(-ENOMEM);
244 if (!(inode->i_state & I_NEW))
245 return inode;
246
247 /* Initialize to the default case (stub directory) */
248
249 inode->i_op = &simple_dir_inode_operations;
250 inode->i_fop = &simple_dir_operations;
251 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
252 inode->i_nlink = 2;
253 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
254
255 if (ino == AUTOFS_ROOT_INO) {
256 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
257 inode->i_op = &autofs_root_inode_operations;
258 inode->i_fop = &autofs_root_operations;
259 goto done;
260 }
261
262 inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
263 inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
264
265 if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
266 /* Symlink inode - should be in symlink list */
267 struct autofs_symlink *sl;
268
269 n = ino - AUTOFS_FIRST_SYMLINK;
270 if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
271 printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
272 goto done;
273 }
274
275 inode->i_op = &autofs_symlink_inode_operations;
276 sl = &sbi->symlink[n];
277 inode->i_private = sl;
278 inode->i_mode = S_IFLNK | S_IRWXUGO;
279 inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
280 inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
281 inode->i_size = sl->len;
282 inode->i_nlink = 1;
283 }
284
285done:
286 unlock_new_inode(inode);
287 return inode;
288}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
deleted file mode 100644
index 11b1ea786d00..000000000000
--- a/fs/autofs/root.c
+++ /dev/null
@@ -1,643 +0,0 @@
1/* -*- linux-c -*- --------------------------------------------------------- *
2 *
3 * linux/fs/autofs/root.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 *
7 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference.
10 *
11 * ------------------------------------------------------------------------- */
12
13#include <linux/capability.h>
14#include <linux/errno.h>
15#include <linux/stat.h>
16#include <linux/slab.h>
17#include <linux/param.h>
18#include <linux/time.h>
19#include <linux/compat.h>
20#include <linux/smp_lock.h>
21#include "autofs_i.h"
22
23static int autofs_root_readdir(struct file *,void *,filldir_t);
24static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *);
25static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
26static int autofs_root_unlink(struct inode *,struct dentry *);
27static int autofs_root_rmdir(struct inode *,struct dentry *);
28static int autofs_root_mkdir(struct inode *,struct dentry *,int);
29static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
30static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
31
32const struct file_operations autofs_root_operations = {
33 .llseek = generic_file_llseek,
34 .read = generic_read_dir,
35 .readdir = autofs_root_readdir,
36 .unlocked_ioctl = autofs_root_ioctl,
37#ifdef CONFIG_COMPAT
38 .compat_ioctl = autofs_root_compat_ioctl,
39#endif
40};
41
42const struct inode_operations autofs_root_inode_operations = {
43 .lookup = autofs_root_lookup,
44 .unlink = autofs_root_unlink,
45 .symlink = autofs_root_symlink,
46 .mkdir = autofs_root_mkdir,
47 .rmdir = autofs_root_rmdir,
48};
49
50static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
51{
52 struct autofs_dir_ent *ent = NULL;
53 struct autofs_dirhash *dirhash;
54 struct autofs_sb_info *sbi;
55 struct inode * inode = filp->f_path.dentry->d_inode;
56 off_t onr, nr;
57
58 lock_kernel();
59
60 sbi = autofs_sbi(inode->i_sb);
61 dirhash = &sbi->dirhash;
62 nr = filp->f_pos;
63
64 switch(nr)
65 {
66 case 0:
67 if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
68 goto out;
69 filp->f_pos = ++nr;
70 /* fall through */
71 case 1:
72 if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
73 goto out;
74 filp->f_pos = ++nr;
75 /* fall through */
76 default:
77 while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
78 if (!ent->dentry || d_mountpoint(ent->dentry)) {
79 if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
80 goto out;
81 filp->f_pos = nr;
82 }
83 }
84 break;
85 }
86
87out:
88 unlock_kernel();
89 return 0;
90}
91
92static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi)
93{
94 struct inode * inode;
95 struct autofs_dir_ent *ent;
96 int status = 0;
97
98 if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
99 do {
100 if (status && dentry->d_inode) {
101 if (status != -ENOENT)
102 printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
103 return 0; /* Try to get the kernel to invalidate this dentry */
104 }
105
106 /* Turn this into a real negative dentry? */
107 if (status == -ENOENT) {
108 dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT;
109 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
110 return 1;
111 } else if (status) {
112 /* Return a negative dentry, but leave it "pending" */
113 return 1;
114 }
115 status = autofs_wait(sbi, &dentry->d_name);
116 } while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
117 }
118
119 /* Abuse this field as a pointer to the directory entry, used to
120 find the expire list pointers */
121 dentry->d_time = (unsigned long) ent;
122
123 if (!dentry->d_inode) {
124 inode = autofs_iget(sb, ent->ino);
125 if (IS_ERR(inode)) {
126 /* Failed, but leave pending for next time */
127 return 1;
128 }
129 dentry->d_inode = inode;
130 }
131
132 /* If this is a directory that isn't a mount point, bitch at the
133 daemon and fix it in user space */
134 if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
135 return !autofs_wait(sbi, &dentry->d_name);
136 }
137
138 /* We don't update the usages for the autofs daemon itself, this
139 is necessary for recursive autofs mounts */
140 if (!autofs_oz_mode(sbi)) {
141 autofs_update_usage(&sbi->dirhash,ent);
142 }
143
144 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
145 return 1;
146}
147
148
149/*
150 * Revalidate is called on every cache lookup. Some of those
151 * cache lookups may actually happen while the dentry is not
152 * yet completely filled in, and revalidate has to delay such
153 * lookups..
154 */
155static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
156{
157 struct inode * dir;
158 struct autofs_sb_info *sbi;
159 struct autofs_dir_ent *ent;
160 int res;
161
162 lock_kernel();
163 dir = dentry->d_parent->d_inode;
164 sbi = autofs_sbi(dir->i_sb);
165
166 /* Pending dentry */
167 if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
168 if (autofs_oz_mode(sbi))
169 res = 1;
170 else
171 res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
172 unlock_kernel();
173 return res;
174 }
175
176 /* Negative dentry.. invalidate if "old" */
177 if (!dentry->d_inode) {
178 unlock_kernel();
179 return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT);
180 }
181
182 /* Check for a non-mountpoint directory */
183 if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
184 if (autofs_oz_mode(sbi))
185 res = 1;
186 else
187 res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
188 unlock_kernel();
189 return res;
190 }
191
192 /* Update the usage list */
193 if (!autofs_oz_mode(sbi)) {
194 ent = (struct autofs_dir_ent *) dentry->d_time;
195 if (ent)
196 autofs_update_usage(&sbi->dirhash,ent);
197 }
198 unlock_kernel();
199 return 1;
200}
201
202static const struct dentry_operations autofs_dentry_operations = {
203 .d_revalidate = autofs_revalidate,
204};
205
206static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
207{
208 struct autofs_sb_info *sbi;
209 int oz_mode;
210
211 DPRINTK(("autofs_root_lookup: name = "));
212 lock_kernel();
213 autofs_say(dentry->d_name.name,dentry->d_name.len);
214
215 if (dentry->d_name.len > NAME_MAX) {
216 unlock_kernel();
217 return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */
218 }
219
220 sbi = autofs_sbi(dir->i_sb);
221
222 oz_mode = autofs_oz_mode(sbi);
223 DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
224 "oz_mode = %d\n", task_pid_nr(current),
225 task_pgrp_nr(current), sbi->catatonic,
226 oz_mode));
227
228 /*
229 * Mark the dentry incomplete, but add it. This is needed so
230 * that the VFS layer knows about the dentry, and we can count
231 * on catching any lookups through the revalidate.
232 *
233 * Let all the hard work be done by the revalidate function that
234 * needs to be able to do this anyway..
235 *
236 * We need to do this before we release the directory semaphore.
237 */
238 dentry->d_op = &autofs_dentry_operations;
239 dentry->d_flags |= DCACHE_AUTOFS_PENDING;
240 d_add(dentry, NULL);
241
242 mutex_unlock(&dir->i_mutex);
243 autofs_revalidate(dentry, nd);
244 mutex_lock(&dir->i_mutex);
245
246 /*
247 * If we are still pending, check if we had to handle
248 * a signal. If so we can force a restart..
249 */
250 if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
251 /* See if we were interrupted */
252 if (signal_pending(current)) {
253 sigset_t *sigset = &current->pending.signal;
254 if (sigismember (sigset, SIGKILL) ||
255 sigismember (sigset, SIGQUIT) ||
256 sigismember (sigset, SIGINT)) {
257 unlock_kernel();
258 return ERR_PTR(-ERESTARTNOINTR);
259 }
260 }
261 }
262 unlock_kernel();
263
264 /*
265 * If this dentry is unhashed, then we shouldn't honour this
266 * lookup even if the dentry is positive. Returning ENOENT here
267 * doesn't do the right thing for all system calls, but it should
268 * be OK for the operations we permit from an autofs.
269 */
270 if (dentry->d_inode && d_unhashed(dentry))
271 return ERR_PTR(-ENOENT);
272
273 return NULL;
274}
275
276static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
277{
278 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
279 struct autofs_dirhash *dh = &sbi->dirhash;
280 struct autofs_dir_ent *ent;
281 unsigned int n;
282 int slsize;
283 struct autofs_symlink *sl;
284 struct inode *inode;
285
286 DPRINTK(("autofs_root_symlink: %s <- ", symname));
287 autofs_say(dentry->d_name.name,dentry->d_name.len);
288
289 lock_kernel();
290 if (!autofs_oz_mode(sbi)) {
291 unlock_kernel();
292 return -EACCES;
293 }
294
295 if (autofs_hash_lookup(dh, &dentry->d_name)) {
296 unlock_kernel();
297 return -EEXIST;
298 }
299
300 n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
301 if (n >= AUTOFS_MAX_SYMLINKS) {
302 unlock_kernel();
303 return -ENOSPC;
304 }
305
306 set_bit(n,sbi->symlink_bitmap);
307 sl = &sbi->symlink[n];
308 sl->len = strlen(symname);
309 sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
310 if (!sl->data) {
311 clear_bit(n,sbi->symlink_bitmap);
312 unlock_kernel();
313 return -ENOSPC;
314 }
315
316 ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
317 if (!ent) {
318 kfree(sl->data);
319 clear_bit(n,sbi->symlink_bitmap);
320 unlock_kernel();
321 return -ENOSPC;
322 }
323
324 ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
325 if (!ent->name) {
326 kfree(sl->data);
327 kfree(ent);
328 clear_bit(n,sbi->symlink_bitmap);
329 unlock_kernel();
330 return -ENOSPC;
331 }
332
333 memcpy(sl->data,symname,slsize);
334 sl->mtime = get_seconds();
335
336 ent->ino = AUTOFS_FIRST_SYMLINK + n;
337 ent->hash = dentry->d_name.hash;
338 memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
339 ent->dentry = NULL; /* We don't keep the dentry for symlinks */
340
341 autofs_hash_insert(dh,ent);
342
343 inode = autofs_iget(dir->i_sb, ent->ino);
344 if (IS_ERR(inode))
345 return PTR_ERR(inode);
346
347 d_instantiate(dentry, inode);
348 unlock_kernel();
349 return 0;
350}
351
352/*
353 * NOTE!
354 *
355 * Normal filesystems would do a "d_delete()" to tell the VFS dcache
356 * that the file no longer exists. However, doing that means that the
357 * VFS layer can turn the dentry into a negative dentry, which we
358 * obviously do not want (we're dropping the entry not because it
359 * doesn't exist, but because it has timed out).
360 *
361 * Also see autofs_root_rmdir()..
362 */
363static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
364{
365 struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
366 struct autofs_dirhash *dh = &sbi->dirhash;
367 struct autofs_dir_ent *ent;
368 unsigned int n;
369
370 /* This allows root to remove symlinks */
371 lock_kernel();
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-
-	n = ent->ino - AUTOFS_FIRST_SYMLINK;
-	if (n >= AUTOFS_MAX_SYMLINKS) {
-		unlock_kernel();
-		return -EISDIR;	/* It's a directory, dummy */
-	}
-	if (!test_bit(n, sbi->symlink_bitmap)) {
-		unlock_kernel();
-		return -EINVAL;	/* Nonexistent symlink? Shouldn't happen */
-	}
-
-	dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL;
-	autofs_hash_delete(ent);
-	clear_bit(n, sbi->symlink_bitmap);
-	kfree(sbi->symlink[n].data);
-	d_drop(dentry);
-
-	unlock_kernel();
-	return 0;
-}
-
-static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-
-	if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
-		unlock_kernel();
-		return -ENOTDIR; /* Not a directory */
-	}
-
-	if (ent->dentry != dentry) {
-		printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
-	}
-
-	dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
-	autofs_hash_delete(ent);
-	drop_nlink(dir);
-	d_drop(dentry);
-	unlock_kernel();
-
-	return 0;
-}
-
-static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-	struct autofs_dirhash *dh = &sbi->dirhash;
-	struct autofs_dir_ent *ent;
-	struct inode *inode;
-	ino_t ino;
-
-	lock_kernel();
-	if (!autofs_oz_mode(sbi)) {
-		unlock_kernel();
-		return -EACCES;
-	}
-
-	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if (ent) {
-		unlock_kernel();
-		return -EEXIST;
-	}
-
-	if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
-		printk("autofs: Out of inode numbers -- what the heck did you do??\n");
-		unlock_kernel();
-		return -ENOSPC;
-	}
-	ino = sbi->next_dir_ino++;
-
-	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if (!ent) {
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if (!ent->name) {
-		kfree(ent);
-		unlock_kernel();
-		return -ENOSPC;
-	}
-
-	ent->hash = dentry->d_name.hash;
-	memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-	ent->ino = ino;
-	ent->dentry = dentry;
-	autofs_hash_insert(dh, ent);
-
-	inc_nlink(dir);
-
-	inode = autofs_iget(dir->i_sb, ino);
-	if (IS_ERR(inode)) {
-		drop_nlink(dir);
-		return PTR_ERR(inode);
-	}
-
-	d_instantiate(dentry, inode);
-	unlock_kernel();
-
-	return 0;
-}
-
-/* Get/set timeout ioctl() operation */
-#ifdef CONFIG_COMPAT
-static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
-						unsigned int __user *p)
-{
-	unsigned long ntimeout;
-
-	if (get_user(ntimeout, p) ||
-	    put_user(sbi->exp_timeout / HZ, p))
-		return -EFAULT;
-
-	if (ntimeout > UINT_MAX/HZ)
-		sbi->exp_timeout = 0;
-	else
-		sbi->exp_timeout = ntimeout * HZ;
-
-	return 0;
-}
-#endif
-
-static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
-					 unsigned long __user *p)
-{
-	unsigned long ntimeout;
-
-	if (get_user(ntimeout, p) ||
-	    put_user(sbi->exp_timeout / HZ, p))
-		return -EFAULT;
-
-	if (ntimeout > ULONG_MAX/HZ)
-		sbi->exp_timeout = 0;
-	else
-		sbi->exp_timeout = ntimeout * HZ;
-
-	return 0;
-}
-
-/* Return protocol version */
-static inline int autofs_get_protover(int __user *p)
-{
-	return put_user(AUTOFS_PROTO_VERSION, p);
-}
-
-/* Perform an expiry operation */
-static inline int autofs_expire_run(struct super_block *sb,
-				    struct autofs_sb_info *sbi,
-				    struct vfsmount *mnt,
-				    struct autofs_packet_expire __user *pkt_p)
-{
-	struct autofs_dir_ent *ent;
-	struct autofs_packet_expire pkt;
-
-	memset(&pkt, 0, sizeof(pkt));
-
-	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-	pkt.hdr.type = autofs_ptype_expire;
-
-	if (!sbi->exp_timeout || !(ent = autofs_expire(sb, sbi, mnt)))
-		return -EAGAIN;
-
-	pkt.len = ent->len;
-	memcpy(pkt.name, ent->name, pkt.len);
-	pkt.name[pkt.len] = '\0';
-
-	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
-		return -EFAULT;
-
-	return 0;
-}
-
-/*
- * ioctl()'s on the root directory is the chief method for the daemon to
- * generate kernel reactions
- */
-static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
-				unsigned int cmd, unsigned long arg)
-{
-	struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
-	void __user *argp = (void __user *)arg;
-
-	DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", cmd, arg, sbi, task_pgrp_nr(current)));
-
-	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
-	    _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
-		return -ENOTTY;
-
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	switch (cmd) {
-	case AUTOFS_IOC_READY:	/* Wait queue: go ahead and retry */
-		return autofs_wait_release(sbi, (autofs_wqt_t)arg, 0);
-	case AUTOFS_IOC_FAIL:	/* Wait queue: fail with ENOENT */
-		return autofs_wait_release(sbi, (autofs_wqt_t)arg, -ENOENT);
-	case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
-		autofs_catatonic_mode(sbi);
-		return 0;
-	case AUTOFS_IOC_PROTOVER: /* Get protocol version */
-		return autofs_get_protover(argp);
-#ifdef CONFIG_COMPAT
-	case AUTOFS_IOC_SETTIMEOUT32:
-		return autofs_compat_get_set_timeout(sbi, argp);
-#endif
-	case AUTOFS_IOC_SETTIMEOUT:
-		return autofs_get_set_timeout(sbi, argp);
-	case AUTOFS_IOC_EXPIRE:
-		return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt,
-					 argp);
-	default:
-		return -ENOSYS;
-	}
-
-}
-
-static long autofs_root_ioctl(struct file *filp,
-			      unsigned int cmd, unsigned long arg)
-{
-	int ret;
-
-	lock_kernel();
-	ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
-				   filp, cmd, arg);
-	unlock_kernel();
-
-	return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static long autofs_root_compat_ioctl(struct file *filp,
-				     unsigned int cmd, unsigned long arg)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	int ret;
-
-	lock_kernel();
-	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
-		ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
-	else
-		ret = autofs_do_root_ioctl(inode, filp, cmd,
-			(unsigned long)compat_ptr(arg));
-	unlock_kernel();
-
-	return ret;
-}
-#endif
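The root-directory ioctls removed above were driven from userspace by the automount daemon. As a rough sketch of that side of the protocol (hypothetical code, assuming a descriptor opened on the automount point and the userspace <linux/auto_fs.h> header; error handling trimmed):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/auto_fs.h>

/* Hypothetical daemon-side sketch: configure the expiry timeout and ask
 * the kernel for one expirable entry, mirroring autofs_get_set_timeout()
 * and autofs_expire_run() above. */
int poll_expire(const char *mountpoint)
{
	unsigned long timeout = 300;	/* new timeout in seconds; the old
					 * value is written back by the kernel */
	struct autofs_packet_expire pkt;
	int fd = open(mountpoint, O_RDONLY);

	if (fd == -1)
		return -1;
	if (ioctl(fd, AUTOFS_IOC_SETTIMEOUT, &timeout) == -1)
		goto fail;
	/* EAGAIN from AUTOFS_IOC_EXPIRE means nothing has timed out yet */
	if (ioctl(fd, AUTOFS_IOC_EXPIRE, &pkt) == 0)
		printf("expire candidate: %s\n", pkt.name);
	close(fd);
	return 0;
fail:
	close(fd);
	return -1;
}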
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
deleted file mode 100644
index 7ce9cb2c9ce2..000000000000
--- a/fs/autofs/symlink.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include "autofs_i.h"
-
-/* Nothing to release.. */
-static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	char *s = ((struct autofs_symlink *)dentry->d_inode->i_private)->data;
-	nd_set_link(nd, s);
-	return NULL;
-}
-
-const struct inode_operations autofs_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= autofs_follow_link
-};
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
deleted file mode 100644
index be46805972f0..000000000000
--- a/fs/autofs/waitq.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/signal.h>
-#include <linux/file.h>
-#include "autofs_i.h"
-
-/* We make this a static variable rather than a part of the superblock; it
-   is better if we don't reassign numbers easily even across filesystems */
-static autofs_wqt_t autofs_next_wait_queue = 1;
-
-/* These are the signals we allow interrupting a pending mount */
-#define SHUTDOWN_SIGS	(sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
-
-void autofs_catatonic_mode(struct autofs_sb_info *sbi)
-{
-	struct autofs_wait_queue *wq, *nwq;
-
-	DPRINTK(("autofs: entering catatonic mode\n"));
-
-	sbi->catatonic = 1;
-	wq = sbi->queues;
-	sbi->queues = NULL;	/* Erase all wait queues */
-	while (wq) {
-		nwq = wq->next;
-		wq->status = -ENOENT; /* Magic is gone - report failure */
-		kfree(wq->name);
-		wq->name = NULL;
-		wake_up(&wq->queue);
-		wq = nwq;
-	}
-	fput(sbi->pipe);	/* Close the pipe */
-	sbi->pipe = NULL;
-	autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */
-}
-
-static int autofs_write(struct file *file, const void *addr, int bytes)
-{
-	unsigned long sigpipe, flags;
-	mm_segment_t fs;
-	const char *data = (const char *)addr;
-	ssize_t wr = 0;
-
-	/** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-
-	sigpipe = sigismember(&current->pending.signal, SIGPIPE);
-
-	/* Save pointer to user space and point back to kernel space */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-
-	while (bytes &&
-	       (wr = file->f_op->write(file, data, bytes, &file->f_pos)) > 0) {
-		data += wr;
-		bytes -= wr;
-	}
-
-	set_fs(fs);
-
-	/* Keep the currently executing process from receiving a
-	   SIGPIPE unless it was already supposed to get one */
-	if (wr == -EPIPE && !sigpipe) {
-		spin_lock_irqsave(&current->sighand->siglock, flags);
-		sigdelset(&current->pending.signal, SIGPIPE);
-		recalc_sigpending();
-		spin_unlock_irqrestore(&current->sighand->siglock, flags);
-	}
-
-	return (bytes > 0);
-}
-
-static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq)
-{
-	struct autofs_packet_missing pkt;
-
-	DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token));
-	autofs_say(wq->name, wq->len);
-
-	memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
-
-	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-	pkt.hdr.type = autofs_ptype_missing;
-	pkt.wait_queue_token = wq->wait_queue_token;
-	pkt.len = wq->len;
-	memcpy(pkt.name, wq->name, pkt.len);
-	pkt.name[pkt.len] = '\0';
-
-	if (autofs_write(sbi->pipe, &pkt, sizeof(struct autofs_packet_missing)))
-		autofs_catatonic_mode(sbi);
-}
-
-int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name)
-{
-	struct autofs_wait_queue *wq;
-	int status;
-
-	/* In catatonic mode, we don't wait for nobody */
-	if (sbi->catatonic)
-		return -ENOENT;
-
-	/* We shouldn't be able to get here, but just in case */
-	if (name->len > NAME_MAX)
-		return -ENOENT;
-
-	for (wq = sbi->queues; wq; wq = wq->next) {
-		if (wq->hash == name->hash &&
-		    wq->len == name->len &&
-		    wq->name && !memcmp(wq->name, name->name, name->len))
-			break;
-	}
-
-	if (!wq) {
-		/* Create a new wait queue */
-		wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
-		if (!wq)
-			return -ENOMEM;
-
-		wq->name = kmalloc(name->len, GFP_KERNEL);
-		if (!wq->name) {
-			kfree(wq);
-			return -ENOMEM;
-		}
-		wq->wait_queue_token = autofs_next_wait_queue++;
-		init_waitqueue_head(&wq->queue);
-		wq->hash = name->hash;
-		wq->len = name->len;
-		wq->status = -EINTR; /* Status return if interrupted */
-		memcpy(wq->name, name->name, name->len);
-		wq->next = sbi->queues;
-		sbi->queues = wq;
-
-		/* autofs_notify_daemon() may block */
-		wq->wait_ctr = 2;
-		autofs_notify_daemon(sbi, wq);
-	} else
-		wq->wait_ctr++;
-
-	/* wq->name is NULL if and only if the lock is already released */
-
-	if (sbi->catatonic) {
-		/* We might have slept, so check again for catatonic mode */
-		wq->status = -ENOENT;
-		kfree(wq->name);
-		wq->name = NULL;
-	}
-
-	if (wq->name) {
-		/* Block all but "shutdown" signals while waiting */
-		sigset_t sigmask;
-
-		siginitsetinv(&sigmask, SHUTDOWN_SIGS);
-		sigprocmask(SIG_BLOCK, &sigmask, &sigmask);
-
-		interruptible_sleep_on(&wq->queue);
-
-		sigprocmask(SIG_SETMASK, &sigmask, NULL);
-	} else {
-		DPRINTK(("autofs_wait: skipped sleeping\n"));
-	}
-
-	status = wq->status;
-
-	if (!--wq->wait_ctr)	/* Are we the last process to need status? */
-		kfree(wq);
-
-	return status;
-}
-
-
-int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
-{
-	struct autofs_wait_queue *wq, **wql;
-
-	for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
-		if (wq->wait_queue_token == wait_queue_token)
-			break;
-	}
-	if (!wq)
-		return -EINVAL;
-
-	*wql = wq->next;	/* Unlink from chain */
-	kfree(wq->name);
-	wq->name = NULL;	/* Do not wait on this queue */
-
-	wq->status = status;
-
-	if (!--wq->wait_ctr)	/* Is anyone still waiting for this guy? */
-		kfree(wq);
-	else
-		wake_up(&wq->queue);
-
-	return 0;
-}
-
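The wait-queue code deleted above is the kernel half of a simple request/response protocol: autofs_notify_daemon() writes an autofs_packet_missing to the daemon's pipe, and the daemon answers through the AUTOFS_IOC_READY or AUTOFS_IOC_FAIL root ioctls, which land in autofs_wait_release(). A hedged sketch of the daemon's receive loop (hypothetical; try_mount() stands in for the daemon's real mount logic):

#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/auto_fs.h>

extern int try_mount(const char *name);	/* hypothetical helper */

/* pipe_fd: read end of the pipe passed via the pipefd= mount option;
 * mnt_fd: descriptor open on the autofs mount point. */
void serve(int pipe_fd, int mnt_fd)
{
	struct autofs_packet_missing pkt;

	while (read(pipe_fd, &pkt, sizeof(pkt)) == sizeof(pkt)) {
		if (pkt.hdr.type != autofs_ptype_missing)
			continue;
		if (try_mount(pkt.name) == 0)
			ioctl(mnt_fd, AUTOFS_IOC_READY, pkt.wait_queue_token);
		else
			ioctl(mnt_fd, AUTOFS_IOC_FAIL, pkt.wait_queue_token);
	}
}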
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3d283abf67d7..54f923792728 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -16,6 +16,7 @@
 #include <linux/auto_fs4.h>
 #include <linux/auto_dev-ioctl.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/list.h>
 
 /* This is the range of ioctl() numbers we claim as ours */
@@ -60,6 +61,8 @@ do { \
 		current->pid, __func__, ##args);	\
 } while (0)
 
+extern spinlock_t autofs4_lock;
+
 /* Unified info structure. This is pointed to by both the dentry and
    inode structures. Each file in the filesystem has an instance of this
    structure. It holds a reference to the dentry, so dentries are never
@@ -85,18 +88,9 @@ struct autofs_info {
 
 	uid_t uid;
 	gid_t gid;
-
-	mode_t mode;
-	size_t size;
-
-	void (*free)(struct autofs_info *);
-	union {
-		const char *symlink;
-	} u;
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
-#define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
 
 struct autofs_wait_queue {
@@ -173,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
 	return 0;
 }
 
-static inline void autofs4_copy_atime(struct file *src, struct file *dst)
-{
-	dst->f_path.dentry->d_inode->i_atime =
-		src->f_path.dentry->d_inode->i_atime;
-	return;
-}
-
-struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
+struct inode *autofs4_get_inode(struct super_block *, mode_t);
 void autofs4_free_ino(struct autofs_info *);
 
 /* Expiration */
@@ -209,16 +196,89 @@ void autofs_dev_ioctl_exit(void);
 
 extern const struct inode_operations autofs4_symlink_inode_operations;
 extern const struct inode_operations autofs4_dir_inode_operations;
-extern const struct inode_operations autofs4_root_inode_operations;
-extern const struct inode_operations autofs4_indirect_root_inode_operations;
-extern const struct inode_operations autofs4_direct_root_inode_operations;
 extern const struct file_operations autofs4_dir_operations;
 extern const struct file_operations autofs4_root_operations;
+extern const struct dentry_operations autofs4_dentry_operations;
+
+/* VFS automount flags management functions */
+
+static inline void __managed_dentry_set_automount(struct dentry *dentry)
+{
+	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_set_automount(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_automount(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_automount(struct dentry *dentry)
+{
+	dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_clear_automount(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_automount(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_set_transit(struct dentry *dentry)
+{
+	dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_set_transit(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_transit(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_transit(struct dentry *dentry)
+{
+	dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_clear_transit(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_transit(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_set_managed(struct dentry *dentry)
+{
+	dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_set_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_managed(struct dentry *dentry)
+{
+	dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_clear_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
 
 /* Initializing function */
 
 int autofs4_fill_super(struct super_block *, void *, int);
-struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode);
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
+void autofs4_clean_ino(struct autofs_info *);
 
 /* Queue management functions */
 
@@ -226,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
-static inline int autofs4_follow_mount(struct path *path)
-{
-	int res = 0;
-
-	while (d_mountpoint(path->dentry)) {
-		int followed = follow_down(path);
-		if (!followed)
-			break;
-		res = 1;
-	}
-	return res;
-}
-
 static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
 {
 	return new_encode_dev(sbi->sb->s_dev);
@@ -254,17 +301,15 @@ static inline int simple_positive(struct dentry *dentry)
 	return dentry->d_inode && !d_unhashed(dentry);
 }
 
-static inline int __simple_empty(struct dentry *dentry)
-{
-	struct dentry *child;
-	int ret = 0;
-
-	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
-		if (simple_positive(child))
-			goto out;
-	ret = 1;
-out:
-	return ret;
+static inline void __autofs4_add_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		if (list_empty(&ino->expiring))
+			list_add(&ino->expiring, &sbi->expiring_list);
+	}
+	return;
 }
 
 static inline void autofs4_add_expiring(struct dentry *dentry)
@@ -293,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
 	return;
 }
 
-void autofs4_dentry_release(struct dentry *);
 extern void autofs4_kill_sb(struct super_block *);
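The managed_dentry_* helpers added above wrap the two dcache flags that the VFS consults while walking a path: DCACHE_NEED_AUTOMOUNT sends the walk into ->d_automount(), and DCACHE_MANAGE_TRANSIT routes it through ->d_manage() first. A minimal sketch of how a filesystem plugs into these hooks (a hypothetical toy filesystem; the operation signatures follow the autofs4 declarations used in this series, and the parameter names are illustrative):

#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/mount.h>

static struct vfsmount *toyfs_d_automount(struct path *path)
{
	/* Called when a walk hits a dentry with DCACHE_NEED_AUTOMOUNT:
	 * return a vfsmount to attach, NULL to keep walking, or an
	 * ERR_PTR() to fail the lookup. */
	return NULL;
}

static int toyfs_d_manage(struct dentry *dentry, bool mounting_here,
			  bool rcu_walk)
{
	/* Called for DCACHE_MANAGE_TRANSIT dentries before crossing;
	 * returning 0 lets the walk continue, an error aborts it. */
	return 0;
}

static const struct dentry_operations toyfs_dentry_operations = {
	.d_automount	= toyfs_d_automount,
	.d_manage	= toyfs_d_manage,
};

A dentry opts in by setting the flags, e.g. with managed_dentry_set_managed() at instantiation time, much as autofs4_fill_super() does for trigger mounts later in this series.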
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ba4a38b9c22f..1442da4860e5 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
 		err = have_submounts(path.dentry);
 
-		if (follow_down(&path))
+		if (follow_down_one(&path))
 			magic = path.mnt->mnt_sb->s_magic;
 	}
 
@@ -724,6 +724,7 @@ static const struct file_operations _dev_ioctl_fops = {
 	.unlocked_ioctl	 = autofs_dev_ioctl,
 	.compat_ioctl = autofs_dev_ioctl_compat,
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index a796c9417fb1..f43100b9662b 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
 	if (ino == NULL)
 		return 0;
 
-	/* No point expiring a pending mount */
-	if (ino->flags & AUTOFS_INF_PENDING)
-		return 0;
-
 	if (!do_now) {
 		/* Too young to die */
 		if (!timeout || time_after(ino->last_used + timeout, now))
@@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 
 	path_get(&path);
 
-	if (!follow_down(&path))
+	if (!follow_down_one(&path))
 		goto done;
 
 	if (is_autofs4_dentry(path.dentry)) {
@@ -91,24 +87,64 @@ done:
 }
 
 /*
- * Calculate next entry in top down tree traversal.
- * From next_mnt in namespace.c - elegant.
+ * Calculate and dget next entry in top down tree traversal.
  */
-static struct dentry *next_dentry(struct dentry *p, struct dentry *root)
+static struct dentry *get_next_positive_dentry(struct dentry *prev,
+						struct dentry *root)
 {
-	struct list_head *next = p->d_subdirs.next;
+	struct list_head *next;
+	struct dentry *p, *ret;
+
+	if (prev == NULL)
+		return dget(root);
 
+	spin_lock(&autofs4_lock);
+relock:
+	p = prev;
+	spin_lock(&p->d_lock);
+again:
+	next = p->d_subdirs.next;
 	if (next == &p->d_subdirs) {
 		while (1) {
-			if (p == root)
+			struct dentry *parent;
+
+			if (p == root) {
+				spin_unlock(&p->d_lock);
+				spin_unlock(&autofs4_lock);
+				dput(prev);
 				return NULL;
+			}
+
+			parent = p->d_parent;
+			if (!spin_trylock(&parent->d_lock)) {
+				spin_unlock(&p->d_lock);
+				cpu_relax();
+				goto relock;
+			}
+			spin_unlock(&p->d_lock);
 			next = p->d_u.d_child.next;
-			if (next != &p->d_parent->d_subdirs)
+			p = parent;
+			if (next != &parent->d_subdirs)
 				break;
-			p = p->d_parent;
 		}
 	}
-	return list_entry(next, struct dentry, d_u.d_child);
+	ret = list_entry(next, struct dentry, d_u.d_child);
+
+	spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Negative dentry - try next */
+	if (!simple_positive(ret)) {
+		spin_unlock(&p->d_lock);
+		p = ret;
+		goto again;
+	}
+	dget_dlock(ret);
+	spin_unlock(&ret->d_lock);
+	spin_unlock(&p->d_lock);
+	spin_unlock(&autofs4_lock);
+
+	dput(prev);
+
+	return ret;
 }
 
 /*
@@ -158,18 +194,11 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 	if (!simple_positive(top))
 		return 1;
 
-	spin_lock(&dcache_lock);
-	for (p = top; p; p = next_dentry(p, top)) {
-		/* Negative dentry - give up */
-		if (!simple_positive(p))
-			continue;
-
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, top))) {
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget(p);
-		spin_unlock(&dcache_lock);
-
 		/*
 		 * Is someone visiting anywhere in the subtree ?
 		 * If there's no mount we need to check the usage
@@ -198,16 +227,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 			else
 				ino_count++;
 
-			if (atomic_read(&p->d_count) > ino_count) {
+			if (p->d_count > ino_count) {
 				top_ino->last_used = jiffies;
 				dput(p);
 				return 1;
 			}
 		}
-		dput(p);
-		spin_lock(&dcache_lock);
 	}
-	spin_unlock(&dcache_lock);
 
 	/* Timeout of a tree mount is ultimately determined by its top dentry */
 	if (!autofs4_can_expire(top, timeout, do_now))
@@ -226,32 +252,21 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
 	DPRINTK("parent %p %.*s",
 		parent, (int)parent->d_name.len, parent->d_name.name);
 
-	spin_lock(&dcache_lock);
-	for (p = parent; p; p = next_dentry(p, parent)) {
-		/* Negative dentry - give up */
-		if (!simple_positive(p))
-			continue;
-
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, parent))) {
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget(p);
-		spin_unlock(&dcache_lock);
-
 		if (d_mountpoint(p)) {
 			/* Can we umount this guy */
 			if (autofs4_mount_busy(mnt, p))
-				goto cont;
+				continue;
 
 			/* Can we expire this guy */
 			if (autofs4_can_expire(p, timeout, do_now))
 				return p;
 		}
-cont:
-		dput(p);
-		spin_lock(&dcache_lock);
 	}
-	spin_unlock(&dcache_lock);
 	return NULL;
 }
 
@@ -264,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	unsigned long timeout;
 	struct dentry *root = dget(sb->s_root);
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
+	struct autofs_info *ino;
 
 	if (!root)
 		return NULL;
@@ -272,17 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	timeout = sbi->exp_timeout;
 
 	spin_lock(&sbi->fs_lock);
+	ino = autofs4_dentry_ino(root);
+	/* No point expiring a pending mount */
+	if (ino->flags & AUTOFS_INF_PENDING) {
+		spin_unlock(&sbi->fs_lock);
+		return NULL;
+	}
+	managed_dentry_set_transit(root);
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
-		if (d_mountpoint(root)) {
-			ino->flags |= AUTOFS_INF_MOUNTPOINT;
-			root->d_mounted--;
-		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
 		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
+	managed_dentry_clear_transit(root);
 	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
@@ -302,8 +322,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 {
 	unsigned long timeout;
 	struct dentry *root = sb->s_root;
+	struct dentry *dentry;
 	struct dentry *expired = NULL;
-	struct list_head *next;
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
 	int exp_leaves = how & AUTOFS_EXP_LEAVES;
 	struct autofs_info *ino;
@@ -315,25 +335,14 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	now = jiffies;
 	timeout = sbi->exp_timeout;
 
-	spin_lock(&dcache_lock);
-	next = root->d_subdirs.next;
-
-	/* On exit from the loop expire is set to a dgot dentry
-	 * to expire or it's NULL */
-	while (next != &root->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-
-		/* Negative dentry - give up */
-		if (!simple_positive(dentry)) {
-			next = next->next;
-			continue;
-		}
-
-		dentry = dget(dentry);
-		spin_unlock(&dcache_lock);
-
+	dentry = NULL;
+	while ((dentry = get_next_positive_dentry(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
+		/* No point expiring a pending mount */
+		if (ino->flags & AUTOFS_INF_PENDING)
+			goto cont;
+		managed_dentry_set_transit(dentry);
 
 		/*
 		 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -347,7 +356,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 
 		/* Path walk currently on this dentry? */
 		ino_count = atomic_read(&ino->count) + 2;
-		if (atomic_read(&dentry->d_count) > ino_count)
+		if (dentry->d_count > ino_count)
 			goto next;
 
 		/* Can we umount this guy */
@@ -369,7 +378,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			if (!exp_leaves) {
 				/* Path walk currently on this dentry? */
 				ino_count = atomic_read(&ino->count) + 1;
-				if (atomic_read(&dentry->d_count) > ino_count)
+				if (dentry->d_count > ino_count)
 					goto next;
 
 				if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -383,7 +392,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			} else {
 				/* Path walk currently on this dentry? */
 				ino_count = atomic_read(&ino->count) + 1;
-				if (atomic_read(&dentry->d_count) > ino_count)
+				if (dentry->d_count > ino_count)
 					goto next;
 
 				expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
@@ -393,12 +402,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			}
 		}
 next:
+		managed_dentry_clear_transit(dentry);
+cont:
 		spin_unlock(&sbi->fs_lock);
-		dput(dentry);
-		spin_lock(&dcache_lock);
-		next = next->next;
 	}
-	spin_unlock(&dcache_lock);
 	return NULL;
 
 found:
@@ -408,9 +415,13 @@ found:
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
+	spin_lock(&expired->d_parent->d_lock);
+	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&expired->d_lock);
+	spin_unlock(&expired->d_parent->d_lock);
+	spin_unlock(&autofs4_lock);
 	return expired;
 }
 
@@ -473,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	if (!d_unhashed(dentry))
+		managed_dentry_clear_transit(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -498,11 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
 
 		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-			sb->s_root->d_mounted++;
-			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-		}
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		spin_lock(&dentry->d_lock);
+		if (ret)
+			__managed_dentry_clear_transit(dentry);
+		else {
+			if ((IS_ROOT(dentry) ||
+			    (autofs_type_indirect(sbi->type) &&
+			     IS_ROOT(dentry->d_parent))) &&
+			    !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+				__managed_dentry_set_automount(dentry);
+		}
+		spin_unlock(&dentry->d_lock);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
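The expire rewrite above replaces the dcache_lock-guarded for-loops with get_next_positive_dentry(), which hands back each positive descendant with its own reference held and internally drops the reference on the previous one. Callers therefore all follow the same pattern, sketched here in isolation (a hypothetical helper; keep_this() is a stand-in predicate):

static struct dentry *find_first(struct dentry *root,
				 int (*keep_this)(struct dentry *))
{
	struct dentry *p = NULL;

	while ((p = get_next_positive_dentry(p, root))) {
		if (keep_this(p))
			return p;	/* reference held; caller must dput() */
	}
	return NULL;	/* normal exit: all references already dropped */
}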
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 9722e4bd8957..c038727b4050 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,16 +14,16 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static int autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *autofs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, autofs4_fill_super);
 }
 
 static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
-	.get_sb		= autofs_get_sb,
+	.mount		= autofs_mount,
 	.kill_sb	= autofs4_kill_sb,
 };
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955dac..180fa2425e49 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,77 +22,27 @@
 #include "autofs_i.h"
 #include <linux/module.h>
 
-static void ino_lnkfree(struct autofs_info *ino)
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
 {
-	if (ino->u.symlink) {
-		kfree(ino->u.symlink);
-		ino->u.symlink = NULL;
-	}
-}
-
-struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
-				     struct autofs_sb_info *sbi, mode_t mode)
-{
-	int reinit = 1;
-
-	if (ino == NULL) {
-		reinit = 0;
-		ino = kmalloc(sizeof(*ino), GFP_KERNEL);
-	}
-
-	if (ino == NULL)
-		return NULL;
-
-	if (!reinit) {
-		ino->flags = 0;
-		ino->inode = NULL;
-		ino->dentry = NULL;
-		ino->size = 0;
+	struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+	if (ino) {
 		INIT_LIST_HEAD(&ino->active);
-		ino->active_count = 0;
 		INIT_LIST_HEAD(&ino->expiring);
-		atomic_set(&ino->count, 0);
+		ino->last_used = jiffies;
+		ino->sbi = sbi;
 	}
+	return ino;
+}
 
+void autofs4_clean_ino(struct autofs_info *ino)
+{
 	ino->uid = 0;
 	ino->gid = 0;
-	ino->mode = mode;
 	ino->last_used = jiffies;
-
-	ino->sbi = sbi;
-
-	if (reinit && ino->free)
-		(ino->free)(ino);
-
-	memset(&ino->u, 0, sizeof(ino->u));
-
-	ino->free = NULL;
-
-	if (S_ISLNK(mode))
-		ino->free = ino_lnkfree;
-
-	return ino;
 }
 
 void autofs4_free_ino(struct autofs_info *ino)
 {
-	struct autofs_info *p_ino;
-
-	if (ino->dentry) {
-		ino->dentry->d_fsdata = NULL;
-		if (ino->dentry->d_inode) {
-			struct dentry *parent = ino->dentry->d_parent;
-			if (atomic_dec_and_test(&ino->count)) {
-				p_ino = autofs4_dentry_ino(parent);
-				if (p_ino && parent != ino->dentry)
-					atomic_dec(&p_ino->count);
-			}
-			dput(ino->dentry);
-		}
-		ino->dentry = NULL;
-	}
-	if (ino->free)
-		(ino->free)(ino);
 	kfree(ino);
 }
 
@@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
+static void autofs4_evict_inode(struct inode *inode)
+{
+	end_writeback(inode);
+	kfree(inode->i_private);
+}
+
 static const struct super_operations autofs4_sops = {
 	.statfs		= simple_statfs,
 	.show_options	= autofs4_show_options,
+	.evict_inode	= autofs4_evict_inode,
 };
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
@@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 	return (*pipefd < 0);
 }
 
-static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
-{
-	struct autofs_info *ino;
-
-	ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
-	if (!ino)
-		return NULL;
-
-	return ino;
-}
-
-static const struct dentry_operations autofs4_sb_dentry_operations = {
-	.d_release      = autofs4_dentry_release,
-};
-
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode * root_inode;
@@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	s->s_blocksize_bits = 10;
 	s->s_magic = AUTOFS_SUPER_MAGIC;
 	s->s_op = &autofs4_sops;
+	s->s_d_op = &autofs4_dentry_operations;
 	s->s_time_gran = 1;
 
 	/*
 	 * Get the root inode and dentry, but defer checking for errors.
 	 */
-	ino = autofs4_mkroot(sbi);
+	ino = autofs4_new_ino(sbi);
 	if (!ino)
 		goto fail_free;
-	root_inode = autofs4_get_inode(s, ino);
+	root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
 	if (!root_inode)
 		goto fail_ino;
 
@@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_iput;
 	pipe = NULL;
 
-	root->d_op = &autofs4_sb_dentry_operations;
 	root->d_fsdata = ino;
 
 	/* Can this call block? */
@@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_dput;
 	}
 
+	if (autofs_type_trigger(sbi->type))
+		__managed_dentry_set_managed(root);
+
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = autofs_type_trigger(sbi->type) ?
-			&autofs4_direct_root_inode_operations :
-			&autofs4_indirect_root_inode_operations;
+	root_inode->i_op = &autofs4_dir_inode_operations;
 
 	/* Couldn't this be tested earlier? */
 	if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
@@ -383,28 +326,26 @@ fail_unlock:
 	return -EINVAL;
 }
 
-struct inode *autofs4_get_inode(struct super_block *sb,
-				struct autofs_info *inf)
+struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
 {
 	struct inode *inode = new_inode(sb);
 
 	if (inode == NULL)
 		return NULL;
 
-	inf->inode = inode;
-	inode->i_mode = inf->mode;
+	inode->i_mode = mode;
 	if (sb->s_root) {
 		inode->i_uid = sb->s_root->d_inode->i_uid;
 		inode->i_gid = sb->s_root->d_inode->i_gid;
 	}
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_ino = get_next_ino();
 
-	if (S_ISDIR(inf->mode)) {
+	if (S_ISDIR(mode)) {
 		inode->i_nlink = 2;
 		inode->i_op = &autofs4_dir_inode_operations;
 		inode->i_fop = &autofs4_dir_operations;
-	} else if (S_ISLNK(inf->mode)) {
-		inode->i_size = inf->size;
+	} else if (S_ISLNK(mode)) {
 		inode->i_op = &autofs4_symlink_inode_operations;
 	}
 
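With the mode, size, and free callback gone from struct autofs_info, a symlink's body now lives in inode->i_private, which the new autofs4_evict_inode() above kfree()s when the inode is destroyed. The matching allocation happens at symlink creation time; roughly like this (a sketch of the assumed shape, not the exact autofs4_dir_symlink() code, which lies outside these hunks):

static int toy_symlink_body(struct inode *dir, const char *symname)
{
	size_t size = strlen(symname);
	char *cp = kmalloc(size + 1, GFP_KERNEL);
	struct inode *inode;

	if (!cp)
		return -ENOMEM;
	strcpy(cp, symname);

	inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
	if (!inode) {
		kfree(cp);
		return -ENOMEM;
	}
	inode->i_private = cp;	/* freed by autofs4_evict_inode() */
	inode->i_size = size;
	return 0;
}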
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cb1bd38dc08c..014e7aba3b08 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -19,22 +19,25 @@
19#include <linux/param.h> 19#include <linux/param.h>
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/compat.h> 21#include <linux/compat.h>
22#include <linux/smp_lock.h> 22#include <linux/mutex.h>
23 23
24#include "autofs_i.h" 24#include "autofs_i.h"
25 25
26DEFINE_SPINLOCK(autofs4_lock);
27
26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 28static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
27static int autofs4_dir_unlink(struct inode *,struct dentry *); 29static int autofs4_dir_unlink(struct inode *,struct dentry *);
28static int autofs4_dir_rmdir(struct inode *,struct dentry *); 30static int autofs4_dir_rmdir(struct inode *,struct dentry *);
29static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 31static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
30static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); 32static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
33#ifdef CONFIG_COMPAT
31static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); 34static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
35#endif
32static int autofs4_dir_open(struct inode *inode, struct file *file); 36static int autofs4_dir_open(struct inode *inode, struct file *file);
33static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
34static void *autofs4_follow_link(struct dentry *, struct nameidata *); 38static struct vfsmount *autofs4_d_automount(struct path *);
35 39static int autofs4_d_manage(struct dentry *, bool, bool);
36#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) 40static void autofs4_dentry_release(struct dentry *);
37#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
38 41
39const struct file_operations autofs4_root_operations = { 42const struct file_operations autofs4_root_operations = {
40 .open = dcache_dir_open, 43 .open = dcache_dir_open,
@@ -56,7 +59,7 @@ const struct file_operations autofs4_dir_operations = {
56 .llseek = dcache_dir_lseek, 59 .llseek = dcache_dir_lseek,
57}; 60};
58 61
59const struct inode_operations autofs4_indirect_root_inode_operations = { 62const struct inode_operations autofs4_dir_inode_operations = {
60 .lookup = autofs4_lookup, 63 .lookup = autofs4_lookup,
61 .unlink = autofs4_dir_unlink, 64 .unlink = autofs4_dir_unlink,
62 .symlink = autofs4_dir_symlink, 65 .symlink = autofs4_dir_symlink,
@@ -64,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = {
64 .rmdir = autofs4_dir_rmdir, 67 .rmdir = autofs4_dir_rmdir,
65}; 68};
66 69
67const struct inode_operations autofs4_direct_root_inode_operations = { 70const struct dentry_operations autofs4_dentry_operations = {
68 .lookup = autofs4_lookup, 71 .d_automount = autofs4_d_automount,
69 .unlink = autofs4_dir_unlink, 72 .d_manage = autofs4_d_manage,
70 .mkdir = autofs4_dir_mkdir, 73 .d_release = autofs4_dentry_release,
71 .rmdir = autofs4_dir_rmdir,
72 .follow_link = autofs4_follow_link,
73};
74
75const struct inode_operations autofs4_dir_inode_operations = {
76 .lookup = autofs4_lookup,
77 .unlink = autofs4_dir_unlink,
78 .symlink = autofs4_dir_symlink,
79 .mkdir = autofs4_dir_mkdir,
80 .rmdir = autofs4_dir_rmdir,
81}; 74};
82 75
83static void autofs4_add_active(struct dentry *dentry) 76static void autofs4_add_active(struct dentry *dentry)
@@ -112,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry)
112 return; 105 return;
113} 106}
114 107
115static unsigned int autofs4_need_mount(unsigned int flags)
116{
117 unsigned int res = 0;
118 if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
119 res = 1;
120 return res;
121}
122
123static int autofs4_dir_open(struct inode *inode, struct file *file) 108static int autofs4_dir_open(struct inode *inode, struct file *file)
124{ 109{
125 struct dentry *dentry = file->f_path.dentry; 110 struct dentry *dentry = file->f_path.dentry;
@@ -140,275 +125,41 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
140 * autofs file system so just let the libfs routines handle 125 * autofs file system so just let the libfs routines handle
141 * it. 126 * it.
142 */ 127 */
143 spin_lock(&dcache_lock); 128 spin_lock(&autofs4_lock);
129 spin_lock(&dentry->d_lock);
144 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 130 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
145 spin_unlock(&dcache_lock); 131 spin_unlock(&dentry->d_lock);
132 spin_unlock(&autofs4_lock);
146 return -ENOENT; 133 return -ENOENT;
147 } 134 }
148 spin_unlock(&dcache_lock); 135 spin_unlock(&dentry->d_lock);
136 spin_unlock(&autofs4_lock);
149 137
150out: 138out:
151 return dcache_dir_open(inode, file); 139 return dcache_dir_open(inode, file);
152} 140}
153 141
154static int try_to_fill_dentry(struct dentry *dentry, int flags) 142static void autofs4_dentry_release(struct dentry *de)
155{
156 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
157 struct autofs_info *ino = autofs4_dentry_ino(dentry);
158 int status;
159
160 DPRINTK("dentry=%p %.*s ino=%p",
161 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
162
163 /*
164 * Wait for a pending mount, triggering one if there
165 * isn't one already
166 */
167 if (dentry->d_inode == NULL) {
168 DPRINTK("waiting for mount name=%.*s",
169 dentry->d_name.len, dentry->d_name.name);
170
171 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
172
173 DPRINTK("mount done status=%d", status);
174
175 /* Turn this into a real negative dentry? */
176 if (status == -ENOENT) {
177 spin_lock(&sbi->fs_lock);
178 ino->flags &= ~AUTOFS_INF_PENDING;
179 spin_unlock(&sbi->fs_lock);
180 return status;
181 } else if (status) {
182 /* Return a negative dentry, but leave it "pending" */
183 return status;
184 }
185 /* Trigger mount for path component or follow link */
186 } else if (ino->flags & AUTOFS_INF_PENDING ||
187 autofs4_need_mount(flags)) {
188 DPRINTK("waiting for mount name=%.*s",
189 dentry->d_name.len, dentry->d_name.name);
190
191 spin_lock(&sbi->fs_lock);
192 ino->flags |= AUTOFS_INF_PENDING;
193 spin_unlock(&sbi->fs_lock);
194 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
195
196 DPRINTK("mount done status=%d", status);
197
198 if (status) {
199 spin_lock(&sbi->fs_lock);
200 ino->flags &= ~AUTOFS_INF_PENDING;
201 spin_unlock(&sbi->fs_lock);
202 return status;
203 }
204 }
205
206 /* Initialize expiry counter after successful mount */
207 ino->last_used = jiffies;
208
209 spin_lock(&sbi->fs_lock);
210 ino->flags &= ~AUTOFS_INF_PENDING;
211 spin_unlock(&sbi->fs_lock);
212
213 return 0;
214}
215
216/* For autofs direct mounts the follow link triggers the mount */
217static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
218{ 143{
219 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 144 struct autofs_info *ino = autofs4_dentry_ino(de);
220 struct autofs_info *ino = autofs4_dentry_ino(dentry); 145 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
221 int oz_mode = autofs4_oz_mode(sbi);
222 unsigned int lookup_type;
223 int status;
224
225 DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
226 dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
227 nd->flags);
228 /*
229 * For an expire of a covered direct or offset mount we need
230 * to break out of follow_down() at the autofs mount trigger
231 * (d_mounted--), so we can see the expiring flag, and manage
232 * the blocking and following here until the expire is completed.
233 */
234 if (oz_mode) {
235 spin_lock(&sbi->fs_lock);
236 if (ino->flags & AUTOFS_INF_EXPIRING) {
237 spin_unlock(&sbi->fs_lock);
238 /* Follow down to our covering mount. */
239 if (!follow_down(&nd->path))
240 goto done;
241 goto follow;
242 }
243 spin_unlock(&sbi->fs_lock);
244 goto done;
245 }
246
247 /* If an expire request is pending everyone must wait. */
248 autofs4_expire_wait(dentry);
249
250 /* We trigger a mount for almost all flags */
251 lookup_type = autofs4_need_mount(nd->flags);
252 spin_lock(&sbi->fs_lock);
253 spin_lock(&dcache_lock);
254 if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
255 spin_unlock(&dcache_lock);
256 spin_unlock(&sbi->fs_lock);
257 goto follow;
258 }
259
260 /*
261 * If the dentry contains directories then it is an autofs
262 * multi-mount with no root mount offset. So don't try to
263 * mount it again.
264 */
265 if (ino->flags & AUTOFS_INF_PENDING ||
266 (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
267 spin_unlock(&dcache_lock);
268 spin_unlock(&sbi->fs_lock);
269
270 status = try_to_fill_dentry(dentry, nd->flags);
271 if (status)
272 goto out_error;
273
274 goto follow;
275 }
276 spin_unlock(&dcache_lock);
277 spin_unlock(&sbi->fs_lock);
278follow:
279 /*
280 * If there is no root mount it must be an autofs
281 * multi-mount with no root offset so we don't need
282 * to follow it.
283 */
284 if (d_mountpoint(dentry)) {
285 if (!autofs4_follow_mount(&nd->path)) {
286 status = -ENOENT;
287 goto out_error;
288 }
289 }
290
291done:
292 return NULL;
293
294out_error:
295 path_put(&nd->path);
296 return ERR_PTR(status);
297}
298
299/*
300 * Revalidate is called on every cache lookup. Some of those
301 * cache lookups may actually happen while the dentry is not
302 * yet completely filled in, and revalidate has to delay such
303 * lookups..
304 */
305static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
306{
307 struct inode *dir = dentry->d_parent->d_inode;
308 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
309 int oz_mode = autofs4_oz_mode(sbi);
310 int flags = nd ? nd->flags : 0;
311 int status = 1;
312
313 /* Pending dentry */
314 spin_lock(&sbi->fs_lock);
315 if (autofs4_ispending(dentry)) {
316 /* The daemon never causes a mount to trigger */
317 spin_unlock(&sbi->fs_lock);
318
319 if (oz_mode)
320 return 1;
321
322 /*
323 * If the directory has gone away due to an expire
324 * we have been called as ->d_revalidate() and so
325 * we need to return false and proceed to ->lookup().
326 */
327 if (autofs4_expire_wait(dentry) == -EAGAIN)
328 return 0;
329
330 /*
331 * A zero status is success otherwise we have a
332 * negative error code.
333 */
334 status = try_to_fill_dentry(dentry, flags);
335 if (status == 0)
336 return 1;
337
338 return status;
339 }
340 spin_unlock(&sbi->fs_lock);
341
342 /* Negative dentry.. invalidate if "old" */
343 if (dentry->d_inode == NULL)
344 return 0;
345
346 /* Check for a non-mountpoint directory with no contents */
347 spin_lock(&dcache_lock);
348 if (S_ISDIR(dentry->d_inode->i_mode) &&
349 !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
350 DPRINTK("dentry=%p %.*s, emptydir",
351 dentry, dentry->d_name.len, dentry->d_name.name);
352 spin_unlock(&dcache_lock);
353
354 /* The daemon never causes a mount to trigger */
355 if (oz_mode)
356 return 1;
357
358 /*
359 * A zero status is success otherwise we have a
360 * negative error code.
361 */
362 status = try_to_fill_dentry(dentry, flags);
363 if (status == 0)
364 return 1;
365
366 return status;
367 }
368 spin_unlock(&dcache_lock);
369
370 return 1;
371}
372
373void autofs4_dentry_release(struct dentry *de)
374{
375 struct autofs_info *inf;
376 146
377 DPRINTK("releasing %p", de); 147 DPRINTK("releasing %p", de);
378 148
379 inf = autofs4_dentry_ino(de); 149 if (!ino)
380 de->d_fsdata = NULL; 150 return;
381
382 if (inf) {
383 struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
384
385 if (sbi) {
386 spin_lock(&sbi->lookup_lock);
387 if (!list_empty(&inf->active))
388 list_del(&inf->active);
389 if (!list_empty(&inf->expiring))
390 list_del(&inf->expiring);
391 spin_unlock(&sbi->lookup_lock);
392 }
393
394 inf->dentry = NULL;
395 inf->inode = NULL;
396 151
397 autofs4_free_ino(inf); 152 if (sbi) {
153 spin_lock(&sbi->lookup_lock);
154 if (!list_empty(&ino->active))
155 list_del(&ino->active);
156 if (!list_empty(&ino->expiring))
157 list_del(&ino->expiring);
158 spin_unlock(&sbi->lookup_lock);
398 } 159 }
399}
400 160
401/* For dentries of directories in the root dir */ 161 autofs4_free_ino(ino);
402static const struct dentry_operations autofs4_root_dentry_operations = { 162}
403 .d_revalidate = autofs4_revalidate,
404 .d_release = autofs4_dentry_release,
405};
406
407/* For other dentries */
408static const struct dentry_operations autofs4_dentry_operations = {
409 .d_revalidate = autofs4_revalidate,
410 .d_release = autofs4_dentry_release,
411};
412 163
413static struct dentry *autofs4_lookup_active(struct dentry *dentry) 164static struct dentry *autofs4_lookup_active(struct dentry *dentry)
414{ 165{
@@ -420,7 +171,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
420 const unsigned char *str = name->name; 171 const unsigned char *str = name->name;
421 struct list_head *p, *head; 172 struct list_head *p, *head;
422 173
423 spin_lock(&dcache_lock); 174 spin_lock(&autofs4_lock);
424 spin_lock(&sbi->lookup_lock); 175 spin_lock(&sbi->lookup_lock);
425 head = &sbi->active_list; 176 head = &sbi->active_list;
426 list_for_each(p, head) { 177 list_for_each(p, head) {
@@ -434,7 +185,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
434 spin_lock(&active->d_lock); 185 spin_lock(&active->d_lock);
435 186
436 /* Already gone? */ 187 /* Already gone? */
437 if (atomic_read(&active->d_count) == 0) 188 if (active->d_count == 0)
438 goto next; 189 goto next;
439 190
440 qstr = &active->d_name; 191 qstr = &active->d_name;
@@ -450,17 +201,17 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
450 goto next; 201 goto next;
451 202
452 if (d_unhashed(active)) { 203 if (d_unhashed(active)) {
453 dget(active); 204 dget_dlock(active);
454 spin_unlock(&active->d_lock); 205 spin_unlock(&active->d_lock);
455 spin_unlock(&sbi->lookup_lock); 206 spin_unlock(&sbi->lookup_lock);
456 spin_unlock(&dcache_lock); 207 spin_unlock(&autofs4_lock);
457 return active; 208 return active;
458 } 209 }
459next: 210next:
460 spin_unlock(&active->d_lock); 211 spin_unlock(&active->d_lock);
461 } 212 }
462 spin_unlock(&sbi->lookup_lock); 213 spin_unlock(&sbi->lookup_lock);
463 spin_unlock(&dcache_lock); 214 spin_unlock(&autofs4_lock);
464 215
465 return NULL; 216 return NULL;
466} 217}
@@ -475,7 +226,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
475 const unsigned char *str = name->name; 226 const unsigned char *str = name->name;
476 struct list_head *p, *head; 227 struct list_head *p, *head;
477 228
478 spin_lock(&dcache_lock); 229 spin_lock(&autofs4_lock);
479 spin_lock(&sbi->lookup_lock); 230 spin_lock(&sbi->lookup_lock);
480 head = &sbi->expiring_list; 231 head = &sbi->expiring_list;
481 list_for_each(p, head) { 232 list_for_each(p, head) {
@@ -505,66 +256,261 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
505 goto next; 256 goto next;
506 257
507 if (d_unhashed(expiring)) { 258 if (d_unhashed(expiring)) {
508 dget(expiring); 259 dget_dlock(expiring);
509 spin_unlock(&expiring->d_lock); 260 spin_unlock(&expiring->d_lock);
510 spin_unlock(&sbi->lookup_lock); 261 spin_unlock(&sbi->lookup_lock);
511 spin_unlock(&dcache_lock); 262 spin_unlock(&autofs4_lock);
512 return expiring; 263 return expiring;
513 } 264 }
514next: 265next:
515 spin_unlock(&expiring->d_lock); 266 spin_unlock(&expiring->d_lock);
516 } 267 }
517 spin_unlock(&sbi->lookup_lock); 268 spin_unlock(&sbi->lookup_lock);
518 spin_unlock(&dcache_lock); 269 spin_unlock(&autofs4_lock);
519 270
520 return NULL; 271 return NULL;
521} 272}
522 273
274static int autofs4_mount_wait(struct dentry *dentry)
275{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status;
279
280 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 }
288 return 0;
289}
290
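autofs4_mount_wait() above is the classic wait-on-a-pending-flag idiom: any caller that sees AUTOFS_INF_PENDING parks until the daemon answers. A minimal sketch of the same idiom using a completion; struct foo_mount, FOO_PENDING and the helper name are invented for illustration, not autofs code:

	struct foo_mount {
		unsigned long flags;	/* FOO_PENDING set while a request is in flight */
		struct completion done;	/* completed when the daemon replies */
	};
	#define FOO_PENDING 0

	static int foo_mount_wait(struct foo_mount *m)
	{
		/* sleep only if a request is actually outstanding */
		if (test_bit(FOO_PENDING, &m->flags))
			wait_for_completion(&m->done);
		return 0;
	}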
291static int do_expire_wait(struct dentry *dentry)
292{
293 struct dentry *expiring;
294
295 expiring = autofs4_lookup_expiring(dentry);
296 if (!expiring)
297 return autofs4_expire_wait(dentry);
298 else {
299 /*
300 * If we are racing with expire the request might not
301 * be quite complete, but the directory has been removed
 302 * so it must have been successful; just wait for it.
303 */
304 autofs4_expire_wait(expiring);
305 autofs4_del_expiring(expiring);
306 dput(expiring);
307 }
308 return 0;
309}
310
311static struct dentry *autofs4_mountpoint_changed(struct path *path)
312{
313 struct dentry *dentry = path->dentry;
314 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
315
316 /*
317 * If this is an indirect mount the dentry could have gone away
318 * as a result of an expire and a new one created.
319 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent;
322 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new)
324 return NULL;
325 dput(path->dentry);
326 path->dentry = new;
327 }
328 return path->dentry;
329}
330
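autofs4_mountpoint_changed() above is an instance of the dcache re-lookup idiom: if the dentry we started with was unhashed behind our back (removed and re-created by the daemon), ask the dcache for the current child of the same name. The bare step, as a hedged sketch rather than the autofs code itself:

	static void relookup_if_unhashed(struct path *path)
	{
		struct dentry *cur = path->dentry;
		struct dentry *new;

		if (!d_unhashed(cur))
			return;
		new = d_lookup(cur->d_parent, &cur->d_name);
		if (new) {
			dput(cur);		/* drop the stale reference */
			path->dentry = new;	/* d_lookup() returned it with a reference held */
		}
	}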
331static struct vfsmount *autofs4_d_automount(struct path *path)
332{
333 struct dentry *dentry = path->dentry;
334 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
335 struct autofs_info *ino = autofs4_dentry_ino(dentry);
336 int status;
337
338 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name);
340
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi))
355 return NULL;
356
357 /*
358 * If an expire request is pending everyone must wait.
359 * If the expire fails we're still mounted so continue
360 * the follow and return. A return of -EAGAIN (which only
361 * happens with indirect mounts) means the expire completed
362 * and the directory was removed, so just go ahead and try
363 * the mount.
364 */
365 status = do_expire_wait(dentry);
366 if (status && status != -EAGAIN)
367 return NULL;
368
369 /* Callback to the daemon to perform the mount or wait */
370 spin_lock(&sbi->fs_lock);
371 if (ino->flags & AUTOFS_INF_PENDING) {
372 spin_unlock(&sbi->fs_lock);
373 status = autofs4_mount_wait(dentry);
374 if (status)
375 return ERR_PTR(status);
376 spin_lock(&sbi->fs_lock);
377 goto done;
378 }
379
380 /*
381 * If the dentry is a symlink it's equivalent to a directory
382 * having d_mountpoint() true, so there's no need to call back
383 * to the daemon.
384 */
385 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
386 goto done;
387 if (!d_mountpoint(dentry)) {
388 /*
389 * It's possible that user space hasn't removed directories
390 * after umounting a rootless multi-mount, although it
391 * should. For v5 have_submounts() is sufficient to handle
392 * this because the leaves of the directory tree under the
393 * mount never trigger mounts themselves (they have an autofs
394 * trigger mount mounted on them). But v4 pseudo direct mounts
 395 * do need the leaves to trigger mounts. In this case we
 396 * have no choice but to use the list_empty() check and
 397 * require user space to behave.
398 */
399 if (sbi->version > 4) {
400 if (have_submounts(dentry))
401 goto done;
402 } else {
403 spin_lock(&dentry->d_lock);
404 if (!list_empty(&dentry->d_subdirs)) {
405 spin_unlock(&dentry->d_lock);
406 goto done;
407 }
408 spin_unlock(&dentry->d_lock);
409 }
410 ino->flags |= AUTOFS_INF_PENDING;
411 spin_unlock(&sbi->fs_lock);
412 status = autofs4_mount_wait(dentry);
413 if (status)
414 return ERR_PTR(status);
415 spin_lock(&sbi->fs_lock);
416 ino->flags &= ~AUTOFS_INF_PENDING;
417 }
418done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /*
421 * Any needed mounting has been completed and the path updated
422 * so turn this into a normal dentry so we don't continually
423 * call ->d_automount() and ->d_manage().
424 */
425 spin_lock(&dentry->d_lock);
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during
431 * the follow.
432 */
433 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
436 __managed_dentry_clear_automount(dentry);
437 spin_unlock(&dentry->d_lock);
438 }
439 spin_unlock(&sbi->fs_lock);
440
441 /* Mount succeeded, check if we ended up with a new dentry */
442 dentry = autofs4_mountpoint_changed(path);
443 if (!dentry)
444 return ERR_PTR(-ENOENT);
445
446 return NULL;
447}
448
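For reference, the ->d_automount() return contract the function above relies on: NULL when there is nothing left for the VFS to attach (autofs' usual case, since the daemon mounts in place), an ERR_PTR() to fail the walk, or a vfsmount for the VFS to attach itself. A bare-bones sketch; foo_trigger_mount() is a hypothetical helper standing in for the daemon callback:

	static struct vfsmount *foo_d_automount(struct path *path)
	{
		struct vfsmount *mnt;

		mnt = foo_trigger_mount(path);	/* hypothetical: ask userspace to mount */
		if (IS_ERR(mnt))
			return ERR_CAST(mnt);	/* abort the path walk with an error */
		if (!mnt)
			return NULL;		/* mount already attached in place */
		return mnt;			/* hand the mount to the VFS to attach */
	}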
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
450{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452
453 DPRINTK("dentry=%p %.*s",
454 dentry, dentry->d_name.len, dentry->d_name.name);
455
456 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) {
458 if (!d_mountpoint(dentry))
459 return -EISDIR;
460 return 0;
461 }
462
463 /* We need to sleep, so we need pathwalk to be in ref-mode */
464 if (rcu_walk)
465 return -ECHILD;
466
467 /* Wait for pending expires */
468 do_expire_wait(dentry);
469
470 /*
471 * This dentry may be under construction so wait on mount
472 * completion.
473 */
474 return autofs4_mount_wait(dentry);
475}
476
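The -ECHILD return above is the general RCU-walk convention these hooks follow: a per-dentry callback that may need to sleep must punt in rcu_walk mode so the VFS restarts the walk in ref-walk mode with proper references held. Sketched below, with foo_wait_for_daemon() as an invented stand-in for any blocking wait:

	static int foo_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
	{
		if (rcu_walk)
			return -ECHILD;		/* cannot sleep under rcu_read_lock() */

		foo_wait_for_daemon(dentry);	/* blocking is safe in ref-walk mode */
		return 0;
	}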
523/* Lookups in the root directory */ 477/* Lookups in the root directory */
524static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 478static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
525{ 479{
526 struct autofs_sb_info *sbi; 480 struct autofs_sb_info *sbi;
527 struct autofs_info *ino; 481 struct autofs_info *ino;
528 struct dentry *expiring, *active; 482 struct dentry *active;
529 int oz_mode;
530 483
531 DPRINTK("name = %.*s", 484 DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
532 dentry->d_name.len, dentry->d_name.name);
533 485
534 /* File name too long to exist */ 486 /* File name too long to exist */
535 if (dentry->d_name.len > NAME_MAX) 487 if (dentry->d_name.len > NAME_MAX)
536 return ERR_PTR(-ENAMETOOLONG); 488 return ERR_PTR(-ENAMETOOLONG);
537 489
538 sbi = autofs4_sbi(dir->i_sb); 490 sbi = autofs4_sbi(dir->i_sb);
539 oz_mode = autofs4_oz_mode(sbi);
540 491
541 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 492 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
542 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 493 current->pid, task_pgrp_nr(current), sbi->catatonic,
494 autofs4_oz_mode(sbi));
543 495
544 active = autofs4_lookup_active(dentry); 496 active = autofs4_lookup_active(dentry);
545 if (active) { 497 if (active) {
546 dentry = active; 498 return active;
547 ino = autofs4_dentry_ino(dentry);
548 } else { 499 } else {
549 /* 500 /*
550 * Mark the dentry incomplete but don't hash it. We do this 501 * A dentry that is not within the root can never trigger a
551 * to serialize our inode creation operations (symlink and 502 * mount operation, unless the directory already exists, so we
552 * mkdir) which prevents deadlock during the callback to 503 * can return fail immediately. The daemon however does need
553 * the daemon. Subsequent user space lookups for the same 504 * to create directories within the file system.
554 * dentry are placed on the wait queue while the daemon
 555 * itself is allowed passage unrestricted so the create
556 * operation itself can then hash the dentry. Finally,
557 * we check for the hashed dentry and return the newly
558 * hashed dentry.
559 */ 505 */
560 dentry->d_op = &autofs4_root_dentry_operations; 506 if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
507 return ERR_PTR(-ENOENT);
561 508
562 /* 509 /* Mark entries in the root as mount triggers */
563 * And we need to ensure that the same dentry is used for 510 if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
564 * all following lookup calls until it is hashed so that 511 __managed_dentry_set_managed(dentry);
565 * the dentry flags are persistent throughout the request. 512
566 */ 513 ino = autofs4_new_ino(sbi);
567 ino = autofs4_init_ino(NULL, sbi, 0555);
568 if (!ino) 514 if (!ino)
569 return ERR_PTR(-ENOMEM); 515 return ERR_PTR(-ENOMEM);
570 516
@@ -575,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
575 521
576 d_instantiate(dentry, NULL); 522 d_instantiate(dentry, NULL);
577 } 523 }
578
579 if (!oz_mode) {
580 mutex_unlock(&dir->i_mutex);
581 expiring = autofs4_lookup_expiring(dentry);
582 if (expiring) {
583 /*
584 * If we are racing with expire the request might not
585 * be quite complete but the directory has been removed
586 * so it must have been successful, so just wait for it.
587 */
588 autofs4_expire_wait(expiring);
589 autofs4_del_expiring(expiring);
590 dput(expiring);
591 }
592
593 spin_lock(&sbi->fs_lock);
594 ino->flags |= AUTOFS_INF_PENDING;
595 spin_unlock(&sbi->fs_lock);
596 if (dentry->d_op && dentry->d_op->d_revalidate)
597 (dentry->d_op->d_revalidate)(dentry, nd);
598 mutex_lock(&dir->i_mutex);
599 }
600
601 /*
602 * If we are still pending, check if we had to handle
603 * a signal. If so we can force a restart..
604 */
605 if (ino->flags & AUTOFS_INF_PENDING) {
606 /* See if we were interrupted */
607 if (signal_pending(current)) {
608 sigset_t *sigset = &current->pending.signal;
609 if (sigismember (sigset, SIGKILL) ||
610 sigismember (sigset, SIGQUIT) ||
611 sigismember (sigset, SIGINT)) {
612 if (active)
613 dput(active);
614 return ERR_PTR(-ERESTARTNOINTR);
615 }
616 }
617 if (!oz_mode) {
618 spin_lock(&sbi->fs_lock);
619 ino->flags &= ~AUTOFS_INF_PENDING;
620 spin_unlock(&sbi->fs_lock);
621 }
622 }
623
624 /*
625 * If this dentry is unhashed, then we shouldn't honour this
626 * lookup. Returning ENOENT here doesn't do the right thing
627 * for all system calls, but it should be OK for the operations
628 * we permit from an autofs.
629 */
630 if (!oz_mode && d_unhashed(dentry)) {
631 /*
632 * A user space application can (and has done in the past)
633 * remove and re-create this directory during the callback.
634 * This can leave us with an unhashed dentry, but a
635 * successful mount! So we need to perform another
636 * cached lookup in case the dentry now exists.
637 */
638 struct dentry *parent = dentry->d_parent;
639 struct dentry *new = d_lookup(parent, &dentry->d_name);
640 if (new != NULL)
641 dentry = new;
642 else
643 dentry = ERR_PTR(-ENOENT);
644
645 if (active)
646 dput(active);
647
648 return dentry;
649 }
650
651 if (active)
652 return active;
653
654 return NULL; 524 return NULL;
655} 525}
656 526
@@ -662,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir,
662 struct autofs_info *ino = autofs4_dentry_ino(dentry); 532 struct autofs_info *ino = autofs4_dentry_ino(dentry);
663 struct autofs_info *p_ino; 533 struct autofs_info *p_ino;
664 struct inode *inode; 534 struct inode *inode;
535 size_t size = strlen(symname);
665 char *cp; 536 char *cp;
666 537
667 DPRINTK("%s <- %.*s", symname, 538 DPRINTK("%s <- %.*s", symname,
@@ -670,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir,
670 if (!autofs4_oz_mode(sbi)) 541 if (!autofs4_oz_mode(sbi))
671 return -EACCES; 542 return -EACCES;
672 543
673 ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); 544 BUG_ON(!ino);
674 if (!ino) 545
675 return -ENOMEM; 546 autofs4_clean_ino(ino);
676 547
677 autofs4_del_active(dentry); 548 autofs4_del_active(dentry);
678 549
679 ino->size = strlen(symname); 550 cp = kmalloc(size + 1, GFP_KERNEL);
680 cp = kmalloc(ino->size + 1, GFP_KERNEL); 551 if (!cp)
681 if (!cp) {
682 if (!dentry->d_fsdata)
683 kfree(ino);
684 return -ENOMEM; 552 return -ENOMEM;
685 }
686 553
687 strcpy(cp, symname); 554 strcpy(cp, symname);
688 555
689 inode = autofs4_get_inode(dir->i_sb, ino); 556 inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
690 if (!inode) { 557 if (!inode) {
691 kfree(cp); 558 kfree(cp);
692 if (!dentry->d_fsdata) 559 if (!dentry->d_fsdata)
693 kfree(ino); 560 kfree(ino);
694 return -ENOMEM; 561 return -ENOMEM;
695 } 562 }
563 inode->i_private = cp;
564 inode->i_size = size;
696 d_add(dentry, inode); 565 d_add(dentry, inode);
697 566
698 if (dir == dir->i_sb->s_root->d_inode) 567 dget(dentry);
699 dentry->d_op = &autofs4_root_dentry_operations;
700 else
701 dentry->d_op = &autofs4_dentry_operations;
702
703 dentry->d_fsdata = ino;
704 ino->dentry = dget(dentry);
705 atomic_inc(&ino->count); 568 atomic_inc(&ino->count);
706 p_ino = autofs4_dentry_ino(dentry->d_parent); 569 p_ino = autofs4_dentry_ino(dentry->d_parent);
707 if (p_ino && dentry->d_parent != dentry) 570 if (p_ino && dentry->d_parent != dentry)
708 atomic_inc(&p_ino->count); 571 atomic_inc(&p_ino->count);
709 ino->inode = inode;
710 572
711 ino->u.symlink = cp;
712 dir->i_mtime = CURRENT_TIME; 573 dir->i_mtime = CURRENT_TIME;
713 574
714 return 0; 575 return 0;
@@ -751,16 +612,68 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
751 612
752 dir->i_mtime = CURRENT_TIME; 613 dir->i_mtime = CURRENT_TIME;
753 614
754 spin_lock(&dcache_lock); 615 spin_lock(&autofs4_lock);
755 autofs4_add_expiring(dentry); 616 autofs4_add_expiring(dentry);
756 spin_lock(&dentry->d_lock); 617 spin_lock(&dentry->d_lock);
757 __d_drop(dentry); 618 __d_drop(dentry);
758 spin_unlock(&dentry->d_lock); 619 spin_unlock(&dentry->d_lock);
759 spin_unlock(&dcache_lock); 620 spin_unlock(&autofs4_lock);
760 621
761 return 0; 622 return 0;
762} 623}
763 624
625/*
626 * Version 4 of autofs provides a pseudo direct mount implementation
627 * that relies on directories at the leaves of a directory tree under
628 * an indirect mount to trigger mounts. To allow for this we need to
629 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
630 * of the directory tree. There is no need to clear the automount flag
631 * following a mount or restore it after an expire because these mounts
 632 * are always covered. However, it is necessary to ensure that these
633 * flags are clear on non-empty directories to avoid unnecessary calls
634 * during path walks.
635 */
636static void autofs_set_leaf_automount_flags(struct dentry *dentry)
637{
638 struct dentry *parent;
639
 640 /* root and dentries in the root are already handled */
641 if (IS_ROOT(dentry->d_parent))
642 return;
643
644 managed_dentry_set_managed(dentry);
645
646 parent = dentry->d_parent;
 647 /* only consider parents below dentries in the root */
648 if (IS_ROOT(parent->d_parent))
649 return;
650 managed_dentry_clear_managed(parent);
651 return;
652}
653
654static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
655{
656 struct list_head *d_child;
657 struct dentry *parent;
658
 659 /* flags for dentries in the root are handled elsewhere */
660 if (IS_ROOT(dentry->d_parent))
661 return;
662
663 managed_dentry_clear_managed(dentry);
664
665 parent = dentry->d_parent;
 666 /* only consider parents below dentries in the root */
667 if (IS_ROOT(parent->d_parent))
668 return;
669 d_child = &dentry->d_u.d_child;
670 /* Set parent managed if it's becoming empty */
671 if (d_child->next == &parent->d_subdirs &&
672 d_child->prev == &parent->d_subdirs)
673 managed_dentry_set_managed(parent);
674 return;
675}
676
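The d_child->next/d_child->prev comparison above simply asks "is this dentry the only entry on its parent's d_subdirs list". A self-contained restatement of the test (the kernel's list_is_singular() checks the same condition from the head's side):

	struct list_head { struct list_head *next, *prev; };

	/* True when @entry is the sole element on the circular list headed by @head. */
	static int only_entry_on_list(const struct list_head *entry,
				      const struct list_head *head)
	{
		return entry->next == head && entry->prev == head;
	}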
764static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) 677static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
765{ 678{
766 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 679 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
@@ -773,16 +686,23 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
773 if (!autofs4_oz_mode(sbi)) 686 if (!autofs4_oz_mode(sbi))
774 return -EACCES; 687 return -EACCES;
775 688
776 spin_lock(&dcache_lock); 689 spin_lock(&autofs4_lock);
690 spin_lock(&sbi->lookup_lock);
691 spin_lock(&dentry->d_lock);
777 if (!list_empty(&dentry->d_subdirs)) { 692 if (!list_empty(&dentry->d_subdirs)) {
778 spin_unlock(&dcache_lock); 693 spin_unlock(&dentry->d_lock);
694 spin_unlock(&sbi->lookup_lock);
695 spin_unlock(&autofs4_lock);
779 return -ENOTEMPTY; 696 return -ENOTEMPTY;
780 } 697 }
781 autofs4_add_expiring(dentry); 698 __autofs4_add_expiring(dentry);
782 spin_lock(&dentry->d_lock); 699 spin_unlock(&sbi->lookup_lock);
783 __d_drop(dentry); 700 __d_drop(dentry);
784 spin_unlock(&dentry->d_lock); 701 spin_unlock(&dentry->d_lock);
785 spin_unlock(&dcache_lock); 702 spin_unlock(&autofs4_lock);
703
704 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry);
786 706
787 if (atomic_dec_and_test(&ino->count)) { 707 if (atomic_dec_and_test(&ino->count)) {
788 p_ino = autofs4_dentry_ino(dentry->d_parent); 708 p_ino = autofs4_dentry_ino(dentry->d_parent);
@@ -812,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
812 DPRINTK("dentry %p, creating %.*s", 732 DPRINTK("dentry %p, creating %.*s",
813 dentry, dentry->d_name.len, dentry->d_name.name); 733 dentry, dentry->d_name.len, dentry->d_name.name);
814 734
815 ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); 735 BUG_ON(!ino);
816 if (!ino) 736
817 return -ENOMEM; 737 autofs4_clean_ino(ino);
818 738
819 autofs4_del_active(dentry); 739 autofs4_del_active(dentry);
820 740
821 inode = autofs4_get_inode(dir->i_sb, ino); 741 inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
822 if (!inode) { 742 if (!inode)
823 if (!dentry->d_fsdata)
824 kfree(ino);
825 return -ENOMEM; 743 return -ENOMEM;
826 }
827 d_add(dentry, inode); 744 d_add(dentry, inode);
828 745
829 if (dir == dir->i_sb->s_root->d_inode) 746 if (sbi->version < 5)
830 dentry->d_op = &autofs4_root_dentry_operations; 747 autofs_set_leaf_automount_flags(dentry);
831 else
832 dentry->d_op = &autofs4_dentry_operations;
833 748
834 dentry->d_fsdata = ino; 749 dget(dentry);
835 ino->dentry = dget(dentry);
836 atomic_inc(&ino->count); 750 atomic_inc(&ino->count);
837 p_ino = autofs4_dentry_ino(dentry->d_parent); 751 p_ino = autofs4_dentry_ino(dentry->d_parent);
838 if (p_ino && dentry->d_parent != dentry) 752 if (p_ino && dentry->d_parent != dentry)
839 atomic_inc(&p_ino->count); 753 atomic_inc(&p_ino->count);
840 ino->inode = inode;
841 inc_nlink(dir); 754 inc_nlink(dir);
842 dir->i_mtime = CURRENT_TIME; 755 dir->i_mtime = CURRENT_TIME;
843 756
@@ -919,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
919int is_autofs4_dentry(struct dentry *dentry) 832int is_autofs4_dentry(struct dentry *dentry)
920{ 833{
921 return dentry && dentry->d_inode && 834 return dentry && dentry->d_inode &&
922 (dentry->d_op == &autofs4_root_dentry_operations || 835 dentry->d_op == &autofs4_dentry_operations &&
923 dentry->d_op == &autofs4_dentry_operations) &&
924 dentry->d_fsdata != NULL; 836 dentry->d_fsdata != NULL;
925} 837}
926 838
@@ -981,14 +893,8 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
981static long autofs4_root_ioctl(struct file *filp, 893static long autofs4_root_ioctl(struct file *filp,
982 unsigned int cmd, unsigned long arg) 894 unsigned int cmd, unsigned long arg)
983{ 895{
984 long ret;
985 struct inode *inode = filp->f_dentry->d_inode; 896 struct inode *inode = filp->f_dentry->d_inode;
986 897 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
987 lock_kernel();
988 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
989 unlock_kernel();
990
991 return ret;
992} 898}
993 899
994#ifdef CONFIG_COMPAT 900#ifdef CONFIG_COMPAT
@@ -998,13 +904,11 @@ static long autofs4_root_compat_ioctl(struct file *filp,
998 struct inode *inode = filp->f_path.dentry->d_inode; 904 struct inode *inode = filp->f_path.dentry->d_inode;
999 int ret; 905 int ret;
1000 906
1001 lock_kernel();
1002 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) 907 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
1003 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 908 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
1004 else 909 else
1005 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, 910 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
1006 (unsigned long)compat_ptr(arg)); 911 (unsigned long)compat_ptr(arg));
1007 unlock_kernel();
1008 912
1009 return ret; 913 return ret;
1010} 914}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index b4ea82934d2e..f27c094a1919 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,8 +14,7 @@
14 14
15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) 15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
16{ 16{
17 struct autofs_info *ino = autofs4_dentry_ino(dentry); 17 nd_set_link(nd, dentry->d_inode->i_private);
18 nd_set_link(nd, (char *)ino->u.symlink);
19 return NULL; 18 return NULL;
20} 19}
21 20
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 2341375386f8..56010056b2e6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -186,16 +186,26 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
186{ 186{
187 struct dentry *root = sbi->sb->s_root; 187 struct dentry *root = sbi->sb->s_root;
188 struct dentry *tmp; 188 struct dentry *tmp;
189 char *buf = *name; 189 char *buf;
190 char *p; 190 char *p;
191 int len = 0; 191 int len;
192 unsigned seq;
192 193
193 spin_lock(&dcache_lock); 194rename_retry:
195 buf = *name;
196 len = 0;
197
198 seq = read_seqbegin(&rename_lock);
199 rcu_read_lock();
200 spin_lock(&autofs4_lock);
194 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) 201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
195 len += tmp->d_name.len + 1; 202 len += tmp->d_name.len + 1;
196 203
197 if (!len || --len > NAME_MAX) { 204 if (!len || --len > NAME_MAX) {
198 spin_unlock(&dcache_lock); 205 spin_unlock(&autofs4_lock);
206 rcu_read_unlock();
207 if (read_seqretry(&rename_lock, seq))
208 goto rename_retry;
199 return 0; 209 return 0;
200 } 210 }
201 211
@@ -208,7 +218,10 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
208 p -= tmp->d_name.len; 218 p -= tmp->d_name.len;
209 strncpy(p, tmp->d_name.name, tmp->d_name.len); 219 strncpy(p, tmp->d_name.name, tmp->d_name.len);
210 } 220 }
211 spin_unlock(&dcache_lock); 221 spin_unlock(&autofs4_lock);
222 rcu_read_unlock();
223 if (read_seqretry(&rename_lock, seq))
224 goto rename_retry;
212 225
213 return len; 226 return len;
214} 227}
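The new loop is the standard seqlock read pattern: sample rename_lock, walk the ->d_parent chain under RCU, and redo the whole walk if a concurrent rename bumped the sequence count. Reduced to its skeleton, with build_path() as a made-up stand-in for the two passes above:

	unsigned seq;
	int len;

	do {
		seq = read_seqbegin(&rename_lock);
		rcu_read_lock();
		len = build_path(dentry, root, buf);	/* walks ->d_parent toward root */
		rcu_read_unlock();
	} while (read_seqretry(&rename_lock, seq));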
@@ -296,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait,
296 * completed while we waited on the mutex ... 309 * completed while we waited on the mutex ...
297 */ 310 */
298 if (notify == NFY_MOUNT) { 311 if (notify == NFY_MOUNT) {
312 struct dentry *new = NULL;
313 int valid = 1;
314
299 /* 315 /*
300 * If the dentry was successfully mounted while we slept 316 * If the dentry was successfully mounted while we slept
301 * on the wait queue mutex we can return success. If it 317 * on the wait queue mutex we can return success. If it
@@ -303,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait,
 303 * a multi-mount with no mount at its base) we can 319 * a multi-mount with no mount at its base) we can
304 * continue on and create a new request. 320 * continue on and create a new request.
305 */ 321 */
322 if (!IS_ROOT(dentry)) {
323 if (dentry->d_inode && d_unhashed(dentry)) {
324 struct dentry *parent = dentry->d_parent;
325 new = d_lookup(parent, &dentry->d_name);
326 if (new)
327 dentry = new;
328 }
329 }
306 if (have_submounts(dentry)) 330 if (have_submounts(dentry))
307 return 0; 331 valid = 0;
332
333 if (new)
334 dput(new);
335 return valid;
308 } 336 }
309 337
310 return 1; 338 return 1;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f024d8aaddef..9ad2369d9e35 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
229 return -EIO; 229 return -EIO;
230} 230}
231 231
232static int bad_inode_permission(struct inode *inode, int mask) 232static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
233{ 233{
234 if (flags & IPERM_FLAG_RCU)
235 return -ECHILD;
236
234 return -EIO; 237 return -EIO;
235} 238}
236 239
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 6cb84d896d05..27223878ba9f 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
102} 102}
103 103
104static inline befs_data_stream 104static inline befs_data_stream
105fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n) 105fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n)
106{ 106{
107 befs_data_stream data; 107 befs_data_stream data;
108 int i; 108 int i;
109 109
110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) 110 for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i)
111 data.direct[i] = fsrun_to_cpu(sb, n.direct[i]); 111 data.direct[i] = fsrun_to_cpu(sb, n->direct[i]);
112 112
113 data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range); 113 data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range);
114 data.indirect = fsrun_to_cpu(sb, n.indirect); 114 data.indirect = fsrun_to_cpu(sb, n->indirect);
115 data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range); 115 data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range);
116 data.double_indirect = fsrun_to_cpu(sb, n.double_indirect); 116 data.double_indirect = fsrun_to_cpu(sb, n->double_indirect);
117 data.max_double_indirect_range = fs64_to_cpu(sb, 117 data.max_double_indirect_range = fs64_to_cpu(sb,
118 n. 118 n->
119 max_double_indirect_range); 119 max_double_indirect_range);
120 data.size = fs64_to_cpu(sb, n.size); 120 data.size = fs64_to_cpu(sb, n->size);
121 121
122 return data; 122 return data;
123} 123}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index dc39d2824885..b1d0c794747b 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb)
284 return &bi->vfs_inode; 284 return &bi->vfs_inode;
285} 285}
286 286
287static void 287static void befs_i_callback(struct rcu_head *head)
288befs_destroy_inode(struct inode *inode)
289{ 288{
289 struct inode *inode = container_of(head, struct inode, i_rcu);
290 INIT_LIST_HEAD(&inode->i_dentry);
290 kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); 291 kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
291} 292}
292 293
294static void befs_destroy_inode(struct inode *inode)
295{
296 call_rcu(&inode->i_rcu, befs_i_callback);
297}
298
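This conversion (repeated below for bfs and the bdev pseudo-filesystem) defers the inode free to an RCU grace period so that lock-free RCU-walk lookups still holding a pointer never touch freed memory. The pattern for a hypothetical "foo" filesystem; foo_inode_cachep and FOO_I() are invented names:

	static void foo_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		INIT_LIST_HEAD(&inode->i_dentry);
		kmem_cache_free(foo_inode_cachep, FOO_I(inode));
	}

	static void foo_destroy_inode(struct inode *inode)
	{
		call_rcu(&inode->i_rcu, foo_i_callback);
	}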
293static void init_once(void *foo) 299static void init_once(void *foo)
294{ 300{
295 struct befs_inode_info *bi = (struct befs_inode_info *) foo; 301 struct befs_inode_info *bi = (struct befs_inode_info *) foo;
@@ -384,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
384 int num_blks; 390 int num_blks;
385 391
386 befs_ino->i_data.ds = 392 befs_ino->i_data.ds =
387 fsds_to_cpu(sb, raw_inode->data.datastream); 393 fsds_to_cpu(sb, &raw_inode->data.datastream);
388 394
389 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds); 395 num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds);
390 inode->i_blocks = 396 inode->i_blocks =
@@ -913,18 +919,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
913 return 0; 919 return 0;
914} 920}
915 921
916static int 922static struct dentry *
917befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, 923befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
918 void *data, struct vfsmount *mnt) 924 void *data)
919{ 925{
920 return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super, 926 return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
921 mnt);
922} 927}
923 928
924static struct file_system_type befs_fs_type = { 929static struct file_system_type befs_fs_type = {
925 .owner = THIS_MODULE, 930 .owner = THIS_MODULE,
926 .name = "befs", 931 .name = "befs",
927 .get_sb = befs_get_sb, 932 .mount = befs_mount,
928 .kill_sb = kill_block_super, 933 .kill_sb = kill_block_super,
929 .fs_flags = FS_REQUIRES_DEV, 934 .fs_flags = FS_REQUIRES_DEV,
930}; 935};
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b779..685ecff3ab31 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
176 inc_nlink(inode); 176 inc_nlink(inode);
177 inode->i_ctime = CURRENT_TIME_SEC; 177 inode->i_ctime = CURRENT_TIME_SEC;
178 mark_inode_dirty(inode); 178 mark_inode_dirty(inode);
179 atomic_inc(&inode->i_count); 179 ihold(inode);
180 d_instantiate(new, inode); 180 d_instantiate(new, inode);
181 mutex_unlock(&info->bfs_lock); 181 mutex_unlock(&info->bfs_lock);
182 return 0; 182 return 0;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index c4daf0f5fc02..a8e37f81d097 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -12,7 +12,6 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include <linux/vfs.h> 16#include <linux/vfs.h>
18#include <linux/writeback.h> 17#include <linux/writeback.h>
@@ -215,14 +214,10 @@ static void bfs_put_super(struct super_block *s)
215 if (!info) 214 if (!info)
216 return; 215 return;
217 216
218 lock_kernel();
219
220 mutex_destroy(&info->bfs_lock); 217 mutex_destroy(&info->bfs_lock);
221 kfree(info->si_imap); 218 kfree(info->si_imap);
222 kfree(info); 219 kfree(info);
223 s->s_fs_info = NULL; 220 s->s_fs_info = NULL;
224
225 unlock_kernel();
226} 221}
227 222
228static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) 223static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -253,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb)
253 return &bi->vfs_inode; 248 return &bi->vfs_inode;
254} 249}
255 250
256static void bfs_destroy_inode(struct inode *inode) 251static void bfs_i_callback(struct rcu_head *head)
257{ 252{
253 struct inode *inode = container_of(head, struct inode, i_rcu);
254 INIT_LIST_HEAD(&inode->i_dentry);
258 kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); 255 kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
259} 256}
260 257
258static void bfs_destroy_inode(struct inode *inode)
259{
260 call_rcu(&inode->i_rcu, bfs_i_callback);
261}
262
261static void init_once(void *foo) 263static void init_once(void *foo)
262{ 264{
263 struct bfs_inode_info *bi = foo; 265 struct bfs_inode_info *bi = foo;
@@ -455,16 +457,16 @@ out:
455 return ret; 457 return ret;
456} 458}
457 459
458static int bfs_get_sb(struct file_system_type *fs_type, 460static struct dentry *bfs_mount(struct file_system_type *fs_type,
459 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 461 int flags, const char *dev_name, void *data)
460{ 462{
461 return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt); 463 return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
462} 464}
463 465
464static struct file_system_type bfs_fs_type = { 466static struct file_system_type bfs_fs_type = {
465 .owner = THIS_MODULE, 467 .owner = THIS_MODULE,
466 .name = "bfs", 468 .name = "bfs",
467 .get_sb = bfs_get_sb, 469 .mount = bfs_mount,
468 .kill_sb = kill_block_super, 470 .kill_sb = kill_block_super,
469 .fs_flags = FS_REQUIRES_DEV, 471 .fs_flags = FS_REQUIRES_DEV,
470}; 472};
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 535e763ab1a6..d5b640ba6cb1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm);
66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) 66#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
67 67
68static struct linux_binfmt elf_format = { 68static struct linux_binfmt elf_format = {
69 .module = THIS_MODULE, 69 .module = THIS_MODULE,
70 .load_binary = load_elf_binary, 70 .load_binary = load_elf_binary,
71 .load_shlib = load_elf_library, 71 .load_shlib = load_elf_library,
72 .core_dump = elf_core_dump, 72 .core_dump = elf_core_dump,
73 .min_coredump = ELF_EXEC_PAGESIZE, 73 .min_coredump = ELF_EXEC_PAGESIZE,
74 .hasvdso = 1
75}; 74};
76 75
77#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) 76#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
@@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
316 return 0; 315 return 0;
317} 316}
318 317
319#ifndef elf_map
320
321static unsigned long elf_map(struct file *filep, unsigned long addr, 318static unsigned long elf_map(struct file *filep, unsigned long addr,
322 struct elf_phdr *eppnt, int prot, int type, 319 struct elf_phdr *eppnt, int prot, int type,
323 unsigned long total_size) 320 unsigned long total_size)
@@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
354 return(map_addr); 351 return(map_addr);
355} 352}
356 353
357#endif /* !elf_map */
358
359static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) 354static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
360{ 355{
361 int i, first_idx = -1, last_idx = -1; 356 int i, first_idx = -1, last_idx = -1;
@@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
421 goto out; 416 goto out;
422 417
423 retval = kernel_read(interpreter, interp_elf_ex->e_phoff, 418 retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
424 (char *)elf_phdata,size); 419 (char *)elf_phdata, size);
425 error = -EIO; 420 error = -EIO;
426 if (retval != size) { 421 if (retval != size) {
427 if (retval < 0) 422 if (retval < 0)
@@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
601 goto out; 596 goto out;
602 if (!elf_check_arch(&loc->elf_ex)) 597 if (!elf_check_arch(&loc->elf_ex))
603 goto out; 598 goto out;
604 if (!bprm->file->f_op||!bprm->file->f_op->mmap) 599 if (!bprm->file->f_op || !bprm->file->f_op->mmap)
605 goto out; 600 goto out;
606 601
607 /* Now read in all of the header information */ 602 /* Now read in all of the header information */
@@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
761 /* There was a PT_LOAD segment with p_memsz > p_filesz 756 /* There was a PT_LOAD segment with p_memsz > p_filesz
762 before this one. Map anonymous pages, if needed, 757 before this one. Map anonymous pages, if needed,
763 and clear the area. */ 758 and clear the area. */
764 retval = set_brk (elf_bss + load_bias, 759 retval = set_brk(elf_bss + load_bias,
765 elf_brk + load_bias); 760 elf_brk + load_bias);
766 if (retval) { 761 if (retval) {
767 send_sig(SIGKILL, current, 0); 762 send_sig(SIGKILL, current, 0);
768 goto out_free_dentry; 763 goto out_free_dentry;
@@ -800,7 +795,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
800 * default mmap base, as well as whatever program they 795 * default mmap base, as well as whatever program they
801 * might try to exec. This is because the brk will 796 * might try to exec. This is because the brk will
802 * follow the loader, and is not movable. */ 797 * follow the loader, and is not movable. */
803#ifdef CONFIG_X86 798#if defined(CONFIG_X86) || defined(CONFIG_ARM)
804 load_bias = 0; 799 load_bias = 0;
805#else 800#else
806 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 801 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd0cc0bf9a40..1befe2ec8186 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
495 struct inode * inode = new_inode(sb); 495 struct inode * inode = new_inode(sb);
496 496
497 if (inode) { 497 if (inode) {
498 inode->i_ino = get_next_ino();
498 inode->i_mode = mode; 499 inode->i_mode = mode;
499 inode->i_atime = inode->i_mtime = inode->i_ctime = 500 inode->i_atime = inode->i_mtime = inode->i_ctime =
500 current_fs_time(inode->i_sb); 501 current_fs_time(inode->i_sb);
@@ -576,6 +577,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
576static const struct file_operations bm_entry_operations = { 577static const struct file_operations bm_entry_operations = {
577 .read = bm_entry_read, 578 .read = bm_entry_read,
578 .write = bm_entry_write, 579 .write = bm_entry_write,
580 .llseek = default_llseek,
579}; 581};
580 582
581/* /register */ 583/* /register */
@@ -643,6 +645,7 @@ out:
643 645
644static const struct file_operations bm_register_operations = { 646static const struct file_operations bm_register_operations = {
645 .write = bm_register_write, 647 .write = bm_register_write,
648 .llseek = noop_llseek,
646}; 649};
647 650
648/* /status */ 651/* /status */
@@ -680,6 +683,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
680static const struct file_operations bm_status_operations = { 683static const struct file_operations bm_status_operations = {
681 .read = bm_status_read, 684 .read = bm_status_read,
682 .write = bm_status_write, 685 .write = bm_status_write,
686 .llseek = default_llseek,
683}; 687};
684 688
685/* Superblock handling */ 689/* Superblock handling */
@@ -702,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
702 return err; 706 return err;
703} 707}
704 708
705static int bm_get_sb(struct file_system_type *fs_type, 709static struct dentry *bm_mount(struct file_system_type *fs_type,
706 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 710 int flags, const char *dev_name, void *data)
707{ 711{
708 return get_sb_single(fs_type, flags, data, bm_fill_super, mnt); 712 return mount_single(fs_type, flags, data, bm_fill_super);
709} 713}
710 714
711static struct linux_binfmt misc_format = { 715static struct linux_binfmt misc_format = {
@@ -716,7 +720,7 @@ static struct linux_binfmt misc_format = {
716static struct file_system_type bm_fs_type = { 720static struct file_system_type bm_fs_type = {
717 .owner = THIS_MODULE, 721 .owner = THIS_MODULE,
718 .name = "binfmt_misc", 722 .name = "binfmt_misc",
719 .get_sb = bm_get_sb, 723 .mount = bm_mount,
720 .kill_sb = kill_litter_super, 724 .kill_sb = kill_litter_super,
721}; 725};
722 726
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4d0ff5ee27b8..e49cce234c65 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -782,7 +782,12 @@ void __init bio_integrity_init(void)
782{ 782{
783 unsigned int i; 783 unsigned int i;
784 784
785 kintegrityd_wq = create_workqueue("kintegrityd"); 785 /*
786 * kintegrityd won't block much but may burn a lot of CPU cycles.
787 * Make it highpri CPU intensive wq with max concurrency of 1.
788 */
789 kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
790 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
786 if (!kintegrityd_wq) 791 if (!kintegrityd_wq)
787 panic("Failed to create kintegrityd\n"); 792 panic("Failed to create kintegrityd\n");
788 793
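The comment motivates the flag choice: WQ_HIGHPRI for latency, WQ_CPU_INTENSIVE so the CPU-bound work doesn't stall other work items, WQ_MEM_RECLAIM because integrity verification sits in the I/O path, and a max_active of 1 to cap concurrency. A hedged sketch of the same call shape for a hypothetical driver:

	static struct workqueue_struct *foo_wq;

	static int __init foo_init(void)
	{
		foo_wq = alloc_workqueue("foo", WQ_MEM_RECLAIM | WQ_HIGHPRI |
					 WQ_CPU_INTENSIVE, 1);
		if (!foo_wq)
			return -ENOMEM;
		return 0;
	}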
diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7c..4bd454fa844e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
370{ 370{
371 struct bio *bio; 371 struct bio *bio;
372 372
373 if (nr_iovecs > UIO_MAXIOV)
374 return NULL;
375
373 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), 376 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
374 gfp_mask); 377 gfp_mask);
375 if (unlikely(!bio)) 378 if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
697static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, 700static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
698 gfp_t gfp_mask) 701 gfp_t gfp_mask)
699{ 702{
700 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); 703 struct bio_map_data *bmd;
701 704
705 if (iov_count > UIO_MAXIOV)
706 return NULL;
707
708 bmd = kmalloc(sizeof(*bmd), gfp_mask);
702 if (!bmd) 709 if (!bmd)
703 return NULL; 710 return NULL;
704 711
@@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
827 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 834 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
828 start = uaddr >> PAGE_SHIFT; 835 start = uaddr >> PAGE_SHIFT;
829 836
837 /*
838 * Overflow, abort
839 */
840 if (end < start)
841 return ERR_PTR(-EINVAL);
842
830 nr_pages += end - start; 843 nr_pages += end - start;
831 len += iov[i].iov_len; 844 len += iov[i].iov_len;
832 } 845 }
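The added check guards the page-count arithmetic: for a hostile iovec, uaddr + len can wrap, making the rounded-up end page smaller than the start page and nr_pages bogus. A self-contained userspace illustration of the same arithmetic (a PAGE_SHIFT of 12 is assumed for the example):

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)

	static int count_pages(unsigned long uaddr, unsigned long len,
			       unsigned long *nr_pages)
	{
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;

		if (end < start)		/* overflow, abort */
			return -1;
		*nr_pages += end - start;
		return 0;
	}

	int main(void)
	{
		unsigned long n = 0;
		int ok = count_pages(0x1000, 8192, &n);

		printf("ok=%d pages=%lu\n", ok, n);	/* prints ok=0 pages=2 */
		printf("overflow=%d\n", count_pages(~0UL - 100, 4096, &n));
		return 0;
	}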
@@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
955 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 968 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
956 unsigned long start = uaddr >> PAGE_SHIFT; 969 unsigned long start = uaddr >> PAGE_SHIFT;
957 970
971 /*
972 * Overflow, abort
973 */
974 if (end < start)
975 return ERR_PTR(-EINVAL);
976
958 nr_pages += end - start; 977 nr_pages += end - start;
959 /* 978 /*
960 * buffer must be aligned to at least hardsector size for now 979 * buffer must be aligned to at least hardsector size for now
@@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
982 unsigned long start = uaddr >> PAGE_SHIFT; 1001 unsigned long start = uaddr >> PAGE_SHIFT;
983 const int local_nr_pages = end - start; 1002 const int local_nr_pages = end - start;
984 const int page_limit = cur_page + local_nr_pages; 1003 const int page_limit = cur_page + local_nr_pages;
985 1004
986 ret = get_user_pages_fast(uaddr, local_nr_pages, 1005 ret = get_user_pages_fast(uaddr, local_nr_pages,
987 write_to_vm, &pages[cur_page]); 1006 write_to_vm, &pages[cur_page]);
988 if (ret < local_nr_pages) { 1007 if (ret < local_nr_pages) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 50e8c8582faa..333a7bb4cb9c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -11,7 +11,6 @@
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/kmod.h> 12#include <linux/kmod.h>
13#include <linux/major.h> 13#include <linux/major.h>
14#include <linux/smp_lock.h>
15#include <linux/device_cgroup.h> 14#include <linux/device_cgroup.h>
16#include <linux/highmem.h> 15#include <linux/highmem.h>
17#include <linux/blkdev.h> 16#include <linux/blkdev.h>
@@ -48,6 +47,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
48 47
49EXPORT_SYMBOL(I_BDEV); 48EXPORT_SYMBOL(I_BDEV);
50 49
50/*
 51 * move the inode from its current bdi to a new bdi. If the inode is dirty
52 * we need to move it onto the dirty list of @dst so that the inode is always
53 * on the right list.
54 */
55static void bdev_inode_switch_bdi(struct inode *inode,
56 struct backing_dev_info *dst)
57{
58 spin_lock(&inode_lock);
59 inode->i_data.backing_dev_info = dst;
60 if (inode->i_state & I_DIRTY)
61 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
62 spin_unlock(&inode_lock);
63}
64
51static sector_t max_block(struct block_device *bdev) 65static sector_t max_block(struct block_device *bdev)
52{ 66{
53 sector_t retval = ~((sector_t)0); 67 sector_t retval = ~((sector_t)0);
@@ -370,7 +384,7 @@ int blkdev_fsync(struct file *filp, int datasync)
370 */ 384 */
371 mutex_unlock(&bd_inode->i_mutex); 385 mutex_unlock(&bd_inode->i_mutex);
372 386
373 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); 387 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
374 if (error == -EOPNOTSUPP) 388 if (error == -EOPNOTSUPP)
375 error = 0; 389 error = 0;
376 390
@@ -395,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
395 return &ei->vfs_inode; 409 return &ei->vfs_inode;
396} 410}
397 411
398static void bdev_destroy_inode(struct inode *inode) 412static void bdev_i_callback(struct rcu_head *head)
399{ 413{
414 struct inode *inode = container_of(head, struct inode, i_rcu);
400 struct bdev_inode *bdi = BDEV_I(inode); 415 struct bdev_inode *bdi = BDEV_I(inode);
401 416
417 INIT_LIST_HEAD(&inode->i_dentry);
402 kmem_cache_free(bdev_cachep, bdi); 418 kmem_cache_free(bdev_cachep, bdi);
403} 419}
404 420
421static void bdev_destroy_inode(struct inode *inode)
422{
423 call_rcu(&inode->i_rcu, bdev_i_callback);
424}
425
405static void init_once(void *foo) 426static void init_once(void *foo)
406{ 427{
407 struct bdev_inode *ei = (struct bdev_inode *) foo; 428 struct bdev_inode *ei = (struct bdev_inode *) foo;
@@ -412,7 +433,7 @@ static void init_once(void *foo)
412 INIT_LIST_HEAD(&bdev->bd_inodes); 433 INIT_LIST_HEAD(&bdev->bd_inodes);
413 INIT_LIST_HEAD(&bdev->bd_list); 434 INIT_LIST_HEAD(&bdev->bd_list);
414#ifdef CONFIG_SYSFS 435#ifdef CONFIG_SYSFS
415 INIT_LIST_HEAD(&bdev->bd_holder_list); 436 INIT_LIST_HEAD(&bdev->bd_holder_disks);
416#endif 437#endif
417 inode_init_once(&ei->vfs_inode); 438 inode_init_once(&ei->vfs_inode);
418 /* Initialize mutex for freeze. */ 439 /* Initialize mutex for freeze. */
@@ -449,15 +470,15 @@ static const struct super_operations bdev_sops = {
449 .evict_inode = bdev_evict_inode, 470 .evict_inode = bdev_evict_inode,
450}; 471};
451 472
452static int bd_get_sb(struct file_system_type *fs_type, 473static struct dentry *bd_mount(struct file_system_type *fs_type,
453 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 474 int flags, const char *dev_name, void *data)
454{ 475{
455 return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); 476 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
456} 477}
457 478
458static struct file_system_type bd_type = { 479static struct file_system_type bd_type = {
459 .name = "bdev", 480 .name = "bdev",
460 .get_sb = bd_get_sb, 481 .mount = bd_mount,
461 .kill_sb = kill_anon_super, 482 .kill_sb = kill_anon_super,
462}; 483};
463 484
@@ -550,7 +571,7 @@ EXPORT_SYMBOL(bdget);
550 */ 571 */
551struct block_device *bdgrab(struct block_device *bdev) 572struct block_device *bdgrab(struct block_device *bdev)
552{ 573{
553 atomic_inc(&bdev->bd_inode->i_count); 574 ihold(bdev->bd_inode);
554 return bdev; 575 return bdev;
555} 576}
556 577
@@ -580,7 +601,7 @@ static struct block_device *bd_acquire(struct inode *inode)
580 spin_lock(&bdev_lock); 601 spin_lock(&bdev_lock);
581 bdev = inode->i_bdev; 602 bdev = inode->i_bdev;
582 if (bdev) { 603 if (bdev) {
583 atomic_inc(&bdev->bd_inode->i_count); 604 ihold(bdev->bd_inode);
584 spin_unlock(&bdev_lock); 605 spin_unlock(&bdev_lock);
585 return bdev; 606 return bdev;
586 } 607 }
@@ -591,12 +612,12 @@ static struct block_device *bd_acquire(struct inode *inode)
591 spin_lock(&bdev_lock); 612 spin_lock(&bdev_lock);
592 if (!inode->i_bdev) { 613 if (!inode->i_bdev) {
593 /* 614 /*
594 * We take an additional bd_inode->i_count for inode, 615 * We take an additional reference to bd_inode,
595 * and it's released in clear_inode() of inode. 616 * and it's released in clear_inode() of inode.
596 * So, we can access it via ->i_mapping always 617 * So, we can access it via ->i_mapping always
597 * without igrab(). 618 * without igrab().
598 */ 619 */
599 atomic_inc(&bdev->bd_inode->i_count); 620 ihold(bdev->bd_inode);
600 inode->i_bdev = bdev; 621 inode->i_bdev = bdev;
601 inode->i_mapping = bdev->bd_inode->i_mapping; 622 inode->i_mapping = bdev->bd_inode->i_mapping;
602 list_add(&inode->i_devices, &bdev->bd_inodes); 623 list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -648,7 +669,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
648 else if (bdev->bd_contains == bdev) 669 else if (bdev->bd_contains == bdev)
649 return true; /* is a whole device which isn't held */ 670 return true; /* is a whole device which isn't held */
650 671
651 else if (whole->bd_holder == bd_claim) 672 else if (whole->bd_holder == bd_may_claim)
652 return true; /* is a partition of a device that is being partitioned */ 673 return true; /* is a partition of a device that is being partitioned */
653 else if (whole->bd_holder != NULL) 674 else if (whole->bd_holder != NULL)
654 return false; /* is a partition of a held device */ 675 return false; /* is a partition of a held device */
@@ -760,439 +781,142 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
760 } 781 }
761} 782}
762 783
763/* releases bdev_lock */ 784#ifdef CONFIG_SYSFS
764static void __bd_abort_claiming(struct block_device *whole, void *holder) 785struct bd_holder_disk {
765{ 786 struct list_head list;
766 BUG_ON(whole->bd_claiming != holder); 787 struct gendisk *disk;
767 whole->bd_claiming = NULL; 788 int refcnt;
768 wake_up_bit(&whole->bd_claiming, 0); 789};
769
770 spin_unlock(&bdev_lock);
771 bdput(whole);
772}
773
774/**
775 * bd_abort_claiming - abort claiming a block device
776 * @whole: whole block device returned by bd_start_claiming()
777 * @holder: holder trying to claim @bdev
778 *
779 * Abort a claiming block started by bd_start_claiming(). Note that
780 * @whole is not the block device to be claimed but the whole device
781 * returned by bd_start_claiming().
782 *
783 * CONTEXT:
784 * Grabs and releases bdev_lock.
785 */
786static void bd_abort_claiming(struct block_device *whole, void *holder)
787{
788 spin_lock(&bdev_lock);
789 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
790}
791
792/* increment holders when we have a legitimate claim. requires bdev_lock */
793static void __bd_claim(struct block_device *bdev, struct block_device *whole,
794 void *holder)
795{
796 /* note that for a whole device bd_holders
797 * will be incremented twice, and bd_holder will
798 * be set to bd_claim before being set to holder
799 */
800 whole->bd_holders++;
801 whole->bd_holder = bd_claim;
802 bdev->bd_holders++;
803 bdev->bd_holder = holder;
804}
805
806/**
807 * bd_finish_claiming - finish claiming a block device
808 * @bdev: block device of interest (passed to bd_start_claiming())
809 * @whole: whole block device returned by bd_start_claiming()
810 * @holder: holder trying to claim @bdev
811 *
812 * Finish a claiming block started by bd_start_claiming().
813 *
814 * CONTEXT:
815 * Grabs and releases bdev_lock.
816 */
817static void bd_finish_claiming(struct block_device *bdev,
818 struct block_device *whole, void *holder)
819{
820 spin_lock(&bdev_lock);
821 BUG_ON(!bd_may_claim(bdev, whole, holder));
822 __bd_claim(bdev, whole, holder);
823 __bd_abort_claiming(whole, holder); /* not actually an abort */
824}
825 790
826/** 791static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
827 * bd_claim - claim a block device 792 struct gendisk *disk)
828 * @bdev: block device to claim
829 * @holder: holder trying to claim @bdev
830 *
831 * Try to claim @bdev which must have been opened successfully.
832 *
833 * CONTEXT:
834 * Might sleep.
835 *
836 * RETURNS:
837 * 0 if successful, -EBUSY if @bdev is already claimed.
838 */
839int bd_claim(struct block_device *bdev, void *holder)
840{ 793{
841 struct block_device *whole = bdev->bd_contains; 794 struct bd_holder_disk *holder;
842 int res;
843
844 might_sleep();
845 795
846 spin_lock(&bdev_lock); 796 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
847 res = bd_prepare_to_claim(bdev, whole, holder); 797 if (holder->disk == disk)
848 if (res == 0) 798 return holder;
849 __bd_claim(bdev, whole, holder); 799 return NULL;
850 spin_unlock(&bdev_lock);
851
852 return res;
853}
854EXPORT_SYMBOL(bd_claim);
855
856void bd_release(struct block_device *bdev)
857{
858 spin_lock(&bdev_lock);
859 if (!--bdev->bd_contains->bd_holders)
860 bdev->bd_contains->bd_holder = NULL;
861 if (!--bdev->bd_holders)
862 bdev->bd_holder = NULL;
863 spin_unlock(&bdev_lock);
864} 800}
865 801
866EXPORT_SYMBOL(bd_release);
867
868#ifdef CONFIG_SYSFS
869/*
870 * Functions for bd_claim_by_kobject / bd_release_from_kobject
871 *
872 * If a kobject is passed to bd_claim_by_kobject()
873 * and the kobject has a parent directory,
874 * following symlinks are created:
875 * o from the kobject to the claimed bdev
876 * o from "holders" directory of the bdev to the parent of the kobject
877 * bd_release_from_kobject() removes these symlinks.
878 *
879 * Example:
880 * If /dev/dm-0 maps to /dev/sda, kobject corresponding to
881 * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
882 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
883 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
884 */
885
886static int add_symlink(struct kobject *from, struct kobject *to) 802static int add_symlink(struct kobject *from, struct kobject *to)
887{ 803{
888 if (!from || !to)
889 return 0;
890 return sysfs_create_link(from, to, kobject_name(to)); 804 return sysfs_create_link(from, to, kobject_name(to));
891} 805}
892 806
893static void del_symlink(struct kobject *from, struct kobject *to) 807static void del_symlink(struct kobject *from, struct kobject *to)
894{ 808{
895 if (!from || !to)
896 return;
897 sysfs_remove_link(from, kobject_name(to)); 809 sysfs_remove_link(from, kobject_name(to));
898} 810}
899 811
900/*
901 * 'struct bd_holder' contains pointers to kobjects symlinked by
902 * bd_claim_by_kobject.
903 * It's connected to bd_holder_list which is protected by bdev->bd_sem.
904 */
905struct bd_holder {
906 struct list_head list; /* chain of holders of the bdev */
907 int count; /* references from the holder */
908 struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */
909 struct kobject *hdev; /* e.g. "/block/dm-0" */
910 struct kobject *hdir; /* e.g. "/block/sda/holders" */
911 struct kobject *sdev; /* e.g. "/block/sda" */
912};
913
914/*
915 * Get references of related kobjects at once.
916 * Returns 1 on success. 0 on failure.
917 *
918 * Should call bd_holder_release_dirs() after successful use.
919 */
920static int bd_holder_grab_dirs(struct block_device *bdev,
921 struct bd_holder *bo)
922{
923 if (!bdev || !bo)
924 return 0;
925
926 bo->sdir = kobject_get(bo->sdir);
927 if (!bo->sdir)
928 return 0;
929
930 bo->hdev = kobject_get(bo->sdir->parent);
931 if (!bo->hdev)
932 goto fail_put_sdir;
933
934 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
935 if (!bo->sdev)
936 goto fail_put_hdev;
937
938 bo->hdir = kobject_get(bdev->bd_part->holder_dir);
939 if (!bo->hdir)
940 goto fail_put_sdev;
941
942 return 1;
943
944fail_put_sdev:
945 kobject_put(bo->sdev);
946fail_put_hdev:
947 kobject_put(bo->hdev);
948fail_put_sdir:
949 kobject_put(bo->sdir);
950
951 return 0;
952}
953
954/* Put references of related kobjects at once. */
955static void bd_holder_release_dirs(struct bd_holder *bo)
956{
957 kobject_put(bo->hdir);
958 kobject_put(bo->sdev);
959 kobject_put(bo->hdev);
960 kobject_put(bo->sdir);
961}
962
963static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
964{
965 struct bd_holder *bo;
966
967 bo = kzalloc(sizeof(*bo), GFP_KERNEL);
968 if (!bo)
969 return NULL;
970
971 bo->count = 1;
972 bo->sdir = kobj;
973
974 return bo;
975}
976
977static void free_bd_holder(struct bd_holder *bo)
978{
979 kfree(bo);
980}
981
982/** 812/**
983 * find_bd_holder - find matching struct bd_holder from the block device 813 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
814 * @bdev: the claimed slave bdev
815 * @disk: the holding disk
984 * 816 *
985 * @bdev: struct block device to be searched 817 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
986 * @bo: target struct bd_holder
987 * 818 *
 988 * Returns matching entry with @bo in @bdev->bd_holder_list. 819 * This function creates the following sysfs symlinks.
989 * If found, increment the reference count and return the pointer. 820 *
990 * If not found, returns NULL. 821 * - from "slaves" directory of the holder @disk to the claimed @bdev
991 */ 822 * - from "holders" directory of the @bdev to the holder @disk
992static struct bd_holder *find_bd_holder(struct block_device *bdev,
993 struct bd_holder *bo)
994{
995 struct bd_holder *tmp;
996
997 list_for_each_entry(tmp, &bdev->bd_holder_list, list)
998 if (tmp->sdir == bo->sdir) {
999 tmp->count++;
1000 return tmp;
1001 }
1002
1003 return NULL;
1004}
1005
1006/**
1007 * add_bd_holder - create sysfs symlinks for bd_claim() relationship
1008 * 823 *
1009 * @bdev: block device to be bd_claimed 824 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
1010 * @bo: preallocated and initialized by alloc_bd_holder() 825 * passed to bd_link_disk_holder(), then:
1011 * 826 *
1012 * Add @bo to @bdev->bd_holder_list, create symlinks. 827 * /sys/block/dm-0/slaves/sda --> /sys/block/sda
828 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
1013 * 829 *
1014 * Returns 0 if symlinks are created. 830 * The caller must have claimed @bdev before calling this function and
1015 * Returns -ve if something fails. 831 * ensure that both @bdev and @disk are valid during the creation and
832 * lifetime of these symlinks.
833 *
834 * CONTEXT:
835 * Might sleep.
836 *
837 * RETURNS:
838 * 0 on success, -errno on failure.
1016 */ 839 */
1017static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) 840int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
1018{ 841{
1019 int err; 842 struct bd_holder_disk *holder;
843 int ret = 0;
1020 844
1021 if (!bo) 845 mutex_lock(&bdev->bd_mutex);
1022 return -EINVAL;
1023 846
1024 if (!bd_holder_grab_dirs(bdev, bo)) 847 WARN_ON_ONCE(!bdev->bd_holder);
1025 return -EBUSY;
1026 848
1027 err = add_symlink(bo->sdir, bo->sdev); 849 /* FIXME: remove the following once add_disk() handles errors */
1028 if (err) 850 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
1029 return err; 851 goto out_unlock;
1030 852
1031 err = add_symlink(bo->hdir, bo->hdev); 853 holder = bd_find_holder_disk(bdev, disk);
1032 if (err) { 854 if (holder) {
1033 del_symlink(bo->sdir, bo->sdev); 855 holder->refcnt++;
1034 return err; 856 goto out_unlock;
1035 } 857 }
1036 858
1037 list_add_tail(&bo->list, &bdev->bd_holder_list); 859 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
1038 return 0; 860 if (!holder) {
1039} 861 ret = -ENOMEM;
1040 862 goto out_unlock;
1041/**
1042 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
1043 *
1044 * @bdev: block device to be bd_claimed
1045 * @kobj: holder's kobject
1046 *
1047 * If there is matching entry with @kobj in @bdev->bd_holder_list
1048 * and no other bd_claim() from the same kobject,
1049 * remove the struct bd_holder from the list, delete symlinks for it.
1050 *
1051 * Returns a pointer to the struct bd_holder when it's removed from the list
1052 * and ready to be freed.
1053 * Returns NULL if matching claim isn't found or there is other bd_claim()
1054 * by the same kobject.
1055 */
1056static struct bd_holder *del_bd_holder(struct block_device *bdev,
1057 struct kobject *kobj)
1058{
1059 struct bd_holder *bo;
1060
1061 list_for_each_entry(bo, &bdev->bd_holder_list, list) {
1062 if (bo->sdir == kobj) {
1063 bo->count--;
1064 BUG_ON(bo->count < 0);
1065 if (!bo->count) {
1066 list_del(&bo->list);
1067 del_symlink(bo->sdir, bo->sdev);
1068 del_symlink(bo->hdir, bo->hdev);
1069 bd_holder_release_dirs(bo);
1070 return bo;
1071 }
1072 break;
1073 }
1074 } 863 }
1075 864
1076 return NULL; 865 INIT_LIST_HEAD(&holder->list);
1077} 866 holder->disk = disk;
867 holder->refcnt = 1;
1078 868
1079/** 869 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1080 * bd_claim_by_kobject - bd_claim() with additional kobject signature 870 if (ret)
1081 * 871 goto out_free;
1082 * @bdev: block device to be claimed
1083 * @holder: holder's signature
1084 * @kobj: holder's kobject
1085 *
1086 * Do bd_claim() and if it succeeds, create sysfs symlinks between
1087 * the bdev and the holder's kobject.
1088 * Use bd_release_from_kobject() when releasing the claimed bdev.
1089 *
1090 * Returns 0 on success. (same as bd_claim())
1091 * Returns errno on failure.
1092 */
1093static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
1094 struct kobject *kobj)
1095{
1096 int err;
1097 struct bd_holder *bo, *found;
1098 872
1099 if (!kobj) 873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
1100 return -EINVAL; 874 if (ret)
1101 875 goto out_del;
1102 bo = alloc_bd_holder(kobj);
1103 if (!bo)
1104 return -ENOMEM;
1105
1106 mutex_lock(&bdev->bd_mutex);
1107 876
1108 err = bd_claim(bdev, holder); 877 list_add(&holder->list, &bdev->bd_holder_disks);
1109 if (err) 878 goto out_unlock;
1110 goto fail;
1111 879
1112 found = find_bd_holder(bdev, bo); 880out_del:
1113 if (found) 881 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1114 goto fail; 882out_free:
1115 883 kfree(holder);
1116 err = add_bd_holder(bdev, bo); 884out_unlock:
1117 if (err)
1118 bd_release(bdev);
1119 else
1120 bo = NULL;
1121fail:
1122 mutex_unlock(&bdev->bd_mutex); 885 mutex_unlock(&bdev->bd_mutex);
1123 free_bd_holder(bo); 886 return ret;
1124 return err;
1125} 887}
888EXPORT_SYMBOL_GPL(bd_link_disk_holder);
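Editor's sketch of how a stacking driver would pair these calls, assuming it already holds an exclusive claim on the slave; the function and variable names are illustrative, not taken from the dm code:

/* Illustrative sketch: link a claimed slave bdev under a holding disk.
 * Assumes the caller already owns an exclusive claim on slave_bdev,
 * e.g. via blkdev_get_by_path(..., FMODE_EXCL, holder). */
static int example_attach_slave(struct gendisk *holder_disk,
				struct block_device *slave_bdev)
{
	int ret = bd_link_disk_holder(slave_bdev, holder_disk);

	if (ret)
		return ret;	/* no symlinks were created */
	/* .../slaves/<slave> and .../holders/<holder> now exist;
	 * undo with bd_unlink_disk_holder() before releasing the claim. */
	return 0;
}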
1126 889
1127/** 890/**
1128 * bd_release_from_kobject - bd_release() with additional kobject signature 891 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
892 * @bdev: the claimed slave bdev
893 * @disk: the holding disk
1129 * 894 *
1130 * @bdev: block device to be released 895 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
1131 * @kobj: holder's kobject
1132 * 896 *
1133 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). 897 * CONTEXT:
898 * Might sleep.
1134 */ 899 */
1135static void bd_release_from_kobject(struct block_device *bdev, 900void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
1136 struct kobject *kobj)
1137{ 901{
1138 if (!kobj) 902 struct bd_holder_disk *holder;
1139 return;
1140 903
1141 mutex_lock(&bdev->bd_mutex); 904 mutex_lock(&bdev->bd_mutex);
1142 bd_release(bdev);
1143 free_bd_holder(del_bd_holder(bdev, kobj));
1144 mutex_unlock(&bdev->bd_mutex);
1145}
1146 905
1147/** 906 holder = bd_find_holder_disk(bdev, disk);
1148 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
1149 *
1150 * @bdev: block device to be claimed
1151 * @holder: holder's signature
1152 * @disk: holder's gendisk
1153 *
1154 * Call bd_claim_by_kobject() with getting @disk->slave_dir.
1155 */
1156int bd_claim_by_disk(struct block_device *bdev, void *holder,
1157 struct gendisk *disk)
1158{
1159 return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
1160}
1161EXPORT_SYMBOL_GPL(bd_claim_by_disk);
1162 907
1163/** 908 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
1164 * bd_release_from_disk - wrapper function for bd_release_from_kobject() 909 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
1165 * 910 del_symlink(bdev->bd_part->holder_dir,
1166 * @bdev: block device to be claimed 911 &disk_to_dev(disk)->kobj);
1167 * @disk: holder's gendisk 912 list_del_init(&holder->list);
1168 * 913 kfree(holder);
1169 * Call bd_release_from_kobject() and put @disk->slave_dir. 914 }
1170 */
1171void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
1172{
1173 bd_release_from_kobject(bdev, disk->slave_dir);
1174 kobject_put(disk->slave_dir);
1175}
1176EXPORT_SYMBOL_GPL(bd_release_from_disk);
1177#endif
1178 915
1179/* 916 mutex_unlock(&bdev->bd_mutex);
1180 * Tries to open block device by device number. Use it ONLY if you
1181 * really do not have anything better - i.e. when you are behind a
1182 * truly sucky interface and all you are given is a device number. _Never_
1183 * to be used for internal purposes. If you ever need it - reconsider
1184 * your API.
1185 */
1186struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
1187{
1188 struct block_device *bdev = bdget(dev);
1189 int err = -ENOMEM;
1190 if (bdev)
1191 err = blkdev_get(bdev, mode);
1192 return err ? ERR_PTR(err) : bdev;
1193} 917}
1194 918EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
1195EXPORT_SYMBOL(open_by_devnum); 919#endif
1196 920
1197/** 921/**
1198 * flush_disk - invalidates all buffer-cache entries on a disk 922 * flush_disk - invalidates all buffer-cache entries on a disk
@@ -1288,10 +1012,11 @@ int check_disk_change(struct block_device *bdev)
1288{ 1012{
1289 struct gendisk *disk = bdev->bd_disk; 1013 struct gendisk *disk = bdev->bd_disk;
1290 const struct block_device_operations *bdops = disk->fops; 1014 const struct block_device_operations *bdops = disk->fops;
1015 unsigned int events;
1291 1016
1292 if (!bdops->media_changed) 1017 events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
1293 return 0; 1018 DISK_EVENT_EJECT_REQUEST);
1294 if (!bdops->media_changed(bdev->bd_disk)) 1019 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1295 return 0; 1020 return 0;
1296 1021
1297 flush_disk(bdev); 1022 flush_disk(bdev);
@@ -1390,7 +1115,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1390 bdi = blk_get_backing_dev_info(bdev); 1115 bdi = blk_get_backing_dev_info(bdev);
1391 if (bdi == NULL) 1116 if (bdi == NULL)
1392 bdi = &default_backing_dev_info; 1117 bdi = &default_backing_dev_info;
1393 bdev->bd_inode->i_data.backing_dev_info = bdi; 1118 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1394 } 1119 }
1395 if (bdev->bd_invalidated) 1120 if (bdev->bd_invalidated)
1396 rescan_partitions(disk, bdev); 1121 rescan_partitions(disk, bdev);
@@ -1405,8 +1130,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1405 if (ret) 1130 if (ret)
1406 goto out_clear; 1131 goto out_clear;
1407 bdev->bd_contains = whole; 1132 bdev->bd_contains = whole;
1408 bdev->bd_inode->i_data.backing_dev_info = 1133 bdev_inode_switch_bdi(bdev->bd_inode,
1409 whole->bd_inode->i_data.backing_dev_info; 1134 whole->bd_inode->i_data.backing_dev_info);
1410 bdev->bd_part = disk_get_part(disk, partno); 1135 bdev->bd_part = disk_get_part(disk, partno);
1411 if (!(disk->flags & GENHD_FL_UP) || 1136 if (!(disk->flags & GENHD_FL_UP) ||
1412 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1137 !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1439,7 +1164,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1439 disk_put_part(bdev->bd_part); 1164 disk_put_part(bdev->bd_part);
1440 bdev->bd_disk = NULL; 1165 bdev->bd_disk = NULL;
1441 bdev->bd_part = NULL; 1166 bdev->bd_part = NULL;
1442 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1167 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1443 if (bdev != bdev->bd_contains) 1168 if (bdev != bdev->bd_contains)
1444 __blkdev_put(bdev->bd_contains, mode, 1); 1169 __blkdev_put(bdev->bd_contains, mode, 1);
1445 bdev->bd_contains = NULL; 1170 bdev->bd_contains = NULL;
@@ -1454,17 +1179,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1454 return ret; 1179 return ret;
1455} 1180}
1456 1181
1457int blkdev_get(struct block_device *bdev, fmode_t mode) 1182/**
1183 * blkdev_get - open a block device
1184 * @bdev: block_device to open
1185 * @mode: FMODE_* mask
1186 * @holder: exclusive holder identifier
1187 *
1188 * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
1189 * opened with exclusive access. Specifying %FMODE_EXCL with %NULL
1190 * @holder is invalid. Exclusive opens may nest for the same @holder.
1191 *
1192 * On success, the reference count of @bdev is unchanged. On failure,
1193 * @bdev is put.
1194 *
1195 * CONTEXT:
1196 * Might sleep.
1197 *
1198 * RETURNS:
1199 * 0 on success, -errno on failure.
1200 */
1201int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1458{ 1202{
1459 return __blkdev_get(bdev, mode, 0); 1203 struct block_device *whole = NULL;
1204 int res;
1205
1206 WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
1207
1208 if ((mode & FMODE_EXCL) && holder) {
1209 whole = bd_start_claiming(bdev, holder);
1210 if (IS_ERR(whole)) {
1211 bdput(bdev);
1212 return PTR_ERR(whole);
1213 }
1214 }
1215
1216 res = __blkdev_get(bdev, mode, 0);
1217
1218 /* __blkdev_get() may alter read only status, check it afterwards */
1219 if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1220 __blkdev_put(bdev, mode, 0);
1221 res = -EACCES;
1222 }
1223
1224 if (whole) {
1225 /* finish claiming */
1226 mutex_lock(&bdev->bd_mutex);
1227 spin_lock(&bdev_lock);
1228
1229 if (!res) {
1230 BUG_ON(!bd_may_claim(bdev, whole, holder));
1231 /*
1232 * Note that for a whole device bd_holders
1233 * will be incremented twice, and bd_holder
1234 * will be set to bd_may_claim before being
1235 * set to holder
1236 */
1237 whole->bd_holders++;
1238 whole->bd_holder = bd_may_claim;
1239 bdev->bd_holders++;
1240 bdev->bd_holder = holder;
1241 }
1242
1243 /* tell others that we're done */
1244 BUG_ON(whole->bd_claiming != holder);
1245 whole->bd_claiming = NULL;
1246 wake_up_bit(&whole->bd_claiming, 0);
1247
1248 spin_unlock(&bdev_lock);
1249
1250 /*
1251 * Block event polling for write claims. Any write
1252 * holder makes the write_holder state stick until all
1253 * are released. This is good enough and tracking
1254 * individual writeable reference is too fragile given
1255 * the way @mode is used in blkdev_get/put().
1256 */
1257 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
1258 bdev->bd_write_holder = true;
1259 disk_block_events(bdev->bd_disk);
1260 }
1261
1262 mutex_unlock(&bdev->bd_mutex);
1263 bdput(whole);
1264 }
1265
1266 return res;
1460} 1267}
1461EXPORT_SYMBOL(blkdev_get); 1268EXPORT_SYMBOL(blkdev_get);
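A hedged sketch of the claim protocol the kerneldoc above describes: the caller supplies an existing reference and a stable holder cookie; names here are illustrative.

/* Illustrative: one exclusive open/release pair. */
static int example_excl_open(struct block_device *bdev, void *holder)
{
	int ret;

	/* caller must already hold a reference (e.g. from bdget());
	 * on failure blkdev_get() drops it for us */
	ret = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, holder);
	if (ret)
		return ret;

	/* ... exclusive user of the device ... */

	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	return 0;
}

Exclusive opens nest for the same holder cookie, so a second blkdev_get() with an identical holder succeeds and needs its own matching blkdev_put().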
1462 1269
1270/**
1271 * blkdev_get_by_path - open a block device by name
1272 * @path: path to the block device to open
1273 * @mode: FMODE_* mask
1274 * @holder: exclusive holder identifier
1275 *
1276 * Open the blockdevice described by the device file at @path. @mode
1277 * and @holder are identical to blkdev_get().
1278 *
1279 * On success, the returned block_device has reference count of one.
1280 *
1281 * CONTEXT:
1282 * Might sleep.
1283 *
1284 * RETURNS:
1285 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1286 */
1287struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1288 void *holder)
1289{
1290 struct block_device *bdev;
1291 int err;
1292
1293 bdev = lookup_bdev(path);
1294 if (IS_ERR(bdev))
1295 return bdev;
1296
1297 err = blkdev_get(bdev, mode, holder);
1298 if (err)
1299 return ERR_PTR(err);
1300
1301 return bdev;
1302}
1303EXPORT_SYMBOL(blkdev_get_by_path);
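This is the helper the btrfs hunks below switch to in place of open_bdev_exclusive(); a minimal sketch of the pattern, with illustrative names:

/* Sketch of the open_bdev_exclusive() replacement pattern. */
static struct block_device *example_open_by_path(const char *path,
						 void *holder)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_EXCL, holder);
	if (IS_ERR(bdev))
		return bdev;		/* ERR_PTR(-errno) */
	/* refcount is one; release with blkdev_put() and the same mode */
	return bdev;
}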
1304
1305/**
1306 * blkdev_get_by_dev - open a block device by device number
1307 * @dev: device number of block device to open
1308 * @mode: FMODE_* mask
1309 * @holder: exclusive holder identifier
1310 *
1311 * Open the blockdevice described by device number @dev. @mode and
1312 * @holder are identical to blkdev_get().
1313 *
1314 * Use it ONLY if you really do not have anything better - i.e. when
1315 * you are behind a truly sucky interface and all you are given is a
1316 * device number. _Never_ to be used for internal purposes. If you
1317 * ever need it - reconsider your API.
1318 *
1319 * On success, the returned block_device has reference count of one.
1320 *
1321 * CONTEXT:
1322 * Might sleep.
1323 *
1324 * RETURNS:
1325 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
1326 */
1327struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
1328{
1329 struct block_device *bdev;
1330 int err;
1331
1332 bdev = bdget(dev);
1333 if (!bdev)
1334 return ERR_PTR(-ENOMEM);
1335
1336 err = blkdev_get(bdev, mode, holder);
1337 if (err)
1338 return ERR_PTR(err);
1339
1340 return bdev;
1341}
1342EXPORT_SYMBOL(blkdev_get_by_dev);
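And the by-number variant, e.g. for a dev_t handed in over a legacy interface; the MKDEV() value is purely illustrative:

/* Illustrative: non-exclusive open by device number. */
static struct block_device *example_open_by_devnum(void)
{
	/* NULL holder is fine because FMODE_EXCL is not set */
	return blkdev_get_by_dev(MKDEV(8, 0), FMODE_READ, NULL);
}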
1343
1463static int blkdev_open(struct inode * inode, struct file * filp) 1344static int blkdev_open(struct inode * inode, struct file * filp)
1464{ 1345{
1465 struct block_device *whole = NULL;
1466 struct block_device *bdev; 1346 struct block_device *bdev;
1467 int res;
1468 1347
1469 /* 1348 /*
1470 * Preserve backwards compatibility and allow large file access 1349 * Preserve backwards compatibility and allow large file access
@@ -1485,26 +1364,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1485 if (bdev == NULL) 1364 if (bdev == NULL)
1486 return -ENOMEM; 1365 return -ENOMEM;
1487 1366
1488 if (filp->f_mode & FMODE_EXCL) {
1489 whole = bd_start_claiming(bdev, filp);
1490 if (IS_ERR(whole)) {
1491 bdput(bdev);
1492 return PTR_ERR(whole);
1493 }
1494 }
1495
1496 filp->f_mapping = bdev->bd_inode->i_mapping; 1367 filp->f_mapping = bdev->bd_inode->i_mapping;
1497 1368
1498 res = blkdev_get(bdev, filp->f_mode); 1369 return blkdev_get(bdev, filp->f_mode, filp);
1499
1500 if (whole) {
1501 if (res == 0)
1502 bd_finish_claiming(bdev, whole, filp);
1503 else
1504 bd_abort_claiming(whole, filp);
1505 }
1506
1507 return res;
1508} 1370}
1509 1371
1510static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1372static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1518,6 +1380,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1518 bdev->bd_part_count--; 1380 bdev->bd_part_count--;
1519 1381
1520 if (!--bdev->bd_openers) { 1382 if (!--bdev->bd_openers) {
1383 WARN_ON_ONCE(bdev->bd_holders);
1521 sync_blockdev(bdev); 1384 sync_blockdev(bdev);
1522 kill_bdev(bdev); 1385 kill_bdev(bdev);
1523 } 1386 }
@@ -1533,7 +1396,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1533 disk_put_part(bdev->bd_part); 1396 disk_put_part(bdev->bd_part);
1534 bdev->bd_part = NULL; 1397 bdev->bd_part = NULL;
1535 bdev->bd_disk = NULL; 1398 bdev->bd_disk = NULL;
1536 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1399 bdev_inode_switch_bdi(bdev->bd_inode,
1400 &default_backing_dev_info);
1537 if (bdev != bdev->bd_contains) 1401 if (bdev != bdev->bd_contains)
1538 victim = bdev->bd_contains; 1402 victim = bdev->bd_contains;
1539 bdev->bd_contains = NULL; 1403 bdev->bd_contains = NULL;
@@ -1547,6 +1411,44 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1547 1411
1548int blkdev_put(struct block_device *bdev, fmode_t mode) 1412int blkdev_put(struct block_device *bdev, fmode_t mode)
1549{ 1413{
1414 if (mode & FMODE_EXCL) {
1415 bool bdev_free;
1416
1417 /*
1418 * Release a claim on the device. The holder fields
1419 * are protected with bdev_lock. bd_mutex is to
1420 * synchronize disk_holder unlinking.
1421 */
1422 mutex_lock(&bdev->bd_mutex);
1423 spin_lock(&bdev_lock);
1424
1425 WARN_ON_ONCE(--bdev->bd_holders < 0);
1426 WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
1427
1428 /* bd_contains might point to self, check in a separate step */
1429 if ((bdev_free = !bdev->bd_holders))
1430 bdev->bd_holder = NULL;
1431 if (!bdev->bd_contains->bd_holders)
1432 bdev->bd_contains->bd_holder = NULL;
1433
1434 spin_unlock(&bdev_lock);
1435
1436 /*
1437 * If this was the last claim, remove holder link and
1438 * unblock event polling if it was a write holder.
1439 */
1440 if (bdev_free) {
1441 if (bdev->bd_write_holder) {
1442 disk_unblock_events(bdev->bd_disk);
1443 bdev->bd_write_holder = false;
1444 } else
1445 disk_check_events(bdev->bd_disk);
1446 }
1447
1448 mutex_unlock(&bdev->bd_mutex);
1449 } else
1450 disk_check_events(bdev->bd_disk);
1451
1550 return __blkdev_put(bdev, mode, 0); 1452 return __blkdev_put(bdev, mode, 0);
1551} 1453}
1552EXPORT_SYMBOL(blkdev_put); 1454EXPORT_SYMBOL(blkdev_put);
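Since the claim teardown above is keyed entirely on @mode, a hedged reminder of the pairing rule, with illustrative names: FMODE_EXCL on put must mirror the get, and a write+exclusive claim keeps disk event polling blocked until the last claim drops.

/* Sketch: modes must match across get/put, including FMODE_EXCL. */
static int example_write_claim(struct block_device *bdev, void *holder)
{
	int ret = blkdev_get(bdev, FMODE_WRITE | FMODE_EXCL, holder);

	if (ret)
		return ret;
	/* bdev->bd_write_holder is set; events on bd_disk are blocked */
	blkdev_put(bdev, FMODE_WRITE | FMODE_EXCL);
	return 0;
}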
@@ -1554,8 +1456,7 @@ EXPORT_SYMBOL(blkdev_put);
1554static int blkdev_close(struct inode * inode, struct file * filp) 1456static int blkdev_close(struct inode * inode, struct file * filp)
1555{ 1457{
1556 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1458 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1557 if (bdev->bd_holder == filp) 1459
1558 bd_release(bdev);
1559 return blkdev_put(bdev, filp->f_mode); 1460 return blkdev_put(bdev, filp->f_mode);
1560} 1461}
1561 1462
@@ -1700,67 +1601,6 @@ fail:
1700} 1601}
1701EXPORT_SYMBOL(lookup_bdev); 1602EXPORT_SYMBOL(lookup_bdev);
1702 1603
1703/**
1704 * open_bdev_exclusive - open a block device by name and set it up for use
1705 *
1706 * @path: special file representing the block device
1707 * @mode: FMODE_... combination to be used
1708 * @holder: owner for exclusion
1709 *
1710 * Open the blockdevice described by the special file at @path, claim it
1711 * for the @holder.
1712 */
1713struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1714{
1715 struct block_device *bdev, *whole;
1716 int error;
1717
1718 bdev = lookup_bdev(path);
1719 if (IS_ERR(bdev))
1720 return bdev;
1721
1722 whole = bd_start_claiming(bdev, holder);
1723 if (IS_ERR(whole)) {
1724 bdput(bdev);
1725 return whole;
1726 }
1727
1728 error = blkdev_get(bdev, mode);
1729 if (error)
1730 goto out_abort_claiming;
1731
1732 error = -EACCES;
1733 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1734 goto out_blkdev_put;
1735
1736 bd_finish_claiming(bdev, whole, holder);
1737 return bdev;
1738
1739out_blkdev_put:
1740 blkdev_put(bdev, mode);
1741out_abort_claiming:
1742 bd_abort_claiming(whole, holder);
1743 return ERR_PTR(error);
1744}
1745
1746EXPORT_SYMBOL(open_bdev_exclusive);
1747
1748/**
1749 * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive()
1750 *
1751 * @bdev: blockdevice to close
1752 * @mode: mode, must match that used to open.
1753 *
1754 * This is the counterpart to open_bdev_exclusive().
1755 */
1756void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
1757{
1758 bd_release(bdev);
1759 blkdev_put(bdev, mode);
1760}
1761
1762EXPORT_SYMBOL(close_bdev_exclusive);
1763
1764int __invalidate_device(struct block_device *bdev) 1604int __invalidate_device(struct block_device *bdev)
1765{ 1605{
1766 struct super_block *sb = get_super(bdev); 1606 struct super_block *sb = get_super(bdev);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 889ce1348e64..9c949348510b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -193,18 +193,23 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
193 return ret; 193 return ret;
194} 194}
195 195
196int btrfs_check_acl(struct inode *inode, int mask) 196int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
197{ 197{
198 struct posix_acl *acl;
199 int error = -EAGAIN; 198 int error = -EAGAIN;
200 199
201 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); 200 if (flags & IPERM_FLAG_RCU) {
201 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
202 error = -ECHILD;
202 203
203 if (IS_ERR(acl)) 204 } else {
204 return PTR_ERR(acl); 205 struct posix_acl *acl;
205 if (acl) { 206 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
206 error = posix_acl_permission(inode, acl, mask); 207 if (IS_ERR(acl))
207 posix_acl_release(acl); 208 return PTR_ERR(acl);
209 if (acl) {
210 error = posix_acl_permission(inode, acl, mask);
211 posix_acl_release(acl);
212 }
208 } 213 }
209 214
210 return error; 215 return error;
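The IPERM_FLAG_RCU branch above follows the rcu-walk contract introduced alongside this series: a permission helper called in rcu-walk mode may not sleep or take locks, so it answers only from cached state and otherwise returns -ECHILD to force a ref-walk retry. A schematic of that contract, assuming the negative_cached_acl() helper from the same series:

/* Schematic of the rcu-walk aware ACL check contract (illustrative). */
static int example_check_acl(struct inode *inode, int mask,
			     unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU) {
		/* lockless context: only a cached "no ACL" result lets
		 * us proceed to the mode-bit check (-EAGAIN) */
		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
			return -ECHILD;
		return -EAGAIN;
	}
	/* ref-walk: free to read the ACL from disk, as btrfs does above */
	return -EAGAIN;
}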
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 72195378bef9..2c98b3af6052 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2583,7 +2583,7 @@ do { \
2583 2583
2584/* acl.c */ 2584/* acl.c */
2585#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2585#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2586int btrfs_check_acl(struct inode *inode, int mask); 2586int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
2587#else 2587#else
2588#define btrfs_check_acl NULL 2588#define btrfs_check_acl NULL
2589#endif 2589#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b36eeef19194..fdce8799b98d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2127,7 +2127,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2127 if (uptodate) { 2127 if (uptodate) {
2128 set_buffer_uptodate(bh); 2128 set_buffer_uptodate(bh);
2129 } else { 2129 } else {
2130 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 2130 if (printk_ratelimit()) {
2131 printk(KERN_WARNING "lost page write due to " 2131 printk(KERN_WARNING "lost page write due to "
2132 "I/O error on %s\n", 2132 "I/O error on %s\n",
2133 bdevname(bh->b_bdev, b)); 2133 bdevname(bh->b_bdev, b));
@@ -2264,21 +2264,10 @@ static int write_dev_supers(struct btrfs_device *device,
2264 bh->b_end_io = btrfs_end_buffer_write_sync; 2264 bh->b_end_io = btrfs_end_buffer_write_sync;
2265 } 2265 }
2266 2266
2267 if (i == last_barrier && do_barriers && device->barriers) { 2267 if (i == last_barrier && do_barriers)
2268 ret = submit_bh(WRITE_BARRIER, bh); 2268 ret = submit_bh(WRITE_FLUSH_FUA, bh);
2269 if (ret == -EOPNOTSUPP) { 2269 else
2270 printk("btrfs: disabling barriers on dev %s\n",
2271 device->name);
2272 set_buffer_uptodate(bh);
2273 device->barriers = 0;
2274 /* one reference for submit_bh */
2275 get_bh(bh);
2276 lock_buffer(bh);
2277 ret = submit_bh(WRITE_SYNC, bh);
2278 }
2279 } else {
2280 ret = submit_bh(WRITE_SYNC, bh); 2270 ret = submit_bh(WRITE_SYNC, bh);
2281 }
2282 2271
2283 if (ret) 2272 if (ret)
2284 errors++; 2273 errors++;
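The -EOPNOTSUPP retry dance could be dropped because the block layer degrades WRITE_FLUSH_FUA transparently on devices without flush/FUA support, so the submission path no longer sees that error. A hedged sketch of the simplified shape, using the generic completion handler:

/* Sketch: barrier write after the WRITE_FLUSH_FUA conversion. */
static int example_write_super_bh(struct buffer_head *bh, int barrier)
{
	get_bh(bh);			/* end_buffer_write_sync() drops it */
	lock_buffer(bh);
	bh->b_end_io = end_buffer_write_sync;
	return submit_bh(barrier ? WRITE_FLUSH_FUA : WRITE_SYNC, bh);
}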
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 3220ad1aafc8..ff27d7a477b2 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
65{ 65{
66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
67 struct btrfs_root *root; 67 struct btrfs_root *root;
68 struct dentry *dentry;
69 struct inode *inode; 68 struct inode *inode;
70 struct btrfs_key key; 69 struct btrfs_key key;
71 int index; 70 int index;
@@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
108 return ERR_PTR(-ESTALE); 107 return ERR_PTR(-ESTALE);
109 } 108 }
110 109
111 dentry = d_obtain_alias(inode); 110 return d_obtain_alias(inode);
112 if (!IS_ERR(dentry))
113 dentry->d_op = &btrfs_dentry_operations;
114 return dentry;
115fail: 111fail:
116 srcu_read_unlock(&fs_info->subvol_srcu, index); 112 srcu_read_unlock(&fs_info->subvol_srcu, index);
117 return ERR_PTR(err); 113 return ERR_PTR(err);
@@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166static struct dentry *btrfs_get_parent(struct dentry *child) 162static struct dentry *btrfs_get_parent(struct dentry *child)
167{ 163{
168 struct inode *dir = child->d_inode; 164 struct inode *dir = child->d_inode;
169 static struct dentry *dentry;
170 struct btrfs_root *root = BTRFS_I(dir)->root; 165 struct btrfs_root *root = BTRFS_I(dir)->root;
171 struct btrfs_path *path; 166 struct btrfs_path *path;
172 struct extent_buffer *leaf; 167 struct extent_buffer *leaf;
@@ -225,10 +220,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
225 220
226 key.type = BTRFS_INODE_ITEM_KEY; 221 key.type = BTRFS_INODE_ITEM_KEY;
227 key.offset = 0; 222 key.offset = 0;
228 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); 223 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
229 if (!IS_ERR(dentry))
230 dentry->d_op = &btrfs_dentry_operations;
231 return dentry;
232fail: 224fail:
233 btrfs_free_path(path); 225 btrfs_free_path(path);
234 return ERR_PTR(ret); 226 return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 565e22d77b1b..4e7e012ad667 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1743,8 +1743,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1743static void btrfs_issue_discard(struct block_device *bdev, 1743static void btrfs_issue_discard(struct block_device *bdev,
1744 u64 start, u64 len) 1744 u64 start, u64 len)
1745{ 1745{
1746 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1746 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
1747 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1748} 1747}
1749 1748
1750static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1749static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8862dda46ff6..5e76a474cb7e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3083,7 +3083,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3083 eb->len = len; 3083 eb->len = len;
3084 spin_lock_init(&eb->lock); 3084 spin_lock_init(&eb->lock);
3085 init_waitqueue_head(&eb->lock_wq); 3085 init_waitqueue_head(&eb->lock_wq);
3086 INIT_RCU_HEAD(&eb->rcu_head);
3087 3086
3088#if LEAK_DEBUG 3087#if LEAK_DEBUG
3089 spin_lock_irqsave(&leak_lock, flags); 3088 spin_lock_irqsave(&leak_lock, flags);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b0ff34b96607..c1d3a818731a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/falloc.h>
27#include <linux/swap.h> 28#include <linux/swap.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
29#include <linux/statfs.h> 30#include <linux/statfs.h>
@@ -1258,6 +1259,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1258 return 0; 1259 return 0;
1259} 1260}
1260 1261
1262static long btrfs_fallocate(struct file *file, int mode,
1263 loff_t offset, loff_t len)
1264{
1265 struct inode *inode = file->f_path.dentry->d_inode;
1266 struct extent_state *cached_state = NULL;
1267 u64 cur_offset;
1268 u64 last_byte;
1269 u64 alloc_start;
1270 u64 alloc_end;
1271 u64 alloc_hint = 0;
1272 u64 locked_end;
1273 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1274 struct extent_map *em;
1275 int ret;
1276
1277 alloc_start = offset & ~mask;
1278 alloc_end = (offset + len + mask) & ~mask;
1279
1280 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1281 if (mode & ~FALLOC_FL_KEEP_SIZE)
1282 return -EOPNOTSUPP;
1283
1284 /*
1285 * wait for ordered IO before we have any locks. We'll loop again
1286 * below with the locks held.
1287 */
1288 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1289
1290 mutex_lock(&inode->i_mutex);
1291 ret = inode_newsize_ok(inode, alloc_end);
1292 if (ret)
1293 goto out;
1294
1295 if (alloc_start > inode->i_size) {
1296 ret = btrfs_cont_expand(inode, alloc_start);
1297 if (ret)
1298 goto out;
1299 }
1300
1301 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1302 if (ret)
1303 goto out;
1304
1305 locked_end = alloc_end - 1;
1306 while (1) {
1307 struct btrfs_ordered_extent *ordered;
1308
1309 /* the extent lock is ordered inside the running
1310 * transaction
1311 */
1312 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1313 locked_end, 0, &cached_state, GFP_NOFS);
1314 ordered = btrfs_lookup_first_ordered_extent(inode,
1315 alloc_end - 1);
1316 if (ordered &&
1317 ordered->file_offset + ordered->len > alloc_start &&
1318 ordered->file_offset < alloc_end) {
1319 btrfs_put_ordered_extent(ordered);
1320 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1321 alloc_start, locked_end,
1322 &cached_state, GFP_NOFS);
1323 /*
1324 * we can't wait on the range with the transaction
1325 * running or with the extent lock held
1326 */
1327 btrfs_wait_ordered_range(inode, alloc_start,
1328 alloc_end - alloc_start);
1329 } else {
1330 if (ordered)
1331 btrfs_put_ordered_extent(ordered);
1332 break;
1333 }
1334 }
1335
1336 cur_offset = alloc_start;
1337 while (1) {
1338 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1339 alloc_end - cur_offset, 0);
1340 BUG_ON(IS_ERR(em) || !em);
1341 last_byte = min(extent_map_end(em), alloc_end);
1342 last_byte = (last_byte + mask) & ~mask;
1343 if (em->block_start == EXTENT_MAP_HOLE ||
1344 (cur_offset >= inode->i_size &&
1345 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1346 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1347 last_byte - cur_offset,
1348 1 << inode->i_blkbits,
1349 offset + len,
1350 &alloc_hint);
1351 if (ret < 0) {
1352 free_extent_map(em);
1353 break;
1354 }
1355 }
1356 free_extent_map(em);
1357
1358 cur_offset = last_byte;
1359 if (cur_offset >= alloc_end) {
1360 ret = 0;
1361 break;
1362 }
1363 }
1364 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1365 &cached_state, GFP_NOFS);
1366
1367 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1368out:
1369 mutex_unlock(&inode->i_mutex);
1370 return ret;
1371}
1372
1261const struct file_operations btrfs_file_operations = { 1373const struct file_operations btrfs_file_operations = {
1262 .llseek = generic_file_llseek, 1374 .llseek = generic_file_llseek,
1263 .read = do_sync_read, 1375 .read = do_sync_read,
@@ -1269,6 +1381,7 @@ const struct file_operations btrfs_file_operations = {
1269 .open = generic_file_open, 1381 .open = generic_file_open,
1270 .release = btrfs_release_file, 1382 .release = btrfs_release_file,
1271 .fsync = btrfs_sync_file, 1383 .fsync = btrfs_sync_file,
1384 .fallocate = btrfs_fallocate,
1272 .unlocked_ioctl = btrfs_ioctl, 1385 .unlocked_ioctl = btrfs_ioctl,
1273#ifdef CONFIG_COMPAT 1386#ifdef CONFIG_COMPAT
1274 .compat_ioctl = btrfs_ioctl, 1387 .compat_ioctl = btrfs_ioctl,
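With fallocate moved from the inode operations to the file operations (compare the inode.c hunk below), the userspace entry point is unchanged. A hedged reminder that the btrfs_fallocate() above accepts nothing beyond FALLOC_FL_KEEP_SIZE:

/* Userspace view (illustrative): preallocate 1 MiB without growing
 * i_size; any other mode bit gets -EOPNOTSUPP from btrfs here. */
#define _GNU_SOURCE
#include <fcntl.h>

static int example_prealloc(int fd)
{
	return fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
}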
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c9bc0afdbfc6..bcc461a9695f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3902,7 +3902,7 @@ again:
3902 p = &root->inode_tree.rb_node; 3902 p = &root->inode_tree.rb_node;
3903 parent = NULL; 3903 parent = NULL;
3904 3904
3905 if (hlist_unhashed(&inode->i_hash)) 3905 if (inode_unhashed(inode))
3906 return; 3906 return;
3907 3907
3908 spin_lock(&root->inode_lock); 3908 spin_lock(&root->inode_lock);
@@ -4109,8 +4109,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4109 int index; 4109 int index;
4110 int ret; 4110 int ret;
4111 4111
4112 dentry->d_op = &btrfs_dentry_operations;
4113
4114 if (dentry->d_name.len > BTRFS_NAME_LEN) 4112 if (dentry->d_name.len > BTRFS_NAME_LEN)
4115 return ERR_PTR(-ENAMETOOLONG); 4113 return ERR_PTR(-ENAMETOOLONG);
4116 4114
@@ -4152,7 +4150,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4152 return inode; 4150 return inode;
4153} 4151}
4154 4152
4155static int btrfs_dentry_delete(struct dentry *dentry) 4153static int btrfs_dentry_delete(const struct dentry *dentry)
4156{ 4154{
4157 struct btrfs_root *root; 4155 struct btrfs_root *root;
4158 4156
@@ -4830,7 +4828,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4830 } 4828 }
4831 4829
4832 btrfs_set_trans_block_group(trans, dir); 4830 btrfs_set_trans_block_group(trans, dir);
4833 atomic_inc(&inode->i_count); 4831 ihold(inode);
4834 4832
4835 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 4833 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4836 4834
@@ -6530,6 +6528,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6530 return inode; 6528 return inode;
6531} 6529}
6532 6530
6531static void btrfs_i_callback(struct rcu_head *head)
6532{
6533 struct inode *inode = container_of(head, struct inode, i_rcu);
6534 INIT_LIST_HEAD(&inode->i_dentry);
6535 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6536}
6537
6533void btrfs_destroy_inode(struct inode *inode) 6538void btrfs_destroy_inode(struct inode *inode)
6534{ 6539{
6535 struct btrfs_ordered_extent *ordered; 6540 struct btrfs_ordered_extent *ordered;
@@ -6599,7 +6604,7 @@ void btrfs_destroy_inode(struct inode *inode)
6599 inode_tree_del(inode); 6604 inode_tree_del(inode);
6600 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 6605 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
6601free: 6606free:
6602 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6607 call_rcu(&inode->i_rcu, btrfs_i_callback);
6603} 6608}
6604 6609
6605int btrfs_drop_inode(struct inode *inode) 6610int btrfs_drop_inode(struct inode *inode)
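The call_rcu() conversion here matches the icache RCU-free pattern applied tree-wide in this series: i_rcu is unioned with i_dentry, so the callback restores the list head before returning the object to the slab. The generic shape, for a hypothetical foofs (cachep and container helpers are placeholders):

/* Hypothetical foofs: RCU-delayed inode free. */
static void foofs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* i_rcu overlaid i_dentry */
	kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
}

static void foofs_destroy_inode(struct inode *inode)
{
	/* tear down fs-private state first, then defer the free so
	 * rcu-walk lookups never dereference freed memory */
	call_rcu(&inode->i_rcu, foofs_i_callback);
}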
@@ -7128,118 +7133,12 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
7128 min_size, actual_len, alloc_hint, trans); 7133 min_size, actual_len, alloc_hint, trans);
7129} 7134}
7130 7135
7131static long btrfs_fallocate(struct inode *inode, int mode,
7132 loff_t offset, loff_t len)
7133{
7134 struct extent_state *cached_state = NULL;
7135 u64 cur_offset;
7136 u64 last_byte;
7137 u64 alloc_start;
7138 u64 alloc_end;
7139 u64 alloc_hint = 0;
7140 u64 locked_end;
7141 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
7142 struct extent_map *em;
7143 int ret;
7144
7145 alloc_start = offset & ~mask;
7146 alloc_end = (offset + len + mask) & ~mask;
7147
7148 /*
7149 * wait for ordered IO before we have any locks. We'll loop again
7150 * below with the locks held.
7151 */
7152 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
7153
7154 mutex_lock(&inode->i_mutex);
7155 ret = inode_newsize_ok(inode, alloc_end);
7156 if (ret)
7157 goto out;
7158
7159 if (alloc_start > inode->i_size) {
7160 ret = btrfs_cont_expand(inode, alloc_start);
7161 if (ret)
7162 goto out;
7163 }
7164
7165 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
7166 if (ret)
7167 goto out;
7168
7169 locked_end = alloc_end - 1;
7170 while (1) {
7171 struct btrfs_ordered_extent *ordered;
7172
7173 /* the extent lock is ordered inside the running
7174 * transaction
7175 */
7176 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
7177 locked_end, 0, &cached_state, GFP_NOFS);
7178 ordered = btrfs_lookup_first_ordered_extent(inode,
7179 alloc_end - 1);
7180 if (ordered &&
7181 ordered->file_offset + ordered->len > alloc_start &&
7182 ordered->file_offset < alloc_end) {
7183 btrfs_put_ordered_extent(ordered);
7184 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7185 alloc_start, locked_end,
7186 &cached_state, GFP_NOFS);
7187 /*
7188 * we can't wait on the range with the transaction
7189 * running or with the extent lock held
7190 */
7191 btrfs_wait_ordered_range(inode, alloc_start,
7192 alloc_end - alloc_start);
7193 } else {
7194 if (ordered)
7195 btrfs_put_ordered_extent(ordered);
7196 break;
7197 }
7198 }
7199
7200 cur_offset = alloc_start;
7201 while (1) {
7202 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
7203 alloc_end - cur_offset, 0);
7204 BUG_ON(IS_ERR(em) || !em);
7205 last_byte = min(extent_map_end(em), alloc_end);
7206 last_byte = (last_byte + mask) & ~mask;
7207 if (em->block_start == EXTENT_MAP_HOLE ||
7208 (cur_offset >= inode->i_size &&
7209 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7210 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
7211 last_byte - cur_offset,
7212 1 << inode->i_blkbits,
7213 offset + len,
7214 &alloc_hint);
7215 if (ret < 0) {
7216 free_extent_map(em);
7217 break;
7218 }
7219 }
7220 free_extent_map(em);
7221
7222 cur_offset = last_byte;
7223 if (cur_offset >= alloc_end) {
7224 ret = 0;
7225 break;
7226 }
7227 }
7228 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
7229 &cached_state, GFP_NOFS);
7230
7231 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
7232out:
7233 mutex_unlock(&inode->i_mutex);
7234 return ret;
7235}
7236
7237static int btrfs_set_page_dirty(struct page *page) 7136static int btrfs_set_page_dirty(struct page *page)
7238{ 7137{
7239 return __set_page_dirty_nobuffers(page); 7138 return __set_page_dirty_nobuffers(page);
7240} 7139}
7241 7140
7242static int btrfs_permission(struct inode *inode, int mask) 7141static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7243{ 7142{
7244 struct btrfs_root *root = BTRFS_I(inode)->root; 7143 struct btrfs_root *root = BTRFS_I(inode)->root;
7245 7144
@@ -7247,7 +7146,7 @@ static int btrfs_permission(struct inode *inode, int mask)
7247 return -EROFS; 7146 return -EROFS;
7248 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7147 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7249 return -EACCES; 7148 return -EACCES;
7250 return generic_permission(inode, mask, btrfs_check_acl); 7149 return generic_permission(inode, mask, flags, btrfs_check_acl);
7251} 7150}
7252 7151
7253static const struct inode_operations btrfs_dir_inode_operations = { 7152static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7340,7 +7239,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7340 .listxattr = btrfs_listxattr, 7239 .listxattr = btrfs_listxattr,
7341 .removexattr = btrfs_removexattr, 7240 .removexattr = btrfs_removexattr,
7342 .permission = btrfs_permission, 7241 .permission = btrfs_permission,
7343 .fallocate = btrfs_fallocate,
7344 .fiemap = btrfs_fiemap, 7242 .fiemap = btrfs_fiemap,
7345}; 7243};
7346static const struct inode_operations btrfs_special_inode_operations = { 7244static const struct inode_operations btrfs_special_inode_operations = {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0209b5fc772c..a004008f7d28 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -567,6 +567,7 @@ static int btrfs_fill_super(struct super_block *sb,
567 sb->s_maxbytes = MAX_LFS_FILESIZE; 567 sb->s_maxbytes = MAX_LFS_FILESIZE;
568 sb->s_magic = BTRFS_SUPER_MAGIC; 568 sb->s_magic = BTRFS_SUPER_MAGIC;
569 sb->s_op = &btrfs_super_ops; 569 sb->s_op = &btrfs_super_ops;
570 sb->s_d_op = &btrfs_dentry_operations;
570 sb->s_export_op = &btrfs_export_ops; 571 sb->s_export_op = &btrfs_export_ops;
571 sb->s_xattr = btrfs_xattr_handlers; 572 sb->s_xattr = btrfs_xattr_handlers;
572 sb->s_time_gran = 1; 573 sb->s_time_gran = 1;
@@ -698,8 +699,8 @@ static int btrfs_set_super(struct super_block *s, void *data)
698 * Note: This is based on get_sb_bdev from fs/super.c with a few additions 699 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
699 * for multiple device setup. Make sure to keep it in sync. 700 * for multiple device setup. Make sure to keep it in sync.
700 */ 701 */
701static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 702static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
702 const char *dev_name, void *data, struct vfsmount *mnt) 703 const char *dev_name, void *data)
703{ 704{
704 struct block_device *bdev = NULL; 705 struct block_device *bdev = NULL;
705 struct super_block *s; 706 struct super_block *s;
@@ -719,7 +720,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
719 &subvol_name, &subvol_objectid, 720 &subvol_name, &subvol_objectid,
720 &fs_devices); 721 &fs_devices);
721 if (error) 722 if (error)
722 return error; 723 return ERR_PTR(error);
723 724
724 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); 725 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
725 if (error) 726 if (error)
@@ -812,11 +813,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
812 root = new_root; 813 root = new_root;
813 } 814 }
814 815
815 mnt->mnt_sb = s;
816 mnt->mnt_root = root;
817
818 kfree(subvol_name); 816 kfree(subvol_name);
819 return 0; 817 return root;
820 818
821error_s: 819error_s:
822 error = PTR_ERR(s); 820 error = PTR_ERR(s);
@@ -826,7 +824,7 @@ error_close_devices:
826 kfree(tree_root); 824 kfree(tree_root);
827error_free_subvol_name: 825error_free_subvol_name:
828 kfree(subvol_name); 826 kfree(subvol_name);
829 return error; 827 return ERR_PTR(error);
830} 828}
831 829
832static int btrfs_remount(struct super_block *sb, int *flags, char *data) 830static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1043,7 +1041,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1043static struct file_system_type btrfs_fs_type = { 1041static struct file_system_type btrfs_fs_type = {
1044 .owner = THIS_MODULE, 1042 .owner = THIS_MODULE,
1045 .name = "btrfs", 1043 .name = "btrfs",
1046 .get_sb = btrfs_get_sb, 1044 .mount = btrfs_mount,
1047 .kill_sb = kill_anon_super, 1045 .kill_sb = kill_anon_super,
1048 .fs_flags = FS_REQUIRES_DEV, 1046 .fs_flags = FS_REQUIRES_DEV,
1049}; 1047};
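btrfs_get_sb() becomes btrfs_mount() as part of the tree-wide ->mount conversion: the method now returns the root dentry (or ERR_PTR) instead of filling in a vfsmount. For a single-device filesystem the conversion usually reduces to mount_bdev(); a sketch for a hypothetical fs, noting that btrfs itself keeps its custom multi-device logic and kill_anon_super as shown above:

/* Hypothetical fs: post-conversion mount method. */
static struct dentry *foofs_mount(struct file_system_type *fs_type,
				  int flags, const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, foofs_fill_super);
}

static struct file_system_type foofs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "foofs",
	.mount		= foofs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};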
@@ -1112,6 +1110,7 @@ static const struct file_operations btrfs_ctl_fops = {
1112 .unlocked_ioctl = btrfs_control_ioctl, 1110 .unlocked_ioctl = btrfs_control_ioctl,
1113 .compat_ioctl = btrfs_control_ioctl, 1111 .compat_ioctl = btrfs_control_ioctl,
1114 .owner = THIS_MODULE, 1112 .owner = THIS_MODULE,
1113 .llseek = noop_llseek,
1115}; 1114};
1116 1115
1117static struct miscdevice btrfs_misc = { 1116static struct miscdevice btrfs_misc = {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7cad59353b09..2636a051e4b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -399,7 +399,6 @@ static noinline int device_list_add(const char *path,
399 device->work.func = pending_bios_fn; 399 device->work.func = pending_bios_fn;
400 memcpy(device->uuid, disk_super->dev_item.uuid, 400 memcpy(device->uuid, disk_super->dev_item.uuid,
401 BTRFS_UUID_SIZE); 401 BTRFS_UUID_SIZE);
402 device->barriers = 1;
403 spin_lock_init(&device->io_lock); 402 spin_lock_init(&device->io_lock);
404 device->name = kstrdup(path, GFP_NOFS); 403 device->name = kstrdup(path, GFP_NOFS);
405 if (!device->name) { 404 if (!device->name) {
@@ -467,7 +466,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
467 device->devid = orig_dev->devid; 466 device->devid = orig_dev->devid;
468 device->work.func = pending_bios_fn; 467 device->work.func = pending_bios_fn;
469 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 468 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
470 device->barriers = 1;
471 spin_lock_init(&device->io_lock); 469 spin_lock_init(&device->io_lock);
472 INIT_LIST_HEAD(&device->dev_list); 470 INIT_LIST_HEAD(&device->dev_list);
473 INIT_LIST_HEAD(&device->dev_alloc_list); 471 INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -496,7 +494,7 @@ again:
496 continue; 494 continue;
497 495
498 if (device->bdev) { 496 if (device->bdev) {
499 close_bdev_exclusive(device->bdev, device->mode); 497 blkdev_put(device->bdev, device->mode);
500 device->bdev = NULL; 498 device->bdev = NULL;
501 fs_devices->open_devices--; 499 fs_devices->open_devices--;
502 } 500 }
@@ -530,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
530 528
531 list_for_each_entry(device, &fs_devices->devices, dev_list) { 529 list_for_each_entry(device, &fs_devices->devices, dev_list) {
532 if (device->bdev) { 530 if (device->bdev) {
533 close_bdev_exclusive(device->bdev, device->mode); 531 blkdev_put(device->bdev, device->mode);
534 fs_devices->open_devices--; 532 fs_devices->open_devices--;
535 } 533 }
536 if (device->writeable) { 534 if (device->writeable) {
@@ -587,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
587 int seeding = 1; 585 int seeding = 1;
588 int ret = 0; 586 int ret = 0;
589 587
588 flags |= FMODE_EXCL;
589
590 list_for_each_entry(device, head, dev_list) { 590 list_for_each_entry(device, head, dev_list) {
591 if (device->bdev) 591 if (device->bdev)
592 continue; 592 continue;
593 if (!device->name) 593 if (!device->name)
594 continue; 594 continue;
595 595
596 bdev = open_bdev_exclusive(device->name, flags, holder); 596 bdev = blkdev_get_by_path(device->name, flags, holder);
597 if (IS_ERR(bdev)) { 597 if (IS_ERR(bdev)) {
598 printk(KERN_INFO "open %s failed\n", device->name); 598 printk(KERN_INFO "open %s failed\n", device->name);
599 goto error; 599 goto error;
@@ -647,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
647error_brelse: 647error_brelse:
648 brelse(bh); 648 brelse(bh);
649error_close: 649error_close:
650 close_bdev_exclusive(bdev, FMODE_READ); 650 blkdev_put(bdev, flags);
651error: 651error:
652 continue; 652 continue;
653 } 653 }
@@ -693,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
693 693
694 mutex_lock(&uuid_mutex); 694 mutex_lock(&uuid_mutex);
695 695
696 bdev = open_bdev_exclusive(path, flags, holder); 696 flags |= FMODE_EXCL;
697 bdev = blkdev_get_by_path(path, flags, holder);
697 698
698 if (IS_ERR(bdev)) { 699 if (IS_ERR(bdev)) {
699 ret = PTR_ERR(bdev); 700 ret = PTR_ERR(bdev);
@@ -725,7 +726,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
725 726
726 brelse(bh); 727 brelse(bh);
727error_close: 728error_close:
728 close_bdev_exclusive(bdev, flags); 729 blkdev_put(bdev, flags);
729error: 730error:
730 mutex_unlock(&uuid_mutex); 731 mutex_unlock(&uuid_mutex);
731 return ret; 732 return ret;
@@ -1299,8 +1300,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1299 goto out; 1300 goto out;
1300 } 1301 }
1301 } else { 1302 } else {
1302 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1303 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1303 root->fs_info->bdev_holder); 1304 root->fs_info->bdev_holder);
1304 if (IS_ERR(bdev)) { 1305 if (IS_ERR(bdev)) {
1305 ret = PTR_ERR(bdev); 1306 ret = PTR_ERR(bdev);
1306 goto out; 1307 goto out;
@@ -1367,7 +1368,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1367 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1368 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1368 1369
1369 if (device->bdev) { 1370 if (device->bdev) {
1370 close_bdev_exclusive(device->bdev, device->mode); 1371 blkdev_put(device->bdev, device->mode);
1371 device->bdev = NULL; 1372 device->bdev = NULL;
1372 device->fs_devices->open_devices--; 1373 device->fs_devices->open_devices--;
1373 } 1374 }
@@ -1410,7 +1411,7 @@ error_brelse:
1410 brelse(bh); 1411 brelse(bh);
1411error_close: 1412error_close:
1412 if (bdev) 1413 if (bdev)
1413 close_bdev_exclusive(bdev, FMODE_READ); 1414 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1414out: 1415out:
1415 mutex_unlock(&root->fs_info->volume_mutex); 1416 mutex_unlock(&root->fs_info->volume_mutex);
1416 mutex_unlock(&uuid_mutex); 1417 mutex_unlock(&uuid_mutex);
@@ -1562,7 +1563,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1562 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1563 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1563 return -EINVAL; 1564 return -EINVAL;
1564 1565
1565 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1566 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1567 root->fs_info->bdev_holder);
1566 if (IS_ERR(bdev)) 1568 if (IS_ERR(bdev))
1567 return PTR_ERR(bdev); 1569 return PTR_ERR(bdev);
1568 1570
@@ -1616,7 +1618,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1616 1618
1617 lock_chunks(root); 1619 lock_chunks(root);
1618 1620
1619 device->barriers = 1;
1620 device->writeable = 1; 1621 device->writeable = 1;
1621 device->work.func = pending_bios_fn; 1622 device->work.func = pending_bios_fn;
1622 generate_random_uuid(device->uuid); 1623 generate_random_uuid(device->uuid);
@@ -1695,7 +1696,7 @@ out:
1695 mutex_unlock(&root->fs_info->volume_mutex); 1696 mutex_unlock(&root->fs_info->volume_mutex);
1696 return ret; 1697 return ret;
1697error: 1698error:
1698 close_bdev_exclusive(bdev, 0); 1699 blkdev_put(bdev, FMODE_EXCL);
1699 if (seeding_dev) { 1700 if (seeding_dev) {
1700 mutex_unlock(&uuid_mutex); 1701 mutex_unlock(&uuid_mutex);
1701 up_write(&sb->s_umount); 1702 up_write(&sb->s_umount);
@@ -3393,7 +3394,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3393 return NULL; 3394 return NULL;
3394 list_add(&device->dev_list, 3395 list_add(&device->dev_list,
3395 &fs_devices->devices); 3396 &fs_devices->devices);
3396 device->barriers = 1;
3397 device->dev_root = root->fs_info->dev_root; 3397 device->dev_root = root->fs_info->dev_root;
3398 device->devid = devid; 3398 device->devid = devid;
3399 device->work.func = pending_bios_fn; 3399 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7af6144a7954..7fb59d45fe8c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -45,7 +45,6 @@ struct btrfs_device {
45 int running_pending; 45 int running_pending;
46 u64 generation; 46 u64 generation;
47 47
48 int barriers;
49 int writeable; 48 int writeable;
50 int in_fs_metadata; 49 int in_fs_metadata;
51 int missing; 50 int missing;
@@ -54,7 +53,7 @@ struct btrfs_device {
54 53
55 struct block_device *bdev; 54 struct block_device *bdev;
56 55
57 /* the mode sent to open_bdev_exclusive */ 56 /* the mode sent to blkdev_get */
58 fmode_t mode; 57 fmode_t mode;
59 58
60 char *name; 59 char *name;
diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1c..2219a76e2caf 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
156 if (uptodate) { 156 if (uptodate) {
157 set_buffer_uptodate(bh); 157 set_buffer_uptodate(bh);
158 } else { 158 } else {
159 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { 159 if (!quiet_error(bh)) {
160 buffer_io_error(bh); 160 buffer_io_error(bh);
161 printk(KERN_WARNING "lost page write due to " 161 printk(KERN_WARNING "lost page write due to "
162 "I/O error on %s\n", 162 "I/O error on %s\n",
@@ -905,7 +905,6 @@ try_again:
905 905
906 bh->b_state = 0; 906 bh->b_state = 0;
907 atomic_set(&bh->b_count, 0); 907 atomic_set(&bh->b_count, 0);
908 bh->b_private = NULL;
909 bh->b_size = size; 908 bh->b_size = size;
910 909
911 /* Link the buffer to its page */ 910 /* Link the buffer to its page */
@@ -1271,12 +1270,10 @@ static inline void check_irqs_on(void)
1271static void bh_lru_install(struct buffer_head *bh) 1270static void bh_lru_install(struct buffer_head *bh)
1272{ 1271{
1273 struct buffer_head *evictee = NULL; 1272 struct buffer_head *evictee = NULL;
1274 struct bh_lru *lru;
1275 1273
1276 check_irqs_on(); 1274 check_irqs_on();
1277 bh_lru_lock(); 1275 bh_lru_lock();
1278 lru = &__get_cpu_var(bh_lrus); 1276 if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1279 if (lru->bhs[0] != bh) {
1280 struct buffer_head *bhs[BH_LRU_SIZE]; 1277 struct buffer_head *bhs[BH_LRU_SIZE];
1281 int in; 1278 int in;
1282 int out = 0; 1279 int out = 0;
@@ -1284,7 +1281,8 @@ static void bh_lru_install(struct buffer_head *bh)
1284 get_bh(bh); 1281 get_bh(bh);
1285 bhs[out++] = bh; 1282 bhs[out++] = bh;
1286 for (in = 0; in < BH_LRU_SIZE; in++) { 1283 for (in = 0; in < BH_LRU_SIZE; in++) {
1287 struct buffer_head *bh2 = lru->bhs[in]; 1284 struct buffer_head *bh2 =
1285 __this_cpu_read(bh_lrus.bhs[in]);
1288 1286
1289 if (bh2 == bh) { 1287 if (bh2 == bh) {
1290 __brelse(bh2); 1288 __brelse(bh2);
@@ -1299,7 +1297,7 @@ static void bh_lru_install(struct buffer_head *bh)
1299 } 1297 }
1300 while (out < BH_LRU_SIZE) 1298 while (out < BH_LRU_SIZE)
1301 bhs[out++] = NULL; 1299 bhs[out++] = NULL;
1302 memcpy(lru->bhs, bhs, sizeof(bhs)); 1300 memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1303 } 1301 }
1304 bh_lru_unlock(); 1302 bh_lru_unlock();
1305 1303
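These buffer.c hunks swap __get_cpu_var() pointer dereferences for this_cpu accessors, which fold the per-cpu offset into a single operation. The __this_cpu_* forms are legal here because bh_lru_lock() already makes the section non-preemptible; in preemptible context the unprefixed ops are the safe idiom. An illustrative counter:

/* Illustrative per-cpu counter using this_cpu ops. */
DEFINE_PER_CPU(unsigned long, example_hits);

static void example_count(void)
{
	this_cpu_inc(example_hits);	/* preempt-safe as one operation */
}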
@@ -1314,23 +1312,22 @@ static struct buffer_head *
1314lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) 1312lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1315{ 1313{
1316 struct buffer_head *ret = NULL; 1314 struct buffer_head *ret = NULL;
1317 struct bh_lru *lru;
1318 unsigned int i; 1315 unsigned int i;
1319 1316
1320 check_irqs_on(); 1317 check_irqs_on();
1321 bh_lru_lock(); 1318 bh_lru_lock();
1322 lru = &__get_cpu_var(bh_lrus);
1323 for (i = 0; i < BH_LRU_SIZE; i++) { 1319 for (i = 0; i < BH_LRU_SIZE; i++) {
1324 struct buffer_head *bh = lru->bhs[i]; 1320 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1325 1321
1326 if (bh && bh->b_bdev == bdev && 1322 if (bh && bh->b_bdev == bdev &&
1327 bh->b_blocknr == block && bh->b_size == size) { 1323 bh->b_blocknr == block && bh->b_size == size) {
1328 if (i) { 1324 if (i) {
1329 while (i) { 1325 while (i) {
1330 lru->bhs[i] = lru->bhs[i - 1]; 1326 __this_cpu_write(bh_lrus.bhs[i],
1327 __this_cpu_read(bh_lrus.bhs[i - 1]));
1331 i--; 1328 i--;
1332 } 1329 }
1333 lru->bhs[0] = bh; 1330 __this_cpu_write(bh_lrus.bhs[0], bh);
1334 } 1331 }
1335 get_bh(bh); 1332 get_bh(bh);
1336 ret = bh; 1333 ret = bh;
@@ -1706,7 +1703,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1706 * and kswapd activity, but those code paths have their own 1703 * and kswapd activity, but those code paths have their own
1707 * higher-level throttling. 1704 * higher-level throttling.
1708 */ 1705 */
1709 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1706 if (wbc->sync_mode != WB_SYNC_NONE) {
1710 lock_buffer(bh); 1707 lock_buffer(bh);
1711 } else if (!trylock_buffer(bh)) { 1708 } else if (!trylock_buffer(bh)) {
1712 redirty_page_for_writepage(wbc, page); 1709 redirty_page_for_writepage(wbc, page);
@@ -1834,9 +1831,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1834} 1831}
1835EXPORT_SYMBOL(page_zero_new_buffers); 1832EXPORT_SYMBOL(page_zero_new_buffers);
1836 1833
1837int block_prepare_write(struct page *page, unsigned from, unsigned to, 1834int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1838 get_block_t *get_block) 1835 get_block_t *get_block)
1839{ 1836{
1837 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1838 unsigned to = from + len;
1840 struct inode *inode = page->mapping->host; 1839 struct inode *inode = page->mapping->host;
1841 unsigned block_start, block_end; 1840 unsigned block_start, block_end;
1842 sector_t block; 1841 sector_t block;
@@ -1916,7 +1915,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
1916 } 1915 }
1917 return err; 1916 return err;
1918} 1917}
1919EXPORT_SYMBOL(block_prepare_write); 1918EXPORT_SYMBOL(__block_write_begin);
1920 1919
1921static int __block_commit_write(struct inode *inode, struct page *page, 1920static int __block_commit_write(struct inode *inode, struct page *page,
1922 unsigned from, unsigned to) 1921 unsigned from, unsigned to)
@@ -1953,15 +1952,6 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1953 return 0; 1952 return 0;
1954} 1953}
1955 1954
1956int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1957 get_block_t *get_block)
1958{
1959 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
1960
1961 return block_prepare_write(page, start, start + len, get_block);
1962}
1963EXPORT_SYMBOL(__block_write_begin);
1964
1965/* 1955/*
1966 * block_write_begin takes care of the basic task of block allocation and 1956 * block_write_begin takes care of the basic task of block allocation and
1967 * bringing partial write blocks uptodate first. 1957 * bringing partial write blocks uptodate first.
@@ -2379,7 +2369,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2379 else 2369 else
2380 end = PAGE_CACHE_SIZE; 2370 end = PAGE_CACHE_SIZE;
2381 2371
2382 ret = block_prepare_write(page, 0, end, get_block); 2372 ret = __block_write_begin(page, 0, end, get_block);
2383 if (!ret) 2373 if (!ret)
2384 ret = block_commit_write(page, 0, end); 2374 ret = block_commit_write(page, 0, end);
2385 2375
@@ -2466,11 +2456,10 @@ int nobh_write_begin(struct address_space *mapping,
2466 *fsdata = NULL; 2456 *fsdata = NULL;
2467 2457
2468 if (page_has_buffers(page)) { 2458 if (page_has_buffers(page)) {
2469 unlock_page(page); 2459 ret = __block_write_begin(page, pos, len, get_block);
2470 page_cache_release(page); 2460 if (unlikely(ret))
2471 *pagep = NULL; 2461 goto out_release;
2472 return block_write_begin(mapping, pos, len, flags, pagep, 2462 return ret;
2473 get_block);
2474 } 2463 }
2475 2464
2476 if (PageMappedToDisk(page)) 2465 if (PageMappedToDisk(page))
@@ -2891,7 +2880,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2891 2880
2892 if (err == -EOPNOTSUPP) { 2881 if (err == -EOPNOTSUPP) {
2893 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2882 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2894 set_bit(BH_Eopnotsupp, &bh->b_state);
2895 } 2883 }
2896 2884
2897 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) 2885 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3031,10 +3019,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3031 bh->b_end_io = end_buffer_write_sync; 3019 bh->b_end_io = end_buffer_write_sync;
3032 ret = submit_bh(rw, bh); 3020 ret = submit_bh(rw, bh);
3033 wait_on_buffer(bh); 3021 wait_on_buffer(bh);
3034 if (buffer_eopnotsupp(bh)) {
3035 clear_buffer_eopnotsupp(bh);
3036 ret = -EOPNOTSUPP;
3037 }
3038 if (!ret && !buffer_uptodate(bh)) 3022 if (!ret && !buffer_uptodate(bh))
3039 ret = -EIO; 3023 ret = -EIO;
3040 } else { 3024 } else {
@@ -3217,22 +3201,23 @@ static void recalc_bh_state(void)
3217 int i; 3201 int i;
3218 int tot = 0; 3202 int tot = 0;
3219 3203
3220 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) 3204 if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3221 return; 3205 return;
3222 __get_cpu_var(bh_accounting).ratelimit = 0; 3206 __this_cpu_write(bh_accounting.ratelimit, 0);
3223 for_each_online_cpu(i) 3207 for_each_online_cpu(i)
3224 tot += per_cpu(bh_accounting, i).nr; 3208 tot += per_cpu(bh_accounting, i).nr;
3225 buffer_heads_over_limit = (tot > max_buffer_heads); 3209 buffer_heads_over_limit = (tot > max_buffer_heads);
3226} 3210}
3227 3211
3228struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3212struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3229{ 3213{
3230 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); 3214 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3231 if (ret) { 3215 if (ret) {
3232 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3216 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3233 get_cpu_var(bh_accounting).nr++; 3217 preempt_disable();
3218 __this_cpu_inc(bh_accounting.nr);
3234 recalc_bh_state(); 3219 recalc_bh_state();
3235 put_cpu_var(bh_accounting); 3220 preempt_enable();
3236 } 3221 }
3237 return ret; 3222 return ret;
3238} 3223}
@@ -3242,9 +3227,10 @@ void free_buffer_head(struct buffer_head *bh)
3242{ 3227{
3243 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 3228 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3244 kmem_cache_free(bh_cachep, bh); 3229 kmem_cache_free(bh_cachep, bh);
3245 get_cpu_var(bh_accounting).nr--; 3230 preempt_disable();
3231 __this_cpu_dec(bh_accounting.nr);
3246 recalc_bh_state(); 3232 recalc_bh_state();
3247 put_cpu_var(bh_accounting); 3233 preempt_enable();
3248} 3234}
3249EXPORT_SYMBOL(free_buffer_head); 3235EXPORT_SYMBOL(free_buffer_head);
3250 3236
@@ -3257,9 +3243,8 @@ static void buffer_exit_cpu(int cpu)
3257 brelse(b->bhs[i]); 3243 brelse(b->bhs[i]);
3258 b->bhs[i] = NULL; 3244 b->bhs[i] = NULL;
3259 } 3245 }
3260 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; 3246 this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3261 per_cpu(bh_accounting, cpu).nr = 0; 3247 per_cpu(bh_accounting, cpu).nr = 0;
3262 put_cpu_var(bh_accounting);
3263} 3248}
3264 3249
3265static int buffer_cpu_notify(struct notifier_block *self, 3250static int buffer_cpu_notify(struct notifier_block *self,
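The buffer.c hunks above all follow one conversion pattern: open-coded get_cpu_var()/put_cpu_var() pairs become the this_cpu accessor family, with an explicit preempt_disable()/preempt_enable() window around multi-step sequences that must stay on one CPU (alloc_buffer_head, free_buffer_head). A minimal kernel-style sketch of the before/after shape, assuming a trivial per-cpu counter (illustrative only, not from the patch):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, demo_counter);

/* old idiom: get_cpu_var() disables preemption and yields an lvalue,
 * put_cpu_var() re-enables preemption */
static void bump_old(void)
{
	get_cpu_var(demo_counter)++;
	put_cpu_var(demo_counter);
}

/* new idiom: a single __this_cpu_inc(); the explicit preempt window is
 * only needed when several per-cpu operations must hit the same CPU */
static void bump_new(void)
{
	preempt_disable();
	__this_cpu_inc(demo_counter);
	preempt_enable();
}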
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 727caedcdd92..0a1467b15516 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -55,6 +55,7 @@ const struct file_operations cachefiles_daemon_fops = {
55 .read = cachefiles_daemon_read, 55 .read = cachefiles_daemon_read,
56 .write = cachefiles_daemon_write, 56 .write = cachefiles_daemon_write,
57 .poll = cachefiles_daemon_poll, 57 .poll = cachefiles_daemon_poll,
58 .llseek = noop_llseek,
58}; 59};
59 60
60struct cachefiles_daemon_cmd { 61struct cachefiles_daemon_cmd {
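The one-line cachefiles change makes the daemon control file's llseek behaviour explicit: noop_llseek() reports success without moving f_pos, which suits a command-style control file. A hedged sketch of the wiring; the handler and names below are hypothetical:

#include <linux/fs.h>
#include <linux/module.h>

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t count, loff_t *ppos)
{
	return 0;	/* hypothetical handler: nothing to read */
}

static const struct file_operations demo_fops = {
	.owner	= THIS_MODULE,
	.read	= demo_read,
	.llseek	= noop_llseek,	/* lseek() succeeds, f_pos is untouched */
};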
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 0fcd2640c23f..9eb134ea6eb2 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
1config CEPH_FS 1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)" 2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select CEPH_LIB
4 select LIBCRC32C 5 select LIBCRC32C
5 select CRYPTO_AES 6 select CRYPTO_AES
6 select CRYPTO 7 select CRYPTO
8 default n
7 help 9 help
8 Choose Y or M here to include support for mounting the 10 Choose Y or M here to include support for mounting the
9 experimental Ceph distributed file system. Ceph is an extremely 11 experimental Ceph distributed file system. Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
14 16
15 If unsure, say N. 17 If unsure, say N.
16 18
17config CEPH_FS_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_FS
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This increases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600d..bd352125e829 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,38 +2,10 @@
2# Makefile for CEPH filesystem. 2# Makefile for CEPH filesystem.
3# 3#
4 4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o 5obj-$(CONFIG_CEPH_FS) += ceph.o
8 6
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 7ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 8 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 9 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 mds_client.o mdsmap.o \ 10 debugfs.o
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22#Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38 11
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index efbc604001c8..561438b6a50c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -10,7 +10,8 @@
10#include <linux/task_io_accounting_ops.h> 10#include <linux/task_io_accounting_ops.h>
11 11
12#include "super.h" 12#include "super.h"
13#include "osd_client.h" 13#include "mds_client.h"
14#include <linux/ceph/osd_client.h>
14 15
15/* 16/*
16 * Ceph address space ops. 17 * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
193{ 194{
194 struct inode *inode = filp->f_dentry->d_inode; 195 struct inode *inode = filp->f_dentry->d_inode;
195 struct ceph_inode_info *ci = ceph_inode(inode); 196 struct ceph_inode_info *ci = ceph_inode(inode);
196 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 197 struct ceph_osd_client *osdc =
198 &ceph_inode_to_client(inode)->client->osdc;
197 int err = 0; 199 int err = 0;
198 u64 len = PAGE_CACHE_SIZE; 200 u64 len = PAGE_CACHE_SIZE;
199 201
@@ -202,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
202 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
203 page->index << PAGE_CACHE_SHIFT, &len, 205 page->index << PAGE_CACHE_SHIFT, &len,
204 ci->i_truncate_seq, ci->i_truncate_size, 206 ci->i_truncate_seq, ci->i_truncate_size,
205 &page, 1); 207 &page, 1, 0);
206 if (err == -ENOENT) 208 if (err == -ENOENT)
207 err = 0; 209 err = 0;
208 if (err < 0) { 210 if (err < 0) {
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
265{ 267{
266 struct inode *inode = file->f_dentry->d_inode; 268 struct inode *inode = file->f_dentry->d_inode;
267 struct ceph_inode_info *ci = ceph_inode(inode); 269 struct ceph_inode_info *ci = ceph_inode(inode);
268 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 270 struct ceph_osd_client *osdc =
271 &ceph_inode_to_client(inode)->client->osdc;
269 int rc = 0; 272 int rc = 0;
270 struct page **pages; 273 struct page **pages;
271 loff_t offset; 274 loff_t offset;
@@ -284,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
284 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
285 offset, &len, 288 offset, &len,
286 ci->i_truncate_seq, ci->i_truncate_size, 289 ci->i_truncate_seq, ci->i_truncate_size,
287 pages, nr_pages); 290 pages, nr_pages, 0);
288 if (rc == -ENOENT) 291 if (rc == -ENOENT)
289 rc = 0; 292 rc = 0;
290 if (rc < 0) 293 if (rc < 0)
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
365{ 368{
366 struct inode *inode; 369 struct inode *inode;
367 struct ceph_inode_info *ci; 370 struct ceph_inode_info *ci;
368 struct ceph_client *client; 371 struct ceph_fs_client *fsc;
369 struct ceph_osd_client *osdc; 372 struct ceph_osd_client *osdc;
370 loff_t page_off = page->index << PAGE_CACHE_SHIFT; 373 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
371 int len = PAGE_CACHE_SIZE; 374 int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
383 } 386 }
384 inode = page->mapping->host; 387 inode = page->mapping->host;
385 ci = ceph_inode(inode); 388 ci = ceph_inode(inode);
386 client = ceph_inode_to_client(inode); 389 fsc = ceph_inode_to_client(inode);
387 osdc = &client->osdc; 390 osdc = &fsc->client->osdc;
388 391
389 /* verify this is a writeable snap context */ 392 /* verify this is a writeable snap context */
390 snapc = (void *)page->private; 393 snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
414 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", 417 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
415 inode, page, page->index, page_off, len, snapc); 418 inode, page, page->index, page_off, len, snapc);
416 419
417 writeback_stat = atomic_long_inc_return(&client->writeback_count); 420 writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
418 if (writeback_stat > 421 if (writeback_stat >
419 CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) 422 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
420 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 423 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
421 424
422 set_page_writeback(page); 425 set_page_writeback(page);
423 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 426 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
496 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
497 __s32 rc = -EIO; 500 __s32 rc = -EIO;
498 u64 bytes = 0; 501 u64 bytes = 0;
499 struct ceph_client *client = ceph_inode_to_client(inode); 502 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
500 long writeback_stat; 503 long writeback_stat;
501 unsigned issued = ceph_caps_issued(ci); 504 unsigned issued = ceph_caps_issued(ci);
502 505
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
529 WARN_ON(!PageUptodate(page)); 532 WARN_ON(!PageUptodate(page));
530 533
531 writeback_stat = 534 writeback_stat =
532 atomic_long_dec_return(&client->writeback_count); 535 atomic_long_dec_return(&fsc->writeback_count);
533 if (writeback_stat < 536 if (writeback_stat <
534 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) 537 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
535 clear_bdi_congested(&client->backing_dev_info, 538 clear_bdi_congested(&fsc->backing_dev_info,
536 BLK_RW_ASYNC); 539 BLK_RW_ASYNC);
537 540
538 ceph_put_snap_context((void *)page->private); 541 ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
569 * mempool. we avoid the mempool if we can because req->r_num_pages 572 * mempool. we avoid the mempool if we can because req->r_num_pages
570 * may be less than the maximum write size. 573 * may be less than the maximum write size.
571 */ 574 */
572static void alloc_page_vec(struct ceph_client *client, 575static void alloc_page_vec(struct ceph_fs_client *fsc,
573 struct ceph_osd_request *req) 576 struct ceph_osd_request *req)
574{ 577{
575 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, 578 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
576 GFP_NOFS); 579 GFP_NOFS);
577 if (!req->r_pages) { 580 if (!req->r_pages) {
578 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); 581 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
579 req->r_pages_from_pool = 1; 582 req->r_pages_from_pool = 1;
580 WARN_ON(!req->r_pages); 583 WARN_ON(!req->r_pages);
581 } 584 }
@@ -588,9 +591,8 @@ static int ceph_writepages_start(struct address_space *mapping,
588 struct writeback_control *wbc) 591 struct writeback_control *wbc)
589{ 592{
590 struct inode *inode = mapping->host; 593 struct inode *inode = mapping->host;
591 struct backing_dev_info *bdi = mapping->backing_dev_info;
592 struct ceph_inode_info *ci = ceph_inode(inode); 594 struct ceph_inode_info *ci = ceph_inode(inode);
593 struct ceph_client *client; 595 struct ceph_fs_client *fsc;
594 pgoff_t index, start, end; 596 pgoff_t index, start, end;
595 int range_whole = 0; 597 int range_whole = 0;
596 int should_loop = 1; 598 int should_loop = 1;
@@ -617,26 +619,19 @@ static int ceph_writepages_start(struct address_space *mapping,
617 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 619 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
618 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 620 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
619 621
620 client = ceph_inode_to_client(inode); 622 fsc = ceph_inode_to_client(inode);
621 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { 623 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
622 pr_warning("writepage_start %p on forced umount\n", inode); 624 pr_warning("writepage_start %p on forced umount\n", inode);
623 return -EIO; /* we're in a forced umount, don't write! */ 625 return -EIO; /* we're in a forced umount, don't write! */
624 } 626 }
625 if (client->mount_args->wsize && client->mount_args->wsize < wsize) 627 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
626 wsize = client->mount_args->wsize; 628 wsize = fsc->mount_options->wsize;
627 if (wsize < PAGE_CACHE_SIZE) 629 if (wsize < PAGE_CACHE_SIZE)
628 wsize = PAGE_CACHE_SIZE; 630 wsize = PAGE_CACHE_SIZE;
629 max_pages_ever = wsize >> PAGE_CACHE_SHIFT; 631 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
630 632
631 pagevec_init(&pvec, 0); 633 pagevec_init(&pvec, 0);
632 634
633 /* ?? */
634 if (wbc->nonblocking && bdi_write_congested(bdi)) {
635 dout(" writepages congested\n");
636 wbc->encountered_congestion = 1;
637 goto out_final;
638 }
639
640 /* where to start/end? */ 635 /* where to start/end? */
641 if (wbc->range_cyclic) { 636 if (wbc->range_cyclic) {
642 start = mapping->writeback_index; /* Start from prev offset */ 637 start = mapping->writeback_index; /* Start from prev offset */
@@ -769,7 +764,7 @@ get_more_pages:
769 offset = (unsigned long long)page->index 764 offset = (unsigned long long)page->index
770 << PAGE_CACHE_SHIFT; 765 << PAGE_CACHE_SHIFT;
771 len = wsize; 766 len = wsize;
772 req = ceph_osdc_new_request(&client->osdc, 767 req = ceph_osdc_new_request(&fsc->client->osdc,
773 &ci->i_layout, 768 &ci->i_layout,
774 ceph_vino(inode), 769 ceph_vino(inode),
775 offset, &len, 770 offset, &len,
@@ -779,10 +774,10 @@ get_more_pages:
779 snapc, do_sync, 774 snapc, do_sync,
780 ci->i_truncate_seq, 775 ci->i_truncate_seq,
781 ci->i_truncate_size, 776 ci->i_truncate_size,
782 &inode->i_mtime, true, 1); 777 &inode->i_mtime, true, 1, 0);
783 max_pages = req->r_num_pages; 778 max_pages = req->r_num_pages;
784 779
785 alloc_page_vec(client, req); 780 alloc_page_vec(fsc, req);
786 req->r_callback = writepages_finish; 781 req->r_callback = writepages_finish;
787 req->r_inode = inode; 782 req->r_inode = inode;
788 } 783 }
@@ -794,10 +789,10 @@ get_more_pages:
794 inode, page, page->index); 789 inode, page, page->index);
795 790
796 writeback_stat = 791 writeback_stat =
797 atomic_long_inc_return(&client->writeback_count); 792 atomic_long_inc_return(&fsc->writeback_count);
798 if (writeback_stat > CONGESTION_ON_THRESH( 793 if (writeback_stat > CONGESTION_ON_THRESH(
799 client->mount_args->congestion_kb)) { 794 fsc->mount_options->congestion_kb)) {
800 set_bdi_congested(&client->backing_dev_info, 795 set_bdi_congested(&fsc->backing_dev_info,
801 BLK_RW_ASYNC); 796 BLK_RW_ASYNC);
802 } 797 }
803 798
@@ -846,7 +841,7 @@ get_more_pages:
846 op->payload_len = cpu_to_le32(len); 841 op->payload_len = cpu_to_le32(len);
847 req->r_request->hdr.data_len = cpu_to_le32(len); 842 req->r_request->hdr.data_len = cpu_to_le32(len);
848 843
849 ceph_osdc_start_request(&client->osdc, req, true); 844 ceph_osdc_start_request(&fsc->client->osdc, req, true);
850 req = NULL; 845 req = NULL;
851 846
852 /* continue? */ 847 /* continue? */
@@ -882,7 +877,6 @@ out:
882 rc = 0; /* vfs expects us to return 0 */ 877 rc = 0; /* vfs expects us to return 0 */
883 ceph_put_snap_context(snapc); 878 ceph_put_snap_context(snapc);
884 dout("writepages done, rc = %d\n", rc); 879 dout("writepages done, rc = %d\n", rc);
885out_final:
886 return rc; 880 return rc;
887} 881}
888 882
@@ -915,7 +909,7 @@ static int ceph_update_writeable_page(struct file *file,
915{ 909{
916 struct inode *inode = file->f_dentry->d_inode; 910 struct inode *inode = file->f_dentry->d_inode;
917 struct ceph_inode_info *ci = ceph_inode(inode); 911 struct ceph_inode_info *ci = ceph_inode(inode);
918 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 912 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
919 loff_t page_off = pos & PAGE_CACHE_MASK; 913 loff_t page_off = pos & PAGE_CACHE_MASK;
920 int pos_in_page = pos & ~PAGE_CACHE_MASK; 914 int pos_in_page = pos & ~PAGE_CACHE_MASK;
921 int end_in_page = pos_in_page + len; 915 int end_in_page = pos_in_page + len;
@@ -1053,8 +1047,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1053 struct page *page, void *fsdata) 1047 struct page *page, void *fsdata)
1054{ 1048{
1055 struct inode *inode = file->f_dentry->d_inode; 1049 struct inode *inode = file->f_dentry->d_inode;
1056 struct ceph_client *client = ceph_inode_to_client(inode); 1050 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1057 struct ceph_mds_client *mdsc = &client->mdsc; 1051 struct ceph_mds_client *mdsc = fsc->mdsc;
1058 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1052 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1059 int check_cap = 0; 1053 int check_cap = 0;
1060 1054
@@ -1123,7 +1117,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1123{ 1117{
1124 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1118 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1125 struct page *page = vmf->page; 1119 struct page *page = vmf->page;
1126 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1120 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1127 loff_t off = page->index << PAGE_CACHE_SHIFT; 1121 loff_t off = page->index << PAGE_CACHE_SHIFT;
1128 loff_t size, len; 1122 loff_t size, len;
1129 int ret; 1123 int ret;
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
deleted file mode 100644
index eb2a666b0be7..000000000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,103 +0,0 @@
1
2#include <linux/errno.h>
3
4int ceph_armor(char *dst, const char *src, const char *end);
5int ceph_unarmor(char *dst, const char *src, const char *end);
6
7/*
8 * base64 encode/decode.
9 */
10
11static const char *pem_key =
12 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
13
14static int encode_bits(int c)
15{
16 return pem_key[c];
17}
18
19static int decode_bits(char c)
20{
21 if (c >= 'A' && c <= 'Z')
22 return c - 'A';
23 if (c >= 'a' && c <= 'z')
24 return c - 'a' + 26;
25 if (c >= '0' && c <= '9')
26 return c - '0' + 52;
27 if (c == '+')
28 return 62;
29 if (c == '/')
30 return 63;
31 if (c == '=')
32 return 0; /* just non-negative, please */
33 return -EINVAL;
34}
35
36int ceph_armor(char *dst, const char *src, const char *end)
37{
38 int olen = 0;
39 int line = 0;
40
41 while (src < end) {
42 unsigned char a, b, c;
43
44 a = *src++;
45 *dst++ = encode_bits(a >> 2);
46 if (src < end) {
47 b = *src++;
48 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
49 if (src < end) {
50 c = *src++;
51 *dst++ = encode_bits(((b & 15) << 2) |
52 (c >> 6));
53 *dst++ = encode_bits(c & 63);
54 } else {
55 *dst++ = encode_bits((b & 15) << 2);
56 *dst++ = '=';
57 }
58 } else {
59 *dst++ = encode_bits(((a & 3) << 4));
60 *dst++ = '=';
61 *dst++ = '=';
62 }
63 olen += 4;
64 line += 4;
65 if (line == 64) {
66 line = 0;
67 *(dst++) = '\n';
68 olen++;
69 }
70 }
71 return olen;
72}
73
74int ceph_unarmor(char *dst, const char *src, const char *end)
75{
76 int olen = 0;
77
78 while (src < end) {
79 int a, b, c, d;
80
81 if (src < end && src[0] == '\n')
82 src++;
83 if (src + 4 > end)
84 return -EINVAL;
85 a = decode_bits(src[0]);
86 b = decode_bits(src[1]);
87 c = decode_bits(src[2]);
88 d = decode_bits(src[3]);
89 if (a < 0 || b < 0 || c < 0 || d < 0)
90 return -EINVAL;
91
92 *dst++ = (a << 2) | (b >> 4);
93 if (src[2] == '=')
94 return olen + 1;
95 *dst++ = ((b & 15) << 4) | (c >> 2);
96 if (src[3] == '=')
97 return olen + 2;
98 *dst++ = ((c & 3) << 6) | d;
99 olen += 3;
100 src += 4;
101 }
102 return olen;
103}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
deleted file mode 100644
index 6d2e30600627..000000000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/slab.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136static int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
deleted file mode 100644
index d38a2fb4a137..000000000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 const char *name;
19
20 /*
21 * true if we are authenticated and can connect to
22 * services.
23 */
24 int (*is_authenticated)(struct ceph_auth_client *ac);
25
26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
33 * build requests and process replies during monitor
34 * handshake. if handle_reply returns -EAGAIN, we build
35 * another request.
36 */
37 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
38 int (*handle_reply)(struct ceph_auth_client *ac, int result,
39 void *buf, void *end);
40
41 /*
42 * Create authorizer for connecting to a service, and verify
43 * the response to authenticate the service.
44 */
45 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
46 struct ceph_authorizer **a,
47 void **buf, size_t *len,
48 void **reply_buf, size_t *reply_len);
49 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
50 struct ceph_authorizer *a, size_t len);
51 void (*destroy_authorizer)(struct ceph_auth_client *ac,
52 struct ceph_authorizer *a);
53 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
54 int peer_type);
55
56 /* reset when we (re)connect to a monitor */
57 void (*reset)(struct ceph_auth_client *ac);
58
59 void (*destroy)(struct ceph_auth_client *ac);
60};
61
62struct ceph_auth_client {
63 u32 protocol; /* CEPH_AUTH_* */
64 void *private; /* for use by protocol implementation */
65 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
66
67 bool negotiating; /* true if negotiating protocol */
68 const char *name; /* entity name */
69 u64 global_id; /* our unique id in system */
70 const char *secret; /* our secret key */
71 unsigned want_keys; /* which services we want */
72};
73
74extern struct ceph_auth_client *ceph_auth_init(const char *name,
75 const char *secret);
76extern void ceph_auth_destroy(struct ceph_auth_client *ac);
77
78extern void ceph_auth_reset(struct ceph_auth_client *ac);
79
80extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
81 void *buf, size_t len);
82extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
83 void *buf, size_t len,
84 void *reply_buf, size_t reply_len);
85extern int ceph_entity_name_encode(const char *name, void **p, void *end);
86
87extern int ceph_build_auth(struct ceph_auth_client *ac,
88 void *msg_buf, size_t msg_len);
89
90extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
91
92#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
deleted file mode 100644
index ad1dc21286c7..000000000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
41/*
42 * the generic auth code decodes the global_id, and we carry no actual
43 * authentication state, so nothing happens here.
44 */
45static int handle_reply(struct ceph_auth_client *ac, int result,
46 void *buf, void *end)
47{
48 struct ceph_auth_none_info *xi = ac->private;
49
50 xi->starting = false;
51 return result;
52}
53
54/*
55 * build an 'authorizer' with our entity_name and global_id. we can
56 * reuse a single static copy since it is identical for all services
57 * we connect to.
58 */
59static int ceph_auth_none_create_authorizer(
60 struct ceph_auth_client *ac, int peer_type,
61 struct ceph_authorizer **a,
62 void **buf, size_t *len,
63 void **reply_buf, size_t *reply_len)
64{
65 struct ceph_auth_none_info *ai = ac->private;
66 struct ceph_none_authorizer *au = &ai->au;
67 void *p, *end;
68 int ret;
69
70 if (!ai->built_authorizer) {
71 p = au->buf;
72 end = p + sizeof(au->buf);
73 ceph_encode_8(&p, 1);
74 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
75 if (ret < 0)
76 goto bad;
77 ceph_decode_need(&p, end, sizeof(u64), bad2);
78 ceph_encode_64(&p, ac->global_id);
79 au->buf_len = p - (void *)au->buf;
80 ai->built_authorizer = true;
81 dout("built authorizer len %d\n", au->buf_len);
82 }
83
84 *a = (struct ceph_authorizer *)au;
85 *buf = au->buf;
86 *len = au->buf_len;
87 *reply_buf = au->reply_buf;
88 *reply_len = sizeof(au->reply_buf);
89 return 0;
90
91bad2:
92 ret = -ERANGE;
93bad:
94 return ret;
95}
96
97static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
98 struct ceph_authorizer *a)
99{
100 /* nothing to do */
101}
102
103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
105 .reset = reset,
106 .destroy = destroy,
107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
109 .handle_reply = handle_reply,
110 .create_authorizer = ceph_auth_none_create_authorizer,
111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
112};
113
114int ceph_auth_none_init(struct ceph_auth_client *ac)
115{
116 struct ceph_auth_none_info *xi;
117
118 dout("ceph_auth_none_init %p\n", ac);
119 xi = kzalloc(sizeof(*xi), GFP_NOFS);
120 if (!xi)
121 return -ENOMEM;
122
123 xi->starting = true;
124 xi->built_authorizer = false;
125
126 ac->protocol = CEPH_AUTH_NONE;
127 ac->private = xi;
128 ac->ops = &ceph_auth_none_ops;
129 return 0;
130}
131
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
deleted file mode 100644
index 8164df1a08be..000000000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5
6#include "auth.h"
7
8/*
9 * null security mode.
10 *
11 * we use a single static authorizer that simply encodes our entity name
12 * and global id.
13 */
14
15struct ceph_none_authorizer {
16 char buf[128];
17 int buf_len;
18 char reply_buf[0];
19};
20
21struct ceph_auth_none_info {
22 bool starting;
23 bool built_authorizer;
24 struct ceph_none_authorizer au; /* we only need one; it's static */
25};
26
27extern int ceph_auth_none_init(struct ceph_auth_client *ac);
28
29#endif
30
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
deleted file mode 100644
index a2d002cbdec2..000000000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,687 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15#define TEMP_TICKET_BUF_LEN 256
16
17static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
18
19static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
20{
21 struct ceph_x_info *xi = ac->private;
22 int need;
23
24 ceph_x_validate_tickets(ac, &need);
25 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
26 ac->want_keys, need, xi->have_keys);
27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28}
29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
41static int ceph_x_encrypt_buflen(int ilen)
42{
43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
44 sizeof(u32);
45}
46
47static int ceph_x_encrypt(struct ceph_crypto_key *secret,
48 void *ibuf, int ilen, void *obuf, size_t olen)
49{
50 struct ceph_x_encrypt_header head = {
51 .struct_v = 1,
52 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
53 };
54 size_t len = olen - sizeof(u32);
55 int ret;
56
57 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
58 &head, sizeof(head), ibuf, ilen);
59 if (ret)
60 return ret;
61 ceph_encode_32(&obuf, len);
62 return len + sizeof(u32);
63}
64
65static int ceph_x_decrypt(struct ceph_crypto_key *secret,
66 void **p, void *end, void *obuf, size_t olen)
67{
68 struct ceph_x_encrypt_header head;
69 size_t head_len = sizeof(head);
70 int len, ret;
71
72 len = ceph_decode_32(p);
73 if (*p + len > end)
74 return -EINVAL;
75
76 dout("ceph_x_decrypt len %d\n", len);
77 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
78 *p, len);
79 if (ret)
80 return ret;
81 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
82 return -EPERM;
83 *p += len;
84 return olen;
85}
86
87/*
88 * get existing (or insert new) ticket handler
89 */
90static struct ceph_x_ticket_handler *
91get_ticket_handler(struct ceph_auth_client *ac, int service)
92{
93 struct ceph_x_ticket_handler *th;
94 struct ceph_x_info *xi = ac->private;
95 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
96
97 while (*p) {
98 parent = *p;
99 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
100 if (service < th->service)
101 p = &(*p)->rb_left;
102 else if (service > th->service)
103 p = &(*p)->rb_right;
104 else
105 return th;
106 }
107
108 /* add it */
109 th = kzalloc(sizeof(*th), GFP_NOFS);
110 if (!th)
111 return ERR_PTR(-ENOMEM);
112 th->service = service;
113 rb_link_node(&th->node, parent, p);
114 rb_insert_color(&th->node, &xi->ticket_handlers);
115 return th;
116}
117
118static void remove_ticket_handler(struct ceph_auth_client *ac,
119 struct ceph_x_ticket_handler *th)
120{
121 struct ceph_x_info *xi = ac->private;
122
123 dout("remove_ticket_handler %p %d\n", th, th->service);
124 rb_erase(&th->node, &xi->ticket_handlers);
125 ceph_crypto_key_destroy(&th->session_key);
126 if (th->ticket_blob)
127 ceph_buffer_put(th->ticket_blob);
128 kfree(th);
129}
130
131static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
132 struct ceph_crypto_key *secret,
133 void *buf, void *end)
134{
135 struct ceph_x_info *xi = ac->private;
136 int num;
137 void *p = buf;
138 int ret;
139 char *dbuf;
140 char *ticket_buf;
141 u8 reply_struct_v;
142
143 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
144 if (!dbuf)
145 return -ENOMEM;
146
147 ret = -ENOMEM;
148 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
149 if (!ticket_buf)
150 goto out_dbuf;
151
152 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
153 reply_struct_v = ceph_decode_8(&p);
154 if (reply_struct_v != 1)
155 goto bad;
156 num = ceph_decode_32(&p);
157 dout("%d tickets\n", num);
158 while (num--) {
159 int type;
160 u8 tkt_struct_v, blob_struct_v;
161 struct ceph_x_ticket_handler *th;
162 void *dp, *dend;
163 int dlen;
164 char is_enc;
165 struct timespec validity;
166 struct ceph_crypto_key old_key;
167 void *tp, *tpend;
168 struct ceph_timespec new_validity;
169 struct ceph_crypto_key new_session_key;
170 struct ceph_buffer *new_ticket_blob;
171 unsigned long new_expires, new_renew_after;
172 u64 new_secret_id;
173
174 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
175
176 type = ceph_decode_32(&p);
177 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
178
179 tkt_struct_v = ceph_decode_8(&p);
180 if (tkt_struct_v != 1)
181 goto bad;
182
183 th = get_ticket_handler(ac, type);
184 if (IS_ERR(th)) {
185 ret = PTR_ERR(th);
186 goto out;
187 }
188
189 /* blob for me */
190 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
191 TEMP_TICKET_BUF_LEN);
192 if (dlen <= 0) {
193 ret = dlen;
194 goto out;
195 }
196 dout(" decrypted %d bytes\n", dlen);
197 dend = dbuf + dlen;
198 dp = dbuf;
199
200 tkt_struct_v = ceph_decode_8(&dp);
201 if (tkt_struct_v != 1)
202 goto bad;
203
204 memcpy(&old_key, &th->session_key, sizeof(old_key));
205 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
206 if (ret)
207 goto out;
208
209 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
210 ceph_decode_timespec(&validity, &new_validity);
211 new_expires = get_seconds() + validity.tv_sec;
212 new_renew_after = new_expires - (validity.tv_sec / 4);
213 dout(" expires=%lu renew_after=%lu\n", new_expires,
214 new_renew_after);
215
216 /* ticket blob for service */
217 ceph_decode_8_safe(&p, end, is_enc, bad);
218 tp = ticket_buf;
219 if (is_enc) {
220 /* encrypted */
221 dout(" encrypted ticket\n");
222 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
223 TEMP_TICKET_BUF_LEN);
224 if (dlen < 0) {
225 ret = dlen;
226 goto out;
227 }
228 dlen = ceph_decode_32(&tp);
229 } else {
230 /* unencrypted */
231 ceph_decode_32_safe(&p, end, dlen, bad);
232 ceph_decode_need(&p, end, dlen, bad);
233 ceph_decode_copy(&p, ticket_buf, dlen);
234 }
235 tpend = tp + dlen;
236 dout(" ticket blob is %d bytes\n", dlen);
237 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
238 blob_struct_v = ceph_decode_8(&tp);
239 new_secret_id = ceph_decode_64(&tp);
240 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
241 if (ret)
242 goto out;
243
244 /* all is well, update our ticket */
245 ceph_crypto_key_destroy(&th->session_key);
246 if (th->ticket_blob)
247 ceph_buffer_put(th->ticket_blob);
248 th->session_key = new_session_key;
249 th->ticket_blob = new_ticket_blob;
250 th->validity = new_validity;
251 th->secret_id = new_secret_id;
252 th->expires = new_expires;
253 th->renew_after = new_renew_after;
254 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
255 type, ceph_entity_type_name(type), th->secret_id,
256 (int)th->ticket_blob->vec.iov_len);
257 xi->have_keys |= th->service;
258 }
259
260 ret = 0;
261out:
262 kfree(ticket_buf);
263out_dbuf:
264 kfree(dbuf);
265 return ret;
266
267bad:
268 ret = -EINVAL;
269 goto out;
270}
271
272static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
273 struct ceph_x_ticket_handler *th,
274 struct ceph_x_authorizer *au)
275{
276 int maxlen;
277 struct ceph_x_authorize_a *msg_a;
278 struct ceph_x_authorize_b msg_b;
279 void *p, *end;
280 int ret;
281 int ticket_blob_len =
282 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
283
284 dout("build_authorizer for %s %p\n",
285 ceph_entity_type_name(th->service), au);
286
287 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
288 ceph_x_encrypt_buflen(ticket_blob_len);
289 dout(" need len %d\n", maxlen);
290 if (au->buf && au->buf->alloc_len < maxlen) {
291 ceph_buffer_put(au->buf);
292 au->buf = NULL;
293 }
294 if (!au->buf) {
295 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
296 if (!au->buf)
297 return -ENOMEM;
298 }
299 au->service = th->service;
300
301 msg_a = au->buf->vec.iov_base;
302 msg_a->struct_v = 1;
303 msg_a->global_id = cpu_to_le64(ac->global_id);
304 msg_a->service_id = cpu_to_le32(th->service);
305 msg_a->ticket_blob.struct_v = 1;
306 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
307 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
308 if (ticket_blob_len) {
309 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
310 th->ticket_blob->vec.iov_len);
311 }
312 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
313 le64_to_cpu(msg_a->ticket_blob.secret_id));
314
315 p = msg_a + 1;
316 p += ticket_blob_len;
317 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
318
319 get_random_bytes(&au->nonce, sizeof(au->nonce));
320 msg_b.struct_v = 1;
321 msg_b.nonce = cpu_to_le64(au->nonce);
322 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
323 p, end - p);
324 if (ret < 0)
325 goto out_buf;
326 p += ret;
327 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
328 dout(" built authorizer nonce %llx len %d\n", au->nonce,
329 (int)au->buf->vec.iov_len);
330 BUG_ON(au->buf->vec.iov_len > maxlen);
331 return 0;
332
333out_buf:
334 ceph_buffer_put(au->buf);
335 au->buf = NULL;
336 return ret;
337}
338
339static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
340 void **p, void *end)
341{
342 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
343 ceph_encode_8(p, 1);
344 ceph_encode_64(p, th->secret_id);
345 if (th->ticket_blob) {
346 const char *buf = th->ticket_blob->vec.iov_base;
347 u32 len = th->ticket_blob->vec.iov_len;
348
349 ceph_encode_32_safe(p, end, len, bad);
350 ceph_encode_copy_safe(p, end, buf, len, bad);
351 } else {
352 ceph_encode_32_safe(p, end, 0, bad);
353 }
354
355 return 0;
356bad:
357 return -ERANGE;
358}
359
360static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
361{
362 int want = ac->want_keys;
363 struct ceph_x_info *xi = ac->private;
364 int service;
365
366 *pneed = ac->want_keys & ~(xi->have_keys);
367
368 for (service = 1; service <= want; service <<= 1) {
369 struct ceph_x_ticket_handler *th;
370
371 if (!(ac->want_keys & service))
372 continue;
373
374 if (*pneed & service)
375 continue;
376
377 th = get_ticket_handler(ac, service);
378
379 if (IS_ERR(th)) {
380 *pneed |= service;
381 continue;
382 }
383
384 if (get_seconds() >= th->renew_after)
385 *pneed |= service;
386 if (get_seconds() >= th->expires)
387 xi->have_keys &= ~service;
388 }
389}
390
391
392static int ceph_x_build_request(struct ceph_auth_client *ac,
393 void *buf, void *end)
394{
395 struct ceph_x_info *xi = ac->private;
396 int need;
397 struct ceph_x_request_header *head = buf;
398 int ret;
399 struct ceph_x_ticket_handler *th =
400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
401
402 if (IS_ERR(th))
403 return PTR_ERR(th);
404
405 ceph_x_validate_tickets(ac, &need);
406
407 dout("build_request want %x have %x need %x\n",
408 ac->want_keys, xi->have_keys, need);
409
410 if (need & CEPH_ENTITY_TYPE_AUTH) {
411 struct ceph_x_authenticate *auth = (void *)(head + 1);
412 void *p = auth + 1;
413 struct ceph_x_challenge_blob tmp;
414 char tmp_enc[40];
415 u64 *u;
416
417 if (p > end)
418 return -ERANGE;
419
420 dout(" get_auth_session_key\n");
421 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
422
423 /* encrypt and hash */
424 get_random_bytes(&auth->client_challenge, sizeof(u64));
425 tmp.client_challenge = auth->client_challenge;
426 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
427 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
428 tmp_enc, sizeof(tmp_enc));
429 if (ret < 0)
430 return ret;
431
432 auth->struct_v = 1;
433 auth->key = 0;
434 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
435 auth->key ^= *(__le64 *)u;
436 dout(" server_challenge %llx client_challenge %llx key %llx\n",
437 xi->server_challenge, le64_to_cpu(auth->client_challenge),
438 le64_to_cpu(auth->key));
439
440 /* now encode the old ticket if it exists */
441 ret = ceph_x_encode_ticket(th, &p, end);
442 if (ret < 0)
443 return ret;
444
445 return p - buf;
446 }
447
448 if (need) {
449 void *p = head + 1;
450 struct ceph_x_service_ticket_request *req;
451
452 if (p > end)
453 return -ERANGE;
454 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
455
456 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
457 if (ret)
458 return ret;
459 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
460 xi->auth_authorizer.buf->vec.iov_len);
461
462 req = p;
463 req->keys = cpu_to_le32(need);
464 p += sizeof(*req);
465 return p - buf;
466 }
467
468 return 0;
469}
470
471static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
472 void *buf, void *end)
473{
474 struct ceph_x_info *xi = ac->private;
475 struct ceph_x_reply_header *head = buf;
476 struct ceph_x_ticket_handler *th;
477 int len = end - buf;
478 int op;
479 int ret;
480
481 if (result)
482 return result; /* XXX hmm? */
483
484 if (xi->starting) {
485 /* it's a hello */
486 struct ceph_x_server_challenge *sc = buf;
487
488 if (len != sizeof(*sc))
489 return -EINVAL;
490 xi->server_challenge = le64_to_cpu(sc->server_challenge);
491 dout("handle_reply got server challenge %llx\n",
492 xi->server_challenge);
493 xi->starting = false;
494 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
495 return -EAGAIN;
496 }
497
498 op = le16_to_cpu(head->op);
499 result = le32_to_cpu(head->result);
500 dout("handle_reply op %d result %d\n", op, result);
501 switch (op) {
502 case CEPHX_GET_AUTH_SESSION_KEY:
503 /* verify auth key */
504 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
505 buf + sizeof(*head), end);
506 break;
507
508 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
509 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
510 if (IS_ERR(th))
511 return PTR_ERR(th);
512 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
513 buf + sizeof(*head), end);
514 break;
515
516 default:
517 return -EINVAL;
518 }
519 if (ret)
520 return ret;
521 if (ac->want_keys == xi->have_keys)
522 return 0;
523 return -EAGAIN;
524}
525
526static int ceph_x_create_authorizer(
527 struct ceph_auth_client *ac, int peer_type,
528 struct ceph_authorizer **a,
529 void **buf, size_t *len,
530 void **reply_buf, size_t *reply_len)
531{
532 struct ceph_x_authorizer *au;
533 struct ceph_x_ticket_handler *th;
534 int ret;
535
536 th = get_ticket_handler(ac, peer_type);
537 if (IS_ERR(th))
538 return PTR_ERR(th);
539
540 au = kzalloc(sizeof(*au), GFP_NOFS);
541 if (!au)
542 return -ENOMEM;
543
544 ret = ceph_x_build_authorizer(ac, th, au);
545 if (ret) {
546 kfree(au);
547 return ret;
548 }
549
550 *a = (struct ceph_authorizer *)au;
551 *buf = au->buf->vec.iov_base;
552 *len = au->buf->vec.iov_len;
553 *reply_buf = au->reply_buf;
554 *reply_len = sizeof(au->reply_buf);
555 return 0;
556}
557
558static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
559 struct ceph_authorizer *a, size_t len)
560{
561 struct ceph_x_authorizer *au = (void *)a;
562 struct ceph_x_ticket_handler *th;
563 int ret = 0;
564 struct ceph_x_authorize_reply reply;
565 void *p = au->reply_buf;
566 void *end = p + sizeof(au->reply_buf);
567
568 th = get_ticket_handler(ac, au->service);
569 if (IS_ERR(th))
570 return PTR_ERR(th);
571 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
572 if (ret < 0)
573 return ret;
574 if (ret != sizeof(reply))
575 return -EPERM;
576
577 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
578 ret = -EPERM;
579 else
580 ret = 0;
581 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
582 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
583 return ret;
584}
585
586static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
587 struct ceph_authorizer *a)
588{
589 struct ceph_x_authorizer *au = (void *)a;
590
591 ceph_buffer_put(au->buf);
592 kfree(au);
593}
594
595
596static void ceph_x_reset(struct ceph_auth_client *ac)
597{
598 struct ceph_x_info *xi = ac->private;
599
600 dout("reset\n");
601 xi->starting = true;
602 xi->server_challenge = 0;
603}
604
605static void ceph_x_destroy(struct ceph_auth_client *ac)
606{
607 struct ceph_x_info *xi = ac->private;
608 struct rb_node *p;
609
610 dout("ceph_x_destroy %p\n", ac);
611 ceph_crypto_key_destroy(&xi->secret);
612
613 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
614 struct ceph_x_ticket_handler *th =
615 rb_entry(p, struct ceph_x_ticket_handler, node);
616 remove_ticket_handler(ac, th);
617 }
618
619 if (xi->auth_authorizer.buf)
620 ceph_buffer_put(xi->auth_authorizer.buf);
621
622 kfree(ac->private);
623 ac->private = NULL;
624}
625
626static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
627 int peer_type)
628{
629 struct ceph_x_ticket_handler *th;
630
631 th = get_ticket_handler(ac, peer_type);
632 if (!IS_ERR(th))
633 remove_ticket_handler(ac, th);
634}
635
636
637static const struct ceph_auth_client_ops ceph_x_ops = {
638 .name = "x",
639 .is_authenticated = ceph_x_is_authenticated,
640 .should_authenticate = ceph_x_should_authenticate,
641 .build_request = ceph_x_build_request,
642 .handle_reply = ceph_x_handle_reply,
643 .create_authorizer = ceph_x_create_authorizer,
644 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
645 .destroy_authorizer = ceph_x_destroy_authorizer,
646 .invalidate_authorizer = ceph_x_invalidate_authorizer,
647 .reset = ceph_x_reset,
648 .destroy = ceph_x_destroy,
649};
650
651
652int ceph_x_init(struct ceph_auth_client *ac)
653{
654 struct ceph_x_info *xi;
655 int ret;
656
657 dout("ceph_x_init %p\n", ac);
658 ret = -ENOMEM;
659 xi = kzalloc(sizeof(*xi), GFP_NOFS);
660 if (!xi)
661 goto out;
662
663 ret = -EINVAL;
664 if (!ac->secret) {
665 pr_err("no secret set (for auth_x protocol)\n");
666 goto out_nomem;
667 }
668
669 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
670 if (ret)
671 goto out_nomem;
672
673 xi->starting = true;
674 xi->ticket_handlers = RB_ROOT;
675
676 ac->protocol = CEPH_AUTH_CEPHX;
677 ac->private = xi;
678 ac->ops = &ceph_x_ops;
679 return 0;
680
681out_nomem:
682 kfree(xi);
683out:
684 return ret;
685}
686
687
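
Review note: the authorizer handshake above hinges on a simple freshness check -- ceph_x_verify_authorizer_reply() accepts the server only if it echoes back the client's nonce incremented by one, which proves it could decrypt the authorizer with the shared session key. A minimal userspace sketch of just that check (the decryption step and kernel types are elided; plain uint64_t stands in for __le64):

    #include <stdint.h>
    #include <stdio.h>

    struct reply { uint64_t nonce_plus_one; };

    static int verify_nonce(uint64_t sent_nonce, const struct reply *r)
    {
            /* server must prove it decrypted our authorizer: it returns
             * our nonce incremented by one (re-encrypted with the
             * session key in the real code) */
            return (sent_nonce + 1 == r->nonce_plus_one) ? 0 : -1 /* -EPERM */;
    }

    int main(void)
    {
            struct reply good = { .nonce_plus_one = 43 };
            struct reply bad  = { .nonce_plus_one = 99 };

            printf("good: %d\n", verify_nonce(42, &good));  /* 0 */
            printf("bad:  %d\n", verify_nonce(42, &bad));   /* -1 */
            return 0;
    }
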
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
deleted file mode 100644
index ff6f8180e681..000000000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
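
Review note: ticket_handlers above is an rb_root keyed by the service id, so each peer type (mon/mds/osd) gets its own cached ticket. A hypothetical sketch of the lookup shape, with a plain binary search tree standing in for the kernel's rb_node machinery (the real lookup lives in get_ticket_handler() in auth_x.c):

    #include <stddef.h>

    struct ticket_handler {
            struct ticket_handler *left, *right;
            unsigned service;               /* key: peer service type */
    };

    static struct ticket_handler *find_handler(struct ticket_handler *root,
                                               unsigned service)
    {
            while (root) {
                    if (service < root->service)
                            root = root->left;
                    else if (service > root->service)
                            root = root->right;
                    else
                            return root;    /* cached ticket for this service */
            }
            return NULL;                    /* caller allocates a new handler */
    }
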
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
deleted file mode 100644
index 671d30576c4f..000000000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
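
Review note: every wire structure in this header carries __attribute__((packed)), and it matters -- without it the compiler would pad the one-byte struct_v out to the alignment of the following __le64, changing the on-wire layout. A standalone sketch of the difference (plain uint types stand in for __u8/__le64; the padded size is what a typical LP64 ABI produces):

    #include <stdint.h>
    #include <stdio.h>

    struct authorize_b_packed {
            uint8_t  struct_v;
            uint64_t nonce;
    } __attribute__((packed));

    struct authorize_b_padded {
            uint8_t  struct_v;
            uint64_t nonce;
    };

    int main(void)
    {
            /* 9 bytes on the wire vs (typically) 16 with natural padding */
            printf("packed: %zu\n", sizeof(struct authorize_b_packed));
            printf("padded: %zu\n", sizeof(struct authorize_b_padded));
            return 0;
    }
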
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
deleted file mode 100644
index cd39f17021de..000000000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,65 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
51{
52 size_t len;
53
54 ceph_decode_need(p, end, sizeof(u32), bad);
55 len = ceph_decode_32(p);
56 dout("decode_buffer len %d\n", (int)len);
57 ceph_decode_need(p, end, len, bad);
58 *b = ceph_buffer_new(len, GFP_NOFS);
59 if (!*b)
60 return -ENOMEM;
61 ceph_decode_copy(p, (*b)->vec.iov_base, len);
62 return 0;
63bad:
64 return -EINVAL;
65}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
deleted file mode 100644
index 58d19014068f..000000000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
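
Review note: ceph_buffer combines two patterns worth seeing in isolation -- an opportunistic kmalloc with a vmalloc fallback for large sizes, and kref-based lifetime management where the last put frees the data. A userspace sketch under those assumptions (malloc stands in for both allocators, and a plain counter stands in for kref, so unlike the kernel version this is not thread-safe):

    #include <stdlib.h>

    struct buf {
            int    refs;
            void  *data;
            size_t len;
    };

    static struct buf *buf_new(size_t len)
    {
            struct buf *b = malloc(sizeof(*b));
            if (!b)
                    return NULL;
            b->data = malloc(len);          /* stands in for kmalloc/vmalloc */
            if (!b->data) {
                    free(b);
                    return NULL;
            }
            b->refs = 1;
            b->len = len;
            return b;
    }

    static struct buf *buf_get(struct buf *b) { b->refs++; return b; }

    static void buf_put(struct buf *b)
    {
            if (--b->refs == 0) {           /* last reference frees the data */
                    free(b->data);
                    free(b);
            }
    }
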
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5e9da996a151..6b61ded701e1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
@@ -9,8 +9,9 @@
9#include <linux/writeback.h> 9#include <linux/writeback.h>
10 10
11#include "super.h" 11#include "super.h"
12#include "decode.h" 12#include "mds_client.h"
13#include "messenger.h" 13#include <linux/ceph/decode.h>
14#include <linux/ceph/messenger.h>
14 15
15/* 16/*
16 * Capability management 17 * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
287 spin_unlock(&mdsc->caps_list_lock); 288 spin_unlock(&mdsc->caps_list_lock);
288} 289}
289 290
290void ceph_reservation_status(struct ceph_client *client, 291void ceph_reservation_status(struct ceph_fs_client *fsc,
291 int *total, int *avail, int *used, int *reserved, 292 int *total, int *avail, int *used, int *reserved,
292 int *min) 293 int *min)
293{ 294{
294 struct ceph_mds_client *mdsc = &client->mdsc; 295 struct ceph_mds_client *mdsc = fsc->mdsc;
295 296
296 if (total) 297 if (total)
297 *total = mdsc->caps_total_count; 298 *total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
399static void __cap_set_timeouts(struct ceph_mds_client *mdsc, 400static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
400 struct ceph_inode_info *ci) 401 struct ceph_inode_info *ci)
401{ 402{
402 struct ceph_mount_args *ma = mdsc->client->mount_args; 403 struct ceph_mount_options *ma = mdsc->fsc->mount_options;
403 404
404 ci->i_hold_caps_min = round_jiffies(jiffies + 405 ci->i_hold_caps_min = round_jiffies(jiffies +
405 ma->caps_wanted_delay_min * HZ); 406 ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
515 unsigned seq, unsigned mseq, u64 realmino, int flags, 516 unsigned seq, unsigned mseq, u64 realmino, int flags,
516 struct ceph_cap_reservation *caps_reservation) 517 struct ceph_cap_reservation *caps_reservation)
517{ 518{
518 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 519 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
519 struct ceph_inode_info *ci = ceph_inode(inode); 520 struct ceph_inode_info *ci = ceph_inode(inode);
520 struct ceph_cap *new_cap = NULL; 521 struct ceph_cap *new_cap = NULL;
521 struct ceph_cap *cap; 522 struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
873 struct ceph_mds_session *session = cap->session; 874 struct ceph_mds_session *session = cap->session;
874 struct ceph_inode_info *ci = cap->ci; 875 struct ceph_inode_info *ci = cap->ci;
875 struct ceph_mds_client *mdsc = 876 struct ceph_mds_client *mdsc =
876 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 877 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
877 int removed = 0; 878 int removed = 0;
878 879
879 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 880 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
1210 int mds; 1211 int mds;
1211 struct ceph_cap_snap *capsnap; 1212 struct ceph_cap_snap *capsnap;
1212 u32 mseq; 1213 u32 mseq;
1213 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1214 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1214 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold 1215 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1215 session->s_mutex */ 1216 session->s_mutex */
1216 u64 next_follows = 0; /* keep track of how far we've gotten through the 1217 u64 next_follows = 0; /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1336void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1337void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1337{ 1338{
1338 struct ceph_mds_client *mdsc = 1339 struct ceph_mds_client *mdsc =
1339 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 1340 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1340 struct inode *inode = &ci->vfs_inode; 1341 struct inode *inode = &ci->vfs_inode;
1341 int was = ci->i_dirty_caps; 1342 int was = ci->i_dirty_caps;
1342 int dirty = 0; 1343 int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1378static int __mark_caps_flushing(struct inode *inode, 1379static int __mark_caps_flushing(struct inode *inode,
1379 struct ceph_mds_session *session) 1380 struct ceph_mds_session *session)
1380{ 1381{
1381 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 1382 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1382 struct ceph_inode_info *ci = ceph_inode(inode); 1383 struct ceph_inode_info *ci = ceph_inode(inode);
1383 int flushing; 1384 int flushing;
1384 1385
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
1416/* 1417/*
1417 * try to invalidate mapping pages without blocking. 1418 * try to invalidate mapping pages without blocking.
1418 */ 1419 */
1419static int mapping_is_empty(struct address_space *mapping)
1420{
1421 struct page *page = find_get_page(mapping, 0);
1422
1423 if (!page)
1424 return 1;
1425
1426 put_page(page);
1427 return 0;
1428}
1429
1430static int try_nonblocking_invalidate(struct inode *inode) 1420static int try_nonblocking_invalidate(struct inode *inode)
1431{ 1421{
1432 struct ceph_inode_info *ci = ceph_inode(inode); 1422 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,12 +1426,12 @@ static int try_nonblocking_invalidate(struct inode *inode)
1436 invalidate_mapping_pages(&inode->i_data, 0, -1); 1426 invalidate_mapping_pages(&inode->i_data, 0, -1);
1437 spin_lock(&inode->i_lock); 1427 spin_lock(&inode->i_lock);
1438 1428
1439 if (mapping_is_empty(&inode->i_data) && 1429 if (inode->i_data.nrpages == 0 &&
1440 invalidating_gen == ci->i_rdcache_gen) { 1430 invalidating_gen == ci->i_rdcache_gen) {
1441 /* success. */ 1431 /* success. */
1442 dout("try_nonblocking_invalidate %p success\n", inode); 1432 dout("try_nonblocking_invalidate %p success\n", inode);
1443 ci->i_rdcache_gen = 0; 1433 /* save any racing async invalidate some trouble */
1444 ci->i_rdcache_revoking = 0; 1434 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
1445 return 0; 1435 return 0;
1446 } 1436 }
1447 dout("try_nonblocking_invalidate %p failed\n", inode); 1437 dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
1462void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1452void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1463 struct ceph_mds_session *session) 1453 struct ceph_mds_session *session)
1464{ 1454{
1465 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); 1455 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1466 struct ceph_mds_client *mdsc = &client->mdsc; 1456 struct ceph_mds_client *mdsc = fsc->mdsc;
1467 struct inode *inode = &ci->vfs_inode; 1457 struct inode *inode = &ci->vfs_inode;
1468 struct ceph_cap *cap; 1458 struct ceph_cap *cap;
1469 int file_wanted, used; 1459 int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
1533 */ 1523 */
1534 if ((!is_delayed || mdsc->stopping) && 1524 if ((!is_delayed || mdsc->stopping) &&
1535 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1525 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1536 ci->i_rdcache_gen && /* may have cached pages */ 1526 inode->i_data.nrpages && /* have cached pages */
1537 (file_wanted == 0 || /* no open files */ 1527 (file_wanted == 0 || /* no open files */
1538 (revoking & (CEPH_CAP_FILE_CACHE| 1528 (revoking & (CEPH_CAP_FILE_CACHE|
1539 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ 1529 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
@@ -1570,9 +1560,10 @@ retry_locked:
1570 /* NOTE: no side-effects allowed, until we take s_mutex */ 1560 /* NOTE: no side-effects allowed, until we take s_mutex */
1571 1561
1572 revoking = cap->implemented & ~cap->issued; 1562 revoking = cap->implemented & ~cap->issued;
1573 if (revoking) 1563 dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
1574 dout(" mds%d revoking %s\n", cap->mds, 1564 cap->mds, cap, ceph_cap_string(cap->issued),
1575 ceph_cap_string(revoking)); 1565 ceph_cap_string(cap->implemented),
1566 ceph_cap_string(revoking));
1576 1567
1577 if (cap == ci->i_auth_cap && 1568 if (cap == ci->i_auth_cap &&
1578 (cap->issued & CEPH_CAP_FILE_WR)) { 1569 (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1668,6 +1659,8 @@ ack:
1668 1659
1669 if (cap == ci->i_auth_cap && ci->i_dirty_caps) 1660 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1670 flushing = __mark_caps_flushing(inode, session); 1661 flushing = __mark_caps_flushing(inode, session);
1662 else
1663 flushing = 0;
1671 1664
1672 mds = cap->mds; /* remember mds, so we don't repeat */ 1665 mds = cap->mds; /* remember mds, so we don't repeat */
1673 sent++; 1666 sent++;
@@ -1706,7 +1699,7 @@ ack:
1706static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1699static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1707 unsigned *flush_tid) 1700 unsigned *flush_tid)
1708{ 1701{
1709 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 1702 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1710 struct ceph_inode_info *ci = ceph_inode(inode); 1703 struct ceph_inode_info *ci = ceph_inode(inode);
1711 int unlock_session = session ? 0 : 1; 1704 int unlock_session = session ? 0 : 1;
1712 int flushing = 0; 1705 int flushing = 0;
@@ -1872,7 +1865,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1872 caps_are_flushed(inode, flush_tid)); 1865 caps_are_flushed(inode, flush_tid));
1873 } else { 1866 } else {
1874 struct ceph_mds_client *mdsc = 1867 struct ceph_mds_client *mdsc =
1875 &ceph_sb_to_client(inode->i_sb)->mdsc; 1868 ceph_sb_to_client(inode->i_sb)->mdsc;
1876 1869
1877 spin_lock(&inode->i_lock); 1870 spin_lock(&inode->i_lock);
1878 if (__ceph_caps_dirty(ci)) 1871 if (__ceph_caps_dirty(ci))
@@ -1950,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1950 } 1943 }
1951} 1944}
1952 1945
1946static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1947 struct ceph_mds_session *session,
1948 struct inode *inode)
1949{
1950 struct ceph_inode_info *ci = ceph_inode(inode);
1951 struct ceph_cap *cap;
1952 int delayed = 0;
1953
1954 spin_lock(&inode->i_lock);
1955 cap = ci->i_auth_cap;
1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1958 __ceph_flush_snaps(ci, &session, 1);
1959 if (ci->i_flushing_caps) {
1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1961 __ceph_caps_used(ci),
1962 __ceph_caps_wanted(ci),
1963 cap->issued | cap->implemented,
1964 ci->i_flushing_caps, NULL);
1965 if (delayed) {
1966 spin_lock(&inode->i_lock);
1967 __cap_delay_requeue(mdsc, ci);
1968 spin_unlock(&inode->i_lock);
1969 }
1970 } else {
1971 spin_unlock(&inode->i_lock);
1972 }
1973}
1974
1953 1975
1954/* 1976/*
1955 * Take references to capabilities we hold, so that we don't release 1977 * Take references to capabilities we hold, so that we don't release
@@ -2283,8 +2305,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2283{ 2305{
2284 struct ceph_inode_info *ci = ceph_inode(inode); 2306 struct ceph_inode_info *ci = ceph_inode(inode);
2285 int mds = session->s_mds; 2307 int mds = session->s_mds;
2286 unsigned seq = le32_to_cpu(grant->seq); 2308 int seq = le32_to_cpu(grant->seq);
2287 unsigned issue_seq = le32_to_cpu(grant->issue_seq);
2288 int newcaps = le32_to_cpu(grant->caps); 2309 int newcaps = le32_to_cpu(grant->caps);
2289 int issued, implemented, used, wanted, dirty; 2310 int issued, implemented, used, wanted, dirty;
2290 u64 size = le64_to_cpu(grant->size); 2311 u64 size = le64_to_cpu(grant->size);
@@ -2296,8 +2317,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2296 int revoked_rdcache = 0; 2317 int revoked_rdcache = 0;
2297 int queue_invalidate = 0; 2318 int queue_invalidate = 0;
2298 2319
2299 dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", 2320 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2300 inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); 2321 inode, cap, mds, seq, ceph_cap_string(newcaps));
2301 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2322 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2302 inode->i_size); 2323 inode->i_size);
2303 2324
@@ -2393,7 +2414,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2393 } 2414 }
2394 2415
2395 cap->seq = seq; 2416 cap->seq = seq;
2396 cap->issue_seq = issue_seq;
2397 2417
2398 /* file layout may have changed */ 2418 /* file layout may have changed */
2399 ci->i_layout = grant->layout; 2419 ci->i_layout = grant->layout;
@@ -2465,7 +2485,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2465 __releases(inode->i_lock) 2485 __releases(inode->i_lock)
2466{ 2486{
2467 struct ceph_inode_info *ci = ceph_inode(inode); 2487 struct ceph_inode_info *ci = ceph_inode(inode);
2468 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 2488 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
2469 unsigned seq = le32_to_cpu(m->seq); 2489 unsigned seq = le32_to_cpu(m->seq);
2470 int dirty = le32_to_cpu(m->dirty); 2490 int dirty = le32_to_cpu(m->dirty);
2471 int cleaned = 0; 2491 int cleaned = 0;
@@ -2699,8 +2719,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2699 ceph_add_cap(inode, session, cap_id, -1, 2719 ceph_add_cap(inode, session, cap_id, -1,
2700 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, 2720 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2701 NULL /* no caps context */); 2721 NULL /* no caps context */);
2702 try_flush_caps(inode, session, NULL); 2722 kick_flushing_inode_caps(mdsc, session, inode);
2703 up_read(&mdsc->snap_rwsem); 2723 up_read(&mdsc->snap_rwsem);
2724
2725 /* make sure we re-request max_size, if necessary */
2726 spin_lock(&inode->i_lock);
2727 ci->i_requested_max_size = 0;
2728 spin_unlock(&inode->i_lock);
2704} 2729}
2705 2730
2706/* 2731/*
@@ -2713,7 +2738,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2713 struct ceph_msg *msg) 2738 struct ceph_msg *msg)
2714{ 2739{
2715 struct ceph_mds_client *mdsc = session->s_mdsc; 2740 struct ceph_mds_client *mdsc = session->s_mdsc;
2716 struct super_block *sb = mdsc->client->sb; 2741 struct super_block *sb = mdsc->fsc->sb;
2717 struct inode *inode; 2742 struct inode *inode;
2718 struct ceph_cap *cap; 2743 struct ceph_cap *cap;
2719 struct ceph_mds_caps *h; 2744 struct ceph_mds_caps *h;
@@ -2792,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2792 case CEPH_CAP_OP_IMPORT: 2817 case CEPH_CAP_OP_IMPORT:
2793 handle_cap_import(mdsc, inode, h, session, 2818 handle_cap_import(mdsc, inode, h, session,
2794 snaptrace, snaptrace_len); 2819 snaptrace, snaptrace_len);
2795 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2820 ceph_check_caps(ceph_inode(inode), 0, session);
2796 session);
2797 goto done_unlocked; 2821 goto done_unlocked;
2798 } 2822 }
2799 2823
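
Review note: the hunk in try_nonblocking_invalidate() replaces the old mapping_is_empty() probe of page 0 with a direct nrpages == 0 test, which also catches mappings whose first page is gone but later pages remain; success still requires that the rdcache generation did not change underneath us. A standalone sketch of that success condition (struct address_space is stubbed down to the one field used):

    #include <stdio.h>

    struct address_space { unsigned long nrpages; };  /* stub of kernel type */

    static int invalidate_succeeded(const struct address_space *m,
                                    unsigned gen_before, unsigned gen_now)
    {
            /* empty mapping *and* no racing read faulted pages back in */
            return m->nrpages == 0 && gen_before == gen_now;
    }

    int main(void)
    {
            struct address_space m = { .nrpages = 0 };
            printf("%d\n", invalidate_succeeded(&m, 7, 7));  /* 1: success */
            printf("%d\n", invalidate_succeeded(&m, 7, 8));  /* 0: raced */
            return 0;
    }
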
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
deleted file mode 100644
index 1818c2305610..000000000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
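
Review note: the pretty-debug variant of dout() above buys a filename:lineno prefix at the cost of an extra function call per call site. A userspace sketch of the same trick (printf stands in for pr_debug, and file_part() approximates ceph_file_part() by keeping at most the last 12 characters of the name):

    #include <stdio.h>

    static const char *file_part(const char *s, int len)
    {
            /* len includes the NUL, so len - 13 leaves 12 characters */
            return len > 13 ? s + len - 13 : s;
    }

    #define dout(fmt, ...) \
            printf(" %12.12s:%-4d : " fmt, \
                   file_part(__FILE__, sizeof(__FILE__)), \
                   __LINE__, ##__VA_ARGS__)

    int main(void)
    {
            dout("mounted, root ino %d\n", 1);
            return 0;
    }
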
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * Ceph 'frag' type 2 * Ceph 'frag' type
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6int ceph_frag_compare(__u32 a, __u32 b) 7int ceph_frag_compare(__u32 a, __u32 b)
7{ 8{
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
deleted file mode 100644
index 5babb8e95352..000000000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
1#ifndef FS_CEPH_FRAG_H
2#define FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
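
Review note: the frag encoding above is easiest to see with a worked example. With bits=1 the 24-bit value space splits in two: the lower half encodes as 0x1000000 and the upper half as 0x1800000, and containment is just a masked compare. A standalone sketch with the relevant helpers copied in:

    #include <stdio.h>

    static unsigned frag_make(unsigned b, unsigned v)
    {
            return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
    }
    static unsigned frag_mask(unsigned f)
    {
            return (0xffffffu << (24 - (f >> 24))) & 0xffffffu;
    }
    static int frag_contains(unsigned f, unsigned v)
    {
            return (v & frag_mask(f)) == (f & 0xffffffu);
    }

    int main(void)
    {
            unsigned lower = frag_make(1, 0x000000);   /* 0x1000000 */
            unsigned upper = frag_make(1, 0x800000);   /* 0x1800000 */

            printf("lower=%#x upper=%#x\n", lower, upper);
            printf("0x400000 in lower? %d\n", frag_contains(lower, 0x400000)); /* 1 */
            printf("0xc00000 in lower? %d\n", frag_contains(lower, 0xc00000)); /* 0 */
            return 0;
    }
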
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
deleted file mode 100644
index 3ac6cc7c1156..000000000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32 int mode;
33
34#ifdef O_DIRECTORY /* fixme */
35 if ((flags & O_DIRECTORY) == O_DIRECTORY)
36 return CEPH_FILE_MODE_PIN;
37#endif
38 if ((flags & O_APPEND) == O_APPEND)
39 flags |= O_WRONLY;
40
41 if ((flags & O_ACCMODE) == O_RDWR)
42 mode = CEPH_FILE_MODE_RDWR;
43 else if ((flags & O_ACCMODE) == O_WRONLY)
44 mode = CEPH_FILE_MODE_WR;
45 else
46 mode = CEPH_FILE_MODE_RD;
47
48#ifdef O_LAZY
49 if (flags & O_LAZY)
50 mode |= CEPH_FILE_MODE_LAZY;
51#endif
52
53 return mode;
54}
55
56int ceph_caps_for_mode(int mode)
57{
58 int caps = CEPH_CAP_PIN;
59
60 if (mode & CEPH_FILE_MODE_RD)
61 caps |= CEPH_CAP_FILE_SHARED |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
63 if (mode & CEPH_FILE_MODE_WR)
64 caps |= CEPH_CAP_FILE_EXCL |
65 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
66 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
67 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
68 if (mode & CEPH_FILE_MODE_LAZY)
69 caps |= CEPH_CAP_FILE_LAZYIO;
70
71 return caps;
72}
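
Review note: the core of ceph_flags_to_mode() above is the O_ACCMODE mapping plus the rule that O_APPEND implies write intent. A standalone illustration of just that mapping (the O_DIRECTORY and O_LAZY special cases are omitted):

    #include <stdio.h>
    #include <fcntl.h>

    #define MODE_RD   1
    #define MODE_WR   2
    #define MODE_RDWR 3

    static int flags_to_mode(int flags)
    {
            if ((flags & O_APPEND) == O_APPEND)
                    flags |= O_WRONLY;      /* append implies write intent */

            if ((flags & O_ACCMODE) == O_RDWR)
                    return MODE_RDWR;
            if ((flags & O_ACCMODE) == O_WRONLY)
                    return MODE_WR;
            return MODE_RD;
    }

    int main(void)
    {
            printf("O_RDONLY          -> %d\n", flags_to_mode(O_RDONLY));            /* 1 */
            printf("O_WRONLY|O_APPEND -> %d\n", flags_to_mode(O_WRONLY | O_APPEND)); /* 2 */
            printf("O_RDWR            -> %d\n", flags_to_mode(O_RDWR));              /* 3 */
            return 0;
    }
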
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
deleted file mode 100644
index d5619ac86711..000000000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,728 +0,0 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef CEPH_FS_H
13#define CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * subprotocol versions. when specific message types or high-level
20 * protocols change, bump the affected components. we rev
21 * internal cluster protocols separately from the public,
22 * client-facing protocol.
23 */
24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
27#define CEPH_OSDC_PROTOCOL 24 /* server/client */
28#define CEPH_MDSC_PROTOCOL 32 /* server/client */
29#define CEPH_MONC_PROTOCOL 15 /* server/client */
30
31
32#define CEPH_INO_ROOT 1
33#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
34
35/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
36#define CEPH_MAX_MON 31
37
38
39/*
40 * feature bits
41 */
42#define CEPH_FEATURE_UID (1<<0)
43#define CEPH_FEATURE_NOSRCADDR (1<<1)
44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
45#define CEPH_FEATURE_FLOCK (1<<3)
46
47
48/*
49 * ceph_file_layout - describe data layout for a file/inode
50 */
51struct ceph_file_layout {
52 /* file -> object mapping */
53 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
54 of page size. */
55 __le32 fl_stripe_count; /* over this many objects */
56 __le32 fl_object_size; /* until objects are this big, then move to
57 new objects */
58 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
59
60 /* pg -> disk layout */
61 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
62
63 /* object -> pg layout */
64 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
65 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
66} __attribute__ ((packed));
67
68#define CEPH_MIN_STRIPE_UNIT 65536
69
70int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
71
72
73/* crypto algorithms */
74#define CEPH_CRYPTO_NONE 0x0
75#define CEPH_CRYPTO_AES 0x1
76
77#define CEPH_AES_IV "cephsageyudagreg"
78
79/* security/authentication protocols */
80#define CEPH_AUTH_UNKNOWN 0x0
81#define CEPH_AUTH_NONE 0x1
82#define CEPH_AUTH_CEPHX 0x2
83
84#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
85
86
87/*********************************************
88 * message layer
89 */
90
91/*
92 * message types
93 */
94
95/* misc */
96#define CEPH_MSG_SHUTDOWN 1
97#define CEPH_MSG_PING 2
98
99/* client <-> monitor */
100#define CEPH_MSG_MON_MAP 4
101#define CEPH_MSG_MON_GET_MAP 5
102#define CEPH_MSG_STATFS 13
103#define CEPH_MSG_STATFS_REPLY 14
104#define CEPH_MSG_MON_SUBSCRIBE 15
105#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
106#define CEPH_MSG_AUTH 17
107#define CEPH_MSG_AUTH_REPLY 18
108
109/* client <-> mds */
110#define CEPH_MSG_MDS_MAP 21
111
112#define CEPH_MSG_CLIENT_SESSION 22
113#define CEPH_MSG_CLIENT_RECONNECT 23
114
115#define CEPH_MSG_CLIENT_REQUEST 24
116#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
117#define CEPH_MSG_CLIENT_REPLY 26
118#define CEPH_MSG_CLIENT_CAPS 0x310
119#define CEPH_MSG_CLIENT_LEASE 0x311
120#define CEPH_MSG_CLIENT_SNAP 0x312
121#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
122
123/* pool ops */
124#define CEPH_MSG_POOLOP_REPLY 48
125#define CEPH_MSG_POOLOP 49
126
127
128/* osd */
129#define CEPH_MSG_OSD_MAP 41
130#define CEPH_MSG_OSD_OP 42
131#define CEPH_MSG_OSD_OPREPLY 43
132
133/* pool operations */
134enum {
135 POOL_OP_CREATE = 0x01,
136 POOL_OP_DELETE = 0x02,
137 POOL_OP_AUID_CHANGE = 0x03,
138 POOL_OP_CREATE_SNAP = 0x11,
139 POOL_OP_DELETE_SNAP = 0x12,
140 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
141 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
142};
143
144struct ceph_mon_request_header {
145 __le64 have_version;
146 __le16 session_mon;
147 __le64 session_mon_tid;
148} __attribute__ ((packed));
149
150struct ceph_mon_statfs {
151 struct ceph_mon_request_header monhdr;
152 struct ceph_fsid fsid;
153} __attribute__ ((packed));
154
155struct ceph_statfs {
156 __le64 kb, kb_used, kb_avail;
157 __le64 num_objects;
158} __attribute__ ((packed));
159
160struct ceph_mon_statfs_reply {
161 struct ceph_fsid fsid;
162 __le64 version;
163 struct ceph_statfs st;
164} __attribute__ ((packed));
165
166const char *ceph_pool_op_name(int op);
167
168struct ceph_mon_poolop {
169 struct ceph_mon_request_header monhdr;
170 struct ceph_fsid fsid;
171 __le32 pool;
172 __le32 op;
173 __le64 auid;
174 __le64 snapid;
175 __le32 name_len;
176} __attribute__ ((packed));
177
178struct ceph_mon_poolop_reply {
179 struct ceph_mon_request_header monhdr;
180 struct ceph_fsid fsid;
181 __le32 reply_code;
182 __le32 epoch;
183 char has_data;
184 char data[0];
185} __attribute__ ((packed));
186
187struct ceph_mon_unmanaged_snap {
188 __le64 snapid;
189} __attribute__ ((packed));
190
191struct ceph_osd_getmap {
192 struct ceph_mon_request_header monhdr;
193 struct ceph_fsid fsid;
194 __le32 start;
195} __attribute__ ((packed));
196
197struct ceph_mds_getmap {
198 struct ceph_mon_request_header monhdr;
199 struct ceph_fsid fsid;
200} __attribute__ ((packed));
201
202struct ceph_client_mount {
203 struct ceph_mon_request_header monhdr;
204} __attribute__ ((packed));
205
206struct ceph_mon_subscribe_item {
207 __le64 have_version; __le64 have;
208 __u8 onetime;
209} __attribute__ ((packed));
210
211struct ceph_mon_subscribe_ack {
212 __le32 duration; /* seconds */
213 struct ceph_fsid fsid;
214} __attribute__ ((packed));
215
216/*
217 * mds states
218 * > 0 -> in
219 * <= 0 -> out
220 */
221#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
222#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
223 empty log. */
224#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
225#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
226#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
227#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
228#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
229
230#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
231#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
232 operations (import, rename, etc.) */
233#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
234#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
235#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
236#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
237#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
238
239extern const char *ceph_mds_state_name(int s);
240
241
242/*
243 * metadata lock types.
244 * - these are bitmasks.. we can compose them
245 * - they also define the lock ordering by the MDS
246 * - a few of these are internal to the mds
247 */
248#define CEPH_LOCK_DVERSION 1
249#define CEPH_LOCK_DN 2
250#define CEPH_LOCK_ISNAP 16
251#define CEPH_LOCK_IVERSION 32 /* mds internal */
252#define CEPH_LOCK_IFILE 64
253#define CEPH_LOCK_IAUTH 128
254#define CEPH_LOCK_ILINK 256
255#define CEPH_LOCK_IDFT 512 /* dir frag tree */
256#define CEPH_LOCK_INEST 1024 /* mds internal */
257#define CEPH_LOCK_IXATTR 2048
258#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
259#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
260
261/* client_session ops */
262enum {
263 CEPH_SESSION_REQUEST_OPEN,
264 CEPH_SESSION_OPEN,
265 CEPH_SESSION_REQUEST_CLOSE,
266 CEPH_SESSION_CLOSE,
267 CEPH_SESSION_REQUEST_RENEWCAPS,
268 CEPH_SESSION_RENEWCAPS,
269 CEPH_SESSION_STALE,
270 CEPH_SESSION_RECALL_STATE,
271};
272
273extern const char *ceph_session_op_name(int op);
274
275struct ceph_mds_session_head {
276 __le32 op;
277 __le64 seq;
278 struct ceph_timespec stamp;
279 __le32 max_caps, max_leases;
280} __attribute__ ((packed));
281
282/* client_request */
283/*
284 * metadata ops.
285 * & 0x001000 -> write op
286 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
287 * & 0x100000 -> use weird ino/path trace
288 */
289#define CEPH_MDS_OP_WRITE 0x001000
290enum {
291 CEPH_MDS_OP_LOOKUP = 0x00100,
292 CEPH_MDS_OP_GETATTR = 0x00101,
293 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
294 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
295
296 CEPH_MDS_OP_SETXATTR = 0x01105,
297 CEPH_MDS_OP_RMXATTR = 0x01106,
298 CEPH_MDS_OP_SETLAYOUT = 0x01107,
299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
302
303 CEPH_MDS_OP_MKNOD = 0x01201,
304 CEPH_MDS_OP_LINK = 0x01202,
305 CEPH_MDS_OP_UNLINK = 0x01203,
306 CEPH_MDS_OP_RENAME = 0x01204,
307 CEPH_MDS_OP_MKDIR = 0x01220,
308 CEPH_MDS_OP_RMDIR = 0x01221,
309 CEPH_MDS_OP_SYMLINK = 0x01222,
310
311 CEPH_MDS_OP_CREATE = 0x01301,
312 CEPH_MDS_OP_OPEN = 0x00302,
313 CEPH_MDS_OP_READDIR = 0x00305,
314
315 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
316 CEPH_MDS_OP_MKSNAP = 0x01400,
317 CEPH_MDS_OP_RMSNAP = 0x01401,
318 CEPH_MDS_OP_LSSNAP = 0x00402,
319};
320
321extern const char *ceph_mds_op_name(int op);
322
323
324#define CEPH_SETATTR_MODE 1
325#define CEPH_SETATTR_UID 2
326#define CEPH_SETATTR_GID 4
327#define CEPH_SETATTR_MTIME 8
328#define CEPH_SETATTR_ATIME 16
329#define CEPH_SETATTR_SIZE 32
330#define CEPH_SETATTR_CTIME 64
331
332union ceph_mds_request_args {
333 struct {
334 __le32 mask; /* CEPH_CAP_* */
335 } __attribute__ ((packed)) getattr;
336 struct {
337 __le32 mode;
338 __le32 uid;
339 __le32 gid;
340 struct ceph_timespec mtime;
341 struct ceph_timespec atime;
342 __le64 size, old_size; /* old_size needed by truncate */
343 __le32 mask; /* CEPH_SETATTR_* */
344 } __attribute__ ((packed)) setattr;
345 struct {
346 __le32 frag; /* which dir fragment */
347 __le32 max_entries; /* how many dentries to grab */
348 __le32 max_bytes;
349 } __attribute__ ((packed)) readdir;
350 struct {
351 __le32 mode;
352 __le32 rdev;
353 } __attribute__ ((packed)) mknod;
354 struct {
355 __le32 mode;
356 } __attribute__ ((packed)) mkdir;
357 struct {
358 __le32 flags;
359 __le32 mode;
360 __le32 stripe_unit; /* layout for newly created file */
361 __le32 stripe_count; /* ... */
362 __le32 object_size;
363 __le32 file_replication;
364 __le32 preferred;
365 } __attribute__ ((packed)) open;
366 struct {
367 __le32 flags;
368 } __attribute__ ((packed)) setxattr;
369 struct {
370 struct ceph_file_layout layout;
371 } __attribute__ ((packed)) setlayout;
372 struct {
373 __u8 rule; /* currently fcntl or flock */
374 __u8 type; /* shared, exclusive, remove */
375 __le64 pid; /* process id requesting the lock */
376 __le64 pid_namespace;
377 __le64 start; /* initial location to lock */
378 __le64 length; /* num bytes to lock from start */
379 __u8 wait; /* will caller wait for lock to become available? */
380 } __attribute__ ((packed)) filelock_change;
381} __attribute__ ((packed));
382
383#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
384#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
385
386struct ceph_mds_request_head {
387 __le64 oldest_client_tid;
388 __le32 mdsmap_epoch; /* on client */
389 __le32 flags; /* CEPH_MDS_FLAG_* */
390 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
391 __le16 num_releases; /* # of included cap/lease release records */
392 __le32 op; /* mds op code */
393 __le32 caller_uid, caller_gid;
394 __le64 ino; /* use this ino for openc, mkdir, mknod,
395 etc. (if replaying) */
396 union ceph_mds_request_args args;
397} __attribute__ ((packed));
398
399/* cap/lease release record */
400struct ceph_mds_request_release {
401 __le64 ino, cap_id; /* ino and unique cap id */
402 __le32 caps, wanted; /* new issued, wanted */
403 __le32 seq, issue_seq, mseq;
404 __le32 dname_seq; /* if releasing a dentry lease, a */
405 __le32 dname_len; /* string follows. */
406} __attribute__ ((packed));
407
408/* client reply */
409struct ceph_mds_reply_head {
410 __le32 op;
411 __le32 result;
412 __le32 mdsmap_epoch;
413 __u8 safe; /* true if committed to disk */
414 __u8 is_dentry, is_target; /* true if dentry, target inode records
415 are included with reply */
416} __attribute__ ((packed));
417
418/* one for each node split */
419struct ceph_frag_tree_split {
420 __le32 frag; /* this frag splits... */
421 __le32 by; /* ...by this many bits */
422} __attribute__ ((packed));
423
424struct ceph_frag_tree_head {
425 __le32 nsplits; /* num ceph_frag_tree_split records */
426 struct ceph_frag_tree_split splits[];
427} __attribute__ ((packed));
428
429/* capability issue, for bundling with mds reply */
430struct ceph_mds_reply_cap {
431 __le32 caps, wanted; /* caps issued, wanted */
432 __le64 cap_id;
433 __le32 seq, mseq;
434 __le64 realm; /* snap realm */
435 __u8 flags; /* CEPH_CAP_FLAG_* */
436} __attribute__ ((packed));
437
438#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
439
440/* inode record, for bundling with mds reply */
441struct ceph_mds_reply_inode {
442 __le64 ino;
443 __le64 snapid;
444 __le32 rdev;
445 __le64 version; /* inode version */
446 __le64 xattr_version; /* version for xattr blob */
447 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
448 struct ceph_file_layout layout;
449 struct ceph_timespec ctime, mtime, atime;
450 __le32 time_warp_seq;
451 __le64 size, max_size, truncate_size;
452 __le32 truncate_seq;
453 __le32 mode, uid, gid;
454 __le32 nlink;
455 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
456 struct ceph_timespec rctime;
457 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
458} __attribute__ ((packed));
459/* followed by frag array, then symlink string, then xattr blob */
460
461/* reply_lease follows dname, and reply_inode */
462struct ceph_mds_reply_lease {
463 __le16 mask; /* lease type(s) */
464 __le32 duration_ms; /* lease duration */
465 __le32 seq;
466} __attribute__ ((packed));
467
468struct ceph_mds_reply_dirfrag {
469 __le32 frag; /* fragment */
470 __le32 auth; /* auth mds, if this is a delegation point */
471 __le32 ndist; /* number of mds' this is replicated on */
472 __le32 dist[];
473} __attribute__ ((packed));
474
475#define CEPH_LOCK_FCNTL 1
476#define CEPH_LOCK_FLOCK 2
477
478#define CEPH_LOCK_SHARED 1
479#define CEPH_LOCK_EXCL 2
480#define CEPH_LOCK_UNLOCK 4
481
482struct ceph_filelock {
483 __le64 start;/* file offset to start lock at */
484 __le64 length; /* num bytes to lock; 0 for all following start */
485 __le64 client; /* which client holds the lock */
486 __le64 pid; /* process id holding the lock on the client */
487 __le64 pid_namespace;
488 __u8 type; /* shared lock, exclusive lock, or unlock */
489} __attribute__ ((packed));
490
491
492/* file access modes */
493#define CEPH_FILE_MODE_PIN 0
494#define CEPH_FILE_MODE_RD 1
495#define CEPH_FILE_MODE_WR 2
496#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
497#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
498#define CEPH_FILE_MODE_NUM 8 /* because these are bit fields.. mostly */
499
500int ceph_flags_to_mode(int flags);
501
502
503/* capability bits */
504#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
505
506/* generic cap bits */
507#define CEPH_CAP_GSHARED 1 /* client can read */
508#define CEPH_CAP_GEXCL 2 /* client can read and update */
509#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
510#define CEPH_CAP_GRD 8 /* (file) client can read */
511#define CEPH_CAP_GWR 16 /* (file) client can write */
512#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
513#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
514#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
515
516/* per-lock shift */
517#define CEPH_CAP_SAUTH 2
518#define CEPH_CAP_SLINK 4
519#define CEPH_CAP_SXATTR 6
520#define CEPH_CAP_SFILE 8
521#define CEPH_CAP_SFLOCK 20
522
523#define CEPH_CAP_BITS 22
524
525/* composed values */
526#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
527#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
528#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
529#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
530#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
531#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
532#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
533#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
534#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
535#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
536#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
537#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
538#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
539#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
540#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
541#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
542#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
543
544
545/* cap masks (for getattr) */
546#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
547#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
548#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
549#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
550#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
551#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
552#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
553#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
554#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
555#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
556#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
557#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
558#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
559 CEPH_CAP_AUTH_SHARED | \
560 CEPH_CAP_LINK_SHARED | \
561 CEPH_CAP_FILE_SHARED | \
562 CEPH_CAP_XATTR_SHARED)
563
564#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
565 CEPH_CAP_LINK_SHARED | \
566 CEPH_CAP_XATTR_SHARED | \
567 CEPH_CAP_FILE_SHARED)
568#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
569 CEPH_CAP_FILE_CACHE)
570
571#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
572 CEPH_CAP_LINK_EXCL | \
573 CEPH_CAP_XATTR_EXCL | \
574 CEPH_CAP_FILE_EXCL)
575#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
576 CEPH_CAP_FILE_EXCL)
577#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
578#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
579 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
580 CEPH_CAP_PIN)
581
582#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
583 CEPH_LOCK_IXATTR)
584
585int ceph_caps_for_mode(int mode);
586
587enum {
588 CEPH_CAP_OP_GRANT, /* mds->client grant */
589 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
590 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
591 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
592 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
593 CEPH_CAP_OP_UPDATE, /* client->mds update */
594 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
595 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
596 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
597 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
598 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
599 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
600 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
601};
602
603extern const char *ceph_cap_op_name(int op);
604
605/*
606 * caps message, used for capability callbacks, acks, requests, etc.
607 */
608struct ceph_mds_caps {
609 __le32 op; /* CEPH_CAP_OP_* */
610 __le64 ino, realm;
611 __le64 cap_id;
612 __le32 seq, issue_seq;
613 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
614 __le32 migrate_seq;
615 __le64 snap_follows;
616 __le32 snap_trace_len;
617
618 /* authlock */
619 __le32 uid, gid, mode;
620
621 /* linklock */
622 __le32 nlink;
623
624 /* xattrlock */
625 __le32 xattr_len;
626 __le64 xattr_version;
627
628 /* filelock */
629 __le64 size, max_size, truncate_size;
630 __le32 truncate_seq;
631 struct ceph_timespec mtime, atime, ctime;
632 struct ceph_file_layout layout;
633 __le32 time_warp_seq;
634} __attribute__ ((packed));
635
636/* cap release msg head */
637struct ceph_mds_cap_release {
638 __le32 num; /* number of cap_items that follow */
639} __attribute__ ((packed));
640
641struct ceph_mds_cap_item {
642 __le64 ino;
643 __le64 cap_id;
644 __le32 migrate_seq, seq;
645} __attribute__ ((packed));
646
647#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
648#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
649#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
650#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
651
652extern const char *ceph_lease_op_name(int o);
653
654/* lease msg header */
655struct ceph_mds_lease {
656 __u8 action; /* CEPH_MDS_LEASE_* */
657 __le16 mask; /* which lease */
658 __le64 ino;
659 __le64 first, last; /* snap range */
660 __le32 seq;
661 __le32 duration_ms; /* duration of renewal */
662} __attribute__ ((packed));
663/* followed by a __le32+string for dname */
664
665/* client reconnect */
666struct ceph_mds_cap_reconnect {
667 __le64 cap_id;
668 __le32 wanted;
669 __le32 issued;
670 __le64 snaprealm;
671 __le64 pathbase; /* base ino for our path to this ino */
672 __le32 flock_len; /* size of flock state blob, if any */
673} __attribute__ ((packed));
674/* followed by flock blob */
675
676struct ceph_mds_cap_reconnect_v1 {
677 __le64 cap_id;
678 __le32 wanted;
679 __le32 issued;
680 __le64 size;
681 struct ceph_timespec mtime, atime;
682 __le64 snaprealm;
683 __le64 pathbase; /* base ino for our path to this ino */
684} __attribute__ ((packed));
685
686struct ceph_mds_snaprealm_reconnect {
687 __le64 ino; /* snap realm base */
688 __le64 seq; /* snap seq for this snap realm */
689 __le64 parent; /* parent realm */
690} __attribute__ ((packed));
691
692/*
693 * snaps
694 */
695enum {
696 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
697 CEPH_SNAP_OP_CREATE,
698 CEPH_SNAP_OP_DESTROY,
699 CEPH_SNAP_OP_SPLIT,
700};
701
702extern const char *ceph_snap_op_name(int o);
703
704/* snap msg header */
705struct ceph_mds_snap_head {
706 __le32 op; /* CEPH_SNAP_OP_* */
707 __le64 split; /* ino to split off, if any */
708 __le32 num_split_inos; /* # inos belonging to new child realm */
709 __le32 num_split_realms; /* # child realms under new child realm */
710 __le32 trace_len; /* size of snap trace blob */
711} __attribute__ ((packed));
712/* followed by split ino list, then split realms, then the trace blob */
713
714/*
715 * encode info about a snaprealm, as viewed by a client
716 */
717struct ceph_mds_snap_realm {
718 __le64 ino; /* ino */
719 __le64 created; /* snap: when created */
720 __le64 parent; /* ino: parent realm */
721 __le64 parent_since; /* snap: same parent since */
722 __le64 seq; /* snap: version */
723 __le32 num_snaps;
724 __le32 num_prior_parent_snaps;
725} __attribute__ ((packed));
726/* followed by my snap list, then prior parent snap list */
727
728#endif
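
Review note: the capability bit layout above composes a small set of generic bits with a per-lock shift, e.g. CEPH_CAP_FILE_WR is GWR (16) shifted by SFILE (8), i.e. 0x1000. A tiny worked example using the constants from this header:

    #include <stdio.h>

    #define GSHARED 1
    #define GWR     16
    #define SAUTH   2
    #define SFILE   8

    int main(void)
    {
            printf("AUTH_SHARED = %#x\n", GSHARED << SAUTH);  /* 0x4 */
            printf("FILE_SHARED = %#x\n", GSHARED << SFILE);  /* 0x100 */
            printf("FILE_WR     = %#x\n", GWR << SFILE);      /* 0x1000 */
            return 0;
    }
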
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
deleted file mode 100644
index bd570015d147..000000000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
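
Review note: ceph_str_hash() dispatches between the two hash flavors above; the Linux dcache variant is short enough to exercise standalone. A quick userspace check with the function copied verbatim:

    #include <stdio.h>

    static unsigned str_hash_linux(const char *str, unsigned length)
    {
            unsigned long hash = 0;
            unsigned char c;

            while (length--) {
                    c = *str++;
                    hash = (hash + (c << 4) + (c >> 4)) * 11;
            }
            return hash;
    }

    int main(void)
    {
            printf("%u\n", str_hash_linux("foo", 3));
            return 0;
    }
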
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
deleted file mode 100644
index d099c3f90236..000000000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#ifndef FS_CEPH_HASH_H
2#define FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
deleted file mode 100644
index fabd302e5779..000000000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
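crush_calc_parents() above leans on the CRUSH convention that buckets carry negative ids while devices are non-negative, so -1-id maps bucket ids onto array slots 0..max_buckets-1. A small illustrative sketch of that mapping (bucket_index is a hypothetical helper):

/* id -1 -> slot 0, id -2 -> slot 1, ..., -max_buckets -> last */
static inline int bucket_index(int id)
{
	return -1 - id;
}

/* recording the parent of item c found inside bucket b:
 *   c >= 0 (device): device_parents[c] = b->id;
 *   c <  0 (bucket): bucket_parents[bucket_index(c)] = b->id;
 */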
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
deleted file mode 100644
index 97e435b191f4..000000000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
1#ifndef CEPH_CRUSH_CRUSH_H
2#define CEPH_CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
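
Because struct crush_rule ends in a zero-length steps[] array, a rule and its steps live in one allocation sized by crush_rule_size(). A hedged sketch of building a simple take/choose/emit rule (the GFP flag and step values are assumptions, not from this header):

/* illustrative: allocate a 3-step take/choose/emit rule */
static struct crush_rule *make_simple_rule(void)
{
	struct crush_rule *rule = kmalloc(crush_rule_size(3), GFP_NOFS);

	if (!rule)
		return NULL;
	rule->len = 3;
	rule->steps[0].op = CRUSH_RULE_TAKE;		/* arg1: root bucket id */
	rule->steps[1].op = CRUSH_RULE_CHOOSE_FIRSTN;	/* arg1: n, arg2: type */
	rule->steps[2].op = CRUSH_RULE_EMIT;
	return rule;
}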
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
deleted file mode 100644
index 5873aed694bf..000000000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
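
Every crush_hash32_N variant seeds its state with the XOR of the inputs and the fixed seed, then stirs with crush_hashmix(); the point is a deterministic, well-mixed 32-bit value. An illustrative call (the input values are arbitrary):

/* illustrative: the draw is deterministic and order-sensitive */
static void hash_demo(void)
{
	__u32 d1 = crush_hash32_2(CRUSH_HASH_RJENKINS1, 0x12345678, 42);
	__u32 d2 = crush_hash32_2(CRUSH_HASH_RJENKINS1, 42, 0x12345678);

	/* d1 is identical on every host and every run, so all clients
	 * compute the same placements; d1 != d2 in general because the
	 * mixing rounds are order-sensitive */
	(void)d1;
	(void)d2;
}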
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
deleted file mode 100644
index 91e884230d5d..000000000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
1#ifndef CEPH_CRUSH_HASH_H
2#define CEPH_CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
deleted file mode 100644
index a4eec133258e..000000000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
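bucket_perm_choose() is effectively an incremental Fisher-Yates shuffle: perm[] is only finalized up to the position actually requested, and the r=0 fast path stashes a single element with perm_n set to the 0xffff sentinel. A compact sketch of just the incremental shuffle, with the CRUSH hash abstracted into a callback (all names here are hypothetical):

/* finalize positions 0..pr of a partial permutation of 0..size-1 */
static void perm_extend(__u32 *perm, __u32 *perm_n, unsigned size,
			unsigned pr, __u32 (*h)(unsigned p))
{
	while (*perm_n <= pr) {
		unsigned p = *perm_n;

		if (p < size - 1) {
			unsigned i = h(p) % (size - p);

			if (i) {	/* swap in a later element */
				__u32 t = perm[p + i];

				perm[p + i] = perm[p];
				perm[p] = t;
			}
		}
		(*perm_n)++;
	}
}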
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
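The tree bucket stores its nodes as an implicit binary tree in which odd indices are leaves and a node's height is the number of trailing zero bits in its index. A worked example for an 8-slot node array (num_nodes = 8, so the root is node 4):

/*            4            <- root, height(4) == 2
 *          /   \
 *         2     6          <- height 1
 *        / \   / \
 *       1   3 5   7        <- odd indices: leaves
 *
 * left(4) == 2, right(4) == 6, terminal(5) != 0,
 * and leaf n holds items[n >> 1], so leaves 1,3,5,7
 * carry items 0..3.
 */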
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
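
Weights here are 16.16 fixed point: 0x10000 means fully in, 0 fully out, and intermediate values keep a matching fraction of inputs, decided per-x by a 16-bit hash draw. A sketch of that probability check with assumed values:

/* illustrative: would an item with weight 0x8000 keep input x? */
static int keeps_input(int x, int item)
{
	__u32 w = 0x8000;	/* 0.5 in 16.16 fixed point */
	__u32 draw = crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff;

	return draw < w;	/* true for roughly half of all x */
}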
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
311
312 for (rep = outpos; rep < numrep; rep++) {
313 /* keep trying until we get a non-out, non-colliding item */
314 ftotal = 0;
315 skip_rep = 0;
316 do {
317 retry_descent = 0;
318 in = bucket; /* initial bucket */
319
320 /* choose through intervening buckets */
321 flocal = 0;
322 do {
323 collide = 0;
324 retry_bucket = 0;
325 r = rep;
326 if (in->alg == CRUSH_BUCKET_UNIFORM) {
327 /* be careful */
328 if (firstn || numrep >= in->size)
329 /* r' = r + f_total */
330 r += ftotal;
331 else if (in->size % numrep == 0)
332 /* r'=r+(n+1)*f_local */
333 r += (numrep+1) *
334 (flocal+ftotal);
335 else
336 /* r' = r + n*f_local */
337 r += numrep * (flocal+ftotal);
338 } else {
339 if (firstn)
340 /* r' = r + f_total */
341 r += ftotal;
342 else
343 /* r' = r + n*f_local */
344 r += numrep * (flocal+ftotal);
345 }
346
347 /* bucket choose */
348 if (in->size == 0) {
349 reject = 1;
350 goto reject;
351 }
352 if (flocal >= (in->size>>1) &&
353 flocal > orig_tries)
354 item = bucket_perm_choose(in, x, r);
355 else
356 item = crush_bucket_choose(in, x, r);
357 BUG_ON(item >= map->max_devices);
358
359 /* desired type? */
360 if (item < 0)
361 itemtype = map->buckets[-1-item]->type;
362 else
363 itemtype = 0;
364 dprintk(" item %d type %d\n", item, itemtype);
365
366 /* keep going? */
367 if (itemtype != type) {
368 BUG_ON(item >= 0 ||
369 (-1-item) >= map->max_buckets);
370 in = map->buckets[-1-item];
371 retry_bucket = 1;
372 continue;
373 }
374
375 /* collision? */
376 for (i = 0; i < outpos; i++) {
377 if (out[i] == item) {
378 collide = 1;
379 break;
380 }
381 }
382
383 reject = 0;
384 if (recurse_to_leaf) {
385 if (item < 0) {
386 if (crush_choose(map,
387 map->buckets[-1-item],
388 weight,
389 x, outpos+1, 0,
390 out2, outpos,
391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
402 /* out? */
403 if (itemtype == 0)
404 reject = is_out(map, weight,
405 item, x);
406 else
407 reject = 0;
408 }
409
410reject:
411 if (reject || collide) {
412 ftotal++;
413 flocal++;
414
415 if (collide && flocal < 3)
416 /* retry locally a few times */
417 retry_bucket = 1;
418 else if (flocal < in->size + orig_tries)
419 /* exhaustive bucket search */
420 retry_bucket = 1;
421 else if (ftotal < 20)
422 /* then retry descent */
423 retry_descent = 1;
424 else
425 /* else give up */
426 skip_rep = 1;
427 dprintk(" reject %d collide %d "
428 "ftotal %d flocal %d\n",
429 reject, collide, ftotal,
430 flocal);
431 }
432 } while (retry_bucket);
433 } while (retry_descent);
434
435 if (skip_rep) {
436 dprintk("skip rep\n");
437 continue;
438 }
439
440 dprintk("CHOOSE got %d\n", item);
441 out[outpos] = item;
442 outpos++;
443 }
444
445 dprintk("CHOOSE returns %d\n", outpos);
446 return outpos;
447}
448
449
450/**
451 * crush_do_rule - calculate a mapping with the given input and rule
452 * @map: the crush_map
453 * @ruleno: the rule id
454 * @x: hash input
455 * @result: pointer to result vector
456 * @result_max: maximum result size
457 * @force: force initial replica choice; -1 for none
458 */
459int crush_do_rule(struct crush_map *map,
460 int ruleno, int x, int *result, int result_max,
461 int force, __u32 *weight)
462{
463 int result_len;
464 int force_context[CRUSH_MAX_DEPTH];
465 int force_pos = -1;
466 int a[CRUSH_MAX_SET];
467 int b[CRUSH_MAX_SET];
468 int c[CRUSH_MAX_SET];
469 int recurse_to_leaf;
470 int *w;
471 int wsize = 0;
472 int *o;
473 int osize;
474 int *tmp;
475 struct crush_rule *rule;
476 int step;
477 int i, j;
478 int numrep;
479 int firstn;
480 int rc = -1;
481
482 BUG_ON(ruleno >= map->max_rules);
483
484 rule = map->rules[ruleno];
485 result_len = 0;
486 w = a;
487 o = b;
488
489 /*
490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0) {
495 if (force >= map->max_devices ||
496 map->device_parents[force] == 0) {
497 /*dprintk("CRUSH: forcefed device dne\n");*/
498 rc = -1; /* force fed device dne */
499 goto out;
500 }
501 if (!is_out(map, weight, force, x)) {
502 while (1) {
503 force_context[++force_pos] = force;
504 if (force >= 0)
505 force = map->device_parents[force];
506 else
507 force = map->bucket_parents[-1-force];
508 if (force == 0)
509 break;
510 }
511 }
512 }
513
514 for (step = 0; step < rule->len; step++) {
515 firstn = 0;
516 switch (rule->steps[step].op) {
517 case CRUSH_RULE_TAKE:
518 w[0] = rule->steps[step].arg1;
519 if (force_pos >= 0) {
520 BUG_ON(force_context[force_pos] != w[0]);
521 force_pos--;
522 }
523 wsize = 1;
524 break;
525
526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
527 case CRUSH_RULE_CHOOSE_FIRSTN:
528 firstn = 1;
529 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
530 case CRUSH_RULE_CHOOSE_INDEP:
531 BUG_ON(wsize == 0);
532
533 recurse_to_leaf =
534 rule->steps[step].op ==
535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
536 rule->steps[step].op ==
537 CRUSH_RULE_CHOOSE_LEAF_INDEP;
538
539 /* reset output */
540 osize = 0;
541
542 for (i = 0; i < wsize; i++) {
543 /*
544 * see the CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
545 * basically, numrep <= 0 means relative to
546 * the provided result_max
547 */
548 numrep = rule->steps[step].arg1;
549 if (numrep <= 0) {
550 numrep += result_max;
551 if (numrep <= 0)
552 continue;
553 }
554 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
569 osize += crush_choose(map,
570 map->buckets[-1-w[i]],
571 weight,
572 x, numrep,
573 rule->steps[step].arg2,
574 o+osize, j,
575 firstn,
576 recurse_to_leaf, c+osize);
577 }
578
579 if (recurse_to_leaf)
580 /* copy final _leaf_ values to output set */
581 memcpy(o, c, osize*sizeof(*o));
582
583 /* swap t and w arrays */
584 tmp = o;
585 o = w;
586 w = tmp;
587 wsize = osize;
588 break;
589
590
591 case CRUSH_RULE_EMIT:
592 for (i = 0; i < wsize && result_len < result_max; i++) {
593 result[result_len] = w[i];
594 result_len++;
595 }
596 wsize = 0;
597 break;
598
599 default:
600 BUG_ON(1);
601 }
602 }
603 rc = result_len;
604
605out:
606 return rc;
607}
608
609
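A hedged end-to-end sketch of driving the mapper, assuming a decoded crush_map and a per-device weight vector already exist (both would normally come from the osdmap code; map_object is a hypothetical wrapper):

/* illustrative: map input x to up to 3 devices */
static int map_object(struct crush_map *map, int x, __u32 *weights,
		      int *osds)
{
	/* ruleset 0, replicated type 1, want 3 replicas */
	int ruleno = crush_find_rule(map, 0, 1, 3);

	if (ruleno < 0)
		return -1;	/* no rule matches that mask */
	/* on success, osds[0..n-1] hold the chosen devices for x */
	return crush_do_rule(map, ruleno, x, osds, 3, -1, weights);
}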
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
deleted file mode 100644
index c46b99c18bb0..000000000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
1#ifndef CEPH_CRUSH_MAPPER_H
2#define CEPH_CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int ruleset, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
deleted file mode 100644
index a3e627f63293..000000000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,412 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79
80static int ceph_aes_encrypt(const void *key, int key_len,
81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
83{
84 struct scatterlist sg_in[2], sg_out[1];
85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
86 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
87 int ret;
88 void *iv;
89 int ivsize;
90 size_t zero_padding = (0x10 - (src_len & 0x0f));
91 char pad[16];
92
93 if (IS_ERR(tfm))
94 return PTR_ERR(tfm);
95
96 memset(pad, zero_padding, zero_padding);
97
98 *dst_len = src_len + zero_padding;
99
100 crypto_blkcipher_setkey((void *)tfm, key, key_len);
101 sg_init_table(sg_in, 2);
102 sg_set_buf(&sg_in[0], src, src_len);
103 sg_set_buf(&sg_in[1], pad, zero_padding);
104 sg_init_table(sg_out, 1);
105 sg_set_buf(sg_out, dst, *dst_len);
106 iv = crypto_blkcipher_crt(tfm)->iv;
107 ivsize = crypto_blkcipher_ivsize(tfm);
108
109 memcpy(iv, aes_iv, ivsize);
110 /*
111 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
112 key, key_len, 1);
113 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
114 src, src_len, 1);
115 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
116 pad, zero_padding, 1);
117 */
118 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
119 src_len + zero_padding);
120 crypto_free_blkcipher(tfm);
121 if (ret < 0)
122 pr_err("ceph_aes_crypt failed %d\n", ret);
123 /*
124 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
125 dst, *dst_len, 1);
126 */
127 return ret;
128}
129
130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
131 size_t *dst_len,
132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
134{
135 struct scatterlist sg_in[3], sg_out[1];
136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
137 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
138 int ret;
139 void *iv;
140 int ivsize;
141 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
142 char pad[16];
143
144 if (IS_ERR(tfm))
145 return PTR_ERR(tfm);
146
147 memset(pad, zero_padding, zero_padding);
148
149 *dst_len = src1_len + src2_len + zero_padding;
150
151 crypto_blkcipher_setkey((void *)tfm, key, key_len);
152 sg_init_table(sg_in, 3);
153 sg_set_buf(&sg_in[0], src1, src1_len);
154 sg_set_buf(&sg_in[1], src2, src2_len);
155 sg_set_buf(&sg_in[2], pad, zero_padding);
156 sg_init_table(sg_out, 1);
157 sg_set_buf(sg_out, dst, *dst_len);
158 iv = crypto_blkcipher_crt(tfm)->iv;
159 ivsize = crypto_blkcipher_ivsize(tfm);
160
161 memcpy(iv, aes_iv, ivsize);
162 /*
163 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
164 key, key_len, 1);
165 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
166 src1, src1_len, 1);
167 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
168 src2, src2_len, 1);
169 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
170 pad, zero_padding, 1);
171 */
172 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
173 src1_len + src2_len + zero_padding);
174 crypto_free_blkcipher(tfm);
175 if (ret < 0)
176 pr_err("ceph_aes_crypt2 failed %d\n", ret);
177 /*
178 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
179 dst, *dst_len, 1);
180 */
181 return ret;
182}
183
184static int ceph_aes_decrypt(const void *key, int key_len,
185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
187{
188 struct scatterlist sg_in[1], sg_out[2];
189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
190 struct blkcipher_desc desc = { .tfm = tfm };
191 char pad[16];
192 void *iv;
193 int ivsize;
194 int ret;
195 int last_byte;
196
197 if (IS_ERR(tfm))
198 return PTR_ERR(tfm);
199
200 crypto_blkcipher_setkey((void *)tfm, key, key_len);
201 sg_init_table(sg_in, 1);
202 sg_init_table(sg_out, 2);
203 sg_set_buf(sg_in, src, src_len);
204 sg_set_buf(&sg_out[0], dst, *dst_len);
205 sg_set_buf(&sg_out[1], pad, sizeof(pad));
206
207 iv = crypto_blkcipher_crt(tfm)->iv;
208 ivsize = crypto_blkcipher_ivsize(tfm);
209
210 memcpy(iv, aes_iv, ivsize);
211
212 /*
213 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
214 key, key_len, 1);
215 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
216 src, src_len, 1);
217 */
218
219 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
220 crypto_free_blkcipher(tfm);
221 if (ret < 0) {
222 pr_err("ceph_aes_decrypt failed %d\n", ret);
223 return ret;
224 }
225
226 if (src_len <= *dst_len)
227 last_byte = ((char *)dst)[src_len - 1];
228 else
229 last_byte = pad[src_len - *dst_len - 1];
230 if (last_byte <= 16 && src_len >= last_byte) {
231 *dst_len = src_len - last_byte;
232 } else {
233 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
234 last_byte, (int)src_len);
235 return -EPERM; /* bad padding */
236 }
237 /*
238 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
239 dst, *dst_len, 1);
240 */
241 return 0;
242}
243
244static int ceph_aes_decrypt2(const void *key, int key_len,
245 void *dst1, size_t *dst1_len,
246 void *dst2, size_t *dst2_len,
247 const void *src, size_t src_len)
248{
249 struct scatterlist sg_in[1], sg_out[3];
250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
251 struct blkcipher_desc desc = { .tfm = tfm };
252 char pad[16];
253 void *iv;
254 int ivsize;
255 int ret;
256 int last_byte;
257
258 if (IS_ERR(tfm))
259 return PTR_ERR(tfm);
260
261 sg_init_table(sg_in, 1);
262 sg_set_buf(sg_in, src, src_len);
263 sg_init_table(sg_out, 3);
264 sg_set_buf(&sg_out[0], dst1, *dst1_len);
265 sg_set_buf(&sg_out[1], dst2, *dst2_len);
266 sg_set_buf(&sg_out[2], pad, sizeof(pad));
267
268 crypto_blkcipher_setkey((void *)tfm, key, key_len);
269 iv = crypto_blkcipher_crt(tfm)->iv;
270 ivsize = crypto_blkcipher_ivsize(tfm);
271
272 memcpy(iv, aes_iv, ivsize);
273
274 /*
275 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
276 key, key_len, 1);
277 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
278 src, src_len, 1);
279 */
280
281 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
282 crypto_free_blkcipher(tfm);
283 if (ret < 0) {
284 pr_err("ceph_aes_decrypt failed %d\n", ret);
285 return ret;
286 }
287
288 if (src_len <= *dst1_len)
289 last_byte = ((char *)dst1)[src_len - 1];
290 else if (src_len <= *dst1_len + *dst2_len)
291 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
292 else
293 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
294 if (last_byte <= 16 && src_len >= last_byte) {
295 src_len -= last_byte;
296 } else {
297 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
298 last_byte, (int)src_len);
299 return -EPERM; /* bad padding */
300 }
301
302 if (src_len < *dst1_len) {
303 *dst1_len = src_len;
304 *dst2_len = 0;
305 } else {
306 *dst2_len = src_len - *dst1_len;
307 }
308 /*
309 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
310 dst1, *dst1_len, 1);
311 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
312 dst2, *dst2_len, 1);
313 */
314
315 return 0;
316}
317
318
319int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
320 const void *src, size_t src_len)
321{
322 switch (secret->type) {
323 case CEPH_CRYPTO_NONE:
324 if (*dst_len < src_len)
325 return -ERANGE;
326 memcpy(dst, src, src_len);
327 *dst_len = src_len;
328 return 0;
329
330 case CEPH_CRYPTO_AES:
331 return ceph_aes_decrypt(secret->key, secret->len, dst,
332 dst_len, src, src_len);
333
334 default:
335 return -EINVAL;
336 }
337}
338
339int ceph_decrypt2(struct ceph_crypto_key *secret,
340 void *dst1, size_t *dst1_len,
341 void *dst2, size_t *dst2_len,
342 const void *src, size_t src_len)
343{
344 size_t t;
345
346 switch (secret->type) {
347 case CEPH_CRYPTO_NONE:
348 if (*dst1_len + *dst2_len < src_len)
349 return -ERANGE;
350 t = min(*dst1_len, src_len);
351 memcpy(dst1, src, t);
352 *dst1_len = t;
353 src += t;
354 src_len -= t;
355 if (src_len) {
356 t = min(*dst2_len, src_len);
357 memcpy(dst2, src, t);
358 *dst2_len = t;
359 }
360 return 0;
361
362 case CEPH_CRYPTO_AES:
363 return ceph_aes_decrypt2(secret->key, secret->len,
364 dst1, dst1_len, dst2, dst2_len,
365 src, src_len);
366
367 default:
368 return -EINVAL;
369 }
370}
371
372int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
373 const void *src, size_t src_len)
374{
375 switch (secret->type) {
376 case CEPH_CRYPTO_NONE:
377 if (*dst_len < src_len)
378 return -ERANGE;
379 memcpy(dst, src, src_len);
380 *dst_len = src_len;
381 return 0;
382
383 case CEPH_CRYPTO_AES:
384 return ceph_aes_encrypt(secret->key, secret->len, dst,
385 dst_len, src, src_len);
386
387 default:
388 return -EINVAL;
389 }
390}
391
392int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
393 const void *src1, size_t src1_len,
394 const void *src2, size_t src2_len)
395{
396 switch (secret->type) {
397 case CEPH_CRYPTO_NONE:
398 if (*dst_len < src1_len + src2_len)
399 return -ERANGE;
400 memcpy(dst, src1, src1_len);
401 memcpy(dst + src1_len, src2, src2_len);
402 *dst_len = src1_len + src2_len;
403 return 0;
404
405 case CEPH_CRYPTO_AES:
406 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
407 src1, src1_len, src2, src2_len);
408
409 default:
410 return -EINVAL;
411 }
412}
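
The CBC helpers above always pad: 0x10 - (len & 0x0f) is in the range 1..16, so even block-aligned input grows by a full block, and the pad byte's value doubles as the pad length that decrypt strips. A quick check of the arithmetic (cbc_dst_len is a hypothetical helper):

/* padded CBC output length for a given source length */
static size_t cbc_dst_len(size_t src_len)
{
	size_t zero_padding = 0x10 - (src_len & 0x0f);	/* always 1..16 */

	return src_len + zero_padding;	/* 13 -> 16, 16 -> 32 */
}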
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
deleted file mode 100644
index bdf38607323c..000000000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a8611..08f65faac112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
@@ -7,143 +7,49 @@
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9 9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
10#include "super.h" 15#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14 16
15#ifdef CONFIG_DEBUG_FS 17#ifdef CONFIG_DEBUG_FS
16 18
17/* 19#include "mds_client.h"
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53 20
54static int mdsmap_show(struct seq_file *s, void *p) 21static int mdsmap_show(struct seq_file *s, void *p)
55{ 22{
56 int i; 23 int i;
57 struct ceph_client *client = s->private; 24 struct ceph_fs_client *fsc = s->private;
58 25
59 if (client->mdsc.mdsmap == NULL) 26 if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
60 return 0; 27 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); 28 seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); 29 seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n", 30 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout); 31 fsc->mdsc->mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n", 32 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose); 33 fsc->mdsc->mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { 34 for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr = 35 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr; 36 &fsc->mdsc->mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state; 37 int state = fsc->mdsc->mdsmap->m_info[i].state;
71 38
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), 39 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
40 ceph_pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state)); 41 ceph_mds_state_name(state));
74 } 42 }
75 return 0; 43 return 0;
76} 44}
77 45
78static int osdmap_show(struct seq_file *s, void *p) 46/*
79{ 47 * mdsc debugfs
80 int i; 48 */
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 __u16 op;
131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
137 }
138
139 mutex_unlock(&monc->mutex);
140 return 0;
141}
142
143static int mdsc_show(struct seq_file *s, void *p) 49static int mdsc_show(struct seq_file *s, void *p)
144{ 50{
145 struct ceph_client *client = s->private; 51 struct ceph_fs_client *fsc = s->private;
146 struct ceph_mds_client *mdsc = &client->mdsc; 52 struct ceph_mds_client *mdsc = fsc->mdsc;
147 struct ceph_mds_request *req; 53 struct ceph_mds_request *req;
148 struct rb_node *rp; 54 struct rb_node *rp;
149 int pathlen; 55 int pathlen;
@@ -154,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
154 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { 60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
155 req = rb_entry(rp, struct ceph_mds_request, r_node); 61 req = rb_entry(rp, struct ceph_mds_request, r_node);
156 62
157 if (req->r_request) 63 if (req->r_request && req->r_session)
158 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); 64 seq_printf(s, "%lld\tmds%d\t", req->r_tid,
159 else 65 req->r_session->s_mds);
66 else if (!req->r_request)
160 seq_printf(s, "%lld\t(no request)\t", req->r_tid); 67 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
68 else
69 seq_printf(s, "%lld\t(no session)\t", req->r_tid);
161 70
162 seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); 71 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
163 72
@@ -214,61 +123,12 @@ static int mdsc_show(struct seq_file *s, void *p)
214 return 0; 123 return 0;
215} 124}
216 125
217static int osdc_show(struct seq_file *s, void *pp)
218{
219 struct ceph_client *client = s->private;
220 struct ceph_osd_client *osdc = &client->osdc;
221 struct rb_node *p;
222
223 mutex_lock(&osdc->request_mutex);
224 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
225 struct ceph_osd_request *req;
226 struct ceph_osd_request_head *head;
227 struct ceph_osd_op *op;
228 int num_ops;
229 int opcode, olen;
230 int i;
231
232 req = rb_entry(p, struct ceph_osd_request, r_node);
233
234 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
235 req->r_osd ? req->r_osd->o_osd : -1,
236 le32_to_cpu(req->r_pgid.pool),
237 le16_to_cpu(req->r_pgid.ps));
238
239 head = req->r_request->front.iov_base;
240 op = (void *)(head + 1);
241
242 num_ops = le16_to_cpu(head->num_ops);
243 olen = le32_to_cpu(head->object_len);
244 seq_printf(s, "%.*s", olen,
245 (const char *)(head->ops + num_ops));
246
247 if (req->r_reassert_version.epoch)
248 seq_printf(s, "\t%u'%llu",
249 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
250 le64_to_cpu(req->r_reassert_version.version));
251 else
252 seq_printf(s, "\t");
253
254 for (i = 0; i < num_ops; i++) {
255 opcode = le16_to_cpu(op->op);
256 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
257 op++;
258 }
259
260 seq_printf(s, "\n");
261 }
262 mutex_unlock(&osdc->request_mutex);
263 return 0;
264}
265
266static int caps_show(struct seq_file *s, void *p) 126static int caps_show(struct seq_file *s, void *p)
267{ 127{
268 struct ceph_client *client = s->private; 128 struct ceph_fs_client *fsc = s->private;
269 int total, avail, used, reserved, min; 129 int total, avail, used, reserved, min;
270 130
271 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); 131 ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
272 seq_printf(s, "total\t\t%d\n" 132 seq_printf(s, "total\t\t%d\n"
273 "avail\t\t%d\n" 133 "avail\t\t%d\n"
274 "used\t\t%d\n" 134 "used\t\t%d\n"
@@ -280,8 +140,8 @@ static int caps_show(struct seq_file *s, void *p)
280 140
281static int dentry_lru_show(struct seq_file *s, void *ptr) 141static int dentry_lru_show(struct seq_file *s, void *ptr)
282{ 142{
283 struct ceph_client *client = s->private; 143 struct ceph_fs_client *fsc = s->private;
284 struct ceph_mds_client *mdsc = &client->mdsc; 144 struct ceph_mds_client *mdsc = fsc->mdsc;
285 struct ceph_dentry_info *di; 145 struct ceph_dentry_info *di;
286 146
287 spin_lock(&mdsc->dentry_lru_lock); 147 spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +155,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
295 return 0; 155 return 0;
296} 156}
297 157
298#define DEFINE_SHOW_FUNC(name) \ 158CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
299static int name##_open(struct inode *inode, struct file *file) \ 159CEPH_DEFINE_SHOW_FUNC(mdsc_show)
300{ \ 160CEPH_DEFINE_SHOW_FUNC(caps_show)
301 struct seq_file *sf; \ 161CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
302 int ret; \ 162
303 \
304 ret = single_open(file, name, NULL); \
305 sf = file->private_data; \
306 sf->private = inode->i_private; \
307 return ret; \
308} \
309 \
310static const struct file_operations name##_fops = { \
311 .open = name##_open, \
312 .read = seq_read, \
313 .llseek = seq_lseek, \
314 .release = single_release, \
315};
316
317DEFINE_SHOW_FUNC(monmap_show)
318DEFINE_SHOW_FUNC(mdsmap_show)
319DEFINE_SHOW_FUNC(osdmap_show)
320DEFINE_SHOW_FUNC(monc_show)
321DEFINE_SHOW_FUNC(mdsc_show)
322DEFINE_SHOW_FUNC(osdc_show)
323DEFINE_SHOW_FUNC(dentry_lru_show)
324DEFINE_SHOW_FUNC(caps_show)
325 163
164/*
165 * debugfs
166 */
326static int congestion_kb_set(void *data, u64 val) 167static int congestion_kb_set(void *data, u64 val)
327{ 168{
328 struct ceph_client *client = (struct ceph_client *)data; 169 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
329
330 if (client)
331 client->mount_args->congestion_kb = (int)val;
332 170
171 fsc->mount_options->congestion_kb = (int)val;
333 return 0; 172 return 0;
334} 173}
335 174
336static int congestion_kb_get(void *data, u64 *val) 175static int congestion_kb_get(void *data, u64 *val)
337{ 176{
338 struct ceph_client *client = (struct ceph_client *)data; 177 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
339
340 if (client)
341 *val = (u64)client->mount_args->congestion_kb;
342 178
179 *val = (u64)fsc->mount_options->congestion_kb;
343 return 0; 180 return 0;
344} 181}
345 182
346
347DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, 183DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
348 congestion_kb_set, "%llu\n"); 184 congestion_kb_set, "%llu\n");
349 185
350int __init ceph_debugfs_init(void)
351{
352 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
353 if (!ceph_debugfs_dir)
354 return -ENOMEM;
355 return 0;
356}
357 186
358void ceph_debugfs_cleanup(void) 187void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
359{ 188{
360 debugfs_remove(ceph_debugfs_dir); 189 dout("ceph_fs_debugfs_cleanup\n");
190 debugfs_remove(fsc->debugfs_bdi);
191 debugfs_remove(fsc->debugfs_congestion_kb);
192 debugfs_remove(fsc->debugfs_mdsmap);
193 debugfs_remove(fsc->debugfs_caps);
194 debugfs_remove(fsc->debugfs_mdsc);
195 debugfs_remove(fsc->debugfs_dentry_lru);
361} 196}
362 197
363int ceph_debugfs_client_init(struct ceph_client *client) 198int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
364{ 199{
365 int ret = 0; 200 char name[100];
366 char name[80]; 201 int err = -ENOMEM;
367
368 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
369 client->monc.auth->global_id);
370
371 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
372 if (!client->debugfs_dir)
373 goto out;
374 202
375 client->monc.debugfs_file = debugfs_create_file("monc", 203 dout("ceph_fs_debugfs_init\n");
376 0600, 204 fsc->debugfs_congestion_kb =
377 client->debugfs_dir, 205 debugfs_create_file("writeback_congestion_kb",
378 client, 206 0600,
379 &monc_show_fops); 207 fsc->client->debugfs_dir,
380 if (!client->monc.debugfs_file) 208 fsc,
381 goto out; 209 &congestion_kb_fops);
382 210 if (!fsc->debugfs_congestion_kb)
383 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
384 0600,
385 client->debugfs_dir,
386 client,
387 &mdsc_show_fops);
388 if (!client->mdsc.debugfs_file)
389 goto out; 211 goto out;
390 212
391 client->osdc.debugfs_file = debugfs_create_file("osdc", 213 dout("a\n");
392 0600,
393 client->debugfs_dir,
394 client,
395 &osdc_show_fops);
396 if (!client->osdc.debugfs_file)
-		goto out;
 
-	client->debugfs_monmap = debugfs_create_file("monmap",
-						     0600,
-						     client->debugfs_dir,
-						     client,
-						     &monmap_show_fops);
-	if (!client->debugfs_monmap)
+	snprintf(name, sizeof(name), "../../bdi/%s",
+		 dev_name(fsc->backing_dev_info.dev));
+	fsc->debugfs_bdi =
+		debugfs_create_symlink("bdi",
+				       fsc->client->debugfs_dir,
+				       name);
+	if (!fsc->debugfs_bdi)
 		goto out;
 
-	client->debugfs_mdsmap = debugfs_create_file("mdsmap",
+	dout("b\n");
+	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
 					0600,
-					client->debugfs_dir,
-					client,
+					fsc->client->debugfs_dir,
+					fsc,
 					&mdsmap_show_fops);
-	if (!client->debugfs_mdsmap)
+	if (!fsc->debugfs_mdsmap)
 		goto out;
 
-	client->debugfs_osdmap = debugfs_create_file("osdmap",
-					0600,
-					client->debugfs_dir,
-					client,
-					&osdmap_show_fops);
-	if (!client->debugfs_osdmap)
+	dout("ca\n");
+	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+					0600,
+					fsc->client->debugfs_dir,
+					fsc,
+					&mdsc_show_fops);
+	if (!fsc->debugfs_mdsc)
 		goto out;
 
-	client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-					0600,
-					client->debugfs_dir,
-					client,
-					&dentry_lru_show_fops);
-	if (!client->debugfs_dentry_lru)
-		goto out;
-
-	client->debugfs_caps = debugfs_create_file("caps",
+	dout("da\n");
+	fsc->debugfs_caps = debugfs_create_file("caps",
 						   0400,
-						   client->debugfs_dir,
-						   client,
+						   fsc->client->debugfs_dir,
+						   fsc,
 						   &caps_show_fops);
-	if (!client->debugfs_caps)
+	if (!fsc->debugfs_caps)
 		goto out;
 
-	client->debugfs_congestion_kb =
-		debugfs_create_file("writeback_congestion_kb",
+	dout("ea\n");
+	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
 					0600,
-					client->debugfs_dir,
-					client,
-					&congestion_kb_fops);
-	if (!client->debugfs_congestion_kb)
+					fsc->client->debugfs_dir,
+					fsc,
+					&dentry_lru_show_fops);
+	if (!fsc->debugfs_dentry_lru)
 		goto out;
 
-	sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
-	client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
-						     name);
-
 	return 0;
 
 out:
-	ceph_debugfs_client_cleanup(client);
-	return ret;
+	ceph_fs_debugfs_cleanup(fsc);
+	return err;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
-	debugfs_remove(client->debugfs_bdi);
-	debugfs_remove(client->debugfs_caps);
-	debugfs_remove(client->debugfs_dentry_lru);
-	debugfs_remove(client->debugfs_osdmap);
-	debugfs_remove(client->debugfs_mdsmap);
-	debugfs_remove(client->debugfs_monmap);
-	debugfs_remove(client->osdc.debugfs_file);
-	debugfs_remove(client->mdsc.debugfs_file);
-	debugfs_remove(client->monc.debugfs_file);
-	debugfs_remove(client->debugfs_congestion_kb);
-	debugfs_remove(client->debugfs_dir);
-}
 
 #else  /* CONFIG_DEBUG_FS */
 
-int __init ceph_debugfs_init(void)
-{
-	return 0;
-}
-
-void ceph_debugfs_cleanup(void)
-{
-}
-
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
 	return 0;
 }
 
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
 }
 
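
With the libceph split, the monmap/osdmap dumps and the mon/osd/mds client
entries move to the shared ceph_client, so ceph_fs_debugfs_init() above
registers only the filesystem-level files under fsc->client->debugfs_dir.
A sketch of the matching teardown, assuming the standard debugfs behavior
that debugfs_remove() is a no-op on a NULL dentry (so an init that failed
part-way can be torn down unconditionally):

	void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
	{
		/* safe even after partial init: debugfs_remove()
		 * silently ignores NULL dentries */
		debugfs_remove(fsc->debugfs_bdi);
		debugfs_remove(fsc->debugfs_congestion_kb);
		debugfs_remove(fsc->debugfs_mdsmap);
		debugfs_remove(fsc->debugfs_mdsc);
		debugfs_remove(fsc->debugfs_caps);
		debugfs_remove(fsc->debugfs_dentry_lru);
	}
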
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
deleted file mode 100644
index 3d25415afe63..000000000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef __CEPH_DECODE_H
-#define __CEPH_DECODE_H
-
-#include <asm/unaligned.h>
-#include <linux/time.h>
-
-#include "types.h"
-
-/*
- * in all cases,
- *   void **p     pointer to position pointer
- *   void *end    pointer to end of buffer (last byte + 1)
- */
-
-static inline u64 ceph_decode_64(void **p)
-{
-	u64 v = get_unaligned_le64(*p);
-	*p += sizeof(u64);
-	return v;
-}
-static inline u32 ceph_decode_32(void **p)
-{
-	u32 v = get_unaligned_le32(*p);
-	*p += sizeof(u32);
-	return v;
-}
-static inline u16 ceph_decode_16(void **p)
-{
-	u16 v = get_unaligned_le16(*p);
-	*p += sizeof(u16);
-	return v;
-}
-static inline u8 ceph_decode_8(void **p)
-{
-	u8 v = *(u8 *)*p;
-	(*p)++;
-	return v;
-}
-static inline void ceph_decode_copy(void **p, void *pv, size_t n)
-{
-	memcpy(pv, *p, n);
-	*p += n;
-}
-
-/*
- * bounds check input.
- */
-#define ceph_decode_need(p, end, n, bad)		\
-	do {						\
-		if (unlikely(*(p) + (n) > (end)))	\
-			goto bad;			\
-	} while (0)
-
-#define ceph_decode_64_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u64), bad);	\
-		v = ceph_decode_64(p);				\
-	} while (0)
-#define ceph_decode_32_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u32), bad);	\
-		v = ceph_decode_32(p);				\
-	} while (0)
-#define ceph_decode_16_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u16), bad);	\
-		v = ceph_decode_16(p);				\
-	} while (0)
-#define ceph_decode_8_safe(p, end, v, bad)			\
-	do {							\
-		ceph_decode_need(p, end, sizeof(u8), bad);	\
-		v = ceph_decode_8(p);				\
-	} while (0)
-
-#define ceph_decode_copy_safe(p, end, pv, n, bad)	\
-	do {						\
-		ceph_decode_need(p, end, n, bad);	\
-		ceph_decode_copy(p, pv, n);		\
-	} while (0)
-
-/*
- * struct ceph_timespec <-> struct timespec
- */
-static inline void ceph_decode_timespec(struct timespec *ts,
-					const struct ceph_timespec *tv)
-{
-	ts->tv_sec = le32_to_cpu(tv->tv_sec);
-	ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
-}
-static inline void ceph_encode_timespec(struct ceph_timespec *tv,
-					const struct timespec *ts)
-{
-	tv->tv_sec = cpu_to_le32(ts->tv_sec);
-	tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
-}
-
-/*
- * sockaddr_storage <-> ceph_sockaddr
- */
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
-{
-	__be16 ss_family = htons(a->in_addr.ss_family);
-	a->in_addr.ss_family = *(__u16 *)&ss_family;
-}
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
-{
-	__be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
-	a->in_addr.ss_family = ntohs(ss_family);
-	WARN_ON(a->in_addr.ss_family == 512);
-}
-
-/*
- * encoders
- */
-static inline void ceph_encode_64(void **p, u64 v)
-{
-	put_unaligned_le64(v, (__le64 *)*p);
-	*p += sizeof(u64);
-}
-static inline void ceph_encode_32(void **p, u32 v)
-{
-	put_unaligned_le32(v, (__le32 *)*p);
-	*p += sizeof(u32);
-}
-static inline void ceph_encode_16(void **p, u16 v)
-{
-	put_unaligned_le16(v, (__le16 *)*p);
-	*p += sizeof(u16);
-}
-static inline void ceph_encode_8(void **p, u8 v)
-{
-	*(u8 *)*p = v;
-	(*p)++;
-}
-static inline void ceph_encode_copy(void **p, const void *s, int len)
-{
-	memcpy(*p, s, len);
-	*p += len;
-}
-
-/*
- * filepath, string encoders
- */
-static inline void ceph_encode_filepath(void **p, void *end,
-					u64 ino, const char *path)
-{
-	u32 len = path ? strlen(path) : 0;
-	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
-	ceph_encode_8(p, 1);
-	ceph_encode_64(p, ino);
-	ceph_encode_32(p, len);
-	if (len)
-		memcpy(*p, path, len);
-	*p += len;
-}
-
-static inline void ceph_encode_string(void **p, void *end,
-				      const char *s, u32 len)
-{
-	BUG_ON(*p + sizeof(len) + len > end);
-	ceph_encode_32(p, len);
-	if (len)
-		memcpy(*p, s, len);
-	*p += len;
-}
-
-#define ceph_encode_need(p, end, n, bad)		\
-	do {						\
-		if (unlikely(*(p) + (n) > (end)))	\
-			goto bad;			\
-	} while (0)
-
-#define ceph_encode_64_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u64), bad);	\
-		ceph_encode_64(p, v);				\
-	} while (0)
-#define ceph_encode_32_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u32), bad);	\
-		ceph_encode_32(p, v);				\
-	} while (0)
-#define ceph_encode_16_safe(p, end, v, bad)			\
-	do {							\
-		ceph_encode_need(p, end, sizeof(u16), bad);	\
-		ceph_encode_16(p, v);				\
-	} while (0)
-
-#define ceph_encode_copy_safe(p, end, pv, n, bad)	\
-	do {						\
-		ceph_encode_need(p, end, n, bad);	\
-		ceph_encode_copy(p, pv, n);		\
-	} while (0)
-
-
-#endif
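
The header above is removed because these helpers move, essentially
verbatim, to include/linux/ceph/decode.h; callers keep the same
cursor-and-bounds pattern. A minimal sketch of a caller, assuming a
hypothetical wire format of a u32 count followed by that many u64 values:

	static int parse_u64s(void *buf, size_t buflen, u64 *out, u32 max)
	{
		void *p = buf;
		void *end = buf + buflen;	/* last byte + 1 */
		u32 i, n;

		ceph_decode_32_safe(&p, end, n, bad);	/* bounds-checked */
		if (n > max)
			goto bad;
		ceph_decode_need(&p, end, n * sizeof(u64), bad);
		for (i = 0; i < n; i++)
			out[i] = ceph_decode_64(&p);	/* checked above */
		return 0;

	bad:
		return -ERANGE;		/* would run past end of buffer */
	}
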
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a1986eb52045..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * Directory operations: readdir, lookup, create, link, unlink,
@@ -39,12 +40,13 @@ int ceph_init_dentry(struct dentry *dentry)
 	if (dentry->d_fsdata)
 		return 0;
 
-	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
-		dentry->d_op = &ceph_dentry_ops;
+	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
+	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+		d_set_d_op(dentry, &ceph_dentry_ops);
 	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-		dentry->d_op = &ceph_snapdir_dentry_ops;
+		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
 	else
-		dentry->d_op = &ceph_snap_dentry_ops;
+		d_set_d_op(dentry, &ceph_snap_dentry_ops);
 
 	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
 	if (!di)
@@ -94,10 +96,7 @@ static unsigned fpos_off(loff_t p)
  */
 static int __dcache_readdir(struct file *filp,
 			    void *dirent, filldir_t filldir)
-	__releases(inode->i_lock)
-	__acquires(inode->i_lock)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_file_info *fi = filp->private_data;
 	struct dentry *parent = filp->f_dentry;
 	struct inode *dir = parent->d_inode;
@@ -113,11 +112,11 @@ static int __dcache_readdir(struct file *filp,
 	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
 	     last);
 
-	spin_lock(&dcache_lock);
+	spin_lock(&parent->d_lock);
 
 	/* start at beginning? */
-	if (filp->f_pos == 2 || (last &&
-				 filp->f_pos < ceph_dentry(last)->offset)) {
+	if (filp->f_pos == 2 || last == NULL ||
+	    filp->f_pos < ceph_dentry(last)->offset) {
 		if (list_empty(&parent->d_subdirs))
 			goto out_unlock;
 		p = parent->d_subdirs.prev;
@@ -137,6 +136,7 @@ more:
 			fi->at_end = 1;
 			goto out_unlock;
 		}
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 		if (!d_unhashed(dentry) && dentry->d_inode &&
 		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
 		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -146,14 +146,15 @@ more:
 		     dentry->d_name.len, dentry->d_name.name, di->offset,
 		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
 		     !dentry->d_inode ? " null" : "");
+		spin_unlock(&dentry->d_lock);
 		p = p->prev;
 		dentry = list_entry(p, struct dentry, d_u.d_child);
 		di = ceph_dentry(dentry);
 	}
 
-	atomic_inc(&dentry->d_count);
-	spin_unlock(&dcache_lock);
-	spin_unlock(&inode->i_lock);
+	dget_dlock(dentry);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
 
 	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +172,30 @@ more:
 		} else {
 			dput(last);
 		}
-		last = NULL;
 	}
-
-	spin_lock(&inode->i_lock);
-	spin_lock(&dcache_lock);
-
 	last = dentry;
 
 	if (err < 0)
-		goto out_unlock;
+		goto out;
 
-	p = p->prev;
 	filp->f_pos++;
 
-	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
-	if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
-		goto more;
-	dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
-	err = -EAGAIN;
+	/* make sure a dentry wasn't dropped while we didn't have parent lock */
+	if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
+		dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+		err = -EAGAIN;
+		goto out;
+	}
 
-out_unlock:
-	spin_unlock(&dcache_lock);
+	spin_lock(&parent->d_lock);
+	p = p->prev;	/* advance to next dentry */
+	goto more;
 
-	if (last) {
-		spin_unlock(&inode->i_lock);
+out_unlock:
+	spin_unlock(&parent->d_lock);
+out:
+	if (last)
 		dput(last);
-		spin_lock(&inode->i_lock);
-	}
-
 	return err;
 }
 
@@ -227,15 +223,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct ceph_file_info *fi = filp->private_data;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned frag = fpos_frag(filp->f_pos);
 	int off = fpos_off(filp->f_pos);
 	int err;
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
-	const int max_entries = client->mount_args->max_readdir;
-	const int max_bytes = client->mount_args->max_readdir_bytes;
+	const int max_entries = fsc->mount_options->max_readdir;
+	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -267,17 +263,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* can we use the dcache? */
 	spin_lock(&inode->i_lock);
 	if ((filp->f_pos == 2 || fi->dentry) &&
-	    !ceph_test_opt(client, NOASYNCREADDIR) &&
+	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		spin_unlock(&inode->i_lock);
 		err = __dcache_readdir(filp, dirent, filldir);
-		if (err != -EAGAIN) {
-			spin_unlock(&inode->i_lock);
+		if (err != -EAGAIN)
 			return err;
-		}
+	} else {
+		spin_unlock(&inode->i_lock);
 	}
-	spin_unlock(&inode->i_lock);
 	if (fi->dentry) {
 		err = note_last_dentry(fi, fi->dentry->d_name.name,
 				       fi->dentry->d_name.len);
@@ -344,7 +340,10 @@ more:
 		if (req->r_reply_info.dir_end) {
 			kfree(fi->last_name);
 			fi->last_name = NULL;
-			fi->next_offset = 2;
+			if (ceph_frag_is_rightmost(frag))
+				fi->next_offset = 2;
+			else
+				fi->next_offset = 0;
 		} else {
 			rinfo = &req->r_reply_info;
 			err = note_last_dentry(fi,
@@ -363,18 +362,22 @@ more:
 		u64 pos = ceph_make_fpos(frag, off);
 		struct ceph_mds_reply_inode *in =
 			rinfo->dir_in[off - fi->offset].in;
+		struct ceph_vino vino;
+		ino_t ino;
+
 		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
 		     off, off - fi->offset, rinfo->dir_nr, pos,
 		     rinfo->dir_dname_len[off - fi->offset],
 		     rinfo->dir_dname[off - fi->offset], in);
 		BUG_ON(!in);
 		ftype = le32_to_cpu(in->mode) >> 12;
+		vino.ino = le64_to_cpu(in->ino);
+		vino.snap = le64_to_cpu(in->snapid);
+		ino = ceph_vino_to_ino(vino);
 		if (filldir(dirent,
 			    rinfo->dir_dname[off - fi->offset],
 			    rinfo->dir_dname_len[off - fi->offset],
-			    pos,
-			    le64_to_cpu(in->ino),
-			    ftype) < 0) {
+			    pos, ino, ftype) < 0) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
@@ -422,6 +425,7 @@ static void reset_readdir(struct ceph_file_info *fi)
 		fi->last_readdir = NULL;
 	}
 	kfree(fi->last_name);
+	fi->last_name = NULL;
 	fi->next_offset = 2;  /* compensate for . and .. */
 	if (fi->dentry) {
 		dput(fi->dentry);
@@ -487,14 +491,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 				  struct dentry *dentry, int err)
 {
-	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
 	struct inode *parent = dentry->d_parent->d_inode;
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
-	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
 	    strcmp(dentry->d_name.name,
-		   client->mount_args->snapdir_name) == 0) {
+		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +542,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 				  struct nameidata *nd)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int op;
 	int err;
@@ -572,7 +575,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 	spin_lock(&dir->i_lock);
 	dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
 	if (strncmp(dentry->d_name.name,
-		    client->mount_args->snapdir_name,
+		    fsc->mount_options->snapdir_name,
 		    dentry->d_name.len) &&
 	    !is_root_ceph_dentry(dir, dentry) &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +632,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
 		      int mode, dev_t rdev)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -685,8 +688,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 			const char *dest)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -716,8 +719,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 
 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
 	int op;
@@ -758,8 +761,8 @@ out:
 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 		     struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -813,8 +816,8 @@ static int drop_caps_for_unlink(struct inode *inode)
  */
 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = dentry->d_inode;
 	struct ceph_mds_request *req;
 	int err = -EROFS;
@@ -854,8 +857,8 @@ out:
 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
 
@@ -987,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
  */
 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *dir = dentry->d_parent->d_inode;
+	struct inode *dir;
+
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	dir = dentry->d_parent->d_inode;
 
 	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
 	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
@@ -1076,7 +1084,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
 
-	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
 	if (!cf->dir_info) {
@@ -1177,7 +1185,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
 	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_add_tail(&di->lru, &mdsc->dentry_lru);
 		mdsc->num_dentry++;
@@ -1193,7 +1201,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
 	     dn->d_name.len, dn->d_name.name, di->offset);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_move_tail(&di->lru, &mdsc->dentry_lru);
 		spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1216,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
 	     dn->d_name.len, dn->d_name.name);
 	if (di) {
-		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
 		list_del_init(&di->lru);
 		mdsc->num_dentry--;
@@ -1216,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	}
 }
 
+/*
+ * Return name hash for a given dentry.  This is dependent on
+ * the parent directory's hash function.
+ */
+unsigned ceph_dentry_hash(struct dentry *dn)
+{
+	struct inode *dir = dn->d_parent->d_inode;
+	struct ceph_inode_info *dci = ceph_inode(dir);
+
+	switch (dci->i_dir_layout.dl_dir_hash) {
+	case 0:	/* for backward compat */
+	case CEPH_STR_HASH_LINUX:
+		return dn->d_name.hash;
+
+	default:
+		return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+				     dn->d_name.name, dn->d_name.len);
+	}
+}
+
 const struct file_operations ceph_dir_fops = {
 	.read = ceph_read_dir,
 	.readdir = ceph_readdir,
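
A note on the readdir hunks above: f_pos on a ceph directory packs the
fragment in the high 32 bits and the offset within that fragment in the
low 32 bits, which is why fi->next_offset is reset to 2 (to account for
. and ..) only on the rightmost fragment. A sketch of the encoding
helpers these hunks rely on, assuming they keep their usual fs/ceph
definitions:

	static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
	{
		return ((loff_t)frag << 32) | (loff_t)off;
	}

	static unsigned fpos_frag(loff_t p)
	{
		return p >> 32;		/* which directory fragment */
	}

	static unsigned fpos_off(loff_t p)
	{
		return p & 0xffffffff;	/* position within the fragment */
	}
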
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e38423e82f2e..e41056174bf8 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/exportfs.h>
 #include <linux/slab.h>
 #include <asm/unaligned.h>
 
 #include "super.h"
+#include "mds_client.h"
 
 /*
  * NFS export support
@@ -58,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 		dout("encode_fh %p connectable\n", dentry);
 		cfh->ino = ceph_ino(dentry->d_inode);
 		cfh->parent_ino = ceph_ino(parent->d_inode);
-		cfh->parent_name_hash = parent->d_name.hash;
+		cfh->parent_name_hash = ceph_dentry_hash(parent);
 		*max_len = connected_handle_length;
 		type = 2;
 	} else if (*max_len >= handle_length) {
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
 				      struct ceph_nfs_confh *cfh)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
 	struct dentry *dentry;
 	struct ceph_vino vino;
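
The encode_fh change matters because the parent name hash baked into a
connectable handle must be reproducible across reboots and clients:
parent->d_name.hash is an in-memory dcache value, while ceph_dentry_hash()
(added in dir.c above) derives the hash from the directory's advertised
hash function. For reference, the connectable handle layout is roughly as
follows (a sketch; the authoritative struct lives in export.c):

	/* connectable file handle: enough to find the inode and,
	 * failing that, to look it up by (parent_ino, name hash) */
	struct ceph_nfs_confh {
		u64 ino, parent_ino;
		u32 parent_name_hash;
	} __attribute__ ((packed));
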
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66e4da6dba22..7d0e4a82d898 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
+#include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -38,8 +39,8 @@
 static struct ceph_mds_request *
 prepare_open_request(struct super_block *sb, int flags, int create_mode)
 {
-	struct ceph_client *client = ceph_sb_to_client(sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int want_auth = USE_ANY_MDS;
 	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 int ceph_open(struct inode *inode, struct file *file)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
 	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -153,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
 	}
 
 	/*
-	 * No need to block if we have any caps.  Update wanted set
+	 * No need to block if we have caps on the auth MDS (for
+	 * write) or any MDS (for read).  Update wanted set
 	 * asynchronously.
 	 */
 	spin_lock(&inode->i_lock);
-	if (__ceph_is_any_real_caps(ci)) {
+	if (__ceph_is_any_real_caps(ci) &&
+	    (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
 		int mds_wanted = __ceph_caps_mds_wanted(ci);
 		int issued = __ceph_caps_issued(ci, NULL);
 
@@ -216,8 +219,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 			     struct nameidata *nd, int mode,
 			     int locked_dir)
 {
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct file *file = nd->intent.open.file;
 	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
 	struct ceph_mds_request *req;
@@ -270,163 +273,6 @@ int ceph_release(struct inode *inode, struct file *file)
 }
 
 /*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
-					    int num_pages,
-					    loff_t off, size_t len)
-{
-	struct page **pages;
-	int rc;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	down_read(&current->mm->mmap_sem);
-	rc = get_user_pages(current, current->mm, (unsigned long)data,
-			    num_pages, 0, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-	if (rc < 0)
-		goto fail;
-	return pages;
-
-fail:
-	kfree(pages);
-	return ERR_PTR(rc);
-}
-
-static void put_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		put_page(pages[i]);
-	kfree(pages);
-}
-
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
-	int i;
-
-	for (i = 0; i < num_pages; i++)
-		__free_pages(pages[i], 0);
-	kfree(pages);
-}
-
-/*
- * allocate a vector new pages
- */
-static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
-	struct page **pages;
-	int i;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, flags);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < num_pages; i++) {
-		pages[i] = __page_cache_alloc(flags);
-		if (pages[i] == NULL) {
-			ceph_release_page_vector(pages, i);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-	return pages;
-}
-
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
-				    const char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, PAGE_CACHE_SIZE-po, left);
-		bad = copy_from_user(page_address(pages[i]) + po, data, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		po += l - bad;
-		if (po == PAGE_CACHE_SIZE) {
-			po = 0;
-			i++;
-		}
-	}
-	return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, left, PAGE_CACHE_SIZE-po);
-		bad = copy_to_user(data, page_address(pages[i]) + po, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		if (po) {
-			po += l - bad;
-			if (po == PAGE_CACHE_SIZE)
-				po = 0;
-		}
-		i++;
-	}
-	return len;
-}
-
-/*
- * Zero an extent within a page vector.  Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
-	int i = off >> PAGE_CACHE_SHIFT;
-
-	off &= ~PAGE_CACHE_MASK;
-
-	dout("zero_page_vector_page %u~%u\n", off, len);
-
-	/* leading partial page? */
-	if (off) {
-		int end = min((int)PAGE_CACHE_SIZE, off + len);
-		dout("zeroing %d %p head from %d\n", i, pages[i],
-		     (int)off);
-		zero_user_segment(pages[i], off, end);
-		len -= (end - off);
-		i++;
-	}
-	while (len >= PAGE_CACHE_SIZE) {
-		dout("zeroing %d %p len=%d\n", i, pages[i], len);
-		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-		i++;
-	}
-	/* trailing partial page? */
-	if (len) {
-		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
-		zero_user_segment(pages[i], 0, len);
-	}
-}
-
-
-/*
  * Read a range of bytes striped over one or more objects.  Iterate over
  * objects we stripe over.  (That's not atomic, but good enough for now.)
  *
@@ -436,11 +282,13 @@ static void zero_page_vector_range(int off, int len, struct page **pages)
 static int striped_read(struct inode *inode,
 			u64 off, u64 len,
 			struct page **pages, int num_pages,
-			int *checkeof)
+			int *checkeof, bool align_to_pages,
+			unsigned long buf_align)
 {
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 pos, this_len;
+	int io_align, page_align;
 	int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
 	int left, pages_left;
 	int read;
@@ -456,14 +304,19 @@ static int striped_read(struct inode *inode,
 	page_pos = pages;
 	pages_left = num_pages;
 	read = 0;
+	io_align = off & ~PAGE_MASK;
 
 more:
+	if (align_to_pages)
+		page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+	else
+		page_align = pos & ~PAGE_MASK;
 	this_len = left;
-	ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
 				  &ci->i_layout, pos, &this_len,
 				  ci->i_truncate_seq,
 				  ci->i_truncate_size,
-				  page_pos, pages_left);
+				  page_pos, pages_left, page_align);
 	hit_stripe = this_len < left;
 	was_short = ret >= 0 && ret < this_len;
 	if (ret == -ENOENT)
@@ -477,8 +330,8 @@ more:
 
 		if (read < pos - off) {
 			dout(" zero gap %llu to %llu\n", off + read, pos);
-			zero_page_vector_range(page_off + read,
-					       pos - off - read, pages);
+			ceph_zero_page_vector_range(page_off + read,
+						    pos - off - read, pages);
 		}
 		pos += ret;
 		read = pos - off;
@@ -495,8 +348,8 @@ more:
 		/* was original extent fully inside i_size? */
 		if (pos + left <= inode->i_size) {
 			dout("zero tail\n");
-			zero_page_vector_range(page_off + read, len - read,
-					       pages);
+			ceph_zero_page_vector_range(page_off + read, len - read,
						    pages);
 			read = len;
 			goto out;
 		}
@@ -524,41 +377,43 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	struct inode *inode = file->f_dentry->d_inode;
 	struct page **pages;
 	u64 off = *poff;
-	int num_pages = calc_pages_for(off, len);
-	int ret;
+	int num_pages, ret;
 
 	dout("sync_read on file %p %llu~%u %s\n", file, off, len,
 	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
 
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, off, len);
-
-		/*
-		 * flush any page cache pages in this range.  this
-		 * will make concurrent normal and O_DIRECT io slow,
-		 * but it will at least behave sensibly when they are
-		 * in sequence.
-		 */
+		num_pages = calc_pages_for((unsigned long)data, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, true);
 	} else {
+		num_pages = calc_pages_for(off, len);
 		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
 	}
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
+	/*
+	 * flush any page cache pages in this range.  this
+	 * will make concurrent normal and sync io slow,
+	 * but it will at least behave sensibly when they are
+	 * in sequence.
+	 */
 	ret = filemap_write_and_wait(inode->i_mapping);
 	if (ret < 0)
 		goto done;
 
-	ret = striped_read(inode, off, len, pages, num_pages, checkeof);
+	ret = striped_read(inode, off, len, pages, num_pages, checkeof,
+			   file->f_flags & O_DIRECT,
+			   (unsigned long)data & ~PAGE_MASK);
 
 	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-		ret = copy_page_vector_to_user(pages, data, off, ret);
+		ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
 	if (ret >= 0)
 		*poff = off + ret;
 
done:
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages, true);
 	else
 		ceph_release_page_vector(pages, num_pages);
 	dout("sync_read result %d\n", ret);
@@ -594,7 +449,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_osd_request *req;
 	struct page **pages;
 	int num_pages;
@@ -604,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	int flags;
 	int do_sync = 0;
 	int check_caps = 0;
+	int page_align, io_align;
+	unsigned long buf_align;
 	int ret;
 	struct timespec mtime = CURRENT_TIME;
 
@@ -618,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	else
 		pos = *offset;
 
+	io_align = pos & ~PAGE_MASK;
+	buf_align = (unsigned long)data & ~PAGE_MASK;
+
 	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
 	if (ret < 0)
 		return ret;
@@ -642,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	 */
more:
 	len = left;
-	req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+	if (file->f_flags & O_DIRECT) {
+		/* write from beginning of first page, regardless of
+		   io alignment */
+		page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+		num_pages = calc_pages_for((unsigned long)data, len);
+	} else {
+		page_align = pos & ~PAGE_MASK;
+		num_pages = calc_pages_for(pos, len);
+	}
+	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
 				    ceph_vino(inode), pos, &len,
 				    CEPH_OSD_OP_WRITE, flags,
 				    ci->i_snap_realm->cached_context,
 				    do_sync,
 				    ci->i_truncate_seq, ci->i_truncate_size,
-				    &mtime, false, 2);
+				    &mtime, false, 2, page_align);
 	if (!req)
 		return -ENOMEM;
 
-	num_pages = calc_pages_for(pos, len);
-
 	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, pos, len);
+		pages = ceph_get_direct_page_vector(data, num_pages, false);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
 			goto out;
@@ -673,7 +540,7 @@ more:
 			ret = PTR_ERR(pages);
 			goto out;
 		}
-		ret = copy_user_to_page_vector(pages, data, pos, len);
+		ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
 		if (ret < 0) {
 			ceph_release_page_vector(pages, num_pages);
 			goto out;
@@ -689,7 +556,7 @@ more:
 	req->r_num_pages = num_pages;
 	req->r_inode = inode;
 
-	ret = ceph_osdc_start_request(&client->osdc, req, false);
+	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!ret) {
 		if (req->r_safe_callback) {
 			/*
@@ -701,11 +568,11 @@ more:
 			spin_unlock(&ci->i_unsafe_lock);
 			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
 		}
-		ret = ceph_osdc_wait_request(&client->osdc, req);
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 	}
 
 	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
+		ceph_put_page_vector(pages, num_pages, false);
 	else if (file->f_flags & O_SYNC)
 		ceph_release_page_vector(pages, num_pages);
 
@@ -814,7 +681,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	struct ceph_osd_client *osdc =
+		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	loff_t endoff = pos + iov->iov_len;
 	int want, got = 0;
 	int ret, err;
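
The alignment plumbing added above is the subtle part of this patch: the
OSD layer now needs to know where within the first page the payload
starts. For buffered I/O that is simply pos & ~PAGE_MASK, but for
O_DIRECT the pages are pinned straight out of the user buffer, so the
in-page offset follows the buffer address rather than the file position.
A worked example, assuming 4 KiB pages (so ~PAGE_MASK == 0xfff):

	/* O_DIRECT read/write at file pos 0x1234 from a user buffer
	 * whose address ends in 0x678:
	 *
	 *   io_align  = pos  & ~PAGE_MASK = 0x234  (in-page offset of pos)
	 *   buf_align = data & ~PAGE_MASK = 0x678  (in-page offset of buf)
	 *
	 *   page_align = (pos - io_align + buf_align) & ~PAGE_MASK
	 *              = (0x1234 - 0x234 + 0x678) & 0xfff = 0x678
	 *
	 * i.e. the payload starts 0x678 bytes into the first pinned page.
	 * As a striped request advances pos to later objects, the same
	 * formula keeps tracking the user buffer's alignment. */
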
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 62377ec37edf..5625463aa479 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,8 +1,7 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7#include <linux/string.h> 6#include <linux/string.h>
8#include <linux/uaccess.h> 7#include <linux/uaccess.h>
@@ -13,7 +12,8 @@
13#include <linux/pagevec.h> 12#include <linux/pagevec.h>
14 13
15#include "super.h" 14#include "super.h"
16#include "decode.h" 15#include "mds_client.h"
16#include <linux/ceph/decode.h>
17 17
18/* 18/*
19 * Ceph inode operations 19 * Ceph inode operations
@@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
297 ci->i_release_count = 0; 297 ci->i_release_count = 0;
298 ci->i_symlink = NULL; 298 ci->i_symlink = NULL;
299 299
300 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
301
300 ci->i_fragtree = RB_ROOT; 302 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex); 303 mutex_init(&ci->i_fragtree_mutex);
302 304
@@ -368,6 +370,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
368 return &ci->vfs_inode; 370 return &ci->vfs_inode;
369} 371}
370 372
373static void ceph_i_callback(struct rcu_head *head)
374{
375 struct inode *inode = container_of(head, struct inode, i_rcu);
376 struct ceph_inode_info *ci = ceph_inode(inode);
377
378 INIT_LIST_HEAD(&inode->i_dentry);
379 kmem_cache_free(ceph_inode_cachep, ci);
380}
381
371void ceph_destroy_inode(struct inode *inode) 382void ceph_destroy_inode(struct inode *inode)
372{ 383{
373 struct ceph_inode_info *ci = ceph_inode(inode); 384 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -384,7 +395,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 395 */
385 if (ci->i_snap_realm) { 396 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 397 struct ceph_mds_client *mdsc =
387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 398 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 399 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 400
390 dout(" dropping residual ref to snap realm %p\n", realm); 401 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -407,7 +418,7 @@ void ceph_destroy_inode(struct inode *inode)
407 if (ci->i_xattrs.prealloc_blob) 418 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 419 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409 420
410 kmem_cache_free(ceph_inode_cachep, ci); 421 call_rcu(&inode->i_rcu, ceph_i_callback);
411} 422}
412 423
413 424
@@ -470,7 +481,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
470 481
471 if (issued & (CEPH_CAP_FILE_EXCL| 482 if (issued & (CEPH_CAP_FILE_EXCL|
472 CEPH_CAP_FILE_WR| 483 CEPH_CAP_FILE_WR|
473 CEPH_CAP_FILE_BUFFER)) { 484 CEPH_CAP_FILE_BUFFER|
485 CEPH_CAP_AUTH_EXCL|
486 CEPH_CAP_XATTR_EXCL)) {
474 if (timespec_compare(ctime, &inode->i_ctime) > 0) { 487 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
475 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", 488 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
476 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, 489 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -510,7 +523,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
510 warn = 1; 523 warn = 1;
511 } 524 }
512 } else { 525 } else {
513 /* we have no write caps; whatever the MDS says is true */ 526 /* we have no write|excl caps; whatever the MDS says is true */
514 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { 527 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
515 inode->i_ctime = *ctime; 528 inode->i_ctime = *ctime;
516 inode->i_mtime = *mtime; 529 inode->i_mtime = *mtime;
@@ -566,12 +579,17 @@ static int fill_inode(struct inode *inode,
566 579
567 /* 580 /*
568 * provided version will be odd if inode value is projected, 581 * provided version will be odd if inode value is projected,
569 * even if stable. skip the update if we have a newer info 582 * even if stable. skip the update if we have newer stable
570 * (e.g., due to inode info racing form multiple MDSs), or if 583 * info (ours>=theirs, e.g. due to racing mds replies), unless
571 * we are getting projected (unstable) inode info. 584 * we are getting projected (unstable) info (in which case the
585 * version is odd, and we want ours>theirs).
586 * us them
587 * 2 2 skip
588 * 3 2 skip
589 * 3 3 update
572 */ 590 */
573 if (le64_to_cpu(info->version) > 0 && 591 if (le64_to_cpu(info->version) > 0 &&
574 (ci->i_version & ~1) > le64_to_cpu(info->version)) 592 (ci->i_version & ~1) >= le64_to_cpu(info->version))
575 goto no_change; 593 goto no_change;
576 594
577 issued = __ceph_caps_issued(ci, &implemented); 595 issued = __ceph_caps_issued(ci, &implemented);
@@ -605,7 +623,14 @@ static int fill_inode(struct inode *inode,
605 le32_to_cpu(info->time_warp_seq), 623 le32_to_cpu(info->time_warp_seq),
606 &ctime, &mtime, &atime); 624 &ctime, &mtime, &atime);
607 625
608 ci->i_max_size = le64_to_cpu(info->max_size); 626 /* only update max_size on auth cap */
627 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
628 ci->i_max_size != le64_to_cpu(info->max_size)) {
629 dout("max_size %lld -> %llu\n", ci->i_max_size,
630 le64_to_cpu(info->max_size));
631 ci->i_max_size = le64_to_cpu(info->max_size);
632 }
633
609 ci->i_layout = info->layout; 634 ci->i_layout = info->layout;
610 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 635 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
611 636
@@ -666,6 +691,8 @@ static int fill_inode(struct inode *inode,
666 inode->i_op = &ceph_dir_iops; 691 inode->i_op = &ceph_dir_iops;
667 inode->i_fop = &ceph_dir_fops; 692 inode->i_fop = &ceph_dir_fops;
668 693
694 ci->i_dir_layout = iinfo->dir_layout;
695
669 ci->i_files = le64_to_cpu(info->files); 696 ci->i_files = le64_to_cpu(info->files);
670 ci->i_subdirs = le64_to_cpu(info->subdirs); 697 ci->i_subdirs = le64_to_cpu(info->subdirs);
671 ci->i_rbytes = le64_to_cpu(info->rbytes); 698 ci->i_rbytes = le64_to_cpu(info->rbytes);
@@ -683,10 +710,6 @@ static int fill_inode(struct inode *inode,
683 ci->i_ceph_flags |= CEPH_I_COMPLETE; 710 ci->i_ceph_flags |= CEPH_I_COMPLETE;
684 ci->i_max_offset = 2; 711 ci->i_max_offset = 2;
685 } 712 }
686
687 /* it may be better to set st_size in getattr instead? */
688 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
689 inode->i_size = ci->i_rbytes;
690 break; 713 break;
691 default: 714 default:
692 pr_err("fill_inode %llx.%llx BAD mode 0%o\n", 715 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -827,13 +850,13 @@ static void ceph_set_dentry_offset(struct dentry *dn)
827 di->offset = ceph_inode(inode)->i_max_offset++; 850 di->offset = ceph_inode(inode)->i_max_offset++;
828 spin_unlock(&inode->i_lock); 851 spin_unlock(&inode->i_lock);
829 852
830 spin_lock(&dcache_lock); 853 spin_lock(&dir->d_lock);
831 spin_lock(&dn->d_lock); 854 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
832 list_move(&dn->d_u.d_child, &dir->d_subdirs); 855 list_move(&dn->d_u.d_child, &dir->d_subdirs);
833 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, 856 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
834 dn->d_u.d_child.prev, dn->d_u.d_child.next); 857 dn->d_u.d_child.prev, dn->d_u.d_child.next);
835 spin_unlock(&dn->d_lock); 858 spin_unlock(&dn->d_lock);
836 spin_unlock(&dcache_lock); 859 spin_unlock(&dir->d_lock);
837} 860}
838 861
839/* 862/*
@@ -865,8 +888,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
865 } else if (realdn) { 888 } else if (realdn) {
866 dout("dn %p (%d) spliced with %p (%d) " 889 dout("dn %p (%d) spliced with %p (%d) "
867 "inode %p ino %llx.%llx\n", 890 "inode %p ino %llx.%llx\n",
868 dn, atomic_read(&dn->d_count), 891 dn, dn->d_count,
869 realdn, atomic_read(&realdn->d_count), 892 realdn, realdn->d_count,
870 realdn->d_inode, ceph_vinop(realdn->d_inode)); 893 realdn->d_inode, ceph_vinop(realdn->d_inode));
871 dput(dn); 894 dput(dn);
872 dn = realdn; 895 dn = realdn;
@@ -901,7 +924,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
901 struct inode *in = NULL; 924 struct inode *in = NULL;
902 struct ceph_mds_reply_inode *ininfo; 925 struct ceph_mds_reply_inode *ininfo;
903 struct ceph_vino vino; 926 struct ceph_vino vino;
904 struct ceph_client *client = ceph_sb_to_client(sb); 927 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
905 int i = 0; 928 int i = 0;
906 int err = 0; 929 int err = 0;
907 930
@@ -965,7 +988,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
965 */ 988 */
966 if (rinfo->head->is_dentry && !req->r_aborted && 989 if (rinfo->head->is_dentry && !req->r_aborted &&
967 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 990 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
968 client->mount_args->snapdir_name, 991 fsc->mount_options->snapdir_name,
969 req->r_dentry->d_name.len))) { 992 req->r_dentry->d_name.len))) {
970 /* 993 /*
971 * lookup link rename : null -> possibly existing inode 994 * lookup link rename : null -> possibly existing inode
@@ -1054,7 +1077,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1054 ininfo = rinfo->targeti.in; 1077 ininfo = rinfo->targeti.in;
1055 vino.ino = le64_to_cpu(ininfo->ino); 1078 vino.ino = le64_to_cpu(ininfo->ino);
1056 vino.snap = le64_to_cpu(ininfo->snapid); 1079 vino.snap = le64_to_cpu(ininfo->snapid);
1057 if (!dn->d_inode) { 1080 in = dn->d_inode;
1081 if (!in) {
1058 in = ceph_get_inode(sb, vino); 1082 in = ceph_get_inode(sb, vino);
1059 if (IS_ERR(in)) { 1083 if (IS_ERR(in)) {
1060 pr_err("fill_trace bad get_inode " 1084 pr_err("fill_trace bad get_inode "
@@ -1216,11 +1240,11 @@ retry_lookup:
1216 goto retry_lookup; 1240 goto retry_lookup;
1217 } else { 1241 } else {
1218 /* reorder parent's d_subdirs */ 1242 /* reorder parent's d_subdirs */
1219 spin_lock(&dcache_lock); 1243 spin_lock(&parent->d_lock);
1220 spin_lock(&dn->d_lock); 1244 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
1221 list_move(&dn->d_u.d_child, &parent->d_subdirs); 1245 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1222 spin_unlock(&dn->d_lock); 1246 spin_unlock(&dn->d_lock);
1223 spin_unlock(&dcache_lock); 1247 spin_unlock(&parent->d_lock);
1224 } 1248 }
1225 1249
1226 di = dn->d_fsdata; 1250 di = dn->d_fsdata;
@@ -1385,11 +1409,8 @@ static void ceph_invalidate_work(struct work_struct *work)
1385 spin_lock(&inode->i_lock); 1409 spin_lock(&inode->i_lock);
1386 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1410 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1387 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1411 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1388 if (ci->i_rdcache_gen == 0 || 1412 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1389 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1390 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1391 /* nevermind! */ 1413 /* nevermind! */
1392 ci->i_rdcache_revoking = 0;
1393 spin_unlock(&inode->i_lock); 1414 spin_unlock(&inode->i_lock);
1394 goto out; 1415 goto out;
1395 } 1416 }
@@ -1399,15 +1420,16 @@ static void ceph_invalidate_work(struct work_struct *work)
1399 ceph_invalidate_nondirty_pages(inode->i_mapping); 1420 ceph_invalidate_nondirty_pages(inode->i_mapping);
1400 1421
1401 spin_lock(&inode->i_lock); 1422 spin_lock(&inode->i_lock);
1402 if (orig_gen == ci->i_rdcache_gen) { 1423 if (orig_gen == ci->i_rdcache_gen &&
1424 orig_gen == ci->i_rdcache_revoking) {
1403 dout("invalidate_pages %p gen %d successful\n", inode, 1425 dout("invalidate_pages %p gen %d successful\n", inode,
1404 ci->i_rdcache_gen); 1426 ci->i_rdcache_gen);
1405 ci->i_rdcache_gen = 0; 1427 ci->i_rdcache_revoking--;
1406 ci->i_rdcache_revoking = 0;
1407 check = 1; 1428 check = 1;
1408 } else { 1429 } else {
1409 dout("invalidate_pages %p gen %d raced, gen now %d\n", 1430 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
1410 inode, orig_gen, ci->i_rdcache_gen); 1431 inode, orig_gen, ci->i_rdcache_gen,
1432 ci->i_rdcache_revoking);
1411 } 1433 }
1412 spin_unlock(&inode->i_lock); 1434 spin_unlock(&inode->i_lock);
1413 1435
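
The reworked check above ties the invalidation to a matching (gen, revoking) pair and decrements i_rdcache_revoking on success instead of zeroing both counters, so a revoke queued while the invalidation ran is not lost. A minimal standalone sketch of that pattern (plain ints stand in for the ceph_inode_info fields; the inode spinlock is elided):

	#include <stdio.h>

	struct cache_state {
		int gen;       /* bumped whenever new pages may be cached */
		int revoking;  /* set to gen when an invalidate is queued */
	};

	/* returns 1 if this invalidation "won", 0 if it raced and must
	 * leave the pending-revoke state alone */
	static int finish_invalidate(struct cache_state *s, int orig_gen)
	{
		if (orig_gen == s->gen && orig_gen == s->revoking) {
			s->revoking--;   /* mark this revoke satisfied */
			return 1;
		}
		return 0;                /* a newer gen appeared meanwhile */
	}

	int main(void)
	{
		struct cache_state s = { .gen = 3, .revoking = 3 };
		int orig = s.gen;

		s.gen++;                                      /* simulate a racing read */
		printf("%d\n", finish_invalidate(&s, orig));  /* 0: raced */
		return 0;
	}
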
@@ -1533,7 +1555,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1533 struct inode *parent_inode = dentry->d_parent->d_inode; 1555 struct inode *parent_inode = dentry->d_parent->d_inode;
1534 const unsigned int ia_valid = attr->ia_valid; 1556 const unsigned int ia_valid = attr->ia_valid;
1535 struct ceph_mds_request *req; 1557 struct ceph_mds_request *req;
1536 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; 1558 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
1537 int issued; 1559 int issued;
1538 int release = 0, dirtied = 0; 1560 int release = 0, dirtied = 0;
1539 int mask = 0; 1561 int mask = 0;
@@ -1728,8 +1750,8 @@ out:
1728 */ 1750 */
1729int ceph_do_getattr(struct inode *inode, int mask) 1751int ceph_do_getattr(struct inode *inode, int mask)
1730{ 1752{
1731 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 1753 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
1732 struct ceph_mds_client *mdsc = &client->mdsc; 1754 struct ceph_mds_client *mdsc = fsc->mdsc;
1733 struct ceph_mds_request *req; 1755 struct ceph_mds_request *req;
1734 int err; 1756 int err;
1735 1757
@@ -1738,7 +1760,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
1738 return 0; 1760 return 0;
1739 } 1761 }
1740 1762
1741 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); 1763 dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
1742 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1764 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1743 return 0; 1765 return 0;
1744 1766
@@ -1759,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
1759 * Check inode permissions. We verify we have a valid value for 1781 * Check inode permissions. We verify we have a valid value for
1760 * the AUTH cap, then call the generic handler. 1782 * the AUTH cap, then call the generic handler.
1761 */ 1783 */
1762int ceph_permission(struct inode *inode, int mask) 1784int ceph_permission(struct inode *inode, int mask, unsigned int flags)
1763{ 1785{
1764 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); 1786 int err;
1787
1788 if (flags & IPERM_FLAG_RCU)
1789 return -ECHILD;
1790
1791 err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1765 1792
1766 if (!err) 1793 if (!err)
1767 err = generic_permission(inode, mask, NULL); 1794 err = generic_permission(inode, mask, flags, NULL);
1768 return err; 1795 return err;
1769} 1796}
1770 1797
@@ -1788,7 +1815,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1788 else 1815 else
1789 stat->dev = 0; 1816 stat->dev = 0;
1790 if (S_ISDIR(inode->i_mode)) { 1817 if (S_ISDIR(inode->i_mode)) {
1791 stat->size = ci->i_rbytes; 1818 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
1819 RBYTES))
1820 stat->size = ci->i_rbytes;
1821 else
1822 stat->size = ci->i_files + ci->i_subdirs;
1792 stat->blocks = 0; 1823 stat->blocks = 0;
1793 stat->blksize = 65536; 1824 stat->blksize = 65536;
1794 } 1825 }
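
With the new mount-option test, st_size of a directory reports either the recursive byte count or the plain entry count. A trivial standalone sketch of that choice (the struct is a stand-in for the ceph_inode_info counters):

	#include <stdint.h>
	#include <stdio.h>

	struct dir_stats {
		uint64_t rbytes;    /* recursive size of everything below */
		uint64_t files;
		uint64_t subdirs;
	};

	static uint64_t dir_stat_size(const struct dir_stats *d, int opt_rbytes)
	{
		/* mirrors the hunk above: rbytes if the mount option is
		 * set, otherwise the number of entries */
		return opt_rbytes ? d->rbytes : d->files + d->subdirs;
	}

	int main(void)
	{
		struct dir_stats d = { .rbytes = 1 << 20, .files = 10, .subdirs = 2 };

		printf("%llu %llu\n",
		       (unsigned long long)dir_stat_size(&d, 1),
		       (unsigned long long)dir_stat_size(&d, 0));
		return 0;
	}
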
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba1..8888c9ba68db 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
1#include <linux/in.h> 1#include <linux/in.h>
2 2
3#include "ioctl.h"
4#include "super.h" 3#include "super.h"
5#include "ceph_debug.h" 4#include "mds_client.h"
5#include <linux/ceph/ceph_debug.h>
6
7#include "ioctl.h"
6 8
7 9
8/* 10/*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{ 39{
38 struct inode *inode = file->f_dentry->d_inode; 40 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 41 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req; 43 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l; 44 struct ceph_ioctl_layout l;
43 int err, i; 45 int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
90} 92}
91 93
92/* 94/*
95 * Set a layout policy on a directory inode. All items in the tree
 96 * rooted at this inode will inherit this layout on creation
 97 * (it doesn't apply retroactively),
 98 * unless a subdirectory has its own layout policy.
99 */
100static long ceph_ioctl_set_layout_policy(struct file *file, void __user *arg)
101{
102 struct inode *inode = file->f_dentry->d_inode;
103 struct ceph_mds_request *req;
104 struct ceph_ioctl_layout l;
105 int err, i;
106 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
107
108 /* copy and validate */
109 if (copy_from_user(&l, arg, sizeof(l)))
110 return -EFAULT;
111
112 if ((l.object_size & ~PAGE_MASK) ||
113 (l.stripe_unit & ~PAGE_MASK) ||
114 !l.stripe_unit ||
115 (l.object_size &&
116 (unsigned)l.object_size % (unsigned)l.stripe_unit))
117 return -EINVAL;
118
119 /* make sure it's a valid data pool */
120 if (l.data_pool > 0) {
121 mutex_lock(&mdsc->mutex);
122 err = -EINVAL;
123 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
124 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
125 err = 0;
126 break;
127 }
128 mutex_unlock(&mdsc->mutex);
129 if (err)
130 return err;
131 }
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
134 USE_AUTH_MDS);
135
136 if (IS_ERR(req))
137 return PTR_ERR(req);
138 req->r_inode = igrab(inode);
139
140 req->r_args.setlayout.layout.fl_stripe_unit =
141 cpu_to_le32(l.stripe_unit);
142 req->r_args.setlayout.layout.fl_stripe_count =
143 cpu_to_le32(l.stripe_count);
144 req->r_args.setlayout.layout.fl_object_size =
145 cpu_to_le32(l.object_size);
146 req->r_args.setlayout.layout.fl_pg_pool =
147 cpu_to_le32(l.data_pool);
148 req->r_args.setlayout.layout.fl_pg_preferred =
149 cpu_to_le32(l.preferred_osd);
150
151 err = ceph_mdsc_do_request(mdsc, inode, req);
152 ceph_mdsc_put_request(req);
153 return err;
154}
155
156/*
93 * Return object name, size/offset information, and location (OSD 157 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset. 158 * number, network address) for a given file offset.
95 */ 159 */
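
For reference, a hedged sketch of how the new ioctl might be driven from userspace. The struct mirrors the fields used in the hunk above (the authoritative types live in fs/ceph/ioctl.h), and the 0x97 magic is an assumption taken from that header:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	/* assumed to mirror struct ceph_ioctl_layout */
	struct ceph_ioctl_layout {
		uint64_t stripe_unit, stripe_count, object_size;
		uint64_t data_pool;
		int64_t  preferred_osd;
	};

	#define CEPH_IOCTL_MAGIC 0x97   /* assumed from fs/ceph/ioctl.h */
	#define CEPH_IOC_SET_LAYOUT_POLICY \
		_IOW(CEPH_IOCTL_MAGIC, 5, struct ceph_ioctl_layout)

	int main(int argc, char **argv)
	{
		struct ceph_ioctl_layout l = {
			.stripe_unit  = 4 << 20,  /* must be a page multiple */
			.stripe_count = 1,
			.object_size  = 4 << 20,  /* multiple of stripe_unit */
			.data_pool    = 0,        /* <= 0: skip pool check */
		};
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY | O_DIRECTORY);
		if (fd < 0 || ioctl(fd, CEPH_IOC_SET_LAYOUT_POLICY, &l) < 0) {
			perror("set layout policy");
			return 1;
		}
		close(fd);
		return 0;
	}
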
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 162 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 163 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 164 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 165 struct ceph_osd_client *osdc =
166 &ceph_sb_to_client(inode->i_sb)->client->osdc;
102 u64 len = 1, olen; 167 u64 len = 1, olen;
103 u64 tmp; 168 u64 tmp;
104 struct ceph_object_layout ol; 169 struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
174 case CEPH_IOC_SET_LAYOUT: 239 case CEPH_IOC_SET_LAYOUT:
175 return ceph_ioctl_set_layout(file, (void __user *)arg); 240 return ceph_ioctl_set_layout(file, (void __user *)arg);
176 241
242 case CEPH_IOC_SET_LAYOUT_POLICY:
243 return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
244
177 case CEPH_IOC_GET_DATALOC: 245 case CEPH_IOC_GET_DATALOC:
178 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 246 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179 247
180 case CEPH_IOC_LAZYIO: 248 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file); 249 return ceph_ioctl_lazyio(file);
182 } 250 }
251
183 return -ENOTTY; 252 return -ENOTTY;
184} 253}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b6857..52e8fd74d450 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
17 struct ceph_ioctl_layout) 17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ 18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout) 19 struct ceph_ioctl_layout)
20#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
21 struct ceph_ioctl_layout)
20 22
21/* 23/*
22 * Extract identity, address of the OSD and object storing a given 24 * Extract identity, address of the OSD and object storing a given
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae92..476b329867d4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,50 +1,78 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/namei.h> 4#include <linux/namei.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "mds_client.h" 7#include "mds_client.h"
8#include "pagelist.h" 8#include <linux/ceph/pagelist.h>
9 9
10/** 10/**
11 * Implement fcntl and flock locking functions. 11 * Implement fcntl and flock locking functions.
12 */ 12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, 13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns, 14 int cmd, u8 wait, struct file_lock *fl)
15 int cmd, u64 start, u64 length, u8 wait)
16{ 15{
17 struct inode *inode = file->f_dentry->d_inode; 16 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc = 17 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc; 18 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req; 19 struct ceph_mds_request *req;
21 int err; 20 int err;
21 u64 length = 0;
22 22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req)) 24 if (IS_ERR(req))
25 return PTR_ERR(req); 25 return PTR_ERR(req);
26 req->r_inode = igrab(inode); 26 req->r_inode = igrab(inode);
27 27
28 /* mds requires start and length rather than start and end */
29 if (LLONG_MAX == fl->fl_end)
30 length = 0;
31 else
32 length = fl->fl_end - fl->fl_start + 1;
33
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 34 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type, 35 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd); 36 (int)operation, (u64)fl->fl_pid, fl->fl_start,
37 length, wait, fl->fl_type);
38
31 39
32 req->r_args.filelock_change.rule = lock_type; 40 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd; 41 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid); 42 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
35 /* This should be adjusted, but I'm not sure if 43 /* This should be adjusted, but I'm not sure if
 36 namespaces actually get id numbers */ 44 namespaces actually get id numbers */
37 req->r_args.filelock_change.pid_namespace = 45 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns); 46 cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
39 req->r_args.filelock_change.start = cpu_to_le64(start); 47 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
40 req->r_args.filelock_change.length = cpu_to_le64(length); 48 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait; 49 req->r_args.filelock_change.wait = wait;
42 50
43 err = ceph_mdsc_do_request(mdsc, inode, req); 51 err = ceph_mdsc_do_request(mdsc, inode, req);
52
 53 if (operation == CEPH_MDS_OP_GETFILELOCK) {
54 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
55 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
56 fl->fl_type = F_RDLCK;
57 else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
58 fl->fl_type = F_WRLCK;
59 else
60 fl->fl_type = F_UNLCK;
61
62 fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
63 length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
64 le64_to_cpu(req->r_reply_info.filelock_reply->length);
65 if (length >= 1)
 66 fl->fl_end = length - 1;
67 else
68 fl->fl_end = 0;
69
70 }
44 ceph_mdsc_put_request(req); 71 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 72 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, 73 "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err); 74 (int)operation, (u64)fl->fl_pid, fl->fl_start,
75 length, wait, fl->fl_type, err);
48 return err; 76 return err;
49} 77}
50 78
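
The refactor moves the range conversion into ceph_lock_message(): VFS file_lock ranges are inclusive [start, end], with end == LLONG_MAX meaning "to EOF", while the MDS wants (start, length), with length 0 meaning unbounded. A standalone sketch of both directions, including the clamp used on the reply path:

	#include <limits.h>
	#include <stdio.h>

	static long long range_to_length(long long start, long long end)
	{
		return (end == LLONG_MAX) ? 0 : end - start + 1;
	}

	static long long length_to_end(long long start, long long length)
	{
		long long end = start + length;

		return (end >= 1) ? end - 1 : 0;  /* mirrors the reply path */
	}

	int main(void)
	{
		printf("%lld\n", range_to_length(100, LLONG_MAX)); /* 0 */
		printf("%lld\n", range_to_length(100, 199));       /* 100 */
		printf("%lld\n", length_to_end(100, 100));         /* 199 */
		return 0;
	}
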
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
54 */ 82 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl) 83int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{ 84{
57 u64 length;
58 u8 lock_cmd; 85 u8 lock_cmd;
59 int err; 86 int err;
60 u8 wait = 0; 87 u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
76 else 103 else
77 lock_cmd = CEPH_LOCK_UNLOCK; 104 lock_cmd = CEPH_LOCK_UNLOCK;
78 105
79 if (LLONG_MAX == fl->fl_end) 106 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
87 lock_cmd, fl->fl_start,
88 length, wait);
89 if (!err) { 107 if (!err) {
90 dout("mds locked, locking locally"); 108 if ( op != CEPH_MDS_OP_GETFILELOCK ){
91 err = posix_lock_file(file, fl, NULL); 109 dout("mds locked, locking locally");
92 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { 110 err = posix_lock_file(file, fl, NULL);
93 /* undo! This should only happen if the kernel detects 111 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
94 * local deadlock. */ 112 /* undo! This should only happen if the kernel detects
95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 113 * local deadlock. */
96 (u64)fl->fl_pid, 114 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
97 (u64)(unsigned long)fl->fl_nspid, 115 CEPH_LOCK_UNLOCK, 0, fl);
98 CEPH_LOCK_UNLOCK, fl->fl_start, 116 dout("got %d on posix_lock_file, undid lock", err);
99 length, 0); 117 }
100 dout("got %d on posix_lock_file, undid lock", err);
101 } 118 }
119
102 } else { 120 } else {
103 dout("mds returned error code %d", err); 121 dout("mds returned error code %d", err);
104 } 122 }
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
107 125
108int ceph_flock(struct file *file, int cmd, struct file_lock *fl) 126int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
109{ 127{
110 u64 length;
111 u8 lock_cmd; 128 u8 lock_cmd;
112 int err; 129 int err;
113 u8 wait = 1; 130 u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
127 lock_cmd = CEPH_LOCK_EXCL; 144 lock_cmd = CEPH_LOCK_EXCL;
128 else 145 else
129 lock_cmd = CEPH_LOCK_UNLOCK; 146 lock_cmd = CEPH_LOCK_UNLOCK;
130 /* mds requires start and length rather than start and end */
131 if (LLONG_MAX == fl->fl_end)
132 length = 0;
133 else
134 length = fl->fl_end - fl->fl_start + 1;
135 147
136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, 148 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
137 file, (u64)fl->fl_pid, 149 file, lock_cmd, wait, fl);
138 (u64)(unsigned long)fl->fl_nspid,
139 lock_cmd, fl->fl_start,
140 length, wait);
141 if (!err) { 150 if (!err) {
142 err = flock_lock_file_wait(file, fl); 151 err = flock_lock_file_wait(file, fl);
143 if (err) { 152 if (err) {
144 ceph_lock_message(CEPH_LOCK_FLOCK, 153 ceph_lock_message(CEPH_LOCK_FLOCK,
145 CEPH_MDS_OP_SETFILELOCK, 154 CEPH_MDS_OP_SETFILELOCK,
146 file, (u64)fl->fl_pid, 155 file, CEPH_LOCK_UNLOCK, 0, fl);
147 (u64)(unsigned long)fl->fl_nspid,
148 CEPH_LOCK_UNLOCK, fl->fl_start,
149 length, 0);
150 dout("got %d on flock_lock_file_wait, undid lock", err); 156 dout("got %d on flock_lock_file_wait, undid lock", err);
151 } 157 }
152 } else { 158 } else {
@@ -181,8 +187,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
181 * Encode the flock and fcntl locks for the given inode into the pagelist. 187 * Encode the flock and fcntl locks for the given inode into the pagelist.
182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks, 188 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
183 * sequential flock locks. 189 * sequential flock locks.
184 * Must be called with BLK already held, and the lock numbers should have 190 * Must be called with lock_flocks() already held.
185 * been gathered under the same lock holding window. 191 * If we encounter more of a specific lock type than expected,
192 * we return -ENOSPC.
186 */ 193 */
187int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, 194int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
188 int num_fcntl_locks, int num_flock_locks) 195 int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +197,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
190 struct file_lock *lock; 197 struct file_lock *lock;
191 struct ceph_filelock cephlock; 198 struct ceph_filelock cephlock;
192 int err = 0; 199 int err = 0;
200 int seen_fcntl = 0;
201 int seen_flock = 0;
193 202
194 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 203 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
195 num_fcntl_locks); 204 num_fcntl_locks);
@@ -198,6 +207,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
198 goto fail; 207 goto fail;
199 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 208 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
200 if (lock->fl_flags & FL_POSIX) { 209 if (lock->fl_flags & FL_POSIX) {
210 ++seen_fcntl;
211 if (seen_fcntl > num_fcntl_locks) {
212 err = -ENOSPC;
213 goto fail;
214 }
201 err = lock_to_ceph_filelock(lock, &cephlock); 215 err = lock_to_ceph_filelock(lock, &cephlock);
202 if (err) 216 if (err)
203 goto fail; 217 goto fail;
@@ -213,6 +227,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
213 goto fail; 227 goto fail;
214 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 228 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
215 if (lock->fl_flags & FL_FLOCK) { 229 if (lock->fl_flags & FL_FLOCK) {
230 ++seen_flock;
231 if (seen_flock > num_flock_locks) {
232 err = -ENOSPC;
233 goto fail;
234 }
216 err = lock_to_ceph_filelock(lock, &cephlock); 235 err = lock_to_ceph_filelock(lock, &cephlock);
217 if (err) 236 if (err)
218 goto fail; 237 goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fad95f8f2608..a1ee8fa3a8e7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,20 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h>
3#include <linux/wait.h> 4#include <linux/wait.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
6#include <linux/smp_lock.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
7 9
8#include "mds_client.h"
9#include "mon_client.h"
10#include "super.h" 10#include "super.h"
11#include "messenger.h" 11#include "mds_client.h"
12#include "decode.h" 12
13#include "auth.h" 13#include <linux/ceph/messenger.h>
14#include "pagelist.h" 14#include <linux/ceph/decode.h>
15#include <linux/ceph/pagelist.h>
16#include <linux/ceph/auth.h>
17#include <linux/ceph/debugfs.h>
15 18
16/* 19/*
17 * A cluster of MDS (metadata server) daemons is responsible for 20 * A cluster of MDS (metadata server) daemons is responsible for
@@ -57,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
57 * parse individual inode info 60 * parse individual inode info
58 */ 61 */
59static int parse_reply_info_in(void **p, void *end, 62static int parse_reply_info_in(void **p, void *end,
60 struct ceph_mds_reply_info_in *info) 63 struct ceph_mds_reply_info_in *info,
64 int features)
61{ 65{
62 int err = -EIO; 66 int err = -EIO;
63 67
@@ -71,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
71 info->symlink = *p; 75 info->symlink = *p;
72 *p += info->symlink_len; 76 *p += info->symlink_len;
73 77
78 if (features & CEPH_FEATURE_DIRLAYOUTHASH)
79 ceph_decode_copy_safe(p, end, &info->dir_layout,
80 sizeof(info->dir_layout), bad);
81 else
82 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
83
74 ceph_decode_32_safe(p, end, info->xattr_len, bad); 84 ceph_decode_32_safe(p, end, info->xattr_len, bad);
75 ceph_decode_need(p, end, info->xattr_len, bad); 85 ceph_decode_need(p, end, info->xattr_len, bad);
76 info->xattr_data = *p; 86 info->xattr_data = *p;
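
parse_reply_info_in() now decodes dir_layout only when the peer advertises CEPH_FEATURE_DIRLAYOUTHASH and zeroes it otherwise, so callers never need feature checks of their own. A standalone sketch of that decode pattern (the feature bit and payload type here are stand-ins):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define FEAT_DIRLAYOUT (1 << 0)          /* hypothetical feature bit */

	struct dir_layout { uint32_t hash; };    /* stand-in payload */

	/* decode the field only if the sender is new enough to have
	 * encoded it; otherwise leave a zeroed default */
	static int decode_in(const uint8_t **p, const uint8_t *end,
			     int features, struct dir_layout *dl)
	{
		if (features & FEAT_DIRLAYOUT) {
			if (*p + sizeof(*dl) > end)
				return -1;           /* truncated: EIO */
			memcpy(dl, *p, sizeof(*dl));
			*p += sizeof(*dl);
		} else {
			memset(dl, 0, sizeof(*dl));
		}
		return 0;
	}

	int main(void)
	{
		uint8_t buf[4] = { 1, 0, 0, 0 };
		const uint8_t *p = buf;
		struct dir_layout dl;

		decode_in(&p, buf + sizeof(buf), FEAT_DIRLAYOUT, &dl);
		printf("%u\n", dl.hash);   /* 1 */
		return 0;
	}
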
@@ -85,12 +95,13 @@ bad:
85 * target inode. 95 * target inode.
86 */ 96 */
87static int parse_reply_info_trace(void **p, void *end, 97static int parse_reply_info_trace(void **p, void *end,
88 struct ceph_mds_reply_info_parsed *info) 98 struct ceph_mds_reply_info_parsed *info,
99 int features)
89{ 100{
90 int err; 101 int err;
91 102
92 if (info->head->is_dentry) { 103 if (info->head->is_dentry) {
93 err = parse_reply_info_in(p, end, &info->diri); 104 err = parse_reply_info_in(p, end, &info->diri, features);
94 if (err < 0) 105 if (err < 0)
95 goto out_bad; 106 goto out_bad;
96 107
@@ -111,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
111 } 122 }
112 123
113 if (info->head->is_target) { 124 if (info->head->is_target) {
114 err = parse_reply_info_in(p, end, &info->targeti); 125 err = parse_reply_info_in(p, end, &info->targeti, features);
115 if (err < 0) 126 if (err < 0)
116 goto out_bad; 127 goto out_bad;
117 } 128 }
@@ -131,7 +142,8 @@ out_bad:
131 * parse readdir results 142 * parse readdir results
132 */ 143 */
133static int parse_reply_info_dir(void **p, void *end, 144static int parse_reply_info_dir(void **p, void *end,
134 struct ceph_mds_reply_info_parsed *info) 145 struct ceph_mds_reply_info_parsed *info,
146 int features)
135{ 147{
136 u32 num, i = 0; 148 u32 num, i = 0;
137 int err; 149 int err;
@@ -179,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
179 *p += sizeof(struct ceph_mds_reply_lease); 191 *p += sizeof(struct ceph_mds_reply_lease);
180 192
181 /* inode */ 193 /* inode */
182 err = parse_reply_info_in(p, end, &info->dir_in[i]); 194 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
183 if (err < 0) 195 if (err < 0)
184 goto out_bad; 196 goto out_bad;
185 i++; 197 i++;
@@ -199,10 +211,45 @@ out_bad:
199} 211}
200 212
201/* 213/*
214 * parse fcntl F_GETLK results
215 */
216static int parse_reply_info_filelock(void **p, void *end,
217 struct ceph_mds_reply_info_parsed *info,
218 int features)
219{
220 if (*p + sizeof(*info->filelock_reply) > end)
221 goto bad;
222
223 info->filelock_reply = *p;
224 *p += sizeof(*info->filelock_reply);
225
226 if (unlikely(*p != end))
227 goto bad;
228 return 0;
229
230bad:
231 return -EIO;
232}
233
234/*
235 * parse extra results
236 */
237static int parse_reply_info_extra(void **p, void *end,
238 struct ceph_mds_reply_info_parsed *info,
239 int features)
240{
241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
242 return parse_reply_info_filelock(p, end, info, features);
243 else
244 return parse_reply_info_dir(p, end, info, features);
245}
246
247/*
202 * parse entire mds reply 248 * parse entire mds reply
203 */ 249 */
204static int parse_reply_info(struct ceph_msg *msg, 250static int parse_reply_info(struct ceph_msg *msg,
205 struct ceph_mds_reply_info_parsed *info) 251 struct ceph_mds_reply_info_parsed *info,
252 int features)
206{ 253{
207 void *p, *end; 254 void *p, *end;
208 u32 len; 255 u32 len;
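
The new "extra" section holds either a filelock reply or readdir data, dispatched on the op code in the reply head, and the filelock blob must be consumed exactly (*p != end is an error). A standalone sketch of that shape (op values and the payload layout are stand-ins):

	#include <stdint.h>
	#include <stdio.h>

	enum { OP_GETFILELOCK = 1, OP_READDIR = 2 };

	struct filelock_reply { uint64_t start, length; };

	static int parse_filelock(const uint8_t **p, const uint8_t *end)
	{
		if (*p + sizeof(struct filelock_reply) > end)
			return -1;              /* short blob */
		*p += sizeof(struct filelock_reply);
		return (*p == end) ? 0 : -1;    /* must consume it exactly */
	}

	static int parse_extra(int op, const uint8_t **p, const uint8_t *end)
	{
		if (op == OP_GETFILELOCK)
			return parse_filelock(p, end);
		return 0;                       /* readdir parsing elided */
	}

	int main(void)
	{
		uint8_t blob[sizeof(struct filelock_reply)] = { 0 };
		const uint8_t *p = blob;

		printf("%d\n", parse_extra(OP_GETFILELOCK, &p, blob + sizeof(blob)));
		return 0;
	}
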
@@ -215,15 +262,15 @@ static int parse_reply_info(struct ceph_msg *msg,
215 /* trace */ 262 /* trace */
216 ceph_decode_32_safe(&p, end, len, bad); 263 ceph_decode_32_safe(&p, end, len, bad);
217 if (len > 0) { 264 if (len > 0) {
218 err = parse_reply_info_trace(&p, p+len, info); 265 err = parse_reply_info_trace(&p, p+len, info, features);
219 if (err < 0) 266 if (err < 0)
220 goto out_bad; 267 goto out_bad;
221 } 268 }
222 269
223 /* dir content */ 270 /* extra */
224 ceph_decode_32_safe(&p, end, len, bad); 271 ceph_decode_32_safe(&p, end, len, bad);
225 if (len > 0) { 272 if (len > 0) {
226 err = parse_reply_info_dir(&p, p+len, info); 273 err = parse_reply_info_extra(&p, p+len, info, features);
227 if (err < 0) 274 if (err < 0)
228 goto out_bad; 275 goto out_bad;
229 } 276 }
@@ -286,8 +333,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
286 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 333 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
287 if (atomic_dec_and_test(&s->s_ref)) { 334 if (atomic_dec_and_test(&s->s_ref)) {
288 if (s->s_authorizer) 335 if (s->s_authorizer)
289 s->s_mdsc->client->monc.auth->ops->destroy_authorizer( 336 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
290 s->s_mdsc->client->monc.auth, s->s_authorizer); 337 s->s_mdsc->fsc->client->monc.auth,
338 s->s_authorizer);
291 kfree(s); 339 kfree(s);
292 } 340 }
293} 341}
@@ -344,7 +392,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
344 s->s_seq = 0; 392 s->s_seq = 0;
345 mutex_init(&s->s_mutex); 393 mutex_init(&s->s_mutex);
346 394
347 ceph_con_init(mdsc->client->msgr, &s->s_con); 395 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
348 s->s_con.private = s; 396 s->s_con.private = s;
349 s->s_con.ops = &mds_con_ops; 397 s->s_con.ops = &mds_con_ops;
350 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; 398 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -524,6 +572,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
524 ceph_mdsc_get_request(req); 572 ceph_mdsc_get_request(req);
525 __insert_request(mdsc, req); 573 __insert_request(mdsc, req);
526 574
575 req->r_uid = current_fsuid();
576 req->r_gid = current_fsgid();
577
527 if (dir) { 578 if (dir) {
528 struct ceph_inode_info *ci = ceph_inode(dir); 579 struct ceph_inode_info *ci = ceph_inode(dir);
529 580
@@ -599,7 +650,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
599 } else if (req->r_dentry) { 650 } else if (req->r_dentry) {
600 struct inode *dir = req->r_dentry->d_parent->d_inode; 651 struct inode *dir = req->r_dentry->d_parent->d_inode;
601 652
602 if (dir->i_sb != mdsc->client->sb) { 653 if (dir->i_sb != mdsc->fsc->sb) {
603 /* not this fs! */ 654 /* not this fs! */
604 inode = req->r_dentry->d_inode; 655 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 656 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -615,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
615 } else { 666 } else {
616 /* dir + name */ 667 /* dir + name */
617 inode = dir; 668 inode = dir;
618 hash = req->r_dentry->d_name.hash; 669 hash = ceph_dentry_hash(req->r_dentry);
619 is_hash = true; 670 is_hash = true;
620 } 671 }
621 } 672 }
@@ -642,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
642 dout("choose_mds %p %llx.%llx " 693 dout("choose_mds %p %llx.%llx "
643 "frag %u mds%d (%d/%d)\n", 694 "frag %u mds%d (%d/%d)\n",
644 inode, ceph_vinop(inode), 695 inode, ceph_vinop(inode),
645 frag.frag, frag.mds, 696 frag.frag, mds,
646 (int)r, frag.ndist); 697 (int)r, frag.ndist);
647 return mds; 698 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
699 CEPH_MDS_STATE_ACTIVE)
700 return mds;
648 } 701 }
649 702
650 /* since this file/dir wasn't known to be 703 /* since this file/dir wasn't known to be
@@ -657,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
657 dout("choose_mds %p %llx.%llx " 710 dout("choose_mds %p %llx.%llx "
658 "frag %u mds%d (auth)\n", 711 "frag %u mds%d (auth)\n",
659 inode, ceph_vinop(inode), frag.frag, mds); 712 inode, ceph_vinop(inode), frag.frag, mds);
660 return mds; 713 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
714 CEPH_MDS_STATE_ACTIVE)
715 return mds;
661 } 716 }
662 } 717 }
663 } 718 }
@@ -884,7 +939,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
884 __ceph_remove_cap(cap); 939 __ceph_remove_cap(cap);
885 if (!__ceph_is_any_real_caps(ci)) { 940 if (!__ceph_is_any_real_caps(ci)) {
886 struct ceph_mds_client *mdsc = 941 struct ceph_mds_client *mdsc =
887 &ceph_sb_to_client(inode->i_sb)->mdsc; 942 ceph_sb_to_client(inode->i_sb)->mdsc;
888 943
889 spin_lock(&mdsc->cap_dirty_lock); 944 spin_lock(&mdsc->cap_dirty_lock);
890 if (!list_empty(&ci->i_dirty_item)) { 945 if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1201,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1146 struct ceph_msg *msg, *partial = NULL; 1201 struct ceph_msg *msg, *partial = NULL;
1147 struct ceph_mds_cap_release *head; 1202 struct ceph_mds_cap_release *head;
1148 int err = -ENOMEM; 1203 int err = -ENOMEM;
1149 int extra = mdsc->client->mount_args->cap_release_safety; 1204 int extra = mdsc->fsc->mount_options->cap_release_safety;
1150 int num; 1205 int num;
1151 1206
1152 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, 1207 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -1447,7 +1502,7 @@ retry:
1447 *base = ceph_ino(temp->d_inode); 1502 *base = ceph_ino(temp->d_inode);
1448 *plen = len; 1503 *plen = len;
1449 dout("build_path on %p %d built %llx '%.*s'\n", 1504 dout("build_path on %p %d built %llx '%.*s'\n",
1450 dentry, atomic_read(&dentry->d_count), *base, len, path); 1505 dentry, dentry->d_count, *base, len, path);
1451 return path; 1506 return path;
1452} 1507}
1453 1508
@@ -1583,8 +1638,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1583 1638
1584 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); 1639 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1585 head->op = cpu_to_le32(req->r_op); 1640 head->op = cpu_to_le32(req->r_op);
1586 head->caller_uid = cpu_to_le32(current_fsuid()); 1641 head->caller_uid = cpu_to_le32(req->r_uid);
1587 head->caller_gid = cpu_to_le32(current_fsgid()); 1642 head->caller_gid = cpu_to_le32(req->r_gid);
1588 head->args = req->r_args; 1643 head->args = req->r_args;
1589 1644
1590 ceph_encode_filepath(&p, end, ino1, path1); 1645 ceph_encode_filepath(&p, end, ino1, path1);
@@ -1654,7 +1709,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1654 struct ceph_msg *msg; 1709 struct ceph_msg *msg;
1655 int flags = 0; 1710 int flags = 0;
1656 1711
1657 req->r_mds = mds;
1658 req->r_attempts++; 1712 req->r_attempts++;
1659 if (req->r_inode) { 1713 if (req->r_inode) {
1660 struct ceph_cap *cap = 1714 struct ceph_cap *cap =
@@ -1741,6 +1795,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1741 goto finish; 1795 goto finish;
1742 } 1796 }
1743 1797
1798 put_request_session(req);
1799
1744 mds = __choose_mds(mdsc, req); 1800 mds = __choose_mds(mdsc, req);
1745 if (mds < 0 || 1801 if (mds < 0 ||
1746 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { 1802 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1758,6 +1814,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1758 goto finish; 1814 goto finish;
1759 } 1815 }
1760 } 1816 }
1817 req->r_session = get_session(session);
1818
1761 dout("do_request mds%d session %p state %s\n", mds, session, 1819 dout("do_request mds%d session %p state %s\n", mds, session,
1762 session_state_name(session->s_state)); 1820 session_state_name(session->s_state));
1763 if (session->s_state != CEPH_MDS_SESSION_OPEN && 1821 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1770,7 +1828,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
1770 } 1828 }
1771 1829
1772 /* send request */ 1830 /* send request */
1773 req->r_session = get_session(session);
1774 req->r_resend_mds = -1; /* forget any previous mds hint */ 1831 req->r_resend_mds = -1; /* forget any previous mds hint */
1775 1832
1776 if (req->r_request_started == 0) /* note request start time */ 1833 if (req->r_request_started == 0) /* note request start time */
@@ -1824,7 +1881,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1824 if (req->r_session && 1881 if (req->r_session &&
1825 req->r_session->s_mds == mds) { 1882 req->r_session->s_mds == mds) {
1826 dout(" kicking tid %llu\n", req->r_tid); 1883 dout(" kicking tid %llu\n", req->r_tid);
1827 put_request_session(req);
1828 __do_request(mdsc, req); 1884 __do_request(mdsc, req);
1829 } 1885 }
1830 } 1886 }
@@ -2017,8 +2073,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2017 goto out; 2073 goto out;
2018 } else { 2074 } else {
2019 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2075 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2020 struct ceph_cap *cap = 2076 struct ceph_cap *cap = NULL;
2021 ceph_get_cap_for_mds(ci, req->r_mds);; 2077
2078 if (req->r_session)
2079 cap = ceph_get_cap_for_mds(ci,
2080 req->r_session->s_mds);
2022 2081
2023 dout("already using auth"); 2082 dout("already using auth");
2024 if ((!cap || cap != ci->i_auth_cap) || 2083 if ((!cap || cap != ci->i_auth_cap) ||
@@ -2062,12 +2121,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2062 2121
2063 dout("handle_reply tid %lld result %d\n", tid, result); 2122 dout("handle_reply tid %lld result %d\n", tid, result);
2064 rinfo = &req->r_reply_info; 2123 rinfo = &req->r_reply_info;
2065 err = parse_reply_info(msg, rinfo); 2124 err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2066 mutex_unlock(&mdsc->mutex); 2125 mutex_unlock(&mdsc->mutex);
2067 2126
2068 mutex_lock(&session->s_mutex); 2127 mutex_lock(&session->s_mutex);
2069 if (err < 0) { 2128 if (err < 0) {
2070 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); 2129 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
2071 ceph_msg_dump(msg); 2130 ceph_msg_dump(msg);
2072 goto out_err; 2131 goto out_err;
2073 } 2132 }
@@ -2085,9 +2144,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2085 2144
2086 /* insert trace into our cache */ 2145 /* insert trace into our cache */
2087 mutex_lock(&req->r_fill_mutex); 2146 mutex_lock(&req->r_fill_mutex);
2088 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 2147 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2089 if (err == 0) { 2148 if (err == 0) {
2090 if (result == 0 && rinfo->dir_nr) 2149 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
2150 rinfo->dir_nr)
2091 ceph_readdir_prepopulate(req, req->r_session); 2151 ceph_readdir_prepopulate(req, req->r_session);
2092 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2152 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2093 } 2153 }
@@ -2361,19 +2421,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2361 2421
2362 if (recon_state->flock) { 2422 if (recon_state->flock) {
2363 int num_fcntl_locks, num_flock_locks; 2423 int num_fcntl_locks, num_flock_locks;
2364 2424 struct ceph_pagelist_cursor trunc_point;
2365 lock_kernel(); 2425
2366 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2426 ceph_pagelist_set_cursor(pagelist, &trunc_point);
2367 rec.v2.flock_len = (2*sizeof(u32) + 2427 do {
2368 (num_fcntl_locks+num_flock_locks) * 2428 lock_flocks();
2369 sizeof(struct ceph_filelock)); 2429 ceph_count_locks(inode, &num_fcntl_locks,
2370 2430 &num_flock_locks);
2371 err = ceph_pagelist_append(pagelist, &rec, reclen); 2431 rec.v2.flock_len = (2*sizeof(u32) +
2372 if (!err) 2432 (num_fcntl_locks+num_flock_locks) *
2373 err = ceph_encode_locks(inode, pagelist, 2433 sizeof(struct ceph_filelock));
2374 num_fcntl_locks, 2434 unlock_flocks();
2375 num_flock_locks); 2435
2376 unlock_kernel(); 2436 /* pre-alloc pagelist */
2437 ceph_pagelist_truncate(pagelist, &trunc_point);
2438 err = ceph_pagelist_append(pagelist, &rec, reclen);
2439 if (!err)
2440 err = ceph_pagelist_reserve(pagelist,
2441 rec.v2.flock_len);
2442
2443 /* encode locks */
2444 if (!err) {
2445 lock_flocks();
2446 err = ceph_encode_locks(inode,
2447 pagelist,
2448 num_fcntl_locks,
2449 num_flock_locks);
2450 unlock_flocks();
2451 }
2452 } while (err == -ENOSPC);
2377 } else { 2453 } else {
2378 err = ceph_pagelist_append(pagelist, &rec, reclen); 2454 err = ceph_pagelist_append(pagelist, &rec, reclen);
2379 } 2455 }
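
The reconnect path can no longer hold the file-lock lock across a blocking pagelist append, so it counts locks, preallocates, then re-takes the lock to encode, looping while ceph_encode_locks() reports -ENOSPC because locks were added in between. A standalone sketch of that count/reserve/encode/retry shape (the pagelist is reduced to a reserved count, and the locking is shown as comments):

	#include <stdio.h>

	static int nlocks = 3;   /* stand-in for the inode's lock list */
	static int racing = 1;   /* simulate one lock added mid-sequence */

	static int encode_locks(int reserved)
	{
		return (nlocks > reserved) ? -1 : 0;  /* -ENOSPC analogue */
	}

	int main(void)
	{
		int err, expected;

		do {
			expected = nlocks;  /* counted under lock_flocks() */

			/* lock dropped here for blocking preallocation;
			 * a new lock can sneak in: */
			if (racing--)
				nlocks++;

			err = encode_locks(expected);  /* re-locked encode */
		} while (err == -1);                   /* retry on overflow */

		printf("encoded %d locks after retry\n", expected);
		return 0;
	}
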
@@ -2613,7 +2689,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2613 struct ceph_mds_session *session, 2689 struct ceph_mds_session *session,
2614 struct ceph_msg *msg) 2690 struct ceph_msg *msg)
2615{ 2691{
2616 struct super_block *sb = mdsc->client->sb; 2692 struct super_block *sb = mdsc->fsc->sb;
2617 struct inode *inode; 2693 struct inode *inode;
2618 struct ceph_inode_info *ci; 2694 struct ceph_inode_info *ci;
2619 struct dentry *parent, *dentry; 2695 struct dentry *parent, *dentry;
@@ -2891,10 +2967,16 @@ static void delayed_work(struct work_struct *work)
2891 schedule_delayed(mdsc); 2967 schedule_delayed(mdsc);
2892} 2968}
2893 2969
2970int ceph_mdsc_init(struct ceph_fs_client *fsc)
2894 2971
2895int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2896{ 2972{
2897 mdsc->client = client; 2973 struct ceph_mds_client *mdsc;
2974
2975 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
2976 if (!mdsc)
2977 return -ENOMEM;
2978 mdsc->fsc = fsc;
2979 fsc->mdsc = mdsc;
2898 mutex_init(&mdsc->mutex); 2980 mutex_init(&mdsc->mutex);
2899 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2981 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2900 if (mdsc->mdsmap == NULL) 2982 if (mdsc->mdsmap == NULL)
@@ -2927,7 +3009,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2927 INIT_LIST_HEAD(&mdsc->dentry_lru); 3009 INIT_LIST_HEAD(&mdsc->dentry_lru);
2928 3010
2929 ceph_caps_init(mdsc); 3011 ceph_caps_init(mdsc);
2930 ceph_adjust_min_caps(mdsc, client->min_caps); 3012 ceph_adjust_min_caps(mdsc, fsc->min_caps);
2931 3013
2932 return 0; 3014 return 0;
2933} 3015}
@@ -2939,7 +3021,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2939static void wait_requests(struct ceph_mds_client *mdsc) 3021static void wait_requests(struct ceph_mds_client *mdsc)
2940{ 3022{
2941 struct ceph_mds_request *req; 3023 struct ceph_mds_request *req;
2942 struct ceph_client *client = mdsc->client; 3024 struct ceph_fs_client *fsc = mdsc->fsc;
2943 3025
2944 mutex_lock(&mdsc->mutex); 3026 mutex_lock(&mdsc->mutex);
2945 if (__get_oldest_req(mdsc)) { 3027 if (__get_oldest_req(mdsc)) {
@@ -2947,7 +3029,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
2947 3029
2948 dout("wait_requests waiting for requests\n"); 3030 dout("wait_requests waiting for requests\n");
2949 wait_for_completion_timeout(&mdsc->safe_umount_waiters, 3031 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2950 client->mount_args->mount_timeout * HZ); 3032 fsc->client->options->mount_timeout * HZ);
2951 3033
2952 /* tear down remaining requests */ 3034 /* tear down remaining requests */
2953 mutex_lock(&mdsc->mutex); 3035 mutex_lock(&mdsc->mutex);
@@ -3030,7 +3112,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3030{ 3112{
3031 u64 want_tid, want_flush; 3113 u64 want_tid, want_flush;
3032 3114
3033 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3115 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3034 return; 3116 return;
3035 3117
3036 dout("sync\n"); 3118 dout("sync\n");
@@ -3053,7 +3135,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
3053{ 3135{
3054 int i, n = 0; 3136 int i, n = 0;
3055 3137
3056 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3138 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3057 return true; 3139 return true;
3058 3140
3059 mutex_lock(&mdsc->mutex); 3141 mutex_lock(&mdsc->mutex);
@@ -3071,8 +3153,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3071{ 3153{
3072 struct ceph_mds_session *session; 3154 struct ceph_mds_session *session;
3073 int i; 3155 int i;
3074 struct ceph_client *client = mdsc->client; 3156 struct ceph_fs_client *fsc = mdsc->fsc;
3075 unsigned long timeout = client->mount_args->mount_timeout * HZ; 3157 unsigned long timeout = fsc->client->options->mount_timeout * HZ;
3076 3158
3077 dout("close_sessions\n"); 3159 dout("close_sessions\n");
3078 3160
@@ -3119,7 +3201,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3119 dout("stopped\n"); 3201 dout("stopped\n");
3120} 3202}
3121 3203
3122void ceph_mdsc_stop(struct ceph_mds_client *mdsc) 3204static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3123{ 3205{
3124 dout("stop\n"); 3206 dout("stop\n");
3125 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3207 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3211,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3129 ceph_caps_finalize(mdsc); 3211 ceph_caps_finalize(mdsc);
3130} 3212}
3131 3213
3214void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3215{
3216 struct ceph_mds_client *mdsc = fsc->mdsc;
3217
3218 ceph_mdsc_stop(mdsc);
3219 fsc->mdsc = NULL;
3220 kfree(mdsc);
3221}
3222
3132 3223
3133/* 3224/*
3134 * handle mds map update. 3225 * handle mds map update.
@@ -3145,14 +3236,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3145 3236
3146 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); 3237 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
3147 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3238 ceph_decode_copy(&p, &fsid, sizeof(fsid));
3148 if (ceph_check_fsid(mdsc->client, &fsid) < 0) 3239 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
3149 return; 3240 return;
3150 epoch = ceph_decode_32(&p); 3241 epoch = ceph_decode_32(&p);
3151 maplen = ceph_decode_32(&p); 3242 maplen = ceph_decode_32(&p);
3152 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3243 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3153 3244
3154 /* do we need it? */ 3245 /* do we need it? */
3155 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); 3246 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3156 mutex_lock(&mdsc->mutex); 3247 mutex_lock(&mdsc->mutex);
3157 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3248 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3158 dout("handle_map epoch %u <= our %u\n", 3249 dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3267,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3176 } else { 3267 } else {
3177 mdsc->mdsmap = newmap; /* first mds map */ 3268 mdsc->mdsmap = newmap; /* first mds map */
3178 } 3269 }
3179 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3270 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3180 3271
3181 __wake_requests(mdsc, &mdsc->waiting_for_map); 3272 __wake_requests(mdsc, &mdsc->waiting_for_map);
3182 3273
@@ -3277,7 +3368,7 @@ static int get_authorizer(struct ceph_connection *con,
3277{ 3368{
3278 struct ceph_mds_session *s = con->private; 3369 struct ceph_mds_session *s = con->private;
3279 struct ceph_mds_client *mdsc = s->s_mdsc; 3370 struct ceph_mds_client *mdsc = s->s_mdsc;
3280 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3371 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3281 int ret = 0; 3372 int ret = 0;
3282 3373
3283 if (force_new && s->s_authorizer) { 3374 if (force_new && s->s_authorizer) {
@@ -3311,7 +3402,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
3311{ 3402{
3312 struct ceph_mds_session *s = con->private; 3403 struct ceph_mds_session *s = con->private;
3313 struct ceph_mds_client *mdsc = s->s_mdsc; 3404 struct ceph_mds_client *mdsc = s->s_mdsc;
3314 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3405 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3315 3406
3316 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); 3407 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3317} 3408}
@@ -3320,12 +3411,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
3320{ 3411{
3321 struct ceph_mds_session *s = con->private; 3412 struct ceph_mds_session *s = con->private;
3322 struct ceph_mds_client *mdsc = s->s_mdsc; 3413 struct ceph_mds_client *mdsc = s->s_mdsc;
3323 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3414 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3324 3415
3325 if (ac->ops->invalidate_authorizer) 3416 if (ac->ops->invalidate_authorizer)
3326 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); 3417 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3327 3418
3328 return ceph_monc_validate_auth(&mdsc->client->monc); 3419 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3329} 3420}
3330 3421
3331static const struct ceph_connection_operations mds_con_ops = { 3422static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3429,4 @@ static const struct ceph_connection_operations mds_con_ops = {
3338 .peer_reset = peer_reset, 3429 .peer_reset = peer_reset,
3339}; 3430};
3340 3431
3341
3342
3343
3344/* eof */ 3432/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2a..4e3a9cc0bba6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
8#include <linux/rbtree.h> 8#include <linux/rbtree.h>
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10 10
11#include "types.h" 11#include <linux/ceph/types.h>
12#include "messenger.h" 12#include <linux/ceph/messenger.h>
13#include "mdsmap.h" 13#include <linux/ceph/mdsmap.h>
14 14
15/* 15/*
16 * Some lock dependencies: 16 * Some lock dependencies:
@@ -26,7 +26,7 @@
26 * 26 *
27 */ 27 */
28 28
29struct ceph_client; 29struct ceph_fs_client;
30struct ceph_cap; 30struct ceph_cap;
31 31
32/* 32/*
@@ -35,6 +35,7 @@ struct ceph_cap;
35 */ 35 */
36struct ceph_mds_reply_info_in { 36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in; 37 struct ceph_mds_reply_inode *in;
38 struct ceph_dir_layout dir_layout;
38 u32 symlink_len; 39 u32 symlink_len;
39 char *symlink; 40 char *symlink;
40 u32 xattr_len; 41 u32 xattr_len;
@@ -42,26 +43,37 @@ struct ceph_mds_reply_info_in {
42}; 43};
43 44
44/* 45/*
45 * parsed info about an mds reply, including information about the 46 * parsed info about an mds reply, including information about
46 * target inode and/or its parent directory and dentry, and directory 47 * either: 1) the target inode and/or its parent directory and dentry,
47 * contents (for readdir results). 48 * and directory contents (for readdir results), or
49 * 2) the file range lock info (for fcntl F_GETLK results).
48 */ 50 */
49struct ceph_mds_reply_info_parsed { 51struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head; 52 struct ceph_mds_reply_head *head;
51 53
54 /* trace */
52 struct ceph_mds_reply_info_in diri, targeti; 55 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag; 56 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname; 57 char *dname;
55 u32 dname_len; 58 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease; 59 struct ceph_mds_reply_lease *dlease;
57 60
58 struct ceph_mds_reply_dirfrag *dir_dir; 61 /* extra */
59 int dir_nr; 62 union {
60 char **dir_dname; 63 /* for fcntl F_GETLK results */
61 u32 *dir_dname_len; 64 struct ceph_filelock *filelock_reply;
62 struct ceph_mds_reply_lease **dir_dlease; 65
63 struct ceph_mds_reply_info_in *dir_in; 66 /* for readdir results */
64 u8 dir_complete, dir_end; 67 struct {
68 struct ceph_mds_reply_dirfrag *dir_dir;
69 int dir_nr;
70 char **dir_dname;
71 u32 *dir_dname_len;
72 struct ceph_mds_reply_lease **dir_dlease;
73 struct ceph_mds_reply_info_in *dir_in;
74 u8 dir_complete, dir_end;
75 };
76 };
65 77
66 /* encoded blob describing snapshot contexts for certain 78 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */ 79 operations (e.g., open) */
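
Since a reply carries either lock info or readdir results, never both, the parsed-info struct overlays them in an anonymous union keyed on the op in the reply head. A standalone sketch of the same overlay (field names are stand-ins):

	#include <stdio.h>

	struct reply_info {
		int op;                     /* discriminant (head->op) */
		union {
			struct { long start, length; } filelock;
			struct { int dir_nr; } readdir;
		};
	};

	int main(void)
	{
		struct reply_info r = { .op = 1 };   /* 1 = "getfilelock" */

		r.filelock.start = 100;
		r.filelock.length = 50;

		if (r.op == 1)
			printf("lock at %ld+%ld\n",
			       r.filelock.start, r.filelock.length);
		else
			printf("%d dir entries\n", r.readdir.dir_nr);
		return 0;
	}
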
@@ -154,7 +166,6 @@ struct ceph_mds_request {
154 struct ceph_mds_client *r_mdsc; 166 struct ceph_mds_client *r_mdsc;
155 167
156 int r_op; /* mds op code */ 168 int r_op; /* mds op code */
157 int r_mds;
158 169
159 /* operation on what? */ 170 /* operation on what? */
160 struct inode *r_inode; /* arg1 */ 171 struct inode *r_inode; /* arg1 */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
170 181
171 union ceph_mds_request_args r_args; 182 union ceph_mds_request_args r_args;
172 int r_fmode; /* file mode, if expecting cap */ 183 int r_fmode; /* file mode, if expecting cap */
184 uid_t r_uid;
185 gid_t r_gid;
173 186
174 /* for choosing which mds to send this request to */ 187 /* for choosing which mds to send this request to */
175 int r_direct_mode; 188 int r_direct_mode;
@@ -230,7 +243,7 @@ struct ceph_mds_request {
230 * mds client state 243 * mds client state
231 */ 244 */
232struct ceph_mds_client { 245struct ceph_mds_client {
233 struct ceph_client *client; 246 struct ceph_fs_client *fsc;
234 struct mutex mutex; /* all nested structures */ 247 struct mutex mutex; /* all nested structures */
235 248
236 struct ceph_mdsmap *mdsmap; 249 struct ceph_mdsmap *mdsmap;
@@ -289,11 +302,6 @@ struct ceph_mds_client {
289 int caps_avail_count; /* unused, unreserved */ 302 int caps_avail_count; /* unused, unreserved */
290 int caps_min_count; /* keep at least this many 303 int caps_min_count; /* keep at least this many
291 (unreserved) */ 304 (unreserved) */
292
293#ifdef CONFIG_DEBUG_FS
294 struct dentry *debugfs_file;
295#endif
296
297 spinlock_t dentry_lru_lock; 305 spinlock_t dentry_lru_lock;
298 struct list_head dentry_lru; 306 struct list_head dentry_lru;
299 int num_dentry; 307 int num_dentry;
@@ -316,10 +324,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
316extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, 324extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
317 struct ceph_msg *msg, int mds); 325 struct ceph_msg *msg, int mds);
318 326
319extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, 327extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
320 struct ceph_client *client);
321extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); 328extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
322extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); 329extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
323 330
324extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); 331extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
325 332
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/bug.h> 3#include <linux/bug.h>
4#include <linux/err.h> 4#include <linux/err.h>
@@ -6,9 +6,9 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9#include "mdsmap.h" 9#include <linux/ceph/mdsmap.h>
10#include "messenger.h" 10#include <linux/ceph/messenger.h>
11#include "decode.h" 11#include <linux/ceph/decode.h>
12 12
13#include "super.h" 13#include "super.h"
14 14
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
117 } 117 }
118 118
119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
120 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), 120 i+1, n, global_id, mds, inc,
121 ceph_pr_addr(&addr.in_addr),
121 ceph_mds_state_name(state)); 122 ceph_mds_state_name(state));
122 if (mds >= 0 && mds < m->m_max_mds && state > 0) { 123 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
123 m->m_info[mds].global_id = global_id; 124 m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
deleted file mode 100644
index 4c5cb0880bba..000000000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,62 +0,0 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
 9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 bool laggy;
17 u32 *export_targets;
18};
19
20struct ceph_mdsmap {
21 u32 m_epoch, m_client_epoch, m_last_failure;
22 u32 m_root;
23 u32 m_session_timeout; /* seconds */
24 u32 m_session_autoclose; /* seconds */
25 u64 m_max_file_size;
26 u32 m_max_mds; /* size of m_addr, m_state arrays */
27 struct ceph_mds_info *m_info;
28
29 /* which object pools file data can be stored in */
30 int m_num_data_pg_pools;
31 u32 *m_data_pg_pools;
32 u32 m_cas_pg_pool;
33};
34
35static inline struct ceph_entity_addr *
36ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
37{
38 if (w >= m->m_max_mds)
39 return NULL;
40 return &m->m_info[w].addr;
41}
42
43static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
44{
45 BUG_ON(w < 0);
46 if (w >= m->m_max_mds)
47 return CEPH_MDS_STATE_DNE;
48 return m->m_info[w].state;
49}
50
51static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
52{
53 if (w >= 0 && w < m->m_max_mds)
54 return m->m_info[w].laggy;
55 return false;
56}
57
58extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
59extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
60extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
61
62#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
deleted file mode 100644
index 2502d76fcec1..000000000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2277 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42/*
43 * nicely render a sockaddr as a string.
44 */
45#define MAX_ADDR_STR 20
46#define MAX_ADDR_STR_LEN 60
47static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
48static DEFINE_SPINLOCK(addr_str_lock);
49static int last_addr_str;
50
51const char *pr_addr(const struct sockaddr_storage *ss)
52{
53 int i;
54 char *s;
55 struct sockaddr_in *in4 = (void *)ss;
56 struct sockaddr_in6 *in6 = (void *)ss;
57
58 spin_lock(&addr_str_lock);
59 i = last_addr_str++;
60 if (last_addr_str == MAX_ADDR_STR)
61 last_addr_str = 0;
62 spin_unlock(&addr_str_lock);
63 s = addr_str[i];
64
65 switch (ss->ss_family) {
66 case AF_INET:
67 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
68 (unsigned int)ntohs(in4->sin_port));
69 break;
70
71 case AF_INET6:
72 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
73 (unsigned int)ntohs(in6->sin6_port));
74 break;
75
76 default:
77 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
78 }
79
80 return s;
81}
82
83static void encode_my_addr(struct ceph_messenger *msgr)
84{
85 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
86 ceph_encode_addr(&msgr->my_enc_addr);
87}
88
89/*
90 * work queue for all reading and writing to/from the socket.
91 */
92struct workqueue_struct *ceph_msgr_wq;
93
94int __init ceph_msgr_init(void)
95{
96 ceph_msgr_wq = create_workqueue("ceph-msgr");
97 if (IS_ERR(ceph_msgr_wq)) {
98 int ret = PTR_ERR(ceph_msgr_wq);
99 pr_err("msgr_init failed to create workqueue: %d\n", ret);
100 ceph_msgr_wq = NULL;
101 return ret;
102 }
103 return 0;
104}
105
106void ceph_msgr_exit(void)
107{
108 destroy_workqueue(ceph_msgr_wq);
109}
110
111void ceph_msgr_flush(void)
112{
113 flush_workqueue(ceph_msgr_wq);
114}
115
116
117/*
118 * socket callback functions
119 */
120
121/* data available on socket, or listen socket received a connect */
122static void ceph_data_ready(struct sock *sk, int count_unused)
123{
124 struct ceph_connection *con =
125 (struct ceph_connection *)sk->sk_user_data;
126 if (sk->sk_state != TCP_CLOSE_WAIT) {
127 dout("ceph_data_ready on %p state = %lu, queueing work\n",
128 con, con->state);
129 queue_con(con);
130 }
131}
132
133/* socket has buffer space for writing */
134static void ceph_write_space(struct sock *sk)
135{
136 struct ceph_connection *con =
137 (struct ceph_connection *)sk->sk_user_data;
138
139 /* only queue to workqueue if there is data we want to write. */
140 if (test_bit(WRITE_PENDING, &con->state)) {
141 dout("ceph_write_space %p queueing write work\n", con);
142 queue_con(con);
143 } else {
144 dout("ceph_write_space %p nothing to write\n", con);
145 }
146
147 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
148 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
149}
150
151/* socket's state has changed */
152static void ceph_state_change(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 dout("ceph_state_change %p state = %lu sk_state = %u\n",
158 con, con->state, sk->sk_state);
159
160 if (test_bit(CLOSED, &con->state))
161 return;
162
163 switch (sk->sk_state) {
164 case TCP_CLOSE:
165 dout("ceph_state_change TCP_CLOSE\n");
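		/* fall through */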
166 case TCP_CLOSE_WAIT:
167 dout("ceph_state_change TCP_CLOSE_WAIT\n");
168 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
169 if (test_bit(CONNECTING, &con->state))
170 con->error_msg = "connection failed";
171 else
172 con->error_msg = "socket closed";
173 queue_con(con);
174 }
175 break;
176 case TCP_ESTABLISHED:
177 dout("ceph_state_change TCP_ESTABLISHED\n");
178 queue_con(con);
179 break;
180 }
181}
182
183/*
184 * set up socket callbacks
185 */
186static void set_sock_callbacks(struct socket *sock,
187 struct ceph_connection *con)
188{
189 struct sock *sk = sock->sk;
190 sk->sk_user_data = (void *)con;
191 sk->sk_data_ready = ceph_data_ready;
192 sk->sk_write_space = ceph_write_space;
193 sk->sk_state_change = ceph_state_change;
194}
195
196
197/*
198 * socket helpers
199 */
200
201/*
202 * initiate connection to a remote socket.
203 */
204static struct socket *ceph_tcp_connect(struct ceph_connection *con)
205{
206 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
207 struct socket *sock;
208 int ret;
209
210 BUG_ON(con->sock);
211 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
212 IPPROTO_TCP, &sock);
213 if (ret)
214 return ERR_PTR(ret);
215 con->sock = sock;
216 sock->sk->sk_allocation = GFP_NOFS;
217
218#ifdef CONFIG_LOCKDEP
219 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
220#endif
221
222 set_sock_callbacks(sock, con);
223
224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
225
226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK);
228 if (ret == -EINPROGRESS) {
229 dout("connect %s EINPROGRESS sk_state = %u\n",
230 pr_addr(&con->peer_addr.in_addr),
231 sock->sk->sk_state);
232 ret = 0;
233 }
234 if (ret < 0) {
235 pr_err("connect %s error %d\n",
236 pr_addr(&con->peer_addr.in_addr), ret);
237 sock_release(sock);
238 con->sock = NULL;
239 con->error_msg = "connect error";
240 }
241
242 if (ret < 0)
243 return ERR_PTR(ret);
244 return sock;
245}
246
247static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
248{
249 struct kvec iov = {buf, len};
250 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
251
252 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
253}
254
255/*
256 * write something. @more is true if caller will be sending more data
257 * shortly.
258 */
259static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
260 size_t kvlen, size_t len, int more)
261{
262 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
263
264 if (more)
265 msg.msg_flags |= MSG_MORE;
266 else
267 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
268
269 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
270}
271
272
273/*
274 * Shutdown/close the socket for the given connection.
275 */
276static int con_close_socket(struct ceph_connection *con)
277{
278 int rc;
279
280 dout("con_close_socket on %p sock %p\n", con, con->sock);
281 if (!con->sock)
282 return 0;
283 set_bit(SOCK_CLOSED, &con->state);
284 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
285 sock_release(con->sock);
286 con->sock = NULL;
287 clear_bit(SOCK_CLOSED, &con->state);
288 return rc;
289}
290
291/*
292 * Reset a connection. Discard all incoming and outgoing messages
293 * and clear *_seq state.
294 */
295static void ceph_msg_remove(struct ceph_msg *msg)
296{
297 list_del_init(&msg->list_head);
298 ceph_msg_put(msg);
299}
300static void ceph_msg_remove_list(struct list_head *head)
301{
302 while (!list_empty(head)) {
303 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
304 list_head);
305 ceph_msg_remove(msg);
306 }
307}
308
309static void reset_connection(struct ceph_connection *con)
310{
311 /* reset connection, out_queue, msg_ and connect_seq */
312 /* discard existing out_queue and msg_seq */
313 ceph_msg_remove_list(&con->out_queue);
314 ceph_msg_remove_list(&con->out_sent);
315
316 if (con->in_msg) {
317 ceph_msg_put(con->in_msg);
318 con->in_msg = NULL;
319 }
320
321 con->connect_seq = 0;
322 con->out_seq = 0;
323 if (con->out_msg) {
324 ceph_msg_put(con->out_msg);
325 con->out_msg = NULL;
326 }
327 con->out_keepalive_pending = false;
328 con->in_seq = 0;
329 con->in_seq_acked = 0;
330}
331
332/*
333 * mark a peer down. drop any open connections.
334 */
335void ceph_con_close(struct ceph_connection *con)
336{
337 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
338 set_bit(CLOSED, &con->state); /* in case there's queued work */
339 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
340 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
341 clear_bit(KEEPALIVE_PENDING, &con->state);
342 clear_bit(WRITE_PENDING, &con->state);
343 mutex_lock(&con->mutex);
344 reset_connection(con);
345 con->peer_global_seq = 0;
346 cancel_delayed_work(&con->work);
347 mutex_unlock(&con->mutex);
348 queue_con(con);
349}
350
351/*
352 * Reopen a closed connection, with a new peer address.
353 */
354void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
355{
356 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
357 set_bit(OPENING, &con->state);
358 clear_bit(CLOSED, &con->state);
359 memcpy(&con->peer_addr, addr, sizeof(*addr));
360 con->delay = 0; /* reset backoff memory */
361 queue_con(con);
362}
363
364/*
365 * return true if this connection ever successfully opened
366 */
367bool ceph_con_opened(struct ceph_connection *con)
368{
369 return con->connect_seq > 0;
370}
371
372/*
373 * generic get/put
374 */
375struct ceph_connection *ceph_con_get(struct ceph_connection *con)
376{
377 dout("con_get %p nref = %d -> %d\n", con,
378 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
379 if (atomic_inc_not_zero(&con->nref))
380 return con;
381 return NULL;
382}
383
384void ceph_con_put(struct ceph_connection *con)
385{
386 dout("con_put %p nref = %d -> %d\n", con,
387 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
388 BUG_ON(atomic_read(&con->nref) == 0);
389 if (atomic_dec_and_test(&con->nref)) {
390 BUG_ON(con->sock);
391 kfree(con);
392 }
393}
394
395/*
396 * initialize a new connection.
397 */
398void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
399{
400 dout("con_init %p\n", con);
401 memset(con, 0, sizeof(*con));
402 atomic_set(&con->nref, 1);
403 con->msgr = msgr;
404 mutex_init(&con->mutex);
405 INIT_LIST_HEAD(&con->out_queue);
406 INIT_LIST_HEAD(&con->out_sent);
407 INIT_DELAYED_WORK(&con->work, con_work);
408}
409
410
411/*
412 * We maintain a global counter to order connection attempts. Get
413 * a unique seq greater than @gt.
414 */
415static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
416{
417 u32 ret;
418
419 spin_lock(&msgr->global_seq_lock);
420 if (msgr->global_seq < gt)
421 msgr->global_seq = gt;
422 ret = ++msgr->global_seq;
423 spin_unlock(&msgr->global_seq_lock);
424 return ret;
425}
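/*
 * Illustrative sketch (annotation, not original code): on a
 * CEPH_MSGR_TAG_RETRY_GLOBAL reply the peer reports its global_seq and
 * we retry with a strictly larger one:
 *
 *	u32 peer_gseq = le32_to_cpu(con->in_connect.global_seq);
 *	get_global_seq(con->msgr, peer_gseq);
 *	(the next prepare_write_connect() then picks a seq > peer_gseq)
 *
 * See the RETRY_GLOBAL case in process_connect() below.
 */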
426
427
428/*
429 * Prepare footer for currently outgoing message, and finish things
430 * off.  Assumes out_kvec* are already valid; we just add on to the end.
431 */
432static void prepare_write_message_footer(struct ceph_connection *con, int v)
433{
434 struct ceph_msg *m = con->out_msg;
435
436 dout("prepare_write_message_footer %p\n", con);
437 con->out_kvec_is_msg = true;
438 con->out_kvec[v].iov_base = &m->footer;
439 con->out_kvec[v].iov_len = sizeof(m->footer);
440 con->out_kvec_bytes += sizeof(m->footer);
441 con->out_kvec_left++;
442 con->out_more = m->more_to_follow;
443 con->out_msg_done = true;
444}
445
446/*
447 * Prepare headers for the next outgoing message.
448 */
449static void prepare_write_message(struct ceph_connection *con)
450{
451 struct ceph_msg *m;
452 int v = 0;
453
454 con->out_kvec_bytes = 0;
455 con->out_kvec_is_msg = true;
456 con->out_msg_done = false;
457
458 /* Sneak an ack in there first? If we can get it into the same
459 * TCP packet, that's a good thing. */
460 if (con->in_seq > con->in_seq_acked) {
461 con->in_seq_acked = con->in_seq;
462 con->out_kvec[v].iov_base = &tag_ack;
463 con->out_kvec[v++].iov_len = 1;
464 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
465 con->out_kvec[v].iov_base = &con->out_temp_ack;
466 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
467 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
468 }
469
470 m = list_first_entry(&con->out_queue,
471 struct ceph_msg, list_head);
472 con->out_msg = m;
473 if (test_bit(LOSSYTX, &con->state)) {
474 list_del_init(&m->list_head);
475 } else {
476 /* put message on sent list */
477 ceph_msg_get(m);
478 list_move_tail(&m->list_head, &con->out_sent);
479 }
480
481 /*
482 * only assign outgoing seq # if we haven't sent this message
483 * yet. if it is requeued, resend with its original seq.
484 */
485 if (m->needs_out_seq) {
486 m->hdr.seq = cpu_to_le64(++con->out_seq);
487 m->needs_out_seq = false;
488 }
489
490 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
491 m, con->out_seq, le16_to_cpu(m->hdr.type),
492 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
493 le32_to_cpu(m->hdr.data_len),
494 m->nr_pages);
495 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
496
497 /* tag + hdr + front + middle */
498 con->out_kvec[v].iov_base = &tag_msg;
499 con->out_kvec[v++].iov_len = 1;
500 con->out_kvec[v].iov_base = &m->hdr;
501 con->out_kvec[v++].iov_len = sizeof(m->hdr);
502 con->out_kvec[v++] = m->front;
503 if (m->middle)
504 con->out_kvec[v++] = m->middle->vec;
505 con->out_kvec_left = v;
506 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
507 (m->middle ? m->middle->vec.iov_len : 0);
508 con->out_kvec_cur = con->out_kvec;
509
510 /* fill in crc (except data pages), footer */
511 con->out_msg->hdr.crc =
512 cpu_to_le32(crc32c(0, (void *)&m->hdr,
513 sizeof(m->hdr) - sizeof(m->hdr.crc)));
514 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
515 con->out_msg->footer.front_crc =
516 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
517 if (m->middle)
518 con->out_msg->footer.middle_crc =
519 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
520 m->middle->vec.iov_len));
521 else
522 con->out_msg->footer.middle_crc = 0;
523 con->out_msg->footer.data_crc = 0;
524 dout("prepare_write_message front_crc %u data_crc %u\n",
525 le32_to_cpu(con->out_msg->footer.front_crc),
526 le32_to_cpu(con->out_msg->footer.middle_crc));
527
528 /* is there a data payload? */
529 if (le32_to_cpu(m->hdr.data_len) > 0) {
530 /* initialize page iterator */
531 con->out_msg_pos.page = 0;
532 con->out_msg_pos.page_pos =
533 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
534 con->out_msg_pos.data_pos = 0;
535 con->out_msg_pos.did_page_crc = 0;
536 con->out_more = 1; /* data + footer will follow */
537 } else {
538 /* no, queue up footer too and be done */
539 prepare_write_message_footer(con, v);
540 }
541
542 set_bit(WRITE_PENDING, &con->state);
543}
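/*
 * For reference (reconstructed from the code above, not in the original
 * file), the bytes queued for one outgoing message are:
 *
 *	[tag_ack + le64 ack]   piggybacked ack, if in_seq > in_seq_acked
 *	[tag_msg][ceph_msg_header][front][middle?]
 *	[data pages?][ceph_msg_footer]
 *
 * The footer is added by prepare_write_message_footer(), either
 * immediately (no data payload) or after the data pages have been sent
 * from write_partial_msg_pages().
 */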
544
545/*
546 * Prepare an ack.
547 */
548static void prepare_write_ack(struct ceph_connection *con)
549{
550 dout("prepare_write_ack %p %llu -> %llu\n", con,
551 con->in_seq_acked, con->in_seq);
552 con->in_seq_acked = con->in_seq;
553
554 con->out_kvec[0].iov_base = &tag_ack;
555 con->out_kvec[0].iov_len = 1;
556 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
557 con->out_kvec[1].iov_base = &con->out_temp_ack;
558 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
559 con->out_kvec_left = 2;
560 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
561 con->out_kvec_cur = con->out_kvec;
562 con->out_more = 1; /* more will follow.. eventually.. */
563 set_bit(WRITE_PENDING, &con->state);
564}
565
566/*
567 * Prepare to write keepalive byte.
568 */
569static void prepare_write_keepalive(struct ceph_connection *con)
570{
571 dout("prepare_write_keepalive %p\n", con);
572 con->out_kvec[0].iov_base = &tag_keepalive;
573 con->out_kvec[0].iov_len = 1;
574 con->out_kvec_left = 1;
575 con->out_kvec_bytes = 1;
576 con->out_kvec_cur = con->out_kvec;
577 set_bit(WRITE_PENDING, &con->state);
578}
579
580/*
581 * Connection negotiation.
582 */
583
584static void prepare_connect_authorizer(struct ceph_connection *con)
585{
586 void *auth_buf;
587 int auth_len = 0;
588 int auth_protocol = 0;
589
590 mutex_unlock(&con->mutex);
591 if (con->ops->get_authorizer)
592 con->ops->get_authorizer(con, &auth_buf, &auth_len,
593 &auth_protocol, &con->auth_reply_buf,
594 &con->auth_reply_buf_len,
595 con->auth_retry);
596 mutex_lock(&con->mutex);
597
598 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
599 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
600
601 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
602 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
603 con->out_kvec_left++;
604 con->out_kvec_bytes += auth_len;
605}
606
607/*
608 * We connected to a peer and are saying hello.
609 */
610static void prepare_write_banner(struct ceph_messenger *msgr,
611 struct ceph_connection *con)
612{
613 int len = strlen(CEPH_BANNER);
614
615 con->out_kvec[0].iov_base = CEPH_BANNER;
616 con->out_kvec[0].iov_len = len;
617 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
618 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
619 con->out_kvec_left = 2;
620 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
621 con->out_kvec_cur = con->out_kvec;
622 con->out_more = 0;
623 set_bit(WRITE_PENDING, &con->state);
624}
625
626static void prepare_write_connect(struct ceph_messenger *msgr,
627 struct ceph_connection *con,
628 int after_banner)
629{
630 unsigned global_seq = get_global_seq(con->msgr, 0);
631 int proto;
632
633 switch (con->peer_name.type) {
634 case CEPH_ENTITY_TYPE_MON:
635 proto = CEPH_MONC_PROTOCOL;
636 break;
637 case CEPH_ENTITY_TYPE_OSD:
638 proto = CEPH_OSDC_PROTOCOL;
639 break;
640 case CEPH_ENTITY_TYPE_MDS:
641 proto = CEPH_MDSC_PROTOCOL;
642 break;
643 default:
644 BUG();
645 }
646
647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
648 con->connect_seq, global_seq, proto);
649
650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
653 con->out_connect.global_seq = cpu_to_le32(global_seq);
654 con->out_connect.protocol_version = cpu_to_le32(proto);
655 con->out_connect.flags = 0;
656
657 if (!after_banner) {
658 con->out_kvec_left = 0;
659 con->out_kvec_bytes = 0;
660 }
661 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
662 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
663 con->out_kvec_left++;
664 con->out_kvec_bytes += sizeof(con->out_connect);
665 con->out_kvec_cur = con->out_kvec;
666 con->out_more = 0;
667 set_bit(WRITE_PENDING, &con->state);
668
669 prepare_connect_authorizer(con);
670}
671
672
673/*
674 * write as much of pending kvecs to the socket as we can.
675 * 1 -> done
676 * 0 -> socket full, but more to do
677 * <0 -> error
678 */
679static int write_partial_kvec(struct ceph_connection *con)
680{
681 int ret;
682
683 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
684 while (con->out_kvec_bytes > 0) {
685 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
686 con->out_kvec_left, con->out_kvec_bytes,
687 con->out_more);
688 if (ret <= 0)
689 goto out;
690 con->out_kvec_bytes -= ret;
691 if (con->out_kvec_bytes == 0)
692 break; /* done */
693 while (ret > 0) {
694 if (ret >= con->out_kvec_cur->iov_len) {
695 ret -= con->out_kvec_cur->iov_len;
696 con->out_kvec_cur++;
697 con->out_kvec_left--;
698 } else {
699 con->out_kvec_cur->iov_len -= ret;
700 con->out_kvec_cur->iov_base += ret;
701 ret = 0;
702 break;
703 }
704 }
705 }
706 con->out_kvec_left = 0;
707 con->out_kvec_is_msg = false;
708 ret = 1;
709out:
710 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
711 con->out_kvec_bytes, con->out_kvec_left, ret);
712 return ret; /* 1 = done, 0 = socket full, <0 = error */
713}
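/*
 * Worked example (annotation, not original code): with out_kvec holding
 * 1 + 53 + 200 bytes and a sendmsg() return of 60, the inner loop
 * consumes the first two kvecs (54 bytes) and advances the third by 6,
 * leaving out_kvec_cur pointing at a 194-byte remainder for the next
 * pass (or for the next worker invocation if the socket fills).
 */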
714
715/*
716 * Write as much message data payload as we can. If we finish, queue
717 * up the footer.
718 * 1 -> done, footer is now queued in out_kvec[].
719 * 0 -> socket full, but more to do
720 * <0 -> error
721 */
722static int write_partial_msg_pages(struct ceph_connection *con)
723{
724 struct ceph_msg *msg = con->out_msg;
725 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
726 size_t len;
727 int crc = con->msgr->nocrc;
728 int ret;
729
730 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
731 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
732 con->out_msg_pos.page_pos);
733
734 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
735 struct page *page = NULL;
736 void *kaddr = NULL;
737
738 /*
739 * if we are calculating the data crc (the default), we need
740 * to map the page. if our pages[] has been revoked, use the
741 * zero page.
742 */
743 if (msg->pages) {
744 page = msg->pages[con->out_msg_pos.page];
745 if (crc)
746 kaddr = kmap(page);
747 } else if (msg->pagelist) {
748 page = list_first_entry(&msg->pagelist->head,
749 struct page, lru);
750 if (crc)
751 kaddr = kmap(page);
752 } else {
753 page = con->msgr->zero_page;
754 if (crc)
755 kaddr = page_address(con->msgr->zero_page);
756 }
757 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
758 (int)(data_len - con->out_msg_pos.data_pos));
759 if (crc && !con->out_msg_pos.did_page_crc) {
760 void *base = kaddr + con->out_msg_pos.page_pos;
761 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
762
763 BUG_ON(kaddr == NULL);
764 con->out_msg->footer.data_crc =
765 cpu_to_le32(crc32c(tmpcrc, base, len));
766 con->out_msg_pos.did_page_crc = 1;
767 }
768
769 ret = kernel_sendpage(con->sock, page,
770 con->out_msg_pos.page_pos, len,
771 MSG_DONTWAIT | MSG_NOSIGNAL |
772 MSG_MORE);
773
774 if (crc && (msg->pages || msg->pagelist))
775 kunmap(page);
776
777 if (ret <= 0)
778 goto out;
779
780 con->out_msg_pos.data_pos += ret;
781 con->out_msg_pos.page_pos += ret;
782 if (ret == len) {
783 con->out_msg_pos.page_pos = 0;
784 con->out_msg_pos.page++;
785 con->out_msg_pos.did_page_crc = 0;
786 if (msg->pagelist)
787 list_move_tail(&page->lru,
788 &msg->pagelist->head);
789 }
790 }
791
792 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
793
794 /* prepare and queue up footer, too */
795 if (!crc)
796 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
797 con->out_kvec_bytes = 0;
798 con->out_kvec_left = 0;
799 con->out_kvec_cur = con->out_kvec;
800 prepare_write_message_footer(con, 0);
801 ret = 1;
802out:
803 return ret;
804}
805
806/*
807 * write some zeros
808 */
809static int write_partial_skip(struct ceph_connection *con)
810{
811 int ret;
812
813 while (con->out_skip > 0) {
814 struct kvec iov = {
815 .iov_base = page_address(con->msgr->zero_page),
816 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
817 };
818
819 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
820 if (ret <= 0)
821 goto out;
822 con->out_skip -= ret;
823 }
824 ret = 1;
825out:
826 return ret;
827}
828
829/*
830 * Prepare to read connection handshake, or an ack.
831 */
832static void prepare_read_banner(struct ceph_connection *con)
833{
834 dout("prepare_read_banner %p\n", con);
835 con->in_base_pos = 0;
836}
837
838static void prepare_read_connect(struct ceph_connection *con)
839{
840 dout("prepare_read_connect %p\n", con);
841 con->in_base_pos = 0;
842}
843
844static void prepare_read_ack(struct ceph_connection *con)
845{
846 dout("prepare_read_ack %p\n", con);
847 con->in_base_pos = 0;
848}
849
850static void prepare_read_tag(struct ceph_connection *con)
851{
852 dout("prepare_read_tag %p\n", con);
853 con->in_base_pos = 0;
854 con->in_tag = CEPH_MSGR_TAG_READY;
855}
856
857/*
858 * Prepare to read a message.
859 */
860static int prepare_read_message(struct ceph_connection *con)
861{
862 dout("prepare_read_message %p\n", con);
863 BUG_ON(con->in_msg != NULL);
864 con->in_base_pos = 0;
865 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
866 return 0;
867}
868
869
870static int read_partial(struct ceph_connection *con,
871 int *to, int size, void *object)
872{
873 *to += size;
874 while (con->in_base_pos < *to) {
875 int left = *to - con->in_base_pos;
876 int have = size - left;
877 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
878 if (ret <= 0)
879 return ret;
880 con->in_base_pos += ret;
881 }
882 return 1;
883}
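/*
 * Example (annotation, not original code): read_partial() advances the
 * caller's running offset *to by @size on every call, so chained reads
 * of adjacent objects resume correctly after a short read.  With
 * in_base_pos == 5 and a 10-byte object starting at *to == 0:
 *
 *	*to becomes 10, left = 10 - 5 = 5, have = 5,
 *	and we recv into object + 5.
 *
 * read_partial_banner() below chains three such reads.
 */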
884
885
886/*
887 * Read all or part of the connect-side handshake on a new connection
888 */
889static int read_partial_banner(struct ceph_connection *con)
890{
891 int ret, to = 0;
892
893 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
894
895 /* peer's banner */
896 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
897 if (ret <= 0)
898 goto out;
899 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
900 &con->actual_peer_addr);
901 if (ret <= 0)
902 goto out;
903 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
904 &con->peer_addr_for_me);
905 if (ret <= 0)
906 goto out;
907out:
908 return ret;
909}
910
911static int read_partial_connect(struct ceph_connection *con)
912{
913 int ret, to = 0;
914
915 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
916
917 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
918 if (ret <= 0)
919 goto out;
920 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
921 con->auth_reply_buf);
922 if (ret <= 0)
923 goto out;
924
925 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
926 con, (int)con->in_reply.tag,
927 le32_to_cpu(con->in_reply.connect_seq),
928 le32_to_cpu(con->in_reply.global_seq));
929out:
930 return ret;
931
932}
933
934/*
935 * Verify the hello banner looks okay.
936 */
937static int verify_hello(struct ceph_connection *con)
938{
939 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
940 pr_err("connect to %s got bad banner\n",
941 pr_addr(&con->peer_addr.in_addr));
942 con->error_msg = "protocol error, bad banner";
943 return -1;
944 }
945 return 0;
946}
947
948static bool addr_is_blank(struct sockaddr_storage *ss)
949{
950 switch (ss->ss_family) {
951 case AF_INET:
952 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
953 case AF_INET6:
954 return
955 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
956 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
957 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
958 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
959 }
960 return false;
961}
962
963static int addr_port(struct sockaddr_storage *ss)
964{
965 switch (ss->ss_family) {
966 case AF_INET:
967 return ntohs(((struct sockaddr_in *)ss)->sin_port);
968 case AF_INET6:
969 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
970 }
971 return 0;
972}
973
974static void addr_set_port(struct sockaddr_storage *ss, int p)
975{
976 switch (ss->ss_family) {
977 case AF_INET:
978 ((struct sockaddr_in *)ss)->sin_port = htons(p);
    break; /* don't fall through and also set sin6_port */
979 case AF_INET6:
980 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
981 }
982}
983
984/*
985 * Parse an ip[:port] list into an addr array. Use the default
986 * monitor port if a port isn't specified.
987 */
988int ceph_parse_ips(const char *c, const char *end,
989 struct ceph_entity_addr *addr,
990 int max_count, int *count)
991{
992 int i;
993 const char *p = c;
994
995 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
996 for (i = 0; i < max_count; i++) {
997 const char *ipend;
998 struct sockaddr_storage *ss = &addr[i].in_addr;
999 struct sockaddr_in *in4 = (void *)ss;
1000 struct sockaddr_in6 *in6 = (void *)ss;
1001 int port;
1002 char delim = ',';
1003
1004 if (*p == '[') {
1005 delim = ']';
1006 p++;
1007 }
1008
1009 memset(ss, 0, sizeof(*ss));
1010 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1011 delim, &ipend))
1012 ss->ss_family = AF_INET;
1013 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1014 delim, &ipend))
1015 ss->ss_family = AF_INET6;
1016 else
1017 goto bad;
1018 p = ipend;
1019
1020 if (delim == ']') {
1021 if (*p != ']') {
1022 dout("missing matching ']'\n");
1023 goto bad;
1024 }
1025 p++;
1026 }
1027
1028 /* port? */
1029 if (p < end && *p == ':') {
1030 port = 0;
1031 p++;
1032 while (p < end && *p >= '0' && *p <= '9') {
1033 port = (port * 10) + (*p - '0');
1034 p++;
1035 }
1036 if (port > 65535 || port == 0)
1037 goto bad;
1038 } else {
1039 port = CEPH_MON_PORT;
1040 }
1041
1042 addr_set_port(ss, port);
1043
1044 dout("parse_ips got %s\n", pr_addr(ss));
1045
1046 if (p == end)
1047 break;
1048 if (*p != ',')
1049 goto bad;
1050 p++;
1051 }
1052
1053 if (p != end)
1054 goto bad;
1055
1056 if (count)
1057 *count = i + 1;
1058 return 0;
1059
1060bad:
1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1062 return -EINVAL;
1063}
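/*
 * Usage sketch (illustrative, not part of the original file): parsing a
 * monitor list such as would come from a mount option.  Bracketed IPv6
 * addresses and explicit ports are accepted; entries without a port
 * default to CEPH_MON_PORT:
 *
 *	struct ceph_entity_addr mon_addr[3];
 *	int num_mon;
 *	const char *s = "192.168.0.1:6789,[::1],10.0.0.2";
 *
 *	if (ceph_parse_ips(s, s + strlen(s), mon_addr,
 *			   ARRAY_SIZE(mon_addr), &num_mon) == 0)
 *		dout("parsed %d monitor addrs\n", num_mon);
 */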
1064
1065static int process_banner(struct ceph_connection *con)
1066{
1067 dout("process_banner on %p\n", con);
1068
1069 if (verify_hello(con) < 0)
1070 return -1;
1071
1072 ceph_decode_addr(&con->actual_peer_addr);
1073 ceph_decode_addr(&con->peer_addr_for_me);
1074
1075 /*
1076 * Make sure the other end is who we wanted. note that the other
1077 * end may not yet know their ip address, so if it's 0.0.0.0, give
1078 * them the benefit of the doubt.
1079 */
1080 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1081 sizeof(con->peer_addr)) != 0 &&
1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1085 pr_addr(&con->peer_addr.in_addr),
1086 (int)le32_to_cpu(con->peer_addr.nonce),
1087 pr_addr(&con->actual_peer_addr.in_addr),
1088 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1089 con->error_msg = "wrong peer at address";
1090 return -1;
1091 }
1092
1093 /*
1094 * did we learn our address?
1095 */
1096 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1097 int port = addr_port(&con->msgr->inst.addr.in_addr);
1098
1099 memcpy(&con->msgr->inst.addr.in_addr,
1100 &con->peer_addr_for_me.in_addr,
1101 sizeof(con->peer_addr_for_me.in_addr));
1102 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1103 encode_my_addr(con->msgr);
1104 dout("process_banner learned my addr is %s\n",
1105 pr_addr(&con->msgr->inst.addr.in_addr));
1106 }
1107
1108 set_bit(NEGOTIATING, &con->state);
1109 prepare_read_connect(con);
1110 return 0;
1111}
1112
1113static void fail_protocol(struct ceph_connection *con)
1114{
1115 reset_connection(con);
1116 set_bit(CLOSED, &con->state); /* in case there's queued work */
1117
1118 mutex_unlock(&con->mutex);
1119 if (con->ops->bad_proto)
1120 con->ops->bad_proto(con);
1121 mutex_lock(&con->mutex);
1122}
1123
1124static int process_connect(struct ceph_connection *con)
1125{
1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1127 u64 req_feat = CEPH_FEATURE_REQUIRED;
1128 u64 server_feat = le64_to_cpu(con->in_reply.features);
1129
1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1131
1132 switch (con->in_reply.tag) {
1133 case CEPH_MSGR_TAG_FEATURES:
1134 pr_err("%s%lld %s feature set mismatch,"
1135 " my %llx < server's %llx, missing %llx\n",
1136 ENTITY_NAME(con->peer_name),
1137 pr_addr(&con->peer_addr.in_addr),
1138 sup_feat, server_feat, server_feat & ~sup_feat);
1139 con->error_msg = "missing required protocol features";
1140 fail_protocol(con);
1141 return -1;
1142
1143 case CEPH_MSGR_TAG_BADPROTOVER:
1144 pr_err("%s%lld %s protocol version mismatch,"
1145 " my %d != server's %d\n",
1146 ENTITY_NAME(con->peer_name),
1147 pr_addr(&con->peer_addr.in_addr),
1148 le32_to_cpu(con->out_connect.protocol_version),
1149 le32_to_cpu(con->in_reply.protocol_version));
1150 con->error_msg = "protocol version mismatch";
1151 fail_protocol(con);
1152 return -1;
1153
1154 case CEPH_MSGR_TAG_BADAUTHORIZER:
1155 con->auth_retry++;
1156 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1157 con->auth_retry);
1158 if (con->auth_retry == 2) {
1159 con->error_msg = "connect authorization failure";
1160 reset_connection(con);
1161 set_bit(CLOSED, &con->state);
1162 return -1;
1163 }
1164 con->auth_retry = 1;
1165 prepare_write_connect(con->msgr, con, 0);
1166 prepare_read_connect(con);
1167 break;
1168
1169 case CEPH_MSGR_TAG_RESETSESSION:
1170 /*
1171 * If we connected with a large connect_seq but the peer
1172 * has no record of a session with us (no connection, or
1173 * connect_seq == 0), they will send RESETSESSION to indicate
1174 * that they must have reset their session, and may have
1175 * dropped messages.
1176 */
1177 dout("process_connect got RESET peer seq %u\n",
1178 le32_to_cpu(con->in_connect.connect_seq));
1179 pr_err("%s%lld %s connection reset\n",
1180 ENTITY_NAME(con->peer_name),
1181 pr_addr(&con->peer_addr.in_addr));
1182 reset_connection(con);
1183 prepare_write_connect(con->msgr, con, 0);
1184 prepare_read_connect(con);
1185
1186 /* Tell ceph about it. */
1187 mutex_unlock(&con->mutex);
1188 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1189 if (con->ops->peer_reset)
1190 con->ops->peer_reset(con);
1191 mutex_lock(&con->mutex);
1192 break;
1193
1194 case CEPH_MSGR_TAG_RETRY_SESSION:
1195 /*
1196 * If we sent a smaller connect_seq than the peer has, try
1197 * again with a larger value.
1198 */
1199 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1200 le32_to_cpu(con->out_connect.connect_seq),
1201 le32_to_cpu(con->in_connect.connect_seq));
1202 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1203 prepare_write_connect(con->msgr, con, 0);
1204 prepare_read_connect(con);
1205 break;
1206
1207 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1208 /*
1209 * If we sent a smaller global_seq than the peer has, try
1210 * again with a larger value.
1211 */
1212 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1213 con->peer_global_seq,
1214 le32_to_cpu(con->in_connect.global_seq));
1215 get_global_seq(con->msgr,
1216 le32_to_cpu(con->in_connect.global_seq));
1217 prepare_write_connect(con->msgr, con, 0);
1218 prepare_read_connect(con);
1219 break;
1220
1221 case CEPH_MSGR_TAG_READY:
1222 if (req_feat & ~server_feat) {
1223 pr_err("%s%lld %s protocol feature mismatch,"
1224 " my required %llx > server's %llx, need %llx\n",
1225 ENTITY_NAME(con->peer_name),
1226 pr_addr(&con->peer_addr.in_addr),
1227 req_feat, server_feat, req_feat & ~server_feat);
1228 con->error_msg = "missing required protocol features";
1229 fail_protocol(con);
1230 return -1;
1231 }
1232 clear_bit(CONNECTING, &con->state);
1233 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1234 con->connect_seq++;
1235 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq),
1239 con->connect_seq);
1240 WARN_ON(con->connect_seq !=
1241 le32_to_cpu(con->in_reply.connect_seq));
1242
1243 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1244 set_bit(LOSSYTX, &con->state);
1245
1246 prepare_read_tag(con);
1247 break;
1248
1249 case CEPH_MSGR_TAG_WAIT:
1250 /*
1251 * If there is a connection race (we are opening
1252 * connections to each other), one of us may just have
1253 * to WAIT. This shouldn't happen if we are the
1254 * client.
1255 */
1256 pr_err("process_connect peer connecting WAIT\n");
1257
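		/* fall through */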
1258 default:
1259 pr_err("connect protocol error, will retry\n");
1260 con->error_msg = "protocol error, garbage tag during connect";
1261 return -1;
1262 }
1263 return 0;
1264}
1265
1266
1267/*
1268 * read (part of) an ack
1269 */
1270static int read_partial_ack(struct ceph_connection *con)
1271{
1272 int to = 0;
1273
1274 return read_partial(con, &to, sizeof(con->in_temp_ack),
1275 &con->in_temp_ack);
1276}
1277
1278
1279/*
1280 * We can finally discard anything that's been acked.
1281 */
1282static void process_ack(struct ceph_connection *con)
1283{
1284 struct ceph_msg *m;
1285 u64 ack = le64_to_cpu(con->in_temp_ack);
1286 u64 seq;
1287
1288 while (!list_empty(&con->out_sent)) {
1289 m = list_first_entry(&con->out_sent, struct ceph_msg,
1290 list_head);
1291 seq = le64_to_cpu(m->hdr.seq);
1292 if (seq > ack)
1293 break;
1294 dout("got ack for seq %llu type %d at %p\n", seq,
1295 le16_to_cpu(m->hdr.type), m);
1296 ceph_msg_remove(m);
1297 }
1298 prepare_read_tag(con);
1299}
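/*
 * Worked example (annotation, not original code): with out_sent holding
 * messages seq 4, 5, 6 and an ack of 5, the loop above drops 4 and 5
 * and stops at 6; only messages with seq <= ack are known to have been
 * received by the peer and can be discarded.
 */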
1300
1301
1302
1303
1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section,
1306 unsigned int sec_len, u32 *crc)
1307{
1308 int left;
1309 int ret;
1310
1311 BUG_ON(!section);
1312
1313 while (section->iov_len < sec_len) {
1314 BUG_ON(section->iov_base == NULL);
1315 left = sec_len - section->iov_len;
1316 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1317 section->iov_len, left);
1318 if (ret <= 0)
1319 return ret;
1320 section->iov_len += ret;
1321 if (section->iov_len == sec_len)
1322 *crc = crc32c(0, section->iov_base,
1323 section->iov_len);
1324 }
1325
1326 return 1;
1327}
1328
1329static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1330 struct ceph_msg_header *hdr,
1331 int *skip);
1332/*
1333 * read (part of) a message.
1334 */
1335static int read_partial_message(struct ceph_connection *con)
1336{
1337 struct ceph_msg *m = con->in_msg;
1338 void *p;
1339 int ret;
1340 int to, left;
1341 unsigned front_len, middle_len, data_len, data_off;
1342 int datacrc = con->msgr->nocrc;
1343 int skip;
1344 u64 seq;
1345
1346 dout("read_partial_message con %p msg %p\n", con, m);
1347
1348 /* header */
1349 while (con->in_base_pos < sizeof(con->in_hdr)) {
1350 left = sizeof(con->in_hdr) - con->in_base_pos;
1351 ret = ceph_tcp_recvmsg(con->sock,
1352 (char *)&con->in_hdr + con->in_base_pos,
1353 left);
1354 if (ret <= 0)
1355 return ret;
1356 con->in_base_pos += ret;
1357 if (con->in_base_pos == sizeof(con->in_hdr)) {
1358 u32 crc = crc32c(0, (void *)&con->in_hdr,
1359 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1360 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1361 pr_err("read_partial_message bad hdr"
1362 " crc %u != expected %u\n",
1363 crc, le32_to_cpu(con->in_hdr.crc));
1364 return -EBADMSG;
1365 }
1366 }
1367 }
1368 front_len = le32_to_cpu(con->in_hdr.front_len);
1369 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1370 return -EIO;
1371 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1372 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1373 return -EIO;
1374 data_len = le32_to_cpu(con->in_hdr.data_len);
1375 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1376 return -EIO;
1377 data_off = le16_to_cpu(con->in_hdr.data_off);
1378
1379 /* verify seq# */
1380 seq = le64_to_cpu(con->in_hdr.seq);
1381 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer);
1388 con->in_tag = CEPH_MSGR_TAG_READY;
1389 con->in_seq++;
1390 return 0;
1391 } else if ((s64)seq - (s64)con->in_seq > 1) {
1392 pr_err("read_partial_message bad seq %lld expected %lld\n",
1393 seq, con->in_seq + 1);
1394 con->error_msg = "bad message sequence # for incoming message";
1395 return -EBADMSG;
1396 }
1397
1398 /* allocate message? */
1399 if (!con->in_msg) {
1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 skip = 0;
1403 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1404 if (skip) {
1405 /* skip this message */
1406 dout("alloc_msg said skip message\n");
1407 BUG_ON(con->in_msg);
1408 con->in_base_pos = -front_len - middle_len - data_len -
1409 sizeof(m->footer);
1410 con->in_tag = CEPH_MSGR_TAG_READY;
1411 con->in_seq++;
1412 return 0;
1413 }
1414 if (!con->in_msg) {
1415 con->error_msg =
1416 "error allocating memory for incoming message";
1417 return -ENOMEM;
1418 }
1419 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */
1421 if (m->middle)
1422 m->middle->vec.iov_len = 0;
1423
1424 con->in_msg_pos.page = 0;
1425 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1426 con->in_msg_pos.data_pos = 0;
1427 }
1428
1429 /* front */
1430 ret = read_partial_message_section(con, &m->front, front_len,
1431 &con->in_front_crc);
1432 if (ret <= 0)
1433 return ret;
1434
1435 /* middle */
1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec,
1438 middle_len,
1439 &con->in_middle_crc);
1440 if (ret <= 0)
1441 return ret;
1442 }
1443
1444 /* (page) data */
1445 while (con->in_msg_pos.data_pos < data_len) {
1446 left = min((int)(data_len - con->in_msg_pos.data_pos),
1447 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1448 BUG_ON(m->pages == NULL);
1449 p = kmap(m->pages[con->in_msg_pos.page]);
1450 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1451 left);
1452 if (ret > 0 && datacrc)
1453 con->in_data_crc =
1454 crc32c(con->in_data_crc,
1455 p + con->in_msg_pos.page_pos, ret);
1456 kunmap(m->pages[con->in_msg_pos.page]);
1457 if (ret <= 0)
1458 return ret;
1459 con->in_msg_pos.data_pos += ret;
1460 con->in_msg_pos.page_pos += ret;
1461 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1462 con->in_msg_pos.page_pos = 0;
1463 con->in_msg_pos.page++;
1464 }
1465 }
1466
1467 /* footer */
1468 to = sizeof(m->hdr) + sizeof(m->footer);
1469 while (con->in_base_pos < to) {
1470 left = to - con->in_base_pos;
1471 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1472 (con->in_base_pos - sizeof(m->hdr)),
1473 left);
1474 if (ret <= 0)
1475 return ret;
1476 con->in_base_pos += ret;
1477 }
1478 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1479 m, front_len, m->footer.front_crc, middle_len,
1480 m->footer.middle_crc, data_len, m->footer.data_crc);
1481
1482 /* crc ok? */
1483 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1484 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1485 m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
1486 return -EBADMSG;
1487 }
1488 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1489 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1490 m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
1491 return -EBADMSG;
1492 }
1493 if (datacrc &&
1494 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1495 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1496 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1497 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1498 return -EBADMSG;
1499 }
1500
1501 return 1; /* done! */
1502}
1503
1504/*
1505 * Process message. This happens in the worker thread. The callback should
1506 * be careful not to do anything that waits on other incoming messages or it
1507 * may deadlock.
1508 */
1509static void process_message(struct ceph_connection *con)
1510{
1511 struct ceph_msg *msg;
1512
1513 msg = con->in_msg;
1514 con->in_msg = NULL;
1515
1516 /* if first message, set peer_name */
1517 if (con->peer_name.type == 0)
1518 con->peer_name = msg->hdr.src;
1519
1520 con->in_seq++;
1521 mutex_unlock(&con->mutex);
1522
1523 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1524 msg, le64_to_cpu(msg->hdr.seq),
1525 ENTITY_NAME(msg->hdr.src),
1526 le16_to_cpu(msg->hdr.type),
1527 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1528 le32_to_cpu(msg->hdr.front_len),
1529 le32_to_cpu(msg->hdr.data_len),
1530 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1531 con->ops->dispatch(con, msg);
1532
1533 mutex_lock(&con->mutex);
1534 prepare_read_tag(con);
1535}
1536
1537
1538/*
1539 * Write something to the socket. Called in a worker thread when the
1540 * socket appears to be writeable and we have something ready to send.
1541 */
1542static int try_write(struct ceph_connection *con)
1543{
1544 struct ceph_messenger *msgr = con->msgr;
1545 int ret = 1;
1546
1547 dout("try_write start %p state %lu nref %d\n", con, con->state,
1548 atomic_read(&con->nref));
1549
1550more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552
1553 /* open the socket first? */
1554 if (con->sock == NULL) {
1555 /*
1556 * if we were STANDBY and are reconnecting _this_
1557 * connection, bump connect_seq now. Always bump
1558 * global_seq.
1559 */
1560 if (test_and_clear_bit(STANDBY, &con->state))
1561 con->connect_seq++;
1562
1563 prepare_write_banner(msgr, con);
1564 prepare_write_connect(msgr, con, 1);
1565 prepare_read_banner(con);
1566 set_bit(CONNECTING, &con->state);
1567 clear_bit(NEGOTIATING, &con->state);
1568
1569 BUG_ON(con->in_msg);
1570 con->in_tag = CEPH_MSGR_TAG_READY;
1571 dout("try_write initiating connect on %p new state %lu\n",
1572 con, con->state);
1573 con->sock = ceph_tcp_connect(con);
1574 if (IS_ERR(con->sock)) {
1575 con->sock = NULL;
1576 con->error_msg = "connect error";
1577 ret = -1;
1578 goto out;
1579 }
1580 }
1581
1582more_kvec:
1583 /* kvec data queued? */
1584 if (con->out_skip) {
1585 ret = write_partial_skip(con);
1586 if (ret < 0)
1587 dout("try_write write_partial_skip err %d\n", ret);
1588 if (ret <= 0)
1589 goto done;
1590 }
1593 if (con->out_kvec_left) {
1594 ret = write_partial_kvec(con);
1595 if (ret <= 0)
1596 goto done;
1597 }
1598
1599 /* msg pages? */
1600 if (con->out_msg) {
1601 if (con->out_msg_done) {
1602 ceph_msg_put(con->out_msg);
1603 con->out_msg = NULL; /* we're done with this one */
1604 goto do_next;
1605 }
1606
1607 ret = write_partial_msg_pages(con);
1608 if (ret == 1)
1609 goto more_kvec; /* we need to send the footer, too! */
1610 if (ret == 0)
1611 goto done;
1612 if (ret < 0) {
1613 dout("try_write write_partial_msg_pages err %d\n",
1614 ret);
1615 goto done;
1616 }
1617 }
1618
1619do_next:
1620 if (!test_bit(CONNECTING, &con->state)) {
1621 /* is anything else pending? */
1622 if (!list_empty(&con->out_queue)) {
1623 prepare_write_message(con);
1624 goto more;
1625 }
1626 if (con->in_seq > con->in_seq_acked) {
1627 prepare_write_ack(con);
1628 goto more;
1629 }
1630 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1631 prepare_write_keepalive(con);
1632 goto more;
1633 }
1634 }
1635
1636 /* Nothing to do! */
1637 clear_bit(WRITE_PENDING, &con->state);
1638 dout("try_write nothing else to write.\n");
1639done:
1640 ret = 0;
1641out:
1642 dout("try_write done on %p\n", con);
1643 return ret;
1644}
1645
1646
1647
1648/*
1649 * Read what we can from the socket.
1650 */
1651static int try_read(struct ceph_connection *con)
1652{
1653 int ret = -1;
1654
1655 if (!con->sock)
1656 return 0;
1657
1658 if (test_bit(STANDBY, &con->state))
1659 return 0;
1660
1661 dout("try_read start on %p\n", con);
1662
1663more:
1664 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1665 con->in_base_pos);
1666 if (test_bit(CONNECTING, &con->state)) {
1667 if (!test_bit(NEGOTIATING, &con->state)) {
1668 dout("try_read connecting\n");
1669 ret = read_partial_banner(con);
1670 if (ret <= 0)
1671 goto done;
1672 if (process_banner(con) < 0) {
1673 ret = -1;
1674 goto out;
1675 }
1676 }
1677 ret = read_partial_connect(con);
1678 if (ret <= 0)
1679 goto done;
1680 if (process_connect(con) < 0) {
1681 ret = -1;
1682 goto out;
1683 }
1684 goto more;
1685 }
1686
1687 if (con->in_base_pos < 0) {
1688 /*
1689 * skipping + discarding content.
1690 *
1691 * FIXME: there must be a better way to do this!
1692 */
1693 static char buf[1024];
1694 int skip = min(1024, -con->in_base_pos);
1695 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1696 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1697 if (ret <= 0)
1698 goto done;
1699 con->in_base_pos += ret;
1700 if (con->in_base_pos)
1701 goto more;
1702 }
1703 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1704 /*
1705 * what's next?
1706 */
1707 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1708 if (ret <= 0)
1709 goto done;
1710 dout("try_read got tag %d\n", (int)con->in_tag);
1711 switch (con->in_tag) {
1712 case CEPH_MSGR_TAG_MSG:
1713 prepare_read_message(con);
1714 break;
1715 case CEPH_MSGR_TAG_ACK:
1716 prepare_read_ack(con);
1717 break;
1718 case CEPH_MSGR_TAG_CLOSE:
1719 set_bit(CLOSED, &con->state); /* fixme */
1720 goto done;
1721 default:
1722 goto bad_tag;
1723 }
1724 }
1725 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1726 ret = read_partial_message(con);
1727 if (ret <= 0) {
1728 switch (ret) {
1729 case -EBADMSG:
1730 con->error_msg = "bad crc";
1731 ret = -EIO;
1732 goto out;
1733 case -EIO:
1734 con->error_msg = "io error";
1735 goto out;
1736 default:
1737 goto done;
1738 }
1739 }
1740 if (con->in_tag == CEPH_MSGR_TAG_READY)
1741 goto more;
1742 process_message(con);
1743 goto more;
1744 }
1745 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1746 ret = read_partial_ack(con);
1747 if (ret <= 0)
1748 goto done;
1749 process_ack(con);
1750 goto more;
1751 }
1752
1753done:
1754 ret = 0;
1755out:
1756 dout("try_read done on %p\n", con);
1757 return ret;
1758
1759bad_tag:
1760 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1761 con->error_msg = "protocol error, garbage tag";
1762 ret = -1;
1763 goto out;
1764}
1765
1766
1767/*
1768 * Atomically queue work on a connection. Bump @con reference to
1769 * avoid races with connection teardown.
1770 *
1771 * There is some trickery going on with QUEUED and BUSY because we
1772 * only want a _single_ thread operating on each connection at any
1773 * point in time, but we want to use all available CPUs.
1774 *
1775 * The worker thread only proceeds if it can atomically set BUSY. It
1776 * clears QUEUED and does its thing. When it thinks it's done, it
1777 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1778 * (tries again to set BUSY).
1779 *
1780 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1781 * try to queue work. If that fails (the work is already queued, or
1782 * BUSY is set, meaning it is already being done), we give up but leave QUEUED
1783 * set so that the worker thread will loop if necessary.
1784 */
1785static void queue_con(struct ceph_connection *con)
1786{
1787 if (test_bit(DEAD, &con->state)) {
1788 dout("queue_con %p ignoring: DEAD\n",
1789 con);
1790 return;
1791 }
1792
1793 if (!con->ops->get(con)) {
1794 dout("queue_con %p ref count 0\n", con);
1795 return;
1796 }
1797
1798 set_bit(QUEUED, &con->state);
1799 if (test_bit(BUSY, &con->state)) {
1800 dout("queue_con %p - already BUSY\n", con);
1801 con->ops->put(con);
1802 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1803 dout("queue_con %p - already queued\n", con);
1804 con->ops->put(con);
1805 } else {
1806 dout("queue_con %p\n", con);
1807 }
1808}
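/*
 * Illustrative trace (annotation, not original code) of the QUEUED/BUSY
 * handshake described above, with two CPUs racing:
 *
 *	queue_con()                     con_work()
 *	set_bit(QUEUED)
 *	queue_work() succeeds
 *	                                test_and_set_bit(BUSY) == 0
 *	                                clear_bit(QUEUED), does work
 *	set_bit(QUEUED)
 *	test_bit(BUSY) -> give up
 *	                                clear_bit(BUSY)
 *	                                QUEUED still set -> loops
 *
 * The second wakeup is never lost: either the worker loops on QUEUED,
 * or a later queue_work() succeeds once the work item is idle.
 */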
1809
1810/*
1811 * Do some work on a connection. Drop a connection ref when we're done.
1812 */
1813static void con_work(struct work_struct *work)
1814{
1815 struct ceph_connection *con = container_of(work, struct ceph_connection,
1816 work.work);
1817 int backoff = 0;
1818
1819more:
1820 if (test_and_set_bit(BUSY, &con->state) != 0) {
1821 dout("con_work %p BUSY already set\n", con);
1822 goto out;
1823 }
1824 dout("con_work %p start, clearing QUEUED\n", con);
1825 clear_bit(QUEUED, &con->state);
1826
1827 mutex_lock(&con->mutex);
1828
1829 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1830 dout("con_work CLOSED\n");
1831 con_close_socket(con);
1832 goto done;
1833 }
1834 if (test_and_clear_bit(OPENING, &con->state)) {
1835 /* reopen w/ new peer */
1836 dout("con_work OPENING\n");
1837 con_close_socket(con);
1838 }
1839
1840 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1841 try_read(con) < 0 ||
1842 try_write(con) < 0) {
1843 mutex_unlock(&con->mutex);
1844 backoff = 1;
1845 ceph_fault(con); /* error/fault path */
1846 goto done_unlocked;
1847 }
1848
1849done:
1850 mutex_unlock(&con->mutex);
1851
1852done_unlocked:
1853 clear_bit(BUSY, &con->state);
1854 dout("con->state=%lu\n", con->state);
1855 if (test_bit(QUEUED, &con->state)) {
1856 if (!backoff || test_bit(OPENING, &con->state)) {
1857 dout("con_work %p QUEUED reset, looping\n", con);
1858 goto more;
1859 }
1860 dout("con_work %p QUEUED reset, but just faulted\n", con);
1861 clear_bit(QUEUED, &con->state);
1862 }
1863 dout("con_work %p done\n", con);
1864
1865out:
1866 con->ops->put(con);
1867}
1868
1869
1870/*
1871 * Generic error/fault handler. A retry mechanism is used with
1872 * exponential backoff
1873 */
1874static void ceph_fault(struct ceph_connection *con)
1875{
1876 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1877 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1878 dout("fault %p state %lu to peer %s\n",
1879 con, con->state, pr_addr(&con->peer_addr.in_addr));
1880
1881 if (test_bit(LOSSYTX, &con->state)) {
1882 dout("fault on LOSSYTX channel\n");
1883 goto out;
1884 }
1885
1886 mutex_lock(&con->mutex);
1887 if (test_bit(CLOSED, &con->state))
1888 goto out_unlock;
1889
1890 con_close_socket(con);
1891
1892 if (con->in_msg) {
1893 ceph_msg_put(con->in_msg);
1894 con->in_msg = NULL;
1895 }
1896
1897 /* Requeue anything that hasn't been acked */
1898 list_splice_init(&con->out_sent, &con->out_queue);
1899
1900 /* If there are no messages in the queue, place the connection
1901 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1902 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1903 dout("fault setting STANDBY\n");
1904 set_bit(STANDBY, &con->state);
1905 } else {
1906 /* retry after a delay. */
1907 if (con->delay == 0)
1908 con->delay = BASE_DELAY_INTERVAL;
1909 else if (con->delay < MAX_DELAY_INTERVAL)
1910 con->delay *= 2;
1911 dout("fault queueing %p delay %lu\n", con, con->delay);
1912 con->ops->get(con);
1913 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1914 round_jiffies_relative(con->delay)) == 0)
1915 con->ops->put(con);
1916 }
1917
1918out_unlock:
1919 mutex_unlock(&con->mutex);
1920out:
1921 /*
1922 * in case we faulted due to authentication, invalidate our
1923 * current tickets so that we can get new ones.
1924 */
1925 if (con->auth_retry && con->ops->invalidate_authorizer) {
1926 dout("calling invalidate_authorizer()\n");
1927 con->ops->invalidate_authorizer(con);
1928 }
1929
1930 if (con->ops->fault)
1931 con->ops->fault(con);
1932}
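/*
 * Note (annotation, not original code): consecutive faults double
 * con->delay from BASE_DELAY_INTERVAL up to MAX_DELAY_INTERVAL, giving
 * the exponential backoff mentioned above; ceph_con_open() resets
 * con->delay to 0, so a fresh open starts the sequence over.
 */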
1933
1934
1935
1936/*
1937 * create a new messenger instance
1938 */
1939struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1940{
1941 struct ceph_messenger *msgr;
1942
1943 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1944 if (msgr == NULL)
1945 return ERR_PTR(-ENOMEM);
1946
1947 spin_lock_init(&msgr->global_seq_lock);
1948
1949 /* the zero page is needed if a request is "canceled" while the message
1950 * is being written over the socket */
1951 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1952 if (!msgr->zero_page) {
1953 kfree(msgr);
1954 return ERR_PTR(-ENOMEM);
1955 }
1956 kmap(msgr->zero_page);
1957
1958 if (myaddr)
1959 msgr->inst.addr = *myaddr;
1960
1961 /* select a random nonce */
1962 msgr->inst.addr.type = 0;
1963 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1964 encode_my_addr(msgr);
1965
1966 dout("messenger_create %p\n", msgr);
1967 return msgr;
1968}
1969
1970void ceph_messenger_destroy(struct ceph_messenger *msgr)
1971{
1972 dout("destroy %p\n", msgr);
1973 kunmap(msgr->zero_page);
1974 __free_page(msgr->zero_page);
1975 kfree(msgr);
1976 dout("destroyed messenger %p\n", msgr);
1977}
1978
1979/*
1980 * Queue up an outgoing message on the given connection.
1981 */
1982void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1983{
1984 if (test_bit(CLOSED, &con->state)) {
1985 dout("con_send %p closed, dropping %p\n", con, msg);
1986 ceph_msg_put(msg);
1987 return;
1988 }
1989
1990 /* set src+dst */
1991 msg->hdr.src = con->msgr->inst.name;
1992
1993 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1994
1995 msg->needs_out_seq = true;
1996
1997 /* queue */
1998 mutex_lock(&con->mutex);
1999 BUG_ON(!list_empty(&msg->list_head));
2000 list_add_tail(&msg->list_head, &con->out_queue);
2001 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2002 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2003 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2004 le32_to_cpu(msg->hdr.front_len),
2005 le32_to_cpu(msg->hdr.middle_len),
2006 le32_to_cpu(msg->hdr.data_len));
2007 mutex_unlock(&con->mutex);
2008
2009 /* if there wasn't anything waiting to send before, queue
2010 * new work */
2011 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2012 queue_con(con);
2013}
2014
2015/*
2016 * Revoke a message that was previously queued for send
2017 */
2018void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2019{
2020 mutex_lock(&con->mutex);
2021 if (!list_empty(&msg->list_head)) {
2022 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2023 list_del_init(&msg->list_head);
2024 ceph_msg_put(msg);
2025 msg->hdr.seq = 0;
2026 }
2027 if (con->out_msg == msg) {
2028 dout("con_revoke %p msg %p - was sending\n", con, msg);
2029 con->out_msg = NULL;
2030 if (con->out_kvec_is_msg) {
2031 con->out_skip = con->out_kvec_bytes;
2032 con->out_kvec_is_msg = false;
2033 }
2034 ceph_msg_put(msg);
2035 msg->hdr.seq = 0;
2036 }
2037 mutex_unlock(&con->mutex);
2038}
2039
2040/*
2041 * Revoke a message that we may be reading data into
2042 */
2043void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2044{
2045 mutex_lock(&con->mutex);
2046 if (con->in_msg && con->in_msg == msg) {
2047 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2048 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2049 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2050
2051 /* skip rest of message */
2052 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2053 con->in_base_pos = con->in_base_pos -
2054 sizeof(struct ceph_msg_header) -
2055 front_len -
2056 middle_len -
2057 data_len -
2058 sizeof(struct ceph_msg_footer);
2059 ceph_msg_put(con->in_msg);
2060 con->in_msg = NULL;
2061 con->in_tag = CEPH_MSGR_TAG_READY;
2062 con->in_seq++;
2063 } else {
2064 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2065 con, con->in_msg, msg);
2066 }
2067 mutex_unlock(&con->mutex);
2068}
2069
2070/*
2071 * Queue a keepalive byte to ensure the tcp connection is alive.
2072 */
2073void ceph_con_keepalive(struct ceph_connection *con)
2074{
2075 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2076 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2077 queue_con(con);
2078}
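/*
 * Lifecycle sketch (illustrative, not part of the original file): a
 * typical caller embeds a ceph_connection and drives it with:
 *
 *	ceph_con_init(msgr, con);	refcount 1, queues initialized
 *	ceph_con_open(con, &peer_addr);	async connect via the workqueue
 *	ceph_con_send(con, msg);	queue a message (consumes msg ref)
 *	ceph_con_keepalive(con);	schedule a keepalive byte
 *	ceph_con_close(con);		drop the session
 *
 * All socket I/O happens later in con_work(); these entry points only
 * flip state bits and queue work.
 */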
2079
2080
2081/*
2082 * construct a new message with given type, size
2083 * the new msg has a ref count of 1.
2084 */
2085struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2086{
2087 struct ceph_msg *m;
2088
2089 m = kmalloc(sizeof(*m), flags);
2090 if (m == NULL)
2091 goto out;
2092 kref_init(&m->kref);
2093 INIT_LIST_HEAD(&m->list_head);
2094
2095 m->hdr.tid = 0;
2096 m->hdr.type = cpu_to_le16(type);
2097 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2098 m->hdr.version = 0;
2099 m->hdr.front_len = cpu_to_le32(front_len);
2100 m->hdr.middle_len = 0;
2101 m->hdr.data_len = 0;
2102 m->hdr.data_off = 0;
2103 m->hdr.reserved = 0;
2104 m->footer.front_crc = 0;
2105 m->footer.middle_crc = 0;
2106 m->footer.data_crc = 0;
2107 m->footer.flags = 0;
2108 m->front_max = front_len;
2109 m->front_is_vmalloc = false;
2110 m->more_to_follow = false;
2111 m->pool = NULL;
2112
2113 /* front */
2114 if (front_len) {
2115 if (front_len > PAGE_CACHE_SIZE) {
2116 m->front.iov_base = __vmalloc(front_len, flags,
2117 PAGE_KERNEL);
2118 m->front_is_vmalloc = true;
2119 } else {
2120 m->front.iov_base = kmalloc(front_len, flags);
2121 }
2122 if (m->front.iov_base == NULL) {
2123 pr_err("msg_new can't allocate %d bytes\n",
2124 front_len);
2125 goto out2;
2126 }
2127 } else {
2128 m->front.iov_base = NULL;
2129 }
2130 m->front.iov_len = front_len;
2131
2132 /* middle */
2133 m->middle = NULL;
2134
2135 /* data */
2136 m->nr_pages = 0;
2137 m->pages = NULL;
2138 m->pagelist = NULL;
2139
2140 dout("ceph_msg_new %p front %d\n", m, front_len);
2141 return m;
2142
2143out2:
2144 ceph_msg_put(m);
2145out:
2146 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2147 return NULL;
2148}
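
/*
 * Usage sketch (illustrative, not part of the original file): a
 * typical sender pairs ceph_msg_new() with ceph_con_send() above;
 * the connection consumes the caller's reference once the message is
 * queued.  The message type and payload here are placeholders.
 */
static int __maybe_unused example_send(struct ceph_connection *con,
				       int type, void *payload, int len)
{
	struct ceph_msg *msg = ceph_msg_new(type, len, GFP_NOFS);

	if (!msg)
		return -ENOMEM;
	memcpy(msg->front.iov_base, payload, len);
	ceph_con_send(con, msg);	/* con now owns our ref */
	return 0;
}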
2149
2150/*
2151 * Allocate "middle" portion of a message, if it is needed and wasn't
2152 * allocated by alloc_msg. This allows us to read a small fixed-size
2153 * per-type header in the front and then gracefully fail (i.e.,
2154 * propagate the error to the caller based on info in the front) when
2155 * the middle is too large.
2156 */
2157static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2158{
2159 int type = le16_to_cpu(msg->hdr.type);
2160 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2161
2162 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2163 ceph_msg_type_name(type), middle_len);
2164 BUG_ON(!middle_len);
2165 BUG_ON(msg->middle);
2166
2167 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2168 if (!msg->middle)
2169 return -ENOMEM;
2170 return 0;
2171}
2172
2173/*
2174 * Generic message allocator, for incoming messages.
2175 */
2176static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2177 struct ceph_msg_header *hdr,
2178 int *skip)
2179{
2180 int type = le16_to_cpu(hdr->type);
2181 int front_len = le32_to_cpu(hdr->front_len);
2182 int middle_len = le32_to_cpu(hdr->middle_len);
2183 struct ceph_msg *msg = NULL;
2184 int ret;
2185
2186 if (con->ops->alloc_msg) {
2187 mutex_unlock(&con->mutex);
2188 msg = con->ops->alloc_msg(con, hdr, skip);
2189 mutex_lock(&con->mutex);
2190 if (!msg || *skip)
2191 return NULL;
2192 }
2193 if (!msg) {
2194 *skip = 0;
2195 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2196 if (!msg) {
2197 pr_err("unable to allocate msg type %d len %d\n",
2198 type, front_len);
2199 return NULL;
2200 }
2201 }
2202 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2203
2204 if (middle_len && !msg->middle) {
2205 ret = ceph_alloc_middle(con, msg);
2206 if (ret < 0) {
2207 ceph_msg_put(msg);
2208 return NULL;
2209 }
2210 }
2211
2212 return msg;
2213}
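
/*
 * Illustrative note (not from the original file): an alloc_msg
 * callback may return NULL and set *skip to tell the messenger to
 * read and discard the incoming message instead of delivering it;
 * get_generic_reply() in mon_client.c does exactly that for replies
 * whose tid is no longer registered.
 */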
2214
2215
2216/*
2217 * Free a generically kmalloc'd message.
2218 */
2219void ceph_msg_kfree(struct ceph_msg *m)
2220{
2221 dout("msg_kfree %p\n", m);
2222 if (m->front_is_vmalloc)
2223 vfree(m->front.iov_base);
2224 else
2225 kfree(m->front.iov_base);
2226 kfree(m);
2227}
2228
2229/*
2230 * Drop a msg ref. Destroy as needed.
2231 */
2232void ceph_msg_last_put(struct kref *kref)
2233{
2234 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2235
2236 dout("ceph_msg_put last one on %p\n", m);
2237 WARN_ON(!list_empty(&m->list_head));
2238
2239 /* drop middle, data, if any */
2240 if (m->middle) {
2241 ceph_buffer_put(m->middle);
2242 m->middle = NULL;
2243 }
2244 m->nr_pages = 0;
2245 m->pages = NULL;
2246
2247 if (m->pagelist) {
2248 ceph_pagelist_release(m->pagelist);
2249 kfree(m->pagelist);
2250 m->pagelist = NULL;
2251 }
2252
2253 if (m->pool)
2254 ceph_msgpool_put(m->pool, m);
2255 else
2256 ceph_msg_kfree(m);
2257}
2258
2259void ceph_msg_dump(struct ceph_msg *msg)
2260{
2261 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2262 msg->front_max, msg->nr_pages);
2263 print_hex_dump(KERN_DEBUG, "header: ",
2264 DUMP_PREFIX_OFFSET, 16, 1,
2265 &msg->hdr, sizeof(msg->hdr), true);
2266 print_hex_dump(KERN_DEBUG, " front: ",
2267 DUMP_PREFIX_OFFSET, 16, 1,
2268 msg->front.iov_base, msg->front.iov_len, true);
2269 if (msg->middle)
2270 print_hex_dump(KERN_DEBUG, "middle: ",
2271 DUMP_PREFIX_OFFSET, 16, 1,
2272 msg->middle->vec.iov_base,
2273 msg->middle->vec.iov_len, true);
2274 print_hex_dump(KERN_DEBUG, "footer: ",
2275 DUMP_PREFIX_OFFSET, 16, 1,
2276 &msg->footer, sizeof(msg->footer), true);
2277}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
deleted file mode 100644
index 76fbc957bc13..000000000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,253 +0,0 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52/* use format string %s%lld */
53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
54
55struct ceph_messenger {
56 struct ceph_entity_inst inst; /* my name+address */
57 struct ceph_entity_addr my_enc_addr;
58 struct page *zero_page; /* used in certain error cases */
59
60 bool nocrc;
61
62 /*
63 * the global_seq counts connections I (attempt to) initiate
64 * in order to disambiguate certain connect race conditions.
65 */
66 u32 global_seq;
67 spinlock_t global_seq_lock;
68};
69
70/*
71 * a single message. it contains a header (src, dest, message type, etc.),
72 * footer (crc values, mainly), a "front" message body, and possibly a
73 * data payload (stored in some number of pages).
74 */
75struct ceph_msg {
76 struct ceph_msg_header hdr; /* header */
77 struct ceph_msg_footer footer; /* footer */
78 struct kvec front; /* unaligned blobs of message */
79 struct ceph_buffer *middle;
80 struct page **pages; /* data payload. NOT OWNER. */
81 unsigned nr_pages; /* size of page array */
82 struct ceph_pagelist *pagelist; /* instead of pages */
83 struct list_head list_head;
84 struct kref kref;
85 bool front_is_vmalloc;
86 bool more_to_follow;
87 bool needs_out_seq;
88 int front_max;
89
90 struct ceph_msgpool *pool;
91};
92
93struct ceph_msg_pos {
94 int page, page_pos; /* which page; offset in page */
95 int data_pos; /* offset in data payload */
96 int did_page_crc; /* true if we've calculated crc for current page */
97};
98
99/* ceph connection fault delay defaults, for exponential backoff */
100#define BASE_DELAY_INTERVAL (HZ/2)
101#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
102
103/*
104 * ceph_connection state bit flags
105 *
106 * QUEUED and BUSY are used together to ensure that only a single
107 * thread is currently opening, reading or writing data to the socket.
108 */
109#define LOSSYTX 0 /* we can close channel or drop messages on errors */
110#define CONNECTING 1
111#define NEGOTIATING 2
112#define KEEPALIVE_PENDING 3
113#define WRITE_PENDING 4 /* we have data ready to send */
114#define QUEUED 5 /* there is work queued on this connection */
115#define BUSY 6 /* work is being done */
116#define STANDBY 8 /* no outgoing messages, socket closed. we keep
117 * the ceph_connection around to maintain shared
118 * state with the peer. */
119#define CLOSED 10 /* we've closed the connection */
120#define SOCK_CLOSED 11 /* socket state changed to closed */
121#define OPENING 13 /* open connection w/ (possibly new) peer */
122#define DEAD 14 /* dead, about to kfree */
123
124/*
125 * A single connection with another host.
126 *
127 * We maintain a queue of outgoing messages, and some session state to
128 * ensure that we can preserve the lossless, ordered delivery of
129 * messages in the case of a TCP disconnect.
130 */
131struct ceph_connection {
132 void *private;
133 atomic_t nref;
134
135 const struct ceph_connection_operations *ops;
136
137 struct ceph_messenger *msgr;
138 struct socket *sock;
139 unsigned long state; /* connection state (see flags above) */
140 const char *error_msg; /* error message, if any */
141
142 struct ceph_entity_addr peer_addr; /* peer address */
143 struct ceph_entity_name peer_name; /* peer name */
144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 bool out_keepalive_pending;
161
162 u64 in_seq, in_seq_acked; /* last message received, acked */
163
164 /* connection negotiation temps */
165 char in_banner[CEPH_BANNER_MAX_LEN];
166 union {
167 struct { /* outgoing connection */
168 struct ceph_msg_connect out_connect;
169 struct ceph_msg_connect_reply in_reply;
170 };
171 struct { /* incoming */
172 struct ceph_msg_connect in_connect;
173 struct ceph_msg_connect_reply out_reply;
174 };
175 };
176 struct ceph_entity_addr actual_peer_addr;
177
178 /* message out temps */
179 struct ceph_msg *out_msg; /* sending message (== tail of
180 out_sent) */
181 bool out_msg_done;
182 struct ceph_msg_pos out_msg_pos;
183
184 struct kvec out_kvec[8], /* sending header/footer data */
185 *out_kvec_cur;
186 int out_kvec_left; /* kvec's left in out_kvec */
187 int out_skip; /* skip this many bytes */
188 int out_kvec_bytes; /* total bytes left */
189 bool out_kvec_is_msg; /* kvec refers to out_msg */
190 int out_more; /* there is more data after the kvecs */
191 __le64 out_temp_ack; /* for writing an ack */
192
193 /* message in temps */
194 struct ceph_msg_header in_hdr;
195 struct ceph_msg *in_msg;
196 struct ceph_msg_pos in_msg_pos;
197 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
198
199 char in_tag; /* protocol control byte */
200 int in_base_pos; /* bytes read */
201 __le64 in_temp_ack; /* for reading an ack */
202
203 struct delayed_work work; /* send|recv work */
204 unsigned long delay; /* current delay interval */
205};
206
207
208extern const char *pr_addr(const struct sockaddr_storage *ss);
209extern int ceph_parse_ips(const char *c, const char *end,
210 struct ceph_entity_addr *addr,
211 int max_count, int *count);
212
213
214extern int ceph_msgr_init(void);
215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
237extern void ceph_msg_kfree(struct ceph_msg *m);
238
239
240static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
241{
242 kref_get(&msg->kref);
243 return msg;
244}
245extern void ceph_msg_last_put(struct kref *kref);
246static inline void ceph_msg_put(struct ceph_msg *msg)
247{
248 kref_put(&msg->kref, ceph_msg_last_put);
249}
250
251extern void ceph_msg_dump(struct ceph_msg *msg);
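
/*
 * Illustrative sketch (not part of the original header): a minimal
 * ceph_connection_operations table needs only get/put for refcounting
 * plus a dispatch handler that consumes the message reference;
 * compare mon_con_ops in mon_client.c for the real in-tree instance
 * of this pattern.
 */
static inline void example_dispatch(struct ceph_connection *con,
				    struct ceph_msg *m)
{
	ceph_msg_put(m);	/* done with the message */
}

static const struct ceph_connection_operations example_con_ops = {
	.get		= ceph_con_get,
	.put		= ceph_con_put,
	.dispatch	= example_dispatch,
};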
252
253#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
deleted file mode 100644
index b2a5a3e4a671..000000000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,1018 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
108 ceph_msg_get(monc->m_auth); /* keep our ref */
109 ceph_con_send(monc->con, monc->m_auth);
110}
111
112/*
113 * Close monitor session, if any.
114 */
115static void __close_session(struct ceph_mon_client *monc)
116{
117 if (monc->con) {
118 dout("__close_session closing mon%d\n", monc->cur_mon);
119 ceph_con_revoke(monc->con, monc->m_auth);
120 ceph_con_close(monc->con);
121 monc->cur_mon = -1;
122 monc->pending_auth = 0;
123 ceph_auth_reset(monc->auth);
124 }
125}
126
127/*
128 * Open a session with a (new) monitor.
129 */
130static int __open_session(struct ceph_mon_client *monc)
131{
132 u8 r;
133 int ret;
134
135 if (monc->cur_mon < 0) {
136 get_random_bytes(&r, 1);
137 monc->cur_mon = r % monc->monmap->num_mon;
138 dout("open_session num=%d r=%d -> mon%d\n",
139 monc->monmap->num_mon, r, monc->cur_mon);
140 monc->sub_sent = 0;
141 monc->sub_renew_after = jiffies; /* i.e., expired */
142 monc->want_next_osdmap = !!monc->want_next_osdmap;
143
144 dout("open_session mon%d opening\n", monc->cur_mon);
145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
147 ceph_con_open(monc->con,
148 &monc->monmap->mon_inst[monc->cur_mon].addr);
149
150 /* initiate authentication handshake */
151 ret = ceph_auth_build_hello(monc->auth,
152 monc->m_auth->front.iov_base,
153 monc->m_auth->front_max);
154 __send_prepared_auth_request(monc, ret);
155 } else {
156 dout("open_session mon%d already open\n", monc->cur_mon);
157 }
158 return 0;
159}
160
161static bool __sub_expired(struct ceph_mon_client *monc)
162{
163 return time_after_eq(jiffies, monc->sub_renew_after);
164}
165
166/*
167 * Reschedule delayed work timer.
168 */
169static void __schedule_delayed(struct ceph_mon_client *monc)
170{
171 unsigned delay;
172
173 if (monc->cur_mon < 0 || __sub_expired(monc))
174 delay = 10 * HZ;
175 else
176 delay = 20 * HZ;
177 dout("__schedule_delayed after %u\n", delay);
178 schedule_delayed_work(&monc->delayed_work, delay);
179}
180
181/*
182 * Send subscribe request for mdsmap and/or osdmap.
183 */
184static void __send_subscribe(struct ceph_mon_client *monc)
185{
186 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
187 (unsigned)monc->sub_sent, __sub_expired(monc),
188 monc->want_next_osdmap);
189 if ((__sub_expired(monc) && !monc->sub_sent) ||
190 monc->want_next_osdmap == 1) {
191 struct ceph_msg *msg = monc->m_subscribe;
192 struct ceph_mon_subscribe_item *i;
193 void *p, *end;
194
195 p = msg->front.iov_base;
196 end = p + msg->front_max;
197
198 dout("__send_subscribe to 'mdsmap' %u+\n",
199 (unsigned)monc->have_mdsmap);
200 if (monc->want_next_osdmap) {
201 dout("__send_subscribe to 'osdmap' %u\n",
202 (unsigned)monc->have_osdmap);
203 ceph_encode_32(&p, 3);
204 ceph_encode_string(&p, end, "osdmap", 6);
205 i = p;
206 i->have = cpu_to_le64(monc->have_osdmap);
207 i->onetime = 1;
208 p += sizeof(*i);
209 monc->want_next_osdmap = 2; /* requested */
210 } else {
211 ceph_encode_32(&p, 2);
212 }
213 ceph_encode_string(&p, end, "mdsmap", 6);
214 i = p;
215 i->have = cpu_to_le64(monc->have_mdsmap);
216 i->onetime = 0;
217 p += sizeof(*i);
218 ceph_encode_string(&p, end, "monmap", 6);
219 i = p;
220 i->have = 0;
221 i->onetime = 0;
222 p += sizeof(*i);
223
224 msg->front.iov_len = p - msg->front.iov_base;
225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
228
229 monc->sub_sent = jiffies | 1; /* never 0 */
230 }
231}
232
233static void handle_subscribe_ack(struct ceph_mon_client *monc,
234 struct ceph_msg *msg)
235{
236 unsigned seconds;
237 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
238
239 if (msg->front.iov_len < sizeof(*h))
240 goto bad;
241 seconds = le32_to_cpu(h->duration);
242
243 mutex_lock(&monc->mutex);
244 if (monc->hunting) {
245 pr_info("mon%d %s session established\n",
246 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
247 monc->hunting = false;
248 }
249 dout("handle_subscribe_ack after %d seconds\n", seconds);
250 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
251 monc->sub_sent = 0;
252 mutex_unlock(&monc->mutex);
253 return;
254bad:
255 pr_err("got corrupt subscribe-ack msg\n");
256 ceph_msg_dump(msg);
257}
258
259/*
260 * Keep track of which maps we have
261 */
262int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
263{
264 mutex_lock(&monc->mutex);
265 monc->have_mdsmap = got;
266 mutex_unlock(&monc->mutex);
267 return 0;
268}
269
270int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
271{
272 mutex_lock(&monc->mutex);
273 monc->have_osdmap = got;
274 monc->want_next_osdmap = 0;
275 mutex_unlock(&monc->mutex);
276 return 0;
277}
278
279/*
280 * Register interest in the next osdmap
281 */
282void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
283{
284 dout("request_next_osdmap have %u\n", monc->have_osdmap);
285 mutex_lock(&monc->mutex);
286 if (!monc->want_next_osdmap)
287 monc->want_next_osdmap = 1;
288 if (monc->want_next_osdmap < 2)
289 __send_subscribe(monc);
290 mutex_unlock(&monc->mutex);
291}
292
293/*
294 * Open a session with a monitor, creating the connection if needed.
295 */
296int ceph_monc_open_session(struct ceph_mon_client *monc)
297{
298 if (!monc->con) {
299 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
300 if (!monc->con)
301 return -ENOMEM;
302 ceph_con_init(monc->client->msgr, monc->con);
303 monc->con->private = monc;
304 monc->con->ops = &mon_con_ops;
305 }
306
307 mutex_lock(&monc->mutex);
308 __open_session(monc);
309 __schedule_delayed(monc);
310 mutex_unlock(&monc->mutex);
311 return 0;
312}
313
314/*
315 * The monitor responds with a mount ack to indicate mount success. The
316 * included client ticket allows the client to talk to MDSs and OSDs.
317 */
318static void ceph_monc_handle_map(struct ceph_mon_client *monc,
319 struct ceph_msg *msg)
320{
321 struct ceph_client *client = monc->client;
322 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
323 void *p, *end;
324
325 mutex_lock(&monc->mutex);
326
327 dout("handle_monmap\n");
328 p = msg->front.iov_base;
329 end = p + msg->front.iov_len;
330
331 monmap = ceph_monmap_decode(p, end);
332 if (IS_ERR(monmap)) {
333 pr_err("problem decoding monmap, %d\n",
334 (int)PTR_ERR(monmap));
335 goto out;
336 }
337
338 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
339 kfree(monmap);
340 goto out;
341 }
342
343 client->monc.monmap = monmap;
344 kfree(old);
345
346out:
347 mutex_unlock(&monc->mutex);
348 wake_up_all(&client->auth_wq);
349}
350
351/*
352 * generic requests (e.g., statfs, poolop)
353 */
354static struct ceph_mon_generic_request *__lookup_generic_req(
355 struct ceph_mon_client *monc, u64 tid)
356{
357 struct ceph_mon_generic_request *req;
358 struct rb_node *n = monc->generic_request_tree.rb_node;
359
360 while (n) {
361 req = rb_entry(n, struct ceph_mon_generic_request, node);
362 if (tid < req->tid)
363 n = n->rb_left;
364 else if (tid > req->tid)
365 n = n->rb_right;
366 else
367 return req;
368 }
369 return NULL;
370}
371
372static void __insert_generic_request(struct ceph_mon_client *monc,
373 struct ceph_mon_generic_request *new)
374{
375 struct rb_node **p = &monc->generic_request_tree.rb_node;
376 struct rb_node *parent = NULL;
377 struct ceph_mon_generic_request *req = NULL;
378
379 while (*p) {
380 parent = *p;
381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
382 if (new->tid < req->tid)
383 p = &(*p)->rb_left;
384 else if (new->tid > req->tid)
385 p = &(*p)->rb_right;
386 else
387 BUG();
388 }
389
390 rb_link_node(&new->node, parent, p);
391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403
404 kfree(req);
405}
406
407static void put_generic_request(struct ceph_mon_generic_request *req)
408{
409 kref_put(&req->kref, release_generic_request);
410}
411
412static void get_generic_request(struct ceph_mon_generic_request *req)
413{
414 kref_get(&req->kref);
415}
416
417static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
418 struct ceph_msg_header *hdr,
419 int *skip)
420{
421 struct ceph_mon_client *monc = con->private;
422 struct ceph_mon_generic_request *req;
423 u64 tid = le64_to_cpu(hdr->tid);
424 struct ceph_msg *m;
425
426 mutex_lock(&monc->mutex);
427 req = __lookup_generic_req(monc, tid);
428 if (!req) {
429 dout("get_generic_reply %lld dne\n", tid);
430 *skip = 1;
431 m = NULL;
432 } else {
433 dout("get_generic_reply %lld got %p\n", tid, req->reply);
434 m = ceph_msg_get(req->reply);
435 /*
436 * we don't need to track the connection reading into
437 * this reply because we only have one open connection
438 * at a time, ever.
439 */
440 }
441 mutex_unlock(&monc->mutex);
442 return m;
443}
444
445static int do_generic_request(struct ceph_mon_client *monc,
446 struct ceph_mon_generic_request *req)
447{
448 int err;
449
450 /* register request */
451 mutex_lock(&monc->mutex);
452 req->tid = ++monc->last_tid;
453 req->request->hdr.tid = cpu_to_le64(req->tid);
454 __insert_generic_request(monc, req);
455 monc->num_generic_requests++;
456 ceph_con_send(monc->con, ceph_msg_get(req->request));
457 mutex_unlock(&monc->mutex);
458
459 err = wait_for_completion_interruptible(&req->completion);
460
461 mutex_lock(&monc->mutex);
462 rb_erase(&req->node, &monc->generic_request_tree);
463 monc->num_generic_requests--;
464 mutex_unlock(&monc->mutex);
465
466 if (!err)
467 err = req->result;
468 return err;
469}
470
471/*
472 * statfs
473 */
474static void handle_statfs_reply(struct ceph_mon_client *monc,
475 struct ceph_msg *msg)
476{
477 struct ceph_mon_generic_request *req;
478 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
479 u64 tid = le64_to_cpu(msg->hdr.tid);
480
481 if (msg->front.iov_len != sizeof(*reply))
482 goto bad;
483 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
484
485 mutex_lock(&monc->mutex);
486 req = __lookup_generic_req(monc, tid);
487 if (req) {
488 *(struct ceph_statfs *)req->buf = reply->st;
489 req->result = 0;
490 get_generic_request(req);
491 }
492 mutex_unlock(&monc->mutex);
493 if (req) {
494 complete_all(&req->completion);
495 put_generic_request(req);
496 }
497 return;
498
499bad:
500 pr_err("corrupt generic reply, tid %llu\n", tid);
501 ceph_msg_dump(msg);
502}
503
504/*
505 * Do a synchronous statfs().
506 */
507int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
508{
509 struct ceph_mon_generic_request *req;
510 struct ceph_mon_statfs *h;
511 int err;
512
513 req = kzalloc(sizeof(*req), GFP_NOFS);
514 if (!req)
515 return -ENOMEM;
516
517 kref_init(&req->kref);
518 req->buf = buf;
519 req->buf_len = sizeof(*buf);
520 init_completion(&req->completion);
521
522 err = -ENOMEM;
523 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
524 if (!req->request)
525 goto out;
526 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
527 if (!req->reply)
528 goto out;
529
530 /* fill out request */
531 h = req->request->front.iov_base;
532 h->monhdr.have_version = 0;
533 h->monhdr.session_mon = cpu_to_le16(-1);
534 h->monhdr.session_mon_tid = 0;
535 h->fsid = monc->monmap->fsid;
536
537 err = do_generic_request(monc, req);
538
539out:
540 kref_put(&req->kref, release_generic_request);
541 return err;
542}
543
544/*
545 * pool ops
546 */
547static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len)
549{
550 u32 buf_len;
551
552 if (src_len != sizeof(u32) + dst_len)
553 return -EINVAL;
554
555 buf_len = le32_to_cpu(*(u32 *)src);
556 if (buf_len != dst_len)
557 return -EINVAL;
558
559 memcpy(dst, src + sizeof(u32), dst_len);
560 return 0;
561}
562
563static void handle_poolop_reply(struct ceph_mon_client *monc,
564 struct ceph_msg *msg)
565{
566 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid);
569
570 if (msg->front.iov_len < sizeof(*reply))
571 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
573
574 mutex_lock(&monc->mutex);
575 req = __lookup_generic_req(monc, tid);
576 if (req) {
577 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex);
582 goto bad;
583 }
584 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req);
586 }
587 mutex_unlock(&monc->mutex);
588 if (req) {
589 complete(&req->completion);
590 put_generic_request(req);
591 }
592 return;
593
594bad:
595 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg);
597}
598
599/*
600 * Do a synchronous pool op.
601 */
602int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid,
604 char *buf, int len)
605{
606 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h;
608 int err;
609
610 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req)
612 return -ENOMEM;
613
614 kref_init(&req->kref);
615 req->buf = buf;
616 req->buf_len = len;
617 init_completion(&req->completion);
618
619 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
621 if (!req->request)
622 goto out;
623 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
624 if (!req->reply)
625 goto out;
626
627 /* fill out request */
628 req->request->hdr.version = cpu_to_le16(2);
629 h = req->request->front.iov_base;
630 h->monhdr.have_version = 0;
631 h->monhdr.session_mon = cpu_to_le16(-1);
632 h->monhdr.session_mon_tid = 0;
633 h->fsid = monc->monmap->fsid;
634 h->pool = cpu_to_le32(pool);
635 h->op = cpu_to_le32(op);
636 h->auid = 0;
637 h->snapid = cpu_to_le64(snapid);
638 h->name_len = 0;
639
640 err = do_generic_request(monc, req);
641
642out:
643 kref_put(&req->kref, release_generic_request);
644 return err;
645}
646
647int ceph_monc_create_snapid(struct ceph_mon_client *monc,
648 u32 pool, u64 *snapid)
649{
650 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
651 pool, 0, (char *)snapid, sizeof(*snapid));
652
653}
654
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid)
657{
658 return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
659 pool, snapid, 0, 0);
660
661}
662
663/*
664 * Resend pending generic requests.
665 */
666static void __resend_generic_request(struct ceph_mon_client *monc)
667{
668 struct ceph_mon_generic_request *req;
669 struct rb_node *p;
670
671 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
672 req = rb_entry(p, struct ceph_mon_generic_request, node);
673 ceph_con_revoke(monc->con, req->request);
674 ceph_con_send(monc->con, ceph_msg_get(req->request));
675 }
676}
677
678/*
679 * Delayed work. If we haven't mounted yet, retry. Otherwise,
680 * renew/retry subscription as needed (in case it is timing out, or we
681 * got an ENOMEM). And keep the monitor connection alive.
682 */
683static void delayed_work(struct work_struct *work)
684{
685 struct ceph_mon_client *monc =
686 container_of(work, struct ceph_mon_client, delayed_work.work);
687
688 dout("monc delayed_work\n");
689 mutex_lock(&monc->mutex);
690 if (monc->hunting) {
691 __close_session(monc);
692 __open_session(monc); /* continue hunting */
693 } else {
694 ceph_con_keepalive(monc->con);
695
696 __validate_auth(monc);
697
698 if (monc->auth->ops->is_authenticated(monc->auth))
699 __send_subscribe(monc);
700 }
701 __schedule_delayed(monc);
702 mutex_unlock(&monc->mutex);
703}
704
705/*
706 * On startup, we build a temporary monmap populated with the IPs
707 * provided by mount(2).
708 */
709static int build_initial_monmap(struct ceph_mon_client *monc)
710{
711 struct ceph_mount_args *args = monc->client->mount_args;
712 struct ceph_entity_addr *mon_addr = args->mon_addr;
713 int num_mon = args->num_mon;
714 int i;
715
716 /* build initial monmap */
717 monc->monmap = kzalloc(sizeof(*monc->monmap) +
718 num_mon*sizeof(monc->monmap->mon_inst[0]),
719 GFP_KERNEL);
720 if (!monc->monmap)
721 return -ENOMEM;
722 for (i = 0; i < num_mon; i++) {
723 monc->monmap->mon_inst[i].addr = mon_addr[i];
724 monc->monmap->mon_inst[i].addr.nonce = 0;
725 monc->monmap->mon_inst[i].name.type =
726 CEPH_ENTITY_TYPE_MON;
727 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
728 }
729 monc->monmap->num_mon = num_mon;
730 monc->have_fsid = false;
731
732 /* release addr memory */
733 kfree(args->mon_addr);
734 args->mon_addr = NULL;
735 args->num_mon = 0;
736 return 0;
737}
738
739int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
740{
741 int err = 0;
742
743 dout("init\n");
744 memset(monc, 0, sizeof(*monc));
745 monc->client = cl;
746 monc->monmap = NULL;
747 mutex_init(&monc->mutex);
748
749 err = build_initial_monmap(monc);
750 if (err)
751 goto out;
752
753 monc->con = NULL;
754
755 /* authentication */
756 monc->auth = ceph_auth_init(cl->mount_args->name,
757 cl->mount_args->secret);
758 if (IS_ERR(monc->auth)) {
759 err = PTR_ERR(monc->auth); goto out_monmap; }
760 monc->auth->want_keys =
761 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
762 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
763
764 /* msgs */
765 err = -ENOMEM;
766 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
767 sizeof(struct ceph_mon_subscribe_ack),
768 GFP_NOFS);
769 if (!monc->m_subscribe_ack)
770 goto out_monmap;
771
772 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
773 if (!monc->m_subscribe)
774 goto out_subscribe_ack;
775
776 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
777 if (!monc->m_auth_reply)
778 goto out_subscribe;
779
780 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
781 monc->pending_auth = 0;
782 if (!monc->m_auth)
783 goto out_auth_reply;
784
785 monc->cur_mon = -1;
786 monc->hunting = true;
787 monc->sub_renew_after = jiffies;
788 monc->sub_sent = 0;
789
790 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
791 monc->generic_request_tree = RB_ROOT;
792 monc->num_generic_requests = 0;
793 monc->last_tid = 0;
794
795 monc->have_mdsmap = 0;
796 monc->have_osdmap = 0;
797 monc->want_next_osdmap = 1;
798 return 0;
799
800out_auth_reply:
801 ceph_msg_put(monc->m_auth_reply);
802out_subscribe:
803 ceph_msg_put(monc->m_subscribe);
804out_subscribe_ack:
805 ceph_msg_put(monc->m_subscribe_ack);
806out_monmap:
807 kfree(monc->monmap);
808out:
809 return err;
810}
811
812void ceph_monc_stop(struct ceph_mon_client *monc)
813{
814 dout("stop\n");
815 cancel_delayed_work_sync(&monc->delayed_work);
816
817 mutex_lock(&monc->mutex);
818 __close_session(monc);
819 if (monc->con) {
820 monc->con->private = NULL;
821 monc->con->ops->put(monc->con);
822 monc->con = NULL;
823 }
824 mutex_unlock(&monc->mutex);
825
826 ceph_auth_destroy(monc->auth);
827
828 ceph_msg_put(monc->m_auth);
829 ceph_msg_put(monc->m_auth_reply);
830 ceph_msg_put(monc->m_subscribe);
831 ceph_msg_put(monc->m_subscribe_ack);
832
833 kfree(monc->monmap);
834}
835
836static void handle_auth_reply(struct ceph_mon_client *monc,
837 struct ceph_msg *msg)
838{
839 int ret;
840 int was_auth = 0;
841
842 mutex_lock(&monc->mutex);
843 if (monc->auth->ops)
844 was_auth = monc->auth->ops->is_authenticated(monc->auth);
845 monc->pending_auth = 0;
846 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
847 msg->front.iov_len,
848 monc->m_auth->front.iov_base,
849 monc->m_auth->front_max);
850 if (ret < 0) {
851 monc->client->auth_err = ret;
852 wake_up_all(&monc->client->auth_wq);
853 } else if (ret > 0) {
854 __send_prepared_auth_request(monc, ret);
855 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
856 dout("authenticated, starting session\n");
857
858 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
859 monc->client->msgr->inst.name.num =
860 cpu_to_le64(monc->auth->global_id);
861
862 __send_subscribe(monc);
863 __resend_generic_request(monc);
864 }
865 mutex_unlock(&monc->mutex);
866}
867
868static int __validate_auth(struct ceph_mon_client *monc)
869{
870 int ret;
871
872 if (monc->pending_auth)
873 return 0;
874
875 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
876 monc->m_auth->front_max);
877 if (ret <= 0)
878 return ret; /* either an error, or no need to authenticate */
879 __send_prepared_auth_request(monc, ret);
880 return 0;
881}
882
883int ceph_monc_validate_auth(struct ceph_mon_client *monc)
884{
885 int ret;
886
887 mutex_lock(&monc->mutex);
888 ret = __validate_auth(monc);
889 mutex_unlock(&monc->mutex);
890 return ret;
891}
892
893/*
894 * handle incoming message
895 */
896static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
897{
898 struct ceph_mon_client *monc = con->private;
899 int type = le16_to_cpu(msg->hdr.type);
900
901 if (!monc)
902 return;
903
904 switch (type) {
905 case CEPH_MSG_AUTH_REPLY:
906 handle_auth_reply(monc, msg);
907 break;
908
909 case CEPH_MSG_MON_SUBSCRIBE_ACK:
910 handle_subscribe_ack(monc, msg);
911 break;
912
913 case CEPH_MSG_STATFS_REPLY:
914 handle_statfs_reply(monc, msg);
915 break;
916
917 case CEPH_MSG_POOLOP_REPLY:
918 handle_poolop_reply(monc, msg);
919 break;
920
921 case CEPH_MSG_MON_MAP:
922 ceph_monc_handle_map(monc, msg);
923 break;
924
925 case CEPH_MSG_MDS_MAP:
926 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
927 break;
928
929 case CEPH_MSG_OSD_MAP:
930 ceph_osdc_handle_map(&monc->client->osdc, msg);
931 break;
932
933 default:
934 pr_err("received unknown message type %d %s\n", type,
935 ceph_msg_type_name(type));
936 }
937 ceph_msg_put(msg);
938}
939
940/*
941 * Allocate memory for incoming message
942 */
943static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
944 struct ceph_msg_header *hdr,
945 int *skip)
946{
947 struct ceph_mon_client *monc = con->private;
948 int type = le16_to_cpu(hdr->type);
949 int front_len = le32_to_cpu(hdr->front_len);
950 struct ceph_msg *m = NULL;
951
952 *skip = 0;
953
954 switch (type) {
955 case CEPH_MSG_MON_SUBSCRIBE_ACK:
956 m = ceph_msg_get(monc->m_subscribe_ack);
957 break;
958 case CEPH_MSG_POOLOP_REPLY:
959 case CEPH_MSG_STATFS_REPLY:
960 return get_generic_reply(con, hdr, skip);
961 case CEPH_MSG_AUTH_REPLY:
962 m = ceph_msg_get(monc->m_auth_reply);
963 break;
964 case CEPH_MSG_MON_MAP:
965 case CEPH_MSG_MDS_MAP:
966 case CEPH_MSG_OSD_MAP:
967 m = ceph_msg_new(type, front_len, GFP_NOFS);
968 break;
969 }
970
971 if (!m) {
972 pr_info("alloc_msg unknown type %d\n", type);
973 *skip = 1;
974 }
975 return m;
976}
977
978/*
979 * If the monitor connection resets, pick a new monitor and resubmit
980 * any pending requests.
981 */
982static void mon_fault(struct ceph_connection *con)
983{
984 struct ceph_mon_client *monc = con->private;
985
986 if (!monc)
987 return;
988
989 dout("mon_fault\n");
990 mutex_lock(&monc->mutex);
991 if (!con->private)
992 goto out;
993
994 if (monc->con && !monc->hunting)
995 pr_info("mon%d %s session lost, "
996 "hunting for new mon\n", monc->cur_mon,
997 pr_addr(&monc->con->peer_addr.in_addr));
998
999 __close_session(monc);
1000 if (!monc->hunting) {
1001 /* start hunting */
1002 monc->hunting = true;
1003 __open_session(monc);
1004 } else {
1005 /* already hunting, let's wait a bit */
1006 __schedule_delayed(monc);
1007 }
1008out:
1009 mutex_unlock(&monc->mutex);
1010}
1011
1012static const struct ceph_connection_operations mon_con_ops = {
1013 .get = ceph_con_get,
1014 .put = ceph_con_put,
1015 .dispatch = dispatch,
1016 .fault = mon_fault,
1017 .alloc_msg = mon_alloc_msg,
1018};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
deleted file mode 100644
index 8e396f2c0963..000000000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,121 +0,0 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/rbtree.h>
7
8#include "messenger.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
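
/*
 * Illustrative note (not part of the original header): mon_inst[] is
 * a variable-length tail, so a monmap holding num_mon monitors is
 * allocated as, e.g.,
 *
 *	m = kmalloc(sizeof(*m) + num_mon * sizeof(m->mon_inst[0]),
 *		    GFP_NOFS);
 *
 * which is how ceph_monmap_decode() and build_initial_monmap() size
 * it in mon_client.c.
 */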
23
24struct ceph_mon_client;
25struct ceph_mon_generic_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * ceph_mon_generic_request is being used for the statfs and poolop requests
44 * which are being done a bit differently because we need to get data back
45 * to the caller
46 */
47struct ceph_mon_generic_request {
48 struct kref kref;
49 u64 tid;
50 struct rb_node node;
51 int result;
52 void *buf;
53 int buf_len;
54 struct completion completion;
55 struct ceph_msg *request; /* original request */
56 struct ceph_msg *reply; /* and reply */
57};
58
59struct ceph_mon_client {
60 struct ceph_client *client;
61 struct ceph_monmap *monmap;
62
63 struct mutex mutex;
64 struct delayed_work delayed_work;
65
66 struct ceph_auth_client *auth;
67 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
68 int pending_auth;
69
70 bool hunting;
71 int cur_mon; /* last monitor i contacted */
72 unsigned long sub_sent, sub_renew_after;
73 struct ceph_connection *con;
74 bool have_fsid;
75
76 /* pending generic requests */
77 struct rb_root generic_request_tree;
78 int num_generic_requests;
79 u64 last_tid;
80
81 /* mds/osd map */
82 int want_next_osdmap; /* 1 = want, 2 = want+asked */
83 u32 have_osdmap, have_mdsmap;
84
85#ifdef CONFIG_DEBUG_FS
86 struct dentry *debugfs_file;
87#endif
88};
89
90extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
91extern int ceph_monmap_contains(struct ceph_monmap *m,
92 struct ceph_entity_addr *addr);
93
94extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
95extern void ceph_monc_stop(struct ceph_mon_client *monc);
96
97/*
98 * The model here is to indicate that we need a new map of at least
99 * epoch @want, and also call in when we receive a map. We will
100 * periodically rerequest the map from the monitor cluster until we
101 * get what we want.
102 */
103extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
104extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
105
106extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
107
108extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
109 struct ceph_statfs *buf);
110
111extern int ceph_monc_open_session(struct ceph_mon_client *monc);
112
113extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
114
115extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
116 u32 pool, u64 *snapid);
117
118extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
119 u32 pool, u64 snapid);
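
/*
 * Usage sketch (illustrative wiring, not part of the original
 * header): the mount path combines these entry points roughly as
 *
 *	struct ceph_statfs st;
 *	int err;
 *
 *	err = ceph_monc_init(&client->monc, client);
 *	if (!err)
 *		err = ceph_monc_open_session(&client->monc);
 *	if (!err)
 *		err = ceph_monc_do_statfs(&client->monc, &st);
 *	...
 *	ceph_monc_stop(&client->monc);
 */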
120
121#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
deleted file mode 100644
index dd65a6438131..000000000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{
12 struct ceph_msgpool *pool = arg;
13 void *p;
14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
20
21static void free_fn(void *element, void *arg)
22{
23 ceph_msg_put(element);
24}
25
26int ceph_msgpool_init(struct ceph_msgpool *pool,
27 int front_len, int size, bool blocking, const char *name)
28{
29 pool->front_len = front_len;
30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
31 if (!pool->pool)
32 return -ENOMEM;
33 pool->name = name;
34 return 0;
35}
36
37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
38{
39 mempool_destroy(pool->pool);
40}
41
42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
44{
45 if (front_len > pool->front_len) {
46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
47 pool->name, front_len, pool->front_len);
48 WARN_ON(1);
49
50 /* try to alloc a fresh message */
51 return ceph_msg_new(0, front_len, GFP_NOFS);
52 }
53
54 return mempool_alloc(pool->pool, GFP_NOFS);
55}
56
57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
58{
59 /* reset msg front_len; user may have changed it */
60 msg->front.iov_len = pool->front_len;
61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
62
63 kref_init(&msg->kref); /* retake single ref */
64 mempool_free(msg, pool->pool); /* hand it back to the mempool */
65}
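
/*
 * Usage sketch (illustrative, not part of the original file): a
 * caller sizes the pool for the largest front payload it expects and
 * then cycles messages through get/put instead of allocating each
 * one.  The sizes and name below are placeholders.
 */
static int __maybe_unused example_msgpool(void)
{
	struct ceph_msgpool pool;
	struct ceph_msg *msg;
	int err;

	err = ceph_msgpool_init(&pool, 512, 4, true, "example");
	if (err)
		return err;

	msg = ceph_msgpool_get(&pool, 512);	/* mempool_alloc, GFP_NOFS */
	if (msg)
		ceph_msgpool_put(&pool, msg);	/* reset and recycle */

	ceph_msgpool_destroy(&pool);
	return 0;
}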
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
deleted file mode 100644
index a362605f9368..000000000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include <linux/mempool.h>
5#include "messenger.h"
6
7/*
8 * we use memory pools for preallocating messages we may receive, to
9 * avoid unexpected OOM conditions.
10 */
11struct ceph_msgpool {
12 const char *name;
13 mempool_t *pool;
14 int front_len; /* preallocated payload size */
15};
16
17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
18 int front_len, int size, bool blocking,
19 const char *name);
20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
22 int front_len);
23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
24
25#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
deleted file mode 100644
index 680d3d648cac..000000000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
1#ifndef CEPH_MSGR_H
2#define CEPH_MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version, and adjust it
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns a negative, zero, or positive value (like memcmp).
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
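
/*
 * Worked example (illustrative, not part of the original header):
 * because the operands are reinterpreted as signed 32-bit values, a
 * sequence number that has just wrapped past zero still compares as
 * newer:
 *
 *	ceph_seq_cmp(0x00000002, 0xfffffffe) ==  4   (a is newer)
 *	ceph_seq_cmp(0xfffffffe, 0x00000002) == -4   (a is older)
 */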
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_AUTH 0x20
54
55#define CEPH_ENTITY_TYPE_ANY 0xFF
56
57extern const char *ceph_entity_type_name(int type);
58
59/*
60 * entity_addr -- network address
61 */
62struct ceph_entity_addr {
63 __le32 type;
64 __le32 nonce; /* unique id for process (e.g. pid) */
65 struct sockaddr_storage in_addr;
66} __attribute__ ((packed));
67
68struct ceph_entity_inst {
69 struct ceph_entity_name name;
70 struct ceph_entity_addr addr;
71} __attribute__ ((packed));
72
73
74/* used by message exchange protocol */
75#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
76#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
77#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
78 incoming connection */
79#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
80 with higher cseq */
81#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
82 with higher gseq */
83#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
84#define CEPH_MSGR_TAG_MSG 7 /* message */
85#define CEPH_MSGR_TAG_ACK 8 /* message ack */
86#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
90
91
92/*
93 * connection negotiation
94 */
95struct ceph_msg_connect {
96 __le64 features; /* supported feature bits */
97 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
98 __le32 global_seq; /* count connections initiated by this host */
99 __le32 connect_seq; /* count connections initiated in this session */
100 __le32 protocol_version;
101 __le32 authorizer_protocol;
102 __le32 authorizer_len;
103 __u8 flags; /* CEPH_MSG_CONNECT_* */
104} __attribute__ ((packed));
105
106struct ceph_msg_connect_reply {
107 __u8 tag;
108 __le64 features; /* feature bits for this session */
109 __le32 global_seq;
110 __le32 connect_seq;
111 __le32 protocol_version;
112 __le32 authorizer_len;
113 __u8 flags;
114} __attribute__ ((packed));
115
116#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
117
118
119/*
120 * message header
121 */
122struct ceph_msg_header_old {
123 __le64 seq; /* message seq# for this session */
124 __le64 tid; /* transaction id */
125 __le16 type; /* message type */
126 __le16 priority; /* priority. higher value == higher priority */
127 __le16 version; /* version of message encoding */
128
129 __le32 front_len; /* bytes in main payload */
130 __le32 middle_len;/* bytes in middle payload */
131 __le32 data_len; /* bytes of data payload */
132 __le16 data_off; /* sender: include full offset;
133 receiver: mask against ~PAGE_MASK */
134
135 struct ceph_entity_inst src, orig_src;
136 __le32 reserved;
137 __le32 crc; /* header crc32c */
138} __attribute__ ((packed));
139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
158#define CEPH_MSG_PRIO_LOW 64
159#define CEPH_MSG_PRIO_DEFAULT 127
160#define CEPH_MSG_PRIO_HIGH 196
161#define CEPH_MSG_PRIO_HIGHEST 255
162
163/*
164 * follows data payload
165 */
166struct ceph_msg_footer {
167 __le32 front_crc, middle_crc, data_crc;
168 __u8 flags;
169} __attribute__ ((packed));
170
171#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
172#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
173
174
175#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
deleted file mode 100644
index 3b5571b8ce22..000000000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
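
/*
 * Illustrative note (not from the original file): with the
 * "%llx.%08llx" format above, ino 0x1234 and object number 7 produce
 * the object name "1234.00000007".
 */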
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return NULL;
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (!msg) {
169 ceph_osdc_put_request(req);
170 return NULL;
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (!msg) {
183 ceph_osdc_put_request(req);
184 return NULL;
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
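
/*
 * Illustrative sketch (editor's addition): the size arithmetic used
 * above for the request front -- a head, then 1 + do_sync ops, then up
 * to 40 bytes of oid, then one u64 snap id per snap.  The struct sizes
 * here are stand-in parameters; only the layout logic mirrors the code.
 */
#include <stddef.h>
#include <stdint.h>

static size_t osd_request_front_size(size_t head_size, size_t op_size,
				     int do_sync, uint32_t num_snaps)
{
	int num_op = 1 + (do_sync ? 1 : 0);	/* opcode (+ STARTSYNC) */
	size_t size = head_size + num_op * op_size;

	size += 40;				/* space reserved for oid */
	size += sizeof(uint64_t) * num_snaps;	/* snap context ids */
	return size;				/* the front is later trimmed
						 * to the bytes actually used */
}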
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
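
/*
 * Illustrative sketch (editor's addition): __lookup_request_ge() finds
 * the request with the smallest tid >= the given tid, or NULL.
 * ceph_osdc_sync() below relies on this to walk all pending tids in
 * order, re-taking request_mutex between waits.  The same semantics
 * over a sorted array, for clarity:
 */
#include <stddef.h>
#include <stdint.h>

static const uint64_t *lookup_ge(const uint64_t *tids, int n, uint64_t tid)
{
	int i;

	for (i = 0; i < n; i++)		/* tids[] sorted ascending */
		if (tids[i] >= tid)
			return &tids[i];
	return NULL;			/* no tid >= the requested one */
}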
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
369 kfree(osd);
370 }
371}
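
/*
 * Illustrative sketch (editor's addition): why the lookup path must use
 * atomic_inc_not_zero().  Once the final put drops o_ref to zero the
 * osd is being freed, so a concurrent get must fail rather than revive
 * it.  A userspace analogue with C11 atomics:
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool try_get(atomic_int *ref)
{
	int v = atomic_load(ref);

	while (v != 0)		/* never resurrect a dying object */
		if (atomic_compare_exchange_weak(ref, &v, v + 1))
			return true;
	return false;		/* caller must redo the lookup */
}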
372
373/*
374 * remove an osd from our map
375 */
376static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
377{
378 dout("__remove_osd %p\n", osd);
379 BUG_ON(!list_empty(&osd->o_requests));
380 rb_erase(&osd->o_node, &osdc->osds);
381 list_del_init(&osd->o_osd_lru);
382 ceph_con_close(&osd->o_con);
383 put_osd(osd);
384}
385
386static void __move_osd_to_lru(struct ceph_osd_client *osdc,
387 struct ceph_osd *osd)
388{
389 dout("__move_osd_to_lru %p\n", osd);
390 BUG_ON(!list_empty(&osd->o_osd_lru));
391 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
392 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
393}
394
395static void __remove_osd_from_lru(struct ceph_osd *osd)
396{
397 dout("__remove_osd_from_lru %p\n", osd);
398 if (!list_empty(&osd->o_osd_lru))
399 list_del_init(&osd->o_osd_lru);
400}
401
402static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
403{
404 struct ceph_osd *osd, *nosd;
405
406 dout("__remove_old_osds %p\n", osdc);
407 mutex_lock(&osdc->request_mutex);
408 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
409 if (!remove_all && time_before(jiffies, osd->lru_ttl))
410 break;
411 __remove_osd(osdc, osd);
412 }
413 mutex_unlock(&osdc->request_mutex);
414}
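
/*
 * Illustrative sketch (editor's addition): the scan above can stop at
 * the first unexpired entry because osds are appended to the lru in
 * order.  time_before() boils down to a wrap-safe signed comparison of
 * jiffies values:
 */
static int before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;	/* correct across jiffies wraparound */
}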
415
416/*
417 * reset osd connect
418 */
419static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
420{
421 struct ceph_osd_request *req;
422 int ret = 0;
423
424 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
425 if (list_empty(&osd->o_requests)) {
426 __remove_osd(osdc, osd);
427 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
428 &osd->o_con.peer_addr,
429 sizeof(osd->o_con.peer_addr)) == 0 &&
430 !ceph_con_opened(&osd->o_con)) {
431 dout(" osd addr hasn't changed and connection never opened,"
432 " letting msgr retry");
433	/* touch each r_stamp for handle_timeout()'s benefit */
434 list_for_each_entry(req, &osd->o_requests, r_osd_item)
435 req->r_stamp = jiffies;
436 ret = -EAGAIN;
437 } else {
438 ceph_con_close(&osd->o_con);
439 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
440 osd->o_incarnation++;
441 }
442 return ret;
443}
444
445static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
446{
447 struct rb_node **p = &osdc->osds.rb_node;
448 struct rb_node *parent = NULL;
449 struct ceph_osd *osd = NULL;
450
451 while (*p) {
452 parent = *p;
453 osd = rb_entry(parent, struct ceph_osd, o_node);
454 if (new->o_osd < osd->o_osd)
455 p = &(*p)->rb_left;
456 else if (new->o_osd > osd->o_osd)
457 p = &(*p)->rb_right;
458 else
459 BUG();
460 }
461
462 rb_link_node(&new->o_node, parent, p);
463 rb_insert_color(&new->o_node, &osdc->osds);
464}
465
466static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
467{
468 struct ceph_osd *osd;
469 struct rb_node *n = osdc->osds.rb_node;
470
471 while (n) {
472 osd = rb_entry(n, struct ceph_osd, o_node);
473 if (o < osd->o_osd)
474 n = n->rb_left;
475 else if (o > osd->o_osd)
476 n = n->rb_right;
477 else
478 return osd;
479 }
480 return NULL;
481}
482
483static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
484{
485 schedule_delayed_work(&osdc->timeout_work,
486 osdc->client->mount_args->osd_keepalive_timeout * HZ);
487}
488
489static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
490{
491 cancel_delayed_work(&osdc->timeout_work);
492}
493
494/*
495 * Register request, assign tid. If this is the first request, set up
496 * the timeout event.
497 */
498static void register_request(struct ceph_osd_client *osdc,
499 struct ceph_osd_request *req)
500{
501 mutex_lock(&osdc->request_mutex);
502 req->r_tid = ++osdc->last_tid;
503 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
504 INIT_LIST_HEAD(&req->r_req_lru_item);
505
506 dout("register_request %p tid %lld\n", req, req->r_tid);
507 __insert_request(osdc, req);
508 ceph_osdc_get_request(req);
509 osdc->num_requests++;
510
511 if (osdc->num_requests == 1) {
512 dout(" first request, scheduling timeout\n");
513 __schedule_osd_timeout(osdc);
514 }
515 mutex_unlock(&osdc->request_mutex);
516}
517
518/*
519 * called under osdc->request_mutex
520 */
521static void __unregister_request(struct ceph_osd_client *osdc,
522 struct ceph_osd_request *req)
523{
524 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
525 rb_erase(&req->r_node, &osdc->requests);
526 osdc->num_requests--;
527
528 if (req->r_osd) {
529 /* make sure the original request isn't in flight. */
530 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
531
532 list_del_init(&req->r_osd_item);
533 if (list_empty(&req->r_osd->o_requests))
534 __move_osd_to_lru(osdc, req->r_osd);
535 req->r_osd = NULL;
536 }
537
538 ceph_osdc_put_request(req);
539
540 list_del_init(&req->r_req_lru_item);
541 if (osdc->num_requests == 0) {
542 dout(" no requests, canceling timeout\n");
543 __cancel_osd_timeout(osdc);
544 }
545}
546
547/*
548 * Cancel a previously queued request message
549 */
550static void __cancel_request(struct ceph_osd_request *req)
551{
552 if (req->r_sent && req->r_osd) {
553 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
554 req->r_sent = 0;
555 }
556 list_del_init(&req->r_req_lru_item);
557}
558
559/*
560 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
561 * (as needed), and set the request r_osd appropriately. If there is
562 * no up osd, set r_osd to NULL.
563 *
564 * Return 0 if unchanged, 1 if changed, or negative on error.
565 *
566 * Caller should hold map_sem for read and request_mutex.
567 */
568static int __map_osds(struct ceph_osd_client *osdc,
569 struct ceph_osd_request *req)
570{
571 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
572 struct ceph_pg pgid;
573 int acting[CEPH_PG_MAX_SIZE];
574 int o = -1, num = 0;
575 int err;
576
577 dout("map_osds %p tid %lld\n", req, req->r_tid);
578 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
579 &req->r_file_layout, osdc->osdmap);
580 if (err)
581 return err;
582 pgid = reqhead->layout.ol_pgid;
583 req->r_pgid = pgid;
584
585 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
586 if (err > 0) {
587 o = acting[0];
588 num = err;
589 }
590
591 if ((req->r_osd && req->r_osd->o_osd == o &&
592 req->r_sent >= req->r_osd->o_incarnation &&
593 req->r_num_pg_osds == num &&
594 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
595 (req->r_osd == NULL && o == -1))
596 return 0; /* no change */
597
598 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
599 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
600 req->r_osd ? req->r_osd->o_osd : -1);
601
602 /* record full pg acting set */
603 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
604 req->r_num_pg_osds = num;
605
606 if (req->r_osd) {
607 __cancel_request(req);
608 list_del_init(&req->r_osd_item);
609 req->r_osd = NULL;
610 }
611
612 req->r_osd = __lookup_osd(osdc, o);
613 if (!req->r_osd && o >= 0) {
614 err = -ENOMEM;
615 req->r_osd = create_osd(osdc);
616 if (!req->r_osd)
617 goto out;
618
619 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
620 req->r_osd->o_osd = o;
621 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
622 __insert_osd(osdc, req->r_osd);
623
624 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
625 }
626
627 if (req->r_osd) {
628 __remove_osd_from_lru(req->r_osd);
629 list_add(&req->r_osd_item, &req->r_osd->o_requests);
630 }
631 err = 1; /* osd or pg changed */
632
633out:
634 return err;
635}
636
637/*
638 * caller should hold map_sem (for read) and request_mutex
639 */
640static int __send_request(struct ceph_osd_client *osdc,
641 struct ceph_osd_request *req)
642{
643 struct ceph_osd_request_head *reqhead;
644 int err;
645
646 err = __map_osds(osdc, req);
647 if (err < 0)
648 return err;
649 if (req->r_osd == NULL) {
650 dout("send_request %p no up osds in pg\n", req);
651 ceph_monc_request_next_osdmap(&osdc->client->monc);
652 return 0;
653 }
654
655 dout("send_request %p tid %llu to osd%d flags %d\n",
656 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
657
658 reqhead = req->r_request->front.iov_base;
659 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
660 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
661 reqhead->reassert_version = req->r_reassert_version;
662
663 req->r_stamp = jiffies;
664 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
665
666 ceph_msg_get(req->r_request); /* send consumes a ref */
667 ceph_con_send(&req->r_osd->o_con, req->r_request);
668 req->r_sent = req->r_osd->o_incarnation;
669 return 0;
670}
671
672/*
673 * Timeout callback, called every N seconds when one or more osd
674 * requests have been active for more than N seconds. When this
675 * happens, we ping all OSDs with requests that have timed out to
676 * ensure any communications channel reset is detected. Reset the
677 * request timeouts another N seconds in the future as we go.
678 * Reschedule the timeout event another N seconds in the future (unless
679 * there are no open requests).
680 */
681static void handle_timeout(struct work_struct *work)
682{
683 struct ceph_osd_client *osdc =
684 container_of(work, struct ceph_osd_client, timeout_work.work);
685 struct ceph_osd_request *req, *last_req = NULL;
686 struct ceph_osd *osd;
687 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
688 unsigned long keepalive =
689 osdc->client->mount_args->osd_keepalive_timeout * HZ;
690 unsigned long last_stamp = 0;
691 struct rb_node *p;
692 struct list_head slow_osds;
693
694 dout("timeout\n");
695 down_read(&osdc->map_sem);
696
697 ceph_monc_request_next_osdmap(&osdc->client->monc);
698
699 mutex_lock(&osdc->request_mutex);
700 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
701 req = rb_entry(p, struct ceph_osd_request, r_node);
702
703 if (req->r_resend) {
704 int err;
705
706 dout("osdc resending prev failed %lld\n", req->r_tid);
707 err = __send_request(osdc, req);
708 if (err)
709 dout("osdc failed again on %lld\n", req->r_tid);
710 else
711 req->r_resend = false;
712 continue;
713 }
714 }
715
716 /*
717 * reset osds that appear to be _really_ unresponsive. this
718 * is a failsafe measure; we really shouldn't be getting to
719 * this point if the system is working properly. the monitors
720 * should mark the osd as failed and we should find out about
721 * it from an updated osd map.
722 */
723 while (timeout && !list_empty(&osdc->req_lru)) {
724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
725 r_req_lru_item);
726
727 if (time_before(jiffies, req->r_stamp + timeout))
728 break;
729
730 BUG_ON(req == last_req && req->r_stamp == last_stamp);
731 last_req = req;
732 last_stamp = req->r_stamp;
733
734 osd = req->r_osd;
735 BUG_ON(!osd);
736 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
737 req->r_tid, osd->o_osd);
738 __kick_requests(osdc, osd);
739 }
740
741 /*
742 * ping osds that are a bit slow. this ensures that if there
743 * is a break in the TCP connection we will notice, and reopen
744 * a connection with that osd (from the fault callback).
745 */
746 INIT_LIST_HEAD(&slow_osds);
747 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
748 if (time_before(jiffies, req->r_stamp + keepalive))
749 break;
750
751 osd = req->r_osd;
752 BUG_ON(!osd);
753 dout(" tid %llu is slow, will send keepalive on osd%d\n",
754 req->r_tid, osd->o_osd);
755 list_move_tail(&osd->o_keepalive_item, &slow_osds);
756 }
757 while (!list_empty(&slow_osds)) {
758 osd = list_entry(slow_osds.next, struct ceph_osd,
759 o_keepalive_item);
760 list_del_init(&osd->o_keepalive_item);
761 ceph_con_keepalive(&osd->o_con);
762 }
763
764 __schedule_osd_timeout(osdc);
765 mutex_unlock(&osdc->request_mutex);
766
767 up_read(&osdc->map_sem);
768}
769
770static void handle_osds_timeout(struct work_struct *work)
771{
772 struct ceph_osd_client *osdc =
773 container_of(work, struct ceph_osd_client,
774 osds_timeout_work.work);
775 unsigned long delay =
776 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
777
778 dout("osds timeout\n");
779 down_read(&osdc->map_sem);
780 remove_old_osds(osdc, 0);
781 up_read(&osdc->map_sem);
782
783 schedule_delayed_work(&osdc->osds_timeout_work,
784 round_jiffies_relative(delay));
785}
786
787/*
788 * handle osd op reply. either call the callback if it is specified,
789 * or do the completion to wake up the waiting thread.
790 */
791static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
792 struct ceph_connection *con)
793{
794 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
795 struct ceph_osd_request *req;
796 u64 tid;
797 int numops, object_len, flags;
798 s32 result;
799
800 tid = le64_to_cpu(msg->hdr.tid);
801 if (msg->front.iov_len < sizeof(*rhead))
802 goto bad;
803 numops = le32_to_cpu(rhead->num_ops);
804 object_len = le32_to_cpu(rhead->object_len);
805 result = le32_to_cpu(rhead->result);
806 if (msg->front.iov_len != sizeof(*rhead) + object_len +
807 numops * sizeof(struct ceph_osd_op))
808 goto bad;
809 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
810
811 /* lookup */
812 mutex_lock(&osdc->request_mutex);
813 req = __lookup_request(osdc, tid);
814 if (req == NULL) {
815 dout("handle_reply tid %llu dne\n", tid);
816 mutex_unlock(&osdc->request_mutex);
817 return;
818 }
819 ceph_osdc_get_request(req);
820 flags = le32_to_cpu(rhead->flags);
821
822 /*
823 * if this connection filled our message, drop our reference now, to
824 * avoid a (safe but slower) revoke later.
825 */
826 if (req->r_con_filling_msg == con && req->r_reply == msg) {
827 dout(" dropping con_filling_msg ref %p\n", con);
828 req->r_con_filling_msg = NULL;
829 ceph_con_put(con);
830 }
831
832 if (!req->r_got_reply) {
833 unsigned bytes;
834
835 req->r_result = le32_to_cpu(rhead->result);
836 bytes = le32_to_cpu(msg->hdr.data_len);
837 dout("handle_reply result %d bytes %d\n", req->r_result,
838 bytes);
839 if (req->r_result == 0)
840 req->r_result = bytes;
841
842 /* in case this is a write and we need to replay, */
843 req->r_reassert_version = rhead->reassert_version;
844
845 req->r_got_reply = 1;
846 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
847 dout("handle_reply tid %llu dup ack\n", tid);
848 mutex_unlock(&osdc->request_mutex);
849 goto done;
850 }
851
852 dout("handle_reply tid %llu flags %d\n", tid, flags);
853
854 /* either this is a read, or we got the safe response */
855 if (result < 0 ||
856 (flags & CEPH_OSD_FLAG_ONDISK) ||
857 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
858 __unregister_request(osdc, req);
859
860 mutex_unlock(&osdc->request_mutex);
861
862 if (req->r_callback)
863 req->r_callback(req, msg);
864 else
865 complete_all(&req->r_completion);
866
867 if (flags & CEPH_OSD_FLAG_ONDISK) {
868 if (req->r_safe_callback)
869 req->r_safe_callback(req, msg);
870 complete_all(&req->r_safe_completion); /* fsync waiter */
871 }
872
873done:
874 ceph_osdc_put_request(req);
875 return;
876
877bad:
878 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
879 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
880 (int)sizeof(*rhead));
881 ceph_msg_dump(msg);
882}
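
/*
 * Illustrative sketch (editor's addition): the unregister condition
 * above, pulled out for clarity.  The flag bits are stand-ins, not the
 * real CEPH_OSD_FLAG_* values.
 */
#define SK_ONDISK 0x1			/* stand-in flag bits */
#define SK_WRITE  0x2

static int reply_completes_request(int result, int flags)
{
	return result < 0 ||		/* hard error: give up */
	       (flags & SK_ONDISK) ||	/* write committed to disk */
	       !(flags & SK_WRITE);	/* a read needs only one reply */
}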
883
884
885static int __kick_requests(struct ceph_osd_client *osdc,
886 struct ceph_osd *kickosd)
887{
888 struct ceph_osd_request *req;
889 struct rb_node *p, *n;
890 int needmap = 0;
891 int err;
892
893 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
894 if (kickosd) {
895 err = __reset_osd(osdc, kickosd);
896 if (err == -EAGAIN)
897 return 1;
898 } else {
899 for (p = rb_first(&osdc->osds); p; p = n) {
900 struct ceph_osd *osd =
901 rb_entry(p, struct ceph_osd, o_node);
902
903 n = rb_next(p);
904 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
905 memcmp(&osd->o_con.peer_addr,
906 ceph_osd_addr(osdc->osdmap,
907 osd->o_osd),
908 sizeof(struct ceph_entity_addr)) != 0)
909 __reset_osd(osdc, osd);
910 }
911 }
912
913 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
914 req = rb_entry(p, struct ceph_osd_request, r_node);
915
916 if (req->r_resend) {
917 dout(" r_resend set on tid %llu\n", req->r_tid);
918 __cancel_request(req);
919 goto kick;
920 }
921 if (req->r_osd && kickosd == req->r_osd) {
922 __cancel_request(req);
923 goto kick;
924 }
925
926 err = __map_osds(osdc, req);
927 if (err == 0)
928 continue; /* no change */
929 if (err < 0) {
930 /*
931 * FIXME: really, we should set the request
932 * error and fail if this isn't a 'nofail'
933 * request, but that's a fair bit more
934 * complicated to do. So retry!
935 */
936 dout(" setting r_resend on %llu\n", req->r_tid);
937 req->r_resend = true;
938 continue;
939 }
940 if (req->r_osd == NULL) {
941 dout("tid %llu maps to no valid osd\n", req->r_tid);
942 needmap++; /* request a newer map */
943 continue;
944 }
945
946kick:
947 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
948 req->r_osd ? req->r_osd->o_osd : -1);
949 req->r_flags |= CEPH_OSD_FLAG_RETRY;
950 err = __send_request(osdc, req);
951 if (err) {
952 dout(" setting r_resend on %llu\n", req->r_tid);
953 req->r_resend = true;
954 }
955 }
956
957 return needmap;
958}
959
960/*
961 * Resubmit osd requests whose osd or osd address has changed. Request
962 * a new osd map if osds are down, or we are otherwise unable to determine
963 * how to direct a request.
964 *
965 * Close connections to down osds.
966 *
967 * If @who is specified, resubmit requests for that specific osd.
968 *
969 * Caller should hold map_sem for read and request_mutex.
970 */
971static void kick_requests(struct ceph_osd_client *osdc,
972 struct ceph_osd *kickosd)
973{
974 int needmap;
975
976 mutex_lock(&osdc->request_mutex);
977 needmap = __kick_requests(osdc, kickosd);
978 mutex_unlock(&osdc->request_mutex);
979
980 if (needmap) {
981 dout("%d requests for down osds, need new map\n", needmap);
982 ceph_monc_request_next_osdmap(&osdc->client->monc);
983 }
984
985}
986/*
987 * Process updated osd map.
988 *
989 * The message contains any number of incremental and full maps, normally
990 * indicating some sort of topology change in the cluster. Kick requests
991 * off to different OSDs as needed.
992 */
993void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
994{
995 void *p, *end, *next;
996 u32 nr_maps, maplen;
997 u32 epoch;
998 struct ceph_osdmap *newmap = NULL, *oldmap;
999 int err;
1000 struct ceph_fsid fsid;
1001
1002 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1003 p = msg->front.iov_base;
1004 end = p + msg->front.iov_len;
1005
1006 /* verify fsid */
1007 ceph_decode_need(&p, end, sizeof(fsid), bad);
1008 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1009 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1010 return;
1011
1012 down_write(&osdc->map_sem);
1013
1014 /* incremental maps */
1015 ceph_decode_32_safe(&p, end, nr_maps, bad);
1016 dout(" %d inc maps\n", nr_maps);
1017 while (nr_maps > 0) {
1018 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1019 epoch = ceph_decode_32(&p);
1020 maplen = ceph_decode_32(&p);
1021 ceph_decode_need(&p, end, maplen, bad);
1022 next = p + maplen;
1023 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1024 dout("applying incremental map %u len %d\n",
1025 epoch, maplen);
1026 newmap = osdmap_apply_incremental(&p, next,
1027 osdc->osdmap,
1028 osdc->client->msgr);
1029 if (IS_ERR(newmap)) {
1030 err = PTR_ERR(newmap);
1031 goto bad;
1032 }
1033 BUG_ON(!newmap);
1034 if (newmap != osdc->osdmap) {
1035 ceph_osdmap_destroy(osdc->osdmap);
1036 osdc->osdmap = newmap;
1037 }
1038 } else {
1039 dout("ignoring incremental map %u len %d\n",
1040 epoch, maplen);
1041 }
1042 p = next;
1043 nr_maps--;
1044 }
1045 if (newmap)
1046 goto done;
1047
1048 /* full maps */
1049 ceph_decode_32_safe(&p, end, nr_maps, bad);
1050 dout(" %d full maps\n", nr_maps);
1051 while (nr_maps) {
1052 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1053 epoch = ceph_decode_32(&p);
1054 maplen = ceph_decode_32(&p);
1055 ceph_decode_need(&p, end, maplen, bad);
1056 if (nr_maps > 1) {
1057 dout("skipping non-latest full map %u len %d\n",
1058 epoch, maplen);
1059 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1060 dout("skipping full map %u len %d, "
1061 "older than our %u\n", epoch, maplen,
1062 osdc->osdmap->epoch);
1063 } else {
1064 dout("taking full map %u len %d\n", epoch, maplen);
1065 newmap = osdmap_decode(&p, p+maplen);
1066 if (IS_ERR(newmap)) {
1067 err = PTR_ERR(newmap);
1068 goto bad;
1069 }
1070 BUG_ON(!newmap);
1071 oldmap = osdc->osdmap;
1072 osdc->osdmap = newmap;
1073 if (oldmap)
1074 ceph_osdmap_destroy(oldmap);
1075 }
1076 p += maplen;
1077 nr_maps--;
1078 }
1079
1080done:
1081 downgrade_write(&osdc->map_sem);
1082 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1083 if (newmap)
1084 kick_requests(osdc, NULL);
1085 up_read(&osdc->map_sem);
1086 wake_up_all(&osdc->client->auth_wq);
1087 return;
1088
1089bad:
1090 pr_err("osdc handle_map corrupt msg\n");
1091 ceph_msg_dump(msg);
1092 up_write(&osdc->map_sem);
1093 return;
1094}
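
/*
 * Illustrative sketch (editor's addition): the epoch checks applied in
 * the two loops above, separated from the decoding.  An incremental map
 * must advance our epoch by exactly one; a full map is taken only if it
 * is strictly newer than what we already have.
 */
#include <stdint.h>

static int incremental_usable(uint32_t have, uint32_t epoch)
{
	return have + 1 == epoch;
}

static int full_map_usable(uint32_t have, uint32_t epoch)
{
	return epoch > have;		/* the code above skips have >= epoch */
}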
1095
1096/*
1097 * Register request, send initial attempt.
1098 */
1099int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1100 struct ceph_osd_request *req,
1101 bool nofail)
1102{
1103 int rc = 0;
1104
1105 req->r_request->pages = req->r_pages;
1106 req->r_request->nr_pages = req->r_num_pages;
1107
1108 register_request(osdc, req);
1109
1110 down_read(&osdc->map_sem);
1111 mutex_lock(&osdc->request_mutex);
1112 /*
1113 * a racing kick_requests() may have sent the message for us
1114 * while we dropped request_mutex above, so only send now if
1115	 * the request still hasn't been touched yet.
1116 */
1117 if (req->r_sent == 0) {
1118 rc = __send_request(osdc, req);
1119 if (rc) {
1120 if (nofail) {
1121 dout("osdc_start_request failed send, "
1122 " marking %lld\n", req->r_tid);
1123 req->r_resend = true;
1124 rc = 0;
1125 } else {
1126 __unregister_request(osdc, req);
1127 }
1128 }
1129 }
1130 mutex_unlock(&osdc->request_mutex);
1131 up_read(&osdc->map_sem);
1132 return rc;
1133}
1134
1135/*
1136 * wait for a request to complete
1137 */
1138int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1139 struct ceph_osd_request *req)
1140{
1141 int rc;
1142
1143 rc = wait_for_completion_interruptible(&req->r_completion);
1144 if (rc < 0) {
1145 mutex_lock(&osdc->request_mutex);
1146 __cancel_request(req);
1147 __unregister_request(osdc, req);
1148 mutex_unlock(&osdc->request_mutex);
1149 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1150 return rc;
1151 }
1152
1153 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1154 return req->r_result;
1155}
1156
1157/*
1158 * sync - wait for all in-flight requests to flush. avoid starvation.
1159 */
1160void ceph_osdc_sync(struct ceph_osd_client *osdc)
1161{
1162 struct ceph_osd_request *req;
1163 u64 last_tid, next_tid = 0;
1164
1165 mutex_lock(&osdc->request_mutex);
1166 last_tid = osdc->last_tid;
1167 while (1) {
1168 req = __lookup_request_ge(osdc, next_tid);
1169 if (!req)
1170 break;
1171 if (req->r_tid > last_tid)
1172 break;
1173
1174 next_tid = req->r_tid + 1;
1175 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1176 continue;
1177
1178 ceph_osdc_get_request(req);
1179 mutex_unlock(&osdc->request_mutex);
1180 dout("sync waiting on tid %llu (last is %llu)\n",
1181 req->r_tid, last_tid);
1182 wait_for_completion(&req->r_safe_completion);
1183 mutex_lock(&osdc->request_mutex);
1184 ceph_osdc_put_request(req);
1185 }
1186 mutex_unlock(&osdc->request_mutex);
1187 dout("sync done (thru tid %llu)\n", last_tid);
1188}
1189
1190/*
1191 * init, shutdown
1192 */
1193int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1194{
1195 int err;
1196
1197 dout("init\n");
1198 osdc->client = client;
1199 osdc->osdmap = NULL;
1200 init_rwsem(&osdc->map_sem);
1201 init_completion(&osdc->map_waiters);
1202 osdc->last_requested_map = 0;
1203 mutex_init(&osdc->request_mutex);
1204 osdc->last_tid = 0;
1205 osdc->osds = RB_ROOT;
1206 INIT_LIST_HEAD(&osdc->osd_lru);
1207 osdc->requests = RB_ROOT;
1208 INIT_LIST_HEAD(&osdc->req_lru);
1209 osdc->num_requests = 0;
1210 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1211 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1212
1213 schedule_delayed_work(&osdc->osds_timeout_work,
1214 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1215
1216 err = -ENOMEM;
1217 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1218 sizeof(struct ceph_osd_request));
1219 if (!osdc->req_mempool)
1220 goto out;
1221
1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1224 if (err < 0)
1225 goto out_mempool;
1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1229 if (err < 0)
1230 goto out_msgpool;
1231 return 0;
1232
1233out_msgpool:
1234 ceph_msgpool_destroy(&osdc->msgpool_op);
1235out_mempool:
1236 mempool_destroy(osdc->req_mempool);
1237out:
1238 return err;
1239}
1240
1241void ceph_osdc_stop(struct ceph_osd_client *osdc)
1242{
1243 cancel_delayed_work_sync(&osdc->timeout_work);
1244 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1245 if (osdc->osdmap) {
1246 ceph_osdmap_destroy(osdc->osdmap);
1247 osdc->osdmap = NULL;
1248 }
1249 remove_old_osds(osdc, 1);
1250 mempool_destroy(osdc->req_mempool);
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1253}
1254
1255/*
1256 * Read some contiguous pages. If we cross a stripe boundary, shorten
1257 * *plen. Return number of bytes read, or error.
1258 */
1259int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1260 struct ceph_vino vino, struct ceph_file_layout *layout,
1261 u64 off, u64 *plen,
1262 u32 truncate_seq, u64 truncate_size,
1263 struct page **pages, int num_pages)
1264{
1265 struct ceph_osd_request *req;
1266 int rc = 0;
1267
1268 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1269 vino.snap, off, *plen);
1270 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1272 NULL, 0, truncate_seq, truncate_size, NULL,
1273 false, 1);
1274 if (!req)
1275 return -ENOMEM;
1276
1277 /* it may be a short read due to an object boundary */
1278 req->r_pages = pages;
1279
1280 dout("readpages final extent is %llu~%llu (%d pages)\n",
1281 off, *plen, req->r_num_pages);
1282
1283 rc = ceph_osdc_start_request(osdc, req, false);
1284 if (!rc)
1285 rc = ceph_osdc_wait_request(osdc, req);
1286
1287 ceph_osdc_put_request(req);
1288 dout("readpages result %d\n", rc);
1289 return rc;
1290}
1291
1292/*
1293 * do a synchronous write on N pages
1294 */
1295int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1296 struct ceph_file_layout *layout,
1297 struct ceph_snap_context *snapc,
1298 u64 off, u64 len,
1299 u32 truncate_seq, u64 truncate_size,
1300 struct timespec *mtime,
1301 struct page **pages, int num_pages,
1302 int flags, int do_sync, bool nofail)
1303{
1304 struct ceph_osd_request *req;
1305 int rc = 0;
1306
1307 BUG_ON(vino.snap != CEPH_NOSNAP);
1308 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1309 CEPH_OSD_OP_WRITE,
1310 flags | CEPH_OSD_FLAG_ONDISK |
1311 CEPH_OSD_FLAG_WRITE,
1312 snapc, do_sync,
1313 truncate_seq, truncate_size, mtime,
1314 nofail, 1);
1315 if (!req)
1316 return -ENOMEM;
1317
1318 /* it may be a short write due to an object boundary */
1319 req->r_pages = pages;
1320 dout("writepages %llu~%llu (%d pages)\n", off, len,
1321 req->r_num_pages);
1322
1323 rc = ceph_osdc_start_request(osdc, req, nofail);
1324 if (!rc)
1325 rc = ceph_osdc_wait_request(osdc, req);
1326
1327 ceph_osdc_put_request(req);
1328 if (rc == 0)
1329 rc = len;
1330 dout("writepages result %d\n", rc);
1331 return rc;
1332}
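
/*
 * Illustrative usage fragment (editor's addition): a synchronous
 * one-page write through the helper above.  osdc, vino, layout, snapc,
 * mtime, off, len and page are hypothetical caller state; error
 * handling is elided.
 *
 *	rc = ceph_osdc_writepages(osdc, vino, layout, snapc,
 *				  off, len, truncate_seq, truncate_size,
 *				  &mtime, &page, 1,
 *				  0, 0, true);	// flags, do_sync, nofail
 *	if (rc < 0)
 *		...;				// rc == len on success
 */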
1333
1334/*
1335 * handle incoming message
1336 */
1337static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1338{
1339 struct ceph_osd *osd = con->private;
1340 struct ceph_osd_client *osdc;
1341 int type = le16_to_cpu(msg->hdr.type);
1342
1343 if (!osd)
1344 goto out;
1345 osdc = osd->o_osdc;
1346
1347 switch (type) {
1348 case CEPH_MSG_OSD_MAP:
1349 ceph_osdc_handle_map(osdc, msg);
1350 break;
1351 case CEPH_MSG_OSD_OPREPLY:
1352 handle_reply(osdc, msg, con);
1353 break;
1354
1355 default:
1356 pr_err("received unknown message type %d %s\n", type,
1357 ceph_msg_type_name(type));
1358 }
1359out:
1360 ceph_msg_put(msg);
1361}
1362
1363/*
1364 * lookup and return message for incoming reply. set up reply message
1365 * pages.
1366 */
1367static struct ceph_msg *get_reply(struct ceph_connection *con,
1368 struct ceph_msg_header *hdr,
1369 int *skip)
1370{
1371 struct ceph_osd *osd = con->private;
1372 struct ceph_osd_client *osdc = osd->o_osdc;
1373 struct ceph_msg *m;
1374 struct ceph_osd_request *req;
1375 int front = le32_to_cpu(hdr->front_len);
1376 int data_len = le32_to_cpu(hdr->data_len);
1377 u64 tid;
1378
1379 tid = le64_to_cpu(hdr->tid);
1380 mutex_lock(&osdc->request_mutex);
1381 req = __lookup_request(osdc, tid);
1382 if (!req) {
1383 *skip = 1;
1384 m = NULL;
1385 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1386 osd->o_osd);
1387 goto out;
1388 }
1389
1390 if (req->r_con_filling_msg) {
1391 dout("get_reply revoking msg %p from old con %p\n",
1392 req->r_reply, req->r_con_filling_msg);
1393 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1394 ceph_con_put(req->r_con_filling_msg);
1395 req->r_con_filling_msg = NULL;
1396 }
1397
1398 if (front > req->r_reply->front.iov_len) {
1399 pr_warning("get_reply front %d > preallocated %d\n",
1400 front, (int)req->r_reply->front.iov_len);
1401 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1402 if (!m)
1403 goto out;
1404 ceph_msg_put(req->r_reply);
1405 req->r_reply = m;
1406 }
1407 m = ceph_msg_get(req->r_reply);
1408
1409 if (data_len > 0) {
1410 unsigned data_off = le16_to_cpu(hdr->data_off);
1411 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1412
1413 if (unlikely(req->r_num_pages < want)) {
1414 pr_warning("tid %lld reply %d > expected %d pages\n",
1415			   tid, want, req->r_num_pages);
1416 *skip = 1;
1417 ceph_msg_put(m);
1418 m = NULL;
1419 goto out;
1420 }
1421 m->pages = req->r_pages;
1422 m->nr_pages = req->r_num_pages;
1423 }
1424 *skip = 0;
1425 req->r_con_filling_msg = ceph_con_get(con);
1426 dout("get_reply tid %lld %p\n", tid, m);
1427
1428out:
1429 mutex_unlock(&osdc->request_mutex);
1430 return m;
1431
1432}
1433
1434static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1435 struct ceph_msg_header *hdr,
1436 int *skip)
1437{
1438 struct ceph_osd *osd = con->private;
1439 int type = le16_to_cpu(hdr->type);
1440 int front = le32_to_cpu(hdr->front_len);
1441
1442 switch (type) {
1443 case CEPH_MSG_OSD_MAP:
1444 return ceph_msg_new(type, front, GFP_NOFS);
1445 case CEPH_MSG_OSD_OPREPLY:
1446 return get_reply(con, hdr, skip);
1447 default:
1448 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1449 osd->o_osd);
1450 *skip = 1;
1451 return NULL;
1452 }
1453}
1454
1455/*
1456 * Wrappers to refcount containing ceph_osd struct
1457 */
1458static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1459{
1460 struct ceph_osd *osd = con->private;
1461 if (get_osd(osd))
1462 return con;
1463 return NULL;
1464}
1465
1466static void put_osd_con(struct ceph_connection *con)
1467{
1468 struct ceph_osd *osd = con->private;
1469 put_osd(osd);
1470}
1471
1472/*
1473 * authentication
1474 */
1475static int get_authorizer(struct ceph_connection *con,
1476 void **buf, int *len, int *proto,
1477 void **reply_buf, int *reply_len, int force_new)
1478{
1479 struct ceph_osd *o = con->private;
1480 struct ceph_osd_client *osdc = o->o_osdc;
1481 struct ceph_auth_client *ac = osdc->client->monc.auth;
1482 int ret = 0;
1483
1484 if (force_new && o->o_authorizer) {
1485 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1486 o->o_authorizer = NULL;
1487 }
1488 if (o->o_authorizer == NULL) {
1489 ret = ac->ops->create_authorizer(
1490 ac, CEPH_ENTITY_TYPE_OSD,
1491 &o->o_authorizer,
1492 &o->o_authorizer_buf,
1493 &o->o_authorizer_buf_len,
1494 &o->o_authorizer_reply_buf,
1495 &o->o_authorizer_reply_buf_len);
1496 if (ret)
1497 return ret;
1498 }
1499
1500 *proto = ac->protocol;
1501 *buf = o->o_authorizer_buf;
1502 *len = o->o_authorizer_buf_len;
1503 *reply_buf = o->o_authorizer_reply_buf;
1504 *reply_len = o->o_authorizer_reply_buf_len;
1505 return 0;
1506}
1507
1508
1509static int verify_authorizer_reply(struct ceph_connection *con, int len)
1510{
1511 struct ceph_osd *o = con->private;
1512 struct ceph_osd_client *osdc = o->o_osdc;
1513 struct ceph_auth_client *ac = osdc->client->monc.auth;
1514
1515 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1516}
1517
1518static int invalidate_authorizer(struct ceph_connection *con)
1519{
1520 struct ceph_osd *o = con->private;
1521 struct ceph_osd_client *osdc = o->o_osdc;
1522 struct ceph_auth_client *ac = osdc->client->monc.auth;
1523
1524 if (ac->ops->invalidate_authorizer)
1525 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1526
1527 return ceph_monc_validate_auth(&osdc->client->monc);
1528}
1529
1530static const struct ceph_connection_operations osd_con_ops = {
1531 .get = get_osd_con,
1532 .put = put_osd_con,
1533 .dispatch = dispatch,
1534 .get_authorizer = get_authorizer,
1535 .verify_authorizer_reply = verify_authorizer_reply,
1536 .invalidate_authorizer = invalidate_authorizer,
1537 .alloc_msg = alloc_msg,
1538 .fault = osd_reset,
1539};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
deleted file mode 100644
index ce776989ef6a..000000000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,167 +0,0 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
53
54 struct ceph_connection *r_con_filling_msg;
55
56 struct ceph_msg *r_request, *r_reply;
57 int r_result;
58 int r_flags; /* any additional flags for the osd */
59 u32 r_sent; /* >0 if r_request is sending/sent */
60 int r_got_reply;
61
62 struct ceph_osd_client *r_osdc;
63 struct kref r_kref;
64 bool r_mempool;
65 struct completion r_completion, r_safe_completion;
66 ceph_osdc_callback_t r_callback, r_safe_callback;
67 struct ceph_eversion r_reassert_version;
68 struct list_head r_unsafe_item;
69
70 struct inode *r_inode; /* for use by callbacks */
71
72 char r_oid[40]; /* object name */
73 int r_oid_len;
74 unsigned long r_stamp; /* send OR check time */
75 bool r_resend; /* msg send failed, needs retry */
76
77 struct ceph_file_layout r_file_layout;
78 struct ceph_snap_context *r_snapc; /* snap context for writes */
79 unsigned r_num_pages; /* size of page array (follows) */
80 struct page **r_pages; /* pages for data payload */
81 int r_pages_from_pool;
82 int r_own_pages; /* if true, i own page list */
83};
84
85struct ceph_osd_client {
86 struct ceph_client *client;
87
88 struct ceph_osdmap *osdmap; /* current map */
89 struct rw_semaphore map_sem;
90 struct completion map_waiters;
91 u64 last_requested_map;
92
93 struct mutex request_mutex;
94 struct rb_root osds; /* osds */
95 struct list_head osd_lru; /* idle osds */
96 u64 timeout_tid; /* tid of timeout triggering rq */
97 u64 last_tid; /* tid of last request */
98 struct rb_root requests; /* pending requests */
99 struct list_head req_lru; /* pending requests lru */
100 int num_requests;
101 struct delayed_work timeout_work;
102 struct delayed_work osds_timeout_work;
103#ifdef CONFIG_DEBUG_FS
104 struct dentry *debugfs_file;
105#endif
106
107 mempool_t *req_mempool;
108
109 struct ceph_msgpool msgpool_op;
110 struct ceph_msgpool msgpool_op_reply;
111};
112
113extern int ceph_osdc_init(struct ceph_osd_client *osdc,
114 struct ceph_client *client);
115extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
116
117extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
118 struct ceph_msg *msg);
119extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
120 struct ceph_msg *msg);
121
122extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 offset, u64 *len, int op, int flags,
126 struct ceph_snap_context *snapc,
127 int do_sync, u32 truncate_seq,
128 u64 truncate_size,
129 struct timespec *mtime,
130 bool use_mempool, int num_reply);
131
132static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
133{
134 kref_get(&req->r_kref);
135}
136extern void ceph_osdc_release_request(struct kref *kref);
137static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
138{
139 kref_put(&req->r_kref, ceph_osdc_release_request);
140}
141
142extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
143 struct ceph_osd_request *req,
144 bool nofail);
145extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
146 struct ceph_osd_request *req);
147extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
148
149extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
150 struct ceph_vino vino,
151 struct ceph_file_layout *layout,
152 u64 off, u64 *plen,
153 u32 truncate_seq, u64 truncate_size,
154 struct page **pages, int nr_pages);
155
156extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
157 struct ceph_vino vino,
158 struct ceph_file_layout *layout,
159 struct ceph_snap_context *sc,
160 u64 off, u64 len,
161 u32 truncate_seq, u64 truncate_size,
162 struct timespec *mtime,
163 struct page **pages, int nr_pages,
164 int flags, int do_sync, bool nofail);
165
166#endif
167
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
deleted file mode 100644
index e31f118f1392..000000000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1110 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
28 "up");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is the smallest value 2^n-1 that is >= foo-1.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
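
/*
 * Worked example (editor's addition): masks produced above for a few
 * pg_num values.  calc_bits_of(n - 1) bits are kept, so the mask spans
 * the smallest power-of-two range that can index n pgs:
 *
 *	pg_num	pg_num-1	bits	mask
 *	   8	 7 (0b0111)	 3	 0x7
 *	  12	11 (0b1011)	 4	 0xf
 *	  16	15 (0b1111)	 4	 0xf
 */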
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
316
317/*
318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
319 * to a set of osds)
320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
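
/*
 * Illustrative sketch (editor's addition): pgid_cmp() orders pgids by
 * their raw 64-bit encoding; any strict total order works as an rbtree
 * key.  The same comparison without the pointer cast (which relies on
 * struct ceph_pg being exactly 64 bits):
 */
#include <stdint.h>
#include <string.h>

static int u64_key_cmp(const void *l, const void *r)
{
	uint64_t a, b;

	memcpy(&a, l, sizeof(a));	/* avoids strict-aliasing issues */
	memcpy(&b, r, sizeof(b));
	return (a < b) ? -1 : (a > b) ? 1 : 0;
}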
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
428{
429 unsigned n, m;
430
431 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
432 calc_pg_masks(pi);
433
434 /* num_snaps * snap_info_t */
435 n = le32_to_cpu(pi->v.num_snaps);
436 while (n--) {
437 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
438 sizeof(struct ceph_timespec), bad);
439 *p += sizeof(u64) + /* key */
440 1 + sizeof(u64) + /* u8, snapid */
441 sizeof(struct ceph_timespec);
442 m = ceph_decode_32(p); /* snap name */
443 *p += m;
444 }
445
446 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
447 return 0;
448
449bad:
450 return -EINVAL;
451}
452
453static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
454{
455 struct ceph_pg_pool_info *pi;
456 u32 num, len, pool;
457
458 ceph_decode_32_safe(p, end, num, bad);
459 dout(" %d pool names\n", num);
460 while (num--) {
461 ceph_decode_32_safe(p, end, pool, bad);
462 ceph_decode_32_safe(p, end, len, bad);
463 dout(" pool %d len %d\n", pool, len);
464 pi = __lookup_pg_pool(&map->pg_pools, pool);
465 if (pi) {
466 kfree(pi->name);
467 pi->name = kmalloc(len + 1, GFP_NOFS);
468 if (pi->name) {
469 memcpy(pi->name, *p, len);
470 pi->name[len] = '\0';
471 dout(" name is %s\n", pi->name);
472 }
473 }
474 *p += len;
475 }
476 return 0;
477
478bad:
479 return -EINVAL;
480}
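
All of the *_safe decode helpers used above share one pattern: verify that the buffer has room before advancing *p, and jump to the caller-supplied bad label on truncated input. Roughly, as a paraphrase of the helpers in decode.h (not the verbatim definitions):

#define ceph_decode_need(p, end, n, bad)			\
	do {							\
		if (unlikely((size_t)((end) - *(p)) < (n)))	\
			goto bad;				\
	} while (0)

static inline u32 ceph_decode_32(void **p)
{
	u32 v = get_unaligned_le32(*p);	/* wire format is little-endian */

	*p += sizeof(u32);
	return v;
}

#define ceph_decode_32_safe(p, end, v, bad)			\
	do {							\
		ceph_decode_need(p, end, sizeof(u32), bad);	\
		v = ceph_decode_32(p);				\
	} while (0)
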
481
482/*
483 * osd map
484 */
485void ceph_osdmap_destroy(struct ceph_osdmap *map)
486{
487 dout("osdmap_destroy %p\n", map);
488 if (map->crush)
489 crush_destroy(map->crush);
490 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
491 struct ceph_pg_mapping *pg =
492 rb_entry(rb_first(&map->pg_temp),
493 struct ceph_pg_mapping, node);
494 rb_erase(&pg->node, &map->pg_temp);
495 kfree(pg);
496 }
497 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
498 struct ceph_pg_pool_info *pi =
499 rb_entry(rb_first(&map->pg_pools),
500 struct ceph_pg_pool_info, node);
501 __remove_pg_pool(&map->pg_pools, pi);
502 }
503 kfree(map->osd_state);
504 kfree(map->osd_weight);
505 kfree(map->osd_addr);
506 kfree(map);
507}
508
509/*
510 * adjust max osd value. reallocate arrays.
511 */
512static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
513{
514 u8 *state;
515 struct ceph_entity_addr *addr;
516 u32 *weight;
517
518 state = kcalloc(max, sizeof(*state), GFP_NOFS);
519 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
520 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
521 if (state == NULL || addr == NULL || weight == NULL) {
522 kfree(state);
523 kfree(addr);
524 kfree(weight);
525 return -ENOMEM;
526 }
527
528 /* copy old? */
529 if (map->osd_state) {
530 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
531 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
532 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
533 kfree(map->osd_state);
534 kfree(map->osd_addr);
535 kfree(map->osd_weight);
536 }
537
538 map->osd_state = state;
539 map->osd_weight = weight;
540 map->osd_addr = addr;
541 map->max_osd = max;
542 return 0;
543}
544
545/*
546 * decode a full map.
547 */
548struct ceph_osdmap *osdmap_decode(void **p, void *end)
549{
550 struct ceph_osdmap *map;
551 u16 version;
552 u32 len, max, i;
553 u8 ev;
554 int err = -EINVAL;
555 void *start = *p;
556 struct ceph_pg_pool_info *pi;
557
558 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
559
560 map = kzalloc(sizeof(*map), GFP_NOFS);
561 if (map == NULL)
562 return ERR_PTR(-ENOMEM);
563 map->pg_temp = RB_ROOT;
564
565 ceph_decode_16_safe(p, end, version, bad);
566 if (version > CEPH_OSDMAP_VERSION) {
567 pr_warning("got unknown v %d > %d of osdmap\n", version,
568 CEPH_OSDMAP_VERSION);
569 goto bad;
570 }
571
572 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
573 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
574 map->epoch = ceph_decode_32(p);
575 ceph_decode_copy(p, &map->created, sizeof(map->created));
576 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
577
578 ceph_decode_32_safe(p, end, max, bad);
579 while (max--) {
580 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
581 pi = kzalloc(sizeof(*pi), GFP_NOFS);
582 if (!pi)
583 goto bad;
584 pi->id = ceph_decode_32(p);
585 ev = ceph_decode_8(p); /* encoding version */
586 if (ev > CEPH_PG_POOL_VERSION) {
587 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
588 ev, CEPH_PG_POOL_VERSION);
589 kfree(pi);
590 goto bad;
591 }
592 err = __decode_pool(p, end, pi);
593 if (err < 0)
594 goto bad;
595 __insert_pg_pool(&map->pg_pools, pi);
596 }
597
598 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
599 goto bad;
600
601 ceph_decode_32_safe(p, end, map->pool_max, bad);
602
603 ceph_decode_32_safe(p, end, map->flags, bad);
604
605 max = ceph_decode_32(p);
606
607 /* (re)alloc osd arrays */
608 err = osdmap_set_max_osd(map, max);
609 if (err < 0)
610 goto bad;
611 dout("osdmap_decode max_osd = %d\n", map->max_osd);
612
613 /* osds */
614 err = -EINVAL;
615 ceph_decode_need(p, end, 3*sizeof(u32) +
616 map->max_osd*(1 + sizeof(*map->osd_weight) +
617 sizeof(*map->osd_addr)), bad);
618 *p += 4; /* skip length field (should match max) */
619 ceph_decode_copy(p, map->osd_state, map->max_osd);
620
621 *p += 4; /* skip length field (should match max) */
622 for (i = 0; i < map->max_osd; i++)
623 map->osd_weight[i] = ceph_decode_32(p);
624
625 *p += 4; /* skip length field (should match max) */
626 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
627 for (i = 0; i < map->max_osd; i++)
628 ceph_decode_addr(&map->osd_addr[i]);
629
630 /* pg_temp */
631 ceph_decode_32_safe(p, end, len, bad);
632 for (i = 0; i < len; i++) {
633 int n, j;
634 struct ceph_pg pgid;
635 struct ceph_pg_mapping *pg;
636
637 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
638 ceph_decode_copy(p, &pgid, sizeof(pgid));
639 n = ceph_decode_32(p);
640 ceph_decode_need(p, end, n * sizeof(u32), bad);
641 err = -ENOMEM;
642 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
643 if (!pg)
644 goto bad;
645 pg->pgid = pgid;
646 pg->len = n;
647 for (j = 0; j < n; j++)
648 pg->osds[j] = ceph_decode_32(p);
649
650 err = __insert_pg_mapping(pg, &map->pg_temp);
651 if (err)
652 goto bad;
653 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
654 }
655
656 /* crush */
657 ceph_decode_32_safe(p, end, len, bad);
658 dout("osdmap_decode crush len %d from off 0x%x\n", len,
659 (int)(*p - start));
660 ceph_decode_need(p, end, len, bad);
661 map->crush = crush_decode(*p, end);
662 *p += len;
663 if (IS_ERR(map->crush)) {
664 err = PTR_ERR(map->crush);
665 map->crush = NULL;
666 goto bad;
667 }
668
669 /* ignore the rest of the map */
670 *p = end;
671
672 dout("osdmap_decode done %p %p\n", *p, end);
673 return map;
674
675bad:
676 dout("osdmap_decode fail\n");
677 ceph_osdmap_destroy(map);
678 return ERR_PTR(err);
679}
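
For context, a sketch of how a caller might feed a CEPH_MSG_OSD_MAP payload into osdmap_decode(); the real caller lives in the osd_client map handling, and the msg/oldmap locals here are illustrative assumptions:

	void *p = msg->front.iov_base;
	void *end = p + msg->front.iov_len;
	struct ceph_osdmap *newmap;

	newmap = osdmap_decode(&p, end);
	if (IS_ERR(newmap))
		return PTR_ERR(newmap);
	/* on success, releasing any previous map is the caller's job */
	if (oldmap)
		ceph_osdmap_destroy(oldmap);
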
680
681/*
682 * decode and apply an incremental map update.
683 */
684struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
685 struct ceph_osdmap *map,
686 struct ceph_messenger *msgr)
687{
688 struct crush_map *newcrush = NULL;
689 struct ceph_fsid fsid;
690 u32 epoch = 0;
691 struct ceph_timespec modified;
692 u32 len, pool;
693 __s32 new_pool_max, new_flags, max;
694 void *start = *p;
695 int err = -EINVAL;
696 u16 version;
697 struct rb_node *rbp;
698
699 ceph_decode_16_safe(p, end, version, bad);
700 if (version > CEPH_OSDMAP_INC_VERSION) {
701 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
702 CEPH_OSDMAP_INC_VERSION);
703 goto bad;
704 }
705
706 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
707 bad);
708 ceph_decode_copy(p, &fsid, sizeof(fsid));
709 epoch = ceph_decode_32(p);
710 BUG_ON(epoch != map->epoch+1);
711 ceph_decode_copy(p, &modified, sizeof(modified));
712 new_pool_max = ceph_decode_32(p);
713 new_flags = ceph_decode_32(p);
714
715 /* full map? */
716 ceph_decode_32_safe(p, end, len, bad);
717 if (len > 0) {
718 dout("apply_incremental full map len %d, %p to %p\n",
719 len, *p, end);
720 return osdmap_decode(p, min(*p+len, end));
721 }
722
723 /* new crush? */
724 ceph_decode_32_safe(p, end, len, bad);
725 if (len > 0) {
726 dout("apply_incremental new crush map len %d, %p to %p\n",
727 len, *p, end);
728 newcrush = crush_decode(*p, min(*p+len, end));
729 if (IS_ERR(newcrush))
730 return ERR_CAST(newcrush);
731 *p += len;
732 }
733
734 /* new flags? */
735 if (new_flags >= 0)
736 map->flags = new_flags;
737 if (new_pool_max >= 0)
738 map->pool_max = new_pool_max;
739
740 ceph_decode_need(p, end, 5*sizeof(u32), bad);
741
742 /* new max? */
743 max = ceph_decode_32(p);
744 if (max >= 0) {
745 err = osdmap_set_max_osd(map, max);
746 if (err < 0)
747 goto bad;
748 }
749
750 map->epoch++;
751 map->modified = modified;
752 if (newcrush) {
753 if (map->crush)
754 crush_destroy(map->crush);
755 map->crush = newcrush;
756 newcrush = NULL;
757 }
758
759 /* new_pool */
760 ceph_decode_32_safe(p, end, len, bad);
761 while (len--) {
762 __u8 ev;
763 struct ceph_pg_pool_info *pi;
764
765 ceph_decode_32_safe(p, end, pool, bad);
766 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
767 ev = ceph_decode_8(p); /* encoding version */
768 if (ev > CEPH_PG_POOL_VERSION) {
769 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
770 ev, CEPH_PG_POOL_VERSION);
771 goto bad;
772 }
773 pi = __lookup_pg_pool(&map->pg_pools, pool);
774 if (!pi) {
775 pi = kzalloc(sizeof(*pi), GFP_NOFS);
776 if (!pi) {
777 err = -ENOMEM;
778 goto bad;
779 }
780 pi->id = pool;
781 __insert_pg_pool(&map->pg_pools, pi);
782 }
783 err = __decode_pool(p, end, pi);
784 if (err < 0)
785 goto bad;
786 }
787 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
788 goto bad;
789
790 /* old_pool */
791 ceph_decode_32_safe(p, end, len, bad);
792 while (len--) {
793 struct ceph_pg_pool_info *pi;
794
795 ceph_decode_32_safe(p, end, pool, bad);
796 pi = __lookup_pg_pool(&map->pg_pools, pool);
797 if (pi)
798 __remove_pg_pool(&map->pg_pools, pi);
799 }
800
801 /* new_up */
802 err = -EINVAL;
803 ceph_decode_32_safe(p, end, len, bad);
804 while (len--) {
805 u32 osd;
806 struct ceph_entity_addr addr;
807 ceph_decode_32_safe(p, end, osd, bad);
808 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
809 ceph_decode_addr(&addr);
810 pr_info("osd%d up\n", osd);
811 BUG_ON(osd >= map->max_osd);
812 map->osd_state[osd] |= CEPH_OSD_UP;
813 map->osd_addr[osd] = addr;
814 }
815
816 /* new_down */
817 ceph_decode_32_safe(p, end, len, bad);
818 while (len--) {
819 u32 osd;
820 ceph_decode_32_safe(p, end, osd, bad);
821 (*p)++; /* clean flag */
822 pr_info("osd%d down\n", osd);
823 if (osd < map->max_osd)
824 map->osd_state[osd] &= ~CEPH_OSD_UP;
825 }
826
827 /* new_weight */
828 ceph_decode_32_safe(p, end, len, bad);
829 while (len--) {
830 u32 osd, off;
831 ceph_decode_need(p, end, sizeof(u32)*2, bad);
832 osd = ceph_decode_32(p);
833 off = ceph_decode_32(p);
834 pr_info("osd%d weight 0x%x %s\n", osd, off,
835 off == CEPH_OSD_IN ? "(in)" :
836 (off == CEPH_OSD_OUT ? "(out)" : ""));
837 if (osd < map->max_osd)
838 map->osd_weight[osd] = off;
839 }
840
841 /* new_pg_temp */
842 rbp = rb_first(&map->pg_temp);
843 ceph_decode_32_safe(p, end, len, bad);
844 while (len--) {
845 struct ceph_pg_mapping *pg;
846 int j;
847 struct ceph_pg pgid;
848 u32 pglen;
849 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
850 ceph_decode_copy(p, &pgid, sizeof(pgid));
851 pglen = ceph_decode_32(p);
852
853 /* remove any? */
854 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
855 node)->pgid, pgid) <= 0) {
856 struct ceph_pg_mapping *cur =
857 rb_entry(rbp, struct ceph_pg_mapping, node);
858
859 rbp = rb_next(rbp);
860 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
861 rb_erase(&cur->node, &map->pg_temp);
862 kfree(cur);
863 }
864
865 if (pglen) {
866 /* insert */
867 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
868 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
869 if (!pg) {
870 err = -ENOMEM;
871 goto bad;
872 }
873 pg->pgid = pgid;
874 pg->len = pglen;
875 for (j = 0; j < pglen; j++)
876 pg->osds[j] = ceph_decode_32(p);
877 err = __insert_pg_mapping(pg, &map->pg_temp);
878 if (err) {
879 kfree(pg);
880 goto bad;
881 }
882 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
883 pglen);
884 }
885 }
886 while (rbp) {
887 struct ceph_pg_mapping *cur =
888 rb_entry(rbp, struct ceph_pg_mapping, node);
889
890 rbp = rb_next(rbp);
891 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
892 rb_erase(&cur->node, &map->pg_temp);
893 kfree(cur);
894 }
895
896 /* ignore the rest */
897 *p = end;
898 return map;
899
900bad:
901 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
902 epoch, (int)(*p - start), *p, start, end);
903 print_hex_dump(KERN_DEBUG, "osdmap: ",
904 DUMP_PREFIX_OFFSET, 16, 1,
905 start, end - start, true);
906 if (newcrush)
907 crush_destroy(newcrush);
908 return ERR_PTR(err);
909}
910
911
912
913
914/*
915 * calculate file layout from given offset, length.
916 * fill in correct oid, logical length, and object extent
917 * offset, length.
918 *
919 * for now, we write only a single su, until we can
920 * pass a stride back to the caller.
921 */
922void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
923 u64 off, u64 *plen,
924 u64 *ono,
925 u64 *oxoff, u64 *oxlen)
926{
927 u32 osize = le32_to_cpu(layout->fl_object_size);
928 u32 su = le32_to_cpu(layout->fl_stripe_unit);
929 u32 sc = le32_to_cpu(layout->fl_stripe_count);
930 u32 bl, stripeno, stripepos, objsetno;
931 u32 su_per_object;
932 u64 t, su_offset;
933
934 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
935 osize, su);
936 su_per_object = osize / su;
937 dout("osize %u / su %u = su_per_object %u\n", osize, su,
938 su_per_object);
939
940 BUG_ON((su & ~PAGE_MASK) != 0);
941 /* bl = *off / su; */
942 t = off;
943 do_div(t, su);
944 bl = t;
945 dout("off %llu / su %u = bl %u\n", off, su, bl);
946
947 stripeno = bl / sc;
948 stripepos = bl % sc;
949 objsetno = stripeno / su_per_object;
950
951 *ono = objsetno * sc + stripepos;
952 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
953
954 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
955 t = off;
956 su_offset = do_div(t, su);
957 *oxoff = su_offset + (stripeno % su_per_object) * su;
958
959 /*
960 * Calculate the length of the extent being written to the selected
961 * object. This is the minimum of the full length requested (plen) or
962 * the remainder of the current stripe being written to.
963 */
964 *oxlen = min_t(u64, *plen, su - su_offset);
965 *plen = *oxlen;
966
967 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
968}
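
A worked example may make the striping arithmetic concrete; the numbers below are illustrative, not from the source:

/*
 * Example layout: su = 65536 (64 KB), sc = 3, object_size = 262144 (256 KB)
 *   => su_per_object = 262144 / 65536 = 4
 *
 * For off = 300000, *plen = 100000:
 *   bl        = 300000 / 65536            = 4      (stripe block)
 *   stripeno  = 4 / 3                     = 1
 *   stripepos = 4 % 3                     = 1
 *   objsetno  = 1 / 4                     = 0
 *   *ono      = 0 * 3 + 1                 = 1      (second object)
 *   su_offset = 300000 % 65536            = 37856
 *   *oxoff    = 37856 + (1 % 4) * 65536   = 103392
 *   *oxlen    = min(100000, 65536 - 37856) = 27680
 *
 * Only the remainder of the current stripe unit is mapped; the caller
 * must loop to cover the rest of the requested 100000 bytes.
 */
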
969
970/*
971 * calculate an object layout (i.e. pgid) from an oid,
972 * file_layout, and osdmap
973 */
974int ceph_calc_object_layout(struct ceph_object_layout *ol,
975 const char *oid,
976 struct ceph_file_layout *fl,
977 struct ceph_osdmap *osdmap)
978{
979 unsigned num, num_mask;
980 struct ceph_pg pgid;
981 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
982 int poolid = le32_to_cpu(fl->fl_pg_pool);
983 struct ceph_pg_pool_info *pool;
984 unsigned ps;
985
986 BUG_ON(!osdmap);
987
988 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
989 if (!pool)
990 return -EIO;
991 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
992 if (preferred >= 0) {
993 ps += preferred;
994 num = le32_to_cpu(pool->v.lpg_num);
995 num_mask = pool->lpg_num_mask;
996 } else {
997 num = le32_to_cpu(pool->v.pg_num);
998 num_mask = pool->pg_num_mask;
999 }
1000
1001 pgid.ps = cpu_to_le16(ps);
1002 pgid.preferred = cpu_to_le16(preferred);
1003 pgid.pool = fl->fl_pg_pool;
1004 if (preferred >= 0)
1005 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
1006 (int)preferred);
1007 else
1008 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1009
1010 ol->ol_pgid = pgid;
1011 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1012 return 0;
1013}
1014
1015/*
1016 * Calculate raw osd vector for the given pgid. Return pointer to osd
1017 * array, or NULL on failure.
1018 */
1019static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1020 int *osds, int *num)
1021{
1022 struct ceph_pg_mapping *pg;
1023 struct ceph_pg_pool_info *pool;
1024 int ruleno;
1025 unsigned poolid, ps, pps;
1026 int preferred;
1027
1028 /* pg_temp? */
1029 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1030 if (pg) {
1031 *num = pg->len;
1032 return pg->osds;
1033 }
1034
1035 /* crush */
1036 poolid = le32_to_cpu(pgid.pool);
1037 ps = le16_to_cpu(pgid.ps);
1038 preferred = (s16)le16_to_cpu(pgid.preferred);
1039
1040 /* don't forcefeed bad device ids to crush */
1041 if (preferred >= osdmap->max_osd ||
1042 preferred >= osdmap->crush->max_devices)
1043 preferred = -1;
1044
1045 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1046 if (!pool)
1047 return NULL;
1048 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1049 pool->v.type, pool->v.size);
1050 if (ruleno < 0) {
1051 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1052 poolid, pool->v.crush_ruleset, pool->v.type,
1053 pool->v.size);
1054 return NULL;
1055 }
1056
1057 if (preferred >= 0)
1058 pps = ceph_stable_mod(ps,
1059 le32_to_cpu(pool->v.lpgp_num),
1060 pool->lpgp_num_mask);
1061 else
1062 pps = ceph_stable_mod(ps,
1063 le32_to_cpu(pool->v.pgp_num),
1064 pool->pgp_num_mask);
1065 pps += poolid;
1066 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1067 min_t(int, pool->v.size, *num),
1068 preferred, osdmap->osd_weight);
1069 return osds;
1070}
1071
1072/*
1073 * Return acting set for given pgid.
1074 */
1075int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1076 int *acting)
1077{
1078 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1079 int i, o, num = CEPH_PG_MAX_SIZE;
1080
1081 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1082 if (!osds)
1083 return -1;
1084
1085 /* primary is first up osd */
1086 o = 0;
1087 for (i = 0; i < num; i++)
1088 if (ceph_osd_is_up(osdmap, osds[i]))
1089 acting[o++] = osds[i];
1090 return o;
1091}
1092
1093/*
1094 * Return primary osd for given pgid, or -1 if none.
1095 */
1096int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1097{
1098 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1099 int i, num = CEPH_PG_MAX_SIZE;
1100
1101 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1102 if (!osds)
1103 return -1;
1104
1105 /* primary is first up osd */
1106 for (i = 0; i < num; i++)
1107 if (ceph_osd_is_up(osdmap, osds[i]))
1108 return osds[i];
1109 return -1;
1110}
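
Taken together with ceph_calc_file_object_mapping() above, these helpers form the full file -> object -> PG -> OSD pipeline. A hypothetical caller sketch; the locals and send_request_to() are illustrative, not the actual client code:

	u64 ono, oxoff, oxlen, len = 4096;	/* bytes the caller wants */
	char oid[40];
	struct ceph_object_layout ol;
	int osds[CEPH_PG_MAX_SIZE], num;

	ceph_calc_file_object_mapping(layout, off, &len, &ono, &oxoff, &oxlen);
	snprintf(oid, sizeof(oid), "%llx.%08llx", ino, ono);
	if (ceph_calc_object_layout(&ol, oid, layout, osdmap) < 0)
		return;
	num = ceph_calc_pg_acting(osdmap, ol.ol_pgid, osds);
	if (num > 0)
		send_request_to(osds[0]);	/* osds[0] is the primary */
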
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
deleted file mode 100644
index 970b547e510d..000000000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,128 +0,0 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before i start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid);
127
128#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index 46a368b6dce5..000000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 struct page *page = list_entry(pl->head.prev, struct page,
11 lru);
12 kunmap(page);
13}
14
15int ceph_pagelist_release(struct ceph_pagelist *pl)
16{
17 if (pl->mapped_tail)
18 ceph_pagelist_unmap_tail(pl);
19
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 return 0;
27}
28
29static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
30{
31 struct page *page = __page_cache_alloc(GFP_NOFS);
32 if (!page)
33 return -ENOMEM;
34 pl->room += PAGE_SIZE;
35 list_add_tail(&page->lru, &pl->head);
36 if (pl->mapped_tail)
37 ceph_pagelist_unmap_tail(pl);
38 pl->mapped_tail = kmap(page);
39 return 0;
40}
41
42int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
43{
44 while (pl->room < len) {
45 size_t bit = pl->room;
46 int ret;
47
48 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
49 buf, bit);
50 pl->length += bit;
51 pl->room -= bit;
52 buf += bit;
53 len -= bit;
54 ret = ceph_pagelist_addpage(pl);
55 if (ret)
56 return ret;
57 }
58
59 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
60 pl->length += len;
61 pl->room -= len;
62 return 0;
63}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
deleted file mode 100644
index e8a4187e1087..000000000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
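
The API is append-only: encode helpers grow the list a page at a time and only the tail page stays kmapped. A minimal usage sketch, hypothetical and with the payload values invented:

static int pagelist_example(void)
{
	struct ceph_pagelist pl;
	int err;

	ceph_pagelist_init(&pl);
	err = ceph_pagelist_encode_32(&pl, 1);		/* e.g. a version */
	if (!err)
		err = ceph_pagelist_encode_string(&pl, "foo", 3);
	/* on success, pl.head/pl.length would be handed to the messenger */
	ceph_pagelist_release(&pl);			/* frees all pages */
	return err;
}
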
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
deleted file mode 100644
index 6d5247f2e81b..000000000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,405 +0,0 @@
1#ifndef CEPH_RADOS_H
2#define CEPH_RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
62
63/*
64 * placement group.
65 * we encode this into one __le64.
66 */
67struct ceph_pg {
68 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */
71} __attribute__ ((packed));
72
73/*
74 * pg_pool is a set of pgs storing a pool of objects
75 *
76 * pg_num -- base number of pseudorandomly placed pgs
77 *
78 * pgp_num -- effective number when calculating pg placement. this
79 * is used for pg_num increases. new pgs result in data being "split"
80 * into new pgs. for this to proceed smoothly, new pgs are initially
81 * colocated with their parents; that is, pgp_num doesn't increase
82 * until the new pgs have successfully split. only _then_ are the new
83 * pgs placed independently.
84 *
85 * lpg_num -- localized pg count (per device). replicas are randomly
86 * selected.
87 *
88 * lpgp_num -- as above.
89 */
90#define CEPH_PG_TYPE_REP 1
91#define CEPH_PG_TYPE_RAID4 2
92#define CEPH_PG_POOL_VERSION 2
93struct ceph_pg_pool {
94 __u8 type; /* CEPH_PG_TYPE_* */
95 __u8 size; /* number of osds in each pg */
96 __u8 crush_ruleset; /* crush placement rule */
97 __u8 object_hash; /* hash mapping object name to ps */
98 __le32 pg_num, pgp_num; /* number of pg's */
99 __le32 lpg_num, lpgp_num; /* number of localized pg's */
100 __le32 last_change; /* most recent epoch changed */
101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed));
107
108/*
109 * stable_mod func is used to control number of placement groups.
110 * similar to straight-up modulo, but produces a stable mapping as b
111 * increases over time. b is the number of bins, and bmask is the
112 * containing power of 2 minus 1.
113 *
114 * b <= bmask and bmask=(2**n)-1
115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
116 */
117static inline int ceph_stable_mod(int x, int b, int bmask)
118{
119 if ((x & bmask) < b)
120 return x & bmask;
121 else
122 return x & (bmask >> 1);
123}
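
A worked example (values illustrative): b = 12, bmask = 15.

/*
 *   x = 5:   5 & 15 = 5   < 12  -> 5
 *   x = 13: 13 & 15 = 13 >= 12  -> 13 & 7 = 5
 *
 * As b grows from 12 toward 16 (same bmask), only inputs whose low bits
 * fall in 12..15 move to a new bin; every other input keeps its mapping,
 * which is what makes the mod "stable" as pg counts increase.
 */
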
124
125/*
126 * object layout - how a given object should be stored.
127 */
128struct ceph_object_layout {
129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
130 __le32 ol_stripe_unit; /* for per-object parity, if any */
131} __attribute__ ((packed));
132
133/*
134 * compound epoch+version, used by storage layer to serialize mutations
135 */
136struct ceph_eversion {
137 __le32 epoch;
138 __le64 version;
139} __attribute__ ((packed));
140
141/*
142 * osd map bits
143 */
144
145/* status bits */
146#define CEPH_OSD_EXISTS 1
147#define CEPH_OSD_UP 2
148
149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
150#define CEPH_OSD_IN 0x10000
151#define CEPH_OSD_OUT 0
152
153
154/*
155 * osd map flag bits
156 */
157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
162
163/*
164 * osd ops
165 */
166#define CEPH_OSD_OP_MODE 0xf000
167#define CEPH_OSD_OP_MODE_RD 0x1000
168#define CEPH_OSD_OP_MODE_WR 0x2000
169#define CEPH_OSD_OP_MODE_RMW 0x3000
170#define CEPH_OSD_OP_MODE_SUB 0x4000
171
172#define CEPH_OSD_OP_TYPE 0x0f00
173#define CEPH_OSD_OP_TYPE_LOCK 0x0100
174#define CEPH_OSD_OP_TYPE_DATA 0x0200
175#define CEPH_OSD_OP_TYPE_ATTR 0x0300
176#define CEPH_OSD_OP_TYPE_EXEC 0x0400
177#define CEPH_OSD_OP_TYPE_PG 0x0500
178
179enum {
180 /** data **/
181 /* read */
182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
184
185 /* fancy read */
186 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
187
188 /* write */
189 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
190 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
191 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
192 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
193 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
194
195 /* fancy write */
196 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
197 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
198 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
199 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
200
201 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
202 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
207
208 /** attrs **/
209 /* read */
210 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
213
214 /* write */
215 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
216 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
217 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
218 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
219
220 /** subop **/
221 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
222 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
223 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
224 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
225 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
226
227 /** lock **/
228 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
229 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
230 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
231 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
232 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
233 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
234
235 /** exec **/
236 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
237
238 /** pg **/
239 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
240};
241
242static inline int ceph_osd_op_type_lock(int op)
243{
244 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
245}
246static inline int ceph_osd_op_type_data(int op)
247{
248 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
249}
250static inline int ceph_osd_op_type_attr(int op)
251{
252 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
253}
254static inline int ceph_osd_op_type_exec(int op)
255{
256 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
257}
258static inline int ceph_osd_op_type_pg(int op)
259{
260 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
261}
262
263static inline int ceph_osd_op_mode_subop(int op)
264{
265 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
266}
267static inline int ceph_osd_op_mode_read(int op)
268{
269 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
270}
271static inline int ceph_osd_op_mode_modify(int op)
272{
273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
274}
275
276/*
277 * note that the following tmap stuff is also defined in the ceph librados.h
278 * any modification here needs to be updated there
279 */
280#define CEPH_OSD_TMAP_HDR 'h'
281#define CEPH_OSD_TMAP_SET 's'
282#define CEPH_OSD_TMAP_RM 'r'
283
284extern const char *ceph_osd_op_name(int op);
285
286
287/*
288 * osd op flags
289 *
290 * An op may be READ, WRITE, or READ|WRITE.
291 */
292enum {
293 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
294 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
295 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
296 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
297 CEPH_OSD_FLAG_READ = 16, /* op may read */
298 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
299 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
300 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
301 CEPH_OSD_FLAG_BALANCE_READS = 256,
302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
306};
307
308enum {
309 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
310};
311
312#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
313#define EBLACKLISTED ESHUTDOWN /* blacklisted */
314
315/* xattr comparison */
316enum {
317 CEPH_OSD_CMPXATTR_OP_NOP = 0,
318 CEPH_OSD_CMPXATTR_OP_EQ = 1,
319 CEPH_OSD_CMPXATTR_OP_NE = 2,
320 CEPH_OSD_CMPXATTR_OP_GT = 3,
321 CEPH_OSD_CMPXATTR_OP_GTE = 4,
322 CEPH_OSD_CMPXATTR_OP_LT = 5,
323 CEPH_OSD_CMPXATTR_OP_LTE = 6
324};
325
326enum {
327 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
328 CEPH_OSD_CMPXATTR_MODE_U64 = 2
329};
330
331/*
332 * an individual object operation. each may be accompanied by some data
333 * payload
334 */
335struct ceph_osd_op {
336 __le16 op; /* CEPH_OSD_OP_* */
337 __le32 flags; /* CEPH_OSD_FLAG_* */
338 union {
339 struct {
340 __le64 offset, length;
341 __le64 truncate_size;
342 __le32 truncate_seq;
343 } __attribute__ ((packed)) extent;
344 struct {
345 __le32 name_len;
346 __le32 value_len;
347 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
348 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
349 } __attribute__ ((packed)) xattr;
350 struct {
351 __u8 class_len;
352 __u8 method_len;
353 __u8 argc;
354 __le32 indata_len;
355 } __attribute__ ((packed)) cls;
356 struct {
357 __le64 cookie, count;
358 } __attribute__ ((packed)) pgls;
359 struct {
360 __le64 snapid;
361 } __attribute__ ((packed)) snap;
362 };
363 __le32 payload_len;
364} __attribute__ ((packed));
365
366/*
367 * osd request message header. each request may include multiple
368 * ceph_osd_op object operations.
369 */
370struct ceph_osd_request_head {
371 __le32 client_inc; /* client incarnation */
372 struct ceph_object_layout layout; /* pgid */
373 __le32 osdmap_epoch; /* client's osdmap epoch */
374
375 __le32 flags;
376
377 struct ceph_timespec mtime; /* for mutations only */
378 struct ceph_eversion reassert_version; /* if we are replaying op */
379
380 __le32 object_len; /* length of object name */
381
382 __le64 snapid; /* snapid to read */
383 __le64 snap_seq; /* writer's snap context */
384 __le32 num_snaps;
385
386 __le16 num_ops;
387 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
388} __attribute__ ((packed));
389
390struct ceph_osd_reply_head {
391 __le32 client_inc; /* client incarnation */
392 __le32 flags;
393 struct ceph_object_layout layout;
394 __le32 osdmap_epoch;
395 struct ceph_eversion reassert_version; /* for replaying uncommitted */
396
397 __le32 result; /* result code */
398
399 __le32 object_len; /* length of object name */
400 __le32 num_ops;
401 struct ceph_osd_op ops[0]; /* ops[], object */
402} __attribute__ ((packed));
403
404
405#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 190b6c4a6f2b..39c243acd062 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/sort.h> 3#include <linux/sort.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "decode.h" 7#include "mds_client.h"
8
9#include <linux/ceph/decode.h>
8 10
9/* 11/*
10 * Snapshots in ceph are driven in large part by cooperation from the 12 * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
526 struct ceph_cap_snap *capsnap) 528 struct ceph_cap_snap *capsnap)
527{ 529{
528 struct inode *inode = &ci->vfs_inode; 530 struct inode *inode = &ci->vfs_inode;
529 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 531 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
530 532
531 BUG_ON(capsnap->writing); 533 BUG_ON(capsnap->writing);
532 capsnap->size = inode->i_size; 534 capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
747 struct ceph_mds_session *session, 749 struct ceph_mds_session *session,
748 struct ceph_msg *msg) 750 struct ceph_msg *msg)
749{ 751{
750 struct super_block *sb = mdsc->client->sb; 752 struct super_block *sb = mdsc->fsc->sb;
751 int mds = session->s_mds; 753 int mds = session->s_mds;
752 u64 split; 754 u64 split;
753 int op; 755 int op;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index c6179d3a26a2..cd5097d7c804 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
 /*
- * Ceph string constants
+ * Ceph fs string constants
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
-const char *ceph_entity_type_name(int type)
-{
-	switch (type) {
-	case CEPH_ENTITY_TYPE_MDS: return "mds";
-	case CEPH_ENTITY_TYPE_OSD: return "osd";
-	case CEPH_ENTITY_TYPE_MON: return "mon";
-	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_AUTH: return "auth";
-	default: return "unknown";
-	}
-}
-
-const char *ceph_osd_op_name(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_READ: return "read";
-	case CEPH_OSD_OP_STAT: return "stat";
-
-	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
-	case CEPH_OSD_OP_WRITE: return "write";
-	case CEPH_OSD_OP_DELETE: return "delete";
-	case CEPH_OSD_OP_TRUNCATE: return "truncate";
-	case CEPH_OSD_OP_ZERO: return "zero";
-	case CEPH_OSD_OP_WRITEFULL: return "writefull";
-	case CEPH_OSD_OP_ROLLBACK: return "rollback";
-
-	case CEPH_OSD_OP_APPEND: return "append";
-	case CEPH_OSD_OP_STARTSYNC: return "startsync";
-	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
-	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
-	case CEPH_OSD_OP_TMAPUP: return "tmapup";
-	case CEPH_OSD_OP_TMAPGET: return "tmapget";
-	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
-	case CEPH_OSD_OP_GETXATTR: return "getxattr";
-	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
-	case CEPH_OSD_OP_SETXATTR: return "setxattr";
-	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
-	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
-	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
-	case CEPH_OSD_OP_PULL: return "pull";
-	case CEPH_OSD_OP_PUSH: return "push";
-	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
-	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
-	case CEPH_OSD_OP_SCRUB: return "scrub";
-
-	case CEPH_OSD_OP_WRLOCK: return "wrlock";
-	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
-	case CEPH_OSD_OP_RDLOCK: return "rdlock";
-	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
-	case CEPH_OSD_OP_UPLOCK: return "uplock";
-	case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
-	case CEPH_OSD_OP_CALL: return "call";
-
-	case CEPH_OSD_OP_PGLS: return "pgls";
-	}
-	return "???";
-}
 
 const char *ceph_mds_state_name(int s)
 {
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
 	}
 	return "???";
 }
-
-const char *ceph_pool_op_name(int op)
-{
-	switch (op) {
-	case POOL_OP_CREATE: return "create";
-	case POOL_OP_DELETE: return "delete";
-	case POOL_OP_AUID_CHANGE: return "auid change";
-	case POOL_OP_CREATE_SNAP: return "create snap";
-	case POOL_OP_DELETE_SNAP: return "delete snap";
-	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
-	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
-	}
-	return "???";
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b2..9c5085465a63 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
 #include <linux/ctype.h>
@@ -15,10 +15,13 @@
 #include <linux/statfs.h>
 #include <linux/string.h>
 
-#include "decode.h"
 #include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 /*
  * Ceph superblock operations
@@ -26,36 +29,22 @@
  * Handle the basics of mounting, unmounting.
  */
 
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
-	const char *e = s + len;
-
-	while (e != s && *(e-1) != '/')
-		e--;
-	return e;
-}
-
-
 /*
  * super ops
  */
 static void ceph_put_super(struct super_block *s)
 {
-	struct ceph_client *client = ceph_sb_to_client(s);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 
 	dout("put_super\n");
-	ceph_mdsc_close_sessions(&client->mdsc);
+	ceph_mdsc_close_sessions(fsc->mdsc);
 
 	/*
 	 * ensure we release the bdi before put_anon_super releases
 	 * the device name.
 	 */
-	if (s->s_bdi == &client->backing_dev_info) {
-		bdi_unregister(&client->backing_dev_info);
+	if (s->s_bdi == &fsc->backing_dev_info) {
+		bdi_unregister(&fsc->backing_dev_info);
 		s->s_bdi = NULL;
 	}
 
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
 
 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
-	struct ceph_monmap *monmap = client->monc.monmap;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+	struct ceph_monmap *monmap = fsc->client->monc.monmap;
 	struct ceph_statfs st;
 	u64 fsid;
 	int err;
 
 	dout("statfs\n");
-	err = ceph_monc_do_statfs(&client->monc, &st);
+	err = ceph_monc_do_statfs(&fsc->client->monc, &st);
 	if (err < 0)
 		return err;
 
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int ceph_sync_fs(struct super_block *sb, int wait)
 {
-	struct ceph_client *client = ceph_sb_to_client(sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 
 	if (!wait) {
 		dout("sync_fs (non-blocking)\n");
-		ceph_flush_dirty_caps(&client->mdsc);
+		ceph_flush_dirty_caps(fsc->mdsc);
 		dout("sync_fs (non-blocking) done\n");
 		return 0;
 	}
 
 	dout("sync_fs (blocking)\n");
-	ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
-	ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+	ceph_osdc_sync(&fsc->client->osdc);
+	ceph_mdsc_sync(fsc->mdsc);
 	dout("sync_fs (blocking) done\n");
 	return 0;
 }
 
-static int default_congestion_kb(void)
-{
-	int congestion_kb;
-
-	/*
-	 * Copied from NFS
-	 *
-	 * congestion size, scale with available memory.
-	 *
-	 *  64MB:    8192k
-	 * 128MB:   11585k
-	 * 256MB:   16384k
-	 * 512MB:   23170k
-	 *   1GB:   32768k
-	 *   2GB:   46340k
-	 *   4GB:   65536k
-	 *   8GB:   92681k
-	 *  16GB:  131072k
-	 *
-	 * This allows larger machines to have larger/more transfers.
-	 * Limit the default to 256M
-	 */
-	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-	if (congestion_kb > 256*1024)
-		congestion_kb = 256*1024;
-
-	return congestion_kb;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-	struct ceph_mount_args *args = client->mount_args;
-
-	if (args->flags & CEPH_OPT_FSID)
-		seq_printf(m, ",fsid=%pU", &args->fsid);
-	if (args->flags & CEPH_OPT_NOSHARE)
-		seq_puts(m, ",noshare");
-	if (args->flags & CEPH_OPT_DIRSTAT)
-		seq_puts(m, ",dirstat");
-	if ((args->flags & CEPH_OPT_RBYTES) == 0)
-		seq_puts(m, ",norbytes");
-	if (args->flags & CEPH_OPT_NOCRC)
-		seq_puts(m, ",nocrc");
-	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-		seq_puts(m, ",noasyncreaddir");
-
-	if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-		seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
-	if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-		seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
-	if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-		seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
-	if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
-		seq_printf(m, ",osdkeepalivetimeout=%d",
-			   args->osd_keepalive_timeout);
-	if (args->wsize)
-		seq_printf(m, ",wsize=%d", args->wsize);
-	if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
-		seq_printf(m, ",rsize=%d", args->rsize);
-	if (args->congestion_kb != default_congestion_kb())
-		seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
-	if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_min=%d",
-			   args->caps_wanted_delay_min);
-	if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
-		seq_printf(m, ",caps_wanted_delay_max=%d",
-			   args->caps_wanted_delay_max);
-	if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
-		seq_printf(m, ",cap_release_safety=%d",
-			   args->cap_release_safety);
-	if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
-		seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
-	if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
-		seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
-	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-	if (args->name)
-		seq_printf(m, ",name=%s", args->name);
-	if (args->secret)
-		seq_puts(m, ",secret=<hidden>");
-	return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
-	struct ceph_inode_info *ci = foo;
-	inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
-	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-				      sizeof(struct ceph_inode_info),
-				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
-	if (ceph_inode_cachep == NULL)
-		return -ENOMEM;
-
-	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
-				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_cap_cachep == NULL)
-		goto bad_cap;
-
-	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_dentry_cachep == NULL)
-		goto bad_dentry;
-
-	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (ceph_file_cachep == NULL)
-		goto bad_file;
-
-	return 0;
-
-bad_file:
-	kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
-	kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
-	kmem_cache_destroy(ceph_inode_cachep);
-	return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
-	kmem_cache_destroy(ceph_inode_cachep);
-	kmem_cache_destroy(ceph_cap_cachep);
-	kmem_cache_destroy(ceph_dentry_cachep);
-	kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount. Tear down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
-	struct ceph_client *client = ceph_sb_to_client(sb);
-
-	dout("ceph_umount_begin - starting forced umount\n");
-	if (!client)
-		return;
-	client->mount_state = CEPH_MOUNT_SHUTDOWN;
-	return;
-}
-
-static const struct super_operations ceph_super_ops = {
-	.alloc_inode	= ceph_alloc_inode,
-	.destroy_inode	= ceph_destroy_inode,
-	.write_inode	= ceph_write_inode,
-	.sync_fs	= ceph_sync_fs,
-	.put_super	= ceph_put_super,
-	.show_options	= ceph_show_options,
-	.statfs		= ceph_statfs,
-	.umount_begin	= ceph_umount_begin,
-};
-
-
-const char *ceph_msg_type_name(int type)
-{
-	switch (type) {
-	case CEPH_MSG_SHUTDOWN: return "shutdown";
-	case CEPH_MSG_PING: return "ping";
-	case CEPH_MSG_AUTH: return "auth";
-	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
-	case CEPH_MSG_MON_MAP: return "mon_map";
-	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
-	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
-	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
-	case CEPH_MSG_STATFS: return "statfs";
-	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
-	case CEPH_MSG_MDS_MAP: return "mds_map";
-	case CEPH_MSG_CLIENT_SESSION: return "client_session";
-	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
-	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
-	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
-	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
-	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
-	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
-	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
-	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
-	case CEPH_MSG_OSD_MAP: return "osd_map";
-	case CEPH_MSG_OSD_OP: return "osd_op";
-	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
-	default: return "unknown";
-	}
-}
-
-
 /*
  * mount options
  */
 enum {
 	Opt_wsize,
 	Opt_rsize,
-	Opt_osdtimeout,
-	Opt_osdkeepalivetimeout,
-	Opt_mount_timeout,
-	Opt_osd_idle_ttl,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
 	Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
 	Opt_congestion_kb,
 	Opt_last_int,
 	/* int args above */
-	Opt_fsid,
 	Opt_snapdirname,
-	Opt_name,
-	Opt_secret,
 	Opt_last_string,
 	/* string args above */
-	Opt_ip,
-	Opt_noshare,
 	Opt_dirstat,
 	Opt_nodirstat,
 	Opt_rbytes,
 	Opt_norbytes,
-	Opt_nocrc,
 	Opt_noasyncreaddir,
 };
 
-static match_table_t arg_tokens = {
+static match_table_t fsopt_tokens = {
 	{Opt_wsize, "wsize=%d"},
 	{Opt_rsize, "rsize=%d"},
-	{Opt_osdtimeout, "osdtimeout=%d"},
-	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
-	{Opt_mount_timeout, "mount_timeout=%d"},
-	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 	{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,466 @@ static match_table_t arg_tokens = {
374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 143 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
375 {Opt_congestion_kb, "write_congestion_kb=%d"}, 144 {Opt_congestion_kb, "write_congestion_kb=%d"},
376 /* int args above */ 145 /* int args above */
377 {Opt_fsid, "fsid=%s"},
378 {Opt_snapdirname, "snapdirname=%s"}, 146 {Opt_snapdirname, "snapdirname=%s"},
379 {Opt_name, "name=%s"},
380 {Opt_secret, "secret=%s"},
381 /* string args above */ 147 /* string args above */
382 {Opt_ip, "ip=%s"},
383 {Opt_noshare, "noshare"},
384 {Opt_dirstat, "dirstat"}, 148 {Opt_dirstat, "dirstat"},
385 {Opt_nodirstat, "nodirstat"}, 149 {Opt_nodirstat, "nodirstat"},
386 {Opt_rbytes, "rbytes"}, 150 {Opt_rbytes, "rbytes"},
387 {Opt_norbytes, "norbytes"}, 151 {Opt_norbytes, "norbytes"},
388 {Opt_nocrc, "nocrc"},
389 {Opt_noasyncreaddir, "noasyncreaddir"}, 152 {Opt_noasyncreaddir, "noasyncreaddir"},
390 {-1, NULL} 153 {-1, NULL}
391}; 154};
392 155
393static int parse_fsid(const char *str, struct ceph_fsid *fsid) 156static int parse_fsopt_token(char *c, void *private)
394{ 157{
395 int i = 0; 158 struct ceph_mount_options *fsopt = private;
396 char tmp[3]; 159 substring_t argstr[MAX_OPT_ARGS];
397 int err = -EINVAL; 160 int token, intval, ret;
398 int d; 161
399 162 token = match_token((char *)c, fsopt_tokens, argstr);
400 dout("parse_fsid '%s'\n", str); 163 if (token < 0)
401 tmp[2] = 0; 164 return -EINVAL;
402 while (*str && i < 16) { 165
403 if (ispunct(*str)) { 166 if (token < Opt_last_int) {
404 str++; 167 ret = match_int(&argstr[0], &intval);
405 continue; 168 if (ret < 0) {
169 pr_err("bad mount option arg (not int) "
170 "at '%s'\n", c);
171 return ret;
406 } 172 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1])) 173 dout("got int token %d val %d\n", token, intval);
408 break; 174 } else if (token > Opt_last_int && token < Opt_last_string) {
409 tmp[0] = str[0]; 175 dout("got string token %d val %s\n", token,
410 tmp[1] = str[1]; 176 argstr[0].from);
411 if (sscanf(tmp, "%x", &d) < 1) 177 } else {
412 break; 178 dout("got token %d\n", token);
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 } 179 }
417 180
418 if (i == 16) 181 switch (token) {
419 err = 0; 182 case Opt_snapdirname:
420 dout("parse_fsid ret %d got fsid %pU", err, fsid); 183 kfree(fsopt->snapdir_name);
421 return err; 184 fsopt->snapdir_name = kstrndup(argstr[0].from,
185 argstr[0].to-argstr[0].from,
186 GFP_KERNEL);
187 if (!fsopt->snapdir_name)
188 return -ENOMEM;
189 break;
190
191 /* misc */
192 case Opt_wsize:
193 fsopt->wsize = intval;
194 break;
195 case Opt_rsize:
196 fsopt->rsize = intval;
197 break;
198 case Opt_caps_wanted_delay_min:
199 fsopt->caps_wanted_delay_min = intval;
200 break;
201 case Opt_caps_wanted_delay_max:
202 fsopt->caps_wanted_delay_max = intval;
203 break;
204 case Opt_readdir_max_entries:
205 fsopt->max_readdir = intval;
206 break;
207 case Opt_readdir_max_bytes:
208 fsopt->max_readdir_bytes = intval;
209 break;
210 case Opt_congestion_kb:
211 fsopt->congestion_kb = intval;
212 break;
213 case Opt_dirstat:
214 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
215 break;
216 case Opt_nodirstat:
217 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
218 break;
219 case Opt_rbytes:
220 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
221 break;
222 case Opt_norbytes:
223 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
224 break;
225 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break;
228 default:
229 BUG_ON(token);
230 }
231 return 0;
422} 232}
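To make the token classes concrete, consider a hypothetical invocation (device string and mountpoint are illustrative; option names come from fsopt_tokens above):

    mount -t ceph 192.168.0.1:6789:/ /mnt/ceph \
        -o rsize=524288,snapdirname=.snapshots,noasyncreaddir

Here rsize=524288 sorts below Opt_last_int and is parsed with match_int(); snapdirname=.snapshots falls between Opt_last_int and Opt_last_string and is kstrndup()'d; noasyncreaddir is a bare flag that just sets CEPH_MOUNT_OPT_NOASYNCREADDIR. The cluster-level options dropped from this table (fsid=, name=, secret=, ip=, osdtimeout=, ...) are now consumed by ceph_parse_options() in libceph, which passes only the tokens it does not recognize through to parse_fsopt_token().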
423 233
424static struct ceph_mount_args *parse_mount_args(int flags, char *options, 234static void destroy_mount_options(struct ceph_mount_options *args)
425 const char *dev_name,
426 const char **path)
427{ 235{
428 struct ceph_mount_args *args; 236 dout("destroy_mount_options %p\n", args);
429 const char *c; 237 kfree(args->snapdir_name);
430 int err = -ENOMEM; 238 kfree(args);
431 substring_t argstr[MAX_OPT_ARGS]; 239}
432 240
433 args = kzalloc(sizeof(*args), GFP_KERNEL); 241static int strcmp_null(const char *s1, const char *s2)
434 if (!args) 242{
435 return ERR_PTR(-ENOMEM); 243 if (!s1 && !s2)
436 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), 244 return 0;
437 GFP_KERNEL); 245 if (s1 && !s2)
438 if (!args->mon_addr) 246 return -1;
439 goto out; 247 if (!s1 && s2)
248 return 1;
249 return strcmp(s1, s2);
250}
440 251
441 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); 252static int compare_mount_options(struct ceph_mount_options *new_fsopt,
442 253 struct ceph_options *new_opt,
443 /* start with defaults */ 254 struct ceph_fs_client *fsc)
444 args->sb_flags = flags; 255{
445 args->flags = CEPH_OPT_DEFAULT; 256 struct ceph_mount_options *fsopt1 = new_fsopt;
446 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; 257 struct ceph_mount_options *fsopt2 = fsc->mount_options;
447 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 258 int ofs = offsetof(struct ceph_mount_options, snapdir_name);
448 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 259 int ret;
449 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
450 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
451 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
452 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
453 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
454 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
455 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
456 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
457 args->congestion_kb = default_congestion_kb();
458
459 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
460 err = -EINVAL;
461 if (!dev_name)
462 goto out;
463 *path = strstr(dev_name, ":/");
464 if (*path == NULL) {
465 pr_err("device name is missing path (no :/ in %s)\n",
466 dev_name);
467 goto out;
468 }
469 260
470 /* get mon ip(s) */ 261 ret = memcmp(fsopt1, fsopt2, ofs);
471 err = ceph_parse_ips(dev_name, *path, args->mon_addr, 262 if (ret)
472 CEPH_MAX_MON, &args->num_mon); 263 return ret;
473 if (err < 0) 264
474 goto out; 265 ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
266 if (ret)
267 return ret;
268
269 return ceph_compare_options(new_opt, fsc->client);
270}
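compare_mount_options() leans on the field order of struct ceph_mount_options: everything up to snapdir_name is flat integer data that can be memcmp()'d in one shot (safe here because the struct comes from kzalloc(), so padding bytes are zeroed), while the pointer members after it need a NULL-safe strcmp. A minimal sketch of the same idiom, using a made-up struct:

    struct opts {
            int a, b, c;            /* flat values: memcmp-able prefix */
            char *label;            /* pointer: needs strcmp_null() */
    };

    static bool opts_equal(const struct opts *x, const struct opts *y)
    {
            /* compare the flat prefix in one shot, then each pointer */
            return memcmp(x, y, offsetof(struct opts, label)) == 0 &&
                   strcmp_null(x->label, y->label) == 0;
    }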
271
272static int parse_mount_options(struct ceph_mount_options **pfsopt,
273 struct ceph_options **popt,
274 int flags, char *options,
275 const char *dev_name,
276 const char **path)
277{
278 struct ceph_mount_options *fsopt;
279 const char *dev_name_end;
280 int err = -ENOMEM;
281
282 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
283 if (!fsopt)
284 return -ENOMEM;
285
286 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
287
288 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
294 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
295 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
296 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
297 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
298 fsopt->congestion_kb = default_congestion_kb();
299
300 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
301 err = -EINVAL;
302 if (!dev_name)
303 goto out;
304 *path = strstr(dev_name, ":/");
305 if (*path == NULL) {
306 pr_err("device name is missing path (no :/ in %s)\n",
307 dev_name);
308 goto out;
309 }
310 dev_name_end = *path;
311 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
475 312
476 /* path on server */ 313 /* path on server */
477 *path += 2; 314 *path += 2;
478 dout("server path '%s'\n", *path); 315 dout("server path '%s'\n", *path);
479 316
480 /* parse mount options */ 317 err = ceph_parse_options(popt, options, dev_name, dev_name_end,
481 while ((c = strsep(&options, ",")) != NULL) { 318 parse_fsopt_token, (void *)fsopt);
482 int token, intval, ret; 319 if (err)
483 if (!*c) 320 goto out;
484 continue; 321
485 err = -EINVAL; 322 /* success */
486 token = match_token((char *)c, arg_tokens, argstr); 323 *pfsopt = fsopt;
487 if (token < 0) { 324 return 0;
488 pr_err("bad mount option at '%s'\n", c);
489 goto out;
490 }
491 if (token < Opt_last_int) {
492 ret = match_int(&argstr[0], &intval);
493 if (ret < 0) {
494 pr_err("bad mount option arg (not int) "
495 "at '%s'\n", c);
496 continue;
497 }
498 dout("got int token %d val %d\n", token, intval);
499 } else if (token > Opt_last_int && token < Opt_last_string) {
500 dout("got string token %d val %s\n", token,
501 argstr[0].from);
502 } else {
503 dout("got token %d\n", token);
504 }
505 switch (token) {
506 case Opt_ip:
507 err = ceph_parse_ips(argstr[0].from,
508 argstr[0].to,
509 &args->my_addr,
510 1, NULL);
511 if (err < 0)
512 goto out;
513 args->flags |= CEPH_OPT_MYIP;
514 break;
515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
521 case Opt_snapdirname:
522 kfree(args->snapdir_name);
523 args->snapdir_name = kstrndup(argstr[0].from,
524 argstr[0].to-argstr[0].from,
525 GFP_KERNEL);
526 break;
527 case Opt_name:
528 args->name = kstrndup(argstr[0].from,
529 argstr[0].to-argstr[0].from,
530 GFP_KERNEL);
531 break;
532 case Opt_secret:
533 args->secret = kstrndup(argstr[0].from,
534 argstr[0].to-argstr[0].from,
535 GFP_KERNEL);
536 break;
537
538 /* misc */
539 case Opt_wsize:
540 args->wsize = intval;
541 break;
542 case Opt_rsize:
543 args->rsize = intval;
544 break;
545 case Opt_osdtimeout:
546 args->osd_timeout = intval;
547 break;
548 case Opt_osdkeepalivetimeout:
549 args->osd_keepalive_timeout = intval;
550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
554 case Opt_mount_timeout:
555 args->mount_timeout = intval;
556 break;
557 case Opt_caps_wanted_delay_min:
558 args->caps_wanted_delay_min = intval;
559 break;
560 case Opt_caps_wanted_delay_max:
561 args->caps_wanted_delay_max = intval;
562 break;
563 case Opt_readdir_max_entries:
564 args->max_readdir = intval;
565 break;
566 case Opt_readdir_max_bytes:
567 args->max_readdir_bytes = intval;
568 break;
569 case Opt_congestion_kb:
570 args->congestion_kb = intval;
571 break;
572
573 case Opt_noshare:
574 args->flags |= CEPH_OPT_NOSHARE;
575 break;
576
577 case Opt_dirstat:
578 args->flags |= CEPH_OPT_DIRSTAT;
579 break;
580 case Opt_nodirstat:
581 args->flags &= ~CEPH_OPT_DIRSTAT;
582 break;
583 case Opt_rbytes:
584 args->flags |= CEPH_OPT_RBYTES;
585 break;
586 case Opt_norbytes:
587 args->flags &= ~CEPH_OPT_RBYTES;
588 break;
589 case Opt_nocrc:
590 args->flags |= CEPH_OPT_NOCRC;
591 break;
592 case Opt_noasyncreaddir:
593 args->flags |= CEPH_OPT_NOASYNCREADDIR;
594 break;
595
596 default:
597 BUG_ON(token);
598 }
599 }
600 return args;
601 325
602out: 326out:
603 kfree(args->mon_addr); 327 destroy_mount_options(fsopt);
604 kfree(args); 328 return err;
605 return ERR_PTR(err);
606} 329}
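The device-string split is easiest to see on an example (addresses made up):

    dev_name:  1.2.3.4:6789,1.2.3.5:6789:/export/dir
                                        ^-- *path = strstr(dev_name, ":/")
               \---- monitor list ----/    *path += 2  ->  "export/dir"

The monitor list up to dev_name_end is handed to ceph_parse_options() along with the remaining -o options; the fs layer keeps only the server-side path.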
607 330
608static void destroy_mount_args(struct ceph_mount_args *args) 331/**
332 * ceph_show_options - Show mount options in /proc/mounts
333 * @m: seq_file to write to
334 * @mnt: mount descriptor
335 */
336static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
609{ 337{
610 dout("destroy_mount_args %p\n", args); 338 struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
611 kfree(args->snapdir_name); 339 struct ceph_mount_options *fsopt = fsc->mount_options;
612 args->snapdir_name = NULL; 340 struct ceph_options *opt = fsc->client->options;
613 kfree(args->name); 341
614 args->name = NULL; 342 if (opt->flags & CEPH_OPT_FSID)
615 kfree(args->secret); 343 seq_printf(m, ",fsid=%pU", &opt->fsid);
616 args->secret = NULL; 344 if (opt->flags & CEPH_OPT_NOSHARE)
617 kfree(args); 345 seq_puts(m, ",noshare");
346 if (opt->flags & CEPH_OPT_NOCRC)
347 seq_puts(m, ",nocrc");
348
349 if (opt->name)
350 seq_printf(m, ",name=%s", opt->name);
351 if (opt->secret)
352 seq_puts(m, ",secret=<hidden>");
353
354 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
355 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
356 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
357 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
358 if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
359 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
360 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
361 seq_printf(m, ",osdkeepalivetimeout=%d",
362 opt->osd_keepalive_timeout);
363
364 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
365 seq_puts(m, ",dirstat");
366 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
367 seq_puts(m, ",norbytes");
368 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
369 seq_puts(m, ",noasyncreaddir");
370
371 if (fsopt->wsize)
372 seq_printf(m, ",wsize=%d", fsopt->wsize);
373 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
374 seq_printf(m, ",rsize=%d", fsopt->rsize);
375 if (fsopt->congestion_kb != default_congestion_kb())
376 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
377 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
378 seq_printf(m, ",caps_wanted_delay_min=%d",
379 fsopt->caps_wanted_delay_min);
380 if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
381 seq_printf(m, ",caps_wanted_delay_max=%d",
382 fsopt->caps_wanted_delay_max);
383 if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
384 seq_printf(m, ",cap_release_safety=%d",
385 fsopt->cap_release_safety);
386 if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
387 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
388 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
389 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
390 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
391 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
392 return 0;
618} 393}
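Since only non-default values are emitted, a mount with, say, name=admin and rsize=1048576 would show up in /proc/mounts along these lines (illustrative):

    192.168.0.1:6789:/ /mnt/ceph ceph rw,name=admin,rsize=1048576 0 0

Note that secret= is deliberately masked as <hidden> so credentials never leak through /proc.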
619 394
620/* 395/*
621 * create a fresh client instance 396 * handle any mon messages the standard library doesn't understand.
397 * return an error if we don't understand it either.
622 */ 398 */
623static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) 399static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
624{ 400{
625 struct ceph_client *client; 401 struct ceph_fs_client *fsc = client->private;
402 int type = le16_to_cpu(msg->hdr.type);
403
404 switch (type) {
405 case CEPH_MSG_MDS_MAP:
406 ceph_mdsc_handle_map(fsc->mdsc, msg);
407 return 0;
408
409 default:
410 return -1;
411 }
412}
413
414/*
415 * create a new fs client
416 */
417struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
418 struct ceph_options *opt)
419{
420 struct ceph_fs_client *fsc;
626 int err = -ENOMEM; 421 int err = -ENOMEM;
627 422
628 client = kzalloc(sizeof(*client), GFP_KERNEL); 423 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
629 if (client == NULL) 424 if (!fsc)
630 return ERR_PTR(-ENOMEM); 425 return ERR_PTR(-ENOMEM);
631 426
632 mutex_init(&client->mount_mutex); 427 fsc->client = ceph_create_client(opt, fsc);
633 428 if (IS_ERR(fsc->client)) {
634 init_waitqueue_head(&client->auth_wq); 429 err = PTR_ERR(fsc->client);
430 goto fail;
431 }
432 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
433 fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
434 CEPH_FEATURE_DIRLAYOUTHASH;
435 fsc->client->monc.want_mdsmap = 1;
635 436
636 client->sb = NULL; 437 fsc->mount_options = fsopt;
637 client->mount_state = CEPH_MOUNT_MOUNTING;
638 client->mount_args = args;
639 438
640 client->msgr = NULL; 439 fsc->sb = NULL;
440 fsc->mount_state = CEPH_MOUNT_MOUNTING;
641 441
642 client->auth_err = 0; 442 atomic_long_set(&fsc->writeback_count, 0);
643 atomic_long_set(&client->writeback_count, 0);
644 443
645 err = bdi_init(&client->backing_dev_info); 444 err = bdi_init(&fsc->backing_dev_info);
646 if (err < 0) 445 if (err < 0)
647 goto fail; 446 goto fail_client;
648 447
649 err = -ENOMEM; 448 err = -ENOMEM;
650 client->wb_wq = create_workqueue("ceph-writeback"); 449 /*
651 if (client->wb_wq == NULL) 450 * The number of concurrent work items can be high but they don't need
 451 * to be processed in parallel; limit concurrency.
452 */
453 fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
454 if (fsc->wb_wq == NULL)
652 goto fail_bdi; 455 goto fail_bdi;
653 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 456 fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
654 if (client->pg_inv_wq == NULL) 457 if (fsc->pg_inv_wq == NULL)
655 goto fail_wb_wq; 458 goto fail_wb_wq;
656 client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 459 fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
657 if (client->trunc_wq == NULL) 460 if (fsc->trunc_wq == NULL)
658 goto fail_pg_inv_wq; 461 goto fail_pg_inv_wq;
659 462
660 /* set up mempools */ 463 /* set up mempools */
661 err = -ENOMEM; 464 err = -ENOMEM;
662 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, 465 fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
663 client->mount_args->wsize >> PAGE_CACHE_SHIFT); 466 fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
664 if (!client->wb_pagevec_pool) 467 if (!fsc->wb_pagevec_pool)
665 goto fail_trunc_wq; 468 goto fail_trunc_wq;
666 469
667 /* caps */ 470 /* caps */
668 client->min_caps = args->max_readdir; 471 fsc->min_caps = fsopt->max_readdir;
472
473 return fsc;
669 474
670 /* subsystems */
671 err = ceph_monc_init(&client->monc, client);
672 if (err < 0)
673 goto fail_mempool;
674 err = ceph_osdc_init(&client->osdc, client);
675 if (err < 0)
676 goto fail_monc;
677 err = ceph_mdsc_init(&client->mdsc, client);
678 if (err < 0)
679 goto fail_osdc;
680 return client;
681
682fail_osdc:
683 ceph_osdc_stop(&client->osdc);
684fail_monc:
685 ceph_monc_stop(&client->monc);
686fail_mempool:
687 mempool_destroy(client->wb_pagevec_pool);
688fail_trunc_wq: 475fail_trunc_wq:
689 destroy_workqueue(client->trunc_wq); 476 destroy_workqueue(fsc->trunc_wq);
690fail_pg_inv_wq: 477fail_pg_inv_wq:
691 destroy_workqueue(client->pg_inv_wq); 478 destroy_workqueue(fsc->pg_inv_wq);
692fail_wb_wq: 479fail_wb_wq:
693 destroy_workqueue(client->wb_wq); 480 destroy_workqueue(fsc->wb_wq);
694fail_bdi: 481fail_bdi:
695 bdi_destroy(&client->backing_dev_info); 482 bdi_destroy(&fsc->backing_dev_info);
483fail_client:
484 ceph_destroy_client(fsc->client);
696fail: 485fail:
697 kfree(client); 486 kfree(fsc);
698 return ERR_PTR(err); 487 return ERR_PTR(err);
699} 488}
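create_fs_client() uses the standard kernel unwind ladder: each fail_* label releases exactly what was set up before the failing step, in reverse order, so every exit path frees each resource precisely once. The shape of the pattern, reduced to two hypothetical resources:

    a = alloc_a();
    if (!a)
            goto fail;
    b = alloc_b();
    if (!b)
            goto fail_a;
    return 0;               /* success: caller now owns a and b */

    fail_a:
            free_a(a);      /* undo step 1 only */
    fail:
            return -ENOMEM;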
700 489
701static void ceph_destroy_client(struct ceph_client *client) 490void destroy_fs_client(struct ceph_fs_client *fsc)
702{ 491{
703 dout("destroy_client %p\n", client); 492 dout("destroy_fs_client %p\n", fsc);
704 493
705 /* unmount */ 494 destroy_workqueue(fsc->wb_wq);
706 ceph_mdsc_stop(&client->mdsc); 495 destroy_workqueue(fsc->pg_inv_wq);
707 ceph_osdc_stop(&client->osdc); 496 destroy_workqueue(fsc->trunc_wq);
708 497
709 /* 498 bdi_destroy(&fsc->backing_dev_info);
710 * make sure mds and osd connections close out before destroying
711 * the auth module, which is needed to free those connections'
712 * ceph_authorizers.
713 */
714 ceph_msgr_flush();
715
716 ceph_monc_stop(&client->monc);
717 499
718 ceph_debugfs_client_cleanup(client); 500 mempool_destroy(fsc->wb_pagevec_pool);
719 destroy_workqueue(client->wb_wq);
720 destroy_workqueue(client->pg_inv_wq);
721 destroy_workqueue(client->trunc_wq);
722 501
723 bdi_destroy(&client->backing_dev_info); 502 destroy_mount_options(fsc->mount_options);
724 503
725 if (client->msgr) 504 ceph_fs_debugfs_cleanup(fsc);
726 ceph_messenger_destroy(client->msgr);
727 mempool_destroy(client->wb_pagevec_pool);
728 505
729 destroy_mount_args(client->mount_args); 506 ceph_destroy_client(fsc->client);
730 507
731 kfree(client); 508 kfree(fsc);
732 dout("destroy_client %p done\n", client); 509 dout("destroy_fs_client %p done\n", fsc);
733} 510}
734 511
735/* 512/*
736 * Initially learn our fsid, or verify an fsid matches. 513 * caches
737 */ 514 */
738int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) 515struct kmem_cache *ceph_inode_cachep;
516struct kmem_cache *ceph_cap_cachep;
517struct kmem_cache *ceph_dentry_cachep;
518struct kmem_cache *ceph_file_cachep;
519
520static void ceph_inode_init_once(void *foo)
739{ 521{
740 if (client->have_fsid) { 522 struct ceph_inode_info *ci = foo;
741 if (ceph_fsid_compare(&client->fsid, fsid)) { 523 inode_init_once(&ci->vfs_inode);
742 pr_err("bad fsid, had %pU got %pU", 524}
743 &client->fsid, fsid); 525
744 return -1; 526static int __init init_caches(void)
745 } 527{
746 } else { 528 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, 529 sizeof(struct ceph_inode_info),
748 fsid); 530 __alignof__(struct ceph_inode_info),
749 memcpy(&client->fsid, fsid, sizeof(*fsid)); 531 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
750 ceph_debugfs_client_init(client); 532 ceph_inode_init_once);
751 client->have_fsid = true; 533 if (ceph_inode_cachep == NULL)
752 } 534 return -ENOMEM;
535
536 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
537 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
538 if (ceph_cap_cachep == NULL)
539 goto bad_cap;
540
541 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
542 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
543 if (ceph_dentry_cachep == NULL)
544 goto bad_dentry;
545
546 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
547 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
548 if (ceph_file_cachep == NULL)
549 goto bad_file;
550
753 return 0; 551 return 0;
552
553bad_file:
554 kmem_cache_destroy(ceph_dentry_cachep);
555bad_dentry:
556 kmem_cache_destroy(ceph_cap_cachep);
557bad_cap:
558 kmem_cache_destroy(ceph_inode_cachep);
559 return -ENOMEM;
754} 560}
755 561
562static void destroy_caches(void)
563{
564 kmem_cache_destroy(ceph_inode_cachep);
565 kmem_cache_destroy(ceph_cap_cachep);
566 kmem_cache_destroy(ceph_dentry_cachep);
567 kmem_cache_destroy(ceph_file_cachep);
568}
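The KMEM_CACHE() helper used in init_caches() is just shorthand: KMEM_CACHE(ceph_cap, flags) expands to roughly

    kmem_cache_create("ceph_cap", sizeof(struct ceph_cap),
                      __alignof__(struct ceph_cap), flags, NULL);

which is why only ceph_inode_cachep, with its inode_init_once() constructor, needs the long-hand kmem_cache_create() call.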
569
570
756/* 571/*
757 * true if we have the mon map (and have thus joined the cluster) 572 * ceph_umount_begin - initiate forced umount. Tear down the
573 * mount, skipping steps that may hang while waiting for server(s).
758 */ 574 */
759static int have_mon_and_osd_map(struct ceph_client *client) 575static void ceph_umount_begin(struct super_block *sb)
760{ 576{
761 return client->monc.monmap && client->monc.monmap->epoch && 577 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
762 client->osdc.osdmap && client->osdc.osdmap->epoch; 578
579 dout("ceph_umount_begin - starting forced umount\n");
580 if (!fsc)
581 return;
582 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
583 return;
763} 584}
764 585
586static const struct super_operations ceph_super_ops = {
587 .alloc_inode = ceph_alloc_inode,
588 .destroy_inode = ceph_destroy_inode,
589 .write_inode = ceph_write_inode,
590 .sync_fs = ceph_sync_fs,
591 .put_super = ceph_put_super,
592 .show_options = ceph_show_options,
593 .statfs = ceph_statfs,
594 .umount_begin = ceph_umount_begin,
595};
596
765/* 597/*
766 * Bootstrap mount by opening the root directory. Note the mount 598 * Bootstrap mount by opening the root directory. Note the mount
767 * @started time from caller, and time out if this takes too long. 599 * @started time from caller, and time out if this takes too long.
768 */ 600 */
769static struct dentry *open_root_dentry(struct ceph_client *client, 601static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
770 const char *path, 602 const char *path,
771 unsigned long started) 603 unsigned long started)
772{ 604{
773 struct ceph_mds_client *mdsc = &client->mdsc; 605 struct ceph_mds_client *mdsc = fsc->mdsc;
774 struct ceph_mds_request *req = NULL; 606 struct ceph_mds_request *req = NULL;
775 int err; 607 int err;
776 struct dentry *root; 608 struct dentry *root;
@@ -784,14 +616,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
784 req->r_ino1.ino = CEPH_INO_ROOT; 616 req->r_ino1.ino = CEPH_INO_ROOT;
785 req->r_ino1.snap = CEPH_NOSNAP; 617 req->r_ino1.snap = CEPH_NOSNAP;
786 req->r_started = started; 618 req->r_started = started;
787 req->r_timeout = client->mount_args->mount_timeout * HZ; 619 req->r_timeout = fsc->client->options->mount_timeout * HZ;
788 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 620 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
789 req->r_num_caps = 2; 621 req->r_num_caps = 2;
790 err = ceph_mdsc_do_request(mdsc, NULL, req); 622 err = ceph_mdsc_do_request(mdsc, NULL, req);
791 if (err == 0) { 623 if (err == 0) {
792 dout("open_root_inode success\n"); 624 dout("open_root_inode success\n");
793 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && 625 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
794 client->sb->s_root == NULL) 626 fsc->sb->s_root == NULL)
795 root = d_alloc_root(req->r_target_inode); 627 root = d_alloc_root(req->r_target_inode);
796 else 628 else
797 root = d_obtain_alias(req->r_target_inode); 629 root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +636,84 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
804 return root; 636 return root;
805} 637}
806 638
639
640
641
807/* 642/*
808 * mount: join the ceph cluster, and open root directory. 643 * mount: join the ceph cluster, and open root directory.
809 */ 644 */
810static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, 645static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
811 const char *path) 646 const char *path)
812{ 647{
813 struct ceph_entity_addr *myaddr = NULL;
814 int err; 648 int err;
815 unsigned long timeout = client->mount_args->mount_timeout * HZ;
816 unsigned long started = jiffies; /* note the start time */ 649 unsigned long started = jiffies; /* note the start time */
817 struct dentry *root; 650 struct dentry *root;
651 int first = 0; /* first vfsmount for this super_block */
818 652
819 dout("mount start\n"); 653 dout("mount start\n");
820 mutex_lock(&client->mount_mutex); 654 mutex_lock(&fsc->client->mount_mutex);
821
822 /* initialize the messenger */
823 if (client->msgr == NULL) {
824 if (ceph_test_opt(client, MYIP))
825 myaddr = &client->mount_args->my_addr;
826 client->msgr = ceph_messenger_create(myaddr);
827 if (IS_ERR(client->msgr)) {
828 err = PTR_ERR(client->msgr);
829 client->msgr = NULL;
830 goto out;
831 }
832 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
833 }
834 655
835 /* open session, and wait for mon, mds, and osd maps */ 656 err = __ceph_open_session(fsc->client, started);
836 err = ceph_monc_open_session(&client->monc);
837 if (err < 0) 657 if (err < 0)
838 goto out; 658 goto out;
839 659
840 while (!have_mon_and_osd_map(client)) {
841 err = -EIO;
842 if (timeout && time_after_eq(jiffies, started + timeout))
843 goto out;
844
845 /* wait */
846 dout("mount waiting for mon_map\n");
847 err = wait_event_interruptible_timeout(client->auth_wq,
848 have_mon_and_osd_map(client) || (client->auth_err < 0),
849 timeout);
850 if (err == -EINTR || err == -ERESTARTSYS)
851 goto out;
852 if (client->auth_err < 0) {
853 err = client->auth_err;
854 goto out;
855 }
856 }
857
858 dout("mount opening root\n"); 660 dout("mount opening root\n");
859 root = open_root_dentry(client, "", started); 661 root = open_root_dentry(fsc, "", started);
860 if (IS_ERR(root)) { 662 if (IS_ERR(root)) {
861 err = PTR_ERR(root); 663 err = PTR_ERR(root);
862 goto out; 664 goto out;
863 } 665 }
864 if (client->sb->s_root) 666 if (fsc->sb->s_root) {
865 dput(root); 667 dput(root);
866 else 668 } else {
867 client->sb->s_root = root; 669 fsc->sb->s_root = root;
670 first = 1;
671
672 err = ceph_fs_debugfs_init(fsc);
673 if (err < 0)
674 goto fail;
675 }
868 676
869 if (path[0] == 0) { 677 if (path[0] == 0) {
870 dget(root); 678 dget(root);
871 } else { 679 } else {
872 dout("mount opening base mountpoint\n"); 680 dout("mount opening base mountpoint\n");
873 root = open_root_dentry(client, path, started); 681 root = open_root_dentry(fsc, path, started);
874 if (IS_ERR(root)) { 682 if (IS_ERR(root)) {
875 err = PTR_ERR(root); 683 err = PTR_ERR(root);
876 dput(client->sb->s_root); 684 goto fail;
877 client->sb->s_root = NULL;
878 goto out;
879 } 685 }
880 } 686 }
881 687
882 mnt->mnt_root = root; 688 fsc->mount_state = CEPH_MOUNT_MOUNTED;
883 mnt->mnt_sb = client->sb;
884
885 client->mount_state = CEPH_MOUNT_MOUNTED;
886 dout("mount success\n"); 689 dout("mount success\n");
887 err = 0; 690 mutex_unlock(&fsc->client->mount_mutex);
691 return root;
888 692
889out: 693out:
890 mutex_unlock(&client->mount_mutex); 694 mutex_unlock(&fsc->client->mount_mutex);
891 return err; 695 return ERR_PTR(err);
696
697fail:
698 if (first) {
699 dput(fsc->sb->s_root);
700 fsc->sb->s_root = NULL;
701 }
702 goto out;
892} 703}
893 704
894static int ceph_set_super(struct super_block *s, void *data) 705static int ceph_set_super(struct super_block *s, void *data)
895{ 706{
896 struct ceph_client *client = data; 707 struct ceph_fs_client *fsc = data;
897 int ret; 708 int ret;
898 709
899 dout("set_super %p data %p\n", s, data); 710 dout("set_super %p data %p\n", s, data);
900 711
901 s->s_flags = client->mount_args->sb_flags; 712 s->s_flags = fsc->mount_options->sb_flags;
902 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 713 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
903 714
904 s->s_fs_info = client; 715 s->s_fs_info = fsc;
905 client->sb = s; 716 fsc->sb = s;
906 717
907 s->s_op = &ceph_super_ops; 718 s->s_op = &ceph_super_ops;
908 s->s_export_op = &ceph_export_ops; 719 s->s_export_op = &ceph_export_ops;
@@ -917,7 +728,7 @@ static int ceph_set_super(struct super_block *s, void *data)
917 728
918fail: 729fail:
919 s->s_fs_info = NULL; 730 s->s_fs_info = NULL;
920 client->sb = NULL; 731 fsc->sb = NULL;
921 return ret; 732 return ret;
922} 733}
923 734
@@ -926,30 +737,23 @@ fail:
926 */ 737 */
927static int ceph_compare_super(struct super_block *sb, void *data) 738static int ceph_compare_super(struct super_block *sb, void *data)
928{ 739{
929 struct ceph_client *new = data; 740 struct ceph_fs_client *new = data;
930 struct ceph_mount_args *args = new->mount_args; 741 struct ceph_mount_options *fsopt = new->mount_options;
931 struct ceph_client *other = ceph_sb_to_client(sb); 742 struct ceph_options *opt = new->client->options;
932 int i; 743 struct ceph_fs_client *other = ceph_sb_to_client(sb);
933 744
934 dout("ceph_compare_super %p\n", sb); 745 dout("ceph_compare_super %p\n", sb);
935 if (args->flags & CEPH_OPT_FSID) { 746
936 if (ceph_fsid_compare(&args->fsid, &other->fsid)) { 747 if (compare_mount_options(fsopt, opt, other)) {
937 dout("fsid doesn't match\n"); 748 dout("monitor(s)/mount options don't match\n");
938 return 0; 749 return 0;
939 } 750 }
940 } else { 751 if ((opt->flags & CEPH_OPT_FSID) &&
941 /* do we share (a) monitor? */ 752 ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
942 for (i = 0; i < new->monc.monmap->num_mon; i++) 753 dout("fsid doesn't match\n");
943 if (ceph_monmap_contains(other->monc.monmap, 754 return 0;
944 &new->monc.monmap->mon_inst[i].addr))
945 break;
946 if (i == new->monc.monmap->num_mon) {
947 dout("mon ip not part of monmap\n");
948 return 0;
949 }
950 dout("mon ip matches existing sb %p\n", sb);
951 } 755 }
952 if (args->sb_flags != other->mount_args->sb_flags) { 756 if (fsopt->sb_flags != other->mount_options->sb_flags) {
953 dout("flags differ\n"); 757 dout("flags differ\n");
954 return 0; 758 return 0;
955 } 759 }
@@ -961,98 +765,113 @@ static int ceph_compare_super(struct super_block *sb, void *data)
961 */ 765 */
962static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 766static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
963 767
964static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 768static int ceph_register_bdi(struct super_block *sb,
769 struct ceph_fs_client *fsc)
965{ 770{
966 int err; 771 int err;
967 772
968 /* set ra_pages based on rsize mount option? */ 773 /* set ra_pages based on rsize mount option? */
969 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 774 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
970 client->backing_dev_info.ra_pages = 775 fsc->backing_dev_info.ra_pages =
971 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 776 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
972 >> PAGE_SHIFT; 777 >> PAGE_SHIFT;
973 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", 778 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
974 atomic_long_inc_return(&bdi_seq)); 779 atomic_long_inc_return(&bdi_seq));
975 if (!err) 780 if (!err)
976 sb->s_bdi = &client->backing_dev_info; 781 sb->s_bdi = &fsc->backing_dev_info;
977 return err; 782 return err;
978} 783}
979 784
980static int ceph_get_sb(struct file_system_type *fs_type, 785static struct dentry *ceph_mount(struct file_system_type *fs_type,
981 int flags, const char *dev_name, void *data, 786 int flags, const char *dev_name, void *data)
982 struct vfsmount *mnt)
983{ 787{
984 struct super_block *sb; 788 struct super_block *sb;
985 struct ceph_client *client; 789 struct ceph_fs_client *fsc;
790 struct dentry *res;
986 int err; 791 int err;
987 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 792 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
988 const char *path = NULL; 793 const char *path = NULL;
989 struct ceph_mount_args *args; 794 struct ceph_mount_options *fsopt = NULL;
795 struct ceph_options *opt = NULL;
990 796
991 dout("ceph_get_sb\n"); 797 dout("ceph_mount\n");
992 args = parse_mount_args(flags, data, dev_name, &path); 798 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
993 if (IS_ERR(args)) { 799 if (err < 0) {
994 err = PTR_ERR(args); 800 res = ERR_PTR(err);
995 goto out_final; 801 goto out_final;
996 } 802 }
997 803
998 /* create client (which we may/may not use) */ 804 /* create client (which we may/may not use) */
999 client = ceph_create_client(args); 805 fsc = create_fs_client(fsopt, opt);
1000 if (IS_ERR(client)) { 806 if (IS_ERR(fsc)) {
1001 err = PTR_ERR(client); 807 res = ERR_CAST(fsc);
808 kfree(fsopt);
809 kfree(opt);
1002 goto out_final; 810 goto out_final;
1003 } 811 }
1004 812
1005 if (client->mount_args->flags & CEPH_OPT_NOSHARE) 813 err = ceph_mdsc_init(fsc);
814 if (err < 0) {
815 res = ERR_PTR(err);
816 goto out;
817 }
818
819 if (ceph_test_opt(fsc->client, NOSHARE))
1006 compare_super = NULL; 820 compare_super = NULL;
1007 sb = sget(fs_type, compare_super, ceph_set_super, client); 821 sb = sget(fs_type, compare_super, ceph_set_super, fsc);
1008 if (IS_ERR(sb)) { 822 if (IS_ERR(sb)) {
1009 err = PTR_ERR(sb); 823 res = ERR_CAST(sb);
1010 goto out; 824 goto out;
1011 } 825 }
1012 826
1013 if (ceph_sb_to_client(sb) != client) { 827 if (ceph_sb_to_client(sb) != fsc) {
1014 ceph_destroy_client(client); 828 ceph_mdsc_destroy(fsc);
1015 client = ceph_sb_to_client(sb); 829 destroy_fs_client(fsc);
1016 dout("get_sb got existing client %p\n", client); 830 fsc = ceph_sb_to_client(sb);
831 dout("get_sb got existing client %p\n", fsc);
1017 } else { 832 } else {
1018 dout("get_sb using new client %p\n", client); 833 dout("get_sb using new client %p\n", fsc);
1019 err = ceph_register_bdi(sb, client); 834 err = ceph_register_bdi(sb, fsc);
1020 if (err < 0) 835 if (err < 0) {
836 res = ERR_PTR(err);
1021 goto out_splat; 837 goto out_splat;
838 }
1022 } 839 }
1023 840
1024 err = ceph_mount(client, mnt, path); 841 res = ceph_real_mount(fsc, path);
1025 if (err < 0) 842 if (IS_ERR(res))
1026 goto out_splat; 843 goto out_splat;
1027 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, 844 dout("root %p inode %p ino %llx.%llx\n", res,
1028 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode)); 845 res->d_inode, ceph_vinop(res->d_inode));
1029 return 0; 846 return res;
1030 847
1031out_splat: 848out_splat:
1032 ceph_mdsc_close_sessions(&client->mdsc); 849 ceph_mdsc_close_sessions(fsc->mdsc);
1033 deactivate_locked_super(sb); 850 deactivate_locked_super(sb);
1034 goto out_final; 851 goto out_final;
1035 852
1036out: 853out:
1037 ceph_destroy_client(client); 854 ceph_mdsc_destroy(fsc);
855 destroy_fs_client(fsc);
1038out_final: 856out_final:
1039 dout("ceph_get_sb fail %d\n", err); 857 dout("ceph_mount fail %ld\n", PTR_ERR(res));
1040 return err; 858 return res;
1041} 859}
1042 860
1043static void ceph_kill_sb(struct super_block *s) 861static void ceph_kill_sb(struct super_block *s)
1044{ 862{
1045 struct ceph_client *client = ceph_sb_to_client(s); 863 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1046 dout("kill_sb %p\n", s); 864 dout("kill_sb %p\n", s);
1047 ceph_mdsc_pre_umount(&client->mdsc); 865 ceph_mdsc_pre_umount(fsc->mdsc);
1048 kill_anon_super(s); /* will call put_super after sb is r/o */ 866 kill_anon_super(s); /* will call put_super after sb is r/o */
1049 ceph_destroy_client(client); 867 ceph_mdsc_destroy(fsc);
868 destroy_fs_client(fsc);
1050} 869}
1051 870
1052static struct file_system_type ceph_fs_type = { 871static struct file_system_type ceph_fs_type = {
1053 .owner = THIS_MODULE, 872 .owner = THIS_MODULE,
1054 .name = "ceph", 873 .name = "ceph",
1055 .get_sb = ceph_get_sb, 874 .mount = ceph_mount,
1056 .kill_sb = ceph_kill_sb, 875 .kill_sb = ceph_kill_sb,
1057 .fs_flags = FS_RENAME_DOES_D_MOVE, 876 .fs_flags = FS_RENAME_DOES_D_MOVE,
1058}; 877};
@@ -1062,36 +881,20 @@ static struct file_system_type ceph_fs_type = {
1062 881
1063static int __init init_ceph(void) 882static int __init init_ceph(void)
1064{ 883{
1065 int ret = 0; 884 int ret = init_caches();
1066
1067 ret = ceph_debugfs_init();
1068 if (ret < 0)
1069 goto out;
1070
1071 ret = ceph_msgr_init();
1072 if (ret < 0)
1073 goto out_debugfs;
1074
1075 ret = init_caches();
1076 if (ret) 885 if (ret)
1077 goto out_msgr; 886 goto out;
1078 887
1079 ret = register_filesystem(&ceph_fs_type); 888 ret = register_filesystem(&ceph_fs_type);
1080 if (ret) 889 if (ret)
1081 goto out_icache; 890 goto out_icache;
1082 891
1083 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", 892 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1084 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, 893
1085 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1086 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1087 return 0; 894 return 0;
1088 895
1089out_icache: 896out_icache:
1090 destroy_caches(); 897 destroy_caches();
1091out_msgr:
1092 ceph_msgr_exit();
1093out_debugfs:
1094 ceph_debugfs_cleanup();
1095out: 898out:
1096 return ret; 899 return ret;
1097} 900}
@@ -1101,8 +904,6 @@ static void __exit exit_ceph(void)
1101 dout("exit_ceph\n"); 904 dout("exit_ceph\n");
1102 unregister_filesystem(&ceph_fs_type); 905 unregister_filesystem(&ceph_fs_type);
1103 destroy_caches(); 906 destroy_caches();
1104 ceph_msgr_exit();
1105 ceph_debugfs_cleanup();
1106} 907}
1107 908
1108module_init(init_ceph); 909module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b87638e84c4b..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
1#ifndef _FS_CEPH_SUPER_H 1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H 2#define _FS_CEPH_SUPER_H
3 3
4#include "ceph_debug.h" 4#include <linux/ceph/ceph_debug.h>
5 5
6#include <asm/unaligned.h> 6#include <asm/unaligned.h>
7#include <linux/backing-dev.h> 7#include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include "types.h" 17#include <linux/ceph/libceph.h>
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24 18
25/* f_type in struct statfs */ 19/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400 20#define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ 24#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32 26
33/* 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
34 * Supported features 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
35 */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
36#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38 30
39/* 31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
40 * mount options
41 */
42#define CEPH_OPT_FSID (1<<0)
43#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
44#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
45#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
46#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
47#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
48#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
49 32
50#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) 33#define ceph_set_mount_opt(fsc, opt) \
34 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
35#define ceph_test_mount_opt(fsc, opt) \
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
51 37
52#define ceph_set_opt(client, opt) \ 38#define CEPH_MAX_READDIR_DEFAULT 1024
53 (client)->mount_args->flags |= CEPH_OPT_##opt; 39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
54#define ceph_test_opt(client, opt) \ 40#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
55 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
56 41
57 42struct ceph_mount_options {
58struct ceph_mount_args {
59 int sb_flags;
60 int flags; 43 int flags;
61 struct ceph_fsid fsid; 44 int sb_flags;
62 struct ceph_entity_addr my_addr; 45
63 int num_mon;
64 struct ceph_entity_addr *mon_addr;
65 int mount_timeout;
66 int osd_idle_ttl;
67 int osd_timeout;
68 int osd_keepalive_timeout;
69 int wsize; 46 int wsize;
70 int rsize; /* max readahead */ 47 int rsize; /* max readahead */
71 int congestion_kb; /* max writeback in flight */ 48 int congestion_kb; /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
73 int cap_release_safety; 50 int cap_release_safety;
74 int max_readdir; /* max readdir result (entries) */ 51 int max_readdir; /* max readdir result (entries) */
75 int max_readdir_bytes; /* max readdir result (bytes) */ 52 int max_readdir_bytes; /* max readdir result (bytes) */
76 char *snapdir_name; /* default ".snap" */
77 char *name;
78 char *secret;
79};
80 53
81/* 54 /*
82 * defaults 55 * everything above this point can be memcmp'd; everything below
83 */ 56 * is handled in compare_mount_options()
84#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 57 */
85#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
86#define CEPH_OSD_KEEPALIVE_DEFAULT 5
87#define CEPH_OSD_IDLE_TTL_DEFAULT 60
88#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
89#define CEPH_MAX_READDIR_DEFAULT 1024
90#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
91
92#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
93#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
94
95#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
96#define CEPH_AUTH_NAME_DEFAULT "guest"
97/*
98 * Delay telling the MDS we no longer want caps, in case we reopen
99 * the file. Delay a minimum amount of time, even if we send a cap
100 * message for some other reason. Otherwise, take the opportunity to
101 * update the mds to avoid sending another message later.
102 */
103#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
104#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
105
106#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
107
108/* mount state */
109enum {
110 CEPH_MOUNT_MOUNTING,
111 CEPH_MOUNT_MOUNTED,
112 CEPH_MOUNT_UNMOUNTING,
113 CEPH_MOUNT_UNMOUNTED,
114 CEPH_MOUNT_SHUTDOWN,
115};
116
117/*
118 * subtract jiffies
119 */
120static inline unsigned long time_sub(unsigned long a, unsigned long b)
121{
122 BUG_ON(time_after(b, a));
123 return (long)a - (long)b;
124}
125
126/*
127 * per-filesystem client state
128 *
129 * possibly shared by multiple mount points, if they are
130 * mounting the same ceph filesystem/cluster.
131 */
132struct ceph_client {
133 struct ceph_fsid fsid;
134 bool have_fsid;
135 58
136 struct mutex mount_mutex; /* serialize mount attempts */ 59 char *snapdir_name; /* default ".snap" */
137 struct ceph_mount_args *mount_args; 60};
138 61
62struct ceph_fs_client {
139 struct super_block *sb; 63 struct super_block *sb;
140 64
141 unsigned long mount_state; 65 struct ceph_mount_options *mount_options;
142 wait_queue_head_t auth_wq; 66 struct ceph_client *client;
143
144 int auth_err;
145 67
68 unsigned long mount_state;
146 int min_caps; /* min caps i added */ 69 int min_caps; /* min caps i added */
147 70
148 struct ceph_messenger *msgr; /* messenger instance */ 71 struct ceph_mds_client *mdsc;
149 struct ceph_mon_client monc;
150 struct ceph_mds_client mdsc;
151 struct ceph_osd_client osdc;
152 72
153 /* writeback */ 73 /* writeback */
154 mempool_t *wb_pagevec_pool; 74 mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
160 struct backing_dev_info backing_dev_info; 80 struct backing_dev_info backing_dev_info;
161 81
162#ifdef CONFIG_DEBUG_FS 82#ifdef CONFIG_DEBUG_FS
163 struct dentry *debugfs_monmap; 83 struct dentry *debugfs_dentry_lru, *debugfs_caps;
164 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
165 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
166 struct dentry *debugfs_congestion_kb; 84 struct dentry *debugfs_congestion_kb;
167 struct dentry *debugfs_bdi; 85 struct dentry *debugfs_bdi;
86 struct dentry *debugfs_mdsc, *debugfs_mdsmap;
168#endif 87#endif
169}; 88};
170 89
90
171/* 91/*
172 * File i/o capability. This tracks shared state with the metadata 92 * File i/o capability. This tracks shared state with the metadata
173 * server that allows us to cache or writeback attributes or to read 93 * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
275 int should_free_val; 195 int should_free_val;
276}; 196};
277 197
198/*
199 * Ceph dentry state
200 */
201struct ceph_dentry_info {
202 struct ceph_mds_session *lease_session;
203 u32 lease_gen, lease_shared_gen;
204 u32 lease_seq;
205 unsigned long lease_renew_after, lease_renew_from;
206 struct list_head lru;
207 struct dentry *dentry;
208 u64 time;
209 u64 offset;
210};
211
278struct ceph_inode_xattrs_info { 212struct ceph_inode_xattrs_info {
279 /* 213 /*
280 * (still encoded) xattr blob. we avoid the overhead of parsing 214 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
296/* 230/*
297 * Ceph inode. 231 * Ceph inode.
298 */ 232 */
299#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
300#define CEPH_I_NODELAY 4 /* do not delay cap release */
301#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
302#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
303
304struct ceph_inode_info { 233struct ceph_inode_info {
305 struct ceph_vino i_vino; /* ceph ino + snap */ 234 struct ceph_vino i_vino; /* ceph ino + snap */
306 235
@@ -310,6 +239,7 @@ struct ceph_inode_info {
310 unsigned i_ceph_flags; 239 unsigned i_ceph_flags;
311 unsigned long i_release_count; 240 unsigned long i_release_count;
312 241
242 struct ceph_dir_layout i_dir_layout;
313 struct ceph_file_layout i_layout; 243 struct ceph_file_layout i_layout;
314 char *i_symlink; 244 char *i_symlink;
315 245
@@ -364,9 +294,7 @@ struct ceph_inode_info {
364 int i_rd_ref, i_rdcache_ref, i_wr_ref; 294 int i_rd_ref, i_rdcache_ref, i_wr_ref;
365 int i_wrbuffer_ref, i_wrbuffer_ref_head; 295 int i_wrbuffer_ref, i_wrbuffer_ref_head;
366 u32 i_shared_gen; /* increment each time we get FILE_SHARED */ 296 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
367 u32 i_rdcache_gen; /* we increment this each time we get 297 u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
368 FILE_CACHE. If it's non-zero, we
369 _may_ have cached pages. */
370 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ 298 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
371 299
372 struct list_head i_unsafe_writes; /* uncommitted sync writes */ 300 struct list_head i_unsafe_writes; /* uncommitted sync writes */
@@ -391,6 +319,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
391 return container_of(inode, struct ceph_inode_info, vfs_inode); 319 return container_of(inode, struct ceph_inode_info, vfs_inode);
392} 320}
393 321
322static inline struct ceph_vino ceph_vino(struct inode *inode)
323{
324 return ceph_inode(inode)->i_vino;
325}
326
327/*
328 * ino_t is <64 bits on many architectures, blech.
329 *
330 * don't include snap in ino hash, at least for now.
331 */
332static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
333{
334 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
335#if BITS_PER_LONG == 32
336 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
337 if (!ino)
338 ino = 1;
339#endif
340 return ino;
341}
342
343/* for printf-style formatting */
344#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
345
346static inline u64 ceph_ino(struct inode *inode)
347{
348 return ceph_inode(inode)->i_vino.ino;
349}
350static inline u64 ceph_snap(struct inode *inode)
351{
352 return ceph_inode(inode)->i_vino.snap;
353}
354
355static inline int ceph_ino_compare(struct inode *inode, void *data)
356{
357 struct ceph_vino *pvino = (struct ceph_vino *)data;
358 struct ceph_inode_info *ci = ceph_inode(inode);
359 return ci->i_vino.ino == pvino->ino &&
360 ci->i_vino.snap == pvino->snap;
361}
362
363static inline struct inode *ceph_find_inode(struct super_block *sb,
364 struct ceph_vino vino)
365{
366 ino_t t = ceph_vino_to_ino(vino);
367 return ilookup5(sb, t, ceph_ino_compare, &vino);
368}
369
370
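On 32-bit builds ceph_vino_to_ino() folds the top half of the 64-bit ceph ino into the 32-bit ino_t. With a made-up inode number:

    vino.ino = 0x0000001234567890
    ino      = 0x34567890 ^ 0x00000012 = 0x34567882

and a result of 0 is bumped to 1, since a zero ino would be ambiguous in the icache lookup done by ceph_find_inode().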
371/*
372 * Ceph inode.
373 */
374#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
375#define CEPH_I_NODELAY 4 /* do not delay cap release */
376#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
377#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
378
394static inline void ceph_i_clear(struct inode *inode, unsigned mask) 379static inline void ceph_i_clear(struct inode *inode, unsigned mask)
395{ 380{
396 struct ceph_inode_info *ci = ceph_inode(inode); 381 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +399,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
414 struct ceph_inode_info *ci = ceph_inode(inode); 399 struct ceph_inode_info *ci = ceph_inode(inode);
415 bool r; 400 bool r;
416 401
417 smp_mb(); 402 spin_lock(&inode->i_lock);
418 r = (ci->i_ceph_flags & mask) == mask; 403 r = (ci->i_ceph_flags & mask) == mask;
404 spin_unlock(&inode->i_lock);
419 return r; 405 return r;
420} 406}
421 407
@@ -432,20 +418,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
432 struct ceph_inode_frag *pfrag, 418 struct ceph_inode_frag *pfrag,
433 int *found); 419 int *found);
434 420
435/*
436 * Ceph dentry state
437 */
438struct ceph_dentry_info {
439 struct ceph_mds_session *lease_session;
440 u32 lease_gen, lease_shared_gen;
441 u32 lease_seq;
442 unsigned long lease_renew_after, lease_renew_from;
443 struct list_head lru;
444 struct dentry *dentry;
445 u64 time;
446 u64 offset;
447};
448
449static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) 421static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
450{ 422{
451 return (struct ceph_dentry_info *)dentry->d_fsdata; 423 return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +428,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
456 return ((loff_t)frag << 32) | (loff_t)off; 428 return ((loff_t)frag << 32) | (loff_t)off;
457} 429}
458 430
459/*
460 * ino_t is <64 bits on many architectures, blech.
461 *
462 * don't include snap in ino hash, at least for now.
463 */
464static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
465{
466 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
467#if BITS_PER_LONG == 32
468 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
469 if (!ino)
470 ino = 1;
471#endif
472 return ino;
473}
474
475static inline int ceph_set_ino_cb(struct inode *inode, void *data) 431static inline int ceph_set_ino_cb(struct inode *inode, void *data)
476{ 432{
477 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; 433 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +435,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
479 return 0; 435 return 0;
480} 436}
481 437
482static inline struct ceph_vino ceph_vino(struct inode *inode)
483{
484 return ceph_inode(inode)->i_vino;
485}
486
487/* for printf-style formatting */
488#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
489
490static inline u64 ceph_ino(struct inode *inode)
491{
492 return ceph_inode(inode)->i_vino.ino;
493}
494static inline u64 ceph_snap(struct inode *inode)
495{
496 return ceph_inode(inode)->i_vino.snap;
497}
498
499static inline int ceph_ino_compare(struct inode *inode, void *data)
500{
501 struct ceph_vino *pvino = (struct ceph_vino *)data;
502 struct ceph_inode_info *ci = ceph_inode(inode);
503 return ci->i_vino.ino == pvino->ino &&
504 ci->i_vino.snap == pvino->snap;
505}
506
507static inline struct inode *ceph_find_inode(struct super_block *sb,
508 struct ceph_vino vino)
509{
510 ino_t t = ceph_vino_to_ino(vino);
511 return ilookup5(sb, t, ceph_ino_compare, &vino);
512}
513
514
515/* 438/*
516 * caps helpers 439 * caps helpers
517 */ 440 */
@@ -576,18 +499,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
576 struct ceph_cap_reservation *ctx, int need); 499 struct ceph_cap_reservation *ctx, int need);
577extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 500extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
578 struct ceph_cap_reservation *ctx); 501 struct ceph_cap_reservation *ctx);
579extern void ceph_reservation_status(struct ceph_client *client, 502extern void ceph_reservation_status(struct ceph_fs_client *client,
580 int *total, int *avail, int *used, 503 int *total, int *avail, int *used,
581 int *reserved, int *min); 504 int *reserved, int *min);
582 505
583static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) 506static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
584{ 507{
585 return (struct ceph_client *)inode->i_sb->s_fs_info; 508 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
586} 509}
587 510
588static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) 511static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
589{ 512{
590 return (struct ceph_client *)sb->s_fs_info; 513 return (struct ceph_fs_client *)sb->s_fs_info;
591} 514}
592 515
593 516
@@ -617,51 +540,6 @@ struct ceph_file_info {
617 540
618 541
619/* 542/*
620 * snapshots
621 */
622
623/*
624 * A "snap context" is the set of existing snapshots when we
625 * write data. It is used by the OSD to guide its COW behavior.
626 *
627 * The ceph_snap_context is refcounted, and attached to each dirty
628 * page, indicating which context the dirty data belonged when it was
629 * dirtied.
630 */
631struct ceph_snap_context {
632 atomic_t nref;
633 u64 seq;
634 int num_snaps;
635 u64 snaps[];
636};
637
638static inline struct ceph_snap_context *
639ceph_get_snap_context(struct ceph_snap_context *sc)
640{
641 /*
642 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
643 atomic_read(&sc->nref)+1);
644 */
645 if (sc)
646 atomic_inc(&sc->nref);
647 return sc;
648}
649
650static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
651{
652 if (!sc)
653 return;
654 /*
655 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
656 atomic_read(&sc->nref)-1);
657 */
658 if (atomic_dec_and_test(&sc->nref)) {
659 /*printk(" deleting snap_context %p\n", sc);*/
660 kfree(sc);
661 }
662}
663
664/*
665 * A "snap realm" describes a subset of the file hierarchy sharing 543 * A "snap realm" describes a subset of the file hierarchy sharing
666 * the same set of snapshots that apply to it. The realms themselves 544 * the same set of snapshots that apply to it. The realms themselves
667 * are organized into a hierarchy, such that children inherit (some of) 545 * are organized into a hierarchy, such that children inherit (some of)
@@ -699,16 +577,33 @@ struct ceph_snap_realm {
699 spinlock_t inodes_with_caps_lock; 577 spinlock_t inodes_with_caps_lock;
700}; 578};
701 579
702 580static inline int default_congestion_kb(void)
703
704/*
705 * calculate the number of pages a given length and offset map onto,
706 * if we align the data.
707 */
708static inline int calc_pages_for(u64 off, u64 len)
709{ 581{
710 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - 582 int congestion_kb;
711 (off >> PAGE_CACHE_SHIFT); 583
584 /*
585 * Copied from NFS
586 *
587 * congestion size, scale with available memory.
588 *
589 * 64MB: 8192k
590 * 128MB: 11585k
591 * 256MB: 16384k
592 * 512MB: 23170k
593 * 1GB: 32768k
594 * 2GB: 46340k
595 * 4GB: 65536k
596 * 8GB: 92681k
597 * 16GB: 131072k
598 *
599 * This allows larger machines to have larger/more transfers.
600 * Limit the default to 256M
601 */
602 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
603 if (congestion_kb > 256*1024)
604 congestion_kb = 256*1024;
605
606 return congestion_kb;
712} 607}
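The table checks out against the formula, assuming 4 KB pages: a 1 GB machine has totalram_pages = 262144, int_sqrt(262144) = 512, and (16 * 512) << (PAGE_SHIFT - 10) = 8192 << 2 = 32768k, matching the 1GB row. The 256M ceiling only engages above roughly 64 GB of RAM, where 64 * int_sqrt(totalram_pages) would exceed 262144k.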
713 608
714 609
@@ -741,16 +636,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
741 ci_item)->writing; 636 ci_item)->writing;
742} 637}
743 638
744
745/* super.c */
746extern struct kmem_cache *ceph_inode_cachep;
747extern struct kmem_cache *ceph_cap_cachep;
748extern struct kmem_cache *ceph_dentry_cachep;
749extern struct kmem_cache *ceph_file_cachep;
750
751extern const char *ceph_msg_type_name(int type);
752extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
753
754/* inode.c */ 639/* inode.c */
755extern const struct inode_operations ceph_file_iops; 640extern const struct inode_operations ceph_file_iops;
756 641
@@ -781,7 +666,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
781extern void ceph_queue_writeback(struct inode *inode); 666extern void ceph_queue_writeback(struct inode *inode);
782 667
783extern int ceph_do_getattr(struct inode *inode, int mask); 668extern int ceph_do_getattr(struct inode *inode, int mask);
784extern int ceph_permission(struct inode *inode, int mask); 669extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
785extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); 670extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
786extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, 671extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
787 struct kstat *stat); 672 struct kstat *stat);
@@ -857,12 +742,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
857/* file.c */ 742/* file.c */
858extern const struct file_operations ceph_file_fops; 743extern const struct file_operations ceph_file_fops;
859extern const struct address_space_operations ceph_aops; 744extern const struct address_space_operations ceph_aops;
745extern int ceph_copy_to_page_vector(struct page **pages,
746 const char *data,
747 loff_t off, size_t len);
748extern int ceph_copy_from_page_vector(struct page **pages,
749 char *data,
750 loff_t off, size_t len);
751extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
860extern int ceph_open(struct inode *inode, struct file *file); 752extern int ceph_open(struct inode *inode, struct file *file);
861extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, 753extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
862 struct nameidata *nd, int mode, 754 struct nameidata *nd, int mode,
863 int locked_dir); 755 int locked_dir);
864extern int ceph_release(struct inode *inode, struct file *filp); 756extern int ceph_release(struct inode *inode, struct file *filp);
865extern void ceph_release_page_vector(struct page **pages, int num_pages);
866 757
867/* dir.c */ 758/* dir.c */
868extern const struct file_operations ceph_dir_fops; 759extern const struct file_operations ceph_dir_fops;
@@ -878,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
878extern void ceph_dentry_lru_touch(struct dentry *dn); 769extern void ceph_dentry_lru_touch(struct dentry *dn);
879extern void ceph_dentry_lru_del(struct dentry *dn); 770extern void ceph_dentry_lru_del(struct dentry *dn);
880extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 771extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
772extern unsigned ceph_dentry_hash(struct dentry *dn);
881 773
882/* 774/*
883 * our d_ops vary depending on whether the inode is live, 775 * our d_ops vary depending on whether the inode is live,
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
892/* export.c */ 784/* export.c */
893extern const struct export_operations ceph_export_ops; 785extern const struct export_operations ceph_export_ops;
894 786
895/* debugfs.c */
896extern int ceph_debugfs_init(void);
897extern void ceph_debugfs_cleanup(void);
898extern int ceph_debugfs_client_init(struct ceph_client *client);
899extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
900
901/* locks.c */ 787/* locks.c */
902extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 788extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
903extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 789extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
914 return NULL; 800 return NULL;
915} 801}
916 802
803/* debugfs.c */
804extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
805extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
806
917#endif /* _FS_CEPH_SUPER_H */ 807#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
deleted file mode 100644
index 28b35a005ec2..000000000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b73..8c9eba6ef9df 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2
2#include "super.h" 3#include "super.h"
3#include "decode.h" 4#include "mds_client.h"
5
6#include <linux/ceph/decode.h>
4 7
5#include <linux/xattr.h> 8#include <linux/xattr.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
@@ -216,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
216 struct rb_node **p; 219 struct rb_node **p;
217 struct rb_node *parent = NULL; 220 struct rb_node *parent = NULL;
218 struct ceph_inode_xattr *xattr = NULL; 221 struct ceph_inode_xattr *xattr = NULL;
222 int name_len = strlen(name);
219 int c; 223 int c;
220 224
221 p = &ci->i_xattrs.index.rb_node; 225 p = &ci->i_xattrs.index.rb_node;
@@ -223,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
223 parent = *p; 227 parent = *p;
224 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
225 c = strncmp(name, xattr->name, xattr->name_len); 229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c == 0 && name_len > xattr->name_len)
231 c = 1;
226 if (c < 0) 232 if (c < 0)
227 p = &(*p)->rb_left; 233 p = &(*p)->rb_left;
228 else if (c > 0) 234 else if (c > 0)
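
The two added lines fix a prefix-collision bug in this red-black-tree lookup: strncmp() bounded by the stored entry's name_len reports a match whenever the stored name is a prefix of the name being looked up. A worked example with hypothetical names:

    /* stored entry "user.foo" has name_len == 8 */
    c = strncmp("user.foobar", "user.foo", 8);  /* == 0, a false match */
    /* the added check sees name_len 11 > 8 and forces c = 1, so the
       lookup keeps descending right instead of returning the wrong entry */
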
@@ -620,12 +626,12 @@ out:
620static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 626static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
621 const char *value, size_t size, int flags) 627 const char *value, size_t size, int flags)
622{ 628{
623 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 629 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
624 struct inode *inode = dentry->d_inode; 630 struct inode *inode = dentry->d_inode;
625 struct ceph_inode_info *ci = ceph_inode(inode); 631 struct ceph_inode_info *ci = ceph_inode(inode);
626 struct inode *parent_inode = dentry->d_parent->d_inode; 632 struct inode *parent_inode = dentry->d_parent->d_inode;
627 struct ceph_mds_request *req; 633 struct ceph_mds_request *req;
628 struct ceph_mds_client *mdsc = &client->mdsc; 634 struct ceph_mds_client *mdsc = fsc->mdsc;
629 int err; 635 int err;
630 int i, nr_pages; 636 int i, nr_pages;
631 struct page **pages = NULL; 637 struct page **pages = NULL;
@@ -713,10 +719,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
713 719
714 /* preallocate memory for xattr name, value, index node */ 720 /* preallocate memory for xattr name, value, index node */
715 err = -ENOMEM; 721 err = -ENOMEM;
716 newname = kmalloc(name_len + 1, GFP_NOFS); 722 newname = kmemdup(name, name_len + 1, GFP_NOFS);
717 if (!newname) 723 if (!newname)
718 goto out; 724 goto out;
719 memcpy(newname, name, name_len + 1);
720 725
721 if (val_len) { 726 if (val_len) {
722 newval = kmalloc(val_len + 1, GFP_NOFS); 727 newval = kmalloc(val_len + 1, GFP_NOFS);
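
kmemdup() is exactly the kmalloc()+memcpy() pair it replaces here, folded into one call; semantically it behaves like this sketch:

    static void *kmemdup_equiv(const void *src, size_t len, gfp_t gfp)
    {
            void *p = kmalloc(len, gfp);    /* same flags as the open-coded version */
            if (p)
                    memcpy(p, src, len);
            return p;
    }
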
@@ -777,8 +782,8 @@ out:
777 782
778static int ceph_send_removexattr(struct dentry *dentry, const char *name) 783static int ceph_send_removexattr(struct dentry *dentry, const char *name)
779{ 784{
780 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 785 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
781 struct ceph_mds_client *mdsc = &client->mdsc; 786 struct ceph_mds_client *mdsc = fsc->mdsc;
782 struct inode *inode = dentry->d_inode; 787 struct inode *inode = dentry->d_inode;
783 struct inode *parent_inode = dentry->d_parent->d_inode; 788 struct inode *parent_inode = dentry->d_parent->d_inode;
784 struct ceph_mds_request *req; 789 struct ceph_mds_request *req;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 143d393881cb..dca9e5e0f73b 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -59,7 +59,7 @@ static struct char_device_struct {
59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; 59} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
60 60
61/* index in the above */ 61/* index in the above */
62static inline int major_to_index(int major) 62static inline int major_to_index(unsigned major)
63{ 63{
64 return major % CHRDEV_MAJOR_HASH_SIZE; 64 return major % CHRDEV_MAJOR_HASH_SIZE;
65} 65}
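
Widening major to unsigned matters because C's % operator keeps the sign of a negative dividend, so a negative major passed through the old signed prototype could have produced a negative index into chrdevs[]. For example:

    int bad = -7 % 255;               /* -7: would index before chrdevs[0] */
    unsigned ok = (unsigned)-7 % 255; /* 4294967289 % 255 == 249, a valid slot */
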
@@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
417 return ret; 417 return ret;
418} 418}
419 419
420int cdev_index(struct inode *inode)
421{
422 int idx;
423 struct kobject *kobj;
424
425 kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
426 if (!kobj)
427 return -1;
428 kobject_put(kobj);
429 return idx;
430}
431
432void cd_forget(struct inode *inode) 420void cd_forget(struct inode *inode)
433{ 421{
434 spin_lock(&cdev_lock); 422 spin_lock(&cdev_lock);
@@ -456,6 +444,7 @@ static void cdev_purge(struct cdev *cdev)
456 */ 444 */
457const struct file_operations def_chr_fops = { 445const struct file_operations def_chr_fops = {
458 .open = chrdev_open, 446 .open = chrdev_open,
447 .llseek = noop_llseek,
459}; 448};
460 449
461static struct kobject *exact_match(dev_t dev, int *part, void *data) 450static struct kobject *exact_match(dev_t dev, int *part, void *data)
@@ -581,7 +570,6 @@ EXPORT_SYMBOL(cdev_init);
581EXPORT_SYMBOL(cdev_alloc); 570EXPORT_SYMBOL(cdev_alloc);
582EXPORT_SYMBOL(cdev_del); 571EXPORT_SYMBOL(cdev_del);
583EXPORT_SYMBOL(cdev_add); 572EXPORT_SYMBOL(cdev_add);
584EXPORT_SYMBOL(cdev_index);
585EXPORT_SYMBOL(__register_chrdev); 573EXPORT_SYMBOL(__register_chrdev);
586EXPORT_SYMBOL(__unregister_chrdev); 574EXPORT_SYMBOL(__unregister_chrdev);
587EXPORT_SYMBOL(directly_mappable_cdev_bdi); 575EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb2..7cb0f7f847e4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,11 @@ config CIFS
2 tristate "CIFS support (advanced network filesystem, SMBFS successor)" 2 tristate "CIFS support (advanced network filesystem, SMBFS successor)"
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select CRYPTO
6 select CRYPTO_MD4
7 select CRYPTO_MD5
8 select CRYPTO_HMAC
9 select CRYPTO_ARC4
5 help 10 help
6 This is the client VFS module for the Common Internet File System 11 This is the client VFS module for the Common Internet File System
7 (CIFS) protocol which is the successor to the Server Message Block 12 (CIFS) protocol which is the successor to the Server Message Block
@@ -140,6 +145,13 @@ config CIFS_FSCACHE
140 to be cached locally on disk through the general filesystem cache 145 to be cached locally on disk through the general filesystem cache
141 manager. If unsure, say N. 146 manager. If unsure, say N.
142 147
148config CIFS_ACL
149 bool "Provide CIFS ACL support (EXPERIMENTAL)"
150 depends on EXPERIMENTAL && CIFS_XATTR
151 help
 152	  Allows fetching the CIFS/NTFS ACL from the server. The DACL blob
 153	  is handed over to the application/caller.
154
143config CIFS_EXPERIMENTAL 155config CIFS_EXPERIMENTAL
144 bool "CIFS Experimental Features (EXPERIMENTAL)" 156 bool "CIFS Experimental Features (EXPERIMENTAL)"
145 depends on CIFS && EXPERIMENTAL 157 depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index adefa60a9bdc..d87558448e3d 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,8 +5,10 @@ obj-$(CONFIG_CIFS) += cifs.o
5 5
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ 8 cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o cifsacl.o 9 readdir.o ioctl.o sess.o export.o
10
11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
10 12
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 13cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
12 14
diff --git a/fs/cifs/README b/fs/cifs/README
index 7099a526f775..fe1683590828 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -337,6 +337,15 @@ A partial list of the supported mount options follows:
337 wsize default write size (default 57344) 337 wsize default write size (default 57344)
338 maximum wsize currently allowed by CIFS is 57344 (fourteen 338 maximum wsize currently allowed by CIFS is 57344 (fourteen
339 4096 byte pages) 339 4096 byte pages)
340 actimeo=n attribute cache timeout in seconds (default 1 second).
341 After this timeout, the cifs client requests fresh attribute
 342	 information from the server. This option allows tuning the
 343	 attribute cache timeout to suit workload needs. Shorter
 344	 timeouts mean better cache coherency, but an increased number
 345	 of calls to the server. Longer timeouts mean a reduced number
 346	 of calls to the server at the expense of less strict cache
 347	 coherency checks (i.e. the attribute cache may be incorrect for
 348	 a short period of time).
340 rw mount the network share read-write (note that the 349 rw mount the network share read-write (note that the
341 server may still consider the share read-only) 350 server may still consider the share read-only)
342 ro mount network share read-only 351 ro mount network share read-only
@@ -443,6 +452,11 @@ A partial list of the supported mount options follows:
443 if oplock (caching token) is granted and held. Note that 452 if oplock (caching token) is granted and held. Note that
444 direct allows write operations larger than page size 453 direct allows write operations larger than page size
445 to be sent to the server. 454 to be sent to the server.
 455 strictcache	Switches on strict cache mode. In this mode the
 456		client reads from the cache as long as it holds Oplock Level II;
 457		otherwise it reads from the server. All written data are stored
 458		in the cache, but if the client doesn't hold an Exclusive Oplock,
 459		it also writes the data to the server.
446 acl Allow setfacl and getfacl to manage posix ACLs if server 460 acl Allow setfacl and getfacl to manage posix ACLs if server
447 supports them. (default) 461 supports them. (default)
448 noacl Do not allow setfacl and getfacl calls on this mount 462 noacl Do not allow setfacl and getfacl calls on this mount
@@ -527,6 +541,11 @@ A partial list of the supported mount options follows:
527 SFU does). In the future the bottom 9 bits of the 541 SFU does). In the future the bottom 9 bits of the
528 mode also will be emulated using queries of the security 542 mode also will be emulated using queries of the security
529 descriptor (ACL). 543 descriptor (ACL).
544 mfsymlinks Enable support for Minshall+French symlinks
545 (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
546 This option is ignored when specified together with the
547 'sfu' option. Minshall+French symlinks are used even if
548 the server supports the CIFS Unix Extensions.
530 sign Must use packet signing (helps avoid unwanted data modification 549 sign Must use packet signing (helps avoid unwanted data modification
531 by intermediate systems in the route). Note that signing 550 by intermediate systems in the route). Note that signing
532 does not work with lanman or plaintext authentication. 551 does not work with lanman or plaintext authentication.
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e52..355abcdcda98 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
81 81
82v) mount check for unmatched uids 82v) mount check for unmatched uids
83 83
84w) Add support for new vfs entry points for setlease and fallocate 84w) Add support for new vfs entry point for fallocate
85 85
86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
87processes can proceed better in parallel (on the server) 87processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 224d7bbd1fcc..e654dfd092c3 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -64,7 +64,9 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
64 void *buffer, uint16_t maxbuf) 64 void *buffer, uint16_t maxbuf)
65{ 65{
66 const struct TCP_Server_Info *server = cookie_netfs_data; 66 const struct TCP_Server_Info *server = cookie_netfs_data;
67 const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; 67 const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
68 const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
69 const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
68 struct cifs_server_key *key = buffer; 70 struct cifs_server_key *key = buffer;
69 uint16_t key_len = sizeof(struct cifs_server_key); 71 uint16_t key_len = sizeof(struct cifs_server_key);
70 72
@@ -76,16 +78,16 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
76 */ 78 */
77 switch (sa->sa_family) { 79 switch (sa->sa_family) {
78 case AF_INET: 80 case AF_INET:
79 key->family = server->addr.sockAddr.sin_family; 81 key->family = sa->sa_family;
80 key->port = server->addr.sockAddr.sin_port; 82 key->port = addr->sin_port;
81 key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; 83 key->addr[0].ipv4_addr = addr->sin_addr;
82 key_len += sizeof(key->addr[0].ipv4_addr); 84 key_len += sizeof(key->addr[0].ipv4_addr);
83 break; 85 break;
84 86
85 case AF_INET6: 87 case AF_INET6:
86 key->family = server->addr.sockAddr6.sin6_family; 88 key->family = sa->sa_family;
87 key->port = server->addr.sockAddr6.sin6_port; 89 key->port = addr6->sin6_port;
88 key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; 90 key->addr[0].ipv6_addr = addr6->sin6_addr;
89 key_len += sizeof(key->addr[0].ipv6_addr); 91 key_len += sizeof(key->addr[0].ipv6_addr);
90 break; 92 break;
91 93
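
The rewritten key builder reads the peer address from a generic struct sockaddr_storage, so the per-family casts above are the standard idiom: inspect ss_family first, then reinterpret the storage as sockaddr_in or sockaddr_in6. A self-contained sketch of the same pattern (hypothetical helper, not in this patch):

    static __be16 peer_port(const struct sockaddr_storage *ss)
    {
            if (ss->ss_family == AF_INET)
                    return ((const struct sockaddr_in *)ss)->sin_port;
            if (ss->ss_family == AF_INET6)
                    return ((const struct sockaddr_in6 *)ss)->sin6_port;
            return 0;
    }
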
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index eb1ba493489f..65829d32128c 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->callback_data,
87 mid_entry->mid); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
@@ -119,36 +119,34 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
119 "Display Internal CIFS Data Structures for Debugging\n" 119 "Display Internal CIFS Data Structures for Debugging\n"
120 "---------------------------------------------------\n"); 120 "---------------------------------------------------\n");
121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); 121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
122 seq_printf(m, "Features: "); 122 seq_printf(m, "Features:");
123#ifdef CONFIG_CIFS_DFS_UPCALL 123#ifdef CONFIG_CIFS_DFS_UPCALL
124 seq_printf(m, "dfs"); 124 seq_printf(m, " dfs");
125 seq_putc(m, ' ');
126#endif 125#endif
127#ifdef CONFIG_CIFS_FSCACHE 126#ifdef CONFIG_CIFS_FSCACHE
128 seq_printf(m, "fscache"); 127 seq_printf(m, " fscache");
129 seq_putc(m, ' ');
130#endif 128#endif
131#ifdef CONFIG_CIFS_WEAK_PW_HASH 129#ifdef CONFIG_CIFS_WEAK_PW_HASH
132 seq_printf(m, "lanman"); 130 seq_printf(m, " lanman");
133 seq_putc(m, ' ');
134#endif 131#endif
135#ifdef CONFIG_CIFS_POSIX 132#ifdef CONFIG_CIFS_POSIX
136 seq_printf(m, "posix"); 133 seq_printf(m, " posix");
137 seq_putc(m, ' ');
138#endif 134#endif
139#ifdef CONFIG_CIFS_UPCALL 135#ifdef CONFIG_CIFS_UPCALL
140 seq_printf(m, "spnego"); 136 seq_printf(m, " spnego");
141 seq_putc(m, ' ');
142#endif 137#endif
143#ifdef CONFIG_CIFS_XATTR 138#ifdef CONFIG_CIFS_XATTR
144 seq_printf(m, "xattr"); 139 seq_printf(m, " xattr");
140#endif
141#ifdef CONFIG_CIFS_ACL
142 seq_printf(m, " acl");
145#endif 143#endif
146 seq_putc(m, '\n'); 144 seq_putc(m, '\n');
147 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); 145 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
148 seq_printf(m, "Servers:"); 146 seq_printf(m, "Servers:");
149 147
150 i = 0; 148 i = 0;
151 read_lock(&cifs_tcp_ses_lock); 149 spin_lock(&cifs_tcp_ses_lock);
152 list_for_each(tmp1, &cifs_tcp_ses_list) { 150 list_for_each(tmp1, &cifs_tcp_ses_list) {
153 server = list_entry(tmp1, struct TCP_Server_Info, 151 server = list_entry(tmp1, struct TCP_Server_Info,
154 tcp_ses_list); 152 tcp_ses_list);
@@ -220,17 +218,17 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
220 mid_entry = list_entry(tmp3, struct mid_q_entry, 218 mid_entry = list_entry(tmp3, struct mid_q_entry,
221 qhead); 219 qhead);
222 seq_printf(m, "\tState: %d com: %d pid:" 220 seq_printf(m, "\tState: %d com: %d pid:"
223 " %d tsk: %p mid %d\n", 221 " %d cbdata: %p mid %d\n",
224 mid_entry->midState, 222 mid_entry->midState,
225 (int)mid_entry->command, 223 (int)mid_entry->command,
226 mid_entry->pid, 224 mid_entry->pid,
227 mid_entry->tsk, 225 mid_entry->callback_data,
228 mid_entry->mid); 226 mid_entry->mid);
229 } 227 }
230 spin_unlock(&GlobalMid_Lock); 228 spin_unlock(&GlobalMid_Lock);
231 } 229 }
232 } 230 }
233 read_unlock(&cifs_tcp_ses_lock); 231 spin_unlock(&cifs_tcp_ses_lock);
234 seq_putc(m, '\n'); 232 seq_putc(m, '\n');
235 233
236 /* BB add code to dump additional info such as TCP session info now */ 234 /* BB add code to dump additional info such as TCP session info now */
@@ -270,7 +268,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
270 atomic_set(&totBufAllocCount, 0); 268 atomic_set(&totBufAllocCount, 0);
271 atomic_set(&totSmBufAllocCount, 0); 269 atomic_set(&totSmBufAllocCount, 0);
272#endif /* CONFIG_CIFS_STATS2 */ 270#endif /* CONFIG_CIFS_STATS2 */
273 read_lock(&cifs_tcp_ses_lock); 271 spin_lock(&cifs_tcp_ses_lock);
274 list_for_each(tmp1, &cifs_tcp_ses_list) { 272 list_for_each(tmp1, &cifs_tcp_ses_list) {
275 server = list_entry(tmp1, struct TCP_Server_Info, 273 server = list_entry(tmp1, struct TCP_Server_Info,
276 tcp_ses_list); 274 tcp_ses_list);
@@ -303,7 +301,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
303 } 301 }
304 } 302 }
305 } 303 }
306 read_unlock(&cifs_tcp_ses_lock); 304 spin_unlock(&cifs_tcp_ses_lock);
307 } 305 }
308 306
309 return count; 307 return count;
@@ -333,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
333 atomic_read(&totSmBufAllocCount)); 331 atomic_read(&totSmBufAllocCount));
334#endif /* CONFIG_CIFS_STATS2 */ 332#endif /* CONFIG_CIFS_STATS2 */
335 333
336 seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); 334 seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
337 seq_printf(m, 335 seq_printf(m,
338 "\n%d session %d share reconnects\n", 336 "\n%d session %d share reconnects\n",
339 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); 337 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
@@ -343,7 +341,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
343 GlobalCurrentXid, GlobalMaxActiveXid); 341 GlobalCurrentXid, GlobalMaxActiveXid);
344 342
345 i = 0; 343 i = 0;
346 read_lock(&cifs_tcp_ses_lock); 344 spin_lock(&cifs_tcp_ses_lock);
347 list_for_each(tmp1, &cifs_tcp_ses_list) { 345 list_for_each(tmp1, &cifs_tcp_ses_list) {
348 server = list_entry(tmp1, struct TCP_Server_Info, 346 server = list_entry(tmp1, struct TCP_Server_Info,
349 tcp_ses_list); 347 tcp_ses_list);
@@ -397,7 +395,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
397 } 395 }
398 } 396 }
399 } 397 }
400 read_unlock(&cifs_tcp_ses_lock); 398 spin_unlock(&cifs_tcp_ses_lock);
401 399
402 seq_putc(m, '\n'); 400 seq_putc(m, '\n');
403 return 0; 401 return 0;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index aa316891ac0c..8942b28cf807 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -34,7 +34,7 @@ void cifs_dump_mids(struct TCP_Server_Info *);
34extern int traceSMB; /* flag which enables the function below */ 34extern int traceSMB; /* flag which enables the function below */
35void dump_smb(struct smb_hdr *, int); 35void dump_smb(struct smb_hdr *, int);
36#define CIFS_INFO 0x01 36#define CIFS_INFO 0x01
37#define CIFS_RC 0x02 37#define CIFS_RC 0x02
38#define CIFS_TIMER 0x04 38#define CIFS_TIMER 0x04
39 39
40/* 40/*
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d6ced7aa23cf..0a265ad9e426 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -44,8 +44,7 @@ static void cifs_dfs_expire_automounts(struct work_struct *work)
44void cifs_dfs_release_automount_timer(void) 44void cifs_dfs_release_automount_timer(void)
45{ 45{
46 BUG_ON(!list_empty(&cifs_dfs_automount_list)); 46 BUG_ON(!list_empty(&cifs_dfs_automount_list));
47 cancel_delayed_work(&cifs_dfs_automount_task); 47 cancel_delayed_work_sync(&cifs_dfs_automount_task);
48 flush_scheduled_work();
49} 48}
50 49
51/** 50/**
@@ -256,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
256 255
257} 256}
258 257
259static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
260 struct list_head *mntlist)
261{
262 /* stolen from afs code */
263 int err;
264
265 mntget(newmnt);
266 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
267 switch (err) {
268 case 0:
269 path_put(&nd->path);
270 nd->path.mnt = newmnt;
271 nd->path.dentry = dget(newmnt->mnt_root);
272 schedule_delayed_work(&cifs_dfs_automount_task,
273 cifs_dfs_mountpoint_expiry_timeout);
274 break;
275 case -EBUSY:
276 /* someone else made a mount here whilst we were busy */
277 while (d_mountpoint(nd->path.dentry) &&
278 follow_down(&nd->path))
279 ;
280 err = 0;
281 default:
282 mntput(newmnt);
283 break;
284 }
285 return err;
286}
287
288static void dump_referral(const struct dfs_info3_param *ref) 258static void dump_referral(const struct dfs_info3_param *ref)
289{ 259{
290 cFYI(1, "DFS: ref path: %s", ref->path_name); 260 cFYI(1, "DFS: ref path: %s", ref->path_name);
@@ -294,34 +264,23 @@ static void dump_referral(const struct dfs_info3_param *ref)
294 ref->path_consumed); 264 ref->path_consumed);
295} 265}
296 266
297 267/*
298static void* 268 * Create a vfsmount that we can automount
299cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 269 */
270static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
300{ 271{
301 struct dfs_info3_param *referrals = NULL; 272 struct dfs_info3_param *referrals = NULL;
302 unsigned int num_referrals = 0; 273 unsigned int num_referrals = 0;
303 struct cifs_sb_info *cifs_sb; 274 struct cifs_sb_info *cifs_sb;
304 struct cifsSesInfo *ses; 275 struct cifsSesInfo *ses;
305 char *full_path = NULL; 276 char *full_path;
306 int xid, i; 277 int xid, i;
307 int rc = 0; 278 int rc;
308 struct vfsmount *mnt = ERR_PTR(-ENOENT); 279 struct vfsmount *mnt;
280 struct tcon_link *tlink;
309 281
310 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
311 BUG_ON(IS_ROOT(dentry)); 283 BUG_ON(IS_ROOT(mntpt));
312
313 xid = GetXid();
314
315 dput(nd->path.dentry);
316 nd->path.dentry = dget(dentry);
317
318 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
319 ses = cifs_sb->tcon->ses;
320
321 if (!ses) {
322 rc = -EINVAL;
323 goto out_err;
324 }
325 284
326 /* 285 /*
327 * The MSDFS spec states that paths in DFS referral requests and 286 * The MSDFS spec states that paths in DFS referral requests and
@@ -329,56 +288,83 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
329 * the double backslashes usually used in the UNC. This function 288 * the double backslashes usually used in the UNC. This function
330 * gives us the latter, so we must adjust the result. 289 * gives us the latter, so we must adjust the result.
331 */ 290 */
332 full_path = build_path_from_dentry(dentry); 291 mnt = ERR_PTR(-ENOMEM);
333 if (full_path == NULL) { 292 full_path = build_path_from_dentry(mntpt);
334 rc = -ENOMEM; 293 if (full_path == NULL)
335 goto out_err; 294 goto cdda_exit;
295
296 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
297 tlink = cifs_sb_tlink(cifs_sb);
298 if (IS_ERR(tlink)) {
299 mnt = ERR_CAST(tlink);
300 goto free_full_path;
336 } 301 }
302 ses = tlink_tcon(tlink)->ses;
337 303
338 rc = get_dfs_path(xid, ses , full_path + 1, cifs_sb->local_nls, 304 xid = GetXid();
305 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
339 &num_referrals, &referrals, 306 &num_referrals, &referrals,
340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 307 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
308 FreeXid(xid);
309
310 cifs_put_tlink(tlink);
341 311
312 mnt = ERR_PTR(-ENOENT);
342 for (i = 0; i < num_referrals; i++) { 313 for (i = 0; i < num_referrals; i++) {
343 int len; 314 int len;
344 dump_referral(referrals+i); 315 dump_referral(referrals + i);
345 /* connect to a node */ 316 /* connect to a node */
346 len = strlen(referrals[i].node_name); 317 len = strlen(referrals[i].node_name);
347 if (len < 2) { 318 if (len < 2) {
348 cERROR(1, "%s: Net Address path too short: %s", 319 cERROR(1, "%s: Net Address path too short: %s",
349 __func__, referrals[i].node_name); 320 __func__, referrals[i].node_name);
350 rc = -EINVAL; 321 mnt = ERR_PTR(-EINVAL);
351 goto out_err; 322 break;
352 } 323 }
353 mnt = cifs_dfs_do_refmount(cifs_sb, 324 mnt = cifs_dfs_do_refmount(cifs_sb,
354 full_path, referrals + i); 325 full_path, referrals + i);
355 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 326 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
356 referrals[i].node_name, mnt); 327 referrals[i].node_name, mnt);
357
 358 /* complete mount procedure if we acquired submount */
359 if (!IS_ERR(mnt)) 328 if (!IS_ERR(mnt))
360 break; 329 goto success;
361 } 330 }
362 331
363 /* we need it cause for() above could exit without valid submount */ 332 /* no valid submounts were found; return error from get_dfs_path() by
364 rc = PTR_ERR(mnt); 333 * preference */
365 if (IS_ERR(mnt)) 334 if (rc != 0)
366 goto out_err; 335 mnt = ERR_PTR(rc);
367 336
368 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); 337success:
369
370out:
371 FreeXid(xid);
372 free_dfs_info_array(referrals, num_referrals); 338 free_dfs_info_array(referrals, num_referrals);
339free_full_path:
373 kfree(full_path); 340 kfree(full_path);
341cdda_exit:
374 cFYI(1, "leaving %s" , __func__); 342 cFYI(1, "leaving %s" , __func__);
375 return ERR_PTR(rc); 343 return mnt;
376out_err: 344}
377 path_put(&nd->path); 345
378 goto out; 346/*
347 * Attempt to automount the referral
348 */
349struct vfsmount *cifs_dfs_d_automount(struct path *path)
350{
351 struct vfsmount *newmnt;
352
353 cFYI(1, "in %s", __func__);
354
355 newmnt = cifs_dfs_do_automount(path->dentry);
356 if (IS_ERR(newmnt)) {
357 cFYI(1, "leaving %s [automount failed]" , __func__);
358 return newmnt;
359 }
360
361 mntget(newmnt); /* prevent immediate expiration */
362 mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
363 schedule_delayed_work(&cifs_dfs_automount_task,
364 cifs_dfs_mountpoint_expiry_timeout);
365 cFYI(1, "leaving %s [ok]" , __func__);
366 return newmnt;
379} 367}
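
cifs_dfs_d_automount() targets the then-new VFS automount interface: rather than a follow_link hack that rewrote the nameidata (the code deleted above), the filesystem hands back a vfsmount and the VFS splices it into the namespace; the extra mntget() plus mnt_set_expiry() keeps the new mount alive until the expiry worker prunes it. Presumably (the wiring is in another hunk of this series, not shown here) the function is published via the dentry operations, roughly:

    /* sketch; the actual d_op hookup is elsewhere in the series */
    const struct dentry_operations cifs_dfs_dentry_ops = {
            .d_automount = cifs_dfs_d_automount, /* invoked when a walk hits the referral */
    };
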
380 368
381const struct inode_operations cifs_dfs_referral_inode_operations = { 369const struct inode_operations cifs_dfs_referral_inode_operations = {
382 .follow_link = cifs_dfs_follow_mountpoint,
383}; 370};
384
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9e771450c3b8..ac51cd2d33ae 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,6 +15,8 @@
15 * the GNU Lesser General Public License for more details. 15 * the GNU Lesser General Public License for more details.
16 * 16 *
17 */ 17 */
18#include <linux/rbtree.h>
19
18#ifndef _CIFS_FS_SB_H 20#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 21#define _CIFS_FS_SB_H
20 22
@@ -36,23 +38,30 @@
36#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ 38#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
37#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ 39#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/
38#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ 40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
39 44
40struct cifs_sb_info { 45struct cifs_sb_info {
41 struct cifsTconInfo *tcon; /* primary mount */ 46 struct rb_root tlink_tree;
42 struct list_head nested_tcon_q; 47 spinlock_t tlink_tree_lock;
48 struct tcon_link *master_tlink;
43 struct nls_table *local_nls; 49 struct nls_table *local_nls;
44 unsigned int rsize; 50 unsigned int rsize;
45 unsigned int wsize; 51 unsigned int wsize;
52 unsigned long actimeo; /* attribute cache timeout (jiffies) */
53 atomic_t active;
46 uid_t mnt_uid; 54 uid_t mnt_uid;
47 gid_t mnt_gid; 55 gid_t mnt_gid;
48 mode_t mnt_file_mode; 56 mode_t mnt_file_mode;
49 mode_t mnt_dir_mode; 57 mode_t mnt_dir_mode;
50 int mnt_cifs_flags; 58 unsigned int mnt_cifs_flags;
51 int prepathlen; 59 int prepathlen;
52 char *prepath; /* relative path under the share to mount to */ 60 char *prepath; /* relative path under the share to mount to */
53#ifdef CONFIG_CIFS_DFS_UPCALL 61#ifdef CONFIG_CIFS_DFS_UPCALL
54 char *mountdata; /* mount options received at mount time */ 62 char *mountdata; /* mount options received at mount time */
55#endif 63#endif
56 struct backing_dev_info bdi; 64 struct backing_dev_info bdi;
65 struct delayed_work prune_tlinks;
57}; 66};
58#endif /* _CIFS_FS_SB_H */ 67#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 87044906cd1f..4dfba8283165 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,6 +98,8 @@ struct key *
98cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 98cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
99{ 99{
100 struct TCP_Server_Info *server = sesInfo->server; 100 struct TCP_Server_Info *server = sesInfo->server;
101 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
102 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
101 char *description, *dp; 103 char *description, *dp;
102 size_t desc_len; 104 size_t desc_len;
103 struct key *spnego_key; 105 struct key *spnego_key;
@@ -127,10 +129,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
127 dp = description + strlen(description); 129 dp = description + strlen(description);
128 130
129 /* add the server address */ 131 /* add the server address */
130 if (server->addr.sockAddr.sin_family == AF_INET) 132 if (server->dstaddr.ss_family == AF_INET)
131 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); 133 sprintf(dp, "ip4=%pI4", &sa->sin_addr);
132 else if (server->addr.sockAddr.sin_family == AF_INET6) 134 else if (server->dstaddr.ss_family == AF_INET6)
133 sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); 135 sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
134 else 136 else
135 goto out; 137 goto out;
136 138
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 430f510a1720..fc0fd4fde306 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
44 int charlen, outlen = 0; 44 int charlen, outlen = 0;
45 int maxwords = maxbytes / 2; 45 int maxwords = maxbytes / 2;
46 char tmp[NLS_MAX_CHARSET_SIZE]; 46 char tmp[NLS_MAX_CHARSET_SIZE];
47 __u16 ftmp;
47 48
48 for (i = 0; i < maxwords && from[i]; i++) { 49 for (i = 0; i < maxwords; i++) {
49 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, 50 ftmp = get_unaligned_le16(&from[i]);
50 NLS_MAX_CHARSET_SIZE); 51 if (ftmp == 0)
52 break;
53
54 charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
51 if (charlen > 0) 55 if (charlen > 0)
52 outlen += charlen; 56 outlen += charlen;
53 else 57 else
@@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
58} 62}
59 63
60/* 64/*
61 * cifs_mapchar - convert a little-endian char to proper char in codepage 65 * cifs_mapchar - convert a host-endian char to proper char in codepage
62 * @target - where converted character should be copied 66 * @target - where converted character should be copied
63 * @src_char - 2 byte little-endian source character 67 * @src_char - 2 byte host-endian source character
64 * @cp - codepage to which character should be converted 68 * @cp - codepage to which character should be converted
65 * @mapchar - should character be mapped according to mapchars mount option? 69 * @mapchar - should character be mapped according to mapchars mount option?
66 * 70 *
@@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
69 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). 73 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
70 */ 74 */
71static int 75static int
72cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, 76cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
73 bool mapchar) 77 bool mapchar)
74{ 78{
75 int len = 1; 79 int len = 1;
@@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
82 * build_path_from_dentry are modified, as they use slash as 86 * build_path_from_dentry are modified, as they use slash as
83 * separator. 87 * separator.
84 */ 88 */
85 switch (le16_to_cpu(src_char)) { 89 switch (src_char) {
86 case UNI_COLON: 90 case UNI_COLON:
87 *target = ':'; 91 *target = ':';
88 break; 92 break;
@@ -109,8 +113,7 @@ out:
109 return len; 113 return len;
110 114
111cp_convert: 115cp_convert:
112 len = cp->uni2char(le16_to_cpu(src_char), target, 116 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
113 NLS_MAX_CHARSET_SIZE);
114 if (len <= 0) { 117 if (len <= 0) {
115 *target = '?'; 118 *target = '?';
116 len = 1; 119 len = 1;
@@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
149 int nullsize = nls_nullsize(codepage); 152 int nullsize = nls_nullsize(codepage);
150 int fromwords = fromlen / 2; 153 int fromwords = fromlen / 2;
151 char tmp[NLS_MAX_CHARSET_SIZE]; 154 char tmp[NLS_MAX_CHARSET_SIZE];
155 __u16 ftmp;
152 156
153 /* 157 /*
154 * because the chars can be of varying widths, we need to take care 158 * because the chars can be of varying widths, we need to take care
@@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
158 */ 162 */
159 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); 163 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
160 164
161 for (i = 0; i < fromwords && from[i]; i++) { 165 for (i = 0; i < fromwords; i++) {
166 ftmp = get_unaligned_le16(&from[i]);
167 if (ftmp == 0)
168 break;
169
162 /* 170 /*
163 * check to see if converting this character might make the 171 * check to see if converting this character might make the
164 * conversion bleed into the null terminator 172 * conversion bleed into the null terminator
165 */ 173 */
166 if (outlen >= safelen) { 174 if (outlen >= safelen) {
167 charlen = cifs_mapchar(tmp, from[i], codepage, mapchar); 175 charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
168 if ((outlen + charlen) > (tolen - nullsize)) 176 if ((outlen + charlen) > (tolen - nullsize))
169 break; 177 break;
170 } 178 }
171 179
172 /* put converted char into 'to' buffer */ 180 /* put converted char into 'to' buffer */
173 charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar); 181 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
174 outlen += charlen; 182 outlen += charlen;
175 } 183 }
176 184
@@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
193{ 201{
194 int charlen; 202 int charlen;
195 int i; 203 int i;
196 wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */ 204 wchar_t wchar_to; /* needed to quiet sparse */
197 205
198 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 206 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
199 207 charlen = codepage->char2uni(from, len, &wchar_to);
200 /* works for 2.4.0 kernel or later */
201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
202 if (charlen < 1) { 208 if (charlen < 1) {
203 cERROR(1, "strtoUCS: char2uni of %d returned %d", 209 cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
204 (int)*from, charlen); 210 *from, charlen);
205 /* A question mark */ 211 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 212 wchar_to = 0x003f;
207 charlen = 1; 213 charlen = 1;
208 } else 214 }
209 to[i] = cpu_to_le16(wchar_to[i]); 215 put_unaligned_le16(wchar_to, &to[i]);
210
211 } 216 }
212 217
213 to[i] = 0; 218 put_unaligned_le16(0, &to[i]);
214 return i; 219 return i;
215} 220}
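
Both conversion loops now go through get_unaligned_le16()/put_unaligned_le16() instead of dereferencing the __le16 buffer directly. That matters twice over: SMB wire buffers are not guaranteed to be 2-byte aligned, and the old wchar_t* cast stored host-endian values, which is wrong on big-endian machines. The essence of the change:

    __u16 ftmp = get_unaligned_le16(&from[i]);  /* alignment-safe load, LE to host */
    put_unaligned_le16(wchar_to, &to[i]);       /* host to LE, alignment-safe store */
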
216 221
@@ -252,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
252 return dst; 257 return dst;
253} 258}
254 259
260/*
261 * Convert 16 bit Unicode pathname to wire format from string in current code
262 * page. Conversion may involve remapping the six characters that are
263 * only legal in POSIX-like OSes (if they are present in the string). Path
264 * names are little-endian 16-bit Unicode on the wire.
265 */
266int
267cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
268 const struct nls_table *cp, int mapChars)
269{
270 int i, j, charlen;
271 int len_remaining = maxlen;
272 char src_char;
273 __u16 temp;
274
275 if (!mapChars)
276 return cifs_strtoUCS(target, source, PATH_MAX, cp);
277
278 for (i = 0, j = 0; i < maxlen; j++) {
279 src_char = source[i];
280 switch (src_char) {
281 case 0:
282 put_unaligned_le16(0, &target[j]);
283 goto ctoUCS_out;
284 case ':':
285 temp = UNI_COLON;
286 break;
287 case '*':
288 temp = UNI_ASTERIK;
289 break;
290 case '?':
291 temp = UNI_QUESTION;
292 break;
293 case '<':
294 temp = UNI_LESSTHAN;
295 break;
296 case '>':
297 temp = UNI_GRTRTHAN;
298 break;
299 case '|':
300 temp = UNI_PIPE;
301 break;
302 /*
303 * FIXME: We cannot handle remapping backslash (UNI_SLASH)
304 * until all the calls to build_path_from_dentry are modified,
305 * as they use backslash as separator.
306 */
307 default:
308 charlen = cp->char2uni(source+i, len_remaining,
309 &temp);
310 /*
311 * if no match, use question mark, which at least in
312 * some cases serves as wild card
313 */
314 if (charlen < 1) {
315 temp = 0x003f;
316 charlen = 1;
317 }
318 len_remaining -= charlen;
319 /*
320 * character may take more than one byte in the source
321 * string, but will take exactly two bytes in the
322 * target string
323 */
324 i += charlen;
325 continue;
326 }
327 put_unaligned_le16(temp, &target[j]);
328 i++; /* move to next char in source string */
329 len_remaining--;
330 }
331
332ctoUCS_out:
333 return i;
334}
335
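
The UNI_* remap targets are private-use code points in the U+F0xx range (UNI_COLON is 0xF022 if memory serves; the authoritative values live in cifs_unicode.h), following the SFU/Services-for-Mac convention. A hypothetical call, with cp standing in for some struct nls_table:

    /* "a:b" should come out as 0x0061, 0xF022, 0x0062, 0x0000 in LE UTF-16 */
    __le16 wire[4];
    cifsConvertToUCS(wire, "a:b", 4, cp, 1);
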
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 85d7cf7ff2c8..beeebf194234 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -30,8 +30,6 @@
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31 31
32 32
33#ifdef CONFIG_CIFS_EXPERIMENTAL
34
35static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { 33static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
36 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, 34 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
37 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, 35 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
@@ -43,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
43; 41;
44 42
45 43
46/* security id for everyone */ 44/* security id for everyone/world system group */
47static const struct cifs_sid sid_everyone = { 45static const struct cifs_sid sid_everyone = {
48 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; 46 1, 1, {0, 0, 0, 0, 0, 1}, {0} };
47/* security id for Authenticated Users system group */
48static const struct cifs_sid sid_authusers = {
49 1, 1, {0, 0, 0, 0, 0, 5}, {11} };
49/* group users */ 50/* group users */
50static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 51static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
51 52
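
For readers not fluent in raw SID initializers: the fields encode {revision, num_subauthorities, 48-bit authority, subauthorities[]}, so sid_everyone above is the well-known SID S-1-1-0 (World) and the newly added sid_authusers is S-1-5-11 (Authenticated Users), which is why it gets the same mode-bit treatment as Everyone in parse_dacl() below.
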
@@ -367,10 +368,14 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
367 if (num_aces > 0) { 368 if (num_aces > 0) {
368 umode_t user_mask = S_IRWXU; 369 umode_t user_mask = S_IRWXU;
369 umode_t group_mask = S_IRWXG; 370 umode_t group_mask = S_IRWXG;
370 umode_t other_mask = S_IRWXO; 371 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
371 372
372 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
373 GFP_KERNEL); 374 GFP_KERNEL);
375 if (!ppace) {
376 cERROR(1, "DACL memory allocation error");
377 return;
378 }
374 379
375 for (i = 0; i < num_aces; ++i) { 380 for (i = 0; i < num_aces; ++i) {
376 ppace[i] = (struct cifs_ace *) (acl_base + acl_size); 381 ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
@@ -392,6 +397,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
392 ppace[i]->type, 397 ppace[i]->type,
393 &fattr->cf_mode, 398 &fattr->cf_mode,
394 &other_mask); 399 &other_mask);
400 if (compare_sids(&(ppace[i]->sid), &sid_authusers))
401 access_flags_to_mode(ppace[i]->access_req,
402 ppace[i]->type,
403 &fattr->cf_mode,
404 &other_mask);
405
395 406
396/* memcpy((void *)(&(cifscred->aces[i])), 407/* memcpy((void *)(&(cifscred->aces[i])),
397 (void *)ppace[i], 408 (void *)ppace[i],
@@ -557,13 +568,20 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
557{ 568{
558 struct cifs_ntsd *pntsd = NULL; 569 struct cifs_ntsd *pntsd = NULL;
559 int xid, rc; 570 int xid, rc;
571 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
572
573 if (IS_ERR(tlink))
574 return ERR_CAST(tlink);
560 575
561 xid = GetXid(); 576 xid = GetXid();
562 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 577 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
563 FreeXid(xid); 578 FreeXid(xid);
564 579
580 cifs_put_tlink(tlink);
565 581
566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); 582 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
583 if (rc)
584 return ERR_PTR(rc);
567 return pntsd; 585 return pntsd;
568} 586}
569 587
@@ -574,28 +592,34 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
574 int oplock = 0; 592 int oplock = 0;
575 int xid, rc; 593 int xid, rc;
576 __u16 fid; 594 __u16 fid;
595 struct cifsTconInfo *tcon;
596 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
577 597
598 if (IS_ERR(tlink))
599 return ERR_CAST(tlink);
600
601 tcon = tlink_tcon(tlink);
578 xid = GetXid(); 602 xid = GetXid();
579 603
580 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0, 604 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
581 &fid, &oplock, NULL, cifs_sb->local_nls, 605 &fid, &oplock, NULL, cifs_sb->local_nls,
582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 606 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
583 if (rc) { 607 if (!rc) {
584 cERROR(1, "Unable to open file to get ACL"); 608 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
585 goto out; 609 CIFSSMBClose(xid, tcon, fid);
586 } 610 }
587 611
588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 612 cifs_put_tlink(tlink);
589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
590
591 CIFSSMBClose(xid, cifs_sb->tcon, fid);
592 out:
593 FreeXid(xid); 613 FreeXid(xid);
614
615 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
616 if (rc)
617 return ERR_PTR(rc);
594 return pntsd; 618 return pntsd;
595} 619}
596 620
597/* Retrieve an ACL from the server */ 621/* Retrieve an ACL from the server */
598static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, 622struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
599 struct inode *inode, const char *path, 623 struct inode *inode, const char *path,
600 u32 *pacllen) 624 u32 *pacllen)
601{ 625{
@@ -603,7 +627,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
603 struct cifsFileInfo *open_file = NULL; 627 struct cifsFileInfo *open_file = NULL;
604 628
605 if (inode) 629 if (inode)
606 open_file = find_readable_file(CIFS_I(inode)); 630 open_file = find_readable_file(CIFS_I(inode), true);
607 if (!open_file) 631 if (!open_file)
608 return get_cifs_acl_by_path(cifs_sb, path, pacllen); 632 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
609 633
@@ -616,10 +640,15 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
616 struct cifs_ntsd *pnntsd, u32 acllen) 640 struct cifs_ntsd *pnntsd, u32 acllen)
617{ 641{
618 int xid, rc; 642 int xid, rc;
643 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
644
645 if (IS_ERR(tlink))
646 return PTR_ERR(tlink);
619 647
620 xid = GetXid(); 648 xid = GetXid();
621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 649 rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
622 FreeXid(xid); 650 FreeXid(xid);
651 cifs_put_tlink(tlink);
623 652
624 cFYI(DBG2, "SetCIFSACL rc = %d", rc); 653 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
625 return rc; 654 return rc;
@@ -631,10 +660,16 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
631 int oplock = 0; 660 int oplock = 0;
632 int xid, rc; 661 int xid, rc;
633 __u16 fid; 662 __u16 fid;
663 struct cifsTconInfo *tcon;
664 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
665
666 if (IS_ERR(tlink))
667 return PTR_ERR(tlink);
634 668
669 tcon = tlink_tcon(tlink);
635 xid = GetXid(); 670 xid = GetXid();
636 671
637 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0, 672 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
638 &fid, &oplock, NULL, cifs_sb->local_nls, 673 &fid, &oplock, NULL, cifs_sb->local_nls,
639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 674 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
640 if (rc) { 675 if (rc) {
@@ -642,12 +677,13 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
642 goto out; 677 goto out;
643 } 678 }
644 679
645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 680 rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
646 cFYI(DBG2, "SetCIFSACL rc = %d", rc); 681 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
647 682
648 CIFSSMBClose(xid, cifs_sb->tcon, fid); 683 CIFSSMBClose(xid, tcon, fid);
649 out: 684out:
650 FreeXid(xid); 685 FreeXid(xid);
686 cifs_put_tlink(tlink);
651 return rc; 687 return rc;
652} 688}
653 689
@@ -661,7 +697,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
661 697
662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); 698 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
663 699
664 open_file = find_readable_file(CIFS_I(inode)); 700 open_file = find_readable_file(CIFS_I(inode), true);
665 if (!open_file) 701 if (!open_file)
666 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); 702 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
667 703
@@ -671,7 +707,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
671} 707}
672 708
673/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ 709int
674void 710int
675cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, 711cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
676 struct inode *inode, const char *path, const __u16 *pfid) 712 struct inode *inode, const char *path, const __u16 *pfid)
677{ 713{
@@ -687,17 +723,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
687 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); 723 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
688 724
689 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ 725 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
690 if (pntsd) 726 if (IS_ERR(pntsd)) {
727 rc = PTR_ERR(pntsd);
728 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
729 } else {
691 rc = parse_sec_desc(pntsd, acllen, fattr); 730 rc = parse_sec_desc(pntsd, acllen, fattr);
692 if (rc) 731 kfree(pntsd);
693 cFYI(1, "parse sec desc failed rc = %d", rc); 732 if (rc)
733 cERROR(1, "parse sec desc failed rc = %d", rc);
734 }
694 735
695 kfree(pntsd); 736 return rc;
696 return;
697} 737}
698 738
699/* Convert mode bits to an ACL so we can update the ACL on the server */ 739/* Convert mode bits to an ACL so we can update the ACL on the server */
700int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) 740int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
701{ 741{
702 int rc = 0; 742 int rc = 0;
703 __u32 secdesclen = 0; 743 __u32 secdesclen = 0;
@@ -712,7 +752,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
712 /* Add three ACEs for owner, group, everyone getting rid of 752 /* Add three ACEs for owner, group, everyone getting rid of
713 other ACEs as chmod disables ACEs and set the security descriptor */ 753 other ACEs as chmod disables ACEs and set the security descriptor */
714 754
715 if (pntsd) { 755 if (IS_ERR(pntsd)) {
756 rc = PTR_ERR(pntsd);
757 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
758 } else {
716 /* allocate memory for the smb header, 759 /* allocate memory for the smb header,
717 set security descriptor request security descriptor 760 set security descriptor request security descriptor
718 parameters, and security descriptor itself */ 761 parameters, and security descriptor itself */
@@ -742,4 +785,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
742 785
743 return rc; 786 return rc;
744} 787}
745#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 6c8096cf5155..c4ae7d036563 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -74,11 +74,7 @@ struct cifs_wksid {
74 char sidname[SIDNAMELENGTH]; 74 char sidname[SIDNAMELENGTH];
75} __attribute__((packed)); 75} __attribute__((packed));
76 76
77#ifdef CONFIG_CIFS_EXPERIMENTAL
78
79extern int match_sid(struct cifs_sid *); 77extern int match_sid(struct cifs_sid *);
80extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); 78extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
81 79
82#endif /* CONFIG_CIFS_EXPERIMENTAL */
83
84#endif /* _CIFSACL_H */ 80#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 35042d8f7338..a51585f9852b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,9 +24,9 @@
24#include "cifspdu.h" 24#include "cifspdu.h"
25#include "cifsglob.h" 25#include "cifsglob.h"
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27#include "md5.h"
28#include "cifs_unicode.h" 27#include "cifs_unicode.h"
29#include "cifsproto.h" 28#include "cifsproto.h"
29#include "ntlmssp.h"
30#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/random.h> 31#include <linux/random.h>
32 32
@@ -36,27 +36,37 @@
36/* Note that the smb header signature field on input contains the 36/* Note that the smb header signature field on input contains the
37 sequence number before this function is called */ 37 sequence number before this function is called */
38 38
39extern void mdfour(unsigned char *out, unsigned char *in, int n);
40extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
41extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
42 unsigned char *p24);
43
44static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 39static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
45 const struct mac_key *key, char *signature) 40 struct TCP_Server_Info *server, char *signature)
46{ 41{
47 struct MD5Context context; 42 int rc;
48 43
49 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) 44 if (cifs_pdu == NULL || signature == NULL || server == NULL)
50 return -EINVAL; 45 return -EINVAL;
51 46
52 cifs_MD5_init(&context); 47 if (!server->secmech.sdescmd5) {
53 cifs_MD5_update(&context, (char *)&key->data, key->len); 48 cERROR(1, "%s: Can't generate signature\n", __func__);
54 cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); 49 return -1;
50 }
51
52 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
53 if (rc) {
54 cERROR(1, "%s: Could not init md5\n", __func__);
55 return rc;
56 }
57
58 crypto_shash_update(&server->secmech.sdescmd5->shash,
59 server->session_key.response, server->session_key.len);
60
61 crypto_shash_update(&server->secmech.sdescmd5->shash,
62 cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
63
64 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
55 65
56 cifs_MD5_final(signature, &context);
57 return 0; 66 return 0;
58} 67}
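The conversion above moves signing from the private cifs_MD5_* helpers to the kernel crypto shash API, which always follows the same init/update/final sequence over a preallocated descriptor. A minimal sketch of that sequence, with error checking on the update calls (the helper name smb_md5_digest is illustrative, not part of this patch):

#include <crypto/hash.h>

/* Hypothetical helper: MD5 over key material followed by an SMB PDU,
 * using a shash descriptor such as server->secmech.sdescmd5->shash. */
static int smb_md5_digest(struct shash_desc *desc,
			  const u8 *key, unsigned int klen,
			  const u8 *pdu, unsigned int plen, u8 *out)
{
	int rc;

	rc = crypto_shash_init(desc);		/* reset the hash state */
	if (rc)
		return rc;
	rc = crypto_shash_update(desc, key, klen);	/* session key first */
	if (rc)
		return rc;
	rc = crypto_shash_update(desc, pdu, plen);	/* then the PDU bytes */
	if (rc)
		return rc;
	return crypto_shash_final(desc, out);	/* 16-byte MD5 digest */
}

Note that the patch itself ignores the return value of crypto_shash_update(); checking it, as above, would be slightly more robust.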
59 68
69/* must be called with server->srv_mutex held */
60int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, 70int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
61 __u32 *pexpected_response_sequence_number) 71 __u32 *pexpected_response_sequence_number)
62{ 72{
@@ -69,17 +79,14 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
69 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 79 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
70 return rc; 80 return rc;
71 81
72 spin_lock(&GlobalMid_Lock);
73 cifs_pdu->Signature.Sequence.SequenceNumber = 82 cifs_pdu->Signature.Sequence.SequenceNumber =
74 cpu_to_le32(server->sequence_number); 83 cpu_to_le32(server->sequence_number);
75 cifs_pdu->Signature.Sequence.Reserved = 0; 84 cifs_pdu->Signature.Sequence.Reserved = 0;
76 85
77 *pexpected_response_sequence_number = server->sequence_number++; 86 *pexpected_response_sequence_number = server->sequence_number++;
78 server->sequence_number++; 87 server->sequence_number++;
79 spin_unlock(&GlobalMid_Lock);
80 88
81 rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key, 89 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
82 smb_signature);
83 if (rc) 90 if (rc)
84 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 91 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
85 else 92 else
@@ -89,16 +96,28 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
89} 96}
90 97
91static int cifs_calc_signature2(const struct kvec *iov, int n_vec, 98static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
92 const struct mac_key *key, char *signature) 99 struct TCP_Server_Info *server, char *signature)
93{ 100{
94 struct MD5Context context;
95 int i; 101 int i;
102 int rc;
96 103
97 if ((iov == NULL) || (signature == NULL) || (key == NULL)) 104 if (iov == NULL || signature == NULL || server == NULL)
98 return -EINVAL; 105 return -EINVAL;
99 106
100 cifs_MD5_init(&context); 107 if (!server->secmech.sdescmd5) {
101 cifs_MD5_update(&context, (char *)&key->data, key->len); 108 cERROR(1, "%s: Can't generate signature\n", __func__);
109 return -1;
110 }
111
112 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
113 if (rc) {
114 cERROR(1, "%s: Could not init md5\n", __func__);
115 return rc;
116 }
117
118 crypto_shash_update(&server->secmech.sdescmd5->shash,
119 server->session_key.response, server->session_key.len);
120
102 for (i = 0; i < n_vec; i++) { 121 for (i = 0; i < n_vec; i++) {
103 if (iov[i].iov_len == 0) 122 if (iov[i].iov_len == 0)
104 continue; 123 continue;
@@ -111,18 +130,19 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
111 if (i == 0) { 130 if (i == 0) {
112 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ 131 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
113 break; /* nothing to sign or corrupt header */ 132 break; /* nothing to sign or corrupt header */
114 cifs_MD5_update(&context, iov[0].iov_base+4, 133 crypto_shash_update(&server->secmech.sdescmd5->shash,
115 iov[0].iov_len-4); 134 iov[i].iov_base + 4, iov[i].iov_len - 4);
116 } else 135 } else
117 cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len); 136 crypto_shash_update(&server->secmech.sdescmd5->shash,
137 iov[i].iov_base, iov[i].iov_len);
118 } 138 }
119 139
120 cifs_MD5_final(signature, &context); 140 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
121 141
122 return 0; 142 return rc;
123} 143}
124 144
125 145/* must be called with server->srv_mutex held */
126int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, 146int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
127 __u32 *pexpected_response_sequence_number) 147 __u32 *pexpected_response_sequence_number)
128{ 148{
@@ -136,17 +156,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
136 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 156 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
137 return rc; 157 return rc;
138 158
139 spin_lock(&GlobalMid_Lock);
140 cifs_pdu->Signature.Sequence.SequenceNumber = 159 cifs_pdu->Signature.Sequence.SequenceNumber =
141 cpu_to_le32(server->sequence_number); 160 cpu_to_le32(server->sequence_number);
142 cifs_pdu->Signature.Sequence.Reserved = 0; 161 cifs_pdu->Signature.Sequence.Reserved = 0;
143 162
144 *pexpected_response_sequence_number = server->sequence_number++; 163 *pexpected_response_sequence_number = server->sequence_number++;
145 server->sequence_number++; 164 server->sequence_number++;
146 spin_unlock(&GlobalMid_Lock);
147 165
148 rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key, 166 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
149 smb_signature);
150 if (rc) 167 if (rc)
151 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 168 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
152 else 169 else
@@ -156,14 +173,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
156} 173}
157 174
158int cifs_verify_signature(struct smb_hdr *cifs_pdu, 175int cifs_verify_signature(struct smb_hdr *cifs_pdu,
159 const struct mac_key *mac_key, 176 struct TCP_Server_Info *server,
160 __u32 expected_sequence_number) 177 __u32 expected_sequence_number)
161{ 178{
162 unsigned int rc; 179 unsigned int rc;
163 char server_response_sig[8]; 180 char server_response_sig[8];
164 char what_we_think_sig_should_be[20]; 181 char what_we_think_sig_should_be[20];
165 182
166 if ((cifs_pdu == NULL) || (mac_key == NULL)) 183 if (cifs_pdu == NULL || server == NULL)
167 return -EINVAL; 184 return -EINVAL;
168 185
169 if (cifs_pdu->Command == SMB_COM_NEGOTIATE) 186 if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +209,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
192 cpu_to_le32(expected_sequence_number); 209 cpu_to_le32(expected_sequence_number);
193 cifs_pdu->Signature.Sequence.Reserved = 0; 210 cifs_pdu->Signature.Sequence.Reserved = 0;
194 211
195 rc = cifs_calculate_signature(cifs_pdu, mac_key, 212 rc = cifs_calculate_signature(cifs_pdu, server,
196 what_we_think_sig_should_be); 213 what_we_think_sig_should_be);
197 214
198 if (rc) 215 if (rc)
@@ -208,19 +225,43 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
208 225
209} 226}
210 227
211/* We fill in key by putting in 40 byte array which was allocated by caller */ 228 /* first calculate the 24-byte ntlm response and then the 16-byte session key */
212int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 229int setup_ntlm_response(struct cifsSesInfo *ses)
213 const char *password)
214{ 230{
215 char temp_key[16]; 231 int rc = 0;
216 if ((key == NULL) || (rn == NULL)) 232 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
233 char temp_key[CIFS_SESS_KEY_SIZE];
234
235 if (!ses)
217 return -EINVAL; 236 return -EINVAL;
218 237
219 E_md4hash(password, temp_key); 238 ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
220 mdfour(key->data.ntlm, temp_key, 16); 239 if (!ses->auth_key.response) {
221 memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE); 240 cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
222 key->len = 40; 241 return -ENOMEM;
223 return 0; 242 }
243 ses->auth_key.len = temp_len;
244
245 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
246 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
247 if (rc) {
248 cFYI(1, "%s Can't generate NTLM response, error: %d",
249 __func__, rc);
250 return rc;
251 }
252
253 rc = E_md4hash(ses->password, temp_key);
254 if (rc) {
255 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
256 return rc;
257 }
258
259 rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
260 if (rc)
261 cFYI(1, "%s Can't generate NTLM session key, error: %d",
262 __func__, rc);
263
264 return rc;
224} 265}
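After setup_ntlm_response() the auth_key.response buffer carries the session key first and the challenge response after it. A sketch of the layout, assuming the sizes in use at the time (CIFS_SESS_KEY_SIZE = 16, CIFS_AUTH_RESP_SIZE = 24):

/*
 * ses->auth_key.response after setup_ntlm_response():
 *
 *   offset 0                          16                          40
 *   +---------------------------------+----------------------------+
 *   | NTLM session key                | NTLM challenge response    |
 *   | mdfour(E_md4hash(password))     | SMBNTencrypt(password,     |
 *   |                                 |              cryptkey)     |
 *   | CIFS_SESS_KEY_SIZE (16) bytes   | CIFS_AUTH_RESP_SIZE (24)   |
 *   +---------------------------------+----------------------------+
 */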
225 266
226#ifdef CONFIG_CIFS_WEAK_PW_HASH 267#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -262,109 +303,457 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
262} 303}
263#endif /* CIFS_WEAK_PW_HASH */ 304#endif /* CIFS_WEAK_PW_HASH */
264 305
265static int calc_ntlmv2_hash(struct cifsSesInfo *ses, 306/* Build a proper attribute value/target info pairs blob.
307 * Fill in the netbios and dns domain names and the workstation name,
308 * and the client time (total: five av pairs plus one end-of-fields indicator).
309 * Allocate domain name which gets freed when session struct is deallocated.
310 */
311static int
312build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
313{
314 unsigned int dlen;
315 unsigned int wlen;
316 unsigned int size = 6 * sizeof(struct ntlmssp2_name);
317 __le64 curtime;
318 char *defdmname = "WORKGROUP";
319 unsigned char *blobptr;
320 struct ntlmssp2_name *attrptr;
321
322 if (!ses->domainName) {
323 ses->domainName = kstrdup(defdmname, GFP_KERNEL);
324 if (!ses->domainName)
325 return -ENOMEM;
326 }
327
328 dlen = strlen(ses->domainName);
329 wlen = strlen(ses->server->hostname);
330
331 /* The length of this blob is:
332 * six times the size of the attribute header (name/size) structure +
333 * twice the unicode length of the domain name (which appears twice) +
334 * twice the unicode length of the server name (which appears twice) +
335 * the size of a timestamp (8 bytes).
336 */
337 ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
338 ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
339 if (!ses->auth_key.response) {
340 ses->auth_key.len = 0;
341 cERROR(1, "Challenge target info allocation failure");
342 return -ENOMEM;
343 }
344
345 blobptr = ses->auth_key.response;
346 attrptr = (struct ntlmssp2_name *) blobptr;
347
348 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
349 attrptr->length = cpu_to_le16(2 * dlen);
350 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
351 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
352
353 blobptr += 2 * dlen;
354 attrptr = (struct ntlmssp2_name *) blobptr;
355
356 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
357 attrptr->length = cpu_to_le16(2 * wlen);
358 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
359 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
360
361 blobptr += 2 * wlen;
362 attrptr = (struct ntlmssp2_name *) blobptr;
363
364 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
365 attrptr->length = cpu_to_le16(2 * dlen);
366 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
367 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
368
369 blobptr += 2 * dlen;
370 attrptr = (struct ntlmssp2_name *) blobptr;
371
372 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
373 attrptr->length = cpu_to_le16(2 * wlen);
374 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
375 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
376
377 blobptr += 2 * wlen;
378 attrptr = (struct ntlmssp2_name *) blobptr;
379
380 attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
381 attrptr->length = cpu_to_le16(sizeof(__le64));
382 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
383 curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
384 memcpy(blobptr, &curtime, sizeof(__le64));
385
386 return 0;
387}
388
389/* Server has provided av pairs/target info in the type 2 challenge
390 * packet, and we have plucked it and stored it within the smb session.
391 * We parse that blob here to find the netbios domain name to be used
392 * as part of ntlmv2 authentication (in the Target String), if not already
393 * specified on the command line.
394 * If this function returns without error but without fetching a
395 * domain name, authentication may fail against some servers but
396 * succeed against others (those that are not particular about the
397 * target string; for some, just the user name might suffice).
398 */
399static int
400find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
401{
402 unsigned int attrsize;
403 unsigned int type;
404 unsigned int onesize = sizeof(struct ntlmssp2_name);
405 unsigned char *blobptr;
406 unsigned char *blobend;
407 struct ntlmssp2_name *attrptr;
408
409 if (!ses->auth_key.len || !ses->auth_key.response)
410 return 0;
411
412 blobptr = ses->auth_key.response;
413 blobend = blobptr + ses->auth_key.len;
414
415 while (blobptr + onesize < blobend) {
416 attrptr = (struct ntlmssp2_name *) blobptr;
417 type = le16_to_cpu(attrptr->type);
418 if (type == NTLMSSP_AV_EOL)
419 break;
420 blobptr += 2; /* advance attr type */
421 attrsize = le16_to_cpu(attrptr->length);
422 blobptr += 2; /* advance attr size */
423 if (blobptr + attrsize > blobend)
424 break;
425 if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
426 if (!attrsize)
427 break;
428 if (!ses->domainName) {
429 ses->domainName =
430 kmalloc(attrsize + 1, GFP_KERNEL);
431 if (!ses->domainName)
432 return -ENOMEM;
433 cifs_from_ucs2(ses->domainName,
434 (__le16 *)blobptr, attrsize, attrsize,
435 nls_cp, false);
436 break;
437 }
438 }
439 blobptr += attrsize; /* advance attr value */
440 }
441
442 return 0;
443}
444
445static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
266 const struct nls_table *nls_cp) 446 const struct nls_table *nls_cp)
267{ 447{
268 int rc = 0; 448 int rc = 0;
269 int len; 449 int len;
270 char nt_hash[16]; 450 char nt_hash[CIFS_NTHASH_SIZE];
271 struct HMACMD5Context *pctxt;
272 wchar_t *user; 451 wchar_t *user;
273 wchar_t *domain; 452 wchar_t *domain;
453 wchar_t *server;
274 454
275 pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL); 455 if (!ses->server->secmech.sdeschmacmd5) {
276 456 cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
277 if (pctxt == NULL) 457 return -1;
278 return -ENOMEM; 458 }
279 459
280 /* calculate md4 hash of password */ 460 /* calculate md4 hash of password */
281 E_md4hash(ses->password, nt_hash); 461 E_md4hash(ses->password, nt_hash);
282 462
283 /* convert Domainname to unicode and uppercase */ 463 crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
284 hmac_md5_init_limK_to_64(nt_hash, 16, pctxt); 464 CIFS_NTHASH_SIZE);
465
466 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
467 if (rc) {
468 cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
469 return rc;
470 }
285 471
286 /* convert ses->userName to unicode and uppercase */ 472 /* convert ses->userName to unicode and uppercase */
287 len = strlen(ses->userName); 473 len = strlen(ses->userName);
288 user = kmalloc(2 + (len * 2), GFP_KERNEL); 474 user = kmalloc(2 + (len * 2), GFP_KERNEL);
289 if (user == NULL) 475 if (user == NULL) {
476 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
477 rc = -ENOMEM;
290 goto calc_exit_2; 478 goto calc_exit_2;
479 }
291 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); 480 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
292 UniStrupr(user); 481 UniStrupr(user);
293 hmac_md5_update((char *)user, 2*len, pctxt); 482
483 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
484 (char *)user, 2 * len);
294 485
295 /* convert ses->domainName to unicode and uppercase */ 486 /* convert ses->domainName to unicode and uppercase */
296 if (ses->domainName) { 487 if (ses->domainName) {
297 len = strlen(ses->domainName); 488 len = strlen(ses->domainName);
298 489
299 domain = kmalloc(2 + (len * 2), GFP_KERNEL); 490 domain = kmalloc(2 + (len * 2), GFP_KERNEL);
300 if (domain == NULL) 491 if (domain == NULL) {
492 cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
493 rc = -ENOMEM;
301 goto calc_exit_1; 494 goto calc_exit_1;
495 }
302 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, 496 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
303 nls_cp); 497 nls_cp);
304 /* the following line was removed since it didn't work well 498 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
305 with lower cased domain name that passed as an option. 499 (char *)domain, 2 * len);
306 Maybe converting the domain name earlier makes sense */
307 /* UniStrupr(domain); */
308
309 hmac_md5_update((char *)domain, 2*len, pctxt);
310
311 kfree(domain); 500 kfree(domain);
501 } else if (ses->serverName) {
502 len = strlen(ses->serverName);
503
504 server = kmalloc(2 + (len * 2), GFP_KERNEL);
505 if (server == NULL) {
506 cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
507 rc = -ENOMEM;
508 goto calc_exit_1;
509 }
510 len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
511 nls_cp);
512 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
513 (char *)server, 2 * len);
514 kfree(server);
312 } 515 }
516
517 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
518 ntlmv2_hash);
519
313calc_exit_1: 520calc_exit_1:
314 kfree(user); 521 kfree(user);
315calc_exit_2: 522calc_exit_2:
316 /* BB FIXME what about bytes 24 through 40 of the signing key? 523 return rc;
317 compare with the NTLM example */ 524}
318 hmac_md5_final(ses->server->ntlmv2_hash, pctxt); 525
526static int
527CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
528{
529 int rc;
530 unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
531
532 if (!ses->server->secmech.sdeschmacmd5) {
533 cERROR(1, "CalcNTLMv2_response: can't generate ntlmv2 hash\n");
534 return -1;
535 }
536
537 crypto_shash_setkey(ses->server->secmech.hmacmd5,
538 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
539
540 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
541 if (rc) {
542 cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
543 return rc;
544 }
545
546 if (ses->server->secType == RawNTLMSSP)
547 memcpy(ses->auth_key.response + offset,
548 ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
549 else
550 memcpy(ses->auth_key.response + offset,
551 ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
552 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
553 ses->auth_key.response + offset, ses->auth_key.len - offset);
554
555 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
556 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
319 557
320 kfree(pctxt);
321 return rc; 558 return rc;
322} 559}
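In MS-NLMP terms, CalcNTLMv2_response() computes NTProofStr = HMAC-MD5(ntlmv2_hash, server_challenge || blob). The code stages the 8-byte challenge in the scratch space just ahead of the blob, hashes from there to the end of auth_key, and writes the 16-byte result back over the scratch. A sketch of the hashed region, assuming CIFS_SESS_KEY_SIZE = 16 and that struct ntlmv2_resp leads with the 16-byte NTProofStr field, as the offset arithmetic here implies:

/*
 *   auth_key.response:
 *
 *   0         16          24         32                        len
 *   +---------+-----------+----------+--------------------------+
 *   | session |  NTProofStr field    | blob: signature, time,   |
 *   | key     |           | 8-byte   | client challenge,        |
 *   | (later) |           | challenge| target info ...          |
 *   +---------+-----------+----------+--------------------------+
 *                          \___ HMAC-MD5(ntlmv2_hash, ...) ____/
 *
 *   The hash runs over [24, len): challenge || blob.  Its 16-byte
 *   result (NTProofStr) is then written at [16, 32), overwriting
 *   the staged challenge.
 */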
323 560
324void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, 561
325 const struct nls_table *nls_cp) 562int
563setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
326{ 564{
327 int rc; 565 int rc;
328 struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; 566 int baselen;
329 struct HMACMD5Context context; 567 unsigned int tilen;
568 struct ntlmv2_resp *buf;
569 char ntlmv2_hash[16];
570 unsigned char *tiblob = NULL; /* target info blob */
571
572 if (ses->server->secType == RawNTLMSSP) {
573 if (!ses->domainName) {
574 rc = find_domain_name(ses, nls_cp);
575 if (rc) {
576 cERROR(1, "error %d finding domain name", rc);
577 goto setup_ntlmv2_rsp_ret;
578 }
579 }
580 } else {
581 rc = build_avpair_blob(ses, nls_cp);
582 if (rc) {
583 cERROR(1, "error %d building av pair blob", rc);
584 goto setup_ntlmv2_rsp_ret;
585 }
586 }
587
588 baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
589 tilen = ses->auth_key.len;
590 tiblob = ses->auth_key.response;
330 591
592 ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
593 if (!ses->auth_key.response) {
594 rc = -ENOMEM;
595 ses->auth_key.len = 0;
596 cERROR(1, "%s: Can't allocate auth blob", __func__);
597 goto setup_ntlmv2_rsp_ret;
598 }
599 ses->auth_key.len += baselen;
600
601 buf = (struct ntlmv2_resp *)
602 (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
331 buf->blob_signature = cpu_to_le32(0x00000101); 603 buf->blob_signature = cpu_to_le32(0x00000101);
332 buf->reserved = 0; 604 buf->reserved = 0;
333 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 605 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
334 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); 606 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
335 buf->reserved2 = 0; 607 buf->reserved2 = 0;
336 buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
337 buf->names[0].length = 0;
338 buf->names[1].type = 0;
339 buf->names[1].length = 0;
340 608
341 /* calculate buf->ntlmv2_hash */ 609 memcpy(ses->auth_key.response + baselen, tiblob, tilen);
342 rc = calc_ntlmv2_hash(ses, nls_cp); 610
343 if (rc) 611 /* calculate ntlmv2_hash */
612 rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
613 if (rc) {
344 cERROR(1, "could not get v2 hash rc %d", rc); 614 cERROR(1, "could not get v2 hash rc %d", rc);
345 CalcNTLMv2_response(ses, resp_buf); 615 goto setup_ntlmv2_rsp_ret;
616 }
617
618 /* calculate first part of the client response (CR1) */
619 rc = CalcNTLMv2_response(ses, ntlmv2_hash);
620 if (rc) {
621 cERROR(1, "Could not calculate CR1 rc: %d", rc);
622 goto setup_ntlmv2_rsp_ret;
623 }
624
625 /* now calculate the session key for NTLMv2 */
626 crypto_shash_setkey(ses->server->secmech.hmacmd5,
627 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
628
629 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
630 if (rc) {
631 cERROR(1, "%s: Could not init hmacmd5\n", __func__);
632 goto setup_ntlmv2_rsp_ret;
633 }
634
635 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
636 ses->auth_key.response + CIFS_SESS_KEY_SIZE,
637 CIFS_HMAC_MD5_HASH_SIZE);
638
639 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
640 ses->auth_key.response);
641
642setup_ntlmv2_rsp_ret:
643 kfree(tiblob);
644
645 return rc;
646}
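Putting the pieces together, the final auth_key built by setup_ntlmv2_rsp() is the session key, then the ntlmv2_resp header carrying NTProofStr, then the target info blob, with the session key derived last. A sketch, under the same size assumptions as above:

/*
 *   0          16                     baselen                  len
 *   +----------+----------------------+-------------------------+
 *   | NTLMv2   | ntlmv2_resp:         | target info (tiblob):   |
 *   | session  | NTProofStr, blob     | av pairs built locally  |
 *   | key      | signature, time,     | or taken from the       |
 *   |          | client challenge     | server's type 2 packet  |
 *   +----------+----------------------+-------------------------+
 *
 *   NTProofStr  = HMAC-MD5(ntlmv2_hash, server challenge || blob)
 *   session key = HMAC-MD5(ntlmv2_hash, NTProofStr)
 */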
647
648int
649calc_seckey(struct cifsSesInfo *ses)
650{
651 int rc;
652 struct crypto_blkcipher *tfm_arc4;
653 struct scatterlist sgin, sgout;
654 struct blkcipher_desc desc;
655 unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
656
657 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
658
659 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
660 if (IS_ERR(tfm_arc4)) {
661 rc = PTR_ERR(tfm_arc4);
662 cERROR(1, "could not allocate crypto API arc4\n");
663 return rc;
664 }
665
666 desc.tfm = tfm_arc4;
667
668 crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
669 CIFS_SESS_KEY_SIZE);
670
671 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
672 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
673
674 rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
675 if (rc) {
676 cERROR(1, "could not encrypt session key rc: %d\n", rc);
677 crypto_free_blkcipher(tfm_arc4);
678 return rc;
679 }
680
681 /* make secondary_key/nonce as session key */
682 memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
683 /* and make len as that of session key only */
684 ses->auth_key.len = CIFS_SESS_KEY_SIZE;
685
686 crypto_free_blkcipher(tfm_arc4);
687
688 return 0;
689}
690
691void
692cifs_crypto_shash_release(struct TCP_Server_Info *server)
693{
694 if (server->secmech.md5)
695 crypto_free_shash(server->secmech.md5);
346 696
347 /* now calculate the MAC key for NTLMv2 */ 697 if (server->secmech.hmacmd5)
348 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); 698 crypto_free_shash(server->secmech.hmacmd5);
349 hmac_md5_update(resp_buf, 16, &context);
350 hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context);
351 699
352 memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf, 700 kfree(server->secmech.sdeschmacmd5);
353 sizeof(struct ntlmv2_resp)); 701
354 ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp); 702 kfree(server->secmech.sdescmd5);
355} 703}
356 704
357void CalcNTLMv2_response(const struct cifsSesInfo *ses, 705int
358 char *v2_session_response) 706cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
359{ 707{
360 struct HMACMD5Context context; 708 int rc;
361 /* rest of v2 struct already generated */ 709 unsigned int size;
362 memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
363 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
364 710
365 hmac_md5_update(v2_session_response+8, 711 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
366 sizeof(struct ntlmv2_resp) - 8, &context); 712 if (IS_ERR(server->secmech.hmacmd5)) {
713 cERROR(1, "could not allocate crypto hmacmd5\n");
714 return PTR_ERR(server->secmech.hmacmd5);
715 }
716
717 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
718 if (IS_ERR(server->secmech.md5)) {
719 cERROR(1, "could not allocate crypto md5\n");
720 rc = PTR_ERR(server->secmech.md5);
721 goto crypto_allocate_md5_fail;
722 }
723
724 size = sizeof(struct shash_desc) +
725 crypto_shash_descsize(server->secmech.hmacmd5);
726 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
727 if (!server->secmech.sdeschmacmd5) {
728 cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
729 rc = -ENOMEM;
730 goto crypto_allocate_hmacmd5_sdesc_fail;
731 }
732 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
733 server->secmech.sdeschmacmd5->shash.flags = 0x0;
367 734
368 hmac_md5_final(v2_session_response, &context); 735
369/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ 736 size = sizeof(struct shash_desc) +
737 crypto_shash_descsize(server->secmech.md5);
738 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
739 if (!server->secmech.sdescmd5) {
740 cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
741 rc = -ENOMEM;
742 goto crypto_allocate_md5_sdesc_fail;
743 }
744 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
745 server->secmech.sdescmd5->shash.flags = 0x0;
746
747 return 0;
748
749crypto_allocate_md5_sdesc_fail:
750 kfree(server->secmech.sdeschmacmd5);
751
752crypto_allocate_hmacmd5_sdesc_fail:
753 crypto_free_shash(server->secmech.md5);
754
755crypto_allocate_md5_fail:
756 crypto_free_shash(server->secmech.hmacmd5);
757
758 return rc;
370} 759}
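The allocate routine repeats the same tfm-plus-descriptor pairing for each algorithm. A hypothetical helper making the pattern explicit (not part of this patch; later kernels added a similar cifs_alloc_hash()):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Illustrative only: allocate a shash tfm and its struct sdesc wrapper. */
static int alloc_shash_sdesc(const char *alg, struct crypto_shash **tfm,
			     struct sdesc **sdesc)
{
	unsigned int size;

	*tfm = crypto_alloc_shash(alg, 0, 0);
	if (IS_ERR(*tfm))
		return PTR_ERR(*tfm);

	size = sizeof(struct shash_desc) + crypto_shash_descsize(*tfm);
	*sdesc = kmalloc(size, GFP_KERNEL);
	if (!*sdesc) {
		crypto_free_shash(*tfm);
		*tfm = NULL;
		return -ENOMEM;
	}
	(*sdesc)->shash.tfm = *tfm;
	(*sdesc)->shash.flags = 0x0;
	return 0;
}

With such a helper, cifs_crypto_shash_allocate() would reduce to two calls plus a single unwind path.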
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
deleted file mode 100644
index 15d2ec006474..000000000000
--- a/fs/cifs/cifsencrypt.h
+++ /dev/null
@@ -1,33 +0,0 @@
1/*
2 * fs/cifs/cifsencrypt.h
3 *
4 * Copyright (c) International Business Machines Corp., 2005
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * Externs for misc. small encryption routines
8 * so we do not have to put them in cifsproto.h
9 *
10 * This library is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License as published
12 * by the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
18 * the GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25/* md4.c */
26extern void mdfour(unsigned char *out, unsigned char *in, int n);
27/* smbdes.c */
28extern void E_P16(unsigned char *p14, unsigned char *p16);
29extern void E_P24(unsigned char *p21, const unsigned char *c8,
30 unsigned char *p24);
31
32
33
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b7431afdd76d..f2970136d17d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,7 +35,7 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/smp_lock.h> 38#include <net/ipv6.h>
39#include "cifsfs.h" 39#include "cifsfs.h"
40#include "cifspdu.h" 40#include "cifspdu.h"
41#define DECLARE_GLOBALS_HERE 41#define DECLARE_GLOBALS_HERE
@@ -77,11 +77,33 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
77module_param(cifs_max_pending, int, 0); 77module_param(cifs_max_pending, int, 0);
78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
79 "Default: 50 Range: 2 to 256"); 79 "Default: 50 Range: 2 to 256");
80 80unsigned short echo_retries = 5;
81module_param(echo_retries, ushort, 0644);
82MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
83 "reconnecting server. Default: 5. 0 means "
84 "never reconnect.");
81extern mempool_t *cifs_sm_req_poolp; 85extern mempool_t *cifs_sm_req_poolp;
82extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
83extern mempool_t *cifs_mid_poolp; 87extern mempool_t *cifs_mid_poolp;
84 88
89void
90cifs_sb_active(struct super_block *sb)
91{
92 struct cifs_sb_info *server = CIFS_SB(sb);
93
94 if (atomic_inc_return(&server->active) == 1)
95 atomic_inc(&sb->s_active);
96}
97
98void
99cifs_sb_deactive(struct super_block *sb)
100{
101 struct cifs_sb_info *server = CIFS_SB(sb);
102
103 if (atomic_dec_and_test(&server->active))
104 deactivate_super(sb);
105}
106
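cifs_sb_active()/cifs_sb_deactive() let deferred work pin the superblock so it cannot be torn down underneath an outstanding work item. A sketch of the intended pairing (the work item shown is hypothetical):

/* Illustrative only: keep the sb mounted while work on an inode is queued. */
static void cifs_defer_inode_work(struct inode *inode,
				  struct work_struct *work)
{
	cifs_sb_active(inode->i_sb);	/* first user takes an s_active ref */
	queue_work(system_wq, work);	/* handler must end by calling      */
	/* cifs_sb_deactive(inode->i_sb) once the inode is no longer used   */
}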
85static int 107static int
86cifs_read_super(struct super_block *sb, void *data, 108cifs_read_super(struct super_block *sb, void *data,
87 const char *devname, int silent) 109 const char *devname, int silent)
@@ -97,6 +119,9 @@ cifs_read_super(struct super_block *sb, void *data,
97 if (cifs_sb == NULL) 119 if (cifs_sb == NULL)
98 return -ENOMEM; 120 return -ENOMEM;
99 121
122 spin_lock_init(&cifs_sb->tlink_tree_lock);
123 cifs_sb->tlink_tree = RB_ROOT;
124
100 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 125 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
101 if (rc) { 126 if (rc) {
102 kfree(cifs_sb); 127 kfree(cifs_sb);
@@ -136,9 +161,6 @@ cifs_read_super(struct super_block *sb, void *data,
136 sb->s_magic = CIFS_MAGIC_NUMBER; 161 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 162 sb->s_op = &cifs_super_ops;
138 sb->s_bdi = &cifs_sb->bdi; 163 sb->s_bdi = &cifs_sb->bdi;
139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
140 sb->s_blocksize =
141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
142 sb->s_blocksize = CIFS_MAX_MSGSIZE; 164 sb->s_blocksize = CIFS_MAX_MSGSIZE;
143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 165 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
144 inode = cifs_root_iget(sb, ROOT_I); 166 inode = cifs_root_iget(sb, ROOT_I);
@@ -156,6 +178,12 @@ cifs_read_super(struct super_block *sb, void *data,
156 goto out_no_root; 178 goto out_no_root;
157 } 179 }
158 180
181 /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
182 if (cifs_sb_master_tcon(cifs_sb)->nocase)
183 sb->s_d_op = &cifs_ci_dentry_ops;
184 else
185 sb->s_d_op = &cifs_dentry_ops;
186
159#ifdef CONFIG_CIFS_EXPERIMENTAL 187#ifdef CONFIG_CIFS_EXPERIMENTAL
160 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 188 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
161 cFYI(1, "export ops supported"); 189 cFYI(1, "export ops supported");
@@ -200,8 +228,6 @@ cifs_put_super(struct super_block *sb)
200 return; 228 return;
201 } 229 }
202 230
203 lock_kernel();
204
205 rc = cifs_umount(sb, cifs_sb); 231 rc = cifs_umount(sb, cifs_sb);
206 if (rc) 232 if (rc)
207 cERROR(1, "cifs_umount failed with return code %d", rc); 233 cERROR(1, "cifs_umount failed with return code %d", rc);
@@ -215,8 +241,6 @@ cifs_put_super(struct super_block *sb)
215 unload_nls(cifs_sb->local_nls); 241 unload_nls(cifs_sb->local_nls);
216 bdi_destroy(&cifs_sb->bdi); 242 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 243 kfree(cifs_sb);
218
219 unlock_kernel();
220} 244}
221 245
222static int 246static int
@@ -224,7 +248,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
224{ 248{
225 struct super_block *sb = dentry->d_sb; 249 struct super_block *sb = dentry->d_sb;
226 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 250 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
227 struct cifsTconInfo *tcon = cifs_sb->tcon; 251 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
228 int rc = -EOPNOTSUPP; 252 int rc = -EOPNOTSUPP;
229 int xid; 253 int xid;
230 254
@@ -269,10 +293,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
269 return 0; 293 return 0;
270} 294}
271 295
272static int cifs_permission(struct inode *inode, int mask) 296static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
273{ 297{
274 struct cifs_sb_info *cifs_sb; 298 struct cifs_sb_info *cifs_sb;
275 299
300 if (flags & IPERM_FLAG_RCU)
301 return -ECHILD;
302
276 cifs_sb = CIFS_SB(inode->i_sb); 303 cifs_sb = CIFS_SB(inode->i_sb);
277 304
278 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { 305 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
@@ -284,7 +311,7 @@ static int cifs_permission(struct inode *inode, int mask)
284 on the client (above and beyond ACL on servers) for 311 on the client (above and beyond ACL on servers) for
285 servers which do not support setting and viewing mode bits, 312 servers which do not support setting and viewing mode bits,
286 so allowing client to check permissions is useful */ 313 so allowing client to check permissions is useful */
287 return generic_permission(inode, mask, NULL); 314 return generic_permission(inode, mask, flags, NULL);
288} 315}
289 316
290static struct kmem_cache *cifs_inode_cachep; 317static struct kmem_cache *cifs_inode_cachep;
@@ -304,16 +331,16 @@ cifs_alloc_inode(struct super_block *sb)
304 return NULL; 331 return NULL;
305 cifs_inode->cifsAttrs = 0x20; /* default */ 332 cifs_inode->cifsAttrs = 0x20; /* default */
306 cifs_inode->time = 0; 333 cifs_inode->time = 0;
307 cifs_inode->write_behind_rc = 0;
308 /* Until the file is open and we have gotten oplock 334 /* Until the file is open and we have gotten oplock
309 info back from the server, can not assume caching of 335 info back from the server, can not assume caching of
310 file data or metadata */ 336 file data or metadata */
311 cifs_inode->clientCanCacheRead = false; 337 cifs_set_oplock_level(cifs_inode, 0);
312 cifs_inode->clientCanCacheAll = false;
313 cifs_inode->delete_pending = false; 338 cifs_inode->delete_pending = false;
314 cifs_inode->invalid_mapping = false; 339 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 340 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 341 cifs_inode->server_eof = 0;
342 cifs_inode->uniqueid = 0;
343 cifs_inode->createtime = 0;
317 344
318 /* Can not set i_flags here - they get immediately overwritten 345 /* Can not set i_flags here - they get immediately overwritten
319 to zero by the VFS */ 346 to zero by the VFS */
@@ -322,10 +349,17 @@ cifs_alloc_inode(struct super_block *sb)
322 return &cifs_inode->vfs_inode; 349 return &cifs_inode->vfs_inode;
323} 350}
324 351
352static void cifs_i_callback(struct rcu_head *head)
353{
354 struct inode *inode = container_of(head, struct inode, i_rcu);
355 INIT_LIST_HEAD(&inode->i_dentry);
356 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
357}
358
325static void 359static void
326cifs_destroy_inode(struct inode *inode) 360cifs_destroy_inode(struct inode *inode)
327{ 361{
328 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); 362 call_rcu(&inode->i_rcu, cifs_i_callback);
329} 363}
330 364
331static void 365static void
@@ -339,18 +373,19 @@ cifs_evict_inode(struct inode *inode)
339static void 373static void
340cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) 374cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
341{ 375{
376 struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
377 struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
378
342 seq_printf(s, ",addr="); 379 seq_printf(s, ",addr=");
343 380
344 switch (server->addr.sockAddr.sin_family) { 381 switch (server->dstaddr.ss_family) {
345 case AF_INET: 382 case AF_INET:
346 seq_printf(s, "%pI4", &server->addr.sockAddr.sin_addr.s_addr); 383 seq_printf(s, "%pI4", &sa->sin_addr.s_addr);
347 break; 384 break;
348 case AF_INET6: 385 case AF_INET6:
349 seq_printf(s, "%pI6", 386 seq_printf(s, "%pI6", &sa6->sin6_addr.s6_addr);
350 &server->addr.sockAddr6.sin6_addr.s6_addr); 387 if (sa6->sin6_scope_id)
351 if (server->addr.sockAddr6.sin6_scope_id) 388 seq_printf(s, "%%%u", sa6->sin6_scope_id);
352 seq_printf(s, "%%%u",
353 server->addr.sockAddr6.sin6_scope_id);
354 break; 389 break;
355 default: 390 default:
356 seq_printf(s, "(unknown)"); 391 seq_printf(s, "(unknown)");
@@ -366,14 +401,36 @@ static int
366cifs_show_options(struct seq_file *s, struct vfsmount *m) 401cifs_show_options(struct seq_file *s, struct vfsmount *m)
367{ 402{
368 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); 403 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
369 struct cifsTconInfo *tcon = cifs_sb->tcon; 404 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
405 struct sockaddr *srcaddr;
406 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
370 407
371 seq_printf(s, ",unc=%s", tcon->treeName); 408 seq_printf(s, ",unc=%s", tcon->treeName);
372 if (tcon->ses->userName) 409
410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
411 seq_printf(s, ",multiuser");
412 else if (tcon->ses->userName)
373 seq_printf(s, ",username=%s", tcon->ses->userName); 413 seq_printf(s, ",username=%s", tcon->ses->userName);
414
374 if (tcon->ses->domainName) 415 if (tcon->ses->domainName)
375 seq_printf(s, ",domain=%s", tcon->ses->domainName); 416 seq_printf(s, ",domain=%s", tcon->ses->domainName);
376 417
418 if (srcaddr->sa_family != AF_UNSPEC) {
419 struct sockaddr_in *saddr4;
420 struct sockaddr_in6 *saddr6;
421 saddr4 = (struct sockaddr_in *)srcaddr;
422 saddr6 = (struct sockaddr_in6 *)srcaddr;
423 if (srcaddr->sa_family == AF_INET6)
424 seq_printf(s, ",srcaddr=%pI6c",
425 &saddr6->sin6_addr);
426 else if (srcaddr->sa_family == AF_INET)
427 seq_printf(s, ",srcaddr=%pI4",
428 &saddr4->sin_addr.s_addr);
429 else
430 seq_printf(s, ",srcaddr=BAD-AF:%i",
431 (int)(srcaddr->sa_family));
432 }
433
377 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); 434 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
378 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) 435 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
379 seq_printf(s, ",forceuid"); 436 seq_printf(s, ",forceuid");
@@ -422,9 +479,15 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
422 seq_printf(s, ",dynperm"); 479 seq_printf(s, ",dynperm");
423 if (m->mnt_sb->s_flags & MS_POSIXACL) 480 if (m->mnt_sb->s_flags & MS_POSIXACL)
424 seq_printf(s, ",acl"); 481 seq_printf(s, ",acl");
482 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
483 seq_printf(s, ",mfsymlinks");
484 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
485 seq_printf(s, ",fsc");
425 486
426 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 487 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
427 seq_printf(s, ",wsize=%d", cifs_sb->wsize); 488 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
489 /* convert actimeo and display it in seconds */
490 seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
428 491
429 return 0; 492 return 0;
430} 493}
@@ -437,20 +500,18 @@ static void cifs_umount_begin(struct super_block *sb)
437 if (cifs_sb == NULL) 500 if (cifs_sb == NULL)
438 return; 501 return;
439 502
440 tcon = cifs_sb->tcon; 503 tcon = cifs_sb_master_tcon(cifs_sb);
441 if (tcon == NULL)
442 return;
443 504
444 read_lock(&cifs_tcp_ses_lock); 505 spin_lock(&cifs_tcp_ses_lock);
445 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { 506 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
446 /* we have other mounts to same share or we have 507 /* we have other mounts to same share or we have
447 already tried to force umount this and woken up 508 already tried to force umount this and woken up
448 all waiting network requests, nothing to do */ 509 all waiting network requests, nothing to do */
449 read_unlock(&cifs_tcp_ses_lock); 510 spin_unlock(&cifs_tcp_ses_lock);
450 return; 511 return;
451 } else if (tcon->tc_count == 1) 512 } else if (tcon->tc_count == 1)
452 tcon->tidStatus = CifsExiting; 513 tcon->tidStatus = CifsExiting;
453 read_unlock(&cifs_tcp_ses_lock); 514 spin_unlock(&cifs_tcp_ses_lock);
454 515
455 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 516 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
456 /* cancel_notify_requests(tcon); */ 517 /* cancel_notify_requests(tcon); */
@@ -509,28 +570,29 @@ static const struct super_operations cifs_super_ops = {
509#endif 570#endif
510}; 571};
511 572
512static int 573static struct dentry *
513cifs_get_sb(struct file_system_type *fs_type, 574cifs_do_mount(struct file_system_type *fs_type,
514 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 575 int flags, const char *dev_name, void *data)
515{ 576{
516 int rc; 577 int rc;
517 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 578 struct super_block *sb;
579
580 sb = sget(fs_type, NULL, set_anon_super, NULL);
518 581
519 cFYI(1, "Devname: %s flags: %d ", dev_name, flags); 582 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
520 583
521 if (IS_ERR(sb)) 584 if (IS_ERR(sb))
522 return PTR_ERR(sb); 585 return ERR_CAST(sb);
523 586
524 sb->s_flags = flags; 587 sb->s_flags = flags;
525 588
526 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0); 589 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
527 if (rc) { 590 if (rc) {
528 deactivate_locked_super(sb); 591 deactivate_locked_super(sb);
529 return rc; 592 return ERR_PTR(rc);
530 } 593 }
531 sb->s_flags |= MS_ACTIVE; 594 sb->s_flags |= MS_ACTIVE;
532 simple_set_mnt(mnt, sb); 595 return dget(sb->s_root);
533 return 0;
534} 596}
535 597
536static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 598static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -538,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
538{ 600{
539 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 601 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
540 ssize_t written; 602 ssize_t written;
603 int rc;
541 604
542 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 605 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
543 if (!CIFS_I(inode)->clientCanCacheAll) 606
544 filemap_fdatawrite(inode->i_mapping); 607 if (CIFS_I(inode)->clientCanCacheAll)
608 return written;
609
610 rc = filemap_fdatawrite(inode->i_mapping);
611 if (rc)
612 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
613
545 return written; 614 return written;
546} 615}
547 616
@@ -565,9 +634,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
565 634
566static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 635static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
567{ 636{
568 /* note that this is called by vfs setlease with the BKL held 637 /* note that this is called by vfs setlease with lock_flocks held
569 although I doubt that BKL is needed here in cifs */ 638 to protect *lease from going away */
570 struct inode *inode = file->f_path.dentry->d_inode; 639 struct inode *inode = file->f_path.dentry->d_inode;
640 struct cifsFileInfo *cfile = file->private_data;
571 641
572 if (!(S_ISREG(inode->i_mode))) 642 if (!(S_ISREG(inode->i_mode)))
573 return -EINVAL; 643 return -EINVAL;
@@ -578,8 +648,8 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
578 ((arg == F_WRLCK) && 648 ((arg == F_WRLCK) &&
579 (CIFS_I(inode)->clientCanCacheAll))) 649 (CIFS_I(inode)->clientCanCacheAll)))
580 return generic_setlease(file, arg, lease); 650 return generic_setlease(file, arg, lease);
581 else if (CIFS_SB(inode->i_sb)->tcon->local_lease && 651 else if (tlink_tcon(cfile->tlink)->local_lease &&
582 !CIFS_I(inode)->clientCanCacheRead) 652 !CIFS_I(inode)->clientCanCacheRead)
583 /* If the server claims to support oplock on this 653 /* If the server claims to support oplock on this
584 file, then we still need to check oplock even 654 file, then we still need to check oplock even
585 if the local_lease mount option is set, but there 655 if the local_lease mount option is set, but there
@@ -595,7 +665,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
595struct file_system_type cifs_fs_type = { 665struct file_system_type cifs_fs_type = {
596 .owner = THIS_MODULE, 666 .owner = THIS_MODULE,
597 .name = "cifs", 667 .name = "cifs",
598 .get_sb = cifs_get_sb, 668 .mount = cifs_do_mount,
599 .kill_sb = kill_anon_super, 669 .kill_sb = kill_anon_super,
600 /* .fs_flags */ 670 /* .fs_flags */
601}; 671};
@@ -670,6 +740,25 @@ const struct file_operations cifs_file_ops = {
670 .setlease = cifs_setlease, 740 .setlease = cifs_setlease,
671}; 741};
672 742
743const struct file_operations cifs_file_strict_ops = {
744 .read = do_sync_read,
745 .write = do_sync_write,
746 .aio_read = cifs_strict_readv,
747 .aio_write = cifs_strict_writev,
748 .open = cifs_open,
749 .release = cifs_close,
750 .lock = cifs_lock,
751 .fsync = cifs_strict_fsync,
752 .flush = cifs_flush,
753 .mmap = cifs_file_strict_mmap,
754 .splice_read = generic_file_splice_read,
755 .llseek = cifs_llseek,
756#ifdef CONFIG_CIFS_POSIX
757 .unlocked_ioctl = cifs_ioctl,
758#endif /* CONFIG_CIFS_POSIX */
759 .setlease = cifs_setlease,
760};
761
673const struct file_operations cifs_file_direct_ops = { 762const struct file_operations cifs_file_direct_ops = {
674 /* no aio, no readv - 763 /* no aio, no readv -
675 BB reevaluate whether they can be done with directio, no cache */ 764 BB reevaluate whether they can be done with directio, no cache */
@@ -688,6 +777,7 @@ const struct file_operations cifs_file_direct_ops = {
688 .llseek = cifs_llseek, 777 .llseek = cifs_llseek,
689 .setlease = cifs_setlease, 778 .setlease = cifs_setlease,
690}; 779};
780
691const struct file_operations cifs_file_nobrl_ops = { 781const struct file_operations cifs_file_nobrl_ops = {
692 .read = do_sync_read, 782 .read = do_sync_read,
693 .write = do_sync_write, 783 .write = do_sync_write,
@@ -706,6 +796,24 @@ const struct file_operations cifs_file_nobrl_ops = {
706 .setlease = cifs_setlease, 796 .setlease = cifs_setlease,
707}; 797};
708 798
799const struct file_operations cifs_file_strict_nobrl_ops = {
800 .read = do_sync_read,
801 .write = do_sync_write,
802 .aio_read = cifs_strict_readv,
803 .aio_write = cifs_strict_writev,
804 .open = cifs_open,
805 .release = cifs_close,
806 .fsync = cifs_strict_fsync,
807 .flush = cifs_flush,
808 .mmap = cifs_file_strict_mmap,
809 .splice_read = generic_file_splice_read,
810 .llseek = cifs_llseek,
811#ifdef CONFIG_CIFS_POSIX
812 .unlocked_ioctl = cifs_ioctl,
813#endif /* CONFIG_CIFS_POSIX */
814 .setlease = cifs_setlease,
815};
816
709const struct file_operations cifs_file_direct_nobrl_ops = { 817const struct file_operations cifs_file_direct_nobrl_ops = {
710 /* no mmap, no aio, no readv - 818 /* no mmap, no aio, no readv -
711 BB reevaluate whether they can be done with directio, no cache */ 819 BB reevaluate whether they can be done with directio, no cache */
@@ -897,9 +1005,8 @@ init_cifs(void)
897 GlobalCurrentXid = 0; 1005 GlobalCurrentXid = 0;
898 GlobalTotalActiveXid = 0; 1006 GlobalTotalActiveXid = 0;
899 GlobalMaxActiveXid = 0; 1007 GlobalMaxActiveXid = 0;
900 memset(Local_System_Name, 0, 15); 1008 spin_lock_init(&cifs_tcp_ses_lock);
901 rwlock_init(&GlobalSMBSeslock); 1009 spin_lock_init(&cifs_file_list_lock);
902 rwlock_init(&cifs_tcp_ses_lock);
903 spin_lock_init(&GlobalMid_Lock); 1010 spin_lock_init(&GlobalMid_Lock);
904 1011
905 if (cifs_max_pending < 2) { 1012 if (cifs_max_pending < 2) {
@@ -912,11 +1019,11 @@ init_cifs(void)
912 1019
913 rc = cifs_fscache_register(); 1020 rc = cifs_fscache_register();
914 if (rc) 1021 if (rc)
915 goto out; 1022 goto out_clean_proc;
916 1023
917 rc = cifs_init_inodecache(); 1024 rc = cifs_init_inodecache();
918 if (rc) 1025 if (rc)
919 goto out_clean_proc; 1026 goto out_unreg_fscache;
920 1027
921 rc = cifs_init_mids(); 1028 rc = cifs_init_mids();
922 if (rc) 1029 if (rc)
@@ -938,19 +1045,19 @@ init_cifs(void)
938 return 0; 1045 return 0;
939 1046
940#ifdef CONFIG_CIFS_UPCALL 1047#ifdef CONFIG_CIFS_UPCALL
941 out_unregister_filesystem: 1048out_unregister_filesystem:
942 unregister_filesystem(&cifs_fs_type); 1049 unregister_filesystem(&cifs_fs_type);
943#endif 1050#endif
944 out_destroy_request_bufs: 1051out_destroy_request_bufs:
945 cifs_destroy_request_bufs(); 1052 cifs_destroy_request_bufs();
946 out_destroy_mids: 1053out_destroy_mids:
947 cifs_destroy_mids(); 1054 cifs_destroy_mids();
948 out_destroy_inodecache: 1055out_destroy_inodecache:
949 cifs_destroy_inodecache(); 1056 cifs_destroy_inodecache();
950 out_clean_proc: 1057out_unreg_fscache:
951 cifs_proc_clean();
952 cifs_fscache_unregister(); 1058 cifs_fscache_unregister();
953 out: 1059out_clean_proc:
1060 cifs_proc_clean();
954 return rc; 1061 return rc;
955} 1062}
956 1063
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d82f5fb4761e..4a3330235d55 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -42,10 +42,8 @@ extern const struct address_space_operations cifs_addr_ops;
42extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
43 43
44/* Functions related to super block operations */ 44/* Functions related to super block operations */
45/* extern const struct super_operations cifs_super_ops;*/ 45extern void cifs_sb_active(struct super_block *sb);
46extern void cifs_read_inode(struct inode *); 46extern void cifs_sb_deactive(struct super_block *sb);
47/*extern void cifs_delete_inode(struct inode *);*/ /* BB not needed yet */
48/* extern void cifs_write_inode(struct inode *); */ /* BB not needed yet */
49 47
50/* Functions related to inodes */ 48/* Functions related to inodes */
51extern const struct inode_operations cifs_dir_inode_ops; 49extern const struct inode_operations cifs_dir_inode_ops;
@@ -63,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 61 struct dentry *);
64extern int cifs_revalidate_file(struct file *filp); 62extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *); 63extern int cifs_revalidate_dentry(struct dentry *);
64extern void cifs_invalidate_mapping(struct inode *inode);
66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
67extern int cifs_setattr(struct dentry *, struct iattr *); 66extern int cifs_setattr(struct dentry *, struct iattr *);
68 67
@@ -74,19 +73,27 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations;
74/* Functions related to files and directories */ 73/* Functions related to files and directories */
75extern const struct file_operations cifs_file_ops; 74extern const struct file_operations cifs_file_ops;
76extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ 75extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
77extern const struct file_operations cifs_file_nobrl_ops; 76extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
78extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */ 77extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
78extern const struct file_operations cifs_file_direct_nobrl_ops;
79extern const struct file_operations cifs_file_strict_nobrl_ops;
79extern int cifs_open(struct inode *inode, struct file *file); 80extern int cifs_open(struct inode *inode, struct file *file);
80extern int cifs_close(struct inode *inode, struct file *file); 81extern int cifs_close(struct inode *inode, struct file *file);
81extern int cifs_closedir(struct inode *inode, struct file *file); 82extern int cifs_closedir(struct inode *inode, struct file *file);
82extern ssize_t cifs_user_read(struct file *file, char __user *read_data, 83extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
83 size_t read_size, loff_t *poffset); 84 size_t read_size, loff_t *poffset);
85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
86 unsigned long nr_segs, loff_t pos);
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 88 size_t write_size, loff_t *poffset);
89extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
90 unsigned long nr_segs, loff_t pos);
86extern int cifs_lock(struct file *, int, struct file_lock *); 91extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, int); 92extern int cifs_fsync(struct file *, int);
93extern int cifs_strict_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 94extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 95extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
96extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 97extern const struct file_operations cifs_dir_ops;
91extern int cifs_dir_open(struct inode *inode, struct file *file); 98extern int cifs_dir_open(struct inode *inode, struct file *file);
92extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 99extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
@@ -95,6 +102,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
95extern const struct dentry_operations cifs_dentry_ops; 102extern const struct dentry_operations cifs_dentry_ops;
96extern const struct dentry_operations cifs_ci_dentry_ops; 103extern const struct dentry_operations cifs_ci_dentry_ops;
97 104
105#ifdef CONFIG_CIFS_DFS_UPCALL
106extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
107#else
108#define cifs_dfs_d_automount NULL
109#endif
110
98/* Functions related to symlinks */ 111/* Functions related to symlinks */
99extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); 112extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
100extern void cifs_put_link(struct dentry *direntry, 113extern void cifs_put_link(struct dentry *direntry,
@@ -104,7 +117,7 @@ extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
104extern int cifs_symlink(struct inode *inode, struct dentry *direntry, 117extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
105 const char *symname); 118 const char *symname);
106extern int cifs_removexattr(struct dentry *, const char *); 119extern int cifs_removexattr(struct dentry *, const char *);
107extern int cifs_setxattr(struct dentry *, const char *, const void *, 120extern int cifs_setxattr(struct dentry *, const char *, const void *,
108 size_t, int); 121 size_t, int);
109extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); 122extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
110extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 123extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
@@ -114,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
114extern const struct export_operations cifs_export_ops; 127extern const struct export_operations cifs_export_ops;
115#endif /* EXPERIMENTAL */ 128#endif /* EXPERIMENTAL */
116 129
117#define CIFS_VERSION "1.65" 130#define CIFS_VERSION "1.70"
118#endif /* _CIFSFS_H */ 131#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac6..edd5b29b53c9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include "cifs_fs_sb.h" 26#include "cifs_fs_sb.h"
27#include "cifsacl.h" 27#include "cifsacl.h"
28#include <crypto/internal/hash.h>
29#include <linux/scatterlist.h>
30
28/* 31/*
29 * The sizes of various internal tables and strings 32 * The sizes of various internal tables and strings
30 */ 33 */
@@ -42,6 +45,16 @@
42#define CIFS_MIN_RCV_POOL 4 45#define CIFS_MIN_RCV_POOL 4
43 46
44/* 47/*
48 * default attribute cache timeout (jiffies)
49 */
50#define CIFS_DEF_ACTIMEO (1 * HZ)
51
52/*
53 * max attribute cache timeout (jiffies) - 2^30
54 */
55#define CIFS_MAX_ACTIMEO (1 << 30)
56
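These defines bound how long cached inode attributes may be trusted before revalidation. A minimal sketch of the jiffies-based check such a timeout implies, assuming a cifsInodeInfo-like structure whose `time` field records the jiffies of the last attribute update (the helper name and the actimeo parameter are illustrative; mount code would clamp actimeo between CIFS_DEF_ACTIMEO and CIFS_MAX_ACTIMEO):

    /* Sketch: true while cached attributes are still within the timeout. */
    static bool cifs_attrs_fresh(struct cifsInodeInfo *cinode,
                                 unsigned long actimeo)
    {
            /* time_before() copes with jiffies wraparound */
            return time_before(jiffies, cinode->time + actimeo);
    }
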
57/*
45 * MAX_REQ is the maximum number of requests that WE will send 58 * MAX_REQ is the maximum number of requests that WE will send
46 * on one socket concurrently. It also matches the most common 59 * on one socket concurrently. It also matches the most common
47 * value of max multiplex returned by servers. We may 60 * value of max multiplex returned by servers. We may
@@ -74,7 +87,7 @@
74 * CIFS vfs client Status information (based on what we know.) 87 * CIFS vfs client Status information (based on what we know.)
75 */ 88 */
76 89
77 /* associated with each tcp and smb session */ 90/* associated with each tcp and smb session */
78enum statusEnum { 91enum statusEnum {
79 CifsNew = 0, 92 CifsNew = 0,
80 CifsGood, 93 CifsGood,
@@ -97,16 +110,31 @@ enum protocolEnum {
97 /* Netbios frames protocol not supported at this time */ 110 /* Netbios frames protocol not supported at this time */
98}; 111};
99 112
100struct mac_key { 113struct session_key {
101 unsigned int len; 114 unsigned int len;
102 union { 115 char *response;
103 char ntlm[CIFS_SESS_KEY_SIZE + 16]; 116};
104 char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */ 117
105 struct { 118/* crypto security descriptor definition */
106 char key[16]; 119struct sdesc {
107 struct ntlmv2_resp resp; 120 struct shash_desc shash;
108 } ntlmv2; 121 char ctx[];
109 } data; 122};
123
124/* crypto hashing related structure/fields, not specific to a sec mech */
125struct cifs_secmech {
126 struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
127 struct crypto_shash *md5; /* md5 hash function */
128 struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
129 struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
130};
131
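The sdesc layout above is the usual idiom for carrying a shash_desc and the transform's per-request context in a single allocation. A minimal sketch of how such a descriptor is typically set up (helper name illustrative, error paths trimmed):

    /* Sketch: size the allocation from the tfm's descsize so that
     * ctx[] can hold the request context. */
    static struct sdesc *cifs_alloc_sdesc(struct crypto_shash *tfm)
    {
            struct sdesc *sdesc;

            sdesc = kmalloc(sizeof(*sdesc) + crypto_shash_descsize(tfm),
                            GFP_KERNEL);
            if (!sdesc)
                    return NULL;
            sdesc->shash.tfm = tfm;
            sdesc->shash.flags = 0x0;
            return sdesc;
    }
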
132/* per smb session structure/fields */
133struct ntlmssp_auth {
134 __u32 client_flags; /* sent by client in type 1 ntlmssp exchange */
135 __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
136 unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
137 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
110}; 138};
111 139
112struct cifs_cred { 140struct cifs_cred {
@@ -133,34 +161,27 @@ struct TCP_Server_Info {
133 int srv_count; /* reference counter */ 161 int srv_count; /* reference counter */
134 /* 15 character server name + 0x20 16th byte indicating type = srv */ 162 /* 15 character server name + 0x20 16th byte indicating type = srv */
135 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 163 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
164 enum statusEnum tcpStatus; /* what we think the status is */
136 char *hostname; /* hostname portion of UNC string */ 165 char *hostname; /* hostname portion of UNC string */
137 struct socket *ssocket; 166 struct socket *ssocket;
138 union { 167 struct sockaddr_storage dstaddr;
139 struct sockaddr_in sockAddr; 168 struct sockaddr_storage srcaddr; /* locally bind to this IP */
140 struct sockaddr_in6 sockAddr6; 169#ifdef CONFIG_NET_NS
141 } addr; 170 struct net *net;
171#endif
142 wait_queue_head_t response_q; 172 wait_queue_head_t response_q;
143 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 173 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
144 struct list_head pending_mid_q; 174 struct list_head pending_mid_q;
145 void *Server_NlsInfo; /* BB - placeholder for future NLS info */
146 unsigned short server_codepage; /* codepage for the server */
147 enum protocolEnum protocolType;
148 char versionMajor;
149 char versionMinor;
150 bool svlocal:1; /* local server or remote */
151 bool noblocksnd; /* use blocking sendmsg */ 175 bool noblocksnd; /* use blocking sendmsg */
152 bool noautotune; /* do not autotune send buf sizes */ 176 bool noautotune; /* do not autotune send buf sizes */
153 bool tcp_nodelay; 177 bool tcp_nodelay;
154 atomic_t inFlight; /* number of requests on the wire to server */ 178 atomic_t inFlight; /* number of requests on the wire to server */
155#ifdef CONFIG_CIFS_STATS2
156 atomic_t inSend; /* requests trying to send */
157 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
158#endif
159 enum statusEnum tcpStatus; /* what we think the status is */
160 struct mutex srv_mutex; 179 struct mutex srv_mutex;
161 struct task_struct *tsk; 180 struct task_struct *tsk;
162 char server_GUID[16]; 181 char server_GUID[16];
163 char secMode; 182 char secMode;
183 bool session_estab; /* mark when very first sess is established */
184 u16 dialect; /* dialect index that server chose */
164 enum securityEnum secType; 185 enum securityEnum secType;
165 unsigned int maxReq; /* Clients should submit no more */ 186 unsigned int maxReq; /* Clients should submit no more */
166 /* than maxReq distinct unanswered SMBs to the server when using */ 187 /* than maxReq distinct unanswered SMBs to the server when using */
@@ -173,30 +194,62 @@ struct TCP_Server_Info {
173 unsigned int max_vcs; /* maximum number of smb sessions, at least 194 unsigned int max_vcs; /* maximum number of smb sessions, at least
174 those that can be specified uniquely with 195 those that can be specified uniquely with
175 vcnumbers */ 196 vcnumbers */
176 char sessid[4]; /* unique token id for this session */
177 /* (returned on Negotiate */
178 int capabilities; /* allow selective disabling of caps by smb sess */ 197 int capabilities; /* allow selective disabling of caps by smb sess */
179 int timeAdj; /* Adjust for difference in server time zone in sec */ 198 int timeAdj; /* Adjust for difference in server time zone in sec */
180 __u16 CurrentMid; /* multiplex id - rotating counter */ 199 __u16 CurrentMid; /* multiplex id - rotating counter */
181 char cryptKey[CIFS_CRYPTO_KEY_SIZE]; 200 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
182 /* 16th byte of RFC1001 workstation name is always null */ 201 /* 16th byte of RFC1001 workstation name is always null */
183 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 202 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
184 __u32 sequence_number; /* needed for CIFS PDU signature */ 203 __u32 sequence_number; /* for signing, protected by srv_mutex */
185 struct mac_key mac_signing_key; 204 struct session_key session_key;
186 char ntlmv2_hash[16];
187 unsigned long lstrp; /* when we got last response from this server */ 205 unsigned long lstrp; /* when we got last response from this server */
188 u16 dialect; /* dialect index that server chose */ 206 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
189 /* extended security flavors that server supports */ 207 /* extended security flavors that server supports */
208 bool sec_ntlmssp; /* supports NTLMSSP */
209 bool sec_kerberosu2u; /* supports U2U Kerberos */
190 bool sec_kerberos; /* supports plain Kerberos */ 210 bool sec_kerberos; /* supports plain Kerberos */
191 bool sec_mskerberos; /* supports legacy MS Kerberos */ 211 bool sec_mskerberos; /* supports legacy MS Kerberos */
192 bool sec_kerberosu2u; /* supports U2U Kerberos */ 212 struct delayed_work echo; /* echo ping workqueue job */
193 bool sec_ntlmssp; /* supports NTLMSSP */
194#ifdef CONFIG_CIFS_FSCACHE 213#ifdef CONFIG_CIFS_FSCACHE
195 struct fscache_cookie *fscache; /* client index cache cookie */ 214 struct fscache_cookie *fscache; /* client index cache cookie */
196#endif 215#endif
216#ifdef CONFIG_CIFS_STATS2
217 atomic_t inSend; /* requests trying to send */
218 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
219#endif
197}; 220};
198 221
199/* 222/*
223 * Macros to allow the TCP_Server_Info->net field and related code to drop out
224 * when CONFIG_NET_NS isn't set.
225 */
226
227#ifdef CONFIG_NET_NS
228
229static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
230{
231 return srv->net;
232}
233
234static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
235{
236 srv->net = net;
237}
238
239#else
240
241static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
242{
243 return &init_net;
244}
245
246static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
247{
248}
249
250#endif
251
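These accessors let the rest of the client reference the owning namespace unconditionally; without CONFIG_NET_NS they collapse to &init_net and a no-op store. An illustrative use in a socket-creation path (the call site shown is a sketch, not the actual connect code):

    /* Sketch: create the transport socket in the server's namespace. */
    rc = __sock_create(cifs_net_ns(server), PF_INET, SOCK_STREAM,
                       IPPROTO_TCP, &server->ssocket, 1);
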
252/*
200 * Session structure. One of these for each uid session with a particular host 253 * Session structure. One of these for each uid session with a particular host
201 */ 254 */
202struct cifsSesInfo { 255struct cifsSesInfo {
@@ -222,6 +275,8 @@ struct cifsSesInfo {
222 char userName[MAX_USERNAME_SIZE + 1]; 275 char userName[MAX_USERNAME_SIZE + 1];
223 char *domainName; 276 char *domainName;
224 char *password; 277 char *password;
278 struct session_key auth_key;
279 struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
225 bool need_reconnect:1; /* connection reset, uid now invalid */ 280 bool need_reconnect:1; /* connection reset, uid now invalid */
226}; 281};
227/* no more than one of the following three session flags may be set */ 282/* no more than one of the following three session flags may be set */
@@ -308,6 +363,45 @@ struct cifsTconInfo {
308}; 363};
309 364
310/* 365/*
366 * This is a refcounted and timestamped container for a tcon pointer. The
367 * container holds a tcon reference. It is considered safe to free one of
368 * these when the tl_count goes to 0. The tl_time is the time of the last
369 * "get" on the container.
370 */
371struct tcon_link {
372 struct rb_node tl_rbnode;
373 uid_t tl_uid;
374 unsigned long tl_flags;
375#define TCON_LINK_MASTER 0
376#define TCON_LINK_PENDING 1
377#define TCON_LINK_IN_TREE 2
378 unsigned long tl_time;
379 atomic_t tl_count;
380 struct cifsTconInfo *tl_tcon;
381};
382
383extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
384
385static inline struct cifsTconInfo *
386tlink_tcon(struct tcon_link *tlink)
387{
388 return tlink->tl_tcon;
389}
390
391extern void cifs_put_tlink(struct tcon_link *tlink);
392
393static inline struct tcon_link *
394cifs_get_tlink(struct tcon_link *tlink)
395{
396 if (tlink && !IS_ERR(tlink))
397 atomic_inc(&tlink->tl_count);
398 return tlink;
399}
400
401/* This function is always expected to succeed */
402extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
403
404/*
311 * This info hangs off the cifsFileInfo structure, pointed to by llist. 405 * This info hangs off the cifsFileInfo structure, pointed to by llist.
312 * This is used to track byte stream locks on the file 406 * This is used to track byte stream locks on the file
313 */ 407 */
@@ -345,34 +439,29 @@ struct cifsFileInfo {
345 __u16 netfid; /* file id from remote */ 439 __u16 netfid; /* file id from remote */
346 /* BB add lock scope info here if needed */ ; 440 /* BB add lock scope info here if needed */ ;
347 /* lock scope id (0 if none) */ 441 /* lock scope id (0 if none) */
348 struct file *pfile; /* needed for writepage */ 442 struct dentry *dentry;
349 struct inode *pInode; /* needed for oplock break */ 443 unsigned int f_flags;
350 struct vfsmount *mnt; 444 struct tcon_link *tlink;
351 struct mutex lock_mutex; 445 struct mutex lock_mutex;
352 struct list_head llist; /* list of byte range locks we have. */ 446 struct list_head llist; /* list of byte range locks we have. */
353 bool closePend:1; /* file is marked to close */
354 bool invalidHandle:1; /* file closed via session abend */ 447 bool invalidHandle:1; /* file closed via session abend */
355 bool oplock_break_cancelled:1; 448 bool oplock_break_cancelled:1;
356 atomic_t count; /* reference count */ 449 int count; /* refcount protected by cifs_file_list_lock */
357 struct mutex fh_mutex; /* prevents reopen race after dead ses*/ 450 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
358 struct cifs_search_info srch_inf; 451 struct cifs_search_info srch_inf;
359 struct work_struct oplock_break; /* work for oplock breaks */ 452 struct work_struct oplock_break; /* work for oplock breaks */
360}; 453};
361 454
362/* Take a reference on the file private data */ 455/*
456 * Take a reference on the file private data. Must be called with
457 * cifs_file_list_lock held.
458 */
363static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) 459static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
364{ 460{
365 atomic_inc(&cifs_file->count); 461 ++cifs_file->count;
366} 462}
367 463
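With the refcount now a plain int, the get side must run under cifs_file_list_lock (declared further down in this header), while the put side, which may need to issue a close on the wire, is out-of-lined. A sketch of the resulting calling convention:

    /* Sketch: take a reference under the list lock ... */
    spin_lock(&cifs_file_list_lock);
    cifsFileInfo_get(open_file);
    spin_unlock(&cifs_file_list_lock);

    /* ... and drop it later from sleepable context. */
    cifsFileInfo_put(open_file);
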
368/* Release a reference on the file private data */ 464void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
369static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
370{
371 if (atomic_dec_and_test(&cifs_file->count)) {
372 iput(cifs_file->pInode);
373 kfree(cifs_file);
374 }
375}
376 465
377/* 466/*
378 * One of these for each file inode 467 * One of these for each file inode
@@ -382,15 +471,15 @@ struct cifsInodeInfo {
382 struct list_head lockList; 471 struct list_head lockList;
383 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 472 /* BB add in lists for dirty pages i.e. write caching info for oplock */
384 struct list_head openFileList; 473 struct list_head openFileList;
385 int write_behind_rc;
386 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 474 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
387 unsigned long time; /* jiffies of last update/check of inode */ 475 bool clientCanCacheRead; /* read oplock */
388 bool clientCanCacheRead:1; /* read oplock */ 476 bool clientCanCacheAll; /* read and writebehind oplock */
389 bool clientCanCacheAll:1; /* read and writebehind oplock */ 477 bool delete_pending; /* DELETE_ON_CLOSE is set */
390 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 478 bool invalid_mapping; /* pagecache is invalid */
391 bool invalid_mapping:1; /* pagecache is invalid */ 479 unsigned long time; /* jiffies of last update of inode */
392 u64 server_eof; /* current file size on server */ 480 u64 server_eof; /* current file size on server */
393 u64 uniqueid; /* server inode number */ 481 u64 uniqueid; /* server inode number */
482 u64 createtime; /* creation time on server */
394#ifdef CONFIG_CIFS_FSCACHE 483#ifdef CONFIG_CIFS_FSCACHE
395 struct fscache_cookie *fscache; 484 struct fscache_cookie *fscache;
396#endif 485#endif
@@ -445,6 +534,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
445 534
446#endif 535#endif
447 536
537struct mid_q_entry;
538
539/*
540 * This is the prototype for the mid callback function. When creating one,
541 * take special care to avoid deadlocks. Things to bear in mind:
542 *
543 * - it will be called by cifsd
544 * - the GlobalMid_Lock will be held
545 * - the mid will be removed from the pending_mid_q list
546 */
547typedef void (mid_callback_t)(struct mid_q_entry *mid);
548
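Given those constraints — atomic context, GlobalMid_Lock held, mid already unlinked — a callback should limit itself to non-sleeping bookkeeping. A hypothetical minimal callback in the spirit of the echo callback later in this diff (the name and the use of callback_data are illustrative):

    /* Sketch: wake the task stashed in callback_data; no sleeping,
     * since we run in cifsd with GlobalMid_Lock held. */
    static void cifs_wake_callback(struct mid_q_entry *mid)
    {
            struct task_struct *tsk = mid->callback_data;

            wake_up_process(tsk);
    }
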
448/* one of these for every pending CIFS request to the server */ 549/* one of these for every pending CIFS request to the server */
449struct mid_q_entry { 550struct mid_q_entry {
450 struct list_head qhead; /* mids waiting on reply from this server */ 551 struct list_head qhead; /* mids waiting on reply from this server */
@@ -456,7 +557,8 @@ struct mid_q_entry {
456 unsigned long when_sent; /* time when smb send finished */ 557 unsigned long when_sent; /* time when smb send finished */
457 unsigned long when_received; /* when demux complete (taken off wire) */ 558 unsigned long when_received; /* when demux complete (taken off wire) */
458#endif 559#endif
459 struct task_struct *tsk; /* task waiting for response */ 560 mid_callback_t *callback; /* call completion callback */
561 void *callback_data; /* general purpose pointer for callback */
460 struct smb_hdr *resp_buf; /* response buffer */ 562 struct smb_hdr *resp_buf; /* response buffer */
461 int midState; /* wish this were enum but can not pass to wait_event */ 563 int midState; /* wish this were enum but can not pass to wait_event */
462 __u8 command; /* smb command code */ 564 __u8 command; /* smb command code */
@@ -474,16 +576,16 @@ struct oplock_q_entry {
474 576
475/* for pending dnotify requests */ 577/* for pending dnotify requests */
476struct dir_notify_req { 578struct dir_notify_req {
477 struct list_head lhead; 579 struct list_head lhead;
478 __le16 Pid; 580 __le16 Pid;
479 __le16 PidHigh; 581 __le16 PidHigh;
480 __u16 Mid; 582 __u16 Mid;
481 __u16 Tid; 583 __u16 Tid;
482 __u16 Uid; 584 __u16 Uid;
483 __u16 netfid; 585 __u16 netfid;
484 __u32 filter; /* CompletionFilter (for multishot) */ 586 __u32 filter; /* CompletionFilter (for multishot) */
485 int multishot; 587 int multishot;
486 struct file *pfile; 588 struct file *pfile;
487}; 589};
488 590
489struct dfs_info3_param { 591struct dfs_info3_param {
@@ -511,6 +613,7 @@ struct cifs_fattr {
511 u64 cf_uniqueid; 613 u64 cf_uniqueid;
512 u64 cf_eof; 614 u64 cf_eof;
513 u64 cf_bytes; 615 u64 cf_bytes;
616 u64 cf_createtime;
514 uid_t cf_uid; 617 uid_t cf_uid;
515 gid_t cf_gid; 618 gid_t cf_gid;
516 umode_t cf_mode; 619 umode_t cf_mode;
@@ -558,12 +661,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
558#define CIFS_IOVEC 4 /* array of response buffers */ 661#define CIFS_IOVEC 4 /* array of response buffers */
559 662
560/* Type of Request to SendReceive2 */ 663/* Type of Request to SendReceive2 */
561#define CIFS_STD_OP 0 /* normal request timeout */ 664#define CIFS_BLOCKING_OP 1 /* operation can block */
562#define CIFS_LONG_OP 1 /* long op (up to 45 sec, oplock time) */ 665#define CIFS_ASYNC_OP 2 /* do not wait for response */
563#define CIFS_VLONG_OP 2 /* sloow op - can take up to 180 seconds */ 666#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */
564#define CIFS_BLOCKING_OP 4 /* operation can block */
565#define CIFS_ASYNC_OP 8 /* do not wait for response */
566#define CIFS_TIMEOUT_MASK 0x00F /* only one of 5 above set in req */
567#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */ 667#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */
568#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ 668#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */
569#define CIFS_NO_RESP 0x040 /* no response buffer required */ 669#define CIFS_NO_RESP 0x040 /* no response buffer required */
@@ -633,7 +733,7 @@ require use of the stronger protocol */
633 * GlobalMid_Lock protects: 733 * GlobalMid_Lock protects:
634 * list operations on pending_mid_q and oplockQ 734 * list operations on pending_mid_q and oplockQ
635 * updates to XID counters, multiplex id and SMB sequence numbers 735 * updates to XID counters, multiplex id and SMB sequence numbers
636 * GlobalSMBSesLock protects: 736 * cifs_file_list_lock protects:
637 * list operations on tcp and SMB session lists and tCon lists 737 * list operations on tcp and SMB session lists and tCon lists
638 * f_owner.lock protects certain per file struct operations 738 * f_owner.lock protects certain per file struct operations
639 * mapping->page_lock protects certain per page operations 739 * mapping->page_lock protects certain per page operations
@@ -667,7 +767,7 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
667 * the reference counters for the server, smb session, and tcon. Finally, 767 * the reference counters for the server, smb session, and tcon. Finally,
668 * changes to the tcon->tidStatus should be done while holding this lock. 768 * changes to the tcon->tidStatus should be done while holding this lock.
669 */ 769 */
670GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; 770GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
671 771
672/* 772/*
673 * This lock protects the cifs_file->llist and cifs_file->flist 773 * This lock protects the cifs_file->llist and cifs_file->flist
@@ -676,7 +776,7 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
676 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then 776 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
677 * the cifs_tcp_ses_lock must be grabbed first and released last. 777 * the cifs_tcp_ses_lock must be grabbed first and released last.
678 */ 778 */
679GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; 779GLOBAL_EXTERN spinlock_t cifs_file_list_lock;
680 780
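The ordering rule above is worth spelling out. A sketch of the only legal nesting when both spinlocks are required:

    /* Sketch: cifs_tcp_ses_lock is always the outer lock. */
    spin_lock(&cifs_tcp_ses_lock);
    spin_lock(&cifs_file_list_lock);
    /* ... walk session, tcon and per-file lists ... */
    spin_unlock(&cifs_file_list_lock);
    spin_unlock(&cifs_tcp_ses_lock);
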
681/* Outstanding dir notify requests */ 781/* Outstanding dir notify requests */
682GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 782GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
@@ -691,8 +791,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
691GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ 791GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
692GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ 792GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */
693 /* on midQ entries */ 793 /* on midQ entries */
694GLOBAL_EXTERN char Local_System_Name[15];
695
696/* 794/*
697 * Global counters, updated atomically 795 * Global counters, updated atomically
698 */ 796 */
@@ -728,6 +826,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
728GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 826GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
729GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ 827GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
730 828
829/* reconnect after this many failed echo attempts */
830GLOBAL_EXTERN unsigned short echo_retries;
831
731void cifs_oplock_break(struct work_struct *work); 832void cifs_oplock_break(struct work_struct *work);
732void cifs_oplock_break_get(struct cifsFileInfo *cfile); 833void cifs_oplock_break_get(struct cifsFileInfo *cfile);
733void cifs_oplock_break_put(struct cifsFileInfo *cfile); 834void cifs_oplock_break_put(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db11..b5c8cc5d7a7f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -23,6 +23,7 @@
23#define _CIFSPDU_H 23#define _CIFSPDU_H
24 24
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/unaligned.h>
26#include "smbfsctl.h" 27#include "smbfsctl.h"
27 28
28#ifdef CONFIG_CIFS_WEAK_PW_HASH 29#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -50,6 +51,7 @@
50#define SMB_COM_SETATTR 0x09 /* trivial response */ 51#define SMB_COM_SETATTR 0x09 /* trivial response */
51#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ 52#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */
52#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ 53#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/
54#define SMB_COM_ECHO 0x2B /* echo request */
53#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ 55#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */
54#define SMB_COM_READ_ANDX 0x2E 56#define SMB_COM_READ_ANDX 0x2E
55#define SMB_COM_WRITE_ANDX 0x2F 57#define SMB_COM_WRITE_ANDX 0x2F
@@ -131,9 +133,20 @@
131#define CIFS_CRYPTO_KEY_SIZE (8) 133#define CIFS_CRYPTO_KEY_SIZE (8)
132 134
133/* 135/*
136 * Size of the ntlm client response
137 */
138#define CIFS_AUTH_RESP_SIZE (24)
139
140/*
134 * Size of the session key (crypto key encrypted with the password) 141
135 */ 142 */
136#define CIFS_SESS_KEY_SIZE (24) 143#define CIFS_SESS_KEY_SIZE (16)
144
145#define CIFS_CLIENT_CHALLENGE_SIZE (8)
146#define CIFS_SERVER_CHALLENGE_SIZE (8)
147#define CIFS_HMAC_MD5_HASH_SIZE (16)
148#define CIFS_CPHTXT_SIZE (16)
149#define CIFS_NTHASH_SIZE (16)
137 150
138/* 151/*
139 * Maximum user name length 152 * Maximum user name length
@@ -414,11 +427,49 @@ struct smb_hdr {
414 __u16 Mid; 427 __u16 Mid;
415 __u8 WordCount; 428 __u8 WordCount;
416} __attribute__((packed)); 429} __attribute__((packed));
417/* given a pointer to an smb_hdr retrieve the value of byte count */ 430
418#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 431/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
419#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 432#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
433 (2 * (smb_var)->WordCount))
434
420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 435/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
421#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2) 436#define pByteArea(smb_var) (BCC(smb_var) + 2)
437
438/* get the converted ByteCount for a SMB packet and return it */
439static inline __u16
440get_bcc(struct smb_hdr *hdr)
441{
442 __u16 *bc_ptr = (__u16 *)BCC(hdr);
443
444 return get_unaligned(bc_ptr);
445}
446
447/* get the unconverted ByteCount for a SMB packet and return it */
448static inline __u16
449get_bcc_le(struct smb_hdr *hdr)
450{
451 __le16 *bc_ptr = (__le16 *)BCC(hdr);
452
453 return get_unaligned_le16(bc_ptr);
454}
455
456/* set the ByteCount for a SMB packet in host-byte order */
457static inline void
458put_bcc(__u16 count, struct smb_hdr *hdr)
459{
460 __u16 *bc_ptr = (__u16 *)BCC(hdr);
461
462 put_unaligned(count, bc_ptr);
463}
464
465/* set the ByteCount for a SMB packet in little-endian */
466static inline void
467put_bcc_le(__u16 count, struct smb_hdr *hdr)
468{
469 __le16 *bc_ptr = (__le16 *)BCC(hdr);
470
471 put_unaligned_le16(count, bc_ptr);
472}
422 473
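With BCC() now yielding a pointer rather than an lvalue, byte-count access goes through the unaligned helpers above: put_bcc_le() when building a request (as CIFSSMBEcho does later in this diff) and get_bcc() once the demux thread has converted the response. A sketch of the read side (helper name illustrative):

    /* Sketch: reject a converted response that carries too few bytes. */
    static int check_rsp_bcc(struct smb_hdr *hdr, __u16 expected)
    {
            return get_bcc(hdr) >= expected ? 0 : -EIO;
    }
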
423/* 474/*
424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 475 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
@@ -663,7 +714,6 @@ struct ntlmv2_resp {
663 __le64 time; 714 __le64 time;
664 __u64 client_chal; /* random */ 715 __u64 client_chal; /* random */
665 __u32 reserved2; 716 __u32 reserved2;
666 struct ntlmssp2_name names[2];
667 /* array of name entries could follow ending in minimum 4 byte struct */ 717 /* array of name entries could follow ending in minimum 4 byte struct */
668} __attribute__((packed)); 718} __attribute__((packed));
669 719
@@ -750,6 +800,20 @@ typedef struct smb_com_tconx_rsp_ext {
750 * 800 *
751 */ 801 */
752 802
803typedef struct smb_com_echo_req {
804 struct smb_hdr hdr;
805 __le16 EchoCount;
806 __le16 ByteCount;
807 char Data[1];
808} __attribute__((packed)) ECHO_REQ;
809
810typedef struct smb_com_echo_rsp {
811 struct smb_hdr hdr;
812 __le16 SequenceNumber;
813 __le16 ByteCount;
814 char Data[1];
815} __attribute__((packed)) ECHO_RSP;
816
753typedef struct smb_com_logoff_andx_req { 817typedef struct smb_com_logoff_andx_req {
754 struct smb_hdr hdr; /* wct = 2 */ 818 struct smb_hdr hdr; /* wct = 2 */
755 __u8 AndXCommand; 819 __u8 AndXCommand;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1d60c655e3e0..8096f27ad9a8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -54,12 +54,19 @@ do { \
54 __func__, curr_xid, (int)rc); \ 54 __func__, curr_xid, (int)rc); \
55} while (0) 55} while (0)
56extern char *build_path_from_dentry(struct dentry *); 56extern char *build_path_from_dentry(struct dentry *);
57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); 57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
58 struct cifsTconInfo *tcon);
58extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 59extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
59extern char *cifs_compose_mount_options(const char *sb_mountdata, 60extern char *cifs_compose_mount_options(const char *sb_mountdata,
60 const char *fullpath, const struct dfs_info3_param *ref, 61 const char *fullpath, const struct dfs_info3_param *ref,
61 char **devname); 62 char **devname);
62/* extern void renew_parental_timestamps(struct dentry *direntry);*/ 63/* extern void renew_parental_timestamps(struct dentry *direntry);*/
64extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
65 struct TCP_Server_Info *server);
66extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
67extern int cifs_call_async(struct TCP_Server_Info *server,
68 struct smb_hdr *in_buf, mid_callback_t *callback,
69 void *cbdata);
63extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, 70extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
64 struct smb_hdr * /* input */ , 71 struct smb_hdr * /* input */ ,
65 struct smb_hdr * /* out */ , 72 struct smb_hdr * /* out */ ,
@@ -78,10 +85,10 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
78extern bool is_valid_oplock_break(struct smb_hdr *smb, 85extern bool is_valid_oplock_break(struct smb_hdr *smb,
79 struct TCP_Server_Info *); 86 struct TCP_Server_Info *);
80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 87extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); 88extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
82#ifdef CONFIG_CIFS_EXPERIMENTAL 89 unsigned int bytes_written);
83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *); 90extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
84#endif 91extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
85extern unsigned int smbCalcSize(struct smb_hdr *ptr); 92extern unsigned int smbCalcSize(struct smb_hdr *ptr);
86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 93extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
87extern int decode_negTokenInit(unsigned char *security_blob, int length, 94extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -104,13 +111,14 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
104extern u64 cifs_UnixTimeToNT(struct timespec); 111extern u64 cifs_UnixTimeToNT(struct timespec);
105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 112extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
106 int offset); 113 int offset);
114extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
107 115
108extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode, 116extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
109 __u16 fileHandle, struct file *file, 117 struct file *file, struct tcon_link *tlink,
110 struct vfsmount *mnt, unsigned int oflags); 118 __u32 oplock);
111extern int cifs_posix_open(char *full_path, struct inode **pinode, 119extern int cifs_posix_open(char *full_path, struct inode **pinode,
112 struct super_block *sb, 120 struct super_block *sb,
113 int mode, int oflags, 121 int mode, unsigned int f_flags,
114 __u32 *poplock, __u16 *pnetfid, int xid); 122 __u32 *poplock, __u16 *pnetfid, int xid);
115void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr); 123void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
116extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 124extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
@@ -129,10 +137,12 @@ extern int cifs_get_file_info_unix(struct file *filp);
129extern int cifs_get_inode_info_unix(struct inode **pinode, 137extern int cifs_get_inode_info_unix(struct inode **pinode,
130 const unsigned char *search_path, 138 const unsigned char *search_path,
131 struct super_block *sb, int xid); 139 struct super_block *sb, int xid);
132extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, 140extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
133 struct cifs_fattr *fattr, struct inode *inode, 141 struct cifs_fattr *fattr, struct inode *inode,
134 const char *path, const __u16 *pfid); 142 const char *path, const __u16 *pfid);
135extern int mode_to_acl(struct inode *inode, const char *path, __u64); 143extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
144extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
145 const char *, u32 *);
136 146
137extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, 147extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
138 const char *); 148 const char *);
@@ -345,12 +355,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
345 const __u16 netfid, const __u64 len, 355 const __u16 netfid, const __u64 len,
346 const __u64 offset, const __u32 numUnlock, 356 const __u64 offset, const __u32 numUnlock,
347 const __u32 numLock, const __u8 lockType, 357 const __u32 numLock, const __u8 lockType,
348 const bool waitFlag); 358 const bool waitFlag, const __u8 oplock_level);
349extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, 359extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
350 const __u16 smb_file_id, const int get_flag, 360 const __u16 smb_file_id, const int get_flag,
351 const __u64 len, struct file_lock *, 361 const __u64 len, struct file_lock *,
352 const __u16 lock_type, const bool waitFlag); 362 const __u16 lock_type, const bool waitFlag);
353extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); 363extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
364extern int CIFSSMBEcho(struct TCP_Server_Info *server);
354extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); 365extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
355 366
356extern struct cifsSesInfo *sesInfoAlloc(void); 367extern struct cifsSesInfo *sesInfoAlloc(void);
@@ -362,13 +373,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
362extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, 373extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
363 __u32 *); 374 __u32 *);
364extern int cifs_verify_signature(struct smb_hdr *, 375extern int cifs_verify_signature(struct smb_hdr *,
365 const struct mac_key *mac_key, 376 struct TCP_Server_Info *server,
366 __u32 expected_sequence_number); 377 __u32 expected_sequence_number);
367extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 378extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
368 const char *pass); 379extern int setup_ntlm_response(struct cifsSesInfo *);
369extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); 380extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
370extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, 381extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
371 const struct nls_table *); 382extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
383extern int calc_seckey(struct cifsSesInfo *);
384
372#ifdef CONFIG_CIFS_WEAK_PW_HASH 385#ifdef CONFIG_CIFS_WEAK_PW_HASH
373extern void calc_lanman_hash(const char *password, const char *cryptkey, 386extern void calc_lanman_hash(const char *password, const char *cryptkey,
374 bool encrypt, char *lnm_session_key); 387 bool encrypt, char *lnm_session_key);
@@ -408,4 +421,15 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
408extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, 421extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
409 const int netfid, __u64 *pExtAttrBits, __u64 *pMask); 422 const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
410extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); 423extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
424extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
425extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
426 const unsigned char *path,
427 struct cifs_sb_info *cifs_sb, int xid);
428extern int mdfour(unsigned char *, unsigned char *, int);
429extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
430extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
431 unsigned char *p24);
432extern void E_P16(unsigned char *p14, unsigned char *p16);
433extern void E_P24(unsigned char *p21, const unsigned char *c8,
434 unsigned char *p24);
411#endif /* _CIFSPROTO_H */ 435#endif /* _CIFSPROTO_H */
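The newly exported primitives form the classic NT response chain. A sketch of how they compose, with buffer sizes taken from the defines in cifspdu.h (casts shown because the session fields are plain char; the snippet is illustrative, not the actual session-setup code):

    /* Sketch: password-derived values for NTLM authentication. */
    unsigned char p16[CIFS_NTHASH_SIZE];     /* MD4 hash of the password */
    unsigned char p24[CIFS_AUTH_RESP_SIZE];  /* 24-byte challenge response */

    E_md4hash((unsigned char *)ses->password, p16);
    SMBNTencrypt((unsigned char *)ses->password,
                 (unsigned char *)ses->server->cryptkey, p24);
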
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7e83b356cc9e..904aa47e3515 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -91,13 +91,13 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
91 struct list_head *tmp1; 91 struct list_head *tmp1;
92 92
93/* list all files open on tree connection and mark them invalid */ 93/* list all files open on tree connection and mark them invalid */
94 write_lock(&GlobalSMBSeslock); 94 spin_lock(&cifs_file_list_lock);
95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { 95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
96 open_file = list_entry(tmp, struct cifsFileInfo, tlist); 96 open_file = list_entry(tmp, struct cifsFileInfo, tlist);
97 open_file->invalidHandle = true; 97 open_file->invalidHandle = true;
98 open_file->oplock_break_cancelled = true; 98 open_file->oplock_break_cancelled = true;
99 } 99 }
100 write_unlock(&GlobalSMBSeslock); 100 spin_unlock(&cifs_file_list_lock);
101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted 101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted
102 to this tcon */ 102 to this tcon */
103} 103}
@@ -136,9 +136,6 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
136 } 136 }
137 } 137 }
138 138
139 if (ses->status == CifsExiting)
140 return -EIO;
141
142 /* 139 /*
143 * Give demultiplex thread up to 10 seconds to reconnect, should be 140 * Give demultiplex thread up to 10 seconds to reconnect, should be
144 * greater than cifs socket timeout which is 7 seconds 141 * greater than cifs socket timeout which is 7 seconds
@@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * retrying until process is killed or server comes 153 * retrying until process is killed or server comes
157 * back on-line 154 * back on-line
158 */ 155 */
159 if (!tcon->retry || ses->status == CifsExiting) { 156 if (!tcon->retry) {
160 cFYI(1, "gave up waiting on reconnect in smb_init"); 157 cFYI(1, "gave up waiting on reconnect in smb_init");
161 return -EHOSTDOWN; 158 return -EHOSTDOWN;
162 } 159 }
@@ -331,37 +328,35 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
331 328
332static int validate_t2(struct smb_t2_rsp *pSMB) 329static int validate_t2(struct smb_t2_rsp *pSMB)
333{ 330{
334 int rc = -EINVAL; 331 unsigned int total_size;
335 int total_size; 332
336 char *pBCC; 333 /* check for plausible wct */
334 if (pSMB->hdr.WordCount < 10)
335 goto vt2_err;
337 336
338 /* check for plausible wct, bcc and t2 data and parm sizes */
339 /* check for parm and data offset going beyond end of smb */ 337 /* check for parm and data offset going beyond end of smb */
340 if (pSMB->hdr.WordCount >= 10) { 338 if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
341 if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) && 339 get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
342 (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) { 340 goto vt2_err;
343 /* check that bcc is at least as big as parms + data */ 341
344 /* check that bcc is less than negotiated smb buffer */ 342 /* check that bcc is at least as big as parms + data */
345 total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount); 343 /* check that bcc is less than negotiated smb buffer */
346 if (total_size < 512) { 344 total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
347 total_size += 345 if (total_size >= 512)
348 le16_to_cpu(pSMB->t2_rsp.DataCount); 346 goto vt2_err;
349 /* BCC le converted in SendReceive */ 347
350 pBCC = (pSMB->hdr.WordCount * 2) + 348 total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
351 sizeof(struct smb_hdr) + 349 if (total_size > get_bcc(&pSMB->hdr) ||
352 (char *)pSMB; 350 total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
353 if ((total_size <= (*(u16 *)pBCC)) && 351 goto vt2_err;
354 (total_size < 352
355 CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) { 353 return 0;
356 return 0; 354vt2_err:
357 }
358 }
359 }
360 }
361 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, 355 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB,
362 sizeof(struct smb_t2_rsp) + 16); 356 sizeof(struct smb_t2_rsp) + 16);
363 return rc; 357 return -EINVAL;
364} 358}
359
365int 360int
366CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) 361CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
367{ 362{
@@ -401,15 +396,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
401 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { 396 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
402 cFYI(1, "Kerberos only mechanism, enable extended security"); 397 cFYI(1, "Kerberos only mechanism, enable extended security");
403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 398 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
404 } 399 } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
405#ifdef CONFIG_CIFS_EXPERIMENTAL
406 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
407 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 400 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
408 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { 401 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
409 cFYI(1, "NTLMSSP only mechanism, enable extended security"); 402 cFYI(1, "NTLMSSP only mechanism, enable extended security");
410 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 403 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
411 } 404 }
412#endif
413 405
414 count = 0; 406 count = 0;
415 for (i = 0; i < CIFS_NUM_PROT; i++) { 407 for (i = 0; i < CIFS_NUM_PROT; i++) {
@@ -455,7 +447,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
455 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 447 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
456 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 448 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
457 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); 449 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
458 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
459 /* even though we do not use raw we might as well set this 450 /* even though we do not use raw we might as well set this
460 accurately, in case we ever find a need for it */ 451 accurately, in case we ever find a need for it */
461 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { 452 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -503,7 +494,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
503 494
504 if (rsp->EncryptionKeyLength == 495 if (rsp->EncryptionKeyLength ==
505 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { 496 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
506 memcpy(server->cryptKey, rsp->EncryptionKey, 497 memcpy(ses->server->cryptkey, rsp->EncryptionKey,
507 CIFS_CRYPTO_KEY_SIZE); 498 CIFS_CRYPTO_KEY_SIZE);
508 } else if (server->secMode & SECMODE_PW_ENCRYPT) { 499 } else if (server->secMode & SECMODE_PW_ENCRYPT) {
509 rc = -EIO; /* need cryptkey unless plain text */ 500 rc = -EIO; /* need cryptkey unless plain text */
@@ -569,12 +560,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
569 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 560 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
570 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 561 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
571 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); 562 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
572 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
573 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 563 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
574 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 564 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
575 server->timeAdj *= 60; 565 server->timeAdj *= 60;
576 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { 566 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
577 memcpy(server->cryptKey, pSMBr->u.EncryptionKey, 567 memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
578 CIFS_CRYPTO_KEY_SIZE); 568 CIFS_CRYPTO_KEY_SIZE);
579 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) 569 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
580 && (pSMBr->EncryptionKeyLength == 0)) { 570 && (pSMBr->EncryptionKeyLength == 0)) {
@@ -593,9 +583,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
593 rc = -EIO; 583 rc = -EIO;
594 goto neg_err_exit; 584 goto neg_err_exit;
595 } 585 }
596 read_lock(&cifs_tcp_ses_lock); 586 spin_lock(&cifs_tcp_ses_lock);
597 if (server->srv_count > 1) { 587 if (server->srv_count > 1) {
598 read_unlock(&cifs_tcp_ses_lock); 588 spin_unlock(&cifs_tcp_ses_lock);
599 if (memcmp(server->server_GUID, 589 if (memcmp(server->server_GUID,
600 pSMBr->u.extended_response. 590 pSMBr->u.extended_response.
601 GUID, 16) != 0) { 591 GUID, 16) != 0) {
@@ -605,7 +595,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
605 16); 595 16);
606 } 596 }
607 } else { 597 } else {
608 read_unlock(&cifs_tcp_ses_lock); 598 spin_unlock(&cifs_tcp_ses_lock);
609 memcpy(server->server_GUID, 599 memcpy(server->server_GUID,
610 pSMBr->u.extended_response.GUID, 16); 600 pSMBr->u.extended_response.GUID, 16);
611 } 601 }
@@ -620,13 +610,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
620 rc = 0; 610 rc = 0;
621 else 611 else
622 rc = -EINVAL; 612 rc = -EINVAL;
623 613 if (server->secType == Kerberos) {
624 if (server->sec_kerberos || server->sec_mskerberos) 614 if (!server->sec_kerberos &&
625 server->secType = Kerberos; 615 !server->sec_mskerberos)
626 else if (server->sec_ntlmssp) 616 rc = -EOPNOTSUPP;
627 server->secType = RawNTLMSSP; 617 } else if (server->secType == RawNTLMSSP) {
628 else 618 if (!server->sec_ntlmssp)
629 rc = -EOPNOTSUPP; 619 rc = -EOPNOTSUPP;
620 } else
621 rc = -EOPNOTSUPP;
630 } 622 }
631 } else 623 } else
632 server->capabilities &= ~CAP_EXTENDED_SECURITY; 624 server->capabilities &= ~CAP_EXTENDED_SECURITY;
@@ -707,6 +699,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
707 return rc; 699 return rc;
708} 700}
709 701
702/*
703 * This is a no-op for now. We're not really interested in the reply, but
704 * rather in the fact that the server sent one and that server->lstrp
705 * gets updated.
706 *
707 * FIXME: maybe we should consider checking that the reply matches the request?
708 */
709static void
710cifs_echo_callback(struct mid_q_entry *mid)
711{
712 struct TCP_Server_Info *server = mid->callback_data;
713
714 DeleteMidQEntry(mid);
715 atomic_dec(&server->inFlight);
716 wake_up(&server->request_q);
717}
718
719int
720CIFSSMBEcho(struct TCP_Server_Info *server)
721{
722 ECHO_REQ *smb;
723 int rc = 0;
724
725 cFYI(1, "In echo request");
726
727 rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
728 if (rc)
729 return rc;
730
731 /* set up echo request */
732 smb->hdr.Tid = cpu_to_le16(0xffff);
733 smb->hdr.WordCount = 1;
734 put_unaligned_le16(1, &smb->EchoCount);
735 put_bcc_le(1, &smb->hdr);
736 smb->Data[0] = 'a';
737 smb->hdr.smb_buf_length += 3;
738
739 rc = cifs_call_async(server, (struct smb_hdr *)smb,
740 cifs_echo_callback, server);
741 if (rc)
742 cFYI(1, "Echo request failed: %d", rc);
743
744 cifs_small_buf_release(smb);
745
746 return rc;
747}
748
710int 749int
711CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) 750CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
712{ 751{
@@ -1194,7 +1233,7 @@ OldOpenRetry:
1194 pSMB->ByteCount = cpu_to_le16(count); 1233 pSMB->ByteCount = cpu_to_le16(count);
1195 /* long_op set to 1 to allow for oplock break timeouts */ 1234 /* long_op set to 1 to allow for oplock break timeouts */
1196 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1235 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1197 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1236 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1198 cifs_stats_inc(&tcon->num_opens); 1237 cifs_stats_inc(&tcon->num_opens);
1199 if (rc) { 1238 if (rc) {
1200 cFYI(1, "Error in Open = %d", rc); 1239 cFYI(1, "Error in Open = %d", rc);
@@ -1307,7 +1346,7 @@ openRetry:
1307 pSMB->ByteCount = cpu_to_le16(count); 1346 pSMB->ByteCount = cpu_to_le16(count);
1308 /* long_op set to 1 to allow for oplock break timeouts */ 1347 /* long_op set to 1 to allow for oplock break timeouts */
1309 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1348 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1310 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1349 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1311 cifs_stats_inc(&tcon->num_opens); 1350 cifs_stats_inc(&tcon->num_opens);
1312 if (rc) { 1351 if (rc) {
1313 cFYI(1, "Error in Open = %d", rc); 1352 cFYI(1, "Error in Open = %d", rc);
@@ -1389,7 +1428,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1389 iov[0].iov_base = (char *)pSMB; 1428 iov[0].iov_base = (char *)pSMB;
1390 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 1429 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
1391 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, 1430 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
1392 &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR); 1431 &resp_buf_type, CIFS_LOG_ERROR);
1393 cifs_stats_inc(&tcon->num_reads); 1432 cifs_stats_inc(&tcon->num_reads);
1394 pSMBr = (READ_RSP *)iov[0].iov_base; 1433 pSMBr = (READ_RSP *)iov[0].iov_base;
1395 if (rc) { 1434 if (rc) {
@@ -1664,7 +1703,8 @@ int
1664CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, 1703CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1665 const __u16 smb_file_id, const __u64 len, 1704 const __u16 smb_file_id, const __u64 len,
1666 const __u64 offset, const __u32 numUnlock, 1705 const __u64 offset, const __u32 numUnlock,
1667 const __u32 numLock, const __u8 lockType, const bool waitFlag) 1706 const __u32 numLock, const __u8 lockType,
1707 const bool waitFlag, const __u8 oplock_level)
1668{ 1708{
1669 int rc = 0; 1709 int rc = 0;
1670 LOCK_REQ *pSMB = NULL; 1710 LOCK_REQ *pSMB = NULL;
@@ -1692,6 +1732,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1692 pSMB->NumberOfLocks = cpu_to_le16(numLock); 1732 pSMB->NumberOfLocks = cpu_to_le16(numLock);
1693 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); 1733 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
1694 pSMB->LockType = lockType; 1734 pSMB->LockType = lockType;
1735 pSMB->OplockLevel = oplock_level;
1695 pSMB->AndXCommand = 0xFF; /* none */ 1736 pSMB->AndXCommand = 0xFF; /* none */
1696 pSMB->Fid = smb_file_id; /* netfid stays le */ 1737 pSMB->Fid = smb_file_id; /* netfid stays le */
1697 1738
@@ -2476,95 +2517,6 @@ querySymLinkRetry:
2476} 2517}
2477 2518
2478#ifdef CONFIG_CIFS_EXPERIMENTAL 2519#ifdef CONFIG_CIFS_EXPERIMENTAL
2479/* Initialize NT TRANSACT SMB into small smb request buffer.
2480 This assumes that all NT TRANSACTS that we init here have
2481 total parm and data under about 400 bytes (to fit in small cifs
2482 buffer size), which is the case so far, it easily fits. NB:
2483 Setup words themselves and ByteCount
2484 MaxSetupCount (size of returned setup area) and
2485 MaxParameterCount (returned parms size) must be set by caller */
2486static int
2487smb_init_nttransact(const __u16 sub_command, const int setup_count,
2488 const int parm_len, struct cifsTconInfo *tcon,
2489 void **ret_buf)
2490{
2491 int rc;
2492 __u32 temp_offset;
2493 struct smb_com_ntransact_req *pSMB;
2494
2495 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
2496 (void **)&pSMB);
2497 if (rc)
2498 return rc;
2499 *ret_buf = (void *)pSMB;
2500 pSMB->Reserved = 0;
2501 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
2502 pSMB->TotalDataCount = 0;
2503 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
2504 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
2505 pSMB->ParameterCount = pSMB->TotalParameterCount;
2506 pSMB->DataCount = pSMB->TotalDataCount;
2507 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
2508 (setup_count * 2) - 4 /* for rfc1001 length itself */;
2509 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
2510 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
2511 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
2512 pSMB->SubCommand = cpu_to_le16(sub_command);
2513 return 0;
2514}
2515
2516static int
2517validate_ntransact(char *buf, char **ppparm, char **ppdata,
2518 __u32 *pparmlen, __u32 *pdatalen)
2519{
2520 char *end_of_smb;
2521 __u32 data_count, data_offset, parm_count, parm_offset;
2522 struct smb_com_ntransact_rsp *pSMBr;
2523
2524 *pdatalen = 0;
2525 *pparmlen = 0;
2526
2527 if (buf == NULL)
2528 return -EINVAL;
2529
2530 pSMBr = (struct smb_com_ntransact_rsp *)buf;
2531
2532 /* ByteCount was converted from little endian in SendReceive */
2533 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
2534 (char *)&pSMBr->ByteCount;
2535
2536 data_offset = le32_to_cpu(pSMBr->DataOffset);
2537 data_count = le32_to_cpu(pSMBr->DataCount);
2538 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
2539 parm_count = le32_to_cpu(pSMBr->ParameterCount);
2540
2541 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
2542 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
2543
2544 /* should we also check that parm and data areas do not overlap? */
2545 if (*ppparm > end_of_smb) {
2546 cFYI(1, "parms start after end of smb");
2547 return -EINVAL;
2548 } else if (parm_count + *ppparm > end_of_smb) {
2549 cFYI(1, "parm end after end of smb");
2550 return -EINVAL;
2551 } else if (*ppdata > end_of_smb) {
2552 cFYI(1, "data starts after end of smb");
2553 return -EINVAL;
2554 } else if (data_count + *ppdata > end_of_smb) {
2555 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
2556 *ppdata, data_count, (data_count + *ppdata),
2557 end_of_smb, pSMBr);
2558 return -EINVAL;
2559 } else if (parm_count + data_count > pSMBr->ByteCount) {
2560 cFYI(1, "parm count and data count larger than SMB");
2561 return -EINVAL;
2562 }
2563 *pdatalen = data_count;
2564 *pparmlen = parm_count;
2565 return 0;
2566}
2567
2568int 2520int
2569CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, 2521CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2570 const unsigned char *searchName, 2522 const unsigned char *searchName,
@@ -3054,7 +3006,97 @@ GetExtAttrOut:
3054 3006
3055#endif /* CONFIG_POSIX */ 3007#endif /* CONFIG_POSIX */
3056 3008
3057#ifdef CONFIG_CIFS_EXPERIMENTAL 3009#ifdef CONFIG_CIFS_ACL
3010/*
3011 * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that
3012 * all NT TRANSACTS that we init here have total parm and data under about 400
3013 * bytes (to fit in the small cifs buffer size), which is the case so far; it
3014 * easily fits. NB: the setup words themselves, the ByteCount, MaxSetupCount
3015 * (size of the returned setup area) and MaxParameterCount (returned parms
3016 * size) must be set by the caller
3017 */
3018static int
3019smb_init_nttransact(const __u16 sub_command, const int setup_count,
3020 const int parm_len, struct cifsTconInfo *tcon,
3021 void **ret_buf)
3022{
3023 int rc;
3024 __u32 temp_offset;
3025 struct smb_com_ntransact_req *pSMB;
3026
3027 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
3028 (void **)&pSMB);
3029 if (rc)
3030 return rc;
3031 *ret_buf = (void *)pSMB;
3032 pSMB->Reserved = 0;
3033 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
3034 pSMB->TotalDataCount = 0;
3035 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
3036 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
3037 pSMB->ParameterCount = pSMB->TotalParameterCount;
3038 pSMB->DataCount = pSMB->TotalDataCount;
3039 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
3040 (setup_count * 2) - 4 /* for rfc1001 length itself */;
3041 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
3042 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
3043 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
3044 pSMB->SubCommand = cpu_to_le16(sub_command);
3045 return 0;
3046}
3047
3048static int
3049validate_ntransact(char *buf, char **ppparm, char **ppdata,
3050 __u32 *pparmlen, __u32 *pdatalen)
3051{
3052 char *end_of_smb;
3053 __u32 data_count, data_offset, parm_count, parm_offset;
3054 struct smb_com_ntransact_rsp *pSMBr;
3055
3056 *pdatalen = 0;
3057 *pparmlen = 0;
3058
3059 if (buf == NULL)
3060 return -EINVAL;
3061
3062 pSMBr = (struct smb_com_ntransact_rsp *)buf;
3063
3064 /* ByteCount was converted from little endian in SendReceive */
3065 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
3066 (char *)&pSMBr->ByteCount;
3067
3068 data_offset = le32_to_cpu(pSMBr->DataOffset);
3069 data_count = le32_to_cpu(pSMBr->DataCount);
3070 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
3071 parm_count = le32_to_cpu(pSMBr->ParameterCount);
3072
3073 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
3074 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
3075
3076 /* should we also check that parm and data areas do not overlap? */
3077 if (*ppparm > end_of_smb) {
3078 cFYI(1, "parms start after end of smb");
3079 return -EINVAL;
3080 } else if (parm_count + *ppparm > end_of_smb) {
3081 cFYI(1, "parm end after end of smb");
3082 return -EINVAL;
3083 } else if (*ppdata > end_of_smb) {
3084 cFYI(1, "data starts after end of smb");
3085 return -EINVAL;
3086 } else if (data_count + *ppdata > end_of_smb) {
3087 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
3088 *ppdata, data_count, (data_count + *ppdata),
3089 end_of_smb, pSMBr);
3090 return -EINVAL;
3091 } else if (parm_count + data_count > pSMBr->ByteCount) {
3092 cFYI(1, "parm count and data count larger than SMB");
3093 return -EINVAL;
3094 }
3095 *pdatalen = data_count;
3096 *pparmlen = parm_count;
3097 return 0;
3098}
3099
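
The "should we also check that parm and data areas do not overlap?" question left open above would take one more comparison; a sketch of the missing test (not part of this patch):

	/* hypothetical overlap check for validate_ntransact() */
	if (*ppparm < *ppdata) {
		if (*ppparm + parm_count > *ppdata)
			return -EINVAL;	/* parms run into data area */
	} else {
		if (*ppdata + data_count > *ppparm)
			return -EINVAL;	/* data runs into parm area */
	}
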
3058/* Get Security Descriptor (by handle) from remote server for a file or dir */
3059int
3060CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
@@ -3087,7 +3129,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3087	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
3088
3089	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
3090			 CIFS_STD_OP);
3132			 0);
3091	cifs_stats_inc(&tcon->num_acl_get);
3092	if (rc) {
3093		cFYI(1, "Send error in QuerySecDesc = %d", rc);
@@ -3212,7 +3254,7 @@ setCifsAclRetry:
3212	return (rc);
3213}
3214
3215#endif /* CONFIG_CIFS_EXPERIMENTAL */
3257#endif /* CONFIG_CIFS_ACL */
3216
3217/* Legacy Query Path Information call for lookup to old servers such
3218   as Win9x/WinME */
@@ -4869,7 +4911,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4869		   __u16 fid, __u32 pid_of_opener, bool SetAllocation)
4870{
4871	struct smb_com_transaction2_sfi_req *pSMB = NULL;
4872	char *data_offset;
4873	struct file_end_of_file_info *parm_data;
4874	int rc = 0;
4875	__u16 params, param_offset, offset, byte_count, count;
@@ -4893,8 +4934,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4893	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4894	offset = param_offset + params;
4895
4896	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4897
4898	count = sizeof(struct file_end_of_file_info);
4899	pSMB->MaxParameterCount = cpu_to_le16(2);
4900	/* BB find exact max SMB PDU from sess structure BB */
@@ -5562,7 +5601,7 @@ QAllEAsRetry:
5562	}
5563
5564	/* make sure list_len doesn't go past end of SMB */
5565	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5604	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
5566	if ((char *)ea_response_data + list_len > end_of_smb) {
5567		cFYI(1, "EA list appears to go beyond SMB");
5568		rc = -EIO;
diff --git a/fs/cifs/cn_cifs.h b/fs/cifs/cn_cifs.h
deleted file mode 100644
index ea59ccac2eb1..000000000000
--- a/fs/cifs/cn_cifs.h
+++ /dev/null
@@ -1,37 +0,0 @@
1/*
2 * fs/cifs/cn_cifs.h
3 *
4 * Copyright (c) International Business Machines Corp., 2002
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifndef _CN_CIFS_H
23#define _CN_CIFS_H
24#ifdef CONFIG_CIFS_UPCALL
25#include <linux/types.h>
26#include <linux/connector.h>
27
28struct cifs_upcall {
29 char signature[4]; /* CIFS */
30 enum command {
31 CIFS_GET_IP = 0x00000001, /* get ip address for hostname */
32 CIFS_GET_SECBLOB = 0x00000002, /* get SPNEGO wrapped blob */
33 } command;
34 /* union cifs upcall data follows */
35};
36#endif /* CIFS_UPCALL */
37#endif /* _CN_CIFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 88c84a38bccb..257b6d895e20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -47,14 +47,13 @@
47#include "ntlmssp.h"
48#include "nterr.h"
49#include "rfc1002pdu.h"
50#include "cn_cifs.h"
51#include "fscache.h"
52
53#define CIFS_PORT 445
54#define RFC1001_PORT 139
55
56extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
57			 unsigned char *p24);
55/* SMB echo "timeout" -- FIXME: tunable? */
56#define SMB_ECHO_INTERVAL (60 * HZ)
58
59extern mempool_t *cifs_req_poolp;
60
@@ -65,8 +64,8 @@ struct smb_vol {
65	char *UNC;
66	char *UNCip;
67	char *iocharset;  /* local code page for mapping to and from Unicode */
68	char source_rfc1001_name[16]; /* netbios name of client */
67	char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
69	char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
68	char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
70	uid_t cred_uid;
71	uid_t linux_uid;
72	gid_t linux_gid;
@@ -85,6 +84,7 @@ struct smb_vol {
85	bool no_xattr:1;   /* set if xattr (EA) support should be disabled*/
86	bool server_ino:1; /* use inode numbers from server ie UniqueId */
87	bool direct_io:1;
87	bool strict_io:1; /* strict cache behavior */
88	bool remap:1;      /* set to remap seven reserved chars in filenames */
89	bool posix_paths:1; /* unset to not ask for posix pathnames. */
90	bool no_linux_ext:1;
@@ -100,16 +100,26 @@ struct smb_vol {
100	bool noautotune:1;
101	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
102	bool fsc:1;	/* enable fscache */
103	bool mfsymlinks:1; /* use Minshall+French Symlinks */
104	bool multiuser:1;
103	unsigned int rsize;
104	unsigned int wsize;
105	bool sockopt_tcp_nodelay:1;
106	unsigned short int port;
109	unsigned long actimeo; /* attribute cache timeout (jiffies) */
107	char *prepath;
111	struct sockaddr_storage srcaddr; /* allow binding to a local IP */
108	struct nls_table *local_nls;
109};
110
111static int ipv4_connect(struct TCP_Server_Info *server);
112static int ipv6_connect(struct TCP_Server_Info *server);
115/* FIXME: should these be tunable? */
116#define TLINK_ERROR_EXPIRE	(1 * HZ)
117#define TLINK_IDLE_EXPIRE	(600 * HZ)
118
119static int ip_connect(struct TCP_Server_Info *server);
120static int generic_ip_connect(struct TCP_Server_Info *server);
121static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
122static void cifs_prune_tlinks(struct work_struct *work);
113
114/*
115 * cifs tcp session reconnection
@@ -143,7 +153,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
143
144	/* before reconnecting the tcp session, mark the smb session (uid)
145	   and the tid bad so they are not used until reconnected */
146	read_lock(&cifs_tcp_ses_lock);
156	cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
157	spin_lock(&cifs_tcp_ses_lock);
147	list_for_each(tmp, &server->smb_ses_list) {
148		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
149		ses->need_reconnect = true;
@@ -153,8 +164,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
153			tcon->need_reconnect = true;
154		}
155	}
156	read_unlock(&cifs_tcp_ses_lock);
167	spin_unlock(&cifs_tcp_ses_lock);
168
157	/* do not want to be sending data on a socket we are freeing */
170	cFYI(1, "%s: tearing down socket", __func__);
158	mutex_lock(&server->srv_mutex);
159	if (server->ssocket) {
160		cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
@@ -166,30 +179,32 @@ cifs_reconnect(struct TCP_Server_Info *server)
166		sock_release(server->ssocket);
167		server->ssocket = NULL;
168	}
182	server->sequence_number = 0;
183	server->session_estab = false;
184	kfree(server->session_key.response);
185	server->session_key.response = NULL;
186	server->session_key.len = 0;
187	server->lstrp = jiffies;
188	mutex_unlock(&server->srv_mutex);
169
190	/* mark submitted MIDs for retry and issue callback */
191	cFYI(1, "%s: issuing mid callbacks", __func__);
170	spin_lock(&GlobalMid_Lock);
171	list_for_each(tmp, &server->pending_mid_q) {
193	list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
172		mid_entry = list_entry(tmp, struct
173				       mid_q_entry,
174				       qhead);
194		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
175		if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
176			/* Mark other intransit requests as needing
177			   retry so we do not immediately mark the
178			   session bad again (ie after we reconnect
179			   below) as they timeout too */
195		if (mid_entry->midState == MID_REQUEST_SUBMITTED)
180			mid_entry->midState = MID_RETRY_NEEDED;
181		}
197		list_del_init(&mid_entry->qhead);
198		mid_entry->callback(mid_entry);
182	}
183	spin_unlock(&GlobalMid_Lock);
184	mutex_unlock(&server->srv_mutex);
185
186	while ((server->tcpStatus != CifsExiting) &&
187	       (server->tcpStatus != CifsGood)) {
188		try_to_freeze();
189		if (server->addr.sockAddr6.sin6_family == AF_INET6)
190			rc = ipv6_connect(server);
191		else
192			rc = ipv4_connect(server);
205
206		/* we should try only the port we connected to before */
207		rc = generic_ip_connect(server);
193		if (rc) {
194			cFYI(1, "reconnect error %d", rc);
195			msleep(3000);
@@ -198,12 +213,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
198			spin_lock(&GlobalMid_Lock);
199			if (server->tcpStatus != CifsExiting)
200				server->tcpStatus = CifsGood;
201			server->sequence_number = 0;
202			spin_unlock(&GlobalMid_Lock);
203			/* atomic_set(&server->inFlight,0);*/
204			wake_up(&server->response_q);
205		}
206	}
219
207	return rc;
208}
209
@@ -217,9 +230,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
217static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
218{
219	struct smb_t2_rsp *pSMBt;
220	int total_data_size;
221	int data_in_this_rsp;
222	int remaining;
234	__u16 total_data_size, data_in_this_rsp;
223
224	if (pSMB->Command != SMB_COM_TRANSACTION2)
225		return 0;
@@ -233,8 +245,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
233
234	pSMBt = (struct smb_t2_rsp *)pSMB;
235
236	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
237	data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount);
248	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
249	data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
238
239	remaining = total_data_size - data_in_this_rsp;
240
@@ -260,21 +272,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
260{
261	struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
262	struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
263	int total_data_size;
264	int total_in_buf;
265	int remaining;
266	int total_in_buf2;
267	char *data_area_of_target;
268	char *data_area_of_buf2;
269	__u16 byte_count;
277	int remaining;
278	__u16 byte_count, total_data_size, total_in_buf, total_in_buf2;
270
271	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
280	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
272
273	if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
282	if (total_data_size !=
283	    get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
274		cFYI(1, "total data size of primary and secondary t2 differ");
275	}
276
277	total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
286	total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
278
279	remaining = total_data_size - total_in_buf;
280
@@ -284,28 +293,28 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
284	if (remaining == 0) /* nothing to do, ignore */
285		return 0;
286
287	total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
296	total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
288	if (remaining < total_in_buf2) {
289		cFYI(1, "transact2 2nd response contains too much data");
290	}
291
292	/* find end of first SMB data area */
293	data_area_of_target = (char *)&pSMBt->hdr.Protocol +
294				le16_to_cpu(pSMBt->t2_rsp.DataOffset);
303				get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
295	/* validate target area */
296
297	data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol +
306	data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
298				le16_to_cpu(pSMB2->t2_rsp.DataOffset);
307				get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
299
300	data_area_of_target += total_in_buf;
301
302	/* copy second buffer into end of first buffer */
303	memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
304	total_in_buf += total_in_buf2;
305	pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf);
314	put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
306	byte_count = le16_to_cpu(BCC_LE(pTargetSMB));
315	byte_count = get_bcc_le(pTargetSMB);
307	byte_count += total_in_buf2;
308	BCC_LE(pTargetSMB) = cpu_to_le16(byte_count);
317	put_bcc_le(byte_count, pTargetSMB);
309
310	byte_count = pTargetSMB->smb_buf_length;
311	byte_count += total_in_buf2;
@@ -319,7 +328,30 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
319		return 0; /* we are done */
320	} else /* more responses to go */
321		return 1;
331}
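
The le16_to_cpu() to get_unaligned_le16() conversions in this hunk matter because the t2_rsp counters sit at odd byte offsets inside the packed SMB response, where a plain 16-bit load can fault or silently misread on strict-alignment architectures. A self-contained illustration of the helper pair (generic kernel API, shown out of context):

#include <asm/unaligned.h>

/* bump a little-endian __u16 that may live at any byte offset */
static inline void le16_add_unaligned(void *field, u16 delta)
{
	u16 v = get_unaligned_le16(field);	/* safe unaligned LE read */

	put_unaligned_le16(v + delta, field);	/* safe unaligned LE write */
}
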
332
333static void
334cifs_echo_request(struct work_struct *work)
335{
336 int rc;
337 struct TCP_Server_Info *server = container_of(work,
338 struct TCP_Server_Info, echo.work);
339
340 /*
341 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done.
342 * Also, no need to ping if we got a response recently
343 */
344 if (server->tcpStatus != CifsGood ||
345 time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
346 goto requeue_echo;
322
348 rc = CIFSSMBEcho(server);
349 if (rc)
350 cFYI(1, "Unable to send echo request to server: %s",
351 server->hostname);
352
353requeue_echo:
354 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
323}
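
In cadence terms: cifs_echo_request() always re-arms itself every SMB_ECHO_INTERVAL, but only transmits when the connection is up and has been quiet, so a busy mount generates no extra traffic. A sketch restating the send/skip predicate in isolation (illustrative, not tree code):

static bool echo_due(struct TCP_Server_Info *server)
{
	/* the "- HZ" leaves a one-second slack so a reply racing the
	 * timer does not push the next ping a whole interval out */
	return server->tcpStatus == CifsGood &&
	       !time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ);
}
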
324
325static int
@@ -333,8 +365,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
333	struct msghdr smb_msg;
334	struct kvec iov;
335	struct socket *csocket = server->ssocket;
336	struct list_head *tmp;
368	struct list_head *tmp, *tmp2;
337	struct cifsSesInfo *ses;
338	struct task_struct *task_to_wake = NULL;
339	struct mid_q_entry *mid_entry;
340	char temp;
@@ -387,7 +418,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
387		smb_msg.msg_control = NULL;
388		smb_msg.msg_controllen = 0;
389		pdu_length = 4; /* enough to get RFC1001 header */
421
390incomplete_rcv:
423 if (echo_retries > 0 &&
424 time_after(jiffies, server->lstrp +
425 (echo_retries * SMB_ECHO_INTERVAL))) {
426 cERROR(1, "Server %s has not responded in %d seconds. "
427 "Reconnecting...", server->hostname,
428 (echo_retries * SMB_ECHO_INTERVAL / HZ));
429 cifs_reconnect(server);
430 csocket = server->ssocket;
431 wake_up(&server->response_q);
432 continue;
433 }
434
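
The watchdog above is plain jiffies arithmetic: the connection is declared dead after echo_retries unanswered echo intervals. With assumed values of echo_retries = 5 and the 60-second SMB_ECHO_INTERVAL defined earlier, that is roughly 300 seconds of silence:

/* assumed numbers, for illustration only */
unsigned long deadline = server->lstrp + 5 * SMB_ECHO_INTERVAL;

if (time_after(jiffies, deadline))	/* ~300 s with no response */
	cifs_reconnect(server);
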
391		length =
392		    kernel_recvmsg(csocket, &smb_msg,
393				&iov, 1, pdu_length, 0 /* BB other flags? */);
@@ -464,7 +508,7 @@ incomplete_rcv:
464			 * initialize frame)
465			 */
466			cifs_set_port((struct sockaddr *)
467					&server->addr.sockAddr, CIFS_PORT);
511					&server->dstaddr, CIFS_PORT);
468			cifs_reconnect(server);
469			csocket = server->ssocket;
470			wake_up(&server->response_q);
@@ -538,19 +582,20 @@ incomplete_rcv:
538		else if (reconnect == 1)
539			continue;
540
541		length += 4; /* account for rfc1002 hdr */
585		total_read += 4; /* account for rfc1002 hdr */
542
543
544		dump_smb(smb_buffer, length);
587		dump_smb(smb_buffer, total_read);
545		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) {
588		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read)) {
546			cifs_dump_mem("Bad SMB: ", smb_buffer, 48);
589			cifs_dump_mem("Bad SMB: ", smb_buffer,
590				      total_read < 48 ? total_read : 48);
547			continue;
548		}
549
594		mid_entry = NULL;
595		server->lstrp = jiffies;
550
551		task_to_wake = NULL;
552		spin_lock(&GlobalMid_Lock);
553		list_for_each(tmp, &server->pending_mid_q) {
598		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
554			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
555
556			if ((mid_entry->mid == smb_buffer->Mid) &&
@@ -591,20 +636,19 @@ incomplete_rcv:
591				mid_entry->resp_buf = smb_buffer;
592				mid_entry->largeBuf = isLargeBuf;
593multi_t2_fnd:
594				task_to_wake = mid_entry->tsk;
595				mid_entry->midState = MID_RESPONSE_RECEIVED;
596#ifdef CONFIG_CIFS_STATS2
597				mid_entry->when_received = jiffies;
598#endif
599				/* so we do not time out requests to server
600				which is still responding (since server could
601				be busy but not dead) */
602				server->lstrp = jiffies;
643				list_del_init(&mid_entry->qhead);
644				mid_entry->callback(mid_entry);
603				break;
604			}
647			mid_entry = NULL;
605		}
606		spin_unlock(&GlobalMid_Lock);
607		if (task_to_wake) {
650
651		if (mid_entry != NULL) {
608			/* Was previous buf put in mpx struct for multi-rsp? */
609			if (!isMultiRsp) {
610				/* smb buffer will be freed by user thread */
@@ -613,11 +657,10 @@ multi_t2_fnd:
613				else
614					smallbuf = NULL;
615			}
616			wake_up_process(task_to_wake);
617		} else if (!is_valid_oplock_break(smb_buffer, server) &&
618			   !isMultiRsp) {
619			cERROR(1, "No task to wake, unknown frame received! "
620				   "NumMids %d", midCount.counter);
663				   "NumMids %d", atomic_read(&midCount));
621			cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
622				      sizeof(struct smb_hdr));
623#ifdef CONFIG_CIFS_DEBUG2
@@ -629,9 +672,9 @@ multi_t2_fnd:
629	} /* end while !EXITING */
630
631	/* take it off the list, if it's not already */
632	write_lock(&cifs_tcp_ses_lock);
675	spin_lock(&cifs_tcp_ses_lock);
633	list_del_init(&server->tcp_ses_list);
634	write_unlock(&cifs_tcp_ses_lock);
677	spin_unlock(&cifs_tcp_ses_lock);
635
636	spin_lock(&GlobalMid_Lock);
637	server->tcpStatus = CifsExiting;
@@ -665,44 +708,16 @@ multi_t2_fnd:
665	if (smallbuf) /* no sense logging a debug message if NULL */
666		cifs_small_buf_release(smallbuf);
667
668	/*
711	if (!list_empty(&server->pending_mid_q)) {
669 * BB: we shouldn't have to do any of this. It shouldn't be
670 * possible to exit from the thread with active SMB sessions
671 */
672 read_lock(&cifs_tcp_ses_lock);
673 if (list_empty(&server->pending_mid_q)) {
674 /* loop through server session structures attached to this and
675 mark them dead */
676 list_for_each(tmp, &server->smb_ses_list) {
677 ses = list_entry(tmp, struct cifsSesInfo,
678 smb_ses_list);
679 ses->status = CifsExiting;
680 ses->server = NULL;
681 }
682 read_unlock(&cifs_tcp_ses_lock);
683 } else {
684 /* although we can not zero the server struct pointer yet,
685 since there are active requests which may depnd on them,
686 mark the corresponding SMB sessions as exiting too */
687 list_for_each(tmp, &server->smb_ses_list) {
688 ses = list_entry(tmp, struct cifsSesInfo,
689 smb_ses_list);
690 ses->status = CifsExiting;
691 }
692
693		spin_lock(&GlobalMid_Lock);
694		list_for_each(tmp, &server->pending_mid_q) {
713		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
695			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
696			if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
697				cFYI(1, "Clearing Mid 0x%x - waking up ",
715			cFYI(1, "Clearing Mid 0x%x - issuing callback",
698					 mid_entry->mid);
699				task_to_wake = mid_entry->tsk;
700				if (task_to_wake)
701					wake_up_process(task_to_wake);
702			}
717			list_del_init(&mid_entry->qhead);
718			mid_entry->callback(mid_entry);
703		}
704		spin_unlock(&GlobalMid_Lock);
705		read_unlock(&cifs_tcp_ses_lock);
706		/* 1/8th of sec is more than enough time for them to exit */
707		msleep(125);
@@ -720,18 +735,6 @@ multi_t2_fnd:
720	   coming home not much else we can do but free the memory */
721	}
722
723 /* last chance to mark ses pointers invalid
724 if there are any pointing to this (e.g
725 if a crazy root user tried to kill cifsd
726 kernel thread explicitly this might happen) */
727 /* BB: This shouldn't be necessary, see above */
728 read_lock(&cifs_tcp_ses_lock);
729 list_for_each(tmp, &server->smb_ses_list) {
730 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
731 ses->server = NULL;
732 }
733 read_unlock(&cifs_tcp_ses_lock);
734
735	kfree(server->hostname);
736	task_to_wake = xchg(&server->tsk, NULL);
737	kfree(server);
@@ -794,24 +797,21 @@ cifs_parse_mount_options(char *options, const char *devname,
794	short int override_gid = -1;
795	bool uid_specified = false;
796	bool gid_specified = false;
800	char *nodename = utsname()->nodename;
797
798	separator[0] = ',';
799	separator[1] = 0;
800
801	if (Local_System_Name[0] != 0)
802		memcpy(vol->source_rfc1001_name, Local_System_Name, 15);
803	else {
804		char *nodename = utsname()->nodename;
805		int n = strnlen(nodename, 15);
806		memset(vol->source_rfc1001_name, 0x20, 15);
807		for (i = 0; i < n; i++) {
808			/* does not have to be perfect mapping since field is
809			informational, only used for servers that do not support
810			port 445 and it can be overridden at mount time */
811			vol->source_rfc1001_name[i] = toupper(nodename[i]);
812		}
813	}
814	vol->source_rfc1001_name[15] = 0;
805	/*
806	 * does not have to be perfect mapping since field is
807	 * informational, only used for servers that do not support
808	 * port 445 and it can be overridden at mount time
809	 */
810	memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
811	for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
812		vol->source_rfc1001_name[i] = toupper(nodename[i]);
813
814	vol->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
815	/* null target name indicates to use *SMBSERVR default called name
816	   if we end up sending RFC1001 session initialize */
817	vol->target_rfc1001_name[0] = 0;
@@ -828,6 +828,8 @@ cifs_parse_mount_options(char *options, const char *devname,
828	/* default to using server inode numbers where available */
829	vol->server_ino = 1;
830
831	vol->actimeo = CIFS_DEF_ACTIMEO;
832
831	if (!options)
832		return 1;
833
@@ -973,13 +975,11 @@ cifs_parse_mount_options(char *options, const char *devname,
973				return 1;
974			} else if (strnicmp(value, "krb5", 4) == 0) {
975				vol->secFlg |= CIFSSEC_MAY_KRB5;
976#ifdef CONFIG_CIFS_EXPERIMENTAL
977			} else if (strnicmp(value, "ntlmsspi", 8) == 0) {
978				vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
979					CIFSSEC_MUST_SIGN;
980			} else if (strnicmp(value, "ntlmssp", 7) == 0) {
981				vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
982#endif
983			} else if (strnicmp(value, "ntlmv2i", 7) == 0) {
984				vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
985					CIFSSEC_MUST_SIGN;
@@ -1046,6 +1046,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1046					"long\n");
1047				return 1;
1048			}
1049 } else if (strnicmp(data, "srcaddr", 7) == 0) {
1050 vol->srcaddr.ss_family = AF_UNSPEC;
1051
1052 if (!value || !*value) {
1053 printk(KERN_WARNING "CIFS: srcaddr value"
1054 " not specified.\n");
1055 return 1; /* needs_arg; */
1056 }
1057 i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
1058 value, strlen(value));
1059 if (i == 0) {
1060 printk(KERN_WARNING "CIFS: Could not parse"
1061 " srcaddr: %s\n",
1062 value);
1063 return 1;
1064 }
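
The srcaddr= parsing above delegates all address-format work to cifs_convert_address(), which accepts both IPv4 and IPv6 text. A hedged sketch of the same call outside the option loop (the literal address is invented for illustration):

struct sockaddr_storage src;
const char *val = "192.168.1.50";	/* e.g. srcaddr=192.168.1.50 */

if (cifs_convert_address((struct sockaddr *)&src, val, strlen(val)) == 0)
	printk(KERN_WARNING "CIFS: Could not parse srcaddr: %s\n", val);
/* on success, src is stored in the vol and later handed to
 * bind_socket() before the TCP connect */
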
1049		} else if (strnicmp(data, "prefixpath", 10) == 0) {
1050			if (!value || !*value) {
1051				printk(KERN_WARNING
@@ -1088,6 +1104,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1088		} else if (!strnicmp(data, "uid", 3) && value && *value) {
1089			vol->linux_uid = simple_strtoul(value, &value, 0);
1090			uid_specified = true;
1107		} else if (!strnicmp(data, "cruid", 5) && value && *value) {
1108			vol->cred_uid = simple_strtoul(value, &value, 0);
1091		} else if (!strnicmp(data, "forceuid", 8)) {
1092			override_uid = 1;
1093		} else if (!strnicmp(data, "noforceuid", 10)) {
@@ -1140,22 +1158,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1140			if (!value || !*value || (*value == ' ')) {
1141				cFYI(1, "invalid (empty) netbiosname");
1142			} else {
1143				memset(vol->source_rfc1001_name, 0x20, 15);
1144				for (i = 0; i < 15; i++) {
1145				/* BB are there cases in which a comma can be
1146				valid in this workstation netbios name (and need
1147				special handling)? */
1148
1149				/* We do not uppercase netbiosname for user */
1161				memset(vol->source_rfc1001_name, 0x20,
1162					RFC1001_NAME_LEN);
1163				/*
1164				 * FIXME: are there cases in which a comma can
1165				 * be valid in workstation netbios name (and
1166				 * need special handling)?
1167				 */
1168				for (i = 0; i < RFC1001_NAME_LEN; i++) {
1169					/* don't ucase netbiosname for user */
1150					if (value[i] == 0)
1151						break;
1152					else
1153						vol->source_rfc1001_name[i] =
1154							value[i];
1172					vol->source_rfc1001_name[i] = value[i];
1155				}
1156				/* The string has 16th byte zero still from
1157				   set at top of the function  */
1158				if ((i == 15) && (value[i] != 0))
1176				if (i == RFC1001_NAME_LEN && value[i] != 0)
1159					printk(KERN_WARNING "CIFS: netbiosname"
1160						" longer than 15 truncated.\n");
1161			}
@@ -1165,7 +1183,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1165			cFYI(1, "empty server netbiosname specified");
1166		} else {
1167			/* last byte, type, is 0x20 for servr type */
1168			memset(vol->target_rfc1001_name, 0x20, 16);
1186			memset(vol->target_rfc1001_name, 0x20,
1187				RFC1001_NAME_LEN_WITH_NULL);
1169
1170			for (i = 0; i < 15; i++) {
1171				/* BB are there cases in which a comma can be
@@ -1182,10 +1201,20 @@ cifs_parse_mount_options(char *options, const char *devname,
1182			}
1183			/* The string has 16th byte zero still from
1184			   set at top of the function  */
1185			if ((i == 15) && (value[i] != 0))
1204			if (i == RFC1001_NAME_LEN && value[i] != 0)
1186				printk(KERN_WARNING "CIFS: server net"
1187				"biosname longer than 15 truncated.\n");
1188		}
1208 } else if (strnicmp(data, "actimeo", 7) == 0) {
1209 if (value && *value) {
1210 vol->actimeo = HZ * simple_strtoul(value,
1211 &value, 0);
1212 if (vol->actimeo > CIFS_MAX_ACTIMEO) {
1213				cERROR(1, "CIFS: attribute cache "
1214					"timeout too large");
1215 return 1;
1216 }
1217 }
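
actimeo= is taken in whole seconds and stored scaled to jiffies, capped by CIFS_MAX_ACTIMEO; a small worked example (the option value is illustrative):

/* "actimeo=30" arrives as the string "30" */
unsigned long actimeo = HZ * simple_strtoul("30", NULL, 0);	/* 30 s */

/* cached attributes then count as fresh while
 * time_in_range(jiffies, stamp, stamp + actimeo) holds */
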
1189		} else if (strnicmp(data, "credentials", 4) == 0) {
1190			/* ignore */
1191		} else if (strnicmp(data, "version", 3) == 0) {
@@ -1303,10 +1332,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1303			vol->no_psx_acl = 0;
1304		} else if (strnicmp(data, "noacl", 5) == 0) {
1305			vol->no_psx_acl = 1;
1306#ifdef CONFIG_CIFS_EXPERIMENTAL
1307		} else if (strnicmp(data, "locallease", 6) == 0) {
1308			vol->local_lease = 1;
1309#endif
1310		} else if (strnicmp(data, "sign", 4) == 0) {
1311			vol->secFlg |= CIFSSEC_MUST_SIGN;
1312		} else if (strnicmp(data, "seal", 4) == 0) {
@@ -1319,12 +1346,23 @@ cifs_parse_mount_options(char *options, const char *devname,
1319			vol->direct_io = 1;
1320		} else if (strnicmp(data, "forcedirectio", 13) == 0) {
1321			vol->direct_io = 1;
1349		} else if (strnicmp(data, "strictcache", 11) == 0) {
1350			vol->strict_io = 1;
1322		} else if (strnicmp(data, "noac", 4) == 0) {
1323			printk(KERN_WARNING "CIFS: Mount option noac not "
1324				"supported. Instead set "
1325				"/proc/fs/cifs/LookupCacheEnabled to 0\n");
1326		} else if (strnicmp(data, "fsc", 3) == 0) {
1356#ifndef CONFIG_CIFS_FSCACHE
1357			cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE "
1358				  "kernel config option set");
1359			return 1;
1360#endif
1327			vol->fsc = true;
1362		} else if (strnicmp(data, "mfsymlinks", 10) == 0) {
1363			vol->mfsymlinks = true;
1364		} else if (strnicmp(data, "multiuser", 8) == 0) {
1365			vol->multiuser = true;
1328		} else
1329			printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
1330				data);
@@ -1356,6 +1394,13 @@ cifs_parse_mount_options(char *options, const char *devname,
1356			return 1;
1357		}
1358	}
1397
1398 if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
1399 cERROR(1, "Multiuser mounts currently require krb5 "
1400 "authentication!");
1401 return 1;
1402 }
1403
1359	if (vol->UNCip == NULL)
1360		vol->UNCip = &vol->UNC[2];
1361
@@ -1374,33 +1419,100 @@ cifs_parse_mount_options(char *options, const char *devname,
1374	return 0;
1375}
1376
1422/** Returns true if srcaddr isn't specified and rhs isn't
1423 * specified, or if srcaddr is specified and
1424 * matches the IP address of the rhs argument.
1425 */
1426static bool
1427srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1428{
1429 switch (srcaddr->sa_family) {
1430 case AF_UNSPEC:
1431 return (rhs->sa_family == AF_UNSPEC);
1432 case AF_INET: {
1433 struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
1434 struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
1435 return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
1436 }
1437 case AF_INET6: {
1438 struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
1439		struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
1440 return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
1441 }
1442 default:
1443 WARN_ON(1);
1444 return false; /* don't expect to be here */
1445 }
1446}
1447
1448/*
1449 * If no port is specified in addr structure, we try to match with 445 port
1450 * and if it fails - with 139 ports. It should be called only if address
1451 * families of server and addr are equal.
1452 */
1377static bool
1378match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
1454match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
1379{
1380	struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1381	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1456	unsigned short int port, *sport;
1382
1383	switch (addr->sa_family) {
1384	case AF_INET:
1385		if (addr4->sin_addr.s_addr !=
1386		    server->addr.sockAddr.sin_addr.s_addr)
1387			return false;
1388		if (addr4->sin_port &&
1389		    addr4->sin_port != server->addr.sockAddr.sin_port)
1390			return false;
1460		sport = &((struct sockaddr_in *) &server->dstaddr)->sin_port;
1461		port = ((struct sockaddr_in *) addr)->sin_port;
1391		break;
1392	case AF_INET6:
1393		if (!ipv6_addr_equal(&addr6->sin6_addr,
1394				     &server->addr.sockAddr6.sin6_addr))
1464		sport = &((struct sockaddr_in6 *) &server->dstaddr)->sin6_port;
1465		port = ((struct sockaddr_in6 *) addr)->sin6_port;
1466 break;
1467 default:
1468 WARN_ON(1);
1469 return false;
1470 }
1471
1472 if (!port) {
1473 port = htons(CIFS_PORT);
1474 if (port == *sport)
1475 return true;
1476
1477 port = htons(RFC1001_PORT);
1478 }
1479
1480 return port == *sport;
1481}
1482
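
A worked case for the fallback order in match_port(): a request that names no port matches an existing connection on 445 first, then one on 139; any explicit port must match exactly. Restated as a standalone predicate (illustration, not tree code):

static bool port_would_match(__be16 requested, __be16 existing)
{
	if (!requested)	/* unspecified: try the defaults in order */
		return existing == htons(445) || existing == htons(139);
	return requested == existing;
}
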
1483static bool
1484match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
1485 struct sockaddr *srcaddr)
1486{
1487 switch (addr->sa_family) {
1488 case AF_INET: {
1489 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1490 struct sockaddr_in *srv_addr4 =
1491 (struct sockaddr_in *)&server->dstaddr;
1492
1493 if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
1395			return false;
1396		if (addr6->sin6_scope_id !=
1397		    server->addr.sockAddr6.sin6_scope_id)
1495		break;
1496	}
1497 case AF_INET6: {
1498 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
1499 struct sockaddr_in6 *srv_addr6 =
1500 (struct sockaddr_in6 *)&server->dstaddr;
1501
1502 if (!ipv6_addr_equal(&addr6->sin6_addr,
1503 &srv_addr6->sin6_addr))
1398			return false;
1399		if (addr6->sin6_port &&
1400		    addr6->sin6_port != server->addr.sockAddr6.sin6_port)
1505		if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
1401			return false;
1402		break;
1403	}
1509 default:
1510 WARN_ON(1);
1511 return false; /* don't expect to be here */
1512 }
1513
1514 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
1515 return false;
1404
1405	return true;
1406}
@@ -1458,29 +1570,27 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1458{
1459	struct TCP_Server_Info *server;
1460
1461	write_lock(&cifs_tcp_ses_lock);
1573	spin_lock(&cifs_tcp_ses_lock);
1462	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1463		/*
1575		if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
1464 * the demux thread can exit on its own while still in CifsNew
1465 * so don't accept any sockets in that state. Since the
1466 * tcpStatus never changes back to CifsNew it's safe to check
1467 * for this without a lock.
1468 */
1469 if (server->tcpStatus == CifsNew)
1470			continue;
1471
1472		if (!match_address(server, addr))
1578		if (!match_address(server, addr,
1579 (struct sockaddr *)&vol->srcaddr))
1580 continue;
1581
1582 if (!match_port(server, addr))
1473			continue;
1474
1475		if (!match_security(server, vol))
1476			continue;
1477
1478		++server->srv_count;
1479		write_unlock(&cifs_tcp_ses_lock);
1589		spin_unlock(&cifs_tcp_ses_lock);
1480		cFYI(1, "Existing tcp session with server found");
1481		return server;
1482	}
1483	write_unlock(&cifs_tcp_ses_lock);
1593	spin_unlock(&cifs_tcp_ses_lock);
1484	return NULL;
1485}
1486
@@ -1489,21 +1599,30 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1489{
1490	struct task_struct *task;
1491
1492	write_lock(&cifs_tcp_ses_lock);
1602	spin_lock(&cifs_tcp_ses_lock);
1493	if (--server->srv_count > 0) {
1494		write_unlock(&cifs_tcp_ses_lock);
1604		spin_unlock(&cifs_tcp_ses_lock);
1495		return;
1496	}
1497
1608	put_net(cifs_net_ns(server));
1609
1498	list_del_init(&server->tcp_ses_list);
1499	write_unlock(&cifs_tcp_ses_lock);
1611	spin_unlock(&cifs_tcp_ses_lock);
1612
1613	cancel_delayed_work_sync(&server->echo);
1500
1501	spin_lock(&GlobalMid_Lock);
1502	server->tcpStatus = CifsExiting;
1503	spin_unlock(&GlobalMid_Lock);
1504
1619	cifs_crypto_shash_release(server);
1505	cifs_fscache_release_client_cookie(server);
1506
1622	kfree(server->session_key.response);
1623	server->session_key.response = NULL;
1624	server->session_key.len = 0;
1625
1507	task = xchg(&server->tsk, NULL);
1508	if (task)
1509		force_sig(SIGKILL, task);
@@ -1556,10 +1675,17 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1556		goto out_err;
1557	}
1558
1678 rc = cifs_crypto_shash_allocate(tcp_ses);
1679 if (rc) {
1680 cERROR(1, "could not setup hash structures rc %d", rc);
1681 goto out_err;
1682 }
1683
1684 cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
1559	tcp_ses->hostname = extract_hostname(volume_info->UNC);
1560	if (IS_ERR(tcp_ses->hostname)) {
1561		rc = PTR_ERR(tcp_ses->hostname);
1562		goto out_err;
1688		goto out_err_crypto_release;
1563	}
1564
1565	tcp_ses->noblocksnd = volume_info->noblocksnd;
@@ -1574,9 +1700,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1574		volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1575	memcpy(tcp_ses->server_RFC1001_name,
1576		volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1703	tcp_ses->session_estab = false;
1577	tcp_ses->sequence_number = 0;
1705	tcp_ses->lstrp = jiffies;
1578	INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1579	INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
1708	INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
1580
1581	/*
1582	 * at this point we are the only ones with the pointer
@@ -1584,23 +1713,24 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1584	 * no need to spinlock this init of tcpStatus or srv_count
1585	 */
1586	tcp_ses->tcpStatus = CifsNew;
1716	memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
1717	       sizeof(tcp_ses->srcaddr));
1587	++tcp_ses->srv_count;
1588
1589	if (addr.ss_family == AF_INET6) {
1590		cFYI(1, "attempting ipv6 connect");
1591		/* BB should we allow ipv6 on port 139? */
1592		/* other OS never observed in Wild doing 139 with v6 */
1593		memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
1594			sizeof(struct sockaddr_in6));
1595		rc = ipv6_connect(tcp_ses);
1596	} else {
1597		memcpy(&tcp_ses->addr.sockAddr, sin_server,
1598			sizeof(struct sockaddr_in));
1599		rc = ipv4_connect(tcp_ses);
1600	}
1724		memcpy(&tcp_ses->dstaddr, sin_server6,
1725			sizeof(struct sockaddr_in6));
1726	} else
1727		memcpy(&tcp_ses->dstaddr, sin_server,
1728			sizeof(struct sockaddr_in));
1729
1730	rc = ip_connect(tcp_ses);
1601	if (rc < 0) {
1602		cERROR(1, "Error connecting to socket. Aborting operation");
1603		goto out_err;
1733		goto out_err_crypto_release;
1604	}
1605
1606	/*
@@ -1614,18 +1744,26 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1614		rc = PTR_ERR(tcp_ses->tsk);
1615		cERROR(1, "error %d create cifsd thread", rc);
1616		module_put(THIS_MODULE);
1617		goto out_err;
1747		goto out_err_crypto_release;
1618	}
1619
1620	/* thread spawned, put it on the list */
1621	write_lock(&cifs_tcp_ses_lock);
1751	spin_lock(&cifs_tcp_ses_lock);
1622	list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
1623	write_unlock(&cifs_tcp_ses_lock);
1753	spin_unlock(&cifs_tcp_ses_lock);
1624
1625	cifs_fscache_get_client_cookie(tcp_ses);
1626
1757	/* queue echo request delayed work */
1758	queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
1759
1627	return tcp_ses;
1628
1762out_err_crypto_release:
1763	cifs_crypto_shash_release(tcp_ses);
1764
1765	put_net(cifs_net_ns(tcp_ses));
1766
1629out_err:
1630	if (tcp_ses) {
1631		if (!IS_ERR(tcp_ses->hostname))
@@ -1642,7 +1780,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1642{
1643	struct cifsSesInfo *ses;
1644
1645	write_lock(&cifs_tcp_ses_lock);
1783	spin_lock(&cifs_tcp_ses_lock);
1646	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
1647		switch (server->secType) {
1648		case Kerberos:
@@ -1662,10 +1800,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1662			continue;
1663		}
1664		++ses->ses_count;
1665		write_unlock(&cifs_tcp_ses_lock);
1803		spin_unlock(&cifs_tcp_ses_lock);
1666		return ses;
1667	}
1668	write_unlock(&cifs_tcp_ses_lock);
1806	spin_unlock(&cifs_tcp_ses_lock);
1669	return NULL;
1670}
1671
@@ -1676,14 +1814,14 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1676	struct TCP_Server_Info *server = ses->server;
1677
1678	cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
1679	write_lock(&cifs_tcp_ses_lock);
1817	spin_lock(&cifs_tcp_ses_lock);
1680	if (--ses->ses_count > 0) {
1681		write_unlock(&cifs_tcp_ses_lock);
1819		spin_unlock(&cifs_tcp_ses_lock);
1682		return;
1683	}
1684
1685	list_del_init(&ses->smb_ses_list);
1686	write_unlock(&cifs_tcp_ses_lock);
1824	spin_unlock(&cifs_tcp_ses_lock);
1687
1688	if (ses->status == CifsGood) {
1689		xid = GetXid();
@@ -1699,6 +1837,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1699{
1700	int rc = -ENOMEM, xid;
1701	struct cifsSesInfo *ses;
1840	struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
1841	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
1702
1703	xid = GetXid();
1704
@@ -1742,12 +1882,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1742
1743	/* new SMB session uses our server ref */
1744	ses->server = server;
1745	if (server->addr.sockAddr6.sin6_family == AF_INET6)
1746		sprintf(ses->serverName, "%pI6",
1747			&server->addr.sockAddr6.sin6_addr);
1885	if (server->dstaddr.ss_family == AF_INET6)
1886		sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
1748	else
1749		sprintf(ses->serverName, "%pI4",
1750			&server->addr.sockAddr.sin_addr.s_addr);
1888		sprintf(ses->serverName, "%pI4", &addr->sin_addr);
1751
1752	if (volume_info->username)
1753		strncpy(ses->userName, volume_info->username,
@@ -1760,10 +1898,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1760		goto get_ses_fail;
1761	}
1762	if (volume_info->domainname) {
1763		int len = strlen(volume_info->domainname);
1764		ses->domainName = kmalloc(len + 1, GFP_KERNEL);
1765		if (ses->domainName)
1766			strcpy(ses->domainName, volume_info->domainname);
1901		ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL);
1902		if (!ses->domainName)
1903			goto get_ses_fail;
1767	}
1768	ses->cred_uid = volume_info->cred_uid;
1769	ses->linux_uid = volume_info->linux_uid;
@@ -1778,9 +1915,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1778		goto get_ses_fail;
1779
1780	/* success, put it on the list */
1781	write_lock(&cifs_tcp_ses_lock);
1918	spin_lock(&cifs_tcp_ses_lock);
1782	list_add(&ses->smb_ses_list, &server->smb_ses_list);
1783	write_unlock(&cifs_tcp_ses_lock);
1920	spin_unlock(&cifs_tcp_ses_lock);
1784
1785	FreeXid(xid);
1786	return ses;
@@ -1797,7 +1934,7 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1797	struct list_head *tmp;
1798	struct cifsTconInfo *tcon;
1799
1800	write_lock(&cifs_tcp_ses_lock);
1937	spin_lock(&cifs_tcp_ses_lock);
1801	list_for_each(tmp, &ses->tcon_list) {
1802		tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
1803		if (tcon->tidStatus == CifsExiting)
@@ -1806,10 +1943,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1806			continue;
1807
1808		++tcon->tc_count;
1809		write_unlock(&cifs_tcp_ses_lock);
1946		spin_unlock(&cifs_tcp_ses_lock);
1810		return tcon;
1811	}
1812	write_unlock(&cifs_tcp_ses_lock);
1949	spin_unlock(&cifs_tcp_ses_lock);
1813	return NULL;
1814}
1815
@@ -1820,14 +1957,14 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1820	struct cifsSesInfo *ses = tcon->ses;
1821
1822	cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
1823	write_lock(&cifs_tcp_ses_lock);
1960	spin_lock(&cifs_tcp_ses_lock);
1824	if (--tcon->tc_count > 0) {
1825		write_unlock(&cifs_tcp_ses_lock);
1962		spin_unlock(&cifs_tcp_ses_lock);
1826		return;
1827	}
1828
1829	list_del_init(&tcon->tcon_list);
1830	write_unlock(&cifs_tcp_ses_lock);
1967	spin_unlock(&cifs_tcp_ses_lock);
1831
1832	xid = GetXid();
1833	CIFSSMBTDis(xid, tcon);
@@ -1900,9 +2037,9 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
1900	tcon->nocase = volume_info->nocase;
1901	tcon->local_lease = volume_info->local_lease;
1902
1903	write_lock(&cifs_tcp_ses_lock);
2040	spin_lock(&cifs_tcp_ses_lock);
1904	list_add(&tcon->tcon_list, &ses->tcon_list);
1905	write_unlock(&cifs_tcp_ses_lock);
2042	spin_unlock(&cifs_tcp_ses_lock);
1906
1907	cifs_fscache_get_super_cookie(tcon);
1908
@@ -1913,6 +2050,23 @@ out_fail:
1913	return ERR_PTR(rc);
1914}
1915
2053void
2054cifs_put_tlink(struct tcon_link *tlink)
2055{
2056 if (!tlink || IS_ERR(tlink))
2057 return;
2058
2059 if (!atomic_dec_and_test(&tlink->tl_count) ||
2060 test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) {
2061 tlink->tl_time = jiffies;
2062 return;
2063 }
2064
2065 if (!IS_ERR(tlink_tcon(tlink)))
2066 cifs_put_tcon(tlink_tcon(tlink));
2067 kfree(tlink);
2068 return;
2069}
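
cifs_put_tlink() drops a reference but keeps any tlink still in the tree alive, merely stamping tl_time; reclaiming it is left to the cifs_prune_tlinks() worker declared earlier. A sketch of the expiry test that pruner presumably applies, built only from the constants above (an assumption, since its body is not in this hunk):

/* assumed pruning predicate, derived from TLINK_IDLE_EXPIRE */
static bool tlink_idle_expired(struct tcon_link *tlink)
{
	return atomic_read(&tlink->tl_count) == 0 &&
	       time_after(jiffies, tlink->tl_time + TLINK_IDLE_EXPIRE);
}
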
1916
1917int
1918get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
@@ -1997,21 +2151,135 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
1997 2151
1998} 2152}
1999 2153
2154static int
2155bind_socket(struct TCP_Server_Info *server)
2156{
2157 int rc = 0;
2158 if (server->srcaddr.ss_family != AF_UNSPEC) {
2159 /* Bind to the specified local IP address */
2160 struct socket *socket = server->ssocket;
2161 rc = socket->ops->bind(socket,
2162 (struct sockaddr *) &server->srcaddr,
2163 sizeof(server->srcaddr));
2164 if (rc < 0) {
2165 struct sockaddr_in *saddr4;
2166 struct sockaddr_in6 *saddr6;
2167 saddr4 = (struct sockaddr_in *)&server->srcaddr;
2168 saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
2169 if (saddr6->sin6_family == AF_INET6)
2170 cERROR(1, "cifs: "
2171 "Failed to bind to: %pI6c, error: %d\n",
2172 &saddr6->sin6_addr, rc);
2173 else
2174 cERROR(1, "cifs: "
2175 "Failed to bind to: %pI4, error: %d\n",
2176 &saddr4->sin_addr.s_addr, rc);
2177 }
2178 }
2179 return rc;
2180}
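
bind_socket() exists so a mount can pin its outgoing traffic to a chosen local address: binding the client socket before connect selects the source interface, and AF_UNSPEC means "no preference". A userspace analogue under the same convention, with error handling trimmed:

/* Userspace analogue of bind_socket(): bind the client socket to the
 * caller-supplied source address before connecting. */
#include <sys/socket.h>
#include <unistd.h>

int bound_tcp_socket(int family, const struct sockaddr *src, socklen_t slen)
{
	int fd = socket(family, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	if (src != NULL && src->sa_family != AF_UNSPEC &&
	    bind(fd, src, slen) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* caller connect()s as usual */
}
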
2000 2181
2001static int 2182static int
2002ipv4_connect(struct TCP_Server_Info *server) 2183ip_rfc1001_connect(struct TCP_Server_Info *server)
2003{ 2184{
2004 int rc = 0; 2185 int rc = 0;
2005 int val; 2186 /*
2006 bool connected = false; 2187 * some servers require RFC1001 sessinit before sending
2007 __be16 orig_port = 0; 2188 * negprot - BB check reconnection in case where second
2189 * sessinit is sent but no second negprot
2190 */
2191 struct rfc1002_session_packet *ses_init_buf;
2192 struct smb_hdr *smb_buf;
2193 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2194 GFP_KERNEL);
2195 if (ses_init_buf) {
2196 ses_init_buf->trailer.session_req.called_len = 32;
2197
2198 if (server->server_RFC1001_name &&
2199 server->server_RFC1001_name[0] != 0)
2200 rfc1002mangle(ses_init_buf->trailer.
2201 session_req.called_name,
2202 server->server_RFC1001_name,
2203 RFC1001_NAME_LEN_WITH_NULL);
2204 else
2205 rfc1002mangle(ses_init_buf->trailer.
2206 session_req.called_name,
2207 DEFAULT_CIFS_CALLED_NAME,
2208 RFC1001_NAME_LEN_WITH_NULL);
2209
2210 ses_init_buf->trailer.session_req.calling_len = 32;
2211
2212 /*
2213 * calling name ends in null (byte 16) from old smb
2214 * convention.
2215 */
2216 if (server->workstation_RFC1001_name &&
2217 server->workstation_RFC1001_name[0] != 0)
2218 rfc1002mangle(ses_init_buf->trailer.
2219 session_req.calling_name,
2220 server->workstation_RFC1001_name,
2221 RFC1001_NAME_LEN_WITH_NULL);
2222 else
2223 rfc1002mangle(ses_init_buf->trailer.
2224 session_req.calling_name,
2225 "LINUX_CIFS_CLNT",
2226 RFC1001_NAME_LEN_WITH_NULL);
2227
2228 ses_init_buf->trailer.session_req.scope1 = 0;
2229 ses_init_buf->trailer.session_req.scope2 = 0;
2230 smb_buf = (struct smb_hdr *)ses_init_buf;
2231
2232 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2233 smb_buf->smb_buf_length = 0x81000044;
2234 rc = smb_send(server, smb_buf, 0x44);
2235 kfree(ses_init_buf);
2236 /*
 2237 * The RFC1001 layer in at least one server
 2238 * requires a very short break before negprot,
 2239 * presumably because it does not expect negprot
 2240 * to follow so quickly. This is a simple
 2241 * solution that works without
 2242 * complicating the code and causes no
 2243 * significant slowdown on mount
 2244 * for everyone else.
2245 */
2246 usleep_range(1000, 2000);
2247 }
2248 /*
2249 * else the negprot may still work without this
2250 * even though malloc failed
2251 */
2252
2253 return rc;
2254}
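
The magic number 0x81000044 stored into smb_buf_length above is the 4-byte RFC 1002 session-service header as this code lays it out: packet type 0x81 (SESSION REQUEST) in the top octet and the payload length in the low bits, where 0x44 == 68 bytes covers the two 34-byte mangled NetBIOS names with no scope. A sketch of composing it explicitly rather than as a literal:

/* RFC 1002 session-request header composed field by field. */
#include <assert.h>
#include <stdint.h>

static uint32_t rfc1002_session_req_hdr(uint16_t payload_len)
{
	return ((uint32_t)0x81 << 24) | payload_len;
}

int main(void)
{
	assert(rfc1002_session_req_hdr(0x44) == 0x81000044);
	return 0;
}
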
2255
2256static int
2257generic_ip_connect(struct TCP_Server_Info *server)
2258{
2259 int rc = 0;
2260 unsigned short int sport;
2261 int slen, sfamily;
2008 struct socket *socket = server->ssocket; 2262 struct socket *socket = server->ssocket;
2263 struct sockaddr *saddr;
2264
2265 saddr = (struct sockaddr *) &server->dstaddr;
2266
2267 if (server->dstaddr.ss_family == AF_INET6) {
2268 sport = ((struct sockaddr_in6 *) saddr)->sin6_port;
2269 slen = sizeof(struct sockaddr_in6);
2270 sfamily = AF_INET6;
2271 } else {
2272 sport = ((struct sockaddr_in *) saddr)->sin_port;
2273 slen = sizeof(struct sockaddr_in);
2274 sfamily = AF_INET;
2275 }
2009 2276
2010 if (socket == NULL) { 2277 if (socket == NULL) {
2011 rc = sock_create_kern(PF_INET, SOCK_STREAM, 2278 rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
2012 IPPROTO_TCP, &socket); 2279 IPPROTO_TCP, &socket, 1);
2013 if (rc < 0) { 2280 if (rc < 0) {
2014 cERROR(1, "Error %d creating socket", rc); 2281 cERROR(1, "Error %d creating socket", rc);
2282 server->ssocket = NULL;
2015 return rc; 2283 return rc;
2016 } 2284 }
2017 2285
@@ -2019,59 +2287,28 @@ ipv4_connect(struct TCP_Server_Info *server)
2019 cFYI(1, "Socket created"); 2287 cFYI(1, "Socket created");
2020 server->ssocket = socket; 2288 server->ssocket = socket;
2021 socket->sk->sk_allocation = GFP_NOFS; 2289 socket->sk->sk_allocation = GFP_NOFS;
2022 cifs_reclassify_socket4(socket); 2290 if (sfamily == AF_INET6)
2291 cifs_reclassify_socket6(socket);
2292 else
2293 cifs_reclassify_socket4(socket);
2023 } 2294 }
2024 2295
2025 /* user overrode default port */ 2296 rc = bind_socket(server);
2026 if (server->addr.sockAddr.sin_port) { 2297 if (rc < 0)
2027 rc = socket->ops->connect(socket, (struct sockaddr *) 2298 return rc;
2028 &server->addr.sockAddr,
2029 sizeof(struct sockaddr_in), 0);
2030 if (rc >= 0)
2031 connected = true;
2032 }
2033
2034 if (!connected) {
2035 /* save original port so we can retry user specified port
2036 later if fall back ports fail this time */
2037 orig_port = server->addr.sockAddr.sin_port;
2038
2039 /* do not retry on the same port we just failed on */
2040 if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) {
2041 server->addr.sockAddr.sin_port = htons(CIFS_PORT);
2042 rc = socket->ops->connect(socket,
2043 (struct sockaddr *)
2044 &server->addr.sockAddr,
2045 sizeof(struct sockaddr_in), 0);
2046 if (rc >= 0)
2047 connected = true;
2048 }
2049 }
2050 if (!connected) {
2051 server->addr.sockAddr.sin_port = htons(RFC1001_PORT);
2052 rc = socket->ops->connect(socket, (struct sockaddr *)
2053 &server->addr.sockAddr,
2054 sizeof(struct sockaddr_in), 0);
2055 if (rc >= 0)
2056 connected = true;
2057 }
2058 2299
2059 /* give up here - unless we want to retry on different 2300 rc = socket->ops->connect(socket, saddr, slen, 0);
2060 protocol families some day */ 2301 if (rc < 0) {
2061 if (!connected) { 2302 cFYI(1, "Error %d connecting to server", rc);
2062 if (orig_port)
2063 server->addr.sockAddr.sin_port = orig_port;
2064 cFYI(1, "Error %d connecting to server via ipv4", rc);
2065 sock_release(socket); 2303 sock_release(socket);
2066 server->ssocket = NULL; 2304 server->ssocket = NULL;
2067 return rc; 2305 return rc;
2068 } 2306 }
2069 2307
2070
2071 /* 2308 /*
2072 * Eventually check for other socket options to change from 2309 * Eventually check for other socket options to change from
2073 * the default. sock_setsockopt not used because it expects 2310 * the default. sock_setsockopt not used because it expects
2074 * user space buffer 2311 * user space buffer
2075 */ 2312 */
2076 socket->sk->sk_rcvtimeo = 7 * HZ; 2313 socket->sk->sk_rcvtimeo = 7 * HZ;
2077 socket->sk->sk_sndtimeo = 5 * HZ; 2314 socket->sk->sk_sndtimeo = 5 * HZ;
@@ -2085,7 +2322,7 @@ ipv4_connect(struct TCP_Server_Info *server)
2085 } 2322 }
2086 2323
2087 if (server->tcp_nodelay) { 2324 if (server->tcp_nodelay) {
2088 val = 1; 2325 int val = 1;
2089 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2326 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2090 (char *)&val, sizeof(val)); 2327 (char *)&val, sizeof(val));
2091 if (rc) 2328 if (rc)
@@ -2096,157 +2333,39 @@ ipv4_connect(struct TCP_Server_Info *server)
2096 socket->sk->sk_sndbuf, 2333 socket->sk->sk_sndbuf,
2097 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); 2334 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
2098 2335
2099 /* send RFC1001 sessinit */ 2336 if (sport == htons(RFC1001_PORT))
2100 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { 2337 rc = ip_rfc1001_connect(server);
2101 /* some servers require RFC1001 sessinit before sending
2102 negprot - BB check reconnection in case where second
2103 sessinit is sent but no second negprot */
2104 struct rfc1002_session_packet *ses_init_buf;
2105 struct smb_hdr *smb_buf;
2106 ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
2107 GFP_KERNEL);
2108 if (ses_init_buf) {
2109 ses_init_buf->trailer.session_req.called_len = 32;
2110 if (server->server_RFC1001_name &&
2111 server->server_RFC1001_name[0] != 0)
2112 rfc1002mangle(ses_init_buf->trailer.
2113 session_req.called_name,
2114 server->server_RFC1001_name,
2115 RFC1001_NAME_LEN_WITH_NULL);
2116 else
2117 rfc1002mangle(ses_init_buf->trailer.
2118 session_req.called_name,
2119 DEFAULT_CIFS_CALLED_NAME,
2120 RFC1001_NAME_LEN_WITH_NULL);
2121
2122 ses_init_buf->trailer.session_req.calling_len = 32;
2123
2124 /* calling name ends in null (byte 16) from old smb
2125 convention. */
2126 if (server->workstation_RFC1001_name &&
2127 server->workstation_RFC1001_name[0] != 0)
2128 rfc1002mangle(ses_init_buf->trailer.
2129 session_req.calling_name,
2130 server->workstation_RFC1001_name,
2131 RFC1001_NAME_LEN_WITH_NULL);
2132 else
2133 rfc1002mangle(ses_init_buf->trailer.
2134 session_req.calling_name,
2135 "LINUX_CIFS_CLNT",
2136 RFC1001_NAME_LEN_WITH_NULL);
2137
2138 ses_init_buf->trailer.session_req.scope1 = 0;
2139 ses_init_buf->trailer.session_req.scope2 = 0;
2140 smb_buf = (struct smb_hdr *)ses_init_buf;
2141 /* sizeof RFC1002_SESSION_REQUEST with no scope */
2142 smb_buf->smb_buf_length = 0x81000044;
2143 rc = smb_send(server, smb_buf, 0x44);
2144 kfree(ses_init_buf);
2145 msleep(1); /* RFC1001 layer in at least one server
2146 requires very short break before negprot
2147 presumably because not expecting negprot
2148 to follow so fast. This is a simple
2149 solution that works without
2150 complicating the code and causes no
2151 significant slowing down on mount
2152 for everyone else */
2153 }
2154 /* else the negprot may still work without this
2155 even though malloc failed */
2156
2157 }
2158 2338
2159 return rc; 2339 return rc;
2160} 2340}
2161 2341
2162static int 2342static int
2163ipv6_connect(struct TCP_Server_Info *server) 2343ip_connect(struct TCP_Server_Info *server)
2164{ 2344{
2165 int rc = 0; 2345 unsigned short int *sport;
2166 int val; 2346 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
2167 bool connected = false; 2347 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
2168 __be16 orig_port = 0;
2169 struct socket *socket = server->ssocket;
2170 2348
2171 if (socket == NULL) { 2349 if (server->dstaddr.ss_family == AF_INET6)
2172 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 2350 sport = &addr6->sin6_port;
2173 IPPROTO_TCP, &socket); 2351 else
2174 if (rc < 0) { 2352 sport = &addr->sin_port;
2175 cERROR(1, "Error %d creating ipv6 socket", rc);
2176 socket = NULL;
2177 return rc;
2178 }
2179
2180 /* BB other socket options to set KEEPALIVE, NODELAY? */
2181 cFYI(1, "ipv6 Socket created");
2182 server->ssocket = socket;
2183 socket->sk->sk_allocation = GFP_NOFS;
2184 cifs_reclassify_socket6(socket);
2185 }
2186 2353
2187 /* user overrode default port */ 2354 if (*sport == 0) {
2188 if (server->addr.sockAddr6.sin6_port) { 2355 int rc;
2189 rc = socket->ops->connect(socket,
2190 (struct sockaddr *) &server->addr.sockAddr6,
2191 sizeof(struct sockaddr_in6), 0);
2192 if (rc >= 0)
2193 connected = true;
2194 }
2195
2196 if (!connected) {
2197 /* save original port so we can retry user specified port
2198 later if fall back ports fail this time */
2199
2200 orig_port = server->addr.sockAddr6.sin6_port;
2201 /* do not retry on the same port we just failed on */
2202 if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) {
2203 server->addr.sockAddr6.sin6_port = htons(CIFS_PORT);
2204 rc = socket->ops->connect(socket, (struct sockaddr *)
2205 &server->addr.sockAddr6,
2206 sizeof(struct sockaddr_in6), 0);
2207 if (rc >= 0)
2208 connected = true;
2209 }
2210 }
2211 if (!connected) {
2212 server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT);
2213 rc = socket->ops->connect(socket, (struct sockaddr *)
2214 &server->addr.sockAddr6,
2215 sizeof(struct sockaddr_in6), 0);
2216 if (rc >= 0)
2217 connected = true;
2218 }
2219 2356
2220 /* give up here - unless we want to retry on different 2357 /* try with 445 port at first */
2221 protocol families some day */ 2358 *sport = htons(CIFS_PORT);
2222 if (!connected) {
2223 if (orig_port)
2224 server->addr.sockAddr6.sin6_port = orig_port;
2225 cFYI(1, "Error %d connecting to server via ipv6", rc);
2226 sock_release(socket);
2227 server->ssocket = NULL;
2228 return rc;
2229 }
2230 2359
2231 /* 2360 rc = generic_ip_connect(server);
2232 * Eventually check for other socket options to change from 2361 if (rc >= 0)
2233 * the default. sock_setsockopt not used because it expects 2362 return rc;
2234 * user space buffer
2235 */
2236 socket->sk->sk_rcvtimeo = 7 * HZ;
2237 socket->sk->sk_sndtimeo = 5 * HZ;
2238 2363
2239 if (server->tcp_nodelay) { 2364 /* if it failed, try with 139 port */
2240 val = 1; 2365 *sport = htons(RFC1001_PORT);
2241 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2242 (char *)&val, sizeof(val));
2243 if (rc)
2244 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
2245 } 2366 }
2246 2367
2247 server->ssocket = socket; 2368 return generic_ip_connect(server);
2248
2249 return rc;
2250} 2369}
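
ip_connect() keeps the old port-fallback behavior but in one address-family-agnostic place: an explicit port is used as-is, otherwise port 445 is tried first and 139 second. The same control flow as a self-contained userspace sketch, where connect_to() is a hypothetical stand-in for generic_ip_connect() returning >= 0 on success:

/* Userspace sketch of ip_connect()'s port selection. */
#include <arpa/inet.h>

#define CIFS_PORT	445
#define RFC1001_PORT	139

int ip_connect_sketch(struct sockaddr_in *addr,
		      int (*connect_to)(struct sockaddr_in *))
{
	int rc;

	if (addr->sin_port != 0)		/* explicit port: no fallback */
		return connect_to(addr);

	addr->sin_port = htons(CIFS_PORT);	/* try 445 first */
	rc = connect_to(addr);
	if (rc >= 0)
		return rc;

	addr->sin_port = htons(RFC1001_PORT);	/* then fall back to 139 */
	return connect_to(addr);
}
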
2251 2370
2252void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, 2371void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
@@ -2383,6 +2502,8 @@ convert_delimiter(char *path, char delim)
2383static void setup_cifs_sb(struct smb_vol *pvolume_info, 2502static void setup_cifs_sb(struct smb_vol *pvolume_info,
2384 struct cifs_sb_info *cifs_sb) 2503 struct cifs_sb_info *cifs_sb)
2385{ 2504{
2505 INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
2506
2386 if (pvolume_info->rsize > CIFSMaxBufSize) { 2507 if (pvolume_info->rsize > CIFSMaxBufSize) {
2387 cERROR(1, "rsize %d too large, using MaxBufSize", 2508 cERROR(1, "rsize %d too large, using MaxBufSize",
2388 pvolume_info->rsize); 2509 pvolume_info->rsize);
@@ -2434,6 +2555,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2434 cFYI(1, "file mode: 0x%x dir mode: 0x%x", 2555 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
2435 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); 2556 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2436 2557
2558 cifs_sb->actimeo = pvolume_info->actimeo;
2559
2437 if (pvolume_info->noperm) 2560 if (pvolume_info->noperm)
2438 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2561 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
2439 if (pvolume_info->setuids) 2562 if (pvolume_info->setuids)
@@ -2462,10 +2585,23 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2462 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; 2585 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
2463 if (pvolume_info->fsc) 2586 if (pvolume_info->fsc)
2464 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; 2587 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
2588 if (pvolume_info->multiuser)
2589 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
2590 CIFS_MOUNT_NO_PERM);
2591 if (pvolume_info->strict_io)
2592 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
2465 if (pvolume_info->direct_io) { 2593 if (pvolume_info->direct_io) {
2466 cFYI(1, "mounting share using direct i/o"); 2594 cFYI(1, "mounting share using direct i/o");
2467 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2595 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
2468 } 2596 }
2597 if (pvolume_info->mfsymlinks) {
2598 if (pvolume_info->sfu_emul) {
2599 cERROR(1, "mount option mfsymlinks ignored if sfu "
2600 "mount option is used");
2601 } else {
2602 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
2603 }
2604 }
2469 2605
2470 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 2606 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
2471 cERROR(1, "mount option dynperm ignored if cifsacl " 2607 cERROR(1, "mount option dynperm ignored if cifsacl "
@@ -2552,6 +2688,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2552 struct TCP_Server_Info *srvTcp; 2688 struct TCP_Server_Info *srvTcp;
2553 char *full_path; 2689 char *full_path;
2554 char *mount_data = mount_data_global; 2690 char *mount_data = mount_data_global;
2691 struct tcon_link *tlink;
2555#ifdef CONFIG_CIFS_DFS_UPCALL 2692#ifdef CONFIG_CIFS_DFS_UPCALL
2556 struct dfs_info3_param *referrals = NULL; 2693 struct dfs_info3_param *referrals = NULL;
2557 unsigned int num_referrals = 0; 2694 unsigned int num_referrals = 0;
@@ -2563,6 +2700,7 @@ try_mount_again:
2563 pSesInfo = NULL; 2700 pSesInfo = NULL;
2564 srvTcp = NULL; 2701 srvTcp = NULL;
2565 full_path = NULL; 2702 full_path = NULL;
2703 tlink = NULL;
2566 2704
2567 xid = GetXid(); 2705 xid = GetXid();
2568 2706
@@ -2638,8 +2776,6 @@ try_mount_again:
2638 goto remote_path_check; 2776 goto remote_path_check;
2639 } 2777 }
2640 2778
2641 cifs_sb->tcon = tcon;
2642
2643 /* do not care if following two calls succeed - informational */ 2779 /* do not care if following two calls succeed - informational */
2644 if (!tcon->ipc) { 2780 if (!tcon->ipc) {
2645 CIFSSMBQFSDeviceInfo(xid, tcon); 2781 CIFSSMBQFSDeviceInfo(xid, tcon);
@@ -2673,13 +2809,13 @@ remote_path_check:
2673 /* check if a whole path (including prepath) is not remote */ 2809 /* check if a whole path (including prepath) is not remote */
2674 if (!rc && cifs_sb->prepathlen && tcon) { 2810 if (!rc && cifs_sb->prepathlen && tcon) {
2675 /* build_path_to_root works only when we have a valid tcon */ 2811 /* build_path_to_root works only when we have a valid tcon */
2676 full_path = cifs_build_path_to_root(cifs_sb); 2812 full_path = cifs_build_path_to_root(cifs_sb, tcon);
2677 if (full_path == NULL) { 2813 if (full_path == NULL) {
2678 rc = -ENOMEM; 2814 rc = -ENOMEM;
2679 goto mount_fail_check; 2815 goto mount_fail_check;
2680 } 2816 }
2681 rc = is_path_accessible(xid, tcon, cifs_sb, full_path); 2817 rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
2682 if (rc != -EREMOTE) { 2818 if (rc != 0 && rc != -EREMOTE) {
2683 kfree(full_path); 2819 kfree(full_path);
2684 goto mount_fail_check; 2820 goto mount_fail_check;
2685 } 2821 }
@@ -2748,6 +2884,30 @@ remote_path_check:
2748#endif 2884#endif
2749 } 2885 }
2750 2886
2887 if (rc)
2888 goto mount_fail_check;
2889
2890 /* now, hang the tcon off of the superblock */
2891 tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
2892 if (tlink == NULL) {
2893 rc = -ENOMEM;
2894 goto mount_fail_check;
2895 }
2896
2897 tlink->tl_uid = pSesInfo->linux_uid;
2898 tlink->tl_tcon = tcon;
2899 tlink->tl_time = jiffies;
2900 set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
2901 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
2902
2903 cifs_sb->master_tlink = tlink;
2904 spin_lock(&cifs_sb->tlink_tree_lock);
2905 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
2906 spin_unlock(&cifs_sb->tlink_tree_lock);
2907
2908 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
2909 TLINK_IDLE_EXPIRE);
2910
2751mount_fail_check: 2911mount_fail_check:
2752 /* on error free sesinfo and tcon struct if needed */ 2912 /* on error free sesinfo and tcon struct if needed */
2753 if (rc) { 2913 if (rc) {
@@ -2786,8 +2946,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2786 TCONX_RSP *pSMBr; 2946 TCONX_RSP *pSMBr;
2787 unsigned char *bcc_ptr; 2947 unsigned char *bcc_ptr;
2788 int rc = 0; 2948 int rc = 0;
2789 int length, bytes_left; 2949 int length;
2790 __u16 count; 2950 __u16 bytes_left, count;
2791 2951
2792 if (ses == NULL) 2952 if (ses == NULL)
2793 return -EIO; 2953 return -EIO;
@@ -2815,7 +2975,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2815 bcc_ptr++; /* skip password */ 2975 bcc_ptr++; /* skip password */
2816 /* already aligned so no need to do it below */ 2976 /* already aligned so no need to do it below */
2817 } else { 2977 } else {
2818 pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 2978 pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
2819 /* BB FIXME add code to fail this if NTLMv2 or Kerberos 2979 /* BB FIXME add code to fail this if NTLMv2 or Kerberos
2820 specified as required (when that support is added to 2980 specified as required (when that support is added to
2821 the vfs in the future) as only NTLM or the much 2981 the vfs in the future) as only NTLM or the much
@@ -2825,16 +2985,16 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2825#ifdef CONFIG_CIFS_WEAK_PW_HASH 2985#ifdef CONFIG_CIFS_WEAK_PW_HASH
2826 if ((global_secflags & CIFSSEC_MAY_LANMAN) && 2986 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2827 (ses->server->secType == LANMAN)) 2987 (ses->server->secType == LANMAN))
2828 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2988 calc_lanman_hash(tcon->password, ses->server->cryptkey,
2829 ses->server->secMode & 2989 ses->server->secMode &
2830 SECMODE_PW_ENCRYPT ? true : false, 2990 SECMODE_PW_ENCRYPT ? true : false,
2831 bcc_ptr); 2991 bcc_ptr);
2832 else 2992 else
2833#endif /* CIFS_WEAK_PW_HASH */ 2993#endif /* CIFS_WEAK_PW_HASH */
2834 SMBNTencrypt(tcon->password, ses->server->cryptKey, 2994 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
2835 bcc_ptr); 2995 bcc_ptr);
2836 2996
2837 bcc_ptr += CIFS_SESS_KEY_SIZE; 2997 bcc_ptr += CIFS_AUTH_RESP_SIZE;
2838 if (ses->capabilities & CAP_UNICODE) { 2998 if (ses->capabilities & CAP_UNICODE) {
2839 /* must align unicode strings */ 2999 /* must align unicode strings */
2840 *bcc_ptr = 0; /* null byte password */ 3000 *bcc_ptr = 0; /* null byte password */
@@ -2872,7 +3032,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2872 pSMB->ByteCount = cpu_to_le16(count); 3032 pSMB->ByteCount = cpu_to_le16(count);
2873 3033
2874 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 3034 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
2875 CIFS_STD_OP); 3035 0);
2876 3036
2877 /* above now done in SendReceive */ 3037 /* above now done in SendReceive */
2878 if ((rc == 0) && (tcon != NULL)) { 3038 if ((rc == 0) && (tcon != NULL)) {
@@ -2882,7 +3042,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2882 tcon->need_reconnect = false; 3042 tcon->need_reconnect = false;
2883 tcon->tid = smb_buffer_response->Tid; 3043 tcon->tid = smb_buffer_response->Tid;
2884 bcc_ptr = pByteArea(smb_buffer_response); 3044 bcc_ptr = pByteArea(smb_buffer_response);
2885 bytes_left = BCC(smb_buffer_response); 3045 bytes_left = get_bcc(smb_buffer_response);
2886 length = strnlen(bcc_ptr, bytes_left - 2); 3046 length = strnlen(bcc_ptr, bytes_left - 2);
2887 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) 3047 if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
2888 is_unicode = true; 3048 is_unicode = true;
@@ -2934,19 +3094,32 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2934int 3094int
2935cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) 3095cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2936{ 3096{
2937 int rc = 0; 3097 struct rb_root *root = &cifs_sb->tlink_tree;
3098 struct rb_node *node;
3099 struct tcon_link *tlink;
2938 char *tmp; 3100 char *tmp;
2939 3101
2940 if (cifs_sb->tcon) 3102 cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
2941 cifs_put_tcon(cifs_sb->tcon); 3103
3104 spin_lock(&cifs_sb->tlink_tree_lock);
3105 while ((node = rb_first(root))) {
3106 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3107 cifs_get_tlink(tlink);
3108 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3109 rb_erase(node, root);
3110
3111 spin_unlock(&cifs_sb->tlink_tree_lock);
3112 cifs_put_tlink(tlink);
3113 spin_lock(&cifs_sb->tlink_tree_lock);
3114 }
3115 spin_unlock(&cifs_sb->tlink_tree_lock);
2942 3116
2943 cifs_sb->tcon = NULL;
2944 tmp = cifs_sb->prepath; 3117 tmp = cifs_sb->prepath;
2945 cifs_sb->prepathlen = 0; 3118 cifs_sb->prepathlen = 0;
2946 cifs_sb->prepath = NULL; 3119 cifs_sb->prepath = NULL;
2947 kfree(tmp); 3120 kfree(tmp);
2948 3121
2949 return rc; 3122 return 0;
2950} 3123}
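
The rewritten cifs_umount() walks the tlink tree rather than putting a single tcon, and it must drop tlink_tree_lock around each cifs_put_tlink() because the put can issue a tree disconnect and sleep. The locking shape, reduced to a runnable list-drain example:

/* Detach one node under the lock, release it (which may sleep) with
 * the lock dropped, then retake the lock and restart from the head. */
#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

void drain(struct node **head)
{
	struct node *n;

	for (;;) {
		pthread_mutex_lock(&tree_lock);
		n = *head;
		if (n == NULL) {
			pthread_mutex_unlock(&tree_lock);
			return;
		}
		*head = n->next;		/* detached under the lock */
		pthread_mutex_unlock(&tree_lock);
		free(n);			/* "sleeping" release */
	}
}
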
2951 3124
2952int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses) 3125int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
@@ -2997,6 +3170,16 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2997 if (rc) { 3170 if (rc) {
2998 cERROR(1, "Send error in SessSetup = %d", rc); 3171 cERROR(1, "Send error in SessSetup = %d", rc);
2999 } else { 3172 } else {
3173 mutex_lock(&ses->server->srv_mutex);
3174 if (!server->session_estab) {
3175 server->session_key.response = ses->auth_key.response;
3176 server->session_key.len = ses->auth_key.len;
3177 server->sequence_number = 0x2;
3178 server->session_estab = true;
3179 ses->auth_key.response = NULL;
3180 }
3181 mutex_unlock(&server->srv_mutex);
3182
3000 cFYI(1, "CIFS Session Established successfully"); 3183 cFYI(1, "CIFS Session Established successfully");
3001 spin_lock(&GlobalMid_Lock); 3184 spin_lock(&GlobalMid_Lock);
3002 ses->status = CifsGood; 3185 ses->status = CifsGood;
@@ -3004,6 +3187,263 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
3004 spin_unlock(&GlobalMid_Lock); 3187 spin_unlock(&GlobalMid_Lock);
3005 } 3188 }
3006 3189
3190 kfree(ses->auth_key.response);
3191 ses->auth_key.response = NULL;
3192 ses->auth_key.len = 0;
3193 kfree(ses->ntlmssp);
3194 ses->ntlmssp = NULL;
3195
3007 return rc; 3196 return rc;
3008} 3197}
3009 3198
3199static struct cifsTconInfo *
3200cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
3201{
3202 struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
3203 struct cifsSesInfo *ses;
3204 struct cifsTconInfo *tcon = NULL;
3205 struct smb_vol *vol_info;
3206 char username[MAX_USERNAME_SIZE + 1];
3207
3208 vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
3209 if (vol_info == NULL) {
3210 tcon = ERR_PTR(-ENOMEM);
3211 goto out;
3212 }
3213
3214 snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid);
3215 vol_info->username = username;
3216 vol_info->local_nls = cifs_sb->local_nls;
3217 vol_info->linux_uid = fsuid;
3218 vol_info->cred_uid = fsuid;
3219 vol_info->UNC = master_tcon->treeName;
3220 vol_info->retry = master_tcon->retry;
3221 vol_info->nocase = master_tcon->nocase;
3222 vol_info->local_lease = master_tcon->local_lease;
3223 vol_info->no_linux_ext = !master_tcon->unix_ext;
3224
3225 /* FIXME: allow for other secFlg settings */
3226 vol_info->secFlg = CIFSSEC_MUST_KRB5;
3227
3228 /* get a reference for the same TCP session */
3229 spin_lock(&cifs_tcp_ses_lock);
3230 ++master_tcon->ses->server->srv_count;
3231 spin_unlock(&cifs_tcp_ses_lock);
3232
3233 ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
3234 if (IS_ERR(ses)) {
3235 tcon = (struct cifsTconInfo *)ses;
3236 cifs_put_tcp_session(master_tcon->ses->server);
3237 goto out;
3238 }
3239
3240 tcon = cifs_get_tcon(ses, vol_info);
3241 if (IS_ERR(tcon)) {
3242 cifs_put_smb_ses(ses);
3243 goto out;
3244 }
3245
3246 if (ses->capabilities & CAP_UNIX)
3247 reset_cifs_unix_caps(0, tcon, NULL, vol_info);
3248out:
3249 kfree(vol_info);
3250
3251 return tcon;
3252}
3253
3254static inline struct tcon_link *
3255cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
3256{
3257 return cifs_sb->master_tlink;
3258}
3259
3260struct cifsTconInfo *
3261cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
3262{
3263 return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
3264}
3265
3266static int
3267cifs_sb_tcon_pending_wait(void *unused)
3268{
3269 schedule();
3270 return signal_pending(current) ? -ERESTARTSYS : 0;
3271}
3272
3273/* find and return a tlink with given uid */
3274static struct tcon_link *
3275tlink_rb_search(struct rb_root *root, uid_t uid)
3276{
3277 struct rb_node *node = root->rb_node;
3278 struct tcon_link *tlink;
3279
3280 while (node) {
3281 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3282
3283 if (tlink->tl_uid > uid)
3284 node = node->rb_left;
3285 else if (tlink->tl_uid < uid)
3286 node = node->rb_right;
3287 else
3288 return tlink;
3289 }
3290 return NULL;
3291}
3292
3293/* insert a tcon_link into the tree */
3294static void
3295tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
3296{
3297 struct rb_node **new = &(root->rb_node), *parent = NULL;
3298 struct tcon_link *tlink;
3299
3300 while (*new) {
3301 tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
3302 parent = *new;
3303
3304 if (tlink->tl_uid > new_tlink->tl_uid)
3305 new = &((*new)->rb_left);
3306 else
3307 new = &((*new)->rb_right);
3308 }
3309
3310 rb_link_node(&new_tlink->tl_rbnode, parent, new);
3311 rb_insert_color(&new_tlink->tl_rbnode, root);
3312}
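
tlink_rb_search()/tlink_rb_insert() above are a conventional uid-keyed rbtree pair; the rb_entry/rb_link_node/rb_insert_color calls only add rebalancing. The same ordering logic as a plain (unbalanced) binary search tree, which may be easier to read in isolation:

/* Standalone BST version of the uid-keyed search and insert above. */
#include <stddef.h>
#include <sys/types.h>

struct tl {
	struct tl *left, *right;
	uid_t uid;
};

struct tl *tl_search(struct tl *node, uid_t uid)
{
	while (node) {
		if (node->uid > uid)
			node = node->left;
		else if (node->uid < uid)
			node = node->right;
		else
			return node;
	}
	return NULL;
}

void tl_insert(struct tl **root, struct tl *new)
{
	while (*root) {
		if ((*root)->uid > new->uid)
			root = &(*root)->left;
		else
			root = &(*root)->right;
	}
	new->left = new->right = NULL;
	*root = new;
}
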
3313
3314/*
3315 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
3316 * current task.
3317 *
3318 * If the superblock doesn't refer to a multiuser mount, then just return
3319 * the master tcon for the mount.
3320 *
3321 * First, search the rbtree for an existing tcon for this fsuid. If one
 3322 * exists, check whether it's pending construction. If it is, wait
3323 * for construction to complete. Once it's no longer pending, check to see if
3324 * it failed and either return an error or retry construction, depending on
3325 * the timeout.
3326 *
3327 * If one doesn't exist then insert a new tcon_link struct into the tree and
3328 * try to construct a new one.
3329 */
3330struct tcon_link *
3331cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3332{
3333 int ret;
3334 uid_t fsuid = current_fsuid();
3335 struct tcon_link *tlink, *newtlink;
3336
3337 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
3338 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
3339
3340 spin_lock(&cifs_sb->tlink_tree_lock);
3341 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3342 if (tlink)
3343 cifs_get_tlink(tlink);
3344 spin_unlock(&cifs_sb->tlink_tree_lock);
3345
3346 if (tlink == NULL) {
3347 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
3348 if (newtlink == NULL)
3349 return ERR_PTR(-ENOMEM);
3350 newtlink->tl_uid = fsuid;
3351 newtlink->tl_tcon = ERR_PTR(-EACCES);
3352 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
3353 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
3354 cifs_get_tlink(newtlink);
3355
3356 spin_lock(&cifs_sb->tlink_tree_lock);
3357 /* was one inserted after previous search? */
3358 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3359 if (tlink) {
3360 cifs_get_tlink(tlink);
3361 spin_unlock(&cifs_sb->tlink_tree_lock);
3362 kfree(newtlink);
3363 goto wait_for_construction;
3364 }
3365 tlink = newtlink;
3366 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
3367 spin_unlock(&cifs_sb->tlink_tree_lock);
3368 } else {
3369wait_for_construction:
3370 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
3371 cifs_sb_tcon_pending_wait,
3372 TASK_INTERRUPTIBLE);
3373 if (ret) {
3374 cifs_put_tlink(tlink);
3375 return ERR_PTR(ret);
3376 }
3377
3378 /* if it's good, return it */
3379 if (!IS_ERR(tlink->tl_tcon))
3380 return tlink;
3381
3382 /* return error if we tried this already recently */
3383 if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
3384 cifs_put_tlink(tlink);
3385 return ERR_PTR(-EACCES);
3386 }
3387
3388 if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
3389 goto wait_for_construction;
3390 }
3391
3392 tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid);
3393 clear_bit(TCON_LINK_PENDING, &tlink->tl_flags);
3394 wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
3395
3396 if (IS_ERR(tlink->tl_tcon)) {
3397 cifs_put_tlink(tlink);
3398 return ERR_PTR(-EACCES);
3399 }
3400
3401 return tlink;
3402}
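
The TCON_LINK_PENDING handling in cifs_sb_tlink() is a classic construct-once gate: the thread that wins test_and_set_bit builds the tcon while everyone else sleeps in wait_on_bit until the bit clears. A userspace analogue using a mutex and condition variable instead of the kernel's bit waitqueue (build() may fail and return NULL, mirroring the -EACCES path):

/* One thread builds the shared object; the rest wait for "pending"
 * to clear and then read the result. */
#include <pthread.h>
#include <stdbool.h>

struct slot {
	pthread_mutex_t lock;
	pthread_cond_t done;
	bool pending;
	void *obj;		/* NULL until constructed (or on failure) */
};

void *get_or_build(struct slot *s, void *(*build)(void))
{
	void *obj;

	pthread_mutex_lock(&s->lock);
	if (s->obj == NULL && !s->pending) {
		s->pending = true;	/* we won: construct it ourselves */
		pthread_mutex_unlock(&s->lock);
		obj = build();		/* may sleep, like a mount */
		pthread_mutex_lock(&s->lock);
		s->obj = obj;
		s->pending = false;
		pthread_cond_broadcast(&s->done);
	}
	while (s->pending)		/* losers wait for the builder */
		pthread_cond_wait(&s->done, &s->lock);
	obj = s->obj;			/* NULL here mirrors -EACCES */
	pthread_mutex_unlock(&s->lock);
	return obj;
}
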
3403
3404/*
 3405 * periodic workqueue job that scans a superblock's tlink_tree and closes
 3406 * out idle tcons.
3407 */
3408static void
3409cifs_prune_tlinks(struct work_struct *work)
3410{
3411 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
3412 prune_tlinks.work);
3413 struct rb_root *root = &cifs_sb->tlink_tree;
3414 struct rb_node *node = rb_first(root);
3415 struct rb_node *tmp;
3416 struct tcon_link *tlink;
3417
3418 /*
 3419 * Because we drop the spinlock in the loop in order to put the tlink,
 3420 * the walk is not guarded against removal of links from the tree. The only
3421 * places that remove entries from the tree are this function and
3422 * umounts. Because this function is non-reentrant and is canceled
3423 * before umount can proceed, this is safe.
3424 */
3425 spin_lock(&cifs_sb->tlink_tree_lock);
3426 node = rb_first(root);
3427 while (node != NULL) {
3428 tmp = node;
3429 node = rb_next(tmp);
3430 tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
3431
3432 if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
3433 atomic_read(&tlink->tl_count) != 0 ||
3434 time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
3435 continue;
3436
3437 cifs_get_tlink(tlink);
3438 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3439 rb_erase(tmp, root);
3440
3441 spin_unlock(&cifs_sb->tlink_tree_lock);
3442 cifs_put_tlink(tlink);
3443 spin_lock(&cifs_sb->tlink_tree_lock);
3444 }
3445 spin_unlock(&cifs_sb->tlink_tree_lock);
3446
3447 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
3448 TLINK_IDLE_EXPIRE);
3449}
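
cifs_prune_tlinks() requeues itself every TLINK_IDLE_EXPIRE, so each pass only has to decide which links are reclaimable: not the master link, no outstanding references, and idle past the expiry window. That predicate in isolation, with seconds standing in for jiffies (the 600 s window is an assumption for the sketch, not a value taken from this patch):

/* The prune predicate from the loop above, in isolation. */
#include <stdbool.h>
#include <time.h>

#define IDLE_EXPIRE 600		/* assumed window */

struct tl_state {
	bool is_master;		/* TCON_LINK_MASTER analogue */
	int refcount;		/* tl_count analogue */
	time_t idle_since;	/* tl_time analogue */
};

bool tl_reclaimable(const struct tl_state *t, time_t now)
{
	return !t->is_master &&
	       t->refcount == 0 &&
	       now - t->idle_since >= IDLE_EXPIRE;
}
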
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f9ed0751cc12..dd5f22918c33 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -54,18 +54,18 @@ build_path_from_dentry(struct dentry *direntry)
54 int dfsplen; 54 int dfsplen;
55 char *full_path; 55 char *full_path;
56 char dirsep; 56 char dirsep;
57 struct cifs_sb_info *cifs_sb; 57 struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
58 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
58 59
59 if (direntry == NULL) 60 if (direntry == NULL)
60 return NULL; /* not much we can do if dentry is freed and 61 return NULL; /* not much we can do if dentry is freed and
61 we need to reopen the file after it was closed implicitly 62 we need to reopen the file after it was closed implicitly
62 when the server crashed */ 63 when the server crashed */
63 64
64 cifs_sb = CIFS_SB(direntry->d_sb);
65 dirsep = CIFS_DIR_SEP(cifs_sb); 65 dirsep = CIFS_DIR_SEP(cifs_sb);
66 pplen = cifs_sb->prepathlen; 66 pplen = cifs_sb->prepathlen;
67 if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS)) 67 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
68 dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1); 68 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
69 else 69 else
70 dfsplen = 0; 70 dfsplen = 0;
71cifs_bp_rename_retry: 71cifs_bp_rename_retry:
@@ -117,7 +117,7 @@ cifs_bp_rename_retry:
117 /* BB test paths to Windows with '/' in the midst of prepath */ 117 /* BB test paths to Windows with '/' in the midst of prepath */
118 118
119 if (dfsplen) { 119 if (dfsplen) {
120 strncpy(full_path, cifs_sb->tcon->treeName, dfsplen); 120 strncpy(full_path, tcon->treeName, dfsplen);
121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { 121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
122 int i; 122 int i;
123 for (i = 0; i < dfsplen; i++) { 123 for (i = 0; i < dfsplen; i++) {
@@ -130,146 +130,6 @@ cifs_bp_rename_retry:
130 return full_path; 130 return full_path;
131} 131}
132 132
133struct cifsFileInfo *
134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
135 struct file *file, struct vfsmount *mnt, unsigned int oflags)
136{
137 int oplock = 0;
138 struct cifsFileInfo *pCifsFile;
139 struct cifsInodeInfo *pCifsInode;
140 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
141
142 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
143 if (pCifsFile == NULL)
144 return pCifsFile;
145
146 if (oplockEnabled)
147 oplock = REQ_OPLOCK;
148
149 pCifsFile->netfid = fileHandle;
150 pCifsFile->pid = current->tgid;
151 pCifsFile->pInode = igrab(newinode);
152 pCifsFile->mnt = mnt;
153 pCifsFile->pfile = file;
154 pCifsFile->invalidHandle = false;
155 pCifsFile->closePend = false;
156 mutex_init(&pCifsFile->fh_mutex);
157 mutex_init(&pCifsFile->lock_mutex);
158 INIT_LIST_HEAD(&pCifsFile->llist);
159 atomic_set(&pCifsFile->count, 1);
160 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
161
162 write_lock(&GlobalSMBSeslock);
163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
164 pCifsInode = CIFS_I(newinode);
165 if (pCifsInode) {
166 /* if readable file instance put first in list*/
167 if (oflags & FMODE_READ)
168 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
169 else
170 list_add_tail(&pCifsFile->flist,
171 &pCifsInode->openFileList);
172
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true;
179 }
180 write_unlock(&GlobalSMBSeslock);
181
182 file->private_data = pCifsFile;
183
184 return pCifsFile;
185}
186
187int cifs_posix_open(char *full_path, struct inode **pinode,
188 struct super_block *sb, int mode, int oflags,
189 __u32 *poplock, __u16 *pnetfid, int xid)
190{
191 int rc;
192 FILE_UNIX_BASIC_INFO *presp_data;
193 __u32 posix_flags = 0;
194 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
195 struct cifs_fattr fattr;
196
197 cFYI(1, "posix open %s", full_path);
198
199 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
200 if (presp_data == NULL)
201 return -ENOMEM;
202
203/* So far cifs posix extensions can only map the following flags.
204 There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
205 so far we do not seem to need them, and we can treat them as local only */
206 if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
207 (FMODE_READ | FMODE_WRITE))
208 posix_flags = SMB_O_RDWR;
209 else if (oflags & FMODE_READ)
210 posix_flags = SMB_O_RDONLY;
211 else if (oflags & FMODE_WRITE)
212 posix_flags = SMB_O_WRONLY;
213 if (oflags & O_CREAT)
214 posix_flags |= SMB_O_CREAT;
215 if (oflags & O_EXCL)
216 posix_flags |= SMB_O_EXCL;
217 if (oflags & O_TRUNC)
218 posix_flags |= SMB_O_TRUNC;
219 /* be safe and imply O_SYNC for O_DSYNC */
220 if (oflags & O_DSYNC)
221 posix_flags |= SMB_O_SYNC;
222 if (oflags & O_DIRECTORY)
223 posix_flags |= SMB_O_DIRECTORY;
224 if (oflags & O_NOFOLLOW)
225 posix_flags |= SMB_O_NOFOLLOW;
226 if (oflags & O_DIRECT)
227 posix_flags |= SMB_O_DIRECT;
228
229 mode &= ~current_umask();
230 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
231 pnetfid, presp_data, poplock, full_path,
232 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
233 CIFS_MOUNT_MAP_SPECIAL_CHR);
234 if (rc)
235 goto posix_open_ret;
236
237 if (presp_data->Type == cpu_to_le32(-1))
238 goto posix_open_ret; /* open ok, caller does qpathinfo */
239
240 if (!pinode)
241 goto posix_open_ret; /* caller does not need info */
242
243 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
244
245 /* get new inode and set it up */
246 if (*pinode == NULL) {
247 cifs_fill_uniqueid(sb, &fattr);
248 *pinode = cifs_iget(sb, &fattr);
249 if (!*pinode) {
250 rc = -ENOMEM;
251 goto posix_open_ret;
252 }
253 } else {
254 cifs_fattr_to_inode(*pinode, &fattr);
255 }
256
257posix_open_ret:
258 kfree(presp_data);
259 return rc;
260}
261
262static void setup_cifs_dentry(struct cifsTconInfo *tcon,
263 struct dentry *direntry,
264 struct inode *newinode)
265{
266 if (tcon->nocase)
267 direntry->d_op = &cifs_ci_dentry_ops;
268 else
269 direntry->d_op = &cifs_dentry_ops;
270 d_instantiate(direntry, newinode);
271}
272
273/* Inode operations in similar order to how they appear in Linux file fs.h */ 133/* Inode operations in similar order to how they appear in Linux file fs.h */
274 134
275int 135int
@@ -291,6 +151,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
291 int desiredAccess = GENERIC_READ | GENERIC_WRITE; 151 int desiredAccess = GENERIC_READ | GENERIC_WRITE;
292 __u16 fileHandle; 152 __u16 fileHandle;
293 struct cifs_sb_info *cifs_sb; 153 struct cifs_sb_info *cifs_sb;
154 struct tcon_link *tlink;
294 struct cifsTconInfo *tcon; 155 struct cifsTconInfo *tcon;
295 char *full_path = NULL; 156 char *full_path = NULL;
296 FILE_ALL_INFO *buf = NULL; 157 FILE_ALL_INFO *buf = NULL;
@@ -300,21 +161,26 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
300 xid = GetXid(); 161 xid = GetXid();
301 162
302 cifs_sb = CIFS_SB(inode->i_sb); 163 cifs_sb = CIFS_SB(inode->i_sb);
303 tcon = cifs_sb->tcon; 164 tlink = cifs_sb_tlink(cifs_sb);
304 165 if (IS_ERR(tlink)) {
305 full_path = build_path_from_dentry(direntry); 166 FreeXid(xid);
306 if (full_path == NULL) { 167 return PTR_ERR(tlink);
307 rc = -ENOMEM;
308 goto cifs_create_out;
309 } 168 }
169 tcon = tlink_tcon(tlink);
310 170
311 if (oplockEnabled) 171 if (oplockEnabled)
312 oplock = REQ_OPLOCK; 172 oplock = REQ_OPLOCK;
313 173
314 if (nd && (nd->flags & LOOKUP_OPEN)) 174 if (nd && (nd->flags & LOOKUP_OPEN))
315 oflags = nd->intent.open.flags; 175 oflags = nd->intent.open.file->f_flags;
316 else 176 else
317 oflags = FMODE_READ | SMB_O_CREAT; 177 oflags = O_RDONLY | O_CREAT;
178
179 full_path = build_path_from_dentry(direntry);
180 if (full_path == NULL) {
181 rc = -ENOMEM;
182 goto cifs_create_out;
183 }
318 184
319 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 185 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
320 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 186 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -344,9 +210,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
344 /* if the file is going to stay open, then we 210 /* if the file is going to stay open, then we
345 need to set the desired access properly */ 211 need to set the desired access properly */
346 desiredAccess = 0; 212 desiredAccess = 0;
347 if (oflags & FMODE_READ) 213 if (OPEN_FMODE(oflags) & FMODE_READ)
348 desiredAccess |= GENERIC_READ; /* is this too little? */ 214 desiredAccess |= GENERIC_READ; /* is this too little? */
349 if (oflags & FMODE_WRITE) 215 if (OPEN_FMODE(oflags) & FMODE_WRITE)
350 desiredAccess |= GENERIC_WRITE; 216 desiredAccess |= GENERIC_WRITE;
351 217
352 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 218 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -375,7 +241,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
375 if (!tcon->unix_ext && (mode & S_IWUGO) == 0) 241 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
376 create_options |= CREATE_OPTION_READONLY; 242 create_options |= CREATE_OPTION_READONLY;
377 243
378 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 244 if (tcon->ses->capabilities & CAP_NT_SMBS)
379 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 245 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
380 desiredAccess, create_options, 246 desiredAccess, create_options,
381 &fileHandle, &oplock, buf, cifs_sb->local_nls, 247 &fileHandle, &oplock, buf, cifs_sb->local_nls,
@@ -416,10 +282,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
416 args.uid = NO_CHANGE_64; 282 args.uid = NO_CHANGE_64;
417 args.gid = NO_CHANGE_64; 283 args.gid = NO_CHANGE_64;
418 } 284 }
419 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, 285 CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle,
420 cifs_sb->local_nls, 286 current->tgid);
421 cifs_sb->mnt_cifs_flags &
422 CIFS_MOUNT_MAP_SPECIAL_CHR);
423 } else { 287 } else {
424 /* BB implement mode setting via Windows security 288 /* BB implement mode setting via Windows security
425 descriptors e.g. */ 289 descriptors e.g. */
@@ -452,7 +316,7 @@ cifs_create_get_file_info:
452 316
453cifs_create_set_dentry: 317cifs_create_set_dentry:
454 if (rc == 0) 318 if (rc == 0)
455 setup_cifs_dentry(tcon, direntry, newinode); 319 d_instantiate(direntry, newinode);
456 else 320 else
457 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); 321 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
458 322
@@ -467,8 +331,7 @@ cifs_create_set_dentry:
467 goto cifs_create_out; 331 goto cifs_create_out;
468 } 332 }
469 333
470 pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp, 334 pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock);
471 nd->path.mnt, oflags);
472 if (pfile_info == NULL) { 335 if (pfile_info == NULL) {
473 fput(filp); 336 fput(filp);
474 CIFSSMBClose(xid, tcon, fileHandle); 337 CIFSSMBClose(xid, tcon, fileHandle);
@@ -481,6 +344,7 @@ cifs_create_set_dentry:
481cifs_create_out: 344cifs_create_out:
482 kfree(buf); 345 kfree(buf);
483 kfree(full_path); 346 kfree(full_path);
347 cifs_put_tlink(tlink);
484 FreeXid(xid); 348 FreeXid(xid);
485 return rc; 349 return rc;
486} 350}
@@ -491,6 +355,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
491 int rc = -EPERM; 355 int rc = -EPERM;
492 int xid; 356 int xid;
493 struct cifs_sb_info *cifs_sb; 357 struct cifs_sb_info *cifs_sb;
358 struct tcon_link *tlink;
494 struct cifsTconInfo *pTcon; 359 struct cifsTconInfo *pTcon;
495 char *full_path = NULL; 360 char *full_path = NULL;
496 struct inode *newinode = NULL; 361 struct inode *newinode = NULL;
@@ -503,10 +368,14 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
503 if (!old_valid_dev(device_number)) 368 if (!old_valid_dev(device_number))
504 return -EINVAL; 369 return -EINVAL;
505 370
506 xid = GetXid();
507
508 cifs_sb = CIFS_SB(inode->i_sb); 371 cifs_sb = CIFS_SB(inode->i_sb);
509 pTcon = cifs_sb->tcon; 372 tlink = cifs_sb_tlink(cifs_sb);
373 if (IS_ERR(tlink))
374 return PTR_ERR(tlink);
375
376 pTcon = tlink_tcon(tlink);
377
378 xid = GetXid();
510 379
511 full_path = build_path_from_dentry(direntry); 380 full_path = build_path_from_dentry(direntry);
512 if (full_path == NULL) { 381 if (full_path == NULL) {
@@ -538,10 +407,6 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
538 407
539 rc = cifs_get_inode_info_unix(&newinode, full_path, 408 rc = cifs_get_inode_info_unix(&newinode, full_path,
540 inode->i_sb, xid); 409 inode->i_sb, xid);
541 if (pTcon->nocase)
542 direntry->d_op = &cifs_ci_dentry_ops;
543 else
544 direntry->d_op = &cifs_dentry_ops;
545 410
546 if (rc == 0) 411 if (rc == 0)
547 d_instantiate(direntry, newinode); 412 d_instantiate(direntry, newinode);
@@ -606,6 +471,7 @@ mknod_out:
606 kfree(full_path); 471 kfree(full_path);
607 kfree(buf); 472 kfree(buf);
608 FreeXid(xid); 473 FreeXid(xid);
474 cifs_put_tlink(tlink);
609 return rc; 475 return rc;
610} 476}
611 477
@@ -619,6 +485,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
619 __u16 fileHandle = 0; 485 __u16 fileHandle = 0;
620 bool posix_open = false; 486 bool posix_open = false;
621 struct cifs_sb_info *cifs_sb; 487 struct cifs_sb_info *cifs_sb;
488 struct tcon_link *tlink;
622 struct cifsTconInfo *pTcon; 489 struct cifsTconInfo *pTcon;
623 struct cifsFileInfo *cfile; 490 struct cifsFileInfo *cfile;
624 struct inode *newInode = NULL; 491 struct inode *newInode = NULL;
@@ -633,7 +500,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
633 /* check whether path exists */ 500 /* check whether path exists */
634 501
635 cifs_sb = CIFS_SB(parent_dir_inode->i_sb); 502 cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
636 pTcon = cifs_sb->tcon; 503 tlink = cifs_sb_tlink(cifs_sb);
504 if (IS_ERR(tlink)) {
505 FreeXid(xid);
506 return (struct dentry *)tlink;
507 }
508 pTcon = tlink_tcon(tlink);
637 509
638 /* 510 /*
639 * Don't allow the separator character in a path component. 511 * Don't allow the separator character in a path component.
@@ -644,8 +516,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
644 for (i = 0; i < direntry->d_name.len; i++) 516 for (i = 0; i < direntry->d_name.len; i++)
645 if (direntry->d_name.name[i] == '\\') { 517 if (direntry->d_name.name[i] == '\\') {
646 cFYI(1, "Invalid file name"); 518 cFYI(1, "Invalid file name");
647 FreeXid(xid); 519 rc = -EINVAL;
648 return ERR_PTR(-EINVAL); 520 goto lookup_out;
649 } 521 }
650 } 522 }
651 523
@@ -655,7 +527,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
655 */ 527 */
656 if (nd && (nd->flags & LOOKUP_EXCL)) { 528 if (nd && (nd->flags & LOOKUP_EXCL)) {
657 d_instantiate(direntry, NULL); 529 d_instantiate(direntry, NULL);
658 return NULL; 530 rc = 0;
531 goto lookup_out;
659 } 532 }
660 533
661 /* can not grab the rename sem here since it would 534 /* can not grab the rename sem here since it would
@@ -663,8 +536,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
663 in which we already have the sb rename sem */ 536 in which we already have the sb rename sem */
664 full_path = build_path_from_dentry(direntry); 537 full_path = build_path_from_dentry(direntry);
665 if (full_path == NULL) { 538 if (full_path == NULL) {
666 FreeXid(xid); 539 rc = -ENOMEM;
667 return ERR_PTR(-ENOMEM); 540 goto lookup_out;
668 } 541 }
669 542
670 if (direntry->d_inode != NULL) { 543 if (direntry->d_inode != NULL) {
@@ -687,11 +560,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
687 if (pTcon->unix_ext) { 560 if (pTcon->unix_ext) {
688 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && 561 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
689 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 562 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
690 (nd->intent.open.flags & O_CREAT)) { 563 (nd->intent.open.file->f_flags & O_CREAT)) {
691 rc = cifs_posix_open(full_path, &newInode, 564 rc = cifs_posix_open(full_path, &newInode,
692 parent_dir_inode->i_sb, 565 parent_dir_inode->i_sb,
693 nd->intent.open.create_mode, 566 nd->intent.open.create_mode,
694 nd->intent.open.flags, &oplock, 567 nd->intent.open.file->f_flags, &oplock,
695 &fileHandle, xid); 568 &fileHandle, xid);
696 /* 569 /*
697 * The check below works around a bug in POSIX 570 * The check below works around a bug in POSIX
@@ -713,10 +586,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
713 parent_dir_inode->i_sb, xid, NULL); 586 parent_dir_inode->i_sb, xid, NULL);
714 587
715 if ((rc == 0) && (newInode != NULL)) { 588 if ((rc == 0) && (newInode != NULL)) {
716 if (pTcon->nocase)
717 direntry->d_op = &cifs_ci_dentry_ops;
718 else
719 direntry->d_op = &cifs_dentry_ops;
720 d_add(direntry, newInode); 589 d_add(direntry, newInode);
721 if (posix_open) { 590 if (posix_open) {
722 filp = lookup_instantiate_filp(nd, direntry, 591 filp = lookup_instantiate_filp(nd, direntry,
@@ -727,9 +596,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
727 goto lookup_out; 596 goto lookup_out;
728 } 597 }
729 598
730 cfile = cifs_new_fileinfo(newInode, fileHandle, filp, 599 cfile = cifs_new_fileinfo(fileHandle, filp, tlink,
731 nd->path.mnt, 600 oplock);
732 nd->intent.open.flags);
733 if (cfile == NULL) { 601 if (cfile == NULL) {
734 fput(filp); 602 fput(filp);
735 CIFSSMBClose(xid, pTcon, fileHandle); 603 CIFSSMBClose(xid, pTcon, fileHandle);
@@ -744,10 +612,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
744 } else if (rc == -ENOENT) { 612 } else if (rc == -ENOENT) {
745 rc = 0; 613 rc = 0;
746 direntry->d_time = jiffies; 614 direntry->d_time = jiffies;
747 if (pTcon->nocase)
748 direntry->d_op = &cifs_ci_dentry_ops;
749 else
750 direntry->d_op = &cifs_dentry_ops;
751 d_add(direntry, NULL); 615 d_add(direntry, NULL);
752 /* if it was once a directory (but how can we tell?) we could do 616 /* if it was once a directory (but how can we tell?) we could do
753 shrink_dcache_parent(direntry); */ 617 shrink_dcache_parent(direntry); */
@@ -759,6 +623,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
759 623
760lookup_out: 624lookup_out:
761 kfree(full_path); 625 kfree(full_path);
626 cifs_put_tlink(tlink);
762 FreeXid(xid); 627 FreeXid(xid);
763 return ERR_PTR(rc); 628 return ERR_PTR(rc);
764} 629}
@@ -766,22 +631,37 @@ lookup_out:
766static int 631static int
767cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) 632cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
768{ 633{
769 int isValid = 1; 634 if (nd->flags & LOOKUP_RCU)
635 return -ECHILD;
770 636
771 if (direntry->d_inode) { 637 if (direntry->d_inode) {
772 if (cifs_revalidate_dentry(direntry)) 638 if (cifs_revalidate_dentry(direntry))
773 return 0; 639 return 0;
774 } else { 640 else
775 cFYI(1, "neg dentry 0x%p name = %s", 641 return 1;
776 direntry, direntry->d_name.name);
777 if (time_after(jiffies, direntry->d_time + HZ) ||
778 !lookupCacheEnabled) {
779 d_drop(direntry);
780 isValid = 0;
781 }
782 } 642 }
783 643
784 return isValid; 644 /*
645 * This may be nfsd (or something), anyway, we can't see the
646 * intent of this. So, since this can be for creation, drop it.
647 */
648 if (!nd)
649 return 0;
650
651 /*
652 * Drop the negative dentry, in order to make sure to use the
653 * case sensitive name which is specified by user if this is
654 * for creation.
655 */
656 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
657 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
658 return 0;
659 }
660
661 if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
662 return 0;
663
664 return 1;
785} 665}
786 666
787/* static int cifs_d_delete(struct dentry *direntry) 667/* static int cifs_d_delete(struct dentry *direntry)
@@ -795,12 +675,14 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
795 675
796const struct dentry_operations cifs_dentry_ops = { 676const struct dentry_operations cifs_dentry_ops = {
797 .d_revalidate = cifs_d_revalidate, 677 .d_revalidate = cifs_d_revalidate,
678 .d_automount = cifs_dfs_d_automount,
798/* d_delete: cifs_d_delete, */ /* not needed except for debugging */ 679/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
799}; 680};
800 681
801static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) 682static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
683 struct qstr *q)
802{ 684{
803 struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; 685 struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
804 unsigned long hash; 686 unsigned long hash;
805 int i; 687 int i;
806 688
@@ -813,21 +695,16 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
813 return 0; 695 return 0;
814} 696}
815 697
816static int cifs_ci_compare(struct dentry *dentry, struct qstr *a, 698static int cifs_ci_compare(const struct dentry *parent,
817 struct qstr *b) 699 const struct inode *pinode,
700 const struct dentry *dentry, const struct inode *inode,
701 unsigned int len, const char *str, const struct qstr *name)
818{ 702{
819 struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; 703 struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
820 704
821 if ((a->len == b->len) && 705 if ((name->len == len) &&
822 (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) { 706 (nls_strnicmp(codepage, name->name, str, len) == 0))
823 /*
824 * To preserve case, don't let an existing negative dentry's
825 * case take precedence. If a is not a negative dentry, this
826 * should have no side effects
827 */
828 memcpy((void *)a->name, b->name, a->len);
829 return 0; 707 return 0;
830 }
831 return 1; 708 return 1;
832} 709}
833 710
@@ -835,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
835 .d_revalidate = cifs_d_revalidate, 712 .d_revalidate = cifs_d_revalidate,
836 .d_hash = cifs_ci_hash, 713 .d_hash = cifs_ci_hash,
837 .d_compare = cifs_ci_compare, 714 .d_compare = cifs_ci_compare,
715 .d_automount = cifs_dfs_d_automount,
838}; 716};
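
The new d_hash/d_compare prototypes take const dentries plus an explicit (len, str) pair because they can now run during RCU path walk, where the dentry must not be written; that is also why the old memcpy() case-preservation hack had to go. An ASCII-only standalone compare in the new no-side-effect style (the real cifs_ci_compare() goes through the mount's nls codepage instead):

/* Pure function of (len, str) and the stored name; no writes. */
#include <ctype.h>
#include <stddef.h>

static int ci_compare(size_t len, const char *str,
		      size_t name_len, const char *name)
{
	size_t i;

	if (len != name_len)
		return 1;		/* 1 means "no match" in dcache */
	for (i = 0; i < len; i++)
		if (tolower((unsigned char)str[i]) !=
		    tolower((unsigned char)name[i]))
			return 1;
	return 0;
}
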
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0eb87026cad3..548f06230a6d 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
66 /* Search for server name delimiter */ 66 /* Search for server name delimiter */
67 sep = memchr(hostname, '\\', len); 67 sep = memchr(hostname, '\\', len);
68 if (sep) 68 if (sep)
69 len = sep - unc; 69 len = sep - hostname;
70 else 70 else
71 cFYI(1, "%s: probably server name is whole unc: %s", 71 cFYI(1, "%s: probably server name is whole unc: %s",
72 __func__, unc); 72 __func__, unc);
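
The one-line dns_resolve fix matters because hostname was advanced two bytes past unc to skip the leading backslashes; measuring the name from unc therefore overcounted by two and handed the resolver a mangled hostname. A quick runnable check of the arithmetic:

/* hostname == unc + 2, so "sep - unc" overstates the length by 2. */
#include <assert.h>
#include <string.h>

int main(void)
{
	const char *unc = "\\\\server\\share";
	const char *hostname = unc + 2;		/* past the leading "\\" */
	const char *sep = memchr(hostname, '\\', strlen(hostname));

	assert(sep - unc == 8);			/* old, wrong length */
	assert(sep - hostname == 6);		/* strlen("server") */
	return 0;
}
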
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index de748c652d11..e964b1cd5dd0 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -60,34 +60,32 @@ static inline int cifs_convert_flags(unsigned int flags)
60 FILE_READ_DATA); 60 FILE_READ_DATA);
61} 61}
62 62
63static inline fmode_t cifs_posix_convert_flags(unsigned int flags) 63static u32 cifs_posix_convert_flags(unsigned int flags)
64{ 64{
65 fmode_t posix_flags = 0; 65 u32 posix_flags = 0;
66 66
67 if ((flags & O_ACCMODE) == O_RDONLY) 67 if ((flags & O_ACCMODE) == O_RDONLY)
68 posix_flags = FMODE_READ; 68 posix_flags = SMB_O_RDONLY;
69 else if ((flags & O_ACCMODE) == O_WRONLY) 69 else if ((flags & O_ACCMODE) == O_WRONLY)
70 posix_flags = FMODE_WRITE; 70 posix_flags = SMB_O_WRONLY;
71 else if ((flags & O_ACCMODE) == O_RDWR) { 71 else if ((flags & O_ACCMODE) == O_RDWR)
72 /* GENERIC_ALL is too much permission to request 72 posix_flags = SMB_O_RDWR;
73 can cause unnecessary access denied on create */ 73
74 /* return GENERIC_ALL; */ 74 if (flags & O_CREAT)
75 posix_flags = FMODE_READ | FMODE_WRITE; 75 posix_flags |= SMB_O_CREAT;
76 } 76 if (flags & O_EXCL)
77 /* can not map O_CREAT or O_EXCL or O_TRUNC flags when 77 posix_flags |= SMB_O_EXCL;
78 reopening a file. They had their effect on the original open */ 78 if (flags & O_TRUNC)
79 if (flags & O_APPEND) 79 posix_flags |= SMB_O_TRUNC;
80 posix_flags |= (fmode_t)O_APPEND; 80 /* be safe and imply O_SYNC for O_DSYNC */
81 if (flags & O_DSYNC) 81 if (flags & O_DSYNC)
82 posix_flags |= (fmode_t)O_DSYNC; 82 posix_flags |= SMB_O_SYNC;
83 if (flags & __O_SYNC)
84 posix_flags |= (fmode_t)__O_SYNC;
85 if (flags & O_DIRECTORY) 83 if (flags & O_DIRECTORY)
86 posix_flags |= (fmode_t)O_DIRECTORY; 84 posix_flags |= SMB_O_DIRECTORY;
87 if (flags & O_NOFOLLOW) 85 if (flags & O_NOFOLLOW)
88 posix_flags |= (fmode_t)O_NOFOLLOW; 86 posix_flags |= SMB_O_NOFOLLOW;
89 if (flags & O_DIRECT) 87 if (flags & O_DIRECT)
90 posix_flags |= (fmode_t)O_DIRECT; 88 posix_flags |= SMB_O_DIRECT;
91 89
92 return posix_flags; 90 return posix_flags;
93} 91}
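
The rewritten helper returns SMB_O_* wire bits in a u32 instead of overloading fmode_t, and it maps O_CREAT/O_EXCL/O_TRUNC again now that it serves initial opens too. A userspace sketch of the mapping; the SMB_O_* values below are placeholders, not the real constants from cifspdu.h:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>

#define SMB_O_RDONLY 0x1    /* placeholder values */
#define SMB_O_WRONLY 0x2
#define SMB_O_RDWR   0x4
#define SMB_O_CREAT  0x10
#define SMB_O_EXCL   0x20
#define SMB_O_TRUNC  0x40
#define SMB_O_SYNC   0x80

static uint32_t convert_flags(unsigned int flags)
{
        uint32_t out = 0;

        if ((flags & O_ACCMODE) == O_RDONLY)
                out = SMB_O_RDONLY;
        else if ((flags & O_ACCMODE) == O_WRONLY)
                out = SMB_O_WRONLY;
        else if ((flags & O_ACCMODE) == O_RDWR)
                out = SMB_O_RDWR;

        if (flags & O_CREAT)
                out |= SMB_O_CREAT;
        if (flags & O_EXCL)
                out |= SMB_O_EXCL;
        if (flags & O_TRUNC)
                out |= SMB_O_TRUNC;
        /* be safe and imply O_SYNC for O_DSYNC, as the patch does */
        if (flags & O_DSYNC)
                out |= SMB_O_SYNC;
        return out;
}

int main(void)
{
        printf("0x%x\n", (unsigned)convert_flags(O_WRONLY | O_CREAT | O_TRUNC));
        return 0;
}
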
@@ -106,117 +104,239 @@ static inline int cifs_get_disposition(unsigned int flags)
106 return FILE_OPEN; 104 return FILE_OPEN;
107} 105}
108 106
109/* all arguments to this function must be checked for validity in caller */ 107int cifs_posix_open(char *full_path, struct inode **pinode,
110static inline int 108 struct super_block *sb, int mode, unsigned int f_flags,
111cifs_posix_open_inode_helper(struct inode *inode, struct file *file, 109 __u32 *poplock, __u16 *pnetfid, int xid)
112 struct cifsInodeInfo *pCifsInode, __u32 oplock,
113 u16 netfid)
114{ 110{
111 int rc;
112 FILE_UNIX_BASIC_INFO *presp_data;
113 __u32 posix_flags = 0;
114 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
115 struct cifs_fattr fattr;
116 struct tcon_link *tlink;
117 struct cifsTconInfo *tcon;
115 118
116 write_lock(&GlobalSMBSeslock); 119 cFYI(1, "posix open %s", full_path);
117 120
118 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 121 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
119 if (pCifsInode == NULL) { 122 if (presp_data == NULL)
120 write_unlock(&GlobalSMBSeslock); 123 return -ENOMEM;
121 return -EINVAL;
122 }
123 124
124 if (pCifsInode->clientCanCacheRead) { 125 tlink = cifs_sb_tlink(cifs_sb);
125 /* we have the inode open somewhere else 126 if (IS_ERR(tlink)) {
126 no need to discard cache data */ 127 rc = PTR_ERR(tlink);
127 goto psx_client_can_cache; 128 goto posix_open_ret;
128 } 129 }
129 130
130 /* BB FIXME need to fix this check to move it earlier into posix_open 131 tcon = tlink_tcon(tlink);
131 BB fIX following section BB FIXME */ 132 mode &= ~current_umask();
132 133
133 /* if not oplocked, invalidate inode pages if mtime or file 134 posix_flags = cifs_posix_convert_flags(f_flags);
134 size changed */ 135 rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
135/* temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime)); 136 poplock, full_path, cifs_sb->local_nls,
136 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 137 cifs_sb->mnt_cifs_flags &
137 (file->f_path.dentry->d_inode->i_size == 138 CIFS_MOUNT_MAP_SPECIAL_CHR);
138 (loff_t)le64_to_cpu(buf->EndOfFile))) { 139 cifs_put_tlink(tlink);
139 cFYI(1, "inode unchanged on server"); 140
140 } else { 141 if (rc)
141 if (file->f_path.dentry->d_inode->i_mapping) { 142 goto posix_open_ret;
142 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 143
143 if (rc != 0) 144 if (presp_data->Type == cpu_to_le32(-1))
144 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 145 goto posix_open_ret; /* open ok, caller does qpathinfo */
146
147 if (!pinode)
148 goto posix_open_ret; /* caller does not need info */
149
150 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
151
152 /* get new inode and set it up */
153 if (*pinode == NULL) {
154 cifs_fill_uniqueid(sb, &fattr);
155 *pinode = cifs_iget(sb, &fattr);
156 if (!*pinode) {
157 rc = -ENOMEM;
158 goto posix_open_ret;
145 } 159 }
146 cFYI(1, "invalidating remote inode since open detected it " 160 } else {
147 "changed"); 161 cifs_fattr_to_inode(*pinode, &fattr);
148 invalidate_remote_inode(file->f_path.dentry->d_inode); 162 }
149 } */ 163
150 164posix_open_ret:
151psx_client_can_cache: 165 kfree(presp_data);
152 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 166 return rc;
153 pCifsInode->clientCanCacheAll = true;
154 pCifsInode->clientCanCacheRead = true;
155 cFYI(1, "Exclusive Oplock granted on inode %p",
156 file->f_path.dentry->d_inode);
157 } else if ((oplock & 0xF) == OPLOCK_READ)
158 pCifsInode->clientCanCacheRead = true;
159
160 /* will have to change the unlock if we reenable the
161 filemap_fdatawrite (which does not seem necessary */
162 write_unlock(&GlobalSMBSeslock);
163 return 0;
164} 167}
165 168
166/* all arguments to this function must be checked for validity in caller */ 169static int
167static inline int cifs_open_inode_helper(struct inode *inode, 170cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
168 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, 171 struct cifsTconInfo *tcon, unsigned int f_flags, __u32 *poplock,
169 char *full_path, int xid) 172 __u16 *pnetfid, int xid)
170{ 173{
171 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
172 struct timespec temp;
173 int rc; 174 int rc;
175 int desiredAccess;
176 int disposition;
177 FILE_ALL_INFO *buf;
174 178
175 if (pCifsInode->clientCanCacheRead) { 179 desiredAccess = cifs_convert_flags(f_flags);
176 /* we have the inode open somewhere else
177 no need to discard cache data */
178 goto client_can_cache;
179 }
180 180
181 /* BB need same check in cifs_create too? */ 181/*********************************************************************
182 /* if not oplocked, invalidate inode pages if mtime or file 182 * open flag mapping table:
183 size changed */ 183 *
184 temp = cifs_NTtimeToUnix(buf->LastWriteTime); 184 * POSIX Flag CIFS Disposition
185 if (timespec_equal(&inode->i_mtime, &temp) && 185 * ---------- ----------------
186 (inode->i_size == 186 * O_CREAT FILE_OPEN_IF
187 (loff_t)le64_to_cpu(buf->EndOfFile))) { 187 * O_CREAT | O_EXCL FILE_CREATE
188 cFYI(1, "inode unchanged on server"); 188 * O_CREAT | O_TRUNC FILE_OVERWRITE_IF
189 } else { 189 * O_TRUNC FILE_OVERWRITE
190 if (inode->i_mapping) { 190 * none of the above FILE_OPEN
191 /* BB no need to lock inode until after invalidate 191 *
192 since namei code should already have it locked? */ 192 * Note that there is not a direct match between disposition
193 rc = filemap_write_and_wait(inode->i_mapping); 193 * FILE_SUPERSEDE (ie create whether or not file exists although
194 if (rc != 0) 194 * O_CREAT | O_TRUNC is similar but truncates the existing
195 pCifsInode->write_behind_rc = rc; 195 * file rather than creating a new file as FILE_SUPERSEDE does
196 } 196 * (which uses the attributes / metadata passed in on open call)
197 cFYI(1, "invalidating remote inode since open detected it " 197 *?
198 "changed"); 198 *? O_SYNC is a reasonable match to CIFS writethrough flag
199 invalidate_remote_inode(inode); 199 *? and the read write flags match reasonably. O_LARGEFILE
200 } 200 *? is irrelevant because largefile support is always used
201 *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
202 * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
203 *********************************************************************/
204
205 disposition = cifs_get_disposition(f_flags);
201 206
202client_can_cache: 207 /* BB pass O_SYNC flag through on file attributes .. BB */
203 if (pTcon->unix_ext) 208
209 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
210 if (!buf)
211 return -ENOMEM;
212
213 if (tcon->ses->capabilities & CAP_NT_SMBS)
214 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
215 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
216 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
217 & CIFS_MOUNT_MAP_SPECIAL_CHR);
218 else
219 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
220 desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf,
221 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
222 & CIFS_MOUNT_MAP_SPECIAL_CHR);
223
224 if (rc)
225 goto out;
226
227 if (tcon->unix_ext)
204 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, 228 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
205 xid); 229 xid);
206 else 230 else
207 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, 231 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
208 xid, NULL); 232 xid, pnetfid);
209
210 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
211 pCifsInode->clientCanCacheAll = true;
212 pCifsInode->clientCanCacheRead = true;
213 cFYI(1, "Exclusive Oplock granted on inode %p", inode);
214 } else if ((*oplock & 0xF) == OPLOCK_READ)
215 pCifsInode->clientCanCacheRead = true;
216 233
234out:
235 kfree(buf);
217 return rc; 236 return rc;
218} 237}
219 238
239struct cifsFileInfo *
240cifs_new_fileinfo(__u16 fileHandle, struct file *file,
241 struct tcon_link *tlink, __u32 oplock)
242{
243 struct dentry *dentry = file->f_path.dentry;
244 struct inode *inode = dentry->d_inode;
245 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
246 struct cifsFileInfo *pCifsFile;
247
248 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
249 if (pCifsFile == NULL)
250 return pCifsFile;
251
252 pCifsFile->count = 1;
253 pCifsFile->netfid = fileHandle;
254 pCifsFile->pid = current->tgid;
255 pCifsFile->uid = current_fsuid();
256 pCifsFile->dentry = dget(dentry);
257 pCifsFile->f_flags = file->f_flags;
258 pCifsFile->invalidHandle = false;
259 pCifsFile->tlink = cifs_get_tlink(tlink);
260 mutex_init(&pCifsFile->fh_mutex);
261 mutex_init(&pCifsFile->lock_mutex);
262 INIT_LIST_HEAD(&pCifsFile->llist);
263 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
264
265 spin_lock(&cifs_file_list_lock);
266 list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
 267 /* if readable file instance, put it first in the list */
268 if (file->f_mode & FMODE_READ)
269 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
270 else
271 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
272 spin_unlock(&cifs_file_list_lock);
273
274 cifs_set_oplock_level(pCifsInode, oplock);
275
276 file->private_data = pCifsFile;
277 return pCifsFile;
278}
279
280/*
281 * Release a reference on the file private data. This may involve closing
282 * the filehandle out on the server. Must be called without holding
283 * cifs_file_list_lock.
284 */
285void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
286{
287 struct inode *inode = cifs_file->dentry->d_inode;
288 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
289 struct cifsInodeInfo *cifsi = CIFS_I(inode);
290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
291 struct cifsLockInfo *li, *tmp;
292
293 spin_lock(&cifs_file_list_lock);
294 if (--cifs_file->count > 0) {
295 spin_unlock(&cifs_file_list_lock);
296 return;
297 }
298
299 /* remove it from the lists */
300 list_del(&cifs_file->flist);
301 list_del(&cifs_file->tlist);
302
303 if (list_empty(&cifsi->openFileList)) {
304 cFYI(1, "closing last open instance for inode %p",
305 cifs_file->dentry->d_inode);
306
 307 /* in strict cache mode we need to invalidate the mapping on the
 308 last close because it may cause an error when we open this file
 309 again and get at least a level II oplock */
310 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
311 CIFS_I(inode)->invalid_mapping = true;
312
313 cifs_set_oplock_level(cifsi, 0);
314 }
315 spin_unlock(&cifs_file_list_lock);
316
317 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
318 int xid, rc;
319
320 xid = GetXid();
321 rc = CIFSSMBClose(xid, tcon, cifs_file->netfid);
322 FreeXid(xid);
323 }
324
325 /* Delete any outstanding lock records. We'll lose them when the file
326 * is closed anyway.
327 */
328 mutex_lock(&cifs_file->lock_mutex);
329 list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
330 list_del(&li->llist);
331 kfree(li);
332 }
333 mutex_unlock(&cifs_file->lock_mutex);
334
335 cifs_put_tlink(cifs_file->tlink);
336 dput(cifs_file->dentry);
337 kfree(cifs_file);
338}
339
220int cifs_open(struct inode *inode, struct file *file) 340int cifs_open(struct inode *inode, struct file *file)
221{ 341{
222 int rc = -EACCES; 342 int rc = -EACCES;
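
cifs_new_fileinfo and cifsFileInfo_put above centralize the handle lifetime: the creator holds the first reference, gets and puts are serialized by cifs_file_list_lock, and the final put closes the server handle and frees outstanding lock records. The same pattern modeled in plain pthreads (a hypothetical analogue of the kernel code, not a copy of it; build with -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fileinfo {
        int count;
        pthread_mutex_t *list_lock;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct fileinfo *fileinfo_new(void)
{
        struct fileinfo *f = calloc(1, sizeof(*f));
        if (!f)
                return NULL;
        f->count = 1;               /* creator holds the first reference */
        f->list_lock = &list_lock;
        return f;
}

static void fileinfo_get(struct fileinfo *f)
{
        pthread_mutex_lock(f->list_lock);
        f->count++;
        pthread_mutex_unlock(f->list_lock);
}

static void fileinfo_put(struct fileinfo *f)
{
        pthread_mutex_lock(f->list_lock);
        if (--f->count > 0) {
                pthread_mutex_unlock(f->list_lock);
                return;
        }
        pthread_mutex_unlock(f->list_lock);
        /* last reference: close the server handle, free lock records */
        printf("tearing down handle\n");
        free(f);
}

int main(void)
{
        struct fileinfo *f = fileinfo_new();
        fileinfo_get(f);   /* second user */
        fileinfo_put(f);   /* drops to 1, no teardown */
        fileinfo_put(f);   /* drops to 0, teardown */
        return 0;
}
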
@@ -224,20 +344,21 @@ int cifs_open(struct inode *inode, struct file *file)
224 __u32 oplock; 344 __u32 oplock;
225 struct cifs_sb_info *cifs_sb; 345 struct cifs_sb_info *cifs_sb;
226 struct cifsTconInfo *tcon; 346 struct cifsTconInfo *tcon;
347 struct tcon_link *tlink;
227 struct cifsFileInfo *pCifsFile = NULL; 348 struct cifsFileInfo *pCifsFile = NULL;
228 struct cifsInodeInfo *pCifsInode;
229 char *full_path = NULL; 349 char *full_path = NULL;
230 int desiredAccess; 350 bool posix_open_ok = false;
231 int disposition;
232 __u16 netfid; 351 __u16 netfid;
233 FILE_ALL_INFO *buf = NULL;
234 352
235 xid = GetXid(); 353 xid = GetXid();
236 354
237 cifs_sb = CIFS_SB(inode->i_sb); 355 cifs_sb = CIFS_SB(inode->i_sb);
238 tcon = cifs_sb->tcon; 356 tlink = cifs_sb_tlink(cifs_sb);
239 357 if (IS_ERR(tlink)) {
240 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 358 FreeXid(xid);
359 return PTR_ERR(tlink);
360 }
361 tcon = tlink_tcon(tlink);
241 362
242 full_path = build_path_from_dentry(file->f_path.dentry); 363 full_path = build_path_from_dentry(file->f_path.dentry);
243 if (full_path == NULL) { 364 if (full_path == NULL) {
@@ -257,35 +378,13 @@ int cifs_open(struct inode *inode, struct file *file)
257 (tcon->ses->capabilities & CAP_UNIX) && 378 (tcon->ses->capabilities & CAP_UNIX) &&
258 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 379 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
259 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 380 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
260 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
261 oflags |= SMB_O_CREAT;
262 /* can not refresh inode info since size could be stale */ 381 /* can not refresh inode info since size could be stale */
263 rc = cifs_posix_open(full_path, &inode, inode->i_sb, 382 rc = cifs_posix_open(full_path, &inode, inode->i_sb,
264 cifs_sb->mnt_file_mode /* ignored */, 383 cifs_sb->mnt_file_mode /* ignored */,
265 oflags, &oplock, &netfid, xid); 384 file->f_flags, &oplock, &netfid, xid);
266 if (rc == 0) { 385 if (rc == 0) {
267 cFYI(1, "posix open succeeded"); 386 cFYI(1, "posix open succeeded");
268 /* no need for special case handling of setting mode 387 posix_open_ok = true;
269 on read only files needed here */
270
271 rc = cifs_posix_open_inode_helper(inode, file,
272 pCifsInode, oplock, netfid);
273 if (rc != 0) {
274 CIFSSMBClose(xid, tcon, netfid);
275 goto out;
276 }
277
278 pCifsFile = cifs_new_fileinfo(inode, netfid, file,
279 file->f_path.mnt,
280 oflags);
281 if (pCifsFile == NULL) {
282 CIFSSMBClose(xid, tcon, netfid);
283 rc = -ENOMEM;
284 }
285
286 cifs_fscache_set_inode_cookie(inode, file);
287
288 goto out;
289 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 388 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
290 if (tcon->ses->serverNOS) 389 if (tcon->ses->serverNOS)
291 cERROR(1, "server %s of type %s returned" 390 cERROR(1, "server %s of type %s returned"
@@ -302,106 +401,42 @@ int cifs_open(struct inode *inode, struct file *file)
302 or DFS errors */ 401 or DFS errors */
303 } 402 }
304 403
305 desiredAccess = cifs_convert_flags(file->f_flags); 404 if (!posix_open_ok) {
306 405 rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
307/********************************************************************* 406 file->f_flags, &oplock, &netfid, xid);
308 * open flag mapping table: 407 if (rc)
309 * 408 goto out;
310 * POSIX Flag CIFS Disposition
311 * ---------- ----------------
312 * O_CREAT FILE_OPEN_IF
313 * O_CREAT | O_EXCL FILE_CREATE
314 * O_CREAT | O_TRUNC FILE_OVERWRITE_IF
315 * O_TRUNC FILE_OVERWRITE
316 * none of the above FILE_OPEN
317 *
318 * Note that there is not a direct match between disposition
319 * FILE_SUPERSEDE (ie create whether or not file exists although
320 * O_CREAT | O_TRUNC is similar but truncates the existing
321 * file rather than creating a new file as FILE_SUPERSEDE does
322 * (which uses the attributes / metadata passed in on open call)
323 *?
324 *? O_SYNC is a reasonable match to CIFS writethrough flag
325 *? and the read write flags match reasonably. O_LARGEFILE
326 *? is irrelevant because largefile support is always used
327 *? by this client. Flags O_APPEND, O_DIRECT, O_DIRECTORY,
328 * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation
329 *********************************************************************/
330
331 disposition = cifs_get_disposition(file->f_flags);
332
333 /* BB pass O_SYNC flag through on file attributes .. BB */
334
335 /* Also refresh inode by passing in file_info buf returned by SMBOpen
336 and calling get_inode_info with returned buf (at least helps
337 non-Unix server case) */
338
339 /* BB we can not do this if this is the second open of a file
340 and the first handle has writebehind data, we might be
341 able to simply do a filemap_fdatawrite/filemap_fdatawait first */
342 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
343 if (!buf) {
344 rc = -ENOMEM;
345 goto out;
346 }
347
348 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
349 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
350 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
351 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
352 & CIFS_MOUNT_MAP_SPECIAL_CHR);
353 else
354 rc = -EIO; /* no NT SMB support fall into legacy open below */
355
356 if (rc == -EIO) {
357 /* Old server, try legacy style OpenX */
358 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
359 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
360 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
361 & CIFS_MOUNT_MAP_SPECIAL_CHR);
362 }
363 if (rc) {
364 cFYI(1, "cifs_open returned 0x%x", rc);
365 goto out;
366 } 409 }
367 410
368 rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid); 411 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
369 if (rc != 0)
370 goto out;
371
372 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
373 file->f_flags);
374 if (pCifsFile == NULL) { 412 if (pCifsFile == NULL) {
413 CIFSSMBClose(xid, tcon, netfid);
375 rc = -ENOMEM; 414 rc = -ENOMEM;
376 goto out; 415 goto out;
377 } 416 }
378 417
379 cifs_fscache_set_inode_cookie(inode, file); 418 cifs_fscache_set_inode_cookie(inode, file);
380 419
381 if (oplock & CIFS_CREATE_ACTION) { 420 if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
382 /* time to set mode which we can not set earlier due to 421 /* time to set mode which we can not set earlier due to
383 problems creating new read-only files */ 422 problems creating new read-only files */
384 if (tcon->unix_ext) { 423 struct cifs_unix_set_info_args args = {
385 struct cifs_unix_set_info_args args = { 424 .mode = inode->i_mode,
386 .mode = inode->i_mode, 425 .uid = NO_CHANGE_64,
387 .uid = NO_CHANGE_64, 426 .gid = NO_CHANGE_64,
388 .gid = NO_CHANGE_64, 427 .ctime = NO_CHANGE_64,
389 .ctime = NO_CHANGE_64, 428 .atime = NO_CHANGE_64,
390 .atime = NO_CHANGE_64, 429 .mtime = NO_CHANGE_64,
391 .mtime = NO_CHANGE_64, 430 .device = 0,
392 .device = 0, 431 };
393 }; 432 CIFSSMBUnixSetFileInfo(xid, tcon, &args, netfid,
394 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, 433 pCifsFile->pid);
395 cifs_sb->local_nls,
396 cifs_sb->mnt_cifs_flags &
397 CIFS_MOUNT_MAP_SPECIAL_CHR);
398 }
399 } 434 }
400 435
401out: 436out:
402 kfree(buf);
403 kfree(full_path); 437 kfree(full_path);
404 FreeXid(xid); 438 FreeXid(xid);
439 cifs_put_tlink(tlink);
405 return rc; 440 return rc;
406} 441}
407 442
@@ -416,14 +451,13 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
416 return rc; 451 return rc;
417} 452}
418 453
419static int cifs_reopen_file(struct file *file, bool can_flush) 454static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
420{ 455{
421 int rc = -EACCES; 456 int rc = -EACCES;
422 int xid; 457 int xid;
423 __u32 oplock; 458 __u32 oplock;
424 struct cifs_sb_info *cifs_sb; 459 struct cifs_sb_info *cifs_sb;
425 struct cifsTconInfo *tcon; 460 struct cifsTconInfo *tcon;
426 struct cifsFileInfo *pCifsFile;
427 struct cifsInodeInfo *pCifsInode; 461 struct cifsInodeInfo *pCifsInode;
428 struct inode *inode; 462 struct inode *inode;
429 char *full_path = NULL; 463 char *full_path = NULL;
@@ -431,11 +465,6 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
431 int disposition = FILE_OPEN; 465 int disposition = FILE_OPEN;
432 __u16 netfid; 466 __u16 netfid;
433 467
434 if (file->private_data)
435 pCifsFile = file->private_data;
436 else
437 return -EBADF;
438
439 xid = GetXid(); 468 xid = GetXid();
440 mutex_lock(&pCifsFile->fh_mutex); 469 mutex_lock(&pCifsFile->fh_mutex);
441 if (!pCifsFile->invalidHandle) { 470 if (!pCifsFile->invalidHandle) {
@@ -445,39 +474,24 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
445 return rc; 474 return rc;
446 } 475 }
447 476
448 if (file->f_path.dentry == NULL) { 477 inode = pCifsFile->dentry->d_inode;
449 cERROR(1, "no valid name if dentry freed");
450 dump_stack();
451 rc = -EBADF;
452 goto reopen_error_exit;
453 }
454
455 inode = file->f_path.dentry->d_inode;
456 if (inode == NULL) {
457 cERROR(1, "inode not valid");
458 dump_stack();
459 rc = -EBADF;
460 goto reopen_error_exit;
461 }
462
463 cifs_sb = CIFS_SB(inode->i_sb); 478 cifs_sb = CIFS_SB(inode->i_sb);
464 tcon = cifs_sb->tcon; 479 tcon = tlink_tcon(pCifsFile->tlink);
465 480
466/* can not grab rename sem here because various ops, including 481/* can not grab rename sem here because various ops, including
467 those that already have the rename sem can end up causing writepage 482 those that already have the rename sem can end up causing writepage
468 to get called and if the server was down that means we end up here, 483 to get called and if the server was down that means we end up here,
469 and we can never tell if the caller already has the rename_sem */ 484 and we can never tell if the caller already has the rename_sem */
470 full_path = build_path_from_dentry(file->f_path.dentry); 485 full_path = build_path_from_dentry(pCifsFile->dentry);
471 if (full_path == NULL) { 486 if (full_path == NULL) {
472 rc = -ENOMEM; 487 rc = -ENOMEM;
473reopen_error_exit:
474 mutex_unlock(&pCifsFile->fh_mutex); 488 mutex_unlock(&pCifsFile->fh_mutex);
475 FreeXid(xid); 489 FreeXid(xid);
476 return rc; 490 return rc;
477 } 491 }
478 492
479 cFYI(1, "inode = 0x%p file flags 0x%x for %s", 493 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
480 inode, file->f_flags, full_path); 494 inode, pCifsFile->f_flags, full_path);
481 495
482 if (oplockEnabled) 496 if (oplockEnabled)
483 oplock = REQ_OPLOCK; 497 oplock = REQ_OPLOCK;
@@ -487,8 +501,14 @@ reopen_error_exit:
487 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 501 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
488 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 502 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
489 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 503 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
490 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 504
491 /* can not refresh inode info since size could be stale */ 505 /*
506 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
507 * original open. Must mask them off for a reopen.
508 */
509 unsigned int oflags = pCifsFile->f_flags &
510 ~(O_CREAT | O_EXCL | O_TRUNC);
511
492 rc = cifs_posix_open(full_path, NULL, inode->i_sb, 512 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
493 cifs_sb->mnt_file_mode /* ignored */, 513 cifs_sb->mnt_file_mode /* ignored */,
494 oflags, &oplock, &netfid, xid); 514 oflags, &oplock, &netfid, xid);
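
The masking added above matters because a reopen re-sends the flags to the server: O_TRUNC still set would empty the file a second time, and O_CREAT|O_EXCL could fail outright. A one-screen demonstration:

#include <stdio.h>
#include <fcntl.h>

int main(void)
{
        unsigned int f_flags = O_RDWR | O_CREAT | O_TRUNC;
        unsigned int oflags = f_flags & ~(O_CREAT | O_EXCL | O_TRUNC);

        /* creation-time flags already had their effect on the original
         * open, so only the access mode and similar bits survive */
        printf("orig=0x%x reopen=0x%x\n", f_flags, oflags);
        return 0;
}
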
@@ -500,7 +520,7 @@ reopen_error_exit:
500 in the reconnect path it is important to retry hard */ 520 in the reconnect path it is important to retry hard */
501 } 521 }
502 522
503 desiredAccess = cifs_convert_flags(file->f_flags); 523 desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
504 524
505 /* Can not refresh inode by passing in file_info buf to be returned 525 /* Can not refresh inode by passing in file_info buf to be returned
506 by SMBOpen and then calling get_inode_info with returned buf 526 by SMBOpen and then calling get_inode_info with returned buf
@@ -516,49 +536,38 @@ reopen_error_exit:
516 mutex_unlock(&pCifsFile->fh_mutex); 536 mutex_unlock(&pCifsFile->fh_mutex);
517 cFYI(1, "cifs_open returned 0x%x", rc); 537 cFYI(1, "cifs_open returned 0x%x", rc);
518 cFYI(1, "oplock: %d", oplock); 538 cFYI(1, "oplock: %d", oplock);
519 } else { 539 goto reopen_error_exit;
520reopen_success:
521 pCifsFile->netfid = netfid;
522 pCifsFile->invalidHandle = false;
523 mutex_unlock(&pCifsFile->fh_mutex);
524 pCifsInode = CIFS_I(inode);
525 if (pCifsInode) {
526 if (can_flush) {
527 rc = filemap_write_and_wait(inode->i_mapping);
528 if (rc != 0)
529 CIFS_I(inode)->write_behind_rc = rc;
530 /* temporarily disable caching while we
531 go to server to get inode info */
532 pCifsInode->clientCanCacheAll = false;
533 pCifsInode->clientCanCacheRead = false;
534 if (tcon->unix_ext)
535 rc = cifs_get_inode_info_unix(&inode,
536 full_path, inode->i_sb, xid);
537 else
538 rc = cifs_get_inode_info(&inode,
539 full_path, NULL, inode->i_sb,
540 xid, NULL);
541 } /* else we are writing out data to server already
542 and could deadlock if we tried to flush data, and
543 since we do not know if we have data that would
544 invalidate the current end of file on the server
545 we can not go to the server to get the new inod
546 info */
547 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
548 pCifsInode->clientCanCacheAll = true;
549 pCifsInode->clientCanCacheRead = true;
550 cFYI(1, "Exclusive Oplock granted on inode %p",
551 file->f_path.dentry->d_inode);
552 } else if ((oplock & 0xF) == OPLOCK_READ) {
553 pCifsInode->clientCanCacheRead = true;
554 pCifsInode->clientCanCacheAll = false;
555 } else {
556 pCifsInode->clientCanCacheRead = false;
557 pCifsInode->clientCanCacheAll = false;
558 }
559 cifs_relock_file(pCifsFile);
560 }
561 } 540 }
541
542reopen_success:
543 pCifsFile->netfid = netfid;
544 pCifsFile->invalidHandle = false;
545 mutex_unlock(&pCifsFile->fh_mutex);
546 pCifsInode = CIFS_I(inode);
547
548 if (can_flush) {
549 rc = filemap_write_and_wait(inode->i_mapping);
550 mapping_set_error(inode->i_mapping, rc);
551
552 if (tcon->unix_ext)
553 rc = cifs_get_inode_info_unix(&inode,
554 full_path, inode->i_sb, xid);
555 else
556 rc = cifs_get_inode_info(&inode,
557 full_path, NULL, inode->i_sb,
558 xid, NULL);
559 } /* else we are writing out data to server already
560 and could deadlock if we tried to flush data, and
561 since we do not know if we have data that would
562 invalidate the current end of file on the server
 563 we can not go to the server to get the new inode
 564 info */
565
566 cifs_set_oplock_level(pCifsInode, oplock);
567
568 cifs_relock_file(pCifsFile);
569
570reopen_error_exit:
562 kfree(full_path); 571 kfree(full_path);
563 FreeXid(xid); 572 FreeXid(xid);
564 return rc; 573 return rc;
@@ -566,79 +575,11 @@ reopen_success:
566 575
567int cifs_close(struct inode *inode, struct file *file) 576int cifs_close(struct inode *inode, struct file *file)
568{ 577{
569 int rc = 0; 578 cifsFileInfo_put(file->private_data);
570 int xid, timeout; 579 file->private_data = NULL;
571 struct cifs_sb_info *cifs_sb;
572 struct cifsTconInfo *pTcon;
573 struct cifsFileInfo *pSMBFile = file->private_data;
574 580
575 xid = GetXid(); 581 /* return code from the ->release op is always ignored */
576 582 return 0;
577 cifs_sb = CIFS_SB(inode->i_sb);
578 pTcon = cifs_sb->tcon;
579 if (pSMBFile) {
580 struct cifsLockInfo *li, *tmp;
581 write_lock(&GlobalSMBSeslock);
582 pSMBFile->closePend = true;
583 if (pTcon) {
584 /* no sense reconnecting to close a file that is
585 already closed */
586 if (!pTcon->need_reconnect) {
587 write_unlock(&GlobalSMBSeslock);
588 timeout = 2;
589 while ((atomic_read(&pSMBFile->count) != 1)
590 && (timeout <= 2048)) {
591 /* Give write a better chance to get to
592 server ahead of the close. We do not
593 want to add a wait_q here as it would
594 increase the memory utilization as
595 the struct would be in each open file,
596 but this should give enough time to
597 clear the socket */
598 cFYI(DBG2, "close delay, write pending");
599 msleep(timeout);
600 timeout *= 4;
601 }
602 if (!pTcon->need_reconnect &&
603 !pSMBFile->invalidHandle)
604 rc = CIFSSMBClose(xid, pTcon,
605 pSMBFile->netfid);
606 } else
607 write_unlock(&GlobalSMBSeslock);
608 } else
609 write_unlock(&GlobalSMBSeslock);
610
611 /* Delete any outstanding lock records.
612 We'll lose them when the file is closed anyway. */
613 mutex_lock(&pSMBFile->lock_mutex);
614 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
615 list_del(&li->llist);
616 kfree(li);
617 }
618 mutex_unlock(&pSMBFile->lock_mutex);
619
620 write_lock(&GlobalSMBSeslock);
621 list_del(&pSMBFile->flist);
622 list_del(&pSMBFile->tlist);
623 write_unlock(&GlobalSMBSeslock);
624 cifsFileInfo_put(file->private_data);
625 file->private_data = NULL;
626 } else
627 rc = -EBADF;
628
629 read_lock(&GlobalSMBSeslock);
630 if (list_empty(&(CIFS_I(inode)->openFileList))) {
631 cFYI(1, "closing last open instance for inode %p", inode);
632 /* if the file is not open we do not know if we can cache info
633 on this inode, much less write behind and read ahead */
634 CIFS_I(inode)->clientCanCacheRead = false;
635 CIFS_I(inode)->clientCanCacheAll = false;
636 }
637 read_unlock(&GlobalSMBSeslock);
638 if ((rc == 0) && CIFS_I(inode)->write_behind_rc)
639 rc = CIFS_I(inode)->write_behind_rc;
640 FreeXid(xid);
641 return rc;
642} 583}
643 584
644int cifs_closedir(struct inode *inode, struct file *file) 585int cifs_closedir(struct inode *inode, struct file *file)
@@ -653,25 +594,21 @@ int cifs_closedir(struct inode *inode, struct file *file)
653 xid = GetXid(); 594 xid = GetXid();
654 595
655 if (pCFileStruct) { 596 if (pCFileStruct) {
656 struct cifsTconInfo *pTcon; 597 struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
657 struct cifs_sb_info *cifs_sb =
658 CIFS_SB(file->f_path.dentry->d_sb);
659
660 pTcon = cifs_sb->tcon;
661 598
662 cFYI(1, "Freeing private data in close dir"); 599 cFYI(1, "Freeing private data in close dir");
663 write_lock(&GlobalSMBSeslock); 600 spin_lock(&cifs_file_list_lock);
664 if (!pCFileStruct->srch_inf.endOfSearch && 601 if (!pCFileStruct->srch_inf.endOfSearch &&
665 !pCFileStruct->invalidHandle) { 602 !pCFileStruct->invalidHandle) {
666 pCFileStruct->invalidHandle = true; 603 pCFileStruct->invalidHandle = true;
667 write_unlock(&GlobalSMBSeslock); 604 spin_unlock(&cifs_file_list_lock);
668 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 605 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
669 cFYI(1, "Closing uncompleted readdir with rc %d", 606 cFYI(1, "Closing uncompleted readdir with rc %d",
670 rc); 607 rc);
671 /* not much we can do if it fails anyway, ignore rc */ 608 /* not much we can do if it fails anyway, ignore rc */
672 rc = 0; 609 rc = 0;
673 } else 610 } else
674 write_unlock(&GlobalSMBSeslock); 611 spin_unlock(&cifs_file_list_lock);
675 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 612 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
676 if (ptmp) { 613 if (ptmp) {
677 cFYI(1, "closedir free smb buf in srch struct"); 614 cFYI(1, "closedir free smb buf in srch struct");
@@ -681,6 +618,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
681 else 618 else
682 cifs_buf_release(ptmp); 619 cifs_buf_release(ptmp);
683 } 620 }
621 cifs_put_tlink(pCFileStruct->tlink);
684 kfree(file->private_data); 622 kfree(file->private_data);
685 file->private_data = NULL; 623 file->private_data = NULL;
686 } 624 }
@@ -767,13 +705,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
767 cFYI(1, "Unknown type of lock"); 705 cFYI(1, "Unknown type of lock");
768 706
769 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 707 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
770 tcon = cifs_sb->tcon; 708 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
771
772 if (file->private_data == NULL) {
773 rc = -EBADF;
774 FreeXid(xid);
775 return rc;
776 }
777 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 709 netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
778 710
779 if ((tcon->ses->capabilities & CAP_UNIX) && 711 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -799,12 +731,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
799 731
800 /* BB we could chain these into one lock request BB */ 732 /* BB we could chain these into one lock request BB */
801 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 733 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
802 0, 1, lockType, 0 /* wait flag */ ); 734 0, 1, lockType, 0 /* wait flag */, 0);
803 if (rc == 0) { 735 if (rc == 0) {
804 rc = CIFSSMBLock(xid, tcon, netfid, length, 736 rc = CIFSSMBLock(xid, tcon, netfid, length,
805 pfLock->fl_start, 1 /* numUnlock */ , 737 pfLock->fl_start, 1 /* numUnlock */ ,
806 0 /* numLock */ , lockType, 738 0 /* numLock */ , lockType,
807 0 /* wait flag */ ); 739 0 /* wait flag */, 0);
808 pfLock->fl_type = F_UNLCK; 740 pfLock->fl_type = F_UNLCK;
809 if (rc != 0) 741 if (rc != 0)
810 cERROR(1, "Error unlocking previously locked " 742 cERROR(1, "Error unlocking previously locked "
@@ -821,13 +753,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
821 rc = CIFSSMBLock(xid, tcon, netfid, length, 753 rc = CIFSSMBLock(xid, tcon, netfid, length,
822 pfLock->fl_start, 0, 1, 754 pfLock->fl_start, 0, 1,
823 lockType | LOCKING_ANDX_SHARED_LOCK, 755 lockType | LOCKING_ANDX_SHARED_LOCK,
824 0 /* wait flag */); 756 0 /* wait flag */, 0);
825 if (rc == 0) { 757 if (rc == 0) {
826 rc = CIFSSMBLock(xid, tcon, netfid, 758 rc = CIFSSMBLock(xid, tcon, netfid,
827 length, pfLock->fl_start, 1, 0, 759 length, pfLock->fl_start, 1, 0,
828 lockType | 760 lockType |
829 LOCKING_ANDX_SHARED_LOCK, 761 LOCKING_ANDX_SHARED_LOCK,
830 0 /* wait flag */); 762 0 /* wait flag */, 0);
831 pfLock->fl_type = F_RDLCK; 763 pfLock->fl_type = F_RDLCK;
832 if (rc != 0) 764 if (rc != 0)
833 cERROR(1, "Error unlocking " 765 cERROR(1, "Error unlocking "
@@ -870,8 +802,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
870 802
871 if (numLock) { 803 if (numLock) {
872 rc = CIFSSMBLock(xid, tcon, netfid, length, 804 rc = CIFSSMBLock(xid, tcon, netfid, length,
873 pfLock->fl_start, 805 pfLock->fl_start, 0, numLock, lockType,
874 0, numLock, lockType, wait_flag); 806 wait_flag, 0);
875 807
876 if (rc == 0) { 808 if (rc == 0) {
877 /* For Windows locks we must store them. */ 809 /* For Windows locks we must store them. */
@@ -891,9 +823,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
891 (pfLock->fl_start + length) >= 823 (pfLock->fl_start + length) >=
892 (li->offset + li->length)) { 824 (li->offset + li->length)) {
893 stored_rc = CIFSSMBLock(xid, tcon, 825 stored_rc = CIFSSMBLock(xid, tcon,
894 netfid, 826 netfid, li->length,
895 li->length, li->offset, 827 li->offset, 1, 0,
896 1, 0, li->type, false); 828 li->type, false, 0);
897 if (stored_rc) 829 if (stored_rc)
898 rc = stored_rc; 830 rc = stored_rc;
899 else { 831 else {
@@ -912,31 +844,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
912 return rc; 844 return rc;
913} 845}
914 846
915/*
916 * Set the timeout on write requests past EOF. For some servers (Windows)
917 * these calls can be very long.
918 *
919 * If we're writing >10M past the EOF we give a 180s timeout. Anything less
920 * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
921 * The 10M cutoff is totally arbitrary. A better scheme for this would be
922 * welcome if someone wants to suggest one.
923 *
924 * We may be able to do a better job with this if there were some way to
925 * declare that a file should be sparse.
926 */
927static int
928cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
929{
930 if (offset <= cifsi->server_eof)
931 return CIFS_STD_OP;
932 else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
933 return CIFS_VLONG_OP;
934 else
935 return CIFS_LONG_OP;
936}
937
938/* update the file size (if needed) after a write */ 847/* update the file size (if needed) after a write */
939static void 848void
940cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, 849cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
941 unsigned int bytes_written) 850 unsigned int bytes_written)
942{ 851{
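
The deleted cifs_write_timeout is fully described by its comment, so it is easy to reconstruct standalone; the 15s/45s/180s tiers come from that comment, and the CIFS_*_OP values below are illustrative stand-ins for the real constants:

#include <stdio.h>

#define CIFS_STD_OP   15   /* seconds, illustrative */
#define CIFS_LONG_OP  45
#define CIFS_VLONG_OP 180

static int write_timeout(long long server_eof, long long offset)
{
        if (offset <= server_eof)
                return CIFS_STD_OP;
        else if (offset > server_eof + 10 * 1024 * 1024)
                return CIFS_VLONG_OP;   /* >10M past EOF */
        else
                return CIFS_LONG_OP;
}

int main(void)
{
        printf("%d\n", write_timeout(0, 1));                 /* 45  */
        printf("%d\n", write_timeout(0, 20 * 1024 * 1024));  /* 180 */
        return 0;
}
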
@@ -949,25 +858,26 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
949ssize_t cifs_user_write(struct file *file, const char __user *write_data, 858ssize_t cifs_user_write(struct file *file, const char __user *write_data,
950 size_t write_size, loff_t *poffset) 859 size_t write_size, loff_t *poffset)
951{ 860{
861 struct inode *inode = file->f_path.dentry->d_inode;
952 int rc = 0; 862 int rc = 0;
953 unsigned int bytes_written = 0; 863 unsigned int bytes_written = 0;
954 unsigned int total_written; 864 unsigned int total_written;
955 struct cifs_sb_info *cifs_sb; 865 struct cifs_sb_info *cifs_sb;
956 struct cifsTconInfo *pTcon; 866 struct cifsTconInfo *pTcon;
957 int xid, long_op; 867 int xid;
958 struct cifsFileInfo *open_file; 868 struct cifsFileInfo *open_file;
959 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 869 struct cifsInodeInfo *cifsi = CIFS_I(inode);
960 870
961 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 871 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
962 872
963 pTcon = cifs_sb->tcon;
964
965 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size, 873 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
966 *poffset, file->f_path.dentry->d_name.name); */ 874 *poffset, file->f_path.dentry->d_name.name); */
967 875
968 if (file->private_data == NULL) 876 if (file->private_data == NULL)
969 return -EBADF; 877 return -EBADF;
878
970 open_file = file->private_data; 879 open_file = file->private_data;
880 pTcon = tlink_tcon(open_file->tlink);
971 881
972 rc = generic_write_checks(file, poffset, &write_size, 0); 882 rc = generic_write_checks(file, poffset, &write_size, 0);
973 if (rc) 883 if (rc)
@@ -975,7 +885,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
975 885
976 xid = GetXid(); 886 xid = GetXid();
977 887
978 long_op = cifs_write_timeout(cifsi, *poffset);
979 for (total_written = 0; write_size > total_written; 888 for (total_written = 0; write_size > total_written;
980 total_written += bytes_written) { 889 total_written += bytes_written) {
981 rc = -EAGAIN; 890 rc = -EAGAIN;
@@ -988,19 +897,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
988 we blocked so return what we managed to write */ 897 we blocked so return what we managed to write */
989 return total_written; 898 return total_written;
990 } 899 }
991 if (open_file->closePend) {
992 FreeXid(xid);
993 if (total_written)
994 return total_written;
995 else
996 return -EBADF;
997 }
998 if (open_file->invalidHandle) { 900 if (open_file->invalidHandle) {
999 /* we could deadlock if we called 901 /* we could deadlock if we called
1000 filemap_fdatawait from here so tell 902 filemap_fdatawait from here so tell
1001 reopen_file not to flush data to server 903 reopen_file not to flush data to server
1002 now */ 904 now */
1003 rc = cifs_reopen_file(file, false); 905 rc = cifs_reopen_file(open_file, false);
1004 if (rc != 0) 906 if (rc != 0)
1005 break; 907 break;
1006 } 908 }
@@ -1010,7 +912,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1010 min_t(const int, cifs_sb->wsize, 912 min_t(const int, cifs_sb->wsize,
1011 write_size - total_written), 913 write_size - total_written),
1012 *poffset, &bytes_written, 914 *poffset, &bytes_written,
1013 NULL, write_data + total_written, long_op); 915 NULL, write_data + total_written, 0);
1014 } 916 }
1015 if (rc || (bytes_written == 0)) { 917 if (rc || (bytes_written == 0)) {
1016 if (total_written) 918 if (total_written)
@@ -1023,83 +925,57 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1023 cifs_update_eof(cifsi, *poffset, bytes_written); 925 cifs_update_eof(cifsi, *poffset, bytes_written);
1024 *poffset += bytes_written; 926 *poffset += bytes_written;
1025 } 927 }
1026 long_op = CIFS_STD_OP; /* subsequent writes fast -
1027 15 seconds is plenty */
1028 } 928 }
1029 929
1030 cifs_stats_bytes_written(pTcon, total_written); 930 cifs_stats_bytes_written(pTcon, total_written);
1031 931
1032 /* since the write may have blocked check these pointers again */
1033 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
1034 struct inode *inode = file->f_path.dentry->d_inode;
1035/* Do not update local mtime - server will set its actual value on write 932/* Do not update local mtime - server will set its actual value on write
1036 * inode->i_ctime = inode->i_mtime = 933 * inode->i_ctime = inode->i_mtime =
1037 * current_fs_time(inode->i_sb);*/ 934 * current_fs_time(inode->i_sb);*/
1038 if (total_written > 0) { 935 if (total_written > 0) {
1039 spin_lock(&inode->i_lock); 936 spin_lock(&inode->i_lock);
1040 if (*poffset > file->f_path.dentry->d_inode->i_size) 937 if (*poffset > inode->i_size)
1041 i_size_write(file->f_path.dentry->d_inode, 938 i_size_write(inode, *poffset);
1042 *poffset); 939 spin_unlock(&inode->i_lock);
1043 spin_unlock(&inode->i_lock);
1044 }
1045 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1046 } 940 }
941 mark_inode_dirty_sync(inode);
942
1047 FreeXid(xid); 943 FreeXid(xid);
1048 return total_written; 944 return total_written;
1049} 945}
1050 946
1051static ssize_t cifs_write(struct file *file, const char *write_data, 947static ssize_t cifs_write(struct cifsFileInfo *open_file,
1052 size_t write_size, loff_t *poffset) 948 const char *write_data, size_t write_size,
949 loff_t *poffset)
1053{ 950{
1054 int rc = 0; 951 int rc = 0;
1055 unsigned int bytes_written = 0; 952 unsigned int bytes_written = 0;
1056 unsigned int total_written; 953 unsigned int total_written;
1057 struct cifs_sb_info *cifs_sb; 954 struct cifs_sb_info *cifs_sb;
1058 struct cifsTconInfo *pTcon; 955 struct cifsTconInfo *pTcon;
1059 int xid, long_op; 956 int xid;
1060 struct cifsFileInfo *open_file; 957 struct dentry *dentry = open_file->dentry;
1061 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 958 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
1062
1063 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1064 959
1065 pTcon = cifs_sb->tcon; 960 cifs_sb = CIFS_SB(dentry->d_sb);
1066 961
1067 cFYI(1, "write %zd bytes to offset %lld of %s", write_size, 962 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1068 *poffset, file->f_path.dentry->d_name.name); 963 *poffset, dentry->d_name.name);
1069 964
1070 if (file->private_data == NULL) 965 pTcon = tlink_tcon(open_file->tlink);
1071 return -EBADF;
1072 open_file = file->private_data;
1073 966
1074 xid = GetXid(); 967 xid = GetXid();
1075 968
1076 long_op = cifs_write_timeout(cifsi, *poffset);
1077 for (total_written = 0; write_size > total_written; 969 for (total_written = 0; write_size > total_written;
1078 total_written += bytes_written) { 970 total_written += bytes_written) {
1079 rc = -EAGAIN; 971 rc = -EAGAIN;
1080 while (rc == -EAGAIN) { 972 while (rc == -EAGAIN) {
1081 if (file->private_data == NULL) {
1082 /* file has been closed on us */
1083 FreeXid(xid);
1084 /* if we have gotten here we have written some data
1085 and blocked, and the file has been freed on us
1086 while we blocked so return what we managed to
1087 write */
1088 return total_written;
1089 }
1090 if (open_file->closePend) {
1091 FreeXid(xid);
1092 if (total_written)
1093 return total_written;
1094 else
1095 return -EBADF;
1096 }
1097 if (open_file->invalidHandle) { 973 if (open_file->invalidHandle) {
1098 /* we could deadlock if we called 974 /* we could deadlock if we called
1099 filemap_fdatawait from here so tell 975 filemap_fdatawait from here so tell
1100 reopen_file not to flush data to 976 reopen_file not to flush data to
1101 server now */ 977 server now */
1102 rc = cifs_reopen_file(file, false); 978 rc = cifs_reopen_file(open_file, false);
1103 if (rc != 0) 979 if (rc != 0)
1104 break; 980 break;
1105 } 981 }
@@ -1119,7 +995,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1119 rc = CIFSSMBWrite2(xid, pTcon, 995 rc = CIFSSMBWrite2(xid, pTcon,
1120 open_file->netfid, len, 996 open_file->netfid, len,
1121 *poffset, &bytes_written, 997 *poffset, &bytes_written,
1122 iov, 1, long_op); 998 iov, 1, 0);
1123 } else 999 } else
1124 rc = CIFSSMBWrite(xid, pTcon, 1000 rc = CIFSSMBWrite(xid, pTcon,
1125 open_file->netfid, 1001 open_file->netfid,
@@ -1127,7 +1003,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1127 write_size - total_written), 1003 write_size - total_written),
1128 *poffset, &bytes_written, 1004 *poffset, &bytes_written,
1129 write_data + total_written, 1005 write_data + total_written,
1130 NULL, long_op); 1006 NULL, 0);
1131 } 1007 }
1132 if (rc || (bytes_written == 0)) { 1008 if (rc || (bytes_written == 0)) {
1133 if (total_written) 1009 if (total_written)
@@ -1140,49 +1016,44 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1140 cifs_update_eof(cifsi, *poffset, bytes_written); 1016 cifs_update_eof(cifsi, *poffset, bytes_written);
1141 *poffset += bytes_written; 1017 *poffset += bytes_written;
1142 } 1018 }
1143 long_op = CIFS_STD_OP; /* subsequent writes fast -
1144 15 seconds is plenty */
1145 } 1019 }
1146 1020
1147 cifs_stats_bytes_written(pTcon, total_written); 1021 cifs_stats_bytes_written(pTcon, total_written);
1148 1022
1149 /* since the write may have blocked check these pointers again */ 1023 if (total_written > 0) {
1150 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { 1024 spin_lock(&dentry->d_inode->i_lock);
1151/*BB We could make this contingent on superblock ATIME flag too */ 1025 if (*poffset > dentry->d_inode->i_size)
1152/* file->f_path.dentry->d_inode->i_ctime = 1026 i_size_write(dentry->d_inode, *poffset);
1153 file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;*/ 1027 spin_unlock(&dentry->d_inode->i_lock);
1154 if (total_written > 0) {
1155 spin_lock(&file->f_path.dentry->d_inode->i_lock);
1156 if (*poffset > file->f_path.dentry->d_inode->i_size)
1157 i_size_write(file->f_path.dentry->d_inode,
1158 *poffset);
1159 spin_unlock(&file->f_path.dentry->d_inode->i_lock);
1160 }
1161 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1162 } 1028 }
1029 mark_inode_dirty_sync(dentry->d_inode);
1163 FreeXid(xid); 1030 FreeXid(xid);
1164 return total_written; 1031 return total_written;
1165} 1032}
1166 1033
1167#ifdef CONFIG_CIFS_EXPERIMENTAL 1034struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1168struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) 1035 bool fsuid_only)
1169{ 1036{
1170 struct cifsFileInfo *open_file = NULL; 1037 struct cifsFileInfo *open_file = NULL;
1038 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1039
1040 /* only filter by fsuid on multiuser mounts */
1041 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1042 fsuid_only = false;
1171 1043
1172 read_lock(&GlobalSMBSeslock); 1044 spin_lock(&cifs_file_list_lock);
1173 /* we could simply get the first_list_entry since write-only entries 1045 /* we could simply get the first_list_entry since write-only entries
1174 are always at the end of the list but since the first entry might 1046 are always at the end of the list but since the first entry might
1175 have a close pending, we go through the whole list */ 1047 have a close pending, we go through the whole list */
1176 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1048 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1177 if (open_file->closePend) 1049 if (fsuid_only && open_file->uid != current_fsuid())
1178 continue; 1050 continue;
1179 if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || 1051 if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
1180 (open_file->pfile->f_flags & O_RDONLY))) {
1181 if (!open_file->invalidHandle) { 1052 if (!open_file->invalidHandle) {
1182 /* found a good file */ 1053 /* found a good file */
1183 /* lock it so it will not be closed on us */ 1054 /* lock it so it will not be closed on us */
1184 cifsFileInfo_get(open_file); 1055 cifsFileInfo_get(open_file);
1185 read_unlock(&GlobalSMBSeslock); 1056 spin_unlock(&cifs_file_list_lock);
1186 return open_file; 1057 return open_file;
1187 } /* else might as well continue, and look for 1058 } /* else might as well continue, and look for
1188 another, or simply have the caller reopen it 1059 another, or simply have the caller reopen it
@@ -1190,14 +1061,15 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1190 } else /* write only file */ 1061 } else /* write only file */
1191 break; /* write only files are last so must be done */ 1062 break; /* write only files are last so must be done */
1192 } 1063 }
1193 read_unlock(&GlobalSMBSeslock); 1064 spin_unlock(&cifs_file_list_lock);
1194 return NULL; 1065 return NULL;
1195} 1066}
1196#endif
1197 1067
1198struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) 1068struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1069 bool fsuid_only)
1199{ 1070{
1200 struct cifsFileInfo *open_file; 1071 struct cifsFileInfo *open_file;
1072 struct cifs_sb_info *cifs_sb;
1201 bool any_available = false; 1073 bool any_available = false;
1202 int rc; 1074 int rc;
1203 1075
@@ -1211,53 +1083,41 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1211 return NULL; 1083 return NULL;
1212 } 1084 }
1213 1085
1214 read_lock(&GlobalSMBSeslock); 1086 cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1087
1088 /* only filter by fsuid on multiuser mounts */
1089 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1090 fsuid_only = false;
1091
1092 spin_lock(&cifs_file_list_lock);
1215refind_writable: 1093refind_writable:
1216 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1094 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1217 if (open_file->closePend || 1095 if (!any_available && open_file->pid != current->tgid)
1218 (!any_available && open_file->pid != current->tgid))
1219 continue; 1096 continue;
1220 1097 if (fsuid_only && open_file->uid != current_fsuid())
1221 if (open_file->pfile && 1098 continue;
1222 ((open_file->pfile->f_flags & O_RDWR) || 1099 if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
1223 (open_file->pfile->f_flags & O_WRONLY))) {
1224 cifsFileInfo_get(open_file); 1100 cifsFileInfo_get(open_file);
1225 1101
1226 if (!open_file->invalidHandle) { 1102 if (!open_file->invalidHandle) {
1227 /* found a good writable file */ 1103 /* found a good writable file */
1228 read_unlock(&GlobalSMBSeslock); 1104 spin_unlock(&cifs_file_list_lock);
1229 return open_file; 1105 return open_file;
1230 } 1106 }
1231 1107
1232 read_unlock(&GlobalSMBSeslock); 1108 spin_unlock(&cifs_file_list_lock);
1109
1233 /* Had to unlock since following call can block */ 1110 /* Had to unlock since following call can block */
1234 rc = cifs_reopen_file(open_file->pfile, false); 1111 rc = cifs_reopen_file(open_file, false);
1235 if (!rc) { 1112 if (!rc)
1236 if (!open_file->closePend) 1113 return open_file;
1237 return open_file;
1238 else { /* start over in case this was deleted */
1239 /* since the list could be modified */
1240 read_lock(&GlobalSMBSeslock);
1241 cifsFileInfo_put(open_file);
1242 goto refind_writable;
1243 }
1244 }
1245 1114
1246 /* if it fails, try another handle if possible - 1115 /* if it fails, try another handle if possible */
1247 (we can not do this if closePending since
1248 loop could be modified - in which case we
1249 have to start at the beginning of the list
1250 again. Note that it would be bad
1251 to hold up writepages here (rather than
1252 in caller) with continuous retries */
1253 cFYI(1, "wp failed on reopen file"); 1116 cFYI(1, "wp failed on reopen file");
1254 read_lock(&GlobalSMBSeslock);
1255 /* can not use this handle, no write
1256 pending on this one after all */
1257 cifsFileInfo_put(open_file); 1117 cifsFileInfo_put(open_file);
1258 1118
1259 if (open_file->closePend) /* list could have changed */ 1119 spin_lock(&cifs_file_list_lock);
1260 goto refind_writable; 1120
1261 /* else we simply continue to the next entry. Thus 1121 /* else we simply continue to the next entry. Thus
1262 we do not loop on reopen errors. If we 1122 we do not loop on reopen errors. If we
1263 can not reopen the file, for example if we 1123 can not reopen the file, for example if we
@@ -1272,7 +1132,7 @@ refind_writable:
1272 any_available = true; 1132 any_available = true;
1273 goto refind_writable; 1133 goto refind_writable;
1274 } 1134 }
1275 read_unlock(&GlobalSMBSeslock); 1135 spin_unlock(&cifs_file_list_lock);
1276 return NULL; 1136 return NULL;
1277} 1137}
1278 1138
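
Both finders now take an fsuid_only argument that is honored only on CIFS_MOUNT_MULTIUSER mounts, since handles opened under another user's credentials must not be borrowed there. A toy model of the selection loop (uids and the flag value are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define MNT_MULTIUSER 0x1   /* illustrative flag bit */

struct handle { unsigned uid; bool writable; };

static const struct handle *pick(const struct handle *h, int n,
                                 unsigned mnt_flags, bool fsuid_only,
                                 unsigned fsuid)
{
        int i;

        if (!(mnt_flags & MNT_MULTIUSER))
                fsuid_only = false;   /* shared credentials: no filter */

        for (i = 0; i < n; i++) {
                if (fsuid_only && h[i].uid != fsuid)
                        continue;
                if (h[i].writable)
                        return &h[i];
        }
        return NULL;
}

int main(void)
{
        struct handle hs[] = { { 1000, true }, { 1001, true } };
        const struct handle *h = pick(hs, 2, MNT_MULTIUSER, true, 1001);

        printf("picked uid=%u\n", h ? h->uid : 0);
        return 0;
}
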
@@ -1283,8 +1143,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1283 char *write_data; 1143 char *write_data;
1284 int rc = -EFAULT; 1144 int rc = -EFAULT;
1285 int bytes_written = 0; 1145 int bytes_written = 0;
1286 struct cifs_sb_info *cifs_sb;
1287 struct cifsTconInfo *pTcon;
1288 struct inode *inode; 1146 struct inode *inode;
1289 struct cifsFileInfo *open_file; 1147 struct cifsFileInfo *open_file;
1290 1148
@@ -1292,8 +1150,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1292 return -EFAULT; 1150 return -EFAULT;
1293 1151
1294 inode = page->mapping->host; 1152 inode = page->mapping->host;
1295 cifs_sb = CIFS_SB(inode->i_sb);
1296 pTcon = cifs_sb->tcon;
1297 1153
1298 offset += (loff_t)from; 1154 offset += (loff_t)from;
1299 write_data = kmap(page); 1155 write_data = kmap(page);
@@ -1314,10 +1170,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1314 if (mapping->host->i_size - offset < (loff_t)to) 1170 if (mapping->host->i_size - offset < (loff_t)to)
1315 to = (unsigned)(mapping->host->i_size - offset); 1171 to = (unsigned)(mapping->host->i_size - offset);
1316 1172
1317 open_file = find_writable_file(CIFS_I(mapping->host)); 1173 open_file = find_writable_file(CIFS_I(mapping->host), false);
1318 if (open_file) { 1174 if (open_file) {
1319 bytes_written = cifs_write(open_file->pfile, write_data, 1175 bytes_written = cifs_write(open_file, write_data,
1320 to-from, &offset); 1176 to - from, &offset);
1321 cifsFileInfo_put(open_file); 1177 cifsFileInfo_put(open_file);
1322 /* Does mm or vfs already set times? */ 1178 /* Does mm or vfs already set times? */
1323 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); 1179 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1337,7 +1193,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1337static int cifs_writepages(struct address_space *mapping, 1193static int cifs_writepages(struct address_space *mapping,
1338 struct writeback_control *wbc) 1194 struct writeback_control *wbc)
1339{ 1195{
1340 struct backing_dev_info *bdi = mapping->backing_dev_info;
1341 unsigned int bytes_to_write; 1196 unsigned int bytes_to_write;
1342 unsigned int bytes_written; 1197 unsigned int bytes_written;
1343 struct cifs_sb_info *cifs_sb; 1198 struct cifs_sb_info *cifs_sb;
@@ -1352,12 +1207,13 @@ static int cifs_writepages(struct address_space *mapping,
1352 int nr_pages; 1207 int nr_pages;
1353 __u64 offset = 0; 1208 __u64 offset = 0;
1354 struct cifsFileInfo *open_file; 1209 struct cifsFileInfo *open_file;
1210 struct cifsTconInfo *tcon;
1355 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host); 1211 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
1356 struct page *page; 1212 struct page *page;
1357 struct pagevec pvec; 1213 struct pagevec pvec;
1358 int rc = 0; 1214 int rc = 0;
1359 int scanned = 0; 1215 int scanned = 0;
1360 int xid, long_op; 1216 int xid;
1361 1217
1362 cifs_sb = CIFS_SB(mapping->host->i_sb); 1218 cifs_sb = CIFS_SB(mapping->host->i_sb);
1363 1219
@@ -1368,27 +1224,30 @@ static int cifs_writepages(struct address_space *mapping,
1368 if (cifs_sb->wsize < PAGE_CACHE_SIZE) 1224 if (cifs_sb->wsize < PAGE_CACHE_SIZE)
1369 return generic_writepages(mapping, wbc); 1225 return generic_writepages(mapping, wbc);
1370 1226
1371 if ((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
1372 if (cifs_sb->tcon->ses->server->secMode &
1373 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
1374 if (!experimEnabled)
1375 return generic_writepages(mapping, wbc);
1376
1377 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL); 1227 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
1378 if (iov == NULL) 1228 if (iov == NULL)
1379 return generic_writepages(mapping, wbc); 1229 return generic_writepages(mapping, wbc);
1380 1230
1381
1382 /* 1231 /*
1383 * BB: Is this meaningful for a non-block-device file system? 1232 * if there's no open file, then this is likely to fail too,
1384 * If it is, we should test it again after we do I/O 1233 * but it'll at least handle the return. Maybe it should be
1234 * a BUG() instead?
1385 */ 1235 */
1386 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1236 open_file = find_writable_file(CIFS_I(mapping->host), false);
1387 wbc->encountered_congestion = 1; 1237 if (!open_file) {
1388 kfree(iov); 1238 kfree(iov);
1389 return 0; 1239 return generic_writepages(mapping, wbc);
1390 } 1240 }
1391 1241
1242 tcon = tlink_tcon(open_file->tlink);
1243 if (!experimEnabled && tcon->ses->server->secMode &
1244 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1245 cifsFileInfo_put(open_file);
1246 kfree(iov);
1247 return generic_writepages(mapping, wbc);
1248 }
1249 cifsFileInfo_put(open_file);
1250
1392 xid = GetXid(); 1251 xid = GetXid();
1393 1252
1394 pagevec_init(&pvec, 0); 1253 pagevec_init(&pvec, 0);
@@ -1492,52 +1351,67 @@ retry:
1492 break; 1351 break;
1493 } 1352 }
1494 if (n_iov) { 1353 if (n_iov) {
1495 /* Search for a writable handle every time we call 1354retry_write:
1496 * CIFSSMBWrite2. We can't rely on the last handle 1355 open_file = find_writable_file(CIFS_I(mapping->host),
1497 * we used to still be valid 1356 false);
1498 */
1499 open_file = find_writable_file(CIFS_I(mapping->host));
1500 if (!open_file) { 1357 if (!open_file) {
1501 cERROR(1, "No writable handles for inode"); 1358 cERROR(1, "No writable handles for inode");
1502 rc = -EBADF; 1359 rc = -EBADF;
1503 } else { 1360 } else {
1504 long_op = cifs_write_timeout(cifsi, offset); 1361 rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
1505 rc = CIFSSMBWrite2(xid, cifs_sb->tcon,
1506 open_file->netfid,
1507 bytes_to_write, offset, 1362 bytes_to_write, offset,
1508 &bytes_written, iov, n_iov, 1363 &bytes_written, iov, n_iov,
1509 long_op); 1364 0);
1510 cifsFileInfo_put(open_file); 1365 cifsFileInfo_put(open_file);
1511 cifs_update_eof(cifsi, offset, bytes_written); 1366 }
1512 1367
1513 if (rc || bytes_written < bytes_to_write) { 1368 cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
1514 cERROR(1, "Write2 ret %d, wrote %d", 1369
1515 rc, bytes_written); 1370 /*
1516 /* BB what if continued retry is 1371 * For now, treat a short write as if nothing got
1517 requested via mount flags? */ 1372 * written. A zero length write however indicates
1518 if (rc == -ENOSPC) 1373 * ENOSPC or EFBIG. We have no way to know which
1519 set_bit(AS_ENOSPC, &mapping->flags); 1374 * though, so call it ENOSPC for now. EFBIG would
1520 else 1375 * get translated to AS_EIO anyway.
1521 set_bit(AS_EIO, &mapping->flags); 1376 *
1522 } else { 1377 * FIXME: make it take into account the data that did
1523 cifs_stats_bytes_written(cifs_sb->tcon, 1378 * get written
1524 bytes_written); 1379 */
1525 } 1380 if (rc == 0) {
1381 if (bytes_written == 0)
1382 rc = -ENOSPC;
1383 else if (bytes_written < bytes_to_write)
1384 rc = -EAGAIN;
1526 } 1385 }
1386
1387 /* retry on data-integrity flush */
1388 if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
1389 goto retry_write;
1390
1391 /* fix the stats and EOF */
1392 if (bytes_written > 0) {
1393 cifs_stats_bytes_written(tcon, bytes_written);
1394 cifs_update_eof(cifsi, offset, bytes_written);
1395 }
1396
1527 for (i = 0; i < n_iov; i++) { 1397 for (i = 0; i < n_iov; i++) {
1528 page = pvec.pages[first + i]; 1398 page = pvec.pages[first + i];
1529 /* Should we also set page error on 1399 /* on retryable write error, redirty page */
1530 success rc but too little data written? */ 1400 if (rc == -EAGAIN)
1531 /* BB investigate retry logic on temporary 1401 redirty_page_for_writepage(wbc, page);
1532 server crash cases and how recovery works 1402 else if (rc != 0)
1533 when page marked as error */
1534 if (rc)
1535 SetPageError(page); 1403 SetPageError(page);
1536 kunmap(page); 1404 kunmap(page);
1537 unlock_page(page); 1405 unlock_page(page);
1538 end_page_writeback(page); 1406 end_page_writeback(page);
1539 page_cache_release(page); 1407 page_cache_release(page);
1540 } 1408 }
1409
1410 if (rc != -EAGAIN)
1411 mapping_set_error(mapping, rc);
1412 else
1413 rc = 0;
1414
1541 if ((wbc->nr_to_write -= n_iov) <= 0) 1415 if ((wbc->nr_to_write -= n_iov) <= 0)
1542 done = 1; 1416 done = 1;
1543 index = next; 1417 index = next;
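The hunk above replaces the old AS_EIO/AS_ENOSPC flag twiddling with a single rc mapping plus a retry_write label. A minimal userspace sketch of that mapping, under the assumption stated in the new comment that a zero-length "successful" write can only mean ENOSPC or EFBIG; the function name and the test values below are illustrative, not kernel code:

#include <errno.h>
#include <stdio.h>

/* Hypothetical model of the rc mapping in cifs_writepages above: a server
 * write that "succeeds" but stores nothing is reported as -ENOSPC, and a
 * short write becomes -EAGAIN so a data-integrity flush can retry it. */
static int map_write_result(int rc, unsigned int written, unsigned int requested)
{
	if (rc != 0)
		return rc;		/* transport/server error wins */
	if (written == 0)
		return -ENOSPC;		/* could also be EFBIG; not distinguishable */
	if (written < requested)
		return -EAGAIN;		/* short write: retry or redirty pages */
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       map_write_result(0, 0, 4096),	/* -ENOSPC */
	       map_write_result(0, 2048, 4096),	/* -EAGAIN */
	       map_write_result(0, 4096, 4096));	/* 0 */
	return 0;
}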
@@ -1624,7 +1498,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 		/* BB check if anything else missing out of ppw
 		   such as updating last write time */
 		page_data = kmap(page);
-		rc = cifs_write(file, page_data + offset, copied, &pos);
+		rc = cifs_write(file->private_data, page_data + offset,
+				copied, &pos);
 		/* if (rc < 0) should we set writebehind rc? */
 		kunmap(page);
 
@@ -1648,28 +1523,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	return rc;
 }
 
-int cifs_fsync(struct file *file, int datasync)
+int cifs_strict_fsync(struct file *file, int datasync)
 {
 	int xid;
 	int rc = 0;
 	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *smbfile = file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
 	xid = GetXid();
 
 	cFYI(1, "Sync file - name: %s datasync: 0x%x",
 		file->f_path.dentry->d_name.name, datasync);
 
-	rc = filemap_write_and_wait(inode->i_mapping);
-	if (rc == 0) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
-		tcon = CIFS_SB(inode->i_sb)->tcon;
-		if (!rc && tcon && smbfile &&
-		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
-			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
-	}
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
+
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
+
+	FreeXid(xid);
+	return rc;
+}
+
+int cifs_fsync(struct file *file, int datasync)
+{
+	int xid;
+	int rc = 0;
+	struct cifsTconInfo *tcon;
+	struct cifsFileInfo *smbfile = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+
+	xid = GetXid();
+
+	cFYI(1, "Sync file - name: %s datasync: 0x%x",
+		file->f_path.dentry->d_name.name, datasync);
+
+	tcon = tlink_tcon(smbfile->tlink);
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+		rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 
 	FreeXid(xid);
 	return rc;
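cifs_fsync is split in two here: the strict variant drops cached pages before flushing whenever the client holds no read oplock, so the flush cannot be satisfied from stale pagecache. A tiny hedged model of that ordering; all names below are hypothetical:

#include <stdbool.h>
#include <stdio.h>

struct state { bool can_cache_read; bool dirty_cache; };

/* Models cifs_strict_fsync(): invalidate first if no read oplock, then flush. */
static int strict_fsync(struct state *s)
{
	if (!s->can_cache_read)
		s->dirty_cache = false;	/* models cifs_invalidate_mapping() */
	return 0;			/* models CIFSSMBFlush() succeeding */
}

int main(void)
{
	struct state s = { .can_cache_read = false, .dirty_cache = true };
	strict_fsync(&s);
	printf("cache kept: %d\n", s.dirty_cache);	/* prints 0 */
	return 0;
}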
@@ -1712,92 +1606,278 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int rc = 0;
 
-	/* Rather than do the steps manually:
-	   lock the inode for writing
-	   loop through pages looking for write behind data (dirty pages)
-	   coalesce into contiguous 16K (or smaller) chunks to write to server
-	   send to server (prefer in parallel)
-	   deal with writebehind errors
-	   unlock inode for writing
-	   filemapfdatawrite appears easier for the time being */
-
-	rc = filemap_fdatawrite(inode->i_mapping);
-	/* reset wb rc if we were able to write out dirty pages */
-	if (!rc) {
-		rc = CIFS_I(inode)->write_behind_rc;
-		CIFS_I(inode)->write_behind_rc = 0;
-	}
+	if (file->f_mode & FMODE_WRITE)
+		rc = filemap_write_and_wait(inode->i_mapping);
 
 	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
 
 	return rc;
 }
 
-ssize_t cifs_user_read(struct file *file, char __user *read_data,
-		       size_t read_size, loff_t *poffset)
+static int
+cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
 {
-	int rc = -EACCES;
+	int rc = 0;
+	unsigned long i;
+
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = alloc_page(__GFP_HIGHMEM);
+		if (!pages[i]) {
+			/*
+			 * save number of pages we have already allocated and
+			 * return with ENOMEM error
+			 */
+			num_pages = i;
+			rc = -ENOMEM;
+			goto error;
+		}
+	}
+
+	return rc;
+
+error:
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	return rc;
+}
+
+static inline
+size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
+{
+	size_t num_pages;
+	size_t clen;
+
+	clen = min_t(const size_t, len, wsize);
+	num_pages = clen / PAGE_CACHE_SIZE;
+	if (clen % PAGE_CACHE_SIZE)
+		num_pages++;
+
+	if (cur_len)
+		*cur_len = clen;
+
+	return num_pages;
+}
+
+static ssize_t
+cifs_iovec_write(struct file *file, const struct iovec *iov,
+		 unsigned long nr_segs, loff_t *poffset)
+{
+	unsigned int written;
+	unsigned long num_pages, npages, i;
+	size_t copied, len, cur_len;
+	ssize_t total_written = 0;
+	struct kvec *to_send;
+	struct page **pages;
+	struct iov_iter it;
+	struct inode *inode;
+	struct cifsFileInfo *open_file;
+	struct cifsTconInfo *pTcon;
+	struct cifs_sb_info *cifs_sb;
+	int xid, rc;
+
+	len = iov_length(iov, nr_segs);
+	if (!len)
+		return 0;
+
+	rc = generic_write_checks(file, poffset, &len, 0);
+	if (rc)
+		return rc;
+
+	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+	num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+
+	pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
+	if (!to_send) {
+		kfree(pages);
+		return -ENOMEM;
+	}
+
+	rc = cifs_write_allocate_pages(pages, num_pages);
+	if (rc) {
+		kfree(pages);
+		kfree(to_send);
+		return rc;
+	}
+
+	xid = GetXid();
+	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
+	inode = file->f_path.dentry->d_inode;
+
+	iov_iter_init(&it, iov, nr_segs, len, 0);
+	npages = num_pages;
+
+	do {
+		size_t save_len = cur_len;
+		for (i = 0; i < npages; i++) {
+			copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
+			copied = iov_iter_copy_from_user(pages[i], &it, 0,
+							 copied);
+			cur_len -= copied;
+			iov_iter_advance(&it, copied);
+			to_send[i+1].iov_base = kmap(pages[i]);
+			to_send[i+1].iov_len = copied;
+		}
+
+		cur_len = save_len - cur_len;
+
+		do {
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, false);
+				if (rc != 0)
+					break;
+			}
+			rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
+					   cur_len, *poffset, &written,
+					   to_send, npages, 0);
+		} while (rc == -EAGAIN);
+
+		for (i = 0; i < npages; i++)
+			kunmap(pages[i]);
+
+		if (written) {
+			len -= written;
+			total_written += written;
+			cifs_update_eof(CIFS_I(inode), *poffset, written);
+			*poffset += written;
+		} else if (rc < 0) {
+			if (!total_written)
+				total_written = rc;
+			break;
+		}
+
+		/* get length and number of kvecs of the next write */
+		npages = get_numpages(cifs_sb->wsize, len, &cur_len);
+	} while (len > 0);
+
+	if (total_written > 0) {
+		spin_lock(&inode->i_lock);
+		if (*poffset > inode->i_size)
+			i_size_write(inode, *poffset);
+		spin_unlock(&inode->i_lock);
+	}
+
+	cifs_stats_bytes_written(pTcon, total_written);
+	mark_inode_dirty_sync(inode);
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	kfree(to_send);
+	kfree(pages);
+	FreeXid(xid);
+	return total_written;
+}
+
+static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
+{
+	ssize_t written;
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	/*
+	 * BB - optimize the way when signing is disabled. We can drop this
+	 * extra memory-to-memory copying and use iovec buffers for constructing
+	 * write request.
+	 */
+
+	written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
+	if (written > 0) {
+		CIFS_I(inode)->invalid_mapping = true;
+		iocb->ki_pos = pos;
+	}
+
+	return written;
+}
+
+ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
+			   unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	if (CIFS_I(inode)->clientCanCacheAll)
+		return generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+	/*
+	 * In strict cache mode we need to write the data to the server exactly
+	 * from the pos to pos+len-1 rather than flush all affected pages
+	 * because it may cause a error with mandatory locks on these pages but
+	 * not on the region from pos to ppos+len-1.
+	 */
+
+	return cifs_user_writev(iocb, iov, nr_segs, pos);
+}
+
+static ssize_t
+cifs_iovec_read(struct file *file, const struct iovec *iov,
+		unsigned long nr_segs, loff_t *poffset)
+{
+	int rc;
+	int xid;
+	ssize_t total_read;
 	unsigned int bytes_read = 0;
-	unsigned int total_read = 0;
-	unsigned int current_read_size;
+	size_t len, cur_len;
+	int iov_offset = 0;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
-	int xid;
 	struct cifsFileInfo *open_file;
-	char *smb_read_data;
-	char __user *current_offset;
 	struct smb_com_read_rsp *pSMBr;
+	char *read_data;
+
+	if (!nr_segs)
+		return 0;
+
+	len = iov_length(iov, nr_segs);
+	if (!len)
+		return 0;
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
-	if (file->private_data == NULL) {
-		rc = -EBADF;
-		FreeXid(xid);
-		return rc;
-	}
 	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
 
-	for (total_read = 0, current_offset = read_data;
-	     read_size > total_read;
-	     total_read += bytes_read, current_offset += bytes_read) {
-		current_read_size = min_t(const int, read_size - total_read,
-					  cifs_sb->rsize);
+	for (total_read = 0; total_read < len; total_read += bytes_read) {
+		cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
 		rc = -EAGAIN;
-		smb_read_data = NULL;
+		read_data = NULL;
+
 		while (rc == -EAGAIN) {
 			int buf_type = CIFS_NO_BUFFER;
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
-			rc = CIFSSMBRead(xid, pTcon,
-					 open_file->netfid,
-					 current_read_size, *poffset,
-					 &bytes_read, &smb_read_data,
-					 &buf_type);
-			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
-			if (smb_read_data) {
-				if (copy_to_user(current_offset,
-						 smb_read_data +
-						 4 /* RFC1001 length field */ +
-						 le16_to_cpu(pSMBr->DataOffset),
-						 bytes_read))
+			rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
+					 cur_len, *poffset, &bytes_read,
+					 &read_data, &buf_type);
+			pSMBr = (struct smb_com_read_rsp *)read_data;
+			if (read_data) {
+				char *data_offset = read_data + 4 +
+						le16_to_cpu(pSMBr->DataOffset);
+				if (memcpy_toiovecend(iov, data_offset,
+						      iov_offset, bytes_read))
 					rc = -EFAULT;
-
 				if (buf_type == CIFS_SMALL_BUFFER)
-					cifs_small_buf_release(smb_read_data);
+					cifs_small_buf_release(read_data);
 				else if (buf_type == CIFS_LARGE_BUFFER)
-					cifs_buf_release(smb_read_data);
-				smb_read_data = NULL;
+					cifs_buf_release(read_data);
+				read_data = NULL;
+				iov_offset += bytes_read;
 			}
 		}
+
 		if (rc || (bytes_read == 0)) {
 			if (total_read) {
 				break;
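The new uncached write path above chunks each request by the negotiated wsize and rounds up to whole pages via the added get_numpages() helper. A standalone C sketch of the same arithmetic; PAGE_SIZE is fixed at 4096 here purely for illustration:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Mirrors get_numpages(): clamp the request to the negotiated wsize, then
 * round the resulting chunk up to whole pages. */
static unsigned long numpages(unsigned long wsize, unsigned long len,
			      unsigned long *cur_len)
{
	unsigned long clen = len < wsize ? len : wsize;
	unsigned long n = clen / PAGE_SIZE + (clen % PAGE_SIZE ? 1 : 0);

	if (cur_len)
		*cur_len = clen;
	return n;
}

int main(void)
{
	unsigned long cur;
	/* 57344-byte wsize, 200000-byte write: 14 pages, 57344 bytes/chunk */
	printf("%lu pages, %lu bytes\n", numpages(57344, 200000, &cur), cur);
	return 0;
}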
@@ -1810,13 +1890,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 			*poffset += bytes_read;
 		}
 	}
+
 	FreeXid(xid);
 	return total_read;
 }
 
+ssize_t cifs_user_read(struct file *file, char __user *read_data,
+		       size_t read_size, loff_t *poffset)
+{
+	struct iovec iov;
+	iov.iov_base = read_data;
+	iov.iov_len = read_size;
+
+	return cifs_iovec_read(file, &iov, 1, poffset);
+}
+
+static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
+{
+	ssize_t read;
+
+	read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
+	if (read > 0)
+		iocb->ki_pos = pos;
+
+	return read;
+}
+
+ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode;
+
+	inode = iocb->ki_filp->f_path.dentry->d_inode;
+
+	if (CIFS_I(inode)->clientCanCacheRead)
+		return generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+	/*
+	 * In strict cache mode we need to read from the server all the time
+	 * if we don't have level II oplock because the server can delay mtime
+	 * change - so we can't make a decision about inode invalidating.
+	 * And we can also fail with pagereading if there are mandatory locks
+	 * on pages affected by this read but not on the region from pos to
+	 * pos+len-1.
+	 */
+
+	return cifs_user_readv(iocb, iov, nr_segs, pos);
+}
 
 static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 			 loff_t *poffset)
 {
 	int rc = -EACCES;
 	unsigned int bytes_read = 0;
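cifs_strict_readv above only trusts the pagecache while a level II (read) oplock is held; otherwise every read goes to the server, since the server may delay mtime changes and cache validation is unsafe. A small sketch of that dispatch decision; the names are illustrative, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

enum path { CACHED_READ, UNCACHED_READ };

/* Models the branch in cifs_strict_readv(): cached read only with a read
 * oplock, otherwise fall through to the uncached iovec read. */
static enum path choose_read_path(bool client_can_cache_read)
{
	return client_can_cache_read ? CACHED_READ : UNCACHED_READ;
}

int main(void)
{
	printf("%s\n", choose_read_path(false) == UNCACHED_READ
		       ? "read goes to the server" : "read from pagecache");
	return 0;
}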
@@ -1831,7 +1955,6 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 	xid = GetXid();
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	if (file->private_data == NULL) {
 		rc = -EBADF;
@@ -1839,6 +1962,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		return rc;
 	}
 	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
@@ -1857,9 +1981,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		}
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -1885,6 +2008,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	return total_read;
 }
 
+int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int rc, xid;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	xid = GetXid();
+
+	if (!CIFS_I(inode)->clientCanCacheRead)
+		cifs_invalidate_mapping(inode);
+
+	rc = generic_file_mmap(file, vma);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int rc, xid;
@@ -1974,7 +2112,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	}
 	open_file = file->private_data;
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
+	pTcon = tlink_tcon(open_file->tlink);
 
 	/*
 	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2022,9 +2160,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			read_size, contig_pages);
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
-			if ((open_file->invalidHandle) &&
-			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file, true);
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, true);
 				if (rc != 0)
 					break;
 			}
@@ -2173,18 +2310,14 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
 {
 	struct cifsFileInfo *open_file;
 
-	read_lock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
-			continue;
-		if (open_file->pfile &&
-		    ((open_file->pfile->f_flags & O_RDWR) ||
-		     (open_file->pfile->f_flags & O_WRONLY))) {
-			read_unlock(&GlobalSMBSeslock);
+		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
+			spin_unlock(&cifs_file_list_lock);
 			return 1;
 		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	spin_unlock(&cifs_file_list_lock);
 	return 0;
 }
 
@@ -2310,10 +2443,9 @@ void cifs_oplock_break(struct work_struct *work)
 {
 	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
 						  oplock_break);
-	struct inode *inode = cfile->pInode;
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
-	int rc, waitrc = 0;
+	int rc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
 		if (cinode->clientCanCacheRead)
@@ -2322,13 +2454,10 @@ void cifs_oplock_break(struct work_struct *work)
 			break_lease(inode, O_WRONLY);
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
-			waitrc = filemap_fdatawait(inode->i_mapping);
+			rc = filemap_fdatawait(inode->i_mapping);
+			mapping_set_error(inode->i_mapping, rc);
 			invalidate_remote_inode(inode);
 		}
-		if (!rc)
-			rc = waitrc;
-		if (rc)
-			cinode->write_behind_rc = rc;
 		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
 	}
 
@@ -2338,33 +2467,37 @@ void cifs_oplock_break(struct work_struct *work)
 	 * not bother sending an oplock release if session to server still is
 	 * disconnected since oplock already released by the server
	 */
-	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
-		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
-				 LOCKING_ANDX_OPLOCK_RELEASE, false);
+	if (!cfile->oplock_break_cancelled) {
+		rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
+				 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
+				 cinode->clientCanCacheRead ? 1 : 0);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
 
 	/*
 	 * We might have kicked in before is_valid_oplock_break()
 	 * finished grabbing reference for us.  Make sure it's done by
-	 * waiting for GlobalSMSSeslock.
+	 * waiting for cifs_file_list_lock.
	 */
-	write_lock(&GlobalSMBSeslock);
-	write_unlock(&GlobalSMBSeslock);
+	spin_lock(&cifs_file_list_lock);
+	spin_unlock(&cifs_file_list_lock);
 
 	cifs_oplock_break_put(cfile);
 }
 
+/* must be called while holding cifs_file_list_lock */
 void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 {
-	mntget(cfile->mnt);
+	cifs_sb_active(cfile->dentry->d_sb);
 	cifsFileInfo_get(cfile);
 }
 
 void cifs_oplock_break_put(struct cifsFileInfo *cfile)
 {
-	mntput(cfile->mnt);
+	struct super_block *sb = cfile->dentry->d_sb;
+
 	cifsFileInfo_put(cfile);
+	cifs_sb_deactive(sb);
 }
 
 const struct address_space_operations cifs_addr_ops = {
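The oplock-break get/put pair above now pins the superblock instead of the vfsmount. A userspace model of the balanced acquire/release discipline the two helpers enforce; all names are illustrative:

#include <stdio.h>

struct obj { int refs; };

static void get(struct obj *o) { o->refs++; }
static void put(struct obj *o) { o->refs--; }

/* Models cifs_oplock_break_get(): pin sb, then the open file. */
static void oplock_break_get(struct obj *sb, struct obj *file)
{
	get(sb);	/* models cifs_sb_active() */
	get(file);	/* models cifsFileInfo_get() */
}

/* Models cifs_oplock_break_put(): drop the file ref, then the sb ref. */
static void oplock_break_put(struct obj *sb, struct obj *file)
{
	put(file);	/* models cifsFileInfo_put() */
	put(sb);	/* models cifs_sb_deactive() */
}

int main(void)
{
	struct obj sb = { 1 }, file = { 1 };
	oplock_break_get(&sb, &file);
	oplock_break_put(&sb, &file);
	printf("%d %d\n", sb.refs, file.refs);	/* both back to 1 */
	return 0;
}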
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 9f3f5c4be161..297a43d0ff7f 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -2,7 +2,7 @@
  *   fs/cifs/fscache.c - CIFS filesystem cache interface
  *
  *   Copyright (c) 2010 Novell, Inc.
- *   Author(s): Suresh Jayaraman (sjayaraman@suse.de>
+ *   Author(s): Suresh Jayaraman <sjayaraman@suse.de>
  *
  *   This library is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU Lesser General Public License as published
@@ -62,15 +62,17 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (cifsi->fscache)
 		return;
 
-	cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
-				&cifs_fscache_inode_object_def,
-				cifsi);
-	cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)",
-			cifs_sb->tcon->fscache, cifsi->fscache);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
+		cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
+				&cifs_fscache_inode_object_def, cifsi);
+		cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
+				cifsi->fscache);
+	}
 }
 
 void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
 {
 	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
 		cifs_fscache_disable_inode_cookie(inode);
-	else {
+	else
 		cifs_fscache_enable_inode_cookie(inode);
-		cFYI(1, "CIFS: fscache inode cookie set");
-	}
 }
 
 void cifs_fscache_reset_inode_cookie(struct inode *inode)
@@ -117,7 +117,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
 	/* retire the current fscache cache and get a new one */
 	fscache_relinquish_cookie(cifsi->fscache, 1);
 
-	cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
+	cifsi->fscache = fscache_acquire_cookie(
+				cifs_sb_master_tcon(cifs_sb)->fscache,
 				&cifs_fscache_inode_object_def,
 				cifsi);
 	cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
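The fscache hunk gates inode cookie acquisition on the mount flag rather than acquiring one unconditionally. A trivial model of the new condition; the names below are hypothetical:

#include <stdbool.h>
#include <stdio.h>

/* Models cifs_fscache_enable_inode_cookie(): no-op if a cookie exists,
 * acquire one only when the mount asked for fscache. */
static bool acquire_cookie(bool mounted_with_fscache, bool already_has_cookie)
{
	if (already_has_cookie)
		return true;
	return mounted_with_fscache;	/* models the CIFS_MOUNT_FSCACHE test */
}

int main(void)
{
	printf("cookie acquired: %d\n", acquire_cookie(false, false)); /* 0 */
	return 0;
}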
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 53cce8cc2224..8852470b4fbb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,7 +32,7 @@
 #include "fscache.h"
 
 
-static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
+static void cifs_set_ops(struct inode *inode)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
@@ -44,15 +44,19 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 			inode->i_fop = &cifs_file_direct_nobrl_ops;
 		else
 			inode->i_fop = &cifs_file_direct_ops;
+	} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			inode->i_fop = &cifs_file_strict_nobrl_ops;
+		else
+			inode->i_fop = &cifs_file_strict_ops;
 	} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 		inode->i_fop = &cifs_file_nobrl_ops;
 	else { /* not direct, send byte range locks */
 		inode->i_fop = &cifs_file_ops;
 	}
 
-
 	/* check if server can support readpages */
-	if (cifs_sb->tcon->ses->server->maxBuf <
+	if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 		inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 	else
@@ -60,7 +64,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 		break;
 	case S_IFDIR:
 #ifdef CONFIG_CIFS_DFS_UPCALL
-		if (is_dfs_referral) {
+		if (IS_AUTOMOUNT(inode)) {
 			inode->i_op = &cifs_dfs_referral_inode_operations;
 		} else {
 #else /* NO DFS support, treat as a directory */
@@ -167,7 +171,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	}
 	spin_unlock(&inode->i_lock);
 
-	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+	if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
+		inode->i_flags |= S_AUTOMOUNT;
+	cifs_set_ops(inode);
 }
 
 void
@@ -288,8 +294,8 @@ int cifs_get_file_info_unix(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
 	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -313,15 +319,21 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	FILE_UNIX_BASIC_INFO find_data;
 	struct cifs_fattr fattr;
 	struct cifsTconInfo *tcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	tcon = cifs_sb->tcon;
 	cFYI(1, "Getting info on %s", full_path);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
 				  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
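From here on, the inode.c hunks all follow one pattern: look up a tcon link per operation with cifs_sb_tlink(), check it IS_ERR-style, and release it with cifs_put_tlink() on every exit path. A self-contained userspace sketch of that lifecycle, using an ERR_PTR-like encoding; everything below is illustrative, not the kernel API:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct tlink { int users; };

/* Models cifs_sb_tlink(): on failure return an encoded errno as a pointer. */
static struct tlink *sb_tlink(int fail)
{
	static struct tlink link;
	if (fail)
		return (struct tlink *)(intptr_t)-EACCES; /* models ERR_PTR() */
	link.users++;
	return &link;
}

static int is_err(const struct tlink *t)
{
	return (uintptr_t)t >= (uintptr_t)-4095;	/* models IS_ERR() */
}

static void put_tlink(struct tlink *t) { t->users--; }

/* Models the per-operation pattern in cifs_get_inode_info_unix() and
 * friends: get link, bail early on error, always put on the way out. */
static int do_path_op(int fail_lookup)
{
	struct tlink *tlink = sb_tlink(fail_lookup);
	int rc = 0;

	if (is_err(tlink))
		return (int)(intptr_t)tlink;	/* models PTR_ERR() */
	/* ... issue the SMB call against the link's tcon here ... */
	put_tlink(tlink);
	return rc;
}

int main(void)
{
	printf("%d %d\n", do_path_op(0), do_path_op(1));	/* 0 -13 */
	return 0;
}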
@@ -332,6 +344,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 		return rc;
 	}
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (*pinode == NULL) {
 		/* get new inode */
 		cifs_fill_uniqueid(sb, &fattr);
@@ -353,7 +372,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	int rc;
 	int oplock = 0;
 	__u16 netfid;
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	char buf[24];
 	unsigned int bytes_read;
 	char *pbuf;
@@ -372,7 +392,12 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 		return -EINVAL;	 /* EOPNOTSUPP? */
 	}
 
-	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
			 cifs_sb->local_nls,
			 cifs_sb->mnt_cifs_flags &
@@ -380,7 +405,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 	if (rc == 0) {
 		int buf_type = CIFS_NO_BUFFER;
 		/* Read header */
-		rc = CIFSSMBRead(xid, pTcon, netfid,
+		rc = CIFSSMBRead(xid, tcon, netfid,
 				 24 /* length */, 0 /* offset */,
 				 &bytes_read, &pbuf, &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
@@ -422,8 +447,9 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 			fattr->cf_dtype = DT_REG;
 			rc = -EOPNOTSUPP; /* or some unknown SFU type */
 		}
-		CIFSSMBClose(xid, pTcon, netfid);
+		CIFSSMBClose(xid, tcon, netfid);
 	}
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -441,11 +467,19 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	ssize_t rc;
 	char ea_value[4];
 	__u32 mode;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 
-	rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
+	rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
			    cifs_sb->mnt_cifs_flags &
				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 	if (rc < 0)
 		return (int)rc;
 	else if (rc > 3) {
@@ -468,6 +502,8 @@ static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
		       struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
 	if (info->DeletePending)
@@ -482,12 +518,13 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
 
 	if (adjust_tz) {
-		fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
-		fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
+		fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
+		fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
 	}
 
 	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
@@ -515,8 +552,8 @@ int cifs_get_file_info(struct file *filp)
 	struct cifs_fattr fattr;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
 	struct cifsFileInfo *cfile = filp->private_data;
+	struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
 
 	xid = GetXid();
 	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -554,26 +591,33 @@ int cifs_get_inode_info(struct inode **pinode,
 {
 	int rc = 0, tmprc;
 	struct cifsTconInfo *pTcon;
+	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	char *buf = NULL;
 	bool adjustTZ = false;
 	struct cifs_fattr fattr;
 
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
 	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
 		if (CIFS_I(*pinode)->clientCanCacheRead) {
 			cFYI(1, "No need to revalidate cached inode sizes");
-			return rc;
+			goto cgii_exit;
 		}
 	}
 
 	/* if file info not passed in then get it from server */
 	if (pfindData == NULL) {
 		buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-		if (buf == NULL)
-			return -ENOMEM;
+		if (buf == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 		pfindData = (FILE_ALL_INFO *)buf;
 
 		/* could do find first instead but this returns more info */
@@ -649,18 +693,30 @@ int cifs_get_inode_info(struct inode **pinode,
 			cFYI(1, "cifs_sfu_type failed: %d", tmprc);
 	}
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
 	/* fill in 0777 bits from ACL */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-		cFYI(1, "Getting mode bits from ACL");
-		cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
+		rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
+						pfid);
+		if (rc) {
+			cFYI(1, "%s: Getting ACL failed with error: %d",
+				__func__, rc);
+			goto cgii_exit;
+		}
 	}
-#endif
+#endif /* CONFIG_CIFS_ACL */
 
 	/* fill in remaining high mode bits e.g. SUID, VTX */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
 		cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
 
+	/* check for Minshall+French symlinks */
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+		tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		if (tmprc)
+			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+	}
+
 	if (!*pinode) {
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode)
@@ -671,6 +727,7 @@ int cifs_get_inode_info(struct inode **pinode,
 
 cgii_exit:
 	kfree(buf);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -678,7 +735,8 @@ static const struct inode_operations cifs_ipc_inode_ops = {
 	.lookup = cifs_lookup,
 };
 
-char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+			      struct cifsTconInfo *tcon)
 {
 	int pplen = cifs_sb->prepathlen;
 	int dfsplen;
@@ -692,8 +750,8 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 	}
 
-	if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
-		dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
 	else
 		dfsplen = 0;
 
@@ -702,7 +760,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 		return full_path;
 
 	if (dfsplen) {
-		strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+		strncpy(full_path, tcon->treeName, dfsplen);
 		/* switch slash direction in prepath depending on whether
		 * windows or posix style path names
		 */
@@ -728,6 +786,10 @@ cifs_find_inode(struct inode *inode, void *opaque)
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
+	/* use createtime like an i_generation field */
+	if (CIFS_I(inode)->createtime != fattr->cf_createtime)
+		return 0;
+
 	/* don't match inode of different type */
 	if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
 		return 0;
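With cf_createtime recorded, inode matching above doubles the creation time as a generation number, as the new comment says: a deleted-and-recreated file whose unique id was recycled gets a fresh inode. A minimal sketch of the extended match predicate; the struct and values are hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct attrs { uint64_t uniqueid; uint64_t createtime; };

/* Models cifs_find_inode(): reuse a cached inode only when both the server
 * object's unique id and its creation time match. */
static bool inode_matches(const struct attrs *cached, const struct attrs *found)
{
	if (cached->uniqueid != found->uniqueid)
		return false;
	/* use createtime like an i_generation field */
	return cached->createtime == found->createtime;
}

int main(void)
{
	struct attrs old = { 42, 1000 }, recreated = { 42, 2000 };
	printf("match: %d\n", inode_matches(&old, &recreated));	/* prints 0 */
	return 0;
}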
@@ -745,6 +807,7 @@ cifs_init_inode(struct inode *inode, void *opaque)
 	struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
 
 	CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
+	CIFS_I(inode)->createtime = fattr->cf_createtime;
 	return 0;
 }
 
@@ -758,14 +821,14 @@ inode_has_hashed_dentries(struct inode *inode)
 {
 	struct dentry *dentry;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&inode->i_lock);
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
-			spin_unlock(&dcache_lock);
+			spin_unlock(&inode->i_lock);
 			return true;
 		}
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&inode->i_lock);
 	return false;
 }
 
@@ -818,32 +881,34 @@ retry_iget5_locked:
 struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
 	int xid;
-	struct cifs_sb_info *cifs_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct inode *inode = NULL;
 	long rc;
 	char *full_path;
+	struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
 
-	cifs_sb = CIFS_SB(sb);
-	full_path = cifs_build_path_to_root(cifs_sb);
+	full_path = cifs_build_path_to_root(cifs_sb, tcon);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
 	xid = GetXid();
-	if (cifs_sb->tcon->unix_ext)
+	if (tcon->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
						xid, NULL);
 
-	if (!inode)
-		return ERR_PTR(rc);
+	if (!inode) {
+		inode = ERR_PTR(rc);
+		goto out;
+	}
 
 #ifdef CONFIG_CIFS_FSCACHE
 	/* populate tcon->resource_id */
-	cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid;
+	tcon->resource_id = CIFS_I(inode)->uniqueid;
 #endif
 
-	if (rc && cifs_sb->tcon->ipc) {
+	if (rc && tcon->ipc) {
 		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
@@ -852,13 +917,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 		inode->i_uid = cifs_sb->mnt_uid;
 		inode->i_gid = cifs_sb->mnt_gid;
 	} else if (rc) {
-		kfree(full_path);
-		_FreeXid(xid);
 		iget_failed(inode);
-		return ERR_PTR(rc);
+		inode = ERR_PTR(rc);
 	}
 
-
+out:
 	kfree(full_path);
 	/* can not call macro FreeXid here since in a void func
	 * TODO: This is no longer true
@@ -879,7 +942,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	struct cifsFileInfo *open_file;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct tcon_link *tlink = NULL;
+	struct cifsTconInfo *pTcon;
 	FILE_BASIC_INFO	info_buf;
 
 	if (attrs == NULL)
@@ -918,13 +982,22 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	/*
	 * If the file is already open for write, just use that fileid
	 */
-	open_file = find_writable_file(cifsInode);
+	open_file = find_writable_file(cifsInode, true);
 	if (open_file) {
 		netfid = open_file->netfid;
 		netpid = open_file->pid;
+		pTcon = tlink_tcon(open_file->tlink);
 		goto set_via_filehandle;
 	}
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		tlink = NULL;
+		goto out;
+	}
+	pTcon = tlink_tcon(tlink);
+
 	/*
	 * NT4 apparently returns success on this call, but it doesn't
	 * really work.
@@ -968,6 +1041,8 @@ set_via_filehandle:
 	else
 		cifsFileInfo_put(open_file);
 out:
+	if (tlink != NULL)
+		cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -985,10 +1060,16 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
 	struct inode *inode = dentry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	__u32 dosattr, origattr;
 	FILE_BASIC_INFO *info_buf = NULL;
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
			 &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -1057,6 +1138,7 @@ out_close:
 	CIFSSMBClose(xid, tcon, netfid);
 out:
 	kfree(info_buf);
+	cifs_put_tlink(tlink);
 	return rc;
 
 	/*
@@ -1096,12 +1178,18 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct cifsInodeInfo *cifs_inode;
 	struct super_block *sb = dir->i_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct tcon_link *tlink;
+	struct cifsTconInfo *tcon;
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
 	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
 
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
+
 	xid = GetXid();
 
 	/* Unlink can be called from rename so we can not take the
@@ -1109,8 +1197,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto unlink_out;
 	}
 
 	if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1176,10 +1263,11 @@ out_reval:
 		dir->i_ctime = dir->i_mtime = current_fs_time(sb);
 	cifs_inode = CIFS_I(dir);
 	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
-
+unlink_out:
 	kfree(full_path);
 	kfree(attrs);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1188,6 +1276,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	int rc = 0, tmprc;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
@@ -1195,16 +1284,18 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 
 	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
 
-	xid = GetXid();
-
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	pTcon = tlink_tcon(tlink);
+
+	xid = GetXid();
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto mkdir_out;
 	}
 
 	if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1239,10 +1330,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
	to set uid/gid */
 			inc_nlink(inode);
-			if (pTcon->nocase)
-				direntry->d_op = &cifs_ci_dentry_ops;
-			else
-				direntry->d_op = &cifs_dentry_ops;
 
 			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
 			cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1283,10 +1370,6 @@ mkdir_get_info:
 		rc = cifs_get_inode_info(&newinode, full_path, NULL,
					 inode->i_sb, xid, NULL);
 
-		if (pTcon->nocase)
-			direntry->d_op = &cifs_ci_dentry_ops;
-		else
-			direntry->d_op = &cifs_dentry_ops;
 		d_instantiate(direntry, newinode);
 		/* setting nlink not necessary except in cases where we
		 * failed to get it from the server or was set bogus */
@@ -1362,6 +1445,7 @@ mkdir_get_info:
 mkdir_out:
 	kfree(full_path);
 	FreeXid(xid);
+	cifs_put_tlink(tlink);
 	return rc;
 }
 
@@ -1370,6 +1454,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	int rc = 0;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
+	struct tcon_link *tlink;
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
@@ -1378,18 +1463,23 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
-
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto rmdir_exit;
 	}
 
+	cifs_sb = CIFS_SB(inode->i_sb);
+	tlink = cifs_sb_tlink(cifs_sb);
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
+		goto rmdir_exit;
+	}
+	pTcon = tlink_tcon(tlink);
+
 	rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	cifs_put_tlink(tlink);
 
 	if (!rc) {
 		drop_nlink(inode);
@@ -1410,6 +1500,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
		current_fs_time(inode->i_sb);
 
+rmdir_exit:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1420,10 +1511,16 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
	       struct dentry *to_dentry, const char *toPath)
1421{ 1512{
1422 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); 1513 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
1423 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1514 struct tcon_link *tlink;
1515 struct cifsTconInfo *pTcon;
1424 __u16 srcfid; 1516 __u16 srcfid;
1425 int oplock, rc; 1517 int oplock, rc;
1426 1518
1519 tlink = cifs_sb_tlink(cifs_sb);
1520 if (IS_ERR(tlink))
1521 return PTR_ERR(tlink);
1522 pTcon = tlink_tcon(tlink);
1523
1427 /* try path-based rename first */ 1524 /* try path-based rename first */
1428 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, 1525 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
1429 cifs_sb->mnt_cifs_flags & 1526 cifs_sb->mnt_cifs_flags &
@@ -1435,11 +1532,11 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1435 * rename by filehandle to various Windows servers. 1532 * rename by filehandle to various Windows servers.
1436 */ 1533 */
1437 if (rc == 0 || rc != -ETXTBSY) 1534 if (rc == 0 || rc != -ETXTBSY)
1438 return rc; 1535 goto do_rename_exit;
1439 1536
1440 /* open-file renames don't work across directories */ 1537 /* open-file renames don't work across directories */
1441 if (to_dentry->d_parent != from_dentry->d_parent) 1538 if (to_dentry->d_parent != from_dentry->d_parent)
1442 return rc; 1539 goto do_rename_exit;
1443 1540
1444 /* open the file to be renamed -- we need DELETE perms */ 1541 /* open the file to be renamed -- we need DELETE perms */
1445 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, 1542 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
@@ -1455,7 +1552,8 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1455 1552
1456 CIFSSMBClose(xid, pTcon, srcfid); 1553 CIFSSMBClose(xid, pTcon, srcfid);
1457 } 1554 }
1458 1555do_rename_exit:
1556 cifs_put_tlink(tlink);
1459 return rc; 1557 return rc;
1460} 1558}
1461 1559
@@ -1465,13 +1563,17 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1465 char *fromName = NULL; 1563 char *fromName = NULL;
1466 char *toName = NULL; 1564 char *toName = NULL;
1467 struct cifs_sb_info *cifs_sb; 1565 struct cifs_sb_info *cifs_sb;
1566 struct tcon_link *tlink;
1468 struct cifsTconInfo *tcon; 1567 struct cifsTconInfo *tcon;
1469 FILE_UNIX_BASIC_INFO *info_buf_source = NULL; 1568 FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
1470 FILE_UNIX_BASIC_INFO *info_buf_target; 1569 FILE_UNIX_BASIC_INFO *info_buf_target;
1471 int xid, rc, tmprc; 1570 int xid, rc, tmprc;
1472 1571
1473 cifs_sb = CIFS_SB(source_dir->i_sb); 1572 cifs_sb = CIFS_SB(source_dir->i_sb);
1474 tcon = cifs_sb->tcon; 1573 tlink = cifs_sb_tlink(cifs_sb);
1574 if (IS_ERR(tlink))
1575 return PTR_ERR(tlink);
1576 tcon = tlink_tcon(tlink);
1475 1577
1476 xid = GetXid(); 1578 xid = GetXid();
1477 1579
@@ -1547,6 +1649,7 @@ cifs_rename_exit:
1547 kfree(fromName); 1649 kfree(fromName);
1548 kfree(toName); 1650 kfree(toName);
1549 FreeXid(xid); 1651 FreeXid(xid);
1652 cifs_put_tlink(tlink);
1550 return rc; 1653 return rc;
1551} 1654}
1552 1655
@@ -1554,6 +1657,7 @@ static bool
1554cifs_inode_needs_reval(struct inode *inode) 1657cifs_inode_needs_reval(struct inode *inode)
1555{ 1658{
1556 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 1659 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1660 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1557 1661
1558 if (cifs_i->clientCanCacheRead) 1662 if (cifs_i->clientCanCacheRead)
1559 return false; 1663 return false;
@@ -1564,20 +1668,22 @@ cifs_inode_needs_reval(struct inode *inode)
1564 if (cifs_i->time == 0) 1668 if (cifs_i->time == 0)
1565 return true; 1669 return true;
1566 1670
1567 /* FIXME: the actimeo should be tunable */ 1671 if (!time_in_range(jiffies, cifs_i->time,
1568 if (time_after_eq(jiffies, cifs_i->time + HZ)) 1672 cifs_i->time + cifs_sb->actimeo))
1569 return true; 1673 return true;
1570 1674
1571 /* hardlinked files w/ noserverino get "special" treatment */ 1675 /* hardlinked files w/ noserverino get "special" treatment */
1572 if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && 1676 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
1573 S_ISREG(inode->i_mode) && inode->i_nlink != 1) 1677 S_ISREG(inode->i_mode) && inode->i_nlink != 1)
1574 return true; 1678 return true;
1575 1679
1576 return false; 1680 return false;
1577} 1681}
1578 1682
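
Above, the hard-coded one-second attribute timeout (HZ) becomes the per-superblock actimeo tunable, and the comparison switches from time_after_eq() to time_in_range(), which stays correct across jiffies wraparound. Condensed, the freshness check is now (sketch using only names from this hunk):

	/* cached attributes are trusted only while jiffies lies inside
	 * [cifs_i->time, cifs_i->time + cifs_sb->actimeo]; outside that
	 * window (including after a jiffies wrap) we revalidate */
	if (!time_in_range(jiffies, cifs_i->time,
			   cifs_i->time + cifs_sb->actimeo))
		return true;
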
1579/* check invalid_mapping flag and zap the cache if it's set */ 1683/*
1580static void 1684 * Zap the cache. Called when invalid_mapping flag is set.
1685 */
1686void
1581cifs_invalidate_mapping(struct inode *inode) 1687cifs_invalidate_mapping(struct inode *inode)
1582{ 1688{
1583 int rc; 1689 int rc;
@@ -1588,8 +1694,7 @@ cifs_invalidate_mapping(struct inode *inode)
1588 /* write back any cached data */ 1694 /* write back any cached data */
1589 if (inode->i_mapping && inode->i_mapping->nrpages != 0) { 1695 if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
1590 rc = filemap_write_and_wait(inode->i_mapping); 1696 rc = filemap_write_and_wait(inode->i_mapping);
1591 if (rc) 1697 mapping_set_error(inode->i_mapping, rc);
1592 cifs_i->write_behind_rc = rc;
1593 } 1698 }
1594 invalidate_remote_inode(inode); 1699 invalidate_remote_inode(inode);
1595 cifs_fscache_reset_inode_cookie(inode); 1700 cifs_fscache_reset_inode_cookie(inode);
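
The write_behind_rc bookkeeping removed here was cifs-private; mapping_set_error() instead latches a writeback failure as AS_EIO or AS_ENOSPC on the address_space itself, so the next fsync()/msync() on the file reports it through the generic VFS path. The idiom, as used in this and the two setattr hunks below:

	rc = filemap_write_and_wait(inode->i_mapping);
	mapping_set_error(inode->i_mapping, rc);	/* no-op when rc == 0 */
	rc = 0;		/* the current operation proceeds regardless */
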
@@ -1599,11 +1704,12 @@ int cifs_revalidate_file(struct file *filp)
1599{ 1704{
1600 int rc = 0; 1705 int rc = 0;
1601 struct inode *inode = filp->f_path.dentry->d_inode; 1706 struct inode *inode = filp->f_path.dentry->d_inode;
1707 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
1602 1708
1603 if (!cifs_inode_needs_reval(inode)) 1709 if (!cifs_inode_needs_reval(inode))
1604 goto check_inval; 1710 goto check_inval;
1605 1711
1606 if (CIFS_SB(inode->i_sb)->tcon->unix_ext) 1712 if (tlink_tcon(cfile->tlink)->unix_ext)
1607 rc = cifs_get_file_info_unix(filp); 1713 rc = cifs_get_file_info_unix(filp);
1608 else 1714 else
1609 rc = cifs_get_file_info(filp); 1715 rc = cifs_get_file_info(filp);
@@ -1644,7 +1750,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
1644 "jiffies %ld", full_path, inode, inode->i_count.counter, 1750 "jiffies %ld", full_path, inode, inode->i_count.counter,
1645 dentry, dentry->d_time, jiffies); 1751 dentry, dentry->d_time, jiffies);
1646 1752
1647 if (CIFS_SB(sb)->tcon->unix_ext) 1753 if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
1648 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 1754 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1649 else 1755 else
1650 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 1756 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -1660,13 +1766,29 @@ check_inval:
1660} 1766}
1661 1767
1662int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1768int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1663 struct kstat *stat) 1769 struct kstat *stat)
1664{ 1770{
1771 struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
1772 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
1665 int err = cifs_revalidate_dentry(dentry); 1773 int err = cifs_revalidate_dentry(dentry);
1774
1666 if (!err) { 1775 if (!err) {
1667 generic_fillattr(dentry->d_inode, stat); 1776 generic_fillattr(dentry->d_inode, stat);
1668 stat->blksize = CIFS_MAX_MSGSIZE; 1777 stat->blksize = CIFS_MAX_MSGSIZE;
1669 stat->ino = CIFS_I(dentry->d_inode)->uniqueid; 1778 stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
1779
1780 /*
1781 * If on a multiuser mount without unix extensions, and the
1782 * admin hasn't overridden them, set the ownership to the
1783 * fsuid/fsgid of the current process.
1784 */
1785 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
1786 !tcon->unix_ext) {
1787 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
1788 stat->uid = current_fsuid();
1789 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
1790 stat->gid = current_fsgid();
1791 }
1670 } 1792 }
1671 return err; 1793 return err;
1672} 1794}
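
On a CIFS_MOUNT_MULTIUSER mount without UNIX extensions the server has no per-user ownership to report, so getattr now substitutes the calling task's credentials unless they were pinned at mount time. Summarized (assuming the OVERR_UID/OVERR_GID flags correspond to the usual uid=/gid= mount options):

	/*
	 * st_uid/st_gid reported by stat(2) in that configuration:
	 *
	 *   mount options   st_uid            st_gid
	 *   (none)          current_fsuid()   current_fsgid()
	 *   uid=N           N                 current_fsgid()
	 *   uid=N,gid=M     N                 M
	 */
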
@@ -1708,7 +1830,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1708 struct cifsFileInfo *open_file; 1830 struct cifsFileInfo *open_file;
1709 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1831 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1710 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1832 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1711 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1833 struct tcon_link *tlink = NULL;
1834 struct cifsTconInfo *pTcon = NULL;
1712 1835
1713 /* 1836 /*
1714 * To avoid spurious oplock breaks from server, in the case of 1837 * To avoid spurious oplock breaks from server, in the case of
@@ -1719,10 +1842,11 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1719 * writebehind data than the SMB timeout for the SetPathInfo 1842 * writebehind data than the SMB timeout for the SetPathInfo
1720 * request would allow 1843 * request would allow
1721 */ 1844 */
1722 open_file = find_writable_file(cifsInode); 1845 open_file = find_writable_file(cifsInode, true);
1723 if (open_file) { 1846 if (open_file) {
1724 __u16 nfid = open_file->netfid; 1847 __u16 nfid = open_file->netfid;
1725 __u32 npid = open_file->pid; 1848 __u32 npid = open_file->pid;
1849 pTcon = tlink_tcon(open_file->tlink);
1726 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1850 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1727 npid, false); 1851 npid, false);
1728 cifsFileInfo_put(open_file); 1852 cifsFileInfo_put(open_file);
@@ -1737,6 +1861,13 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1737 rc = -EINVAL; 1861 rc = -EINVAL;
1738 1862
1739 if (rc != 0) { 1863 if (rc != 0) {
1864 if (pTcon == NULL) {
1865 tlink = cifs_sb_tlink(cifs_sb);
1866 if (IS_ERR(tlink))
1867 return PTR_ERR(tlink);
1868 pTcon = tlink_tcon(tlink);
1869 }
1870
1740 /* Set file size by pathname rather than by handle 1871 /* Set file size by pathname rather than by handle
1741 either because no valid, writeable file handle for 1872 either because no valid, writeable file handle for
1742 it was found or because there was an error setting 1873 it was found or because there was an error setting
@@ -1766,6 +1897,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1766 CIFSSMBClose(xid, pTcon, netfid); 1897 CIFSSMBClose(xid, pTcon, netfid);
1767 } 1898 }
1768 } 1899 }
1900 if (tlink)
1901 cifs_put_tlink(tlink);
1769 } 1902 }
1770 1903
1771 if (rc == 0) { 1904 if (rc == 0) {
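
cifs_set_file_size() gets the lazy variant of the tlink pattern: an open, writable filehandle already carries a tlink, so its tcon is borrowed for free, and a fresh cifs_sb_tlink() reference is taken only on the path-based fallback. The extra boolean passed to find_writable_file() is added elsewhere in this series (assumed here to restrict which handles match). Skeleton of the flow, error handling trimmed:

	struct tcon_link *tlink = NULL;
	struct cifsTconInfo *pTcon = NULL;

	open_file = find_writable_file(cifsInode, true);
	if (open_file)
		pTcon = tlink_tcon(open_file->tlink);	/* borrowed, no put */

	if (pTcon == NULL) {
		tlink = cifs_sb_tlink(cifs_sb);		/* takes a reference */
		if (IS_ERR(tlink))
			return PTR_ERR(tlink);
		pTcon = tlink_tcon(tlink);
		/* ... path-based SetPathInfo fallback using pTcon ... */
	}

	if (tlink)
		cifs_put_tlink(tlink);			/* only if we took one */
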
@@ -1786,7 +1919,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1786 struct inode *inode = direntry->d_inode; 1919 struct inode *inode = direntry->d_inode;
1787 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1920 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1788 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1921 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1789 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1922 struct tcon_link *tlink;
1923 struct cifsTconInfo *pTcon;
1790 struct cifs_unix_set_info_args *args = NULL; 1924 struct cifs_unix_set_info_args *args = NULL;
1791 struct cifsFileInfo *open_file; 1925 struct cifsFileInfo *open_file;
1792 1926
@@ -1820,10 +1954,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1820 * the flush returns error? 1954 * the flush returns error?
1821 */ 1955 */
1822 rc = filemap_write_and_wait(inode->i_mapping); 1956 rc = filemap_write_and_wait(inode->i_mapping);
1823 if (rc != 0) { 1957 mapping_set_error(inode->i_mapping, rc);
1824 cifsInode->write_behind_rc = rc; 1958 rc = 0;
1825 rc = 0;
1826 }
1827 1959
1828 if (attrs->ia_valid & ATTR_SIZE) { 1960 if (attrs->ia_valid & ATTR_SIZE) {
1829 rc = cifs_set_file_size(inode, attrs, xid, full_path); 1961 rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -1873,17 +2005,25 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1873 args->ctime = NO_CHANGE_64; 2005 args->ctime = NO_CHANGE_64;
1874 2006
1875 args->device = 0; 2007 args->device = 0;
1876 open_file = find_writable_file(cifsInode); 2008 open_file = find_writable_file(cifsInode, true);
1877 if (open_file) { 2009 if (open_file) {
1878 u16 nfid = open_file->netfid; 2010 u16 nfid = open_file->netfid;
1879 u32 npid = open_file->pid; 2011 u32 npid = open_file->pid;
2012 pTcon = tlink_tcon(open_file->tlink);
1880 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); 2013 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1881 cifsFileInfo_put(open_file); 2014 cifsFileInfo_put(open_file);
1882 } else { 2015 } else {
2016 tlink = cifs_sb_tlink(cifs_sb);
2017 if (IS_ERR(tlink)) {
2018 rc = PTR_ERR(tlink);
2019 goto out;
2020 }
2021 pTcon = tlink_tcon(tlink);
1883 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, 2022 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1884 cifs_sb->local_nls, 2023 cifs_sb->local_nls,
1885 cifs_sb->mnt_cifs_flags & 2024 cifs_sb->mnt_cifs_flags &
1886 CIFS_MOUNT_MAP_SPECIAL_CHR); 2025 CIFS_MOUNT_MAP_SPECIAL_CHR);
2026 cifs_put_tlink(tlink);
1887 } 2027 }
1888 2028
1889 if (rc) 2029 if (rc)
@@ -1956,10 +2096,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1956 * the flush returns error? 2096 * the flush returns error?
1957 */ 2097 */
1958 rc = filemap_write_and_wait(inode->i_mapping); 2098 rc = filemap_write_and_wait(inode->i_mapping);
1959 if (rc != 0) { 2099 mapping_set_error(inode->i_mapping, rc);
1960 cifsInode->write_behind_rc = rc; 2100 rc = 0;
1961 rc = 0;
1962 }
1963 2101
1964 if (attrs->ia_valid & ATTR_SIZE) { 2102 if (attrs->ia_valid & ATTR_SIZE) {
1965 rc = cifs_set_file_size(inode, attrs, xid, full_path); 2103 rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -1988,11 +2126,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1988 2126
1989 if (attrs->ia_valid & ATTR_MODE) { 2127 if (attrs->ia_valid & ATTR_MODE) {
1990 rc = 0; 2128 rc = 0;
1991#ifdef CONFIG_CIFS_EXPERIMENTAL 2129#ifdef CONFIG_CIFS_ACL
1992 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) 2130 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
1993 rc = mode_to_acl(inode, full_path, mode); 2131 rc = mode_to_cifs_acl(inode, full_path, mode);
1994 else 2132 if (rc) {
1995#endif 2133 cFYI(1, "%s: Setting ACL failed with error: %d",
2134 __func__, rc);
2135 goto cifs_setattr_exit;
2136 }
2137 } else
2138#endif /* CONFIG_CIFS_ACL */
1996 if (((mode & S_IWUGO) == 0) && 2139 if (((mode & S_IWUGO) == 0) &&
1997 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { 2140 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
1998 2141
@@ -2051,7 +2194,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2051 2194
2052 setattr_copy(inode, attrs); 2195 setattr_copy(inode, attrs);
2053 mark_inode_dirty(inode); 2196 mark_inode_dirty(inode);
2054 return 0;
2055 2197
2056cifs_setattr_exit: 2198cifs_setattr_exit:
2057 kfree(full_path); 2199 kfree(full_path);
@@ -2064,7 +2206,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2064{ 2206{
2065 struct inode *inode = direntry->d_inode; 2207 struct inode *inode = direntry->d_inode;
2066 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 2208 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2067 struct cifsTconInfo *pTcon = cifs_sb->tcon; 2209 struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
2068 2210
2069 if (pTcon->unix_ext) 2211 if (pTcon->unix_ext)
2070 return cifs_setattr_unix(direntry, attrs); 2212 return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 9d38a71c8e14..0c98672d0122 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -37,11 +37,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
37 int xid; 37 int xid;
38 struct cifs_sb_info *cifs_sb; 38 struct cifs_sb_info *cifs_sb;
39#ifdef CONFIG_CIFS_POSIX 39#ifdef CONFIG_CIFS_POSIX
40 struct cifsFileInfo *pSMBFile = filep->private_data;
41 struct cifsTconInfo *tcon;
40 __u64 ExtAttrBits = 0; 42 __u64 ExtAttrBits = 0;
41 __u64 ExtAttrMask = 0; 43 __u64 ExtAttrMask = 0;
42 __u64 caps; 44 __u64 caps;
43 struct cifsTconInfo *tcon;
44 struct cifsFileInfo *pSMBFile = filep->private_data;
45#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
46 46
47 xid = GetXid(); 47 xid = GetXid();
@@ -50,17 +50,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
50 50
51 cifs_sb = CIFS_SB(inode->i_sb); 51 cifs_sb = CIFS_SB(inode->i_sb);
52 52
53#ifdef CONFIG_CIFS_POSIX
54 tcon = cifs_sb->tcon;
55 if (tcon)
56 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
57 else {
58 rc = -EIO;
59 FreeXid(xid);
60 return -EIO;
61 }
62#endif /* CONFIG_CIFS_POSIX */
63
64 switch (command) { 53 switch (command) {
65 case CIFS_IOC_CHECKUMOUNT: 54 case CIFS_IOC_CHECKUMOUNT:
66 cFYI(1, "User unmount attempted"); 55 cFYI(1, "User unmount attempted");
@@ -73,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
73 break; 62 break;
74#ifdef CONFIG_CIFS_POSIX 63#ifdef CONFIG_CIFS_POSIX
75 case FS_IOC_GETFLAGS: 64 case FS_IOC_GETFLAGS:
65 if (pSMBFile == NULL)
66 break;
67 tcon = tlink_tcon(pSMBFile->tlink);
68 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
76 if (CIFS_UNIX_EXTATTR_CAP & caps) { 69 if (CIFS_UNIX_EXTATTR_CAP & caps) {
77 if (pSMBFile == NULL)
78 break;
79 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, 70 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
80 &ExtAttrBits, &ExtAttrMask); 71 &ExtAttrBits, &ExtAttrMask);
81 if (rc == 0) 72 if (rc == 0)
@@ -86,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
86 break; 77 break;
87 78
88 case FS_IOC_SETFLAGS: 79 case FS_IOC_SETFLAGS:
80 if (pSMBFile == NULL)
81 break;
82 tcon = tlink_tcon(pSMBFile->tlink);
83 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
89 if (CIFS_UNIX_EXTATTR_CAP & caps) { 84 if (CIFS_UNIX_EXTATTR_CAP & caps) {
90 if (get_user(ExtAttrBits, (int __user *)arg)) { 85 if (get_user(ExtAttrBits, (int __user *)arg)) {
91 rc = -EFAULT; 86 rc = -EFAULT;
92 break; 87 break;
93 } 88 }
94 if (pSMBFile == NULL)
95 break;
96 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 89 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
97 extAttrBits, &ExtAttrMask);*/ 90 extAttrBits, &ExtAttrMask);*/
98 } 91 }
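
The deleted ioctl prologue dereferenced cifs_sb->tcon for every command, including ones that never touch it, and NULL-checked only the tcon, never the file. Both flags ioctls now begin with the same guard sequence before any tcon access (restated from the hunks above):

	if (pSMBFile == NULL)
		break;			/* no open file: nothing to query */
	tcon = tlink_tcon(pSMBFile->tlink);
	caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
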
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 473ca8033656..e8804d373404 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -29,6 +29,337 @@
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31 31
32#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
33#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
34#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
35#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024)
36#define CIFS_MF_SYMLINK_FILE_SIZE \
37 (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
38
39#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
40#define CIFS_MF_SYMLINK_MD5_FORMAT \
41 "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
42#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
43 md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \
44 md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \
45 md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\
46 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
47
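
These offsets pin down the Minshall+French symlink container: a regular file of exactly 43 + 1024 = 1067 bytes (CIFS_MF_SYMLINK_FILE_SIZE), where "XSym\n" plus the "%04u\n" length line occupy the first 10 bytes and the 32-hex-digit MD5 line the next 33. For a target of "/foo/bar" (8 bytes) the file body would look like:

	offset  0:  "XSym\n"                            magic, 5 bytes
	offset  5:  "0008\n"                            target length as %04u
	offset 10:  32 hex chars of MD5(target), "\n"   integrity check
	offset 43:  "/foo/bar\n"                        the target itself
	            ... spaces pad the file out to 1067 bytes
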
48static int
49symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
50{
51 int rc;
52 unsigned int size;
53 struct crypto_shash *md5;
54 struct sdesc *sdescmd5;
55
56 md5 = crypto_alloc_shash("md5", 0, 0);
57 if (IS_ERR(md5)) {
58 rc = PTR_ERR(md5);
59 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
60 return rc;
61 }
62 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
63 sdescmd5 = kmalloc(size, GFP_KERNEL);
64 if (!sdescmd5) {
65 rc = -ENOMEM;
66 cERROR(1, "%s: Memory allocation failure\n", __func__);
67 goto symlink_hash_err;
68 }
69 sdescmd5->shash.tfm = md5;
70 sdescmd5->shash.flags = 0x0;
71
72 rc = crypto_shash_init(&sdescmd5->shash);
73 if (rc) {
74 cERROR(1, "%s: Could not init md5 shash\n", __func__);
75 goto symlink_hash_err;
76 }
77 crypto_shash_update(&sdescmd5->shash, link_str, link_len);
78 rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
79
80symlink_hash_err:
81 crypto_free_shash(md5);
82 kfree(sdescmd5);
83
84 return rc;
85}
86
87static int
88CIFSParseMFSymlink(const u8 *buf,
89 unsigned int buf_len,
90 unsigned int *_link_len,
91 char **_link_str)
92{
93 int rc;
94 unsigned int link_len;
95 const char *md5_str1;
96 const char *link_str;
97 u8 md5_hash[16];
98 char md5_str2[34];
99
100 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
101 return -EINVAL;
102
103 md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET];
104 link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET];
105
106 rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len);
107 if (rc != 1)
108 return -EINVAL;
109
110 rc = symlink_hash(link_len, link_str, md5_hash);
111 if (rc) {
112 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
113 return rc;
114 }
115
116 snprintf(md5_str2, sizeof(md5_str2),
117 CIFS_MF_SYMLINK_MD5_FORMAT,
118 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
119
120 if (strncmp(md5_str1, md5_str2, 17) != 0)
121 return -EINVAL;
122
123 if (_link_str) {
124 *_link_str = kstrndup(link_str, link_len, GFP_KERNEL);
125 if (!*_link_str)
126 return -ENOMEM;
127 }
128
129 *_link_len = link_len;
130 return 0;
131}
132
133static int
134CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
135{
136 int rc;
137 unsigned int link_len;
138 unsigned int ofs;
139 u8 md5_hash[16];
140
141 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
142 return -EINVAL;
143
144 link_len = strlen(link_str);
145
146 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
147 return -ENAMETOOLONG;
148
149 rc = symlink_hash(link_len, link_str, md5_hash);
150 if (rc) {
151 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
152 return rc;
153 }
154
155 snprintf(buf, buf_len,
156 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
157 link_len,
158 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
159
160 ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
161 memcpy(buf + ofs, link_str, link_len);
162
163 ofs += link_len;
164 if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
165 buf[ofs] = '\n';
166 ofs++;
167 }
168
169 while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
170 buf[ofs] = ' ';
171 ofs++;
172 }
173
174 return 0;
175}
176
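
CIFSFormatMFSymlink() and CIFSParseMFSymlink() are inverses over that container; a round-trip makes this concrete (illustrative sketch only: a 1067-byte on-stack buffer is fine in a test harness, not in real kernel paths, and error handling is elided):

	u8 buf[CIFS_MF_SYMLINK_FILE_SIZE];
	char *target = NULL;
	unsigned int len = 0;

	if (CIFSFormatMFSymlink(buf, sizeof(buf), "/foo/bar") == 0 &&
	    CIFSParseMFSymlink(buf, sizeof(buf), &len, &target) == 0) {
		/* len == 8, target is a kstrndup()'d "/foo/bar" */
		kfree(target);
	}
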
177static int
178CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
179 const char *fromName, const char *toName,
180 const struct nls_table *nls_codepage, int remap)
181{
182 int rc;
183 int oplock = 0;
184 __u16 netfid = 0;
185 u8 *buf;
186 unsigned int bytes_written = 0;
187
188 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
189 if (!buf)
190 return -ENOMEM;
191
192 rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
193 if (rc != 0) {
194 kfree(buf);
195 return rc;
196 }
197
198 rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
199 CREATE_NOT_DIR, &netfid, &oplock, NULL,
200 nls_codepage, remap);
201 if (rc != 0) {
202 kfree(buf);
203 return rc;
204 }
205
206 rc = CIFSSMBWrite(xid, tcon, netfid,
207 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
208 0 /* offset */,
209 &bytes_written, buf, NULL, 0);
210 CIFSSMBClose(xid, tcon, netfid);
211 kfree(buf);
212 if (rc != 0)
213 return rc;
214
215 if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
216 return -EIO;
217
218 return 0;
219}
220
221static int
222CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
223 const unsigned char *searchName, char **symlinkinfo,
224 const struct nls_table *nls_codepage, int remap)
225{
226 int rc;
227 int oplock = 0;
228 __u16 netfid = 0;
229 u8 *buf;
230 char *pbuf;
231 unsigned int bytes_read = 0;
232 int buf_type = CIFS_NO_BUFFER;
233 unsigned int link_len = 0;
234 FILE_ALL_INFO file_info;
235
236 rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
237 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
238 nls_codepage, remap);
239 if (rc != 0)
240 return rc;
241
242 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
243 CIFSSMBClose(xid, tcon, netfid);
244 /* it's not a symlink */
245 return -EINVAL;
246 }
247
248 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
249 if (!buf)
250 return -ENOMEM;
251 pbuf = buf;
252
253 rc = CIFSSMBRead(xid, tcon, netfid,
254 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
255 0 /* offset */,
256 &bytes_read, &pbuf, &buf_type);
257 CIFSSMBClose(xid, tcon, netfid);
258 if (rc != 0) {
259 kfree(buf);
260 return rc;
261 }
262
263 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
264 kfree(buf);
265 if (rc != 0)
266 return rc;
267
268 return 0;
269}
270
271bool
272CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
273{
274 if (!(fattr->cf_mode & S_IFREG))
275 /* it's not a symlink */
276 return false;
277
278 if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
279 /* it's not a symlink */
280 return false;
281
282 return true;
283}
284
285int
286CIFSCheckMFSymlink(struct cifs_fattr *fattr,
287 const unsigned char *path,
288 struct cifs_sb_info *cifs_sb, int xid)
289{
290 int rc;
291 int oplock = 0;
292 __u16 netfid = 0;
293 struct tcon_link *tlink;
294 struct cifsTconInfo *pTcon;
295 u8 *buf;
296 char *pbuf;
297 unsigned int bytes_read = 0;
298 int buf_type = CIFS_NO_BUFFER;
299 unsigned int link_len = 0;
300 FILE_ALL_INFO file_info;
301
302 if (!CIFSCouldBeMFSymlink(fattr))
303 /* it's not a symlink */
304 return 0;
305
306 tlink = cifs_sb_tlink(cifs_sb);
307 if (IS_ERR(tlink))
308 return PTR_ERR(tlink);
309 pTcon = tlink_tcon(tlink);
310
311 rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
312 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
313 cifs_sb->local_nls,
314 cifs_sb->mnt_cifs_flags &
315 CIFS_MOUNT_MAP_SPECIAL_CHR);
316 if (rc != 0)
317 goto out;
318
319 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
320 CIFSSMBClose(xid, pTcon, netfid);
321 /* it's not a symlink */
322 goto out;
323 }
324
325 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
326 if (!buf) {
327 rc = -ENOMEM;
328 goto out;
329 }
330 pbuf = buf;
331
332 rc = CIFSSMBRead(xid, pTcon, netfid,
333 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
334 0 /* offset */,
335 &bytes_read, &pbuf, &buf_type);
336 CIFSSMBClose(xid, pTcon, netfid);
337 if (rc != 0) {
338 kfree(buf);
339 goto out;
340 }
341
342 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
343 kfree(buf);
344 if (rc == -EINVAL) {
345 /* it's not a symlink */
346 rc = 0;
347 goto out;
348 }
349
350 if (rc != 0)
351 goto out;
352
353 /* it is a symlink */
354 fattr->cf_eof = link_len;
355 fattr->cf_mode &= ~S_IFMT;
356 fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
357 fattr->cf_dtype = DT_LNK;
358out:
359 cifs_put_tlink(tlink);
360 return rc;
361}
362
32int 363int
33cifs_hardlink(struct dentry *old_file, struct inode *inode, 364cifs_hardlink(struct dentry *old_file, struct inode *inode,
34 struct dentry *direntry) 365 struct dentry *direntry)
@@ -37,17 +368,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
37 int xid; 368 int xid;
38 char *fromName = NULL; 369 char *fromName = NULL;
39 char *toName = NULL; 370 char *toName = NULL;
40 struct cifs_sb_info *cifs_sb_target; 371 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
372 struct tcon_link *tlink;
41 struct cifsTconInfo *pTcon; 373 struct cifsTconInfo *pTcon;
42 struct cifsInodeInfo *cifsInode; 374 struct cifsInodeInfo *cifsInode;
43 375
44 xid = GetXid(); 376 tlink = cifs_sb_tlink(cifs_sb);
45 377 if (IS_ERR(tlink))
46 cifs_sb_target = CIFS_SB(inode->i_sb); 378 return PTR_ERR(tlink);
47 pTcon = cifs_sb_target->tcon; 379 pTcon = tlink_tcon(tlink);
48 380
49/* No need to check for cross device links since server will do that 381 xid = GetXid();
50 BB note DFS case in future though (when we may have to check) */
51 382
52 fromName = build_path_from_dentry(old_file); 383 fromName = build_path_from_dentry(old_file);
53 toName = build_path_from_dentry(direntry); 384 toName = build_path_from_dentry(direntry);
@@ -56,16 +387,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
56 goto cifs_hl_exit; 387 goto cifs_hl_exit;
57 } 388 }
58 389
59/* if (cifs_sb_target->tcon->ses->capabilities & CAP_UNIX)*/
60 if (pTcon->unix_ext) 390 if (pTcon->unix_ext)
61 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName, 391 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
62 cifs_sb_target->local_nls, 392 cifs_sb->local_nls,
63 cifs_sb_target->mnt_cifs_flags & 393 cifs_sb->mnt_cifs_flags &
64 CIFS_MOUNT_MAP_SPECIAL_CHR); 394 CIFS_MOUNT_MAP_SPECIAL_CHR);
65 else { 395 else {
66 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName, 396 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName,
67 cifs_sb_target->local_nls, 397 cifs_sb->local_nls,
68 cifs_sb_target->mnt_cifs_flags & 398 cifs_sb->mnt_cifs_flags &
69 CIFS_MOUNT_MAP_SPECIAL_CHR); 399 CIFS_MOUNT_MAP_SPECIAL_CHR);
70 if ((rc == -EIO) || (rc == -EINVAL)) 400 if ((rc == -EIO) || (rc == -EINVAL))
71 rc = -EOPNOTSUPP; 401 rc = -EOPNOTSUPP;
@@ -101,6 +431,7 @@ cifs_hl_exit:
101 kfree(fromName); 431 kfree(fromName);
102 kfree(toName); 432 kfree(toName);
103 FreeXid(xid); 433 FreeXid(xid);
434 cifs_put_tlink(tlink);
104 return rc; 435 return rc;
105} 436}
106 437
@@ -113,10 +444,19 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
113 char *full_path = NULL; 444 char *full_path = NULL;
114 char *target_path = NULL; 445 char *target_path = NULL;
115 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 446 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
116 struct cifsTconInfo *tcon = cifs_sb->tcon; 447 struct tcon_link *tlink = NULL;
448 struct cifsTconInfo *tcon;
117 449
118 xid = GetXid(); 450 xid = GetXid();
119 451
452 tlink = cifs_sb_tlink(cifs_sb);
453 if (IS_ERR(tlink)) {
454 rc = PTR_ERR(tlink);
455 tlink = NULL;
456 goto out;
457 }
458 tcon = tlink_tcon(tlink);
459
120 /* 460 /*
121 * For now, we just handle symlinks with unix extensions enabled. 461 * For now, we just handle symlinks with unix extensions enabled.
122 * Eventually we should handle NTFS reparse points, and MacOS 462 * Eventually we should handle NTFS reparse points, and MacOS
@@ -130,7 +470,8 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
130 * but there doesn't seem to be any harm in allowing the client to 470 * but there doesn't seem to be any harm in allowing the client to
131 * read them. 471 * read them.
132 */ 472 */
133 if (!(tcon->ses->capabilities & CAP_UNIX)) { 473 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
474 && !(tcon->ses->capabilities & CAP_UNIX)) {
134 rc = -EACCES; 475 rc = -EACCES;
135 goto out; 476 goto out;
136 } 477 }
@@ -141,8 +482,21 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
141 482
142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode); 483 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
143 484
144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 485 rc = -EACCES;
145 cifs_sb->local_nls); 486 /*
487 * First try Minshall+French Symlinks, if configured
488 * and fallback to UNIX Extensions Symlinks.
489 */
490 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
491 rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
492 cifs_sb->local_nls,
493 cifs_sb->mnt_cifs_flags &
494 CIFS_MOUNT_MAP_SPECIAL_CHR);
495
496 if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX))
497 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
498 cifs_sb->local_nls);
499
146 kfree(full_path); 500 kfree(full_path);
147out: 501out:
148 if (rc != 0) { 502 if (rc != 0) {
@@ -151,6 +505,8 @@ out:
151 } 505 }
152 506
153 FreeXid(xid); 507 FreeXid(xid);
508 if (tlink)
509 cifs_put_tlink(tlink);
154 nd_set_link(nd, target_path); 510 nd_set_link(nd, target_path);
155 return NULL; 511 return NULL;
156} 512}
@@ -160,29 +516,37 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
160{ 516{
161 int rc = -EOPNOTSUPP; 517 int rc = -EOPNOTSUPP;
162 int xid; 518 int xid;
163 struct cifs_sb_info *cifs_sb; 519 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
520 struct tcon_link *tlink;
164 struct cifsTconInfo *pTcon; 521 struct cifsTconInfo *pTcon;
165 char *full_path = NULL; 522 char *full_path = NULL;
166 struct inode *newinode = NULL; 523 struct inode *newinode = NULL;
167 524
168 xid = GetXid(); 525 xid = GetXid();
169 526
170 cifs_sb = CIFS_SB(inode->i_sb); 527 tlink = cifs_sb_tlink(cifs_sb);
171 pTcon = cifs_sb->tcon; 528 if (IS_ERR(tlink)) {
529 rc = PTR_ERR(tlink);
530 goto symlink_exit;
531 }
532 pTcon = tlink_tcon(tlink);
172 533
173 full_path = build_path_from_dentry(direntry); 534 full_path = build_path_from_dentry(direntry);
174
175 if (full_path == NULL) { 535 if (full_path == NULL) {
176 rc = -ENOMEM; 536 rc = -ENOMEM;
177 FreeXid(xid); 537 goto symlink_exit;
178 return rc;
179 } 538 }
180 539
181 cFYI(1, "Full path: %s", full_path); 540 cFYI(1, "Full path: %s", full_path);
182 cFYI(1, "symname is %s", symname); 541 cFYI(1, "symname is %s", symname);
183 542
184 /* BB what if DFS and this volume is on different share? BB */ 543 /* BB what if DFS and this volume is on different share? BB */
185 if (pTcon->unix_ext) 544 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
545 rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
546 cifs_sb->local_nls,
547 cifs_sb->mnt_cifs_flags &
548 CIFS_MOUNT_MAP_SPECIAL_CHR);
549 else if (pTcon->unix_ext)
186 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, 550 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
187 cifs_sb->local_nls); 551 cifs_sb->local_nls);
188 /* else 552 /* else
@@ -201,15 +565,12 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
201 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d", 565 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
202 rc); 566 rc);
203 } else { 567 } else {
204 if (pTcon->nocase)
205 direntry->d_op = &cifs_ci_dentry_ops;
206 else
207 direntry->d_op = &cifs_dentry_ops;
208 d_instantiate(direntry, newinode); 568 d_instantiate(direntry, newinode);
209 } 569 }
210 } 570 }
211 571symlink_exit:
212 kfree(full_path); 572 kfree(full_path);
573 cifs_put_tlink(tlink);
213 FreeXid(xid); 574 FreeXid(xid);
214 return rc; 575 return rc;
215} 576}
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
deleted file mode 100644
index a725c2609d67..000000000000
--- a/fs/cifs/md4.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 Unix SMB/Netbios implementation.
3 Version 1.9.
4 a implementation of MD4 designed for use in the SMB authentication protocol
5 Copyright (C) Andrew Tridgell 1997-1998.
6 Modified by Steve French (sfrench@us.ibm.com) 2002-2003
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*/
22#include <linux/module.h>
23#include <linux/fs.h>
24#include "cifsencrypt.h"
25
26/* NOTE: This code makes no attempt to be fast! */
27
28static __u32
29F(__u32 X, __u32 Y, __u32 Z)
30{
31 return (X & Y) | ((~X) & Z);
32}
33
34static __u32
35G(__u32 X, __u32 Y, __u32 Z)
36{
37 return (X & Y) | (X & Z) | (Y & Z);
38}
39
40static __u32
41H(__u32 X, __u32 Y, __u32 Z)
42{
43 return X ^ Y ^ Z;
44}
45
46static __u32
47lshift(__u32 x, int s)
48{
49 x &= 0xFFFFFFFF;
50 return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
51}
52
53#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s)
54#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s)
55#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s)
56
57/* this applies md4 to 64 byte chunks */
58static void
59mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
60{
61 int j;
62 __u32 AA, BB, CC, DD;
63 __u32 X[16];
64
65
66 for (j = 0; j < 16; j++)
67 X[j] = M[j];
68
69 AA = *A;
70 BB = *B;
71 CC = *C;
72 DD = *D;
73
74 ROUND1(A, B, C, D, 0, 3);
75 ROUND1(D, A, B, C, 1, 7);
76 ROUND1(C, D, A, B, 2, 11);
77 ROUND1(B, C, D, A, 3, 19);
78 ROUND1(A, B, C, D, 4, 3);
79 ROUND1(D, A, B, C, 5, 7);
80 ROUND1(C, D, A, B, 6, 11);
81 ROUND1(B, C, D, A, 7, 19);
82 ROUND1(A, B, C, D, 8, 3);
83 ROUND1(D, A, B, C, 9, 7);
84 ROUND1(C, D, A, B, 10, 11);
85 ROUND1(B, C, D, A, 11, 19);
86 ROUND1(A, B, C, D, 12, 3);
87 ROUND1(D, A, B, C, 13, 7);
88 ROUND1(C, D, A, B, 14, 11);
89 ROUND1(B, C, D, A, 15, 19);
90
91 ROUND2(A, B, C, D, 0, 3);
92 ROUND2(D, A, B, C, 4, 5);
93 ROUND2(C, D, A, B, 8, 9);
94 ROUND2(B, C, D, A, 12, 13);
95 ROUND2(A, B, C, D, 1, 3);
96 ROUND2(D, A, B, C, 5, 5);
97 ROUND2(C, D, A, B, 9, 9);
98 ROUND2(B, C, D, A, 13, 13);
99 ROUND2(A, B, C, D, 2, 3);
100 ROUND2(D, A, B, C, 6, 5);
101 ROUND2(C, D, A, B, 10, 9);
102 ROUND2(B, C, D, A, 14, 13);
103 ROUND2(A, B, C, D, 3, 3);
104 ROUND2(D, A, B, C, 7, 5);
105 ROUND2(C, D, A, B, 11, 9);
106 ROUND2(B, C, D, A, 15, 13);
107
108 ROUND3(A, B, C, D, 0, 3);
109 ROUND3(D, A, B, C, 8, 9);
110 ROUND3(C, D, A, B, 4, 11);
111 ROUND3(B, C, D, A, 12, 15);
112 ROUND3(A, B, C, D, 2, 3);
113 ROUND3(D, A, B, C, 10, 9);
114 ROUND3(C, D, A, B, 6, 11);
115 ROUND3(B, C, D, A, 14, 15);
116 ROUND3(A, B, C, D, 1, 3);
117 ROUND3(D, A, B, C, 9, 9);
118 ROUND3(C, D, A, B, 5, 11);
119 ROUND3(B, C, D, A, 13, 15);
120 ROUND3(A, B, C, D, 3, 3);
121 ROUND3(D, A, B, C, 11, 9);
122 ROUND3(C, D, A, B, 7, 11);
123 ROUND3(B, C, D, A, 15, 15);
124
125 *A += AA;
126 *B += BB;
127 *C += CC;
128 *D += DD;
129
130 *A &= 0xFFFFFFFF;
131 *B &= 0xFFFFFFFF;
132 *C &= 0xFFFFFFFF;
133 *D &= 0xFFFFFFFF;
134
135 for (j = 0; j < 16; j++)
136 X[j] = 0;
137}
138
139static void
140copy64(__u32 *M, unsigned char *in)
141{
142 int i;
143
144 for (i = 0; i < 16; i++)
145 M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) |
146 (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0);
147}
148
149static void
150copy4(unsigned char *out, __u32 x)
151{
152 out[0] = x & 0xFF;
153 out[1] = (x >> 8) & 0xFF;
154 out[2] = (x >> 16) & 0xFF;
155 out[3] = (x >> 24) & 0xFF;
156}
157
158/* produce a md4 message digest from data of length n bytes */
159void
160mdfour(unsigned char *out, unsigned char *in, int n)
161{
162 unsigned char buf[128];
163 __u32 M[16];
164 __u32 b = n * 8;
165 int i;
166 __u32 A = 0x67452301;
167 __u32 B = 0xefcdab89;
168 __u32 C = 0x98badcfe;
169 __u32 D = 0x10325476;
170
171 while (n > 64) {
172 copy64(M, in);
173 mdfour64(M, &A, &B, &C, &D);
174 in += 64;
175 n -= 64;
176 }
177
178 for (i = 0; i < 128; i++)
179 buf[i] = 0;
180 memcpy(buf, in, n);
181 buf[n] = 0x80;
182
183 if (n <= 55) {
184 copy4(buf + 56, b);
185 copy64(M, buf);
186 mdfour64(M, &A, &B, &C, &D);
187 } else {
188 copy4(buf + 120, b);
189 copy64(M, buf);
190 mdfour64(M, &A, &B, &C, &D);
191 copy64(M, buf + 64);
192 mdfour64(M, &A, &B, &C, &D);
193 }
194
195 for (i = 0; i < 128; i++)
196 buf[i] = 0;
197 copy64(M, buf);
198
199 copy4(out, A);
200 copy4(out + 4, B);
201 copy4(out + 8, C);
202 copy4(out + 12, D);
203
204 A = B = C = D = 0;
205}
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
deleted file mode 100644
index 98b66a54c319..000000000000
--- a/fs/cifs/md5.c
+++ /dev/null
@@ -1,366 +0,0 @@
1/*
2 * This code implements the MD5 message-digest algorithm.
3 * The algorithm is due to Ron Rivest. This code was
4 * written by Colin Plumb in 1993, no copyright is claimed.
5 * This code is in the public domain; do with it what you wish.
6 *
7 * Equivalent code is available from RSA Data Security, Inc.
8 * This code has been tested against that, and is equivalent,
9 * except that you don't need to include two pages of legalese
10 * with every copy.
11 *
12 * To compute the message digest of a chunk of bytes, declare an
13 * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
14 * needed on buffers full of bytes, and then call cifs_MD5_final, which
15 * will fill a supplied 16-byte array with the digest.
16 */
17
18/* This code slightly modified to fit into Samba by
19 abartlet@samba.org Jun 2001
20 and to fit the cifs vfs by
21 Steve French sfrench@us.ibm.com */
22
23#include <linux/string.h>
24#include "md5.h"
25
26static void MD5Transform(__u32 buf[4], __u32 const in[16]);
27
28/*
29 * Note: this code is harmless on little-endian machines.
30 */
31static void
32byteReverse(unsigned char *buf, unsigned longs)
33{
34 __u32 t;
35 do {
36 t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
37 ((unsigned) buf[1] << 8 | buf[0]);
38 *(__u32 *) buf = t;
39 buf += 4;
40 } while (--longs);
41}
42
43/*
44 * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
45 * initialization constants.
46 */
47void
48cifs_MD5_init(struct MD5Context *ctx)
49{
50 ctx->buf[0] = 0x67452301;
51 ctx->buf[1] = 0xefcdab89;
52 ctx->buf[2] = 0x98badcfe;
53 ctx->buf[3] = 0x10325476;
54
55 ctx->bits[0] = 0;
56 ctx->bits[1] = 0;
57}
58
59/*
60 * Update context to reflect the concatenation of another buffer full
61 * of bytes.
62 */
63void
64cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
65{
66 register __u32 t;
67
68 /* Update bitcount */
69
70 t = ctx->bits[0];
71 if ((ctx->bits[0] = t + ((__u32) len << 3)) < t)
72 ctx->bits[1]++; /* Carry from low to high */
73 ctx->bits[1] += len >> 29;
74
75 t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
76
77 /* Handle any leading odd-sized chunks */
78
79 if (t) {
80 unsigned char *p = (unsigned char *) ctx->in + t;
81
82 t = 64 - t;
83 if (len < t) {
84 memmove(p, buf, len);
85 return;
86 }
87 memmove(p, buf, t);
88 byteReverse(ctx->in, 16);
89 MD5Transform(ctx->buf, (__u32 *) ctx->in);
90 buf += t;
91 len -= t;
92 }
93 /* Process data in 64-byte chunks */
94
95 while (len >= 64) {
96 memmove(ctx->in, buf, 64);
97 byteReverse(ctx->in, 16);
98 MD5Transform(ctx->buf, (__u32 *) ctx->in);
99 buf += 64;
100 len -= 64;
101 }
102
103 /* Handle any remaining bytes of data. */
104
105 memmove(ctx->in, buf, len);
106}
107
108/*
109 * Final wrapup - pad to 64-byte boundary with the bit pattern
110 * 1 0* (64-bit count of bits processed, MSB-first)
111 */
112void
113cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
114{
115 unsigned int count;
116 unsigned char *p;
117
118 /* Compute number of bytes mod 64 */
119 count = (ctx->bits[0] >> 3) & 0x3F;
120
121 /* Set the first char of padding to 0x80. This is safe since there is
122 always at least one byte free */
123 p = ctx->in + count;
124 *p++ = 0x80;
125
126 /* Bytes of padding needed to make 64 bytes */
127 count = 64 - 1 - count;
128
129 /* Pad out to 56 mod 64 */
130 if (count < 8) {
131 /* Two lots of padding: Pad the first block to 64 bytes */
132 memset(p, 0, count);
133 byteReverse(ctx->in, 16);
134 MD5Transform(ctx->buf, (__u32 *) ctx->in);
135
136 /* Now fill the next block with 56 bytes */
137 memset(ctx->in, 0, 56);
138 } else {
139 /* Pad block to 56 bytes */
140 memset(p, 0, count - 8);
141 }
142 byteReverse(ctx->in, 14);
143
144 /* Append length in bits and transform */
145 ((__u32 *) ctx->in)[14] = ctx->bits[0];
146 ((__u32 *) ctx->in)[15] = ctx->bits[1];
147
148 MD5Transform(ctx->buf, (__u32 *) ctx->in);
149 byteReverse((unsigned char *) ctx->buf, 4);
150 memmove(digest, ctx->buf, 16);
151 memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
152}
153
154/* The four core functions - F1 is optimized somewhat */
155
156/* #define F1(x, y, z) (x & y | ~x & z) */
157#define F1(x, y, z) (z ^ (x & (y ^ z)))
158#define F2(x, y, z) F1(z, x, y)
159#define F3(x, y, z) (x ^ y ^ z)
160#define F4(x, y, z) (y ^ (x | ~z))
161
162/* This is the central step in the MD5 algorithm. */
163#define MD5STEP(f, w, x, y, z, data, s) \
164 (w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x)
165
166/*
167 * The core of the MD5 algorithm, this alters an existing MD5 hash to
168 * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks
169 * the data and converts bytes into longwords for this routine.
170 */
171static void
172MD5Transform(__u32 buf[4], __u32 const in[16])
173{
174 register __u32 a, b, c, d;
175
176 a = buf[0];
177 b = buf[1];
178 c = buf[2];
179 d = buf[3];
180
181 MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
182 MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
183 MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
184 MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
185 MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
186 MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
187 MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
188 MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
189 MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
190 MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
191 MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
192 MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
193 MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
194 MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
195 MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
196 MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
197
198 MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
199 MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
200 MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
201 MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
202 MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
203 MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
204 MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
205 MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
206 MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
207 MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
208 MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
209 MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
210 MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
211 MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
212 MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
213 MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
214
215 MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
216 MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
217 MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
218 MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
219 MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
220 MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
221 MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
222 MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
223 MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
224 MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
225 MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
226 MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
227 MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
228 MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
229 MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
230 MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
231
232 MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
233 MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
234 MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
235 MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
236 MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
237 MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
238 MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
239 MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
240 MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
241 MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
242 MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
243 MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
244 MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
245 MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
246 MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
247 MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
248
249 buf[0] += a;
250 buf[1] += b;
251 buf[2] += c;
252 buf[3] += d;
253}
254
255#if 0 /* currently unused */
256/***********************************************************************
257 the rfc 2104 version of hmac_md5 initialisation.
258***********************************************************************/
259static void
260hmac_md5_init_rfc2104(unsigned char *key, int key_len,
261 struct HMACMD5Context *ctx)
262{
263 int i;
264
265 /* if key is longer than 64 bytes reset it to key=MD5(key) */
266 if (key_len > 64) {
267 unsigned char tk[16];
268 struct MD5Context tctx;
269
270 cifs_MD5_init(&tctx);
271 cifs_MD5_update(&tctx, key, key_len);
272 cifs_MD5_final(tk, &tctx);
273
274 key = tk;
275 key_len = 16;
276 }
277
278 /* start out by storing key in pads */
279 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
280 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
281 memcpy(ctx->k_ipad, key, key_len);
282 memcpy(ctx->k_opad, key, key_len);
283
284 /* XOR key with ipad and opad values */
285 for (i = 0; i < 64; i++) {
286 ctx->k_ipad[i] ^= 0x36;
287 ctx->k_opad[i] ^= 0x5c;
288 }
289
290 cifs_MD5_init(&ctx->ctx);
291 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
292}
293#endif
294
295/***********************************************************************
296 the microsoft version of hmac_md5 initialisation.
297***********************************************************************/
298void
299hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
300 struct HMACMD5Context *ctx)
301{
302 int i;
303
304 /* if key is longer than 64 bytes truncate it */
305 if (key_len > 64)
306 key_len = 64;
307
308 /* start out by storing key in pads */
309 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
310 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
311 memcpy(ctx->k_ipad, key, key_len);
312 memcpy(ctx->k_opad, key, key_len);
313
314 /* XOR key with ipad and opad values */
315 for (i = 0; i < 64; i++) {
316 ctx->k_ipad[i] ^= 0x36;
317 ctx->k_opad[i] ^= 0x5c;
318 }
319
320 cifs_MD5_init(&ctx->ctx);
321 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
322}
323
324/***********************************************************************
325 update hmac_md5 "inner" buffer
326***********************************************************************/
327void
328hmac_md5_update(const unsigned char *text, int text_len,
329 struct HMACMD5Context *ctx)
330{
331 cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */
332}
333
334/***********************************************************************
335 finish off hmac_md5 "inner" buffer and generate outer one.
336***********************************************************************/
337void
338hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
339{
340 struct MD5Context ctx_o;
341
342 cifs_MD5_final(digest, &ctx->ctx);
343
344 cifs_MD5_init(&ctx_o);
345 cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
346 cifs_MD5_update(&ctx_o, digest, 16);
347 cifs_MD5_final(digest, &ctx_o);
348}
349
350/***********************************************************
351 single function to calculate an HMAC MD5 digest from data.
352 use the microsoft hmacmd5 init method because the key is 16 bytes.
353************************************************************/
354#if 0 /* currently unused */
355static void
356hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
357 unsigned char *digest)
358{
359 struct HMACMD5Context ctx;
360 hmac_md5_init_limK_to_64(key, 16, &ctx);
361 if (data_len != 0)
362 hmac_md5_update(data, data_len, &ctx);
363
364 hmac_md5_final(digest, &ctx);
365}
366#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
deleted file mode 100644
index 6fba8cb402fd..000000000000
--- a/fs/cifs/md5.h
+++ /dev/null
@@ -1,38 +0,0 @@
1#ifndef MD5_H
2#define MD5_H
3#ifndef HEADER_MD5_H
4/* Try to avoid clashes with OpenSSL */
5#define HEADER_MD5_H
6#endif
7
8struct MD5Context {
9 __u32 buf[4];
10 __u32 bits[2];
11 unsigned char in[64];
12};
13#endif /* !MD5_H */
14
15#ifndef _HMAC_MD5_H
16struct HMACMD5Context {
17 struct MD5Context ctx;
18 unsigned char k_ipad[65];
19 unsigned char k_opad[65];
20};
21#endif /* _HMAC_MD5_H */
22
23void cifs_MD5_init(struct MD5Context *context);
24void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
25 unsigned len);
26void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
27
28/* The following definitions come from lib/hmacmd5.c */
29
30/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
31 struct HMACMD5Context *ctx);*/
32void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
33 struct HMACMD5Context *ctx);
34void hmac_md5_update(const unsigned char *text, int text_len,
35 struct HMACMD5Context *ctx);
36void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
37/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
38 unsigned char *digest);*/
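
With symlink_hash() built on the kernel crypto API, the hand-rolled md4.c/md5.c/md5.h removed above have no remaining callers here (the signing and auth paths are presumably converted elsewhere in this series) and the three files are deleted outright. The replacement idiom, condensed into a one-shot digest sketch; struct sdesc is the small cifs wrapper already used by symlink_hash() above, with its definition assumed to live elsewhere in cifs:

	static int cifs_md5_digest(const u8 *data, unsigned int len, u8 *out)
	{
		struct crypto_shash *md5;
		struct sdesc *desc;
		int rc;

		md5 = crypto_alloc_shash("md5", 0, 0);
		if (IS_ERR(md5))
			return PTR_ERR(md5);

		desc = kmalloc(sizeof(struct shash_desc) +
			       crypto_shash_descsize(md5), GFP_KERNEL);
		if (!desc) {
			crypto_free_shash(md5);
			return -ENOMEM;
		}
		desc->shash.tfm = md5;
		desc->shash.flags = 0;

		/* init + update + final in a single call */
		rc = crypto_shash_digest(&desc->shash, data, len, out);

		kfree(desc);
		crypto_free_shash(md5);
		return rc;
	}
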
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3ccadc1326d6..2a930a752a78 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -236,10 +236,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
236{ 236{
237 __u16 mid = 0; 237 __u16 mid = 0;
238 __u16 last_mid; 238 __u16 last_mid;
239 int collision; 239 bool collision;
240
241 if (server == NULL)
242 return mid;
243 240
244 spin_lock(&GlobalMid_Lock); 241 spin_lock(&GlobalMid_Lock);
245 last_mid = server->CurrentMid; /* we do not want to loop forever */ 242 last_mid = server->CurrentMid; /* we do not want to loop forever */
@@ -252,24 +249,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
252 (and it would also have to have been a request that 249 (and it would also have to have been a request that
253 did not time out) */ 250 did not time out) */
254 while (server->CurrentMid != last_mid) { 251 while (server->CurrentMid != last_mid) {
255 struct list_head *tmp;
256 struct mid_q_entry *mid_entry; 252 struct mid_q_entry *mid_entry;
253 unsigned int num_mids;
257 254
258 collision = 0; 255 collision = false;
259 if (server->CurrentMid == 0) 256 if (server->CurrentMid == 0)
260 server->CurrentMid++; 257 server->CurrentMid++;
261 258
262 list_for_each(tmp, &server->pending_mid_q) { 259 num_mids = 0;
263 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 260 list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
264 261 ++num_mids;
265 if ((mid_entry->mid == server->CurrentMid) && 262 if (mid_entry->mid == server->CurrentMid &&
266 (mid_entry->midState == MID_REQUEST_SUBMITTED)) { 263 mid_entry->midState == MID_REQUEST_SUBMITTED) {
267 /* This mid is in use, try a different one */ 264 /* This mid is in use, try a different one */
268 collision = 1; 265 collision = true;
269 break; 266 break;
270 } 267 }
271 } 268 }
272 if (collision == 0) { 269
270 /*
271 * if we have more than 32k mids in the list, then something
272 * is very wrong. Possibly a local user is trying to DoS the
273 * box by issuing long-running calls and SIGKILL'ing them. If
274 * we get to 2^16 mids then we're in big trouble as this
275 * function could loop forever.
276 *
277 * Go ahead and assign out the mid in this situation, but force
278 * an eventual reconnect to clean out the pending_mid_q.
279 */
280 if (num_mids > 32768)
281 server->tcpStatus = CifsNeedReconnect;
282
283 if (!collision) {
273 mid = server->CurrentMid; 284 mid = server->CurrentMid;
274 break; 285 break;
275 } 286 }
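
Distilled, the allocation strategy is: advance a 16-bit counter, skip zero and any value still on the pending queue, and stop if a full lap brings us back to the starting mid. A simplified model (a sketch, not the cifs code; in_use() stands in for the pending_mid_q walk):

#include <stdbool.h>

typedef unsigned short u16;

bool in_use(u16 mid);			/* assumed: scans the pending queue */

static u16 get_next_mid_model(u16 *cur)
{
	u16 last = *cur;		/* remember where we started */

	do {
		(*cur)++;
		if (*cur == 0)		/* mid 0 is reserved */
			(*cur)++;
		if (!in_use(*cur))
			return *cur;	/* free slot found */
	} while (*cur != last);

	return *cur;			/* full wrap: every mid busy; the
					 * real code forces a reconnect */
}
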
@@ -347,7 +358,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
347 if (current_fsuid() != treeCon->ses->linux_uid) { 358 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, "Multiuser mode and UID " 359 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid"); 360 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 361 spin_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 362 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 363 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 364 if (ses->linux_uid == current_fsuid()) {
@@ -361,7 +372,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
361 } 372 }
362 } 373 }
363 } 374 }
364 read_unlock(&cifs_tcp_ses_lock); 375 spin_unlock(&cifs_tcp_ses_lock);
365 } 376 }
366 } 377 }
367 } 378 }
@@ -381,29 +392,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
381} 392}
382 393
383static int 394static int
384checkSMBhdr(struct smb_hdr *smb, __u16 mid) 395check_smb_hdr(struct smb_hdr *smb, __u16 mid)
385{ 396{
386 /* Make sure that this really is an SMB, that it is a response, 397 /* does it have the right SMB "signature" ? */
387 and that the message ids match */ 398 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
388 if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) && 399 cERROR(1, "Bad protocol string signature header 0x%x",
389 (mid == smb->Mid)) { 400 *(unsigned int *)smb->Protocol);
390 if (smb->Flags & SMBFLG_RESPONSE) 401 return 1;
391 return 0; 402 }
392 else { 403
393 /* only one valid case where server sends us request */ 404 /* Make sure that message ids match */
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 405 if (mid != smb->Mid) {
395 return 0; 406 cERROR(1, "Mids do not match. received=%u expected=%u",
396 else 407 smb->Mid, mid);
397 cERROR(1, "Received Request not response"); 408 return 1;
398 }
399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, "Bad protocol string signature header %x",
402 *(unsigned int *) smb->Protocol);
403 if (mid != smb->Mid)
404 cERROR(1, "Mids do not match");
405 } 409 }
406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid); 410
411 /* if it's a response then accept */
412 if (smb->Flags & SMBFLG_RESPONSE)
413 return 0;
414
415 /* only one valid case where server sends us request */
416 if (smb->Command == SMB_COM_LOCKING_ANDX)
417 return 0;
418
419 cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
407 return 1; 420 return 1;
408} 421}
409 422
@@ -448,7 +461,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
448 return 1; 461 return 1;
449 } 462 }
450 463
451 if (checkSMBhdr(smb, mid)) 464 if (check_smb_hdr(smb, mid))
452 return 1; 465 return 1;
453 clc_len = smbCalcSize_LE(smb); 466 clc_len = smbCalcSize_LE(smb);
454 467
@@ -465,25 +478,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 478 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
466 return 0; /* bcc wrapped */ 479 return 0; /* bcc wrapped */
467 } 480 }
468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d", 481 cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
469 clc_len, 4 + len, smb->Mid); 482 clc_len, 4 + len, smb->Mid);
470 /* Windows XP can return a few bytes too much, presumably 483
471 an illegal pad, at the end of byte range lock responses 484 if (4 + len < clc_len) {
472 so we allow for that three byte pad, as long as actual 485 cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
473 received length is as long or longer than calculated length */
474 /* We have now had to extend this more, since there is a
475 case in which it needs to be bigger still to handle a
476 malformed response to transact2 findfirst from WinXP when
477 access denied is returned and thus bcc and wct are zero
478 but server says length is 0x21 bytes too long as if the server
479 forgot to reset the smb rfc1001 length when it reset the
480 wct and bcc to minimum size and drop the t2 parms and data */
481 if ((4+len > clc_len) && (len <= clc_len + 512))
482 return 0;
483 else {
484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
485 len, smb->Mid); 486 len, smb->Mid);
486 return 1; 487 return 1;
488 } else if (len > clc_len + 512) {
489 /*
490 * Some servers (Windows XP in particular) send more
491 * data than the lengths in the SMB packet would
492 * indicate on certain calls (byte range locks and
493 * trans2 find first calls in particular). While the
494 * client can handle such a frame by ignoring the
495 * trailing data, we choose to limit the amount of extra
496 * data to 512 bytes.
497 */
498 cERROR(1, "RFC1001 size %u more than 512 bytes larger "
499 "than SMB for mid=%u", len, smb->Mid);
500 return 1;
487 } 501 }
488 } 502 }
489 return 0; 503 return 0;
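
Setting aside the bcc-wrap special case, the acceptance rule reduces to two comparisons: a frame shorter than the calculated SMB size is always bad, and up to 512 bytes of trailing slop is tolerated. A sketch (returns nonzero when the frame is acceptable; len is the RFC1001 payload length, clc_len the size computed from the SMB fields):

static int frame_ok(unsigned int len, unsigned int clc_len)
{
	if (4 + len < clc_len)		/* truncated frame: never ok */
		return 0;
	if (len > clc_len + 512)	/* > 512 bytes of trailing junk */
		return 0;
	return 1;			/* exact match or modest overrun */
}
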
@@ -551,7 +565,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
551 return false; 565 return false;
552 566
553 /* look up tcon based on tid & uid */ 567 /* look up tcon based on tid & uid */
554 read_lock(&cifs_tcp_ses_lock); 568 spin_lock(&cifs_tcp_ses_lock);
555 list_for_each(tmp, &srv->smb_ses_list) { 569 list_for_each(tmp, &srv->smb_ses_list) {
556 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 570 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
557 list_for_each(tmp1, &ses->tcon_list) { 571 list_for_each(tmp1, &ses->tcon_list) {
@@ -560,51 +574,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
560 continue; 574 continue;
561 575
562 cifs_stats_inc(&tcon->num_oplock_brks); 576 cifs_stats_inc(&tcon->num_oplock_brks);
563 read_lock(&GlobalSMBSeslock); 577 spin_lock(&cifs_file_list_lock);
564 list_for_each(tmp2, &tcon->openFileList) { 578 list_for_each(tmp2, &tcon->openFileList) {
565 netfile = list_entry(tmp2, struct cifsFileInfo, 579 netfile = list_entry(tmp2, struct cifsFileInfo,
566 tlist); 580 tlist);
567 if (pSMB->Fid != netfile->netfid) 581 if (pSMB->Fid != netfile->netfid)
568 continue; 582 continue;
569 583
570 /*
571 * don't do anything if file is about to be
572 * closed anyway.
573 */
574 if (netfile->closePend) {
575 read_unlock(&GlobalSMBSeslock);
576 read_unlock(&cifs_tcp_ses_lock);
577 return true;
578 }
579
580 cFYI(1, "file id match, oplock break"); 584 cFYI(1, "file id match, oplock break");
581 pCifsInode = CIFS_I(netfile->pInode); 585 pCifsInode = CIFS_I(netfile->dentry->d_inode);
582 pCifsInode->clientCanCacheAll = false;
583 if (pSMB->OplockLevel == 0)
584 pCifsInode->clientCanCacheRead = false;
585 586
587 cifs_set_oplock_level(pCifsInode,
588 pSMB->OplockLevel ? OPLOCK_READ : 0);
586 /* 589 /*
587 * cifs_oplock_break_put() can't be called 590 * cifs_oplock_break_put() can't be called
588 * from here. Get reference after queueing 591 * from here. Get reference after queueing
589 * succeeded. cifs_oplock_break() will 592 * succeeded. cifs_oplock_break() will
590 * synchronize using GlobalSMSSeslock. 593 * synchronize using cifs_file_list_lock.
591 */ 594 */
592 if (queue_work(system_nrt_wq, 595 if (queue_work(system_nrt_wq,
593 &netfile->oplock_break)) 596 &netfile->oplock_break))
594 cifs_oplock_break_get(netfile); 597 cifs_oplock_break_get(netfile);
595 netfile->oplock_break_cancelled = false; 598 netfile->oplock_break_cancelled = false;
596 599
597 read_unlock(&GlobalSMBSeslock); 600 spin_unlock(&cifs_file_list_lock);
598 read_unlock(&cifs_tcp_ses_lock); 601 spin_unlock(&cifs_tcp_ses_lock);
599 return true; 602 return true;
600 } 603 }
601 read_unlock(&GlobalSMBSeslock); 604 spin_unlock(&cifs_file_list_lock);
602 read_unlock(&cifs_tcp_ses_lock); 605 spin_unlock(&cifs_tcp_ses_lock);
603 cFYI(1, "No matching file for oplock break"); 606 cFYI(1, "No matching file for oplock break");
604 return true; 607 return true;
605 } 608 }
606 } 609 }
607 read_unlock(&cifs_tcp_ses_lock); 610 spin_unlock(&cifs_tcp_ses_lock);
608 cFYI(1, "Can not process oplock break for non-existent connection"); 611 cFYI(1, "Can not process oplock break for non-existent connection");
609 return true; 612 return true;
610} 613}
@@ -648,77 +651,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
648 return; 651 return;
649} 652}
650 653
651/* Convert 16 bit Unicode pathname to wire format from string in current code
652 page. Conversion may involve remapping the seven characters that are
653 legal only in POSIX-like OSes (if they are present in the string). Path
654 names are little endian 16 bit Unicode on the wire */
655int
656cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
657 const struct nls_table *cp, int mapChars)
658{
659 int i, j, charlen;
660 int len_remaining = maxlen;
661 char src_char;
662 __u16 temp;
663
664 if (!mapChars)
665 return cifs_strtoUCS(target, source, PATH_MAX, cp);
666
667 for (i = 0, j = 0; i < maxlen; j++) {
668 src_char = source[i];
669 switch (src_char) {
670 case 0:
671 target[j] = 0;
672 goto ctoUCS_out;
673 case ':':
674 target[j] = cpu_to_le16(UNI_COLON);
675 break;
676 case '*':
677 target[j] = cpu_to_le16(UNI_ASTERIK);
678 break;
679 case '?':
680 target[j] = cpu_to_le16(UNI_QUESTION);
681 break;
682 case '<':
683 target[j] = cpu_to_le16(UNI_LESSTHAN);
684 break;
685 case '>':
686 target[j] = cpu_to_le16(UNI_GRTRTHAN);
687 break;
688 case '|':
689 target[j] = cpu_to_le16(UNI_PIPE);
690 break;
691 /* BB We can not handle remapping slash until
692 all the calls to build_path_from_dentry
693 are modified, as they use slash as separator BB */
694 /* case '\\':
695 target[j] = cpu_to_le16(UNI_SLASH);
696 break;*/
697 default:
698 charlen = cp->char2uni(source+i,
699 len_remaining, &temp);
700 /* if no match, use question mark, which
701 at least in some cases serves as a wild card */
702 if (charlen < 1) {
703 target[j] = cpu_to_le16(0x003f);
704 charlen = 1;
705 } else
706 target[j] = cpu_to_le16(temp);
707 len_remaining -= charlen;
708 /* character may take more than one byte in the
709 source string, but will take exactly two
710 bytes in the target string */
711 i += charlen;
712 continue;
713 }
714 i++; /* move to next char in source string */
715 len_remaining--;
716 }
717
718ctoUCS_out:
719 return i;
720}
721
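
The UNI_* constants this function maps to live in the Unicode private-use area; in cifs_unicode.h each is the ASCII code plus 0xF000, so ':' (0x3a) becomes 0xf03a and so on. The remap step alone, sketched (the real function also runs ordinary characters through cp->char2uni for multibyte input, and deliberately leaves '\\' alone as noted above):

/* shift a reserved POSIX-path character into the private-use area
 * the way UNI_COLON et al. are defined (ASCII + 0xF000) */
static unsigned short remap_reserved(char c)
{
	switch (c) {
	case ':': case '*': case '?':
	case '<': case '>': case '|':
		return 0xF000 + (unsigned char)c;
	default:
		return (unsigned char)c;	/* ASCII fast path only */
	}
}
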
722void 654void
723cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) 655cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
724{ 656{
@@ -729,6 +661,26 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
729 "properly. Hardlinks will not be recognized on this " 661 "properly. Hardlinks will not be recognized on this "
730 "mount. Consider mounting with the \"noserverino\" " 662 "mount. Consider mounting with the \"noserverino\" "
731 "option to silence this message.", 663 "option to silence this message.",
732 cifs_sb->tcon->treeName); 664 cifs_sb_master_tcon(cifs_sb)->treeName);
665 }
666}
667
668void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
669{
670 oplock &= 0xF;
671
672 if (oplock == OPLOCK_EXCLUSIVE) {
673 cinode->clientCanCacheAll = true;
674 cinode->clientCanCacheRead = true;
675 cFYI(1, "Exclusive Oplock granted on inode %p",
676 &cinode->vfs_inode);
677 } else if (oplock == OPLOCK_READ) {
678 cinode->clientCanCacheAll = false;
679 cinode->clientCanCacheRead = true;
680 cFYI(1, "Level II Oplock granted on inode %p",
681 &cinode->vfs_inode);
682 } else {
683 cinode->clientCanCacheAll = false;
684 cinode->clientCanCacheRead = false;
733 } 685 }
734} 686}
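
The new helper collapses the wire oplock byte into the two caching booleans, and the mapping is worth spelling out. A sketch with illustrative level values (the real constants come from cifspdu.h):

#include <stdbool.h>

enum { LVL_NONE = 0, LVL_EXCLUSIVE = 1, LVL_READ = 2 };	/* illustrative */

struct cache_bits { bool can_cache_all, can_cache_read; };

static struct cache_bits oplock_to_cache(int level)
{
	struct cache_bits b = { false, false };

	if (level == LVL_EXCLUSIVE)
		b.can_cache_all = b.can_cache_read = true; /* read+write */
	else if (level == LVL_READ)
		b.can_cache_read = true;	/* level II: reads only */
	return b;				/* anything else: nothing */
}

The oplock-break path above then becomes a one-liner: pass OPLOCK_READ or 0 depending on the level carried in the break notification.
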
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 9aad47a2d62f..8d9189f64477 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
899 } 899 }
900 /* else ERRHRD class errors or junk - return EIO */ 900 /* else ERRHRD class errors or junk - return EIO */
901 901
902 cFYI(1, "Mapping smb error code %d to POSIX err %d", 902 cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
903 smberrcode, rc); 903 le32_to_cpu(smb->Status.CifsError), rc);
904 904
905 /* generic corrective action e.g. reconnect SMB session on 905 /* generic corrective action e.g. reconnect SMB session on
906 * ERRbaduid could be added */ 906 * ERRbaduid could be added */
@@ -916,14 +916,14 @@ unsigned int
916smbCalcSize(struct smb_hdr *ptr) 916smbCalcSize(struct smb_hdr *ptr)
917{ 917{
918 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 918 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
919 2 /* size of the bcc field */ + BCC(ptr)); 919 2 /* size of the bcc field */ + get_bcc(ptr));
920} 920}
921 921
922unsigned int 922unsigned int
923smbCalcSize_LE(struct smb_hdr *ptr) 923smbCalcSize_LE(struct smb_hdr *ptr)
924{ 924{
925 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 925 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
926 2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr))); 926 2 /* size of the bcc field */ + get_bcc_le(ptr));
927} 927}
928 928
929/* The following are taken from fs/ntfs/util.c */ 929/* The following are taken from fs/ntfs/util.c */
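
get_bcc/get_bcc_le read the byte count that sits right after the parameter words, so the size formula is: header, plus two bytes per parameter word, plus the two-byte count field itself, plus the data bytes it counts. Spelled out as a sketch:

unsigned int smb_frame_size(unsigned int hdr_size,	/* sizeof(smb_hdr) */
			    unsigned char word_count,
			    unsigned short bcc)
{
	return hdr_size
	     + 2u * word_count		/* parameter words, 16 bits each */
	     + 2u			/* the byte-count field itself */
	     + bcc;			/* data bytes counted by bcc */
}

For example, a response with WordCount 10 and a bcc of 100 occupies hdr_size + 20 + 2 + 100 bytes.
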
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e75319..5d52e4a3b1ed 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,21 @@
61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
62#define NTLMSSP_NEGOTIATE_56 0x80000000 62#define NTLMSSP_NEGOTIATE_56 0x80000000
63 63
64/* Define AV Pair Field IDs */
65enum av_field_type {
66 NTLMSSP_AV_EOL = 0,
67 NTLMSSP_AV_NB_COMPUTER_NAME,
68 NTLMSSP_AV_NB_DOMAIN_NAME,
69 NTLMSSP_AV_DNS_COMPUTER_NAME,
70 NTLMSSP_AV_DNS_DOMAIN_NAME,
71 NTLMSSP_AV_DNS_TREE_NAME,
72 NTLMSSP_AV_FLAGS,
73 NTLMSSP_AV_TIMESTAMP,
74 NTLMSSP_AV_RESTRICTION,
75 NTLMSSP_AV_TARGET_NAME,
76 NTLMSSP_AV_CHANNEL_BINDINGS
77};
78
64/* Although typedefs are not commonly used for structure definitions */ 79/* Although typedefs are not commonly used for structure definitions */
65/* in the Linux kernel, in this particular case they are useful */ 80/* in the Linux kernel, in this particular case they are useful */
66/* to more closely match the standards document for NTLMSSP from */ 81/* to more closely match the standards document for NTLMSSP from */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index d5e591fab475..f8e4cd2a7912 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
79 cFYI(1, "For %s", name->name); 79 cFYI(1, "For %s", name->name);
80 80
81 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
82 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, parent->d_inode, name);
83 else 83 else
84 name->hash = full_name_hash(name->name, name->len); 84 name->hash = full_name_hash(name->name, name->len);
85 85
@@ -102,11 +102,6 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
102 return NULL; 102 return NULL;
103 } 103 }
104 104
105 if (CIFS_SB(sb)->tcon->nocase)
106 dentry->d_op = &cifs_ci_dentry_ops;
107 else
108 dentry->d_op = &cifs_dentry_ops;
109
110 alias = d_materialise_unique(dentry, inode); 105 alias = d_materialise_unique(dentry, inode);
111 if (alias != NULL) { 106 if (alias != NULL) {
112 dput(dentry); 107 dput(dentry);
@@ -160,6 +155,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
160 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); 155 fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
161 fattr->cf_eof = le64_to_cpu(info->EndOfFile); 156 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
162 fattr->cf_bytes = le64_to_cpu(info->AllocationSize); 157 fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
158 fattr->cf_createtime = le64_to_cpu(info->CreationTime);
163 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); 159 fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
164 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); 160 fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
165 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); 161 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
@@ -171,7 +167,7 @@ static void
171cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, 167cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
172 struct cifs_sb_info *cifs_sb) 168 struct cifs_sb_info *cifs_sb)
173{ 169{
174 int offset = cifs_sb->tcon->ses->server->timeAdj; 170 int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj;
175 171
176 memset(fattr, 0, sizeof(*fattr)); 172 memset(fattr, 0, sizeof(*fattr));
177 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate, 173 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
@@ -199,7 +195,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
199 int len; 195 int len;
200 int oplock = 0; 196 int oplock = 0;
201 int rc; 197 int rc;
202 struct cifsTconInfo *ptcon = cifs_sb->tcon; 198 struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
203 char *tmpbuffer; 199 char *tmpbuffer;
204 200
205 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, 201 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -223,34 +219,38 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
223static int initiate_cifs_search(const int xid, struct file *file) 219static int initiate_cifs_search(const int xid, struct file *file)
224{ 220{
225 int rc = 0; 221 int rc = 0;
226 char *full_path; 222 char *full_path = NULL;
227 struct cifsFileInfo *cifsFile; 223 struct cifsFileInfo *cifsFile;
228 struct cifs_sb_info *cifs_sb; 224 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
225 struct tcon_link *tlink = NULL;
229 struct cifsTconInfo *pTcon; 226 struct cifsTconInfo *pTcon;
230 227
231 if (file->private_data == NULL) { 228 if (file->private_data == NULL) {
232 file->private_data = 229 tlink = cifs_sb_tlink(cifs_sb);
233 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 230 if (IS_ERR(tlink))
231 return PTR_ERR(tlink);
232
233 cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
234 if (cifsFile == NULL) {
235 rc = -ENOMEM;
236 goto error_exit;
237 }
238 file->private_data = cifsFile;
239 cifsFile->tlink = cifs_get_tlink(tlink);
240 pTcon = tlink_tcon(tlink);
241 } else {
242 cifsFile = file->private_data;
243 pTcon = tlink_tcon(cifsFile->tlink);
234 } 244 }
235 245
236 if (file->private_data == NULL)
237 return -ENOMEM;
238 cifsFile = file->private_data;
239 cifsFile->invalidHandle = true; 246 cifsFile->invalidHandle = true;
240 cifsFile->srch_inf.endOfSearch = false; 247 cifsFile->srch_inf.endOfSearch = false;
241 248
242 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
243 if (cifs_sb == NULL)
244 return -EINVAL;
245
246 pTcon = cifs_sb->tcon;
247 if (pTcon == NULL)
248 return -EINVAL;
249
250 full_path = build_path_from_dentry(file->f_path.dentry); 249 full_path = build_path_from_dentry(file->f_path.dentry);
251 250 if (full_path == NULL) {
252 if (full_path == NULL) 251 rc = -ENOMEM;
253 return -ENOMEM; 252 goto error_exit;
253 }
254 254
255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos); 255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
256 256
@@ -283,7 +283,9 @@ ffirst_retry:
283 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 283 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
284 goto ffirst_retry; 284 goto ffirst_retry;
285 } 285 }
286error_exit:
286 kfree(full_path); 287 kfree(full_path);
288 cifs_put_tlink(tlink);
287 return rc; 289 return rc;
288} 290}
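
The restructuring above replaces scattered early returns with a single error_exit tail; that is safe because kfree(NULL) is a no-op and cifs_put_tlink() tolerates a NULL/error pointer. A model of the pattern (get_res/put_res/do_work are hypothetical; put_res(NULL) is assumed NULL-safe like the real helpers):

#include <stdlib.h>
#include <string.h>

void *get_res(void);			/* assumed acquire helper */
void put_res(void *r);			/* assumed release, NULL-safe */
int do_work(void *r, const char *path);

static int open_and_search(const char *name)
{
	void *res = NULL;
	char *path = NULL;
	int rc = 0;

	res = get_res();
	if (!res)
		return -1;		/* nothing acquired yet */

	path = strdup(name);		/* placeholder for path building */
	if (!path) {
		rc = -1;
		goto error_exit;
	}

	rc = do_work(res, path);
error_exit:
	free(path);			/* free(NULL) is a no-op */
	put_res(res);
	return rc;
}
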
289 291
@@ -525,14 +527,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
525 (index_to_find < first_entry_in_buffer)) { 527 (index_to_find < first_entry_in_buffer)) {
526 /* close and restart search */ 528 /* close and restart search */
527 cFYI(1, "search backing up - close and restart search"); 529 cFYI(1, "search backing up - close and restart search");
528 write_lock(&GlobalSMBSeslock); 530 spin_lock(&cifs_file_list_lock);
529 if (!cifsFile->srch_inf.endOfSearch && 531 if (!cifsFile->srch_inf.endOfSearch &&
530 !cifsFile->invalidHandle) { 532 !cifsFile->invalidHandle) {
531 cifsFile->invalidHandle = true; 533 cifsFile->invalidHandle = true;
532 write_unlock(&GlobalSMBSeslock); 534 spin_unlock(&cifs_file_list_lock);
533 CIFSFindClose(xid, pTcon, cifsFile->netfid); 535 CIFSFindClose(xid, pTcon, cifsFile->netfid);
534 } else 536 } else
535 write_unlock(&GlobalSMBSeslock); 537 spin_unlock(&cifs_file_list_lock);
536 if (cifsFile->srch_inf.ntwrk_buf_start) { 538 if (cifsFile->srch_inf.ntwrk_buf_start) {
537 cFYI(1, "freeing SMB ff cache buf on search rewind"); 539 cFYI(1, "freeing SMB ff cache buf on search rewind");
538 if (cifsFile->srch_inf.smallBuf) 540 if (cifsFile->srch_inf.smallBuf)
@@ -738,24 +740,21 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
738 cifs_autodisable_serverino(cifs_sb); 740 cifs_autodisable_serverino(cifs_sb);
739 } 741 }
740 742
743 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
744 CIFSCouldBeMFSymlink(&fattr))
745 /*
746 * trying to get the type and mode can be slow,
747 * so just treat them as regular files for now, and mark
748 * for reval
749 */
750 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
751
741 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 752 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
742 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); 753 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
743 754
744 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, 755 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
745 ino, fattr.cf_dtype); 756 ino, fattr.cf_dtype);
746 757
747 /*
748 * we can not return filldir errors to the caller since they are
749 * "normal" when the stat blocksize is too small - we return remapped
750 * error instead
751 *
752 * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
753 * case already. Why should we be clobbering other errors from it?
754 */
755 if (rc) {
756 cFYI(1, "filldir rc = %d", rc);
757 rc = -EOVERFLOW;
758 }
759 dput(tmp_dentry); 758 dput(tmp_dentry);
760 return rc; 759 return rc;
761} 760}
@@ -765,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
765{ 764{
766 int rc = 0; 765 int rc = 0;
767 int xid, i; 766 int xid, i;
768 struct cifs_sb_info *cifs_sb;
769 struct cifsTconInfo *pTcon; 767 struct cifsTconInfo *pTcon;
770 struct cifsFileInfo *cifsFile = NULL; 768 struct cifsFileInfo *cifsFile = NULL;
771 char *current_entry; 769 char *current_entry;
@@ -776,10 +774,16 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
776 774
777 xid = GetXid(); 775 xid = GetXid();
778 776
779 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 777 /*
780 pTcon = cifs_sb->tcon; 778 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
781 if (pTcon == NULL) 779 * '..'. Otherwise we won't be able to notify VFS in case of failure.
782 return -EINVAL; 780 */
781 if (file->private_data == NULL) {
782 rc = initiate_cifs_search(xid, file);
783 cFYI(1, "initiate cifs search rc %d", rc);
784 if (rc)
785 goto rddir2_exit;
786 }
783 787
784 switch ((int) file->f_pos) { 788 switch ((int) file->f_pos) {
785 case 0: 789 case 0:
@@ -805,14 +809,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
805 if after then keep searching till find it */ 809 if after then keep searching till find it */
806 810
807 if (file->private_data == NULL) { 811 if (file->private_data == NULL) {
808 rc = initiate_cifs_search(xid, file);
809 cFYI(1, "initiate cifs search rc %d", rc);
810 if (rc) {
811 FreeXid(xid);
812 return rc;
813 }
814 }
815 if (file->private_data == NULL) {
816 rc = -EINVAL; 812 rc = -EINVAL;
817 FreeXid(xid); 813 FreeXid(xid);
818 return rc; 814 return rc;
@@ -829,6 +825,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
829 CIFSFindClose(xid, pTcon, cifsFile->netfid); 825 CIFSFindClose(xid, pTcon, cifsFile->netfid);
830 } */ 826 } */
831 827
828 pTcon = tlink_tcon(cifsFile->tlink);
832 rc = find_cifs_entry(xid, pTcon, file, 829 rc = find_cifs_entry(xid, pTcon, file,
833 &current_entry, &num_to_fill); 830 &current_entry, &num_to_fill);
834 if (rc) { 831 if (rc) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5dd..1adc9625a344 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,9 +32,6 @@
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include "cifs_spnego.h" 33#include "cifs_spnego.h"
34 34
35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
36 unsigned char *p24);
37
38/* 35/*
39 * Checks if this is the first smb session to be reconnected after 36 * Checks if this is the first smb session to be reconnected after
40 * the socket has been reestablished (so we know whether to use vc 0). 37 * the socket has been reestablished (so we know whether to use vc 0).
@@ -80,7 +77,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
80 if (max_vcs < 2) 77 if (max_vcs < 2)
81 max_vcs = 0xFFFF; 78 max_vcs = 0xFFFF;
82 79
83 write_lock(&cifs_tcp_ses_lock); 80 spin_lock(&cifs_tcp_ses_lock);
84 if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) 81 if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
85 goto get_vc_num_exit; /* vcnum will be zero */ 82 goto get_vc_num_exit; /* vcnum will be zero */
86 for (i = ses->server->srv_count - 1; i < max_vcs; i++) { 83 for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
@@ -112,7 +109,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
112 vcnum = i; 109 vcnum = i;
113 ses->vcnum = vcnum; 110 ses->vcnum = vcnum;
114get_vc_num_exit: 111get_vc_num_exit:
115 write_unlock(&cifs_tcp_ses_lock); 112 spin_unlock(&cifs_tcp_ses_lock);
116 113
117 return cpu_to_le16(vcnum); 114 return cpu_to_le16(vcnum);
118} 115}
@@ -280,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
280} 277}
281 278
282static void 279static void
283decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, 280decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
284 const struct nls_table *nls_cp) 281 const struct nls_table *nls_cp)
285{ 282{
286 int len; 283 int len;
@@ -326,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
326 return; 323 return;
327} 324}
328 325
329static int decode_ascii_ssetup(char **pbcc_area, int bleft, 326static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
330 struct cifsSesInfo *ses, 327 struct cifsSesInfo *ses,
331 const struct nls_table *nls_cp) 328 const struct nls_table *nls_cp)
332{ 329{
@@ -383,6 +380,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
383static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, 380static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
384 struct cifsSesInfo *ses) 381 struct cifsSesInfo *ses)
385{ 382{
383 unsigned int tioffset; /* challenge message target info area */
384 unsigned int tilen; /* challenge message target info area length */
385
386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
387 387
388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -399,16 +399,27 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
399 return -EINVAL; 399 return -EINVAL;
400 } 400 }
401 401
402 memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); 402 memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
403 /* BB we could decode pblob->NegotiateFlags; some may be useful */ 403 /* BB we could decode pblob->NegotiateFlags; some may be useful */
404 /* In particular we can examine sign flags */ 404 /* In particular we can examine sign flags */
405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then 405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
406 we must set the MIC field of the AUTHENTICATE_MESSAGE */ 406 we must set the MIC field of the AUTHENTICATE_MESSAGE */
407 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
408 tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
409 tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
410 if (tilen) {
411 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
412 if (!ses->auth_key.response) {
413 cERROR(1, "Challenge target info allocation failure");
414 return -ENOMEM;
415 }
416 memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
417 ses->auth_key.len = tilen;
418 }
407 419
408 return 0; 420 return 0;
409} 421}
410 422
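
TargetInfoArray is a standard NTLMSSP SECURITY_BUFFER: a 16-bit length, a 16-bit maximum length, and a 32-bit offset measured from the start of the message. A sketch of extracting such a payload, with an explicit bounds check the code above leaves to its caller (endianness conversion omitted; field names mirror ntlmssp.h):

#include <stddef.h>

struct sec_buf {
	unsigned short len, maxlen;	/* __le16 on the wire */
	unsigned int   off;		/* __le32, from message start */
};

static const char *pull_sec_buf(const char *msg, unsigned int msg_len,
				const struct sec_buf *b,
				unsigned int *out_len)
{
	if (b->off > msg_len || b->len > msg_len - b->off)
		return NULL;		/* offset/length outside the blob */
	*out_len = b->len;
	return msg + b->off;		/* points into the challenge blob */
}
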
411#ifdef CONFIG_CIFS_EXPERIMENTAL
412/* BB Move to ntlmssp.c eventually */ 423/* BB Move to ntlmssp.c eventually */
413 424
414/* We do not malloc the blob, it is passed in pbuffer, because 425/* We do not malloc the blob, it is passed in pbuffer, because
@@ -419,20 +430,23 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
419 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; 430 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
420 __u32 flags; 431 __u32 flags;
421 432
433 memset(pbuffer, 0, sizeof(NEGOTIATE_MESSAGE));
422 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 434 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
423 sec_blob->MessageType = NtLmNegotiate; 435 sec_blob->MessageType = NtLmNegotiate;
424 436
425 /* BB is NTLMV2 session security format easier to use here? */ 437 /* BB is NTLMV2 session security format easier to use here? */
426 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 438 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
427 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 439 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
428 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 440 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
429 if (ses->server->secMode & 441 if (ses->server->secMode &
430 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 442 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
431 flags |= NTLMSSP_NEGOTIATE_SIGN; 443 flags |= NTLMSSP_NEGOTIATE_SIGN;
432 if (ses->server->secMode & SECMODE_SIGN_REQUIRED) 444 if (!ses->server->session_estab)
433 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 445 flags |= NTLMSSP_NEGOTIATE_KEY_XCH |
446 NTLMSSP_NEGOTIATE_EXTENDED_SEC;
447 }
434 448
435 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 449 sec_blob->NegotiateFlags = cpu_to_le32(flags);
436 450
437 sec_blob->WorkstationName.BufferOffset = 0; 451 sec_blob->WorkstationName.BufferOffset = 0;
438 sec_blob->WorkstationName.Length = 0; 452 sec_blob->WorkstationName.Length = 0;
@@ -448,13 +462,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
448 maximum possible size is fixed and small, making this approach cleaner. 462 maximum possible size is fixed and small, making this approach cleaner.
449 This function returns the length of the data in the blob */ 463 This function returns the length of the data in the blob */
450static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 464static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
465 u16 *buflen,
451 struct cifsSesInfo *ses, 466 struct cifsSesInfo *ses,
452 const struct nls_table *nls_cp, bool first) 467 const struct nls_table *nls_cp)
453{ 468{
469 int rc;
454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 470 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
455 __u32 flags; 471 __u32 flags;
456 unsigned char *tmp; 472 unsigned char *tmp;
457 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
458 473
459 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 474 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
460 sec_blob->MessageType = NtLmAuthenticate; 475 sec_blob->MessageType = NtLmAuthenticate;
@@ -462,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
462 flags = NTLMSSP_NEGOTIATE_56 | 477 flags = NTLMSSP_NEGOTIATE_56 |
463 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | 478 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
464 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 479 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
465 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 480 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
466 if (ses->server->secMode & 481 if (ses->server->secMode &
467 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 482 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
468 flags |= NTLMSSP_NEGOTIATE_SIGN; 483 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -470,26 +485,27 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
470 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 485 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
471 486
472 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); 487 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
473 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 488 sec_blob->NegotiateFlags = cpu_to_le32(flags);
474 489
475 sec_blob->LmChallengeResponse.BufferOffset = 490 sec_blob->LmChallengeResponse.BufferOffset =
476 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE)); 491 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
477 sec_blob->LmChallengeResponse.Length = 0; 492 sec_blob->LmChallengeResponse.Length = 0;
478 sec_blob->LmChallengeResponse.MaximumLength = 0; 493 sec_blob->LmChallengeResponse.MaximumLength = 0;
479 494
480 /* calculate session key, BB what about adding similar ntlmv2 path? */
481 SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
482 if (first)
483 cifs_calculate_mac_key(&ses->server->mac_signing_key,
484 ntlm_session_key, ses->password);
485
486 memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
487 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); 495 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
488 sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE); 496 rc = setup_ntlmv2_rsp(ses, nls_cp);
489 sec_blob->NtChallengeResponse.MaximumLength = 497 if (rc) {
490 cpu_to_le16(CIFS_SESS_KEY_SIZE); 498 cERROR(1, "Error %d during NTLMSSP authentication", rc);
499 goto setup_ntlmv2_ret;
500 }
501 memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
502 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
503 tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
491 504
492 tmp += CIFS_SESS_KEY_SIZE; 505 sec_blob->NtChallengeResponse.Length =
506 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
507 sec_blob->NtChallengeResponse.MaximumLength =
508 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
493 509
494 if (ses->domainName == NULL) { 510 if (ses->domainName == NULL) {
495 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 511 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +517,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
501 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, 517 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
502 MAX_USERNAME_SIZE, nls_cp); 518 MAX_USERNAME_SIZE, nls_cp);
503 len *= 2; /* unicode is 2 bytes each */ 519 len *= 2; /* unicode is 2 bytes each */
504 len += 2; /* trailing null */
505 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 520 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
506 sec_blob->DomainName.Length = cpu_to_le16(len); 521 sec_blob->DomainName.Length = cpu_to_le16(len);
507 sec_blob->DomainName.MaximumLength = cpu_to_le16(len); 522 sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +533,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
518 len = cifs_strtoUCS((__le16 *)tmp, ses->userName, 533 len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
519 MAX_USERNAME_SIZE, nls_cp); 534 MAX_USERNAME_SIZE, nls_cp);
520 len *= 2; /* unicode is 2 bytes each */ 535 len *= 2; /* unicode is 2 bytes each */
521 len += 2; /* trailing null */
522 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 536 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
523 sec_blob->UserName.Length = cpu_to_le16(len); 537 sec_blob->UserName.Length = cpu_to_le16(len);
524 sec_blob->UserName.MaximumLength = cpu_to_le16(len); 538 sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,35 +544,25 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
530 sec_blob->WorkstationName.MaximumLength = 0; 544 sec_blob->WorkstationName.MaximumLength = 0;
531 tmp += 2; 545 tmp += 2;
532 546
533 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); 547 if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
534 sec_blob->SessionKey.Length = 0; 548 (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
535 sec_blob->SessionKey.MaximumLength = 0; 549 && !calc_seckey(ses)) {
536 return tmp - pbuffer; 550 memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
537} 551 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
538 552 sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
539 553 sec_blob->SessionKey.MaximumLength =
540static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB, 554 cpu_to_le16(CIFS_CPHTXT_SIZE);
541 struct cifsSesInfo *ses) 555 tmp += CIFS_CPHTXT_SIZE;
542{ 556 } else {
543 build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses); 557 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
544 pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); 558 sec_blob->SessionKey.Length = 0;
545 559 sec_blob->SessionKey.MaximumLength = 0;
546 return; 560 }
547}
548
549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
550 struct cifsSesInfo *ses,
551 const struct nls_table *nls, bool first_time)
552{
553 int bloblen;
554
555 bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
556 first_time);
557 pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
558 561
559 return bloblen; 562setup_ntlmv2_ret:
563 *buflen = tmp - pbuffer;
564 return rc;
560} 565}
561#endif
562 566
563int 567int
564CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 568CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
@@ -571,26 +575,30 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
571 char *str_area; 575 char *str_area;
572 SESSION_SETUP_ANDX *pSMB; 576 SESSION_SETUP_ANDX *pSMB;
573 __u32 capabilities; 577 __u32 capabilities;
574 int count; 578 __u16 count;
575 int resp_buf_type; 579 int resp_buf_type;
576 struct kvec iov[3]; 580 struct kvec iov[3];
577 enum securityEnum type; 581 enum securityEnum type;
578 __u16 action; 582 __u16 action, bytes_remaining;
579 int bytes_remaining;
580 struct key *spnego_key = NULL; 583 struct key *spnego_key = NULL;
581 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 584 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
582 bool first_time; 585 u16 blob_len;
586 char *ntlmsspblob = NULL;
583 587
584 if (ses == NULL) 588 if (ses == NULL)
585 return -EINVAL; 589 return -EINVAL;
586 590
587 read_lock(&cifs_tcp_ses_lock);
588 first_time = is_first_ses_reconnect(ses);
589 read_unlock(&cifs_tcp_ses_lock);
590
591 type = ses->server->secType; 591 type = ses->server->secType;
592
593 cFYI(1, "sess setup type %d", type); 592 cFYI(1, "sess setup type %d", type);
593 if (type == RawNTLMSSP) {
594 /* if memory allocation is successful, the caller of this
595 * function frees it.
596 */
597 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
598 if (!ses->ntlmssp)
599 return -ENOMEM;
600 }
601
594ssetup_ntlmssp_authenticate: 602ssetup_ntlmssp_authenticate:
595 if (phase == NtLmChallenge) 603 if (phase == NtLmChallenge)
596 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 604 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -655,10 +663,14 @@ ssetup_ntlmssp_authenticate:
655 /* no capabilities flags in old lanman negotiation */ 663 /* no capabilities flags in old lanman negotiation */
656 664
657 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 665 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
658 /* BB calculate hash with password */
659 /* and copy into bcc */
660 666
661 calc_lanman_hash(ses->password, ses->server->cryptKey, 667 /* Calculate hash with password and copy into bcc_ptr.
668 * Encryption Key (stored in cryptkey) gets used if the
669 * security mode bit in the Negotiate Protocol response states
670 * to use challenge/response method (i.e. Password bit is 1).
671 */
672
673 calc_lanman_hash(ses->password, ses->server->cryptkey,
662 ses->server->secMode & SECMODE_PW_ENCRYPT ? 674 ses->server->secMode & SECMODE_PW_ENCRYPT ?
663 true : false, lnm_session_key); 675 true : false, lnm_session_key);
664 676
@@ -676,28 +688,27 @@ ssetup_ntlmssp_authenticate:
676 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 688 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
677#endif 689#endif
678 } else if (type == NTLM) { 690 } else if (type == NTLM) {
679 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
680
681 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 691 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
682 pSMB->req_no_secext.CaseInsensitivePasswordLength = 692 pSMB->req_no_secext.CaseInsensitivePasswordLength =
683 cpu_to_le16(CIFS_SESS_KEY_SIZE); 693 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
684 pSMB->req_no_secext.CaseSensitivePasswordLength = 694 pSMB->req_no_secext.CaseSensitivePasswordLength =
685 cpu_to_le16(CIFS_SESS_KEY_SIZE); 695 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
686 696
687 /* calculate session key */ 697 /* calculate ntlm response and session key */
688 SMBNTencrypt(ses->password, ses->server->cryptKey, 698 rc = setup_ntlm_response(ses);
689 ntlm_session_key); 699 if (rc) {
700 cERROR(1, "Error %d during NTLM authentication", rc);
701 goto ssetup_exit;
702 }
690 703
691 if (first_time) /* should this be moved into common code 704 /* copy ntlm response */
692 with similar ntlmv2 path? */ 705 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
693 cifs_calculate_mac_key(&ses->server->mac_signing_key, 706 CIFS_AUTH_RESP_SIZE);
694 ntlm_session_key, ses->password); 707 bcc_ptr += CIFS_AUTH_RESP_SIZE;
695 /* copy session key */ 708 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
709 CIFS_AUTH_RESP_SIZE);
710 bcc_ptr += CIFS_AUTH_RESP_SIZE;
696 711
697 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
698 bcc_ptr += CIFS_SESS_KEY_SIZE;
699 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
700 bcc_ptr += CIFS_SESS_KEY_SIZE;
701 if (ses->capabilities & CAP_UNICODE) { 712 if (ses->capabilities & CAP_UNICODE) {
702 /* unicode strings must be word aligned */ 713 /* unicode strings must be word aligned */
703 if (iov[0].iov_len % 2) { 714 if (iov[0].iov_len % 2) {
@@ -708,33 +719,27 @@ ssetup_ntlmssp_authenticate:
708 } else 719 } else
709 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 720 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
710 } else if (type == NTLMv2) { 721 } else if (type == NTLMv2) {
711 char *v2_sess_key =
712 kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
713
714 /* BB FIXME change all users of v2_sess_key to
715 struct ntlmv2_resp */
716
717 if (v2_sess_key == NULL) {
718 rc = -ENOMEM;
719 goto ssetup_exit;
720 }
721
722 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 722 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
723 723
724 /* LM2 password would be here if we supported it */ 724 /* LM2 password would be here if we supported it */
725 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; 725 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
726 /* cpu_to_le16(LM2_SESS_KEY_SIZE); */
727 726
727 /* calculate nlmv2 response and session key */
728 rc = setup_ntlmv2_rsp(ses, nls_cp);
729 if (rc) {
730 cERROR(1, "Error %d during NTLMv2 authentication", rc);
731 goto ssetup_exit;
732 }
733 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
734 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
735 bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
736
737 /* set case sensitive password length after tilen may have
738 * been assigned; tilen is 0 otherwise.
739 */
728 pSMB->req_no_secext.CaseSensitivePasswordLength = 740 pSMB->req_no_secext.CaseSensitivePasswordLength =
729 cpu_to_le16(sizeof(struct ntlmv2_resp)); 741 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
730 742
731 /* calculate session key */
732 setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
733 /* FIXME: calculate MAC key */
734 memcpy(bcc_ptr, (char *)v2_sess_key,
735 sizeof(struct ntlmv2_resp));
736 bcc_ptr += sizeof(struct ntlmv2_resp);
737 kfree(v2_sess_key);
738 if (ses->capabilities & CAP_UNICODE) { 743 if (ses->capabilities & CAP_UNICODE) {
739 if (iov[0].iov_len % 2) { 744 if (iov[0].iov_len % 2) {
740 *bcc_ptr = 0; 745 *bcc_ptr = 0;
@@ -746,6 +751,7 @@ ssetup_ntlmssp_authenticate:
746 } else if (type == Kerberos) { 751 } else if (type == Kerberos) {
747#ifdef CONFIG_CIFS_UPCALL 752#ifdef CONFIG_CIFS_UPCALL
748 struct cifs_spnego_msg *msg; 753 struct cifs_spnego_msg *msg;
754
749 spnego_key = cifs_get_spnego_key(ses); 755 spnego_key = cifs_get_spnego_key(ses);
750 if (IS_ERR(spnego_key)) { 756 if (IS_ERR(spnego_key)) {
751 rc = PTR_ERR(spnego_key); 757 rc = PTR_ERR(spnego_key);
@@ -763,19 +769,17 @@ ssetup_ntlmssp_authenticate:
763 rc = -EKEYREJECTED; 769 rc = -EKEYREJECTED;
764 goto ssetup_exit; 770 goto ssetup_exit;
765 } 771 }
766 /* bail out if key is too long */ 772
767 if (msg->sesskey_len > 773 ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 774 if (!ses->auth_key.response) {
769 cERROR(1, "Kerberos signing key too long (%u bytes)", 775 cERROR(1, "Kerberos can't allocate (%u bytes) memory",
770 msg->sesskey_len); 776 msg->sesskey_len);
771 rc = -EOVERFLOW; 777 rc = -ENOMEM;
772 goto ssetup_exit; 778 goto ssetup_exit;
773 } 779 }
774 if (first_time) { 780 memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
775 ses->server->mac_signing_key.len = msg->sesskey_len; 781 ses->auth_key.len = msg->sesskey_len;
776 memcpy(ses->server->mac_signing_key.data.krb5, 782
777 msg->data, msg->sesskey_len);
778 }
779 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 783 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
780 capabilities |= CAP_EXTENDED_SECURITY; 784 capabilities |= CAP_EXTENDED_SECURITY;
781 pSMB->req.Capabilities = cpu_to_le32(capabilities); 785 pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -799,54 +803,70 @@ ssetup_ntlmssp_authenticate:
799 rc = -ENOSYS; 803 rc = -ENOSYS;
800 goto ssetup_exit; 804 goto ssetup_exit;
801#endif /* CONFIG_CIFS_UPCALL */ 805#endif /* CONFIG_CIFS_UPCALL */
802 } else { 806 } else if (type == RawNTLMSSP) {
803#ifdef CONFIG_CIFS_EXPERIMENTAL 807 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
804 if (type == RawNTLMSSP) { 808 cERROR(1, "NTLMSSP requires Unicode support");
805 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { 809 rc = -ENOSYS;
806 cERROR(1, "NTLMSSP requires Unicode support"); 810 goto ssetup_exit;
807 rc = -ENOSYS; 811 }
812
813 cFYI(1, "ntlmssp session setup phase %d", phase);
814 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
815 capabilities |= CAP_EXTENDED_SECURITY;
816 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
817 switch(phase) {
818 case NtLmNegotiate:
819 build_ntlmssp_negotiate_blob(
820 pSMB->req.SecurityBlob, ses);
821 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
822 iov[1].iov_base = pSMB->req.SecurityBlob;
823 pSMB->req.SecurityBlobLength =
824 cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
825 break;
826 case NtLmAuthenticate:
827 /*
828 * 5 is an empirical value, large enough to hold
829 * authenticate message plus a max of 10 av pairs,
830 * domain, user, workstation names, flags, etc.
831 */
832 ntlmsspblob = kzalloc(
833 5*sizeof(struct _AUTHENTICATE_MESSAGE),
834 GFP_KERNEL);
835 if (!ntlmsspblob) {
836 cERROR(1, "Can't allocate NTLMSSP blob");
837 rc = -ENOMEM;
808 goto ssetup_exit; 838 goto ssetup_exit;
809 } 839 }
810 840
811 cFYI(1, "ntlmssp session setup phase %d", phase); 841 rc = build_ntlmssp_auth_blob(ntlmsspblob,
812 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 842 &blob_len, ses, nls_cp);
813 capabilities |= CAP_EXTENDED_SECURITY; 843 if (rc)
814 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
815 if (phase == NtLmNegotiate) {
816 setup_ntlmssp_neg_req(pSMB, ses);
817 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
818 } else if (phase == NtLmAuthenticate) {
819 int blob_len;
820 blob_len = setup_ntlmssp_auth_req(pSMB, ses,
821 nls_cp,
822 first_time);
823 iov[1].iov_len = blob_len;
824 /* Make sure that we tell the server that we
825 are using the uid that it just gave us back
826 on the response (challenge) */
827 smb_buf->Uid = ses->Suid;
828 } else {
829 cERROR(1, "invalid phase %d", phase);
830 rc = -ENOSYS;
831 goto ssetup_exit; 844 goto ssetup_exit;
832 } 845 iov[1].iov_len = blob_len;
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0]; 846 iov[1].iov_base = ntlmsspblob;
834 /* unicode strings must be word aligned */ 847 pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
835 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 848 /*
836 *bcc_ptr = 0; 849 * Make sure that we tell the server that we are using
837 bcc_ptr++; 850 * the uid that it just gave us back on the response
838 } 851 * (challenge)
839 unicode_oslm_strings(&bcc_ptr, nls_cp); 852 */
840 } else { 853 smb_buf->Uid = ses->Suid;
841 cERROR(1, "secType %d not supported!", type); 854 break;
855 default:
856 cERROR(1, "invalid phase %d", phase);
842 rc = -ENOSYS; 857 rc = -ENOSYS;
843 goto ssetup_exit; 858 goto ssetup_exit;
844 } 859 }
845#else 860 /* unicode strings must be word aligned */
861 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
862 *bcc_ptr = 0;
863 bcc_ptr++;
864 }
865 unicode_oslm_strings(&bcc_ptr, nls_cp);
866 } else {
846 cERROR(1, "secType %d not supported!", type); 867 cERROR(1, "secType %d not supported!", type);
847 rc = -ENOSYS; 868 rc = -ENOSYS;
848 goto ssetup_exit; 869 goto ssetup_exit;
849#endif
850 } 870 }
851 871
852 iov[2].iov_base = str_area; 872 iov[2].iov_base = str_area;
@@ -855,14 +875,12 @@ ssetup_ntlmssp_authenticate:
855 count = iov[1].iov_len + iov[2].iov_len; 875 count = iov[1].iov_len + iov[2].iov_len;
856 smb_buf->smb_buf_length += count; 876 smb_buf->smb_buf_length += count;
857 877
858 BCC_LE(smb_buf) = cpu_to_le16(count); 878 put_bcc_le(count, smb_buf);
859 879
860 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, 880 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
861 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); 881 CIFS_LOG_ERROR);
862 /* SMB request buf freed in SendReceive2 */ 882 /* SMB request buf freed in SendReceive2 */
863 883
864 cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
865
866 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 884 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
867 smb_buf = (struct smb_hdr *)iov[0].iov_base; 885 smb_buf = (struct smb_hdr *)iov[0].iov_base;
868 886
@@ -891,11 +909,10 @@ ssetup_ntlmssp_authenticate:
891 cFYI(1, "UID = %d ", ses->Suid); 909 cFYI(1, "UID = %d ", ses->Suid);
892 /* response can have either 3 or 4 word count - Samba sends 3 */ 910 /* response can have either 3 or 4 word count - Samba sends 3 */
893 /* and lanman response is 3 */ 911 /* and lanman response is 3 */
894 bytes_remaining = BCC(smb_buf); 912 bytes_remaining = get_bcc(smb_buf);
895 bcc_ptr = pByteArea(smb_buf); 913 bcc_ptr = pByteArea(smb_buf);
896 914
897 if (smb_buf->WordCount == 4) { 915 if (smb_buf->WordCount == 4) {
898 __u16 blob_len;
899 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 916 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
900 if (blob_len > bytes_remaining) { 917 if (blob_len > bytes_remaining) {
901 cERROR(1, "bad security blob length %d", blob_len); 918 cERROR(1, "bad security blob length %d", blob_len);
@@ -931,6 +948,8 @@ ssetup_exit:
931 key_put(spnego_key); 948 key_put(spnego_key);
932 } 949 }
933 kfree(str_area); 950 kfree(str_area);
951 kfree(ntlmsspblob);
952 ntlmsspblob = NULL;
934 if (resp_buf_type == CIFS_SMALL_BUFFER) { 953 if (resp_buf_type == CIFS_SMALL_BUFFER) {
935 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base); 954 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
936 cifs_small_buf_release(iov[0].iov_base); 955 cifs_small_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index b6b6dcb500bf..04721485925d 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -45,7 +45,6 @@
45 up with a different answer to the one above) 45 up with a different answer to the one above)
46*/ 46*/
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include "cifsencrypt.h"
49#define uchar unsigned char 48#define uchar unsigned char
50 49
51static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9, 50static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 192ea51af20f..b5041c849981 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -32,9 +32,8 @@
32#include "cifs_unicode.h" 32#include "cifs_unicode.h"
33#include "cifspdu.h" 33#include "cifspdu.h"
34#include "cifsglob.h" 34#include "cifsglob.h"
35#include "md5.h"
36#include "cifs_debug.h" 35#include "cifs_debug.h"
37#include "cifsencrypt.h" 36#include "cifsproto.h"
38 37
39#ifndef false 38#ifndef false
40#define false 0 39#define false 0
@@ -48,14 +47,58 @@
48#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) 47#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
49#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) 48#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
50 49
51/*The following definitions come from libsmb/smbencrypt.c */ 50/* produce a md4 message digest from data of length n bytes */
51int
52mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
53{
54 int rc;
55 unsigned int size;
56 struct crypto_shash *md4;
57 struct sdesc *sdescmd4;
58
59 md4 = crypto_alloc_shash("md4", 0, 0);
60 if (IS_ERR(md4)) {
61 rc = PTR_ERR(md4);
62 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
63 return rc;
64 }
65 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
66 sdescmd4 = kmalloc(size, GFP_KERNEL);
67 if (!sdescmd4) {
68 rc = -ENOMEM;
69 cERROR(1, "%s: Memory allocation failure\n", __func__);
70 goto mdfour_err;
71 }
72 sdescmd4->shash.tfm = md4;
73 sdescmd4->shash.flags = 0x0;
74
75 rc = crypto_shash_init(&sdescmd4->shash);
76 if (rc) {
77 cERROR(1, "%s: Could not init md4 shash\n", __func__);
78 goto mdfour_err;
79 }
80 crypto_shash_update(&sdescmd4->shash, link_str, link_len);
81 rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
52 82
53void SMBencrypt(unsigned char *passwd, const unsigned char *c8, 83mdfour_err:
54 unsigned char *p24); 84 crypto_free_shash(md4);
55void E_md4hash(const unsigned char *passwd, unsigned char *p16); 85 kfree(sdescmd4);
56static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8, 86
57 unsigned char p24[24]); 87 return rc;
58void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); 88}
89
90/* Does the des encryption from the NT or LM MD4 hash. */
91static void
92SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
93 unsigned char p24[24])
94{
95 unsigned char p21[21];
96
97 memset(p21, '\0', 21);
98
99 memcpy(p21, passwd, 16);
100 E_P24(p21, c8, p24);
101}
59 102
60/* 103/*
61 This implements the X/Open SMB password encryption 104 This implements the X/Open SMB password encryption
@@ -118,9 +161,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
118 * Creates the MD4 Hash of the users password in NT UNICODE. 161 * Creates the MD4 Hash of the users password in NT UNICODE.
119 */ 162 */
120 163
121void 164int
122E_md4hash(const unsigned char *passwd, unsigned char *p16) 165E_md4hash(const unsigned char *passwd, unsigned char *p16)
123{ 166{
167 int rc;
124 int len; 168 int len;
125 __u16 wpwd[129]; 169 __u16 wpwd[129];
126 170
@@ -139,8 +183,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
139 /* Calculate length in bytes */ 183 /* Calculate length in bytes */
140 len = _my_wcslen(wpwd) * sizeof(__u16); 184 len = _my_wcslen(wpwd) * sizeof(__u16);
141 185
142 mdfour(p16, (unsigned char *) wpwd, len); 186 rc = mdfour(p16, (unsigned char *) wpwd, len);
143 memset(wpwd, 0, 129 * 2); 187 memset(wpwd, 0, 129 * 2);
188
189 return rc;
144} 190}
145 191
146#if 0 /* currently unused */ 192#if 0 /* currently unused */
@@ -212,19 +258,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
212} 258}
213#endif 259#endif
214 260
215/* Does the des encryption from the NT or LM MD4 hash. */
216static void
217SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
218 unsigned char p24[24])
219{
220 unsigned char p21[21];
221
222 memset(p21, '\0', 21);
223
224 memcpy(p21, passwd, 16);
225 E_P24(p21, c8, p24);
226}
227
228/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ 261/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
229#if 0 /* currently unused */ 262#if 0 /* currently unused */
230static void 263static void
@@ -242,16 +275,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
242#endif 275#endif
243 276
244/* Does the NT MD4 hash then des encryption. */ 277/* Does the NT MD4 hash then des encryption. */
245 278int
246void
247SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 279SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
248{ 280{
281 int rc;
249 unsigned char p21[21]; 282 unsigned char p21[21];
250 283
251 memset(p21, '\0', 21); 284 memset(p21, '\0', 21);
252 285
253 E_md4hash(passwd, p21); 286 rc = E_md4hash(passwd, p21);
287 if (rc) {
288 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
289 return rc;
290 }
254 SMBOWFencrypt(p21, c8, p24); 291 SMBOWFencrypt(p21, c8, p24);
292 return rc;
255} 293}
256 294
257 295
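For orientation, the NT challenge/response flow this file implements is, in essence, the body of SMBNTencrypt() above; a minimal restatement follows, where "challenge" stands in for the server's 8-byte challenge and "password" for the user's password:

	int rc;
	unsigned char p21[21];	/* 16-byte MD4 hash, zero-padded to 21 bytes */
	unsigned char p24[24];	/* 24-byte NT response returned to the server */

	memset(p21, 0, sizeof(p21));
	rc = E_md4hash(password, p21);		/* MD4 over the UTF-16LE password */
	if (rc == 0)
		E_P24(p21, challenge, p24);	/* three DES encryptions of the
						   challenge, keyed from consecutive
						   7-byte slices of p21 */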
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d6978..fbc5aace54b1 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -36,7 +36,13 @@
36 36
37extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
38 38
39static struct mid_q_entry * 39static void
40wake_up_task(struct mid_q_entry *mid)
41{
42 wake_up_process(mid->callback_data);
43}
44
45struct mid_q_entry *
40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 46AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
41{ 47{
42 struct mid_q_entry *temp; 48 struct mid_q_entry *temp;
@@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 64 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
59 /* when mid allocated can be before when sent */ 65 /* when mid allocated can be before when sent */
60 temp->when_alloc = jiffies; 66 temp->when_alloc = jiffies;
61 temp->tsk = current; 67
68 /*
69 * The default is for the mid to be synchronous, so the
70 * default callback just wakes up the current task.
71 */
72 temp->callback = wake_up_task;
73 temp->callback_data = current;
62 } 74 }
63 75
64 spin_lock(&GlobalMid_Lock);
65 list_add_tail(&temp->qhead, &server->pending_mid_q);
66 atomic_inc(&midCount); 76 atomic_inc(&midCount);
67 temp->midState = MID_REQUEST_ALLOCATED; 77 temp->midState = MID_REQUEST_ALLOCATED;
68 spin_unlock(&GlobalMid_Lock);
69 return temp; 78 return temp;
70} 79}
71 80
72static void 81void
73DeleteMidQEntry(struct mid_q_entry *midEntry) 82DeleteMidQEntry(struct mid_q_entry *midEntry)
74{ 83{
75#ifdef CONFIG_CIFS_STATS2 84#ifdef CONFIG_CIFS_STATS2
76 unsigned long now; 85 unsigned long now;
77#endif 86#endif
78 spin_lock(&GlobalMid_Lock);
79 midEntry->midState = MID_FREE; 87 midEntry->midState = MID_FREE;
80 list_del(&midEntry->qhead);
81 atomic_dec(&midCount); 88 atomic_dec(&midCount);
82 spin_unlock(&GlobalMid_Lock);
83 if (midEntry->largeBuf) 89 if (midEntry->largeBuf)
84 cifs_buf_release(midEntry->resp_buf); 90 cifs_buf_release(midEntry->resp_buf);
85 else 91 else
@@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
103 mempool_free(midEntry, cifs_mid_poolp); 109 mempool_free(midEntry, cifs_mid_poolp);
104} 110}
105 111
112static void
113delete_mid(struct mid_q_entry *mid)
114{
115 spin_lock(&GlobalMid_Lock);
116 list_del(&mid->qhead);
117 spin_unlock(&GlobalMid_Lock);
118
119 DeleteMidQEntry(mid);
120}
121
106static int 122static int
107smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) 123smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
108{ 124{
@@ -119,7 +135,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
119 if (ssocket == NULL) 135 if (ssocket == NULL)
120 return -ENOTSOCK; /* BB eventually add reconnect code here */ 136 return -ENOTSOCK; /* BB eventually add reconnect code here */
121 137
122 smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr; 138 smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
123 smb_msg.msg_namelen = sizeof(struct sockaddr); 139 smb_msg.msg_namelen = sizeof(struct sockaddr);
124 smb_msg.msg_control = NULL; 140 smb_msg.msg_control = NULL;
125 smb_msg.msg_controllen = 0; 141 smb_msg.msg_controllen = 0;
@@ -220,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
220 server->tcpStatus = CifsNeedReconnect; 236 server->tcpStatus = CifsNeedReconnect;
221 } 237 }
222 238
223 if (rc < 0) { 239 if (rc < 0 && rc != -EINTR)
224 cERROR(1, "Error %d sending data on socket to server", rc); 240 cERROR(1, "Error %d sending data on socket to server", rc);
225 } else 241 else
226 rc = 0; 242 rc = 0;
227 243
228 /* Don't want to modify the buffer as a 244 /* Don't want to modify the buffer as a
@@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
244 return smb_sendv(server, &iov, 1); 260 return smb_sendv(server, &iov, 1);
245} 261}
246 262
247static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) 263static int wait_for_free_request(struct TCP_Server_Info *server,
264 const int long_op)
248{ 265{
249 if (long_op == CIFS_ASYNC_OP) { 266 if (long_op == CIFS_ASYNC_OP) {
250 /* oplock breaks must not be held up */ 267 /* oplock breaks must not be held up */
251 atomic_inc(&ses->server->inFlight); 268 atomic_inc(&server->inFlight);
252 return 0; 269 return 0;
253 } 270 }
254 271
255 spin_lock(&GlobalMid_Lock); 272 spin_lock(&GlobalMid_Lock);
256 while (1) { 273 while (1) {
257 if (atomic_read(&ses->server->inFlight) >= 274 if (atomic_read(&server->inFlight) >= cifs_max_pending) {
258 cifs_max_pending){
259 spin_unlock(&GlobalMid_Lock); 275 spin_unlock(&GlobalMid_Lock);
260#ifdef CONFIG_CIFS_STATS2 276#ifdef CONFIG_CIFS_STATS2
261 atomic_inc(&ses->server->num_waiters); 277 atomic_inc(&server->num_waiters);
262#endif 278#endif
263 wait_event(ses->server->request_q, 279 wait_event(server->request_q,
264 atomic_read(&ses->server->inFlight) 280 atomic_read(&server->inFlight)
265 < cifs_max_pending); 281 < cifs_max_pending);
266#ifdef CONFIG_CIFS_STATS2 282#ifdef CONFIG_CIFS_STATS2
267 atomic_dec(&ses->server->num_waiters); 283 atomic_dec(&server->num_waiters);
268#endif 284#endif
269 spin_lock(&GlobalMid_Lock); 285 spin_lock(&GlobalMid_Lock);
270 } else { 286 } else {
271 if (ses->server->tcpStatus == CifsExiting) { 287 if (server->tcpStatus == CifsExiting) {
272 spin_unlock(&GlobalMid_Lock); 288 spin_unlock(&GlobalMid_Lock);
273 return -ENOENT; 289 return -ENOENT;
274 } 290 }
@@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
278 294
279 /* update # of requests on the wire to server */ 295 /* update # of requests on the wire to server */
280 if (long_op != CIFS_BLOCKING_OP) 296 if (long_op != CIFS_BLOCKING_OP)
281 atomic_inc(&ses->server->inFlight); 297 atomic_inc(&server->inFlight);
282 spin_unlock(&GlobalMid_Lock); 298 spin_unlock(&GlobalMid_Lock);
283 break; 299 break;
284 } 300 }
@@ -308,53 +324,85 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
308 *ppmidQ = AllocMidQEntry(in_buf, ses->server); 324 *ppmidQ = AllocMidQEntry(in_buf, ses->server);
309 if (*ppmidQ == NULL) 325 if (*ppmidQ == NULL)
310 return -ENOMEM; 326 return -ENOMEM;
327 spin_lock(&GlobalMid_Lock);
328 list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
329 spin_unlock(&GlobalMid_Lock);
311 return 0; 330 return 0;
312} 331}
313 332
314static int wait_for_response(struct cifsSesInfo *ses, 333static int
315 struct mid_q_entry *midQ, 334wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
316 unsigned long timeout,
317 unsigned long time_to_wait)
318{ 335{
319 unsigned long curr_timeout; 336 int error;
320 337
321 for (;;) { 338 error = wait_event_killable(server->response_q,
322 curr_timeout = timeout + jiffies; 339 midQ->midState != MID_REQUEST_SUBMITTED);
323 wait_event_timeout(ses->server->response_q, 340 if (error < 0)
324 midQ->midState != MID_REQUEST_SUBMITTED, timeout); 341 return -ERESTARTSYS;
325 342
326 if (time_after(jiffies, curr_timeout) && 343 return 0;
327 (midQ->midState == MID_REQUEST_SUBMITTED) && 344}
328 ((ses->server->tcpStatus == CifsGood) ||
329 (ses->server->tcpStatus == CifsNew))) {
330 345
331 unsigned long lrt;
332 346
333 /* We timed out. Is the server still 347/*
334 sending replies ? */ 348 * Send a SMB request and set the callback function in the mid to handle
335 spin_lock(&GlobalMid_Lock); 349 * the result. Caller is responsible for dealing with timeouts.
336 lrt = ses->server->lstrp; 350 */
337 spin_unlock(&GlobalMid_Lock); 351int
352cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
353 mid_callback_t *callback, void *cbdata)
354{
355 int rc;
356 struct mid_q_entry *mid;
338 357
339 /* Calculate time_to_wait past last receive time. 358 rc = wait_for_free_request(server, CIFS_ASYNC_OP);
340 Although we prefer not to time out if the 359 if (rc)
341 server is still responding - we will time 360 return rc;
342 out if the server takes more than 15 (or 45 361
343 or 180) seconds to respond to this request 362 /* enable signing if server requires it */
344 and has not responded to any request from 363 if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
345 other threads on the client within 10 seconds */ 364 in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
346 lrt += time_to_wait; 365
347 if (time_after(jiffies, lrt)) { 366 mutex_lock(&server->srv_mutex);
348 /* No replies for time_to_wait. */ 367 mid = AllocMidQEntry(in_buf, server);
349 cERROR(1, "server not responding"); 368 if (mid == NULL) {
350 return -1; 369 mutex_unlock(&server->srv_mutex);
351 } 370 return -ENOMEM;
352 } else {
353 return 0;
354 }
355 } 371 }
356}
357 372
373 /* put it on the pending_mid_q */
374 spin_lock(&GlobalMid_Lock);
375 list_add_tail(&mid->qhead, &server->pending_mid_q);
376 spin_unlock(&GlobalMid_Lock);
377
378 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
379 if (rc) {
380 mutex_unlock(&server->srv_mutex);
381 goto out_err;
382 }
383
384 mid->callback = callback;
385 mid->callback_data = cbdata;
386 mid->midState = MID_REQUEST_SUBMITTED;
387#ifdef CONFIG_CIFS_STATS2
388 atomic_inc(&server->inSend);
389#endif
390 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
391#ifdef CONFIG_CIFS_STATS2
392 atomic_dec(&server->inSend);
393 mid->when_sent = jiffies;
394#endif
395 mutex_unlock(&server->srv_mutex);
396 if (rc)
397 goto out_err;
398
399 return rc;
400out_err:
401 delete_mid(mid);
402 atomic_dec(&server->inFlight);
403 wake_up(&server->request_q);
404 return rc;
405}
358 406
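A usage sketch for cifs_call_async() (not part of this patch; handle_echo_response is a hypothetical name): the callback runs from the demultiplex thread when the response arrives, so it must not sleep, and it owns both the mid and the in-flight slot it was charged:

	static void
	handle_echo_response(struct mid_q_entry *mid)
	{
		struct TCP_Server_Info *server = mid->callback_data;

		/* inspect mid->resp_buf / mid->midState here, then clean up;
		 * DeleteMidQEntry also releases the response buffer */
		DeleteMidQEntry(mid);
		atomic_dec(&server->inFlight);
		wake_up(&server->request_q);
	}

	rc = cifs_call_async(server, in_buf, handle_echo_response, server);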
359/* 407/*
360 * 408 *
@@ -382,6 +430,81 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
382 return rc; 430 return rc;
383} 431}
384 432
433static int
434sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
435{
436 int rc = 0;
437
438 cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
439 mid->mid, mid->midState);
440
441 spin_lock(&GlobalMid_Lock);
442 /* ensure that it's no longer on the pending_mid_q */
443 list_del_init(&mid->qhead);
444
445 switch (mid->midState) {
446 case MID_RESPONSE_RECEIVED:
447 spin_unlock(&GlobalMid_Lock);
448 return rc;
449 case MID_REQUEST_SUBMITTED:
450 /* socket is going down, reject all calls */
451 if (server->tcpStatus == CifsExiting) {
452 cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
453 __func__, mid->mid, mid->command, mid->midState);
454 rc = -EHOSTDOWN;
455 break;
456 }
457 case MID_RETRY_NEEDED:
458 rc = -EAGAIN;
459 break;
460 default:
461 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
462 mid->mid, mid->midState);
463 rc = -EIO;
464 }
465 spin_unlock(&GlobalMid_Lock);
466
467 DeleteMidQEntry(mid);
468 return rc;
469}
470
471/*
472 * An NT cancel request header looks just like the original request except:
473 *
474 * The Command is SMB_COM_NT_CANCEL
475 * The WordCount is zeroed out
476 * The ByteCount is zeroed out
477 *
478 * This function mangles an existing request buffer into a
479 * SMB_COM_NT_CANCEL request and then sends it.
480 */
481static int
482send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
483 struct mid_q_entry *mid)
484{
485 int rc = 0;
486
487 /* -4 for RFC1001 length and +2 for BCC field */
488 in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2;
489 in_buf->Command = SMB_COM_NT_CANCEL;
490 in_buf->WordCount = 0;
491 put_bcc_le(0, in_buf);
492
493 mutex_lock(&server->srv_mutex);
494 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
495 if (rc) {
496 mutex_unlock(&server->srv_mutex);
497 return rc;
498 }
499 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
500 mutex_unlock(&server->srv_mutex);
501
502 cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
503 in_buf->Mid, rc);
504
505 return rc;
506}
507
385int 508int
386SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, 509SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
387 struct kvec *iov, int n_vec, int *pRespBufType /* ret */, 510 struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -390,7 +513,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
390 int rc = 0; 513 int rc = 0;
391 int long_op; 514 int long_op;
392 unsigned int receive_len; 515 unsigned int receive_len;
393 unsigned long timeout;
394 struct mid_q_entry *midQ; 516 struct mid_q_entry *midQ;
395 struct smb_hdr *in_buf = iov[0].iov_base; 517 struct smb_hdr *in_buf = iov[0].iov_base;
396 518
@@ -413,7 +535,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
413 to the same server. We may make this configurable later or 535 to the same server. We may make this configurable later or
414 use ses->maxReq */ 536 use ses->maxReq */
415 537
416 rc = wait_for_free_request(ses, long_op); 538 rc = wait_for_free_request(ses->server, long_op);
417 if (rc) { 539 if (rc) {
418 cifs_small_buf_release(in_buf); 540 cifs_small_buf_release(in_buf);
419 return rc; 541 return rc;
@@ -452,70 +574,41 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
452#endif 574#endif
453 575
454 mutex_unlock(&ses->server->srv_mutex); 576 mutex_unlock(&ses->server->srv_mutex);
455 cifs_small_buf_release(in_buf);
456 577
457 if (rc < 0) 578 if (rc < 0) {
458 goto out; 579 cifs_small_buf_release(in_buf);
459
460 if (long_op == CIFS_STD_OP)
461 timeout = 15 * HZ;
462 else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
463 timeout = 180 * HZ;
464 else if (long_op == CIFS_LONG_OP)
465 timeout = 45 * HZ; /* should be greater than
466 server's oplock break timeout (about 43 seconds) */
467 else if (long_op == CIFS_ASYNC_OP)
468 goto out;
469 else if (long_op == CIFS_BLOCKING_OP)
470 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
471 else {
472 cERROR(1, "unknown timeout flag %d", long_op);
473 rc = -EIO;
474 goto out; 580 goto out;
475 } 581 }
476 582
477 /* wait for 15 seconds or until woken up due to response arriving or 583 if (long_op == CIFS_ASYNC_OP) {
478 due to last connection to this server being unmounted */ 584 cifs_small_buf_release(in_buf);
479 if (signal_pending(current)) { 585 goto out;
480 /* if signal pending do not hold up user for full smb timeout
481 but we still give response a chance to complete */
482 timeout = 2 * HZ;
483 } 586 }
484 587
485 /* No user interrupts in wait - wreaks havoc with performance */ 588 rc = wait_for_response(ses->server, midQ);
486 wait_for_response(ses, midQ, timeout, 10 * HZ); 589 if (rc != 0) {
487 590 send_nt_cancel(ses->server, in_buf, midQ);
488 spin_lock(&GlobalMid_Lock); 591 spin_lock(&GlobalMid_Lock);
489
490 if (midQ->resp_buf == NULL) {
491 cERROR(1, "No response to cmd %d mid %d",
492 midQ->command, midQ->mid);
493 if (midQ->midState == MID_REQUEST_SUBMITTED) { 592 if (midQ->midState == MID_REQUEST_SUBMITTED) {
494 if (ses->server->tcpStatus == CifsExiting) 593 midQ->callback = DeleteMidQEntry;
495 rc = -EHOSTDOWN; 594 spin_unlock(&GlobalMid_Lock);
496 else { 595 cifs_small_buf_release(in_buf);
497 ses->server->tcpStatus = CifsNeedReconnect; 596 atomic_dec(&ses->server->inFlight);
498 midQ->midState = MID_RETRY_NEEDED; 597 wake_up(&ses->server->request_q);
499 } 598 return rc;
500 }
501
502 if (rc != -EHOSTDOWN) {
503 if (midQ->midState == MID_RETRY_NEEDED) {
504 rc = -EAGAIN;
505 cFYI(1, "marking request for retry");
506 } else {
507 rc = -EIO;
508 }
509 } 599 }
510 spin_unlock(&GlobalMid_Lock); 600 spin_unlock(&GlobalMid_Lock);
511 DeleteMidQEntry(midQ); 601 }
512 /* Update # of requests on wire to server */ 602
603 cifs_small_buf_release(in_buf);
604
605 rc = sync_mid_result(midQ, ses->server);
606 if (rc != 0) {
513 atomic_dec(&ses->server->inFlight); 607 atomic_dec(&ses->server->inFlight);
514 wake_up(&ses->server->request_q); 608 wake_up(&ses->server->request_q);
515 return rc; 609 return rc;
516 } 610 }
517 611
518 spin_unlock(&GlobalMid_Lock);
519 receive_len = midQ->resp_buf->smb_buf_length; 612 receive_len = midQ->resp_buf->smb_buf_length;
520 613
521 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 614 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -543,7 +636,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 636 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
544 SECMODE_SIGN_ENABLED))) { 637 SECMODE_SIGN_ENABLED))) {
545 rc = cifs_verify_signature(midQ->resp_buf, 638 rc = cifs_verify_signature(midQ->resp_buf,
546 &ses->server->mac_signing_key, 639 ses->server,
547 midQ->sequence_number+1); 640 midQ->sequence_number+1);
548 if (rc) { 641 if (rc) {
549 cERROR(1, "Unexpected SMB signature"); 642 cERROR(1, "Unexpected SMB signature");
@@ -559,19 +652,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
559 if (receive_len >= sizeof(struct smb_hdr) - 4 652 if (receive_len >= sizeof(struct smb_hdr) - 4
560 /* do not count RFC1001 header */ + 653 /* do not count RFC1001 header */ +
561 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ ) 654 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
562 BCC(midQ->resp_buf) = 655 put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
563 le16_to_cpu(BCC_LE(midQ->resp_buf));
564 if ((flags & CIFS_NO_RESP) == 0) 656 if ((flags & CIFS_NO_RESP) == 0)
565 midQ->resp_buf = NULL; /* mark it so buf will 657 midQ->resp_buf = NULL; /* mark it so buf will
566 not be freed by 658 not be freed by
567 DeleteMidQEntry */ 659 delete_mid */
568 } else { 660 } else {
569 rc = -EIO; 661 rc = -EIO;
570 cFYI(1, "Bad MID state?"); 662 cFYI(1, "Bad MID state?");
571 } 663 }
572 664
573out: 665out:
574 DeleteMidQEntry(midQ); 666 delete_mid(midQ);
575 atomic_dec(&ses->server->inFlight); 667 atomic_dec(&ses->server->inFlight);
576 wake_up(&ses->server->request_q); 668 wake_up(&ses->server->request_q);
577 669
@@ -585,7 +677,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
585{ 677{
586 int rc = 0; 678 int rc = 0;
587 unsigned int receive_len; 679 unsigned int receive_len;
588 unsigned long timeout;
589 struct mid_q_entry *midQ; 680 struct mid_q_entry *midQ;
590 681
591 if (ses == NULL) { 682 if (ses == NULL) {
@@ -610,7 +701,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
610 return -EIO; 701 return -EIO;
611 } 702 }
612 703
613 rc = wait_for_free_request(ses, long_op); 704 rc = wait_for_free_request(ses->server, long_op);
614 if (rc) 705 if (rc)
615 return rc; 706 return rc;
616 707
@@ -649,64 +740,31 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
649 if (rc < 0) 740 if (rc < 0)
650 goto out; 741 goto out;
651 742
652 if (long_op == CIFS_STD_OP) 743 if (long_op == CIFS_ASYNC_OP)
653 timeout = 15 * HZ;
654 /* wait for 15 seconds or until woken up due to response arriving or
655 due to last connection to this server being unmounted */
656 else if (long_op == CIFS_ASYNC_OP)
657 goto out;
658 else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
659 timeout = 180 * HZ;
660 else if (long_op == CIFS_LONG_OP)
661 timeout = 45 * HZ; /* should be greater than
662 server's oplock break timeout (about 43 seconds) */
663 else if (long_op == CIFS_BLOCKING_OP)
664 timeout = 0x7FFFFFFF; /* large but not so large as to wrap */
665 else {
666 cERROR(1, "unknown timeout flag %d", long_op);
667 rc = -EIO;
668 goto out; 744 goto out;
669 }
670
671 if (signal_pending(current)) {
672 /* if signal pending do not hold up user for full smb timeout
673 but we still give response a chance to complete */
674 timeout = 2 * HZ;
675 }
676
677 /* No user interrupts in wait - wreaks havoc with performance */
678 wait_for_response(ses, midQ, timeout, 10 * HZ);
679 745
680 spin_lock(&GlobalMid_Lock); 746 rc = wait_for_response(ses->server, midQ);
681 if (midQ->resp_buf == NULL) { 747 if (rc != 0) {
682 cERROR(1, "No response for cmd %d mid %d", 748 send_nt_cancel(ses->server, in_buf, midQ);
683 midQ->command, midQ->mid); 749 spin_lock(&GlobalMid_Lock);
684 if (midQ->midState == MID_REQUEST_SUBMITTED) { 750 if (midQ->midState == MID_REQUEST_SUBMITTED) {
685 if (ses->server->tcpStatus == CifsExiting) 751 /* no longer considered to be "in-flight" */
686 rc = -EHOSTDOWN; 752 midQ->callback = DeleteMidQEntry;
687 else { 753 spin_unlock(&GlobalMid_Lock);
688 ses->server->tcpStatus = CifsNeedReconnect; 754 atomic_dec(&ses->server->inFlight);
689 midQ->midState = MID_RETRY_NEEDED; 755 wake_up(&ses->server->request_q);
690 } 756 return rc;
691 }
692
693 if (rc != -EHOSTDOWN) {
694 if (midQ->midState == MID_RETRY_NEEDED) {
695 rc = -EAGAIN;
696 cFYI(1, "marking request for retry");
697 } else {
698 rc = -EIO;
699 }
700 } 757 }
701 spin_unlock(&GlobalMid_Lock); 758 spin_unlock(&GlobalMid_Lock);
702 DeleteMidQEntry(midQ); 759 }
703 /* Update # of requests on wire to server */ 760
761 rc = sync_mid_result(midQ, ses->server);
762 if (rc != 0) {
704 atomic_dec(&ses->server->inFlight); 763 atomic_dec(&ses->server->inFlight);
705 wake_up(&ses->server->request_q); 764 wake_up(&ses->server->request_q);
706 return rc; 765 return rc;
707 } 766 }
708 767
709 spin_unlock(&GlobalMid_Lock);
710 receive_len = midQ->resp_buf->smb_buf_length; 768 receive_len = midQ->resp_buf->smb_buf_length;
711 769
712 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 770 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -731,7 +789,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 789 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
732 SECMODE_SIGN_ENABLED))) { 790 SECMODE_SIGN_ENABLED))) {
733 rc = cifs_verify_signature(out_buf, 791 rc = cifs_verify_signature(out_buf,
734 &ses->server->mac_signing_key, 792 ses->server,
735 midQ->sequence_number+1); 793 midQ->sequence_number+1);
736 if (rc) { 794 if (rc) {
737 cERROR(1, "Unexpected SMB signature"); 795 cERROR(1, "Unexpected SMB signature");
@@ -748,43 +806,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
748 if (receive_len >= sizeof(struct smb_hdr) - 4 806 if (receive_len >= sizeof(struct smb_hdr) - 4
749 /* do not count RFC1001 header */ + 807 /* do not count RFC1001 header */ +
750 (2 * out_buf->WordCount) + 2 /* bcc */ ) 808 (2 * out_buf->WordCount) + 2 /* bcc */ )
751 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 809 put_bcc(get_bcc_le(out_buf), out_buf);
752 } else { 810 } else {
753 rc = -EIO; 811 rc = -EIO;
754 cERROR(1, "Bad MID state?"); 812 cERROR(1, "Bad MID state?");
755 } 813 }
756 814
757out: 815out:
758 DeleteMidQEntry(midQ); 816 delete_mid(midQ);
759 atomic_dec(&ses->server->inFlight); 817 atomic_dec(&ses->server->inFlight);
760 wake_up(&ses->server->request_q); 818 wake_up(&ses->server->request_q);
761 819
762 return rc; 820 return rc;
763} 821}
764 822
765/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
766
767static int
768send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
769 struct mid_q_entry *midQ)
770{
771 int rc = 0;
772 struct cifsSesInfo *ses = tcon->ses;
773 __u16 mid = in_buf->Mid;
774
775 header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
776 in_buf->Mid = mid;
777 mutex_lock(&ses->server->srv_mutex);
778 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
779 if (rc) {
780 mutex_unlock(&ses->server->srv_mutex);
781 return rc;
782 }
783 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
784 mutex_unlock(&ses->server->srv_mutex);
785 return rc;
786}
787
788/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows 823/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
789 blocking lock to return. */ 824 blocking lock to return. */
790 825
@@ -807,7 +842,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
807 pSMB->hdr.Mid = GetNextMid(ses->server); 842 pSMB->hdr.Mid = GetNextMid(ses->server);
808 843
809 return SendReceive(xid, ses, in_buf, out_buf, 844 return SendReceive(xid, ses, in_buf, out_buf,
810 &bytes_returned, CIFS_STD_OP); 845 &bytes_returned, 0);
811} 846}
812 847
813int 848int
@@ -845,7 +880,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
845 return -EIO; 880 return -EIO;
846 } 881 }
847 882
848 rc = wait_for_free_request(ses, CIFS_BLOCKING_OP); 883 rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
849 if (rc) 884 if (rc)
850 return rc; 885 return rc;
851 886
@@ -863,7 +898,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
863 898
864 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); 899 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
865 if (rc) { 900 if (rc) {
866 DeleteMidQEntry(midQ); 901 delete_mid(midQ);
867 mutex_unlock(&ses->server->srv_mutex); 902 mutex_unlock(&ses->server->srv_mutex);
868 return rc; 903 return rc;
869 } 904 }
@@ -880,7 +915,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
880 mutex_unlock(&ses->server->srv_mutex); 915 mutex_unlock(&ses->server->srv_mutex);
881 916
882 if (rc < 0) { 917 if (rc < 0) {
883 DeleteMidQEntry(midQ); 918 delete_mid(midQ);
884 return rc; 919 return rc;
885 } 920 }
886 921
@@ -899,10 +934,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
899 if (in_buf->Command == SMB_COM_TRANSACTION2) { 934 if (in_buf->Command == SMB_COM_TRANSACTION2) {
900 /* POSIX lock. We send a NT_CANCEL SMB to cause the 935 /* POSIX lock. We send a NT_CANCEL SMB to cause the
901 blocking lock to return. */ 936 blocking lock to return. */
902 937 rc = send_nt_cancel(ses->server, in_buf, midQ);
903 rc = send_nt_cancel(tcon, in_buf, midQ);
904 if (rc) { 938 if (rc) {
905 DeleteMidQEntry(midQ); 939 delete_mid(midQ);
906 return rc; 940 return rc;
907 } 941 }
908 } else { 942 } else {
@@ -914,47 +948,33 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
914 /* If we get -ENOLCK back the lock may have 948 /* If we get -ENOLCK back the lock may have
915 already been removed. Don't exit in this case. */ 949 already been removed. Don't exit in this case. */
916 if (rc && rc != -ENOLCK) { 950 if (rc && rc != -ENOLCK) {
917 DeleteMidQEntry(midQ); 951 delete_mid(midQ);
918 return rc; 952 return rc;
919 } 953 }
920 } 954 }
921 955
922 /* Wait 5 seconds for the response. */ 956 rc = wait_for_response(ses->server, midQ);
923 if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) { 957 if (rc) {
924 /* We got the response - restart system call. */ 958 send_nt_cancel(ses->server, in_buf, midQ);
925 rstart = 1; 959 spin_lock(&GlobalMid_Lock);
926 } 960 if (midQ->midState == MID_REQUEST_SUBMITTED) {
927 } 961 /* no longer considered to be "in-flight" */
928 962 midQ->callback = DeleteMidQEntry;
929 spin_lock(&GlobalMid_Lock); 963 spin_unlock(&GlobalMid_Lock);
930 if (midQ->resp_buf) { 964 return rc;
931 spin_unlock(&GlobalMid_Lock);
932 receive_len = midQ->resp_buf->smb_buf_length;
933 } else {
934 cERROR(1, "No response for cmd %d mid %d",
935 midQ->command, midQ->mid);
936 if (midQ->midState == MID_REQUEST_SUBMITTED) {
937 if (ses->server->tcpStatus == CifsExiting)
938 rc = -EHOSTDOWN;
939 else {
940 ses->server->tcpStatus = CifsNeedReconnect;
941 midQ->midState = MID_RETRY_NEEDED;
942 } 965 }
966 spin_unlock(&GlobalMid_Lock);
943 } 967 }
944 968
945 if (rc != -EHOSTDOWN) { 969 /* We got the response - restart system call. */
946 if (midQ->midState == MID_RETRY_NEEDED) { 970 rstart = 1;
947 rc = -EAGAIN;
948 cFYI(1, "marking request for retry");
949 } else {
950 rc = -EIO;
951 }
952 }
953 spin_unlock(&GlobalMid_Lock);
954 DeleteMidQEntry(midQ);
955 return rc;
956 } 971 }
957 972
973 rc = sync_mid_result(midQ, ses->server);
974 if (rc != 0)
975 return rc;
976
977 receive_len = midQ->resp_buf->smb_buf_length;
958 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 978 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
959 cERROR(1, "Frame too large received. Length: %d Xid: %d", 979 cERROR(1, "Frame too large received. Length: %d Xid: %d",
960 receive_len, xid); 980 receive_len, xid);
@@ -981,7 +1001,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 1001 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
982 SECMODE_SIGN_ENABLED))) { 1002 SECMODE_SIGN_ENABLED))) {
983 rc = cifs_verify_signature(out_buf, 1003 rc = cifs_verify_signature(out_buf,
984 &ses->server->mac_signing_key, 1004 ses->server,
985 midQ->sequence_number+1); 1005 midQ->sequence_number+1);
986 if (rc) { 1006 if (rc) {
987 cERROR(1, "Unexpected SMB signature"); 1007 cERROR(1, "Unexpected SMB signature");
@@ -998,10 +1018,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
998 if (receive_len >= sizeof(struct smb_hdr) - 4 1018 if (receive_len >= sizeof(struct smb_hdr) - 4
999 /* do not count RFC1001 header */ + 1019 /* do not count RFC1001 header */ +
1000 (2 * out_buf->WordCount) + 2 /* bcc */ ) 1020 (2 * out_buf->WordCount) + 2 /* bcc */ )
1001 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 1021 put_bcc(get_bcc_le(out_buf), out_buf);
1002 1022
1003out: 1023out:
1004 DeleteMidQEntry(midQ); 1024 delete_mid(midQ);
1005 if (rstart && rc == -EACCES) 1025 if (rstart && rc == -EACCES)
1006 return -ERESTARTSYS; 1026 return -ERESTARTSYS;
1007 return rc; 1027 return rc;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a1509207bfa6..eae2a1491608 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -30,10 +30,11 @@
30 30
31#define MAX_EA_VALUE_SIZE 65535 31#define MAX_EA_VALUE_SIZE 65535
32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" 32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
33#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
33#define CIFS_XATTR_USER_PREFIX "user." 34#define CIFS_XATTR_USER_PREFIX "user."
34#define CIFS_XATTR_SYSTEM_PREFIX "system." 35#define CIFS_XATTR_SYSTEM_PREFIX "system."
35#define CIFS_XATTR_OS2_PREFIX "os2." 36#define CIFS_XATTR_OS2_PREFIX "os2."
36#define CIFS_XATTR_SECURITY_PREFIX ".security" 37#define CIFS_XATTR_SECURITY_PREFIX "security."
37#define CIFS_XATTR_TRUSTED_PREFIX "trusted." 38#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
38#define XATTR_TRUSTED_PREFIX_LEN 8 39#define XATTR_TRUSTED_PREFIX_LEN 8
39#define XATTR_SECURITY_PREFIX_LEN 9 40#define XATTR_SECURITY_PREFIX_LEN 9
@@ -47,9 +48,10 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
47#ifdef CONFIG_CIFS_XATTR 48#ifdef CONFIG_CIFS_XATTR
48 int xid; 49 int xid;
49 struct cifs_sb_info *cifs_sb; 50 struct cifs_sb_info *cifs_sb;
51 struct tcon_link *tlink;
50 struct cifsTconInfo *pTcon; 52 struct cifsTconInfo *pTcon;
51 struct super_block *sb; 53 struct super_block *sb;
52 char *full_path; 54 char *full_path = NULL;
53 55
54 if (direntry == NULL) 56 if (direntry == NULL)
55 return -EIO; 57 return -EIO;
@@ -58,16 +60,19 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
58 sb = direntry->d_inode->i_sb; 60 sb = direntry->d_inode->i_sb;
59 if (sb == NULL) 61 if (sb == NULL)
60 return -EIO; 62 return -EIO;
61 xid = GetXid();
62 63
63 cifs_sb = CIFS_SB(sb); 64 cifs_sb = CIFS_SB(sb);
64 pTcon = cifs_sb->tcon; 65 tlink = cifs_sb_tlink(cifs_sb);
66 if (IS_ERR(tlink))
67 return PTR_ERR(tlink);
68 pTcon = tlink_tcon(tlink);
69
70 xid = GetXid();
65 71
66 full_path = build_path_from_dentry(direntry); 72 full_path = build_path_from_dentry(direntry);
67 if (full_path == NULL) { 73 if (full_path == NULL) {
68 rc = -ENOMEM; 74 rc = -ENOMEM;
69 FreeXid(xid); 75 goto remove_ea_exit;
70 return rc;
71 } 76 }
72 if (ea_name == NULL) { 77 if (ea_name == NULL) {
73 cFYI(1, "Null xattr names not supported"); 78 cFYI(1, "Null xattr names not supported");
@@ -91,6 +96,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
91remove_ea_exit: 96remove_ea_exit:
92 kfree(full_path); 97 kfree(full_path);
93 FreeXid(xid); 98 FreeXid(xid);
99 cifs_put_tlink(tlink);
94#endif 100#endif
95 return rc; 101 return rc;
96} 102}
@@ -102,6 +108,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
102#ifdef CONFIG_CIFS_XATTR 108#ifdef CONFIG_CIFS_XATTR
103 int xid; 109 int xid;
104 struct cifs_sb_info *cifs_sb; 110 struct cifs_sb_info *cifs_sb;
111 struct tcon_link *tlink;
105 struct cifsTconInfo *pTcon; 112 struct cifsTconInfo *pTcon;
106 struct super_block *sb; 113 struct super_block *sb;
107 char *full_path; 114 char *full_path;
@@ -113,16 +120,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
113 sb = direntry->d_inode->i_sb; 120 sb = direntry->d_inode->i_sb;
114 if (sb == NULL) 121 if (sb == NULL)
115 return -EIO; 122 return -EIO;
116 xid = GetXid();
117 123
118 cifs_sb = CIFS_SB(sb); 124 cifs_sb = CIFS_SB(sb);
119 pTcon = cifs_sb->tcon; 125 tlink = cifs_sb_tlink(cifs_sb);
126 if (IS_ERR(tlink))
127 return PTR_ERR(tlink);
128 pTcon = tlink_tcon(tlink);
129
130 xid = GetXid();
120 131
121 full_path = build_path_from_dentry(direntry); 132 full_path = build_path_from_dentry(direntry);
122 if (full_path == NULL) { 133 if (full_path == NULL) {
123 rc = -ENOMEM; 134 rc = -ENOMEM;
124 FreeXid(xid); 135 goto set_ea_exit;
125 return rc;
126 } 136 }
127 /* return dos attributes as pseudo xattr */ 137 /* return dos attributes as pseudo xattr */
128 /* return alt name if available as pseudo attr */ 138 /* return alt name if available as pseudo attr */
@@ -132,9 +142,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
132 returns as xattrs */ 142 returns as xattrs */
133 if (value_size > MAX_EA_VALUE_SIZE) { 143 if (value_size > MAX_EA_VALUE_SIZE) {
134 cFYI(1, "size of EA value too large"); 144 cFYI(1, "size of EA value too large");
135 kfree(full_path); 145 rc = -EOPNOTSUPP;
136 FreeXid(xid); 146 goto set_ea_exit;
137 return -EOPNOTSUPP;
138 } 147 }
139 148
140 if (ea_name == NULL) { 149 if (ea_name == NULL) {
@@ -198,6 +207,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
198set_ea_exit: 207set_ea_exit:
199 kfree(full_path); 208 kfree(full_path);
200 FreeXid(xid); 209 FreeXid(xid);
210 cifs_put_tlink(tlink);
201#endif 211#endif
202 return rc; 212 return rc;
203} 213}
@@ -209,6 +219,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
209#ifdef CONFIG_CIFS_XATTR 219#ifdef CONFIG_CIFS_XATTR
210 int xid; 220 int xid;
211 struct cifs_sb_info *cifs_sb; 221 struct cifs_sb_info *cifs_sb;
222 struct tcon_link *tlink;
212 struct cifsTconInfo *pTcon; 223 struct cifsTconInfo *pTcon;
213 struct super_block *sb; 224 struct super_block *sb;
214 char *full_path; 225 char *full_path;
@@ -221,16 +232,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
221 if (sb == NULL) 232 if (sb == NULL)
222 return -EIO; 233 return -EIO;
223 234
224 xid = GetXid();
225
226 cifs_sb = CIFS_SB(sb); 235 cifs_sb = CIFS_SB(sb);
227 pTcon = cifs_sb->tcon; 236 tlink = cifs_sb_tlink(cifs_sb);
237 if (IS_ERR(tlink))
238 return PTR_ERR(tlink);
239 pTcon = tlink_tcon(tlink);
240
241 xid = GetXid();
228 242
229 full_path = build_path_from_dentry(direntry); 243 full_path = build_path_from_dentry(direntry);
230 if (full_path == NULL) { 244 if (full_path == NULL) {
231 rc = -ENOMEM; 245 rc = -ENOMEM;
232 FreeXid(xid); 246 goto get_ea_exit;
233 return rc;
234 } 247 }
235 /* return dos attributes as pseudo xattr */ 248 /* return dos attributes as pseudo xattr */
236 /* return alt name if available as pseudo attr */ 249 /* return alt name if available as pseudo attr */
@@ -265,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
265 cifs_sb->local_nls, 278 cifs_sb->local_nls,
266 cifs_sb->mnt_cifs_flags & 279 cifs_sb->mnt_cifs_flags &
267 CIFS_MOUNT_MAP_SPECIAL_CHR); 280 CIFS_MOUNT_MAP_SPECIAL_CHR);
268#ifdef CONFIG_CIFS_EXPERIMENTAL
269 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
270 __u16 fid;
271 int oplock = 0;
272 struct cifs_ntsd *pacl = NULL;
273 __u32 buflen = 0;
274 if (experimEnabled)
275 rc = CIFSSMBOpen(xid, pTcon, full_path,
276 FILE_OPEN, GENERIC_READ, 0, &fid,
277 &oplock, NULL, cifs_sb->local_nls,
278 cifs_sb->mnt_cifs_flags &
279 CIFS_MOUNT_MAP_SPECIAL_CHR);
280 /* else rc is EOPNOTSUPP from above */
281
282 if (rc == 0) {
283 rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
284 &buflen);
285 CIFSSMBClose(xid, pTcon, fid);
286 }
287 }
288#endif /* EXPERIMENTAL */
289#else 281#else
290 cFYI(1, "query POSIX ACL not supported yet"); 282 cFYI(1, "Query POSIX ACL not supported yet");
291#endif /* CONFIG_CIFS_POSIX */ 283#endif /* CONFIG_CIFS_POSIX */
292 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, 284 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
293 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 285 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -299,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
299 cifs_sb->mnt_cifs_flags & 291 cifs_sb->mnt_cifs_flags &
300 CIFS_MOUNT_MAP_SPECIAL_CHR); 292 CIFS_MOUNT_MAP_SPECIAL_CHR);
301#else 293#else
302 cFYI(1, "query POSIX default ACL not supported yet"); 294 cFYI(1, "Query POSIX default ACL not supported yet");
303#endif 295#endif /* CONFIG_CIFS_POSIX */
296 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
297 strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
298#ifdef CONFIG_CIFS_ACL
299 u32 acllen;
300 struct cifs_ntsd *pacl;
301
302 pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
303 full_path, &acllen);
304 if (IS_ERR(pacl)) {
305 rc = PTR_ERR(pacl);
306 cERROR(1, "%s: error %zd getting sec desc",
307 __func__, rc);
308 } else {
309 if (ea_value) {
310 if (acllen > buf_size)
311 acllen = -ERANGE;
312 else
313 memcpy(ea_value, pacl, acllen);
314 }
315 rc = acllen;
316 kfree(pacl);
317 }
318#else
319 cFYI(1, "Query CIFS ACL not supported yet");
320#endif /* CONFIG_CIFS_ACL */
304 } else if (strncmp(ea_name, 321 } else if (strncmp(ea_name,
305 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { 322 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
306 cFYI(1, "Trusted xattr namespace not supported yet"); 323 cFYI(1, "Trusted xattr namespace not supported yet");
@@ -323,6 +340,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
323get_ea_exit: 340get_ea_exit:
324 kfree(full_path); 341 kfree(full_path);
325 FreeXid(xid); 342 FreeXid(xid);
343 cifs_put_tlink(tlink);
326#endif 344#endif
327 return rc; 345 return rc;
328} 346}
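From userspace, the new "system.cifs_acl" handler above can be exercised with getxattr(2); a minimal sketch (the mount path and buffer size are assumptions). Since the handler returns -ERANGE when the descriptor does not fit, the usual pattern is to query the required length first by passing a zero-sized buffer:

	#include <sys/xattr.h>

	char buf[4096];		/* assumed large enough for the descriptor */
	ssize_t len = getxattr("/mnt/cifs/file", "system.cifs_acl",
			       buf, sizeof(buf));
	if (len < 0)
		perror("getxattr");	/* ERANGE => retry with a larger buffer */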
@@ -333,6 +351,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
333#ifdef CONFIG_CIFS_XATTR 351#ifdef CONFIG_CIFS_XATTR
334 int xid; 352 int xid;
335 struct cifs_sb_info *cifs_sb; 353 struct cifs_sb_info *cifs_sb;
354 struct tcon_link *tlink;
336 struct cifsTconInfo *pTcon; 355 struct cifsTconInfo *pTcon;
337 struct super_block *sb; 356 struct super_block *sb;
338 char *full_path; 357 char *full_path;
@@ -346,18 +365,20 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
346 return -EIO; 365 return -EIO;
347 366
348 cifs_sb = CIFS_SB(sb); 367 cifs_sb = CIFS_SB(sb);
349 pTcon = cifs_sb->tcon;
350
351 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 368 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
352 return -EOPNOTSUPP; 369 return -EOPNOTSUPP;
353 370
371 tlink = cifs_sb_tlink(cifs_sb);
372 if (IS_ERR(tlink))
373 return PTR_ERR(tlink);
374 pTcon = tlink_tcon(tlink);
375
354 xid = GetXid(); 376 xid = GetXid();
355 377
356 full_path = build_path_from_dentry(direntry); 378 full_path = build_path_from_dentry(direntry);
357 if (full_path == NULL) { 379 if (full_path == NULL) {
358 rc = -ENOMEM; 380 rc = -ENOMEM;
359 FreeXid(xid); 381 goto list_ea_exit;
360 return rc;
361 } 382 }
362 /* return dos attributes as pseudo xattr */ 383 /* return dos attributes as pseudo xattr */
363 /* return alt name if available as pseudo attr */ 384 /* return alt name if available as pseudo attr */
@@ -370,8 +391,10 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
370 cifs_sb->mnt_cifs_flags & 391 cifs_sb->mnt_cifs_flags &
371 CIFS_MOUNT_MAP_SPECIAL_CHR); 392 CIFS_MOUNT_MAP_SPECIAL_CHR);
372 393
394list_ea_exit:
373 kfree(full_path); 395 kfree(full_path);
374 FreeXid(xid); 396 FreeXid(xid);
397 cifs_put_tlink(tlink);
375#endif 398#endif
376 return rc; 399 return rc;
377} 400}
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index a5bf5771a22a..690157876184 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -17,12 +17,12 @@
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h>
20 21
21#include <linux/coda.h> 22#include <linux/coda.h>
22#include <linux/coda_linux.h>
23#include <linux/coda_psdev.h> 23#include <linux/coda_psdev.h>
24#include <linux/coda_fs_i.h> 24#include "coda_linux.h"
25#include <linux/coda_cache.h> 25#include "coda_cache.h"
26 26
27static atomic_t permission_epoch = ATOMIC_INIT(0); 27static atomic_t permission_epoch = ATOMIC_INIT(0);
28 28
@@ -31,19 +31,23 @@ void coda_cache_enter(struct inode *inode, int mask)
31{ 31{
32 struct coda_inode_info *cii = ITOC(inode); 32 struct coda_inode_info *cii = ITOC(inode);
33 33
34 spin_lock(&cii->c_lock);
34 cii->c_cached_epoch = atomic_read(&permission_epoch); 35 cii->c_cached_epoch = atomic_read(&permission_epoch);
35 if (cii->c_uid != current_fsuid()) { 36 if (cii->c_uid != current_fsuid()) {
36 cii->c_uid = current_fsuid(); 37 cii->c_uid = current_fsuid();
37 cii->c_cached_perm = mask; 38 cii->c_cached_perm = mask;
38 } else 39 } else
39 cii->c_cached_perm |= mask; 40 cii->c_cached_perm |= mask;
41 spin_unlock(&cii->c_lock);
40} 42}
41 43
42/* remove cached acl from an inode */ 44/* remove cached acl from an inode */
43void coda_cache_clear_inode(struct inode *inode) 45void coda_cache_clear_inode(struct inode *inode)
44{ 46{
45 struct coda_inode_info *cii = ITOC(inode); 47 struct coda_inode_info *cii = ITOC(inode);
48 spin_lock(&cii->c_lock);
46 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1; 49 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
50 spin_unlock(&cii->c_lock);
47} 51}
48 52
49/* remove all acl caches */ 53/* remove all acl caches */
@@ -57,13 +61,15 @@ void coda_cache_clear_all(struct super_block *sb)
57int coda_cache_check(struct inode *inode, int mask) 61int coda_cache_check(struct inode *inode, int mask)
58{ 62{
59 struct coda_inode_info *cii = ITOC(inode); 63 struct coda_inode_info *cii = ITOC(inode);
60 int hit; 64 int hit;
61 65
62 hit = (mask & cii->c_cached_perm) == mask && 66 spin_lock(&cii->c_lock);
63 cii->c_uid == current_fsuid() && 67 hit = (mask & cii->c_cached_perm) == mask &&
64 cii->c_cached_epoch == atomic_read(&permission_epoch); 68 cii->c_uid == current_fsuid() &&
69 cii->c_cached_epoch == atomic_read(&permission_epoch);
70 spin_unlock(&cii->c_lock);
65 71
66 return hit; 72 return hit;
67} 73}
68 74
69 75
@@ -86,7 +92,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
86 struct list_head *child; 92 struct list_head *child;
87 struct dentry *de; 93 struct dentry *de;
88 94
89 spin_lock(&dcache_lock); 95 spin_lock(&parent->d_lock);
90 list_for_each(child, &parent->d_subdirs) 96 list_for_each(child, &parent->d_subdirs)
91 { 97 {
92 de = list_entry(child, struct dentry, d_u.d_child); 98 de = list_entry(child, struct dentry, d_u.d_child);
@@ -95,7 +101,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
95 continue; 101 continue;
96 coda_flag_inode(de->d_inode, flag); 102 coda_flag_inode(de->d_inode, flag);
97 } 103 }
98 spin_unlock(&dcache_lock); 104 spin_unlock(&parent->d_lock);
99 return; 105 return;
100} 106}
101 107
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index a7a780929eec..6475877b0763 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -7,9 +7,8 @@
7#include <linux/time.h> 7#include <linux/time.h>
8 8
9#include <linux/coda.h> 9#include <linux/coda.h>
10#include <linux/coda_linux.h>
11#include <linux/coda_fs_i.h>
12#include <linux/coda_psdev.h> 10#include <linux/coda_psdev.h>
11#include "coda_linux.h"
13 12
14static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) 13static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
15{ 14{
@@ -45,13 +44,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
45static int coda_test_inode(struct inode *inode, void *data) 44static int coda_test_inode(struct inode *inode, void *data)
46{ 45{
47 struct CodaFid *fid = (struct CodaFid *)data; 46 struct CodaFid *fid = (struct CodaFid *)data;
48 return coda_fideq(&(ITOC(inode)->c_fid), fid); 47 struct coda_inode_info *cii = ITOC(inode);
48 return coda_fideq(&cii->c_fid, fid);
49} 49}
50 50
51static int coda_set_inode(struct inode *inode, void *data) 51static int coda_set_inode(struct inode *inode, void *data)
52{ 52{
53 struct CodaFid *fid = (struct CodaFid *)data; 53 struct CodaFid *fid = (struct CodaFid *)data;
54 ITOC(inode)->c_fid = *fid; 54 struct coda_inode_info *cii = ITOC(inode);
55 cii->c_fid = *fid;
55 return 0; 56 return 0;
56} 57}
57 58
@@ -71,6 +72,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
71 cii = ITOC(inode); 72 cii = ITOC(inode);
72 /* we still need to set i_ino for things like stat(2) */ 73 /* we still need to set i_ino for things like stat(2) */
73 inode->i_ino = hash; 74 inode->i_ino = hash;
75 /* inode is locked and unique, no need to grab cii->c_lock */
74 cii->c_mapcount = 0; 76 cii->c_mapcount = 0;
75 unlock_new_inode(inode); 77 unlock_new_inode(inode);
76 } 78 }
@@ -107,14 +109,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc
107} 109}
108 110
109 111
112/* Although we treat Coda file identifiers as immutable, there is one
113 * special case for files created during a disconnection where they may
114 * not be globally unique. When an identifier collision is detected we
115 * first try to flush the cached inode from the kernel and finally
116 * resort to renaming/rehashing in-place. Userspace remembers both old
117 * and new values of the identifier to handle any in-flight upcalls.
118 * The real solution is to use globally unique UUIDs as identifiers, but
119 * retrofitting the existing userspace code for this is non-trivial. */
110void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 120void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid,
111 struct CodaFid *newfid) 121 struct CodaFid *newfid)
112{ 122{
113 struct coda_inode_info *cii; 123 struct coda_inode_info *cii = ITOC(inode);
114 unsigned long hash = coda_f2i(newfid); 124 unsigned long hash = coda_f2i(newfid);
115 125
116 cii = ITOC(inode);
117
118 BUG_ON(!coda_fideq(&cii->c_fid, oldfid)); 126 BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
119 127
120 /* replace fid and rehash inode */ 128 /* replace fid and rehash inode */
diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h
new file mode 100644
index 000000000000..c910b5eb1ceb
--- /dev/null
+++ b/fs/coda/coda_cache.h
@@ -0,0 +1,22 @@
1/* Coda filesystem -- Linux Minicache
2 *
3 * Copyright (C) 1989 - 1997 Carnegie Mellon University
4 *
5 * Carnegie Mellon University encourages users of this software to
6 * contribute improvements to the Coda project. Contact Peter Braam
7 * <coda@cs.cmu.edu>
8 */
9
10#ifndef _CFSNC_HEADER_
11#define _CFSNC_HEADER_
12
13/* credential cache */
14void coda_cache_enter(struct inode *inode, int mask);
15void coda_cache_clear_inode(struct inode *);
16void coda_cache_clear_all(struct super_block *sb);
17int coda_cache_check(struct inode *inode, int mask);
18
19/* for downcalls and attributes and lookups */
20void coda_flag_inode_children(struct inode *inode, int flag);
21
22#endif /* _CFSNC_HEADER_ */
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
new file mode 100644
index 000000000000..e35071b1de0e
--- /dev/null
+++ b/fs/coda/coda_fs_i.h
@@ -0,0 +1,58 @@
1/*
2 * coda_fs_i.h
3 *
4 * Copyright (C) 1998 Carnegie Mellon University
5 *
6 */
7
8#ifndef _LINUX_CODA_FS_I
9#define _LINUX_CODA_FS_I
10
11#include <linux/types.h>
12#include <linux/list.h>
13#include <linux/spinlock.h>
14#include <linux/coda.h>
15
16/*
17 * coda fs inode data
18 * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and
19 * c_cached_perm.
20 * vfs_inode is set only when the inode is created and never changes.
21 * c_fid is set when the inode is created and should be considered immutable.
22 */
23struct coda_inode_info {
24 struct CodaFid c_fid; /* Coda identifier */
25 u_short c_flags; /* flags (see below) */
26 unsigned int c_mapcount; /* nr of times this inode is mapped */
27 unsigned int c_cached_epoch; /* epoch for cached permissions */
28 vuid_t c_uid; /* fsuid for cached permissions */
29 unsigned int c_cached_perm; /* cached access permissions */
30 spinlock_t c_lock;
31 struct inode vfs_inode;
32};
33
34/*
35 * coda fs file private data
36 */
37#define CODA_MAGIC 0xC0DAC0DA
38struct coda_file_info {
39 int cfi_magic; /* magic number */
40 struct file *cfi_container; /* container file for this cnode */
41 unsigned int cfi_mapcount; /* nr of times this file is mapped */
42};
43
44#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data))
45
46/* flags */
47#define C_VATTR 0x1 /* Validity of vattr in inode */
48#define C_FLUSH 0x2 /* used after a flush */
49#define C_DYING 0x4 /* from venus (which died) */
50#define C_PURGE 0x8
51
52int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
53struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
54int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
55struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
56void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
57
58#endif
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index bf4a3fd3c8e3..2bdbcc11b373 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -17,9 +17,8 @@
17#include <linux/string.h> 17#include <linux/string.h>
18 18
19#include <linux/coda.h> 19#include <linux/coda.h>
20#include <linux/coda_linux.h>
21#include <linux/coda_psdev.h> 20#include <linux/coda_psdev.h>
22#include <linux/coda_fs_i.h> 21#include "coda_linux.h"
23 22
24/* initialize the debugging variables */ 23/* initialize the debugging variables */
25int coda_fake_statfs; 24int coda_fake_statfs;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
new file mode 100644
index 000000000000..9b0c5323890b
--- /dev/null
+++ b/fs/coda/coda_linux.h
@@ -0,0 +1,101 @@
1/*
2 * Coda File System, Linux Kernel module
3 *
4 * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University
5 * Linux modifications (C) 1996, Peter J. Braam
6 * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University
7 *
8 * Carnegie Mellon University encourages users of this software to
9 * contribute improvements to the Coda project.
10 */
11
12#ifndef _LINUX_CODA_FS
13#define _LINUX_CODA_FS
14
15#include <linux/kernel.h>
16#include <linux/param.h>
17#include <linux/mm.h>
18#include <linux/vmalloc.h>
19#include <linux/slab.h>
20#include <linux/wait.h>
21#include <linux/types.h>
22#include <linux/fs.h>
23#include "coda_fs_i.h"
24
25/* operations */
26extern const struct inode_operations coda_dir_inode_operations;
27extern const struct inode_operations coda_file_inode_operations;
28extern const struct inode_operations coda_ioctl_inode_operations;
29
30extern const struct dentry_operations coda_dentry_operations;
31
32extern const struct address_space_operations coda_file_aops;
33extern const struct address_space_operations coda_symlink_aops;
34
35extern const struct file_operations coda_dir_operations;
36extern const struct file_operations coda_file_operations;
37extern const struct file_operations coda_ioctl_operations;
38
39/* operations shared over more than one file */
40int coda_open(struct inode *i, struct file *f);
41int coda_release(struct inode *i, struct file *f);
42int coda_permission(struct inode *inode, int mask, unsigned int flags);
43int coda_revalidate_inode(struct dentry *);
44int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
45int coda_setattr(struct dentry *, struct iattr *);
46
47/* this file: helpers */
48char *coda_f2s(struct CodaFid *f);
49int coda_isroot(struct inode *i);
50int coda_iscontrol(const char *name, size_t length);
51
52void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
53void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
54unsigned short coda_flags_to_cflags(unsigned short);
55
56/* sysctl.h */
57void coda_sysctl_init(void);
58void coda_sysctl_clean(void);
59
60#define CODA_ALLOC(ptr, cast, size) do { \
61 if (size < PAGE_SIZE) \
62 ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
63 else \
64 ptr = (cast)vmalloc((unsigned long) size); \
65 if (!ptr) \
66 printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
67 else memset( ptr, 0, size ); \
68} while (0)
69
70
71#define CODA_FREE(ptr,size) \
72 do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
73
74/* inode to cnode access functions */
75
76static inline struct coda_inode_info *ITOC(struct inode *inode)
77{
78 return list_entry(inode, struct coda_inode_info, vfs_inode);
79}
80
81static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
82{
83 return &(ITOC(inode)->c_fid);
84}
85
86static __inline__ char *coda_i2s(struct inode *inode)
87{
88 return coda_f2s(&(ITOC(inode)->c_fid));
89}
90
91/* this will not zap the inode away */
92static __inline__ void coda_flag_inode(struct inode *inode, int flag)
93{
94 struct coda_inode_info *cii = ITOC(inode);
95
96 spin_lock(&cii->c_lock);
97 cii->c_flags |= flag;
98 spin_unlock(&cii->c_lock);
99}
100
101#endif
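
The new ITOC() helper above recovers the Coda-private inode info from the embedded VFS inode via list_entry(), which is just the container_of() idiom. A minimal userspace sketch of the same pointer arithmetic (the types here are stand-ins, not the kernel structures):

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for the kernel types: struct inode is embedded inside the
 * filesystem's private info struct, exactly as vfs_inode is above. */
struct inode { unsigned long i_ino; };

struct coda_inode_info_sketch {
        unsigned int c_flags;
        struct inode vfs_inode;
};

/* Same arithmetic as ITOC(): step back from the member to its container. */
static struct coda_inode_info_sketch *itoc(struct inode *inode)
{
        return (struct coda_inode_info_sketch *)
                ((char *)inode - offsetof(struct coda_inode_info_sketch, vfs_inode));
}

int main(void)
{
        struct coda_inode_info_sketch cii = { .c_flags = 0x5 };

        printf("%#x\n", itoc(&cii.vfs_inode)->c_flags); /* prints 0x5 */
        return 0;
}
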
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ccd98b0f2b0b..2b8dae4d121e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -17,15 +17,15 @@
17#include <linux/stat.h> 17#include <linux/stat.h>
18#include <linux/errno.h> 18#include <linux/errno.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/smp_lock.h> 20#include <linux/spinlock.h>
21#include <linux/namei.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
24#include <linux/coda.h> 25#include <linux/coda.h>
25#include <linux/coda_linux.h>
26#include <linux/coda_psdev.h> 26#include <linux/coda_psdev.h>
27#include <linux/coda_fs_i.h> 27#include "coda_linux.h"
28#include <linux/coda_cache.h> 28#include "coda_cache.h"
29 29
30#include "coda_int.h" 30#include "coda_int.h"
31 31
@@ -47,7 +47,7 @@ static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
47 47
48/* dentry ops */ 48/* dentry ops */
49static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); 49static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd);
50static int coda_dentry_delete(struct dentry *); 50static int coda_dentry_delete(const struct dentry *);
51 51
52/* support routines */ 52/* support routines */
53static int coda_venus_readdir(struct file *coda_file, void *buf, 53static int coda_venus_readdir(struct file *coda_file, void *buf,
@@ -60,7 +60,7 @@ static int coda_return_EIO(void)
60} 60}
61#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) 61#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
62 62
63static const struct dentry_operations coda_dentry_operations = 63const struct dentry_operations coda_dentry_operations =
64{ 64{
65 .d_revalidate = coda_dentry_revalidate, 65 .d_revalidate = coda_dentry_revalidate,
66 .d_delete = coda_dentry_delete, 66 .d_delete = coda_dentry_delete,
@@ -116,21 +116,15 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
116 goto exit; 116 goto exit;
117 } 117 }
118 118
119 lock_kernel();
120
121 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, 119 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
122 &type, &resfid); 120 &type, &resfid);
123 if (!error) 121 if (!error)
124 error = coda_cnode_make(&inode, &resfid, dir->i_sb); 122 error = coda_cnode_make(&inode, &resfid, dir->i_sb);
125 123
126 unlock_kernel();
127
128 if (error && error != -ENOENT) 124 if (error && error != -ENOENT)
129 return ERR_PTR(error); 125 return ERR_PTR(error);
130 126
131exit: 127exit:
132 entry->d_op = &coda_dentry_operations;
133
134 if (inode && (type & CODA_NOCACHE)) 128 if (inode && (type & CODA_NOCACHE))
135 coda_flag_inode(inode, C_VATTR | C_PURGE); 129 coda_flag_inode(inode, C_VATTR | C_PURGE);
136 130
@@ -138,30 +132,29 @@ exit:
138} 132}
139 133
140 134
141int coda_permission(struct inode *inode, int mask) 135int coda_permission(struct inode *inode, int mask, unsigned int flags)
142{ 136{
143 int error = 0; 137 int error;
138
139 if (flags & IPERM_FLAG_RCU)
140 return -ECHILD;
144 141
145 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 142 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
146 143
147 if (!mask) 144 if (!mask)
148 return 0; 145 return 0;
149 146
150 if ((mask & MAY_EXEC) && !execute_ok(inode)) 147 if ((mask & MAY_EXEC) && !execute_ok(inode))
151 return -EACCES; 148 return -EACCES;
152 149
153 lock_kernel();
154
155 if (coda_cache_check(inode, mask)) 150 if (coda_cache_check(inode, mask))
156 goto out; 151 return 0;
157 152
158 error = venus_access(inode->i_sb, coda_i2f(inode), mask); 153 error = venus_access(inode->i_sb, coda_i2f(inode), mask);
159 154
160 if (!error) 155 if (!error)
161 coda_cache_enter(inode, mask); 156 coda_cache_enter(inode, mask);
162 157
163 out:
164 unlock_kernel();
165 return error; 158 return error;
166} 159}
167 160
@@ -200,41 +193,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
200/* creation routines: create, mknod, mkdir, link, symlink */ 193/* creation routines: create, mknod, mkdir, link, symlink */
201static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) 194static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
202{ 195{
203 int error=0; 196 int error;
204 const char *name=de->d_name.name; 197 const char *name=de->d_name.name;
205 int length=de->d_name.len; 198 int length=de->d_name.len;
206 struct inode *inode; 199 struct inode *inode;
207 struct CodaFid newfid; 200 struct CodaFid newfid;
208 struct coda_vattr attrs; 201 struct coda_vattr attrs;
209 202
210 lock_kernel(); 203 if (coda_isroot(dir) && coda_iscontrol(name, length))
211
212 if (coda_isroot(dir) && coda_iscontrol(name, length)) {
213 unlock_kernel();
214 return -EPERM; 204 return -EPERM;
215 }
216 205
217 error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 206 error = venus_create(dir->i_sb, coda_i2f(dir), name, length,
218 0, mode, &newfid, &attrs); 207 0, mode, &newfid, &attrs);
219 208 if (error)
220 if ( error ) { 209 goto err_out;
221 unlock_kernel();
222 d_drop(de);
223 return error;
224 }
225 210
226 inode = coda_iget(dir->i_sb, &newfid, &attrs); 211 inode = coda_iget(dir->i_sb, &newfid, &attrs);
227 if ( IS_ERR(inode) ) { 212 if (IS_ERR(inode)) {
228 unlock_kernel(); 213 error = PTR_ERR(inode);
229 d_drop(de); 214 goto err_out;
230 return PTR_ERR(inode);
231 } 215 }
232 216
233 /* invalidate the directory cnode's attributes */ 217 /* invalidate the directory cnode's attributes */
234 coda_dir_update_mtime(dir); 218 coda_dir_update_mtime(dir);
235 unlock_kernel();
236 d_instantiate(de, inode); 219 d_instantiate(de, inode);
237 return 0; 220 return 0;
221err_out:
222 d_drop(de);
223 return error;
238} 224}
239 225
240static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) 226static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
@@ -246,36 +232,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
246 int error; 232 int error;
247 struct CodaFid newfid; 233 struct CodaFid newfid;
248 234
249 lock_kernel(); 235 if (coda_isroot(dir) && coda_iscontrol(name, len))
250
251 if (coda_isroot(dir) && coda_iscontrol(name, len)) {
252 unlock_kernel();
253 return -EPERM; 236 return -EPERM;
254 }
255 237
256 attrs.va_mode = mode; 238 attrs.va_mode = mode;
257 error = venus_mkdir(dir->i_sb, coda_i2f(dir), 239 error = venus_mkdir(dir->i_sb, coda_i2f(dir),
258 name, len, &newfid, &attrs); 240 name, len, &newfid, &attrs);
259 241 if (error)
260 if ( error ) { 242 goto err_out;
261 unlock_kernel();
262 d_drop(de);
263 return error;
264 }
265 243
266 inode = coda_iget(dir->i_sb, &newfid, &attrs); 244 inode = coda_iget(dir->i_sb, &newfid, &attrs);
267 if ( IS_ERR(inode) ) { 245 if (IS_ERR(inode)) {
268 unlock_kernel(); 246 error = PTR_ERR(inode);
269 d_drop(de); 247 goto err_out;
270 return PTR_ERR(inode);
271 } 248 }
272 249
273 /* invalidate the directory cnode's attributes */ 250 /* invalidate the directory cnode's attributes */
274 coda_dir_inc_nlink(dir); 251 coda_dir_inc_nlink(dir);
275 coda_dir_update_mtime(dir); 252 coda_dir_update_mtime(dir);
276 unlock_kernel();
277 d_instantiate(de, inode); 253 d_instantiate(de, inode);
278 return 0; 254 return 0;
255err_out:
256 d_drop(de);
257 return error;
279} 258}
280 259
281/* try to make de an entry in dir_inode linked to source_de */ 260/* try to make de an entry in dir_inode linked to source_de */
@@ -287,52 +266,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
287 int len = de->d_name.len; 266 int len = de->d_name.len;
288 int error; 267 int error;
289 268
290 lock_kernel(); 269 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
291
292 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
293 unlock_kernel();
294 return -EPERM; 270 return -EPERM;
295 }
296 271
297 error = venus_link(dir_inode->i_sb, coda_i2f(inode), 272 error = venus_link(dir_inode->i_sb, coda_i2f(inode),
298 coda_i2f(dir_inode), (const char *)name, len); 273 coda_i2f(dir_inode), (const char *)name, len);
299
300 if (error) { 274 if (error) {
301 d_drop(de); 275 d_drop(de);
302 goto out; 276 return error;
303 } 277 }
304 278
305 coda_dir_update_mtime(dir_inode); 279 coda_dir_update_mtime(dir_inode);
306 atomic_inc(&inode->i_count); 280 ihold(inode);
307 d_instantiate(de, inode); 281 d_instantiate(de, inode);
308 inc_nlink(inode); 282 inc_nlink(inode);
309 283 return 0;
310out:
311 unlock_kernel();
312 return(error);
313} 284}
314 285
315 286
316static int coda_symlink(struct inode *dir_inode, struct dentry *de, 287static int coda_symlink(struct inode *dir_inode, struct dentry *de,
317 const char *symname) 288 const char *symname)
318{ 289{
319 const char *name = de->d_name.name; 290 const char *name = de->d_name.name;
320 int len = de->d_name.len; 291 int len = de->d_name.len;
321 int symlen; 292 int symlen;
322 int error = 0; 293 int error;
323
324 lock_kernel();
325 294
326 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) { 295 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
327 unlock_kernel();
328 return -EPERM; 296 return -EPERM;
329 }
330 297
331 symlen = strlen(symname); 298 symlen = strlen(symname);
332 if ( symlen > CODA_MAXPATHLEN ) { 299 if (symlen > CODA_MAXPATHLEN)
333 unlock_kernel();
334 return -ENAMETOOLONG; 300 return -ENAMETOOLONG;
335 }
336 301
337 /* 302 /*
338 * This entry is now negative. Since we do not create 303 * This entry is now negative. Since we do not create
@@ -343,10 +308,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
343 symname, symlen); 308 symname, symlen);
344 309
345 /* mtime is no good anymore */ 310 /* mtime is no good anymore */
346 if ( !error ) 311 if (!error)
347 coda_dir_update_mtime(dir_inode); 312 coda_dir_update_mtime(dir_inode);
348 313
349 unlock_kernel();
350 return error; 314 return error;
351} 315}
352 316
@@ -357,17 +321,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
357 const char *name = de->d_name.name; 321 const char *name = de->d_name.name;
358 int len = de->d_name.len; 322 int len = de->d_name.len;
359 323
360 lock_kernel();
361
362 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len); 324 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
363 if ( error ) { 325 if (error)
364 unlock_kernel();
365 return error; 326 return error;
366 }
367 327
368 coda_dir_update_mtime(dir); 328 coda_dir_update_mtime(dir);
369 drop_nlink(de->d_inode); 329 drop_nlink(de->d_inode);
370 unlock_kernel();
371 return 0; 330 return 0;
372} 331}
373 332
@@ -377,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
377 int len = de->d_name.len; 336 int len = de->d_name.len;
378 int error; 337 int error;
379 338
380 lock_kernel();
381
382 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); 339 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
383 if (!error) { 340 if (!error) {
384 /* VFS may delete the child */ 341 /* VFS may delete the child */
@@ -389,7 +346,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
389 coda_dir_drop_nlink(dir); 346 coda_dir_drop_nlink(dir);
390 coda_dir_update_mtime(dir); 347 coda_dir_update_mtime(dir);
391 } 348 }
392 unlock_kernel();
393 return error; 349 return error;
394} 350}
395 351
@@ -403,15 +359,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
403 int new_length = new_dentry->d_name.len; 359 int new_length = new_dentry->d_name.len;
404 int error; 360 int error;
405 361
406 lock_kernel();
407
408 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), 362 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
409 coda_i2f(new_dir), old_length, new_length, 363 coda_i2f(new_dir), old_length, new_length,
410 (const char *) old_name, (const char *)new_name); 364 (const char *) old_name, (const char *)new_name);
411 365 if (!error) {
412 if ( !error ) { 366 if (new_dentry->d_inode) {
413 if ( new_dentry->d_inode ) { 367 if (S_ISDIR(new_dentry->d_inode->i_mode)) {
414 if ( S_ISDIR(new_dentry->d_inode->i_mode) ) {
415 coda_dir_drop_nlink(old_dir); 368 coda_dir_drop_nlink(old_dir);
416 coda_dir_inc_nlink(new_dir); 369 coda_dir_inc_nlink(new_dir);
417 } 370 }
@@ -423,8 +376,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
423 coda_flag_inode(new_dir, C_VATTR); 376 coda_flag_inode(new_dir, C_VATTR);
424 } 377 }
425 } 378 }
426 unlock_kernel();
427
428 return error; 379 return error;
429} 380}
430 381
@@ -591,13 +542,14 @@ out:
591/* called when a cache lookup succeeds */ 542/* called when a cache lookup succeeds */
592static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) 543static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
593{ 544{
594 struct inode *inode = de->d_inode; 545 struct inode *inode;
595 struct coda_inode_info *cii; 546 struct coda_inode_info *cii;
596 547
597 if (!inode) 548 if (nd->flags & LOOKUP_RCU)
598 return 1; 549 return -ECHILD;
599 lock_kernel(); 550
600 if (coda_isroot(inode)) 551 inode = de->d_inode;
552 if (!inode || coda_isroot(inode))
601 goto out; 553 goto out;
602 if (is_bad_inode(inode)) 554 if (is_bad_inode(inode))
603 goto bad; 555 goto bad;
@@ -612,18 +564,17 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
612 if (cii->c_flags & C_FLUSH) 564 if (cii->c_flags & C_FLUSH)
613 coda_flag_inode_children(inode, C_FLUSH); 565 coda_flag_inode_children(inode, C_FLUSH);
614 566
615 if (atomic_read(&de->d_count) > 1) 567 if (de->d_count > 1)
616 /* pretend it's valid, but don't change the flags */ 568 /* pretend it's valid, but don't change the flags */
617 goto out; 569 goto out;
618 570
619 /* clear the flags. */ 571 /* clear the flags. */
572 spin_lock(&cii->c_lock);
620 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 573 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
621 574 spin_unlock(&cii->c_lock);
622bad: 575bad:
623 unlock_kernel();
624 return 0; 576 return 0;
625out: 577out:
626 unlock_kernel();
627 return 1; 578 return 1;
628} 579}
629 580
@@ -631,7 +582,7 @@ out:
631 * This is the callback from dput() when d_count is going to 0. 582 * This is the callback from dput() when d_count is going to 0.
632 * We use this to unhash dentries with bad inodes. 583 * We use this to unhash dentries with bad inodes.
633 */ 584 */
634static int coda_dentry_delete(struct dentry * dentry) 585static int coda_dentry_delete(const struct dentry * dentry)
635{ 586{
636 int flags; 587 int flags;
637 588
@@ -656,20 +607,19 @@ static int coda_dentry_delete(struct dentry * dentry)
656int coda_revalidate_inode(struct dentry *dentry) 607int coda_revalidate_inode(struct dentry *dentry)
657{ 608{
658 struct coda_vattr attr; 609 struct coda_vattr attr;
659 int error = 0; 610 int error;
660 int old_mode; 611 int old_mode;
661 ino_t old_ino; 612 ino_t old_ino;
662 struct inode *inode = dentry->d_inode; 613 struct inode *inode = dentry->d_inode;
663 struct coda_inode_info *cii = ITOC(inode); 614 struct coda_inode_info *cii = ITOC(inode);
664 615
665 lock_kernel(); 616 if (!cii->c_flags)
666 if ( !cii->c_flags ) 617 return 0;
667 goto ok;
668 618
669 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) { 619 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
670 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr); 620 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
671 if ( error ) 621 if (error)
672 goto return_bad; 622 return -EIO;
673 623
674 /* this inode may be lost if: 624 /* this inode may be lost if:
675 - its ino changed 625 - its ino changed
@@ -688,17 +638,13 @@ int coda_revalidate_inode(struct dentry *dentry)
688 /* the following can happen when a local fid is replaced 638 /* the following can happen when a local fid is replaced
689 with a global one, here we lose and declare the inode bad */ 639 with a global one, here we lose and declare the inode bad */
690 if (inode->i_ino != old_ino) 640 if (inode->i_ino != old_ino)
691 goto return_bad; 641 return -EIO;
692 642
693 coda_flag_inode_children(inode, C_FLUSH); 643 coda_flag_inode_children(inode, C_FLUSH);
644
645 spin_lock(&cii->c_lock);
694 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 646 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
647 spin_unlock(&cii->c_lock);
695 } 648 }
696
697ok:
698 unlock_kernel();
699 return 0; 649 return 0;
700
701return_bad:
702 unlock_kernel();
703 return -EIO;
704} 650}
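
Both coda_permission() and coda_dentry_revalidate() now begin by refusing RCU-walk mode. Returning -ECHILD makes the VFS retry the lookup in ref-walk mode, where sleeping operations such as the Venus upcalls are legal. A schematic of the pattern, with stand-in types and an illustrative LOOKUP_RCU value:

#include <errno.h>

#define LOOKUP_RCU 0x40                 /* illustrative value only */

struct nameidata_sketch { unsigned int flags; };

/* RCU-walk holds no references and may not sleep; any operation that
 * needs a mutex or an upcall must punt back to ref-walk. */
static int revalidate_sketch(struct nameidata_sketch *nd)
{
        if (nd->flags & LOOKUP_RCU)
                return -ECHILD;         /* VFS retries in ref-walk mode */

        /* slow path: may block, take locks, talk to Venus, ... */
        return 1;                       /* dentry is valid */
}
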
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ad3cd2abeeb4..0433057be330 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -15,16 +15,15 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/cred.h> 16#include <linux/cred.h>
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/spinlock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23#include <linux/coda.h> 23#include <linux/coda.h>
24#include <linux/coda_linux.h>
25#include <linux/coda_fs_i.h>
26#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
27 25
26#include "coda_linux.h"
28#include "coda_int.h" 27#include "coda_int.h"
29 28
30static ssize_t 29static ssize_t
@@ -109,19 +108,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
109 108
110 coda_inode = coda_file->f_path.dentry->d_inode; 109 coda_inode = coda_file->f_path.dentry->d_inode;
111 host_inode = host_file->f_path.dentry->d_inode; 110 host_inode = host_file->f_path.dentry->d_inode;
111
112 cii = ITOC(coda_inode);
113 spin_lock(&cii->c_lock);
112 coda_file->f_mapping = host_file->f_mapping; 114 coda_file->f_mapping = host_file->f_mapping;
113 if (coda_inode->i_mapping == &coda_inode->i_data) 115 if (coda_inode->i_mapping == &coda_inode->i_data)
114 coda_inode->i_mapping = host_inode->i_mapping; 116 coda_inode->i_mapping = host_inode->i_mapping;
115 117
116 /* only allow additional mmaps as long as userspace isn't changing 118 /* only allow additional mmaps as long as userspace isn't changing
117 * the container file on us! */ 119 * the container file on us! */
118 else if (coda_inode->i_mapping != host_inode->i_mapping) 120 else if (coda_inode->i_mapping != host_inode->i_mapping) {
121 spin_unlock(&cii->c_lock);
119 return -EBUSY; 122 return -EBUSY;
123 }
120 124
121 /* keep track of how often the coda_inode/host_file has been mmapped */ 125 /* keep track of how often the coda_inode/host_file has been mmapped */
122 cii = ITOC(coda_inode);
123 cii->c_mapcount++; 126 cii->c_mapcount++;
124 cfi->cfi_mapcount++; 127 cfi->cfi_mapcount++;
128 spin_unlock(&cii->c_lock);
125 129
126 return host_file->f_op->mmap(host_file, vma); 130 return host_file->f_op->mmap(host_file, vma);
127} 131}
@@ -138,8 +142,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
138 if (!cfi) 142 if (!cfi)
139 return -ENOMEM; 143 return -ENOMEM;
140 144
141 lock_kernel();
142
143 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, 145 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
144 &host_file); 146 &host_file);
145 if (!host_file) 147 if (!host_file)
@@ -147,7 +149,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
147 149
148 if (error) { 150 if (error) {
149 kfree(cfi); 151 kfree(cfi);
150 unlock_kernel();
151 return error; 152 return error;
152 } 153 }
153 154
@@ -159,8 +160,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
159 160
160 BUG_ON(coda_file->private_data != NULL); 161 BUG_ON(coda_file->private_data != NULL);
161 coda_file->private_data = cfi; 162 coda_file->private_data = cfi;
162
163 unlock_kernel();
164 return 0; 163 return 0;
165} 164}
166 165
@@ -171,9 +170,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
171 struct coda_file_info *cfi; 170 struct coda_file_info *cfi;
172 struct coda_inode_info *cii; 171 struct coda_inode_info *cii;
173 struct inode *host_inode; 172 struct inode *host_inode;
174 int err = 0; 173 int err;
175
176 lock_kernel();
177 174
178 cfi = CODA_FTOC(coda_file); 175 cfi = CODA_FTOC(coda_file);
179 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 176 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -185,18 +182,18 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
185 cii = ITOC(coda_inode); 182 cii = ITOC(coda_inode);
186 183
187 /* did we mmap this file? */ 184 /* did we mmap this file? */
185 spin_lock(&cii->c_lock);
188 if (coda_inode->i_mapping == &host_inode->i_data) { 186 if (coda_inode->i_mapping == &host_inode->i_data) {
189 cii->c_mapcount -= cfi->cfi_mapcount; 187 cii->c_mapcount -= cfi->cfi_mapcount;
190 if (!cii->c_mapcount) 188 if (!cii->c_mapcount)
191 coda_inode->i_mapping = &coda_inode->i_data; 189 coda_inode->i_mapping = &coda_inode->i_data;
192 } 190 }
191 spin_unlock(&cii->c_lock);
193 192
194 fput(cfi->cfi_container); 193 fput(cfi->cfi_container);
195 kfree(coda_file->private_data); 194 kfree(coda_file->private_data);
196 coda_file->private_data = NULL; 195 coda_file->private_data = NULL;
197 196
198 unlock_kernel();
199
200 /* VFS fput ignores the return value from file_operations->release, so 197 /* VFS fput ignores the return value from file_operations->release, so
201 * there is no use returning an error here */ 198 * there is no use returning an error here */
202 return 0; 199 return 0;
@@ -207,7 +204,7 @@ int coda_fsync(struct file *coda_file, int datasync)
207 struct file *host_file; 204 struct file *host_file;
208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode; 205 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
209 struct coda_file_info *cfi; 206 struct coda_file_info *cfi;
210 int err = 0; 207 int err;
211 208
212 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) || 209 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
213 S_ISLNK(coda_inode->i_mode))) 210 S_ISLNK(coda_inode->i_mode)))
@@ -218,11 +215,8 @@ int coda_fsync(struct file *coda_file, int datasync)
218 host_file = cfi->cfi_container; 215 host_file = cfi->cfi_container;
219 216
220 err = vfs_fsync(host_file, datasync); 217 err = vfs_fsync(host_file, datasync);
221 if ( !err && !datasync ) { 218 if (!err && !datasync)
222 lock_kernel();
223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 219 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
224 unlock_kernel();
225 }
226 220
227 return err; 221 return err;
228} 222}
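
In coda_file_mmap() the mapping comparison and both map counters now sit inside a single c_lock section, so a racing coda_release() cannot observe the check succeeding while the counts are not yet bumped. A compressed pthread rendering of that critical section (a mutex stands in for the spinlock; the names mirror the diff but the types are invented here):

#include <errno.h>
#include <pthread.h>

struct cii_sketch {
        pthread_mutex_t c_lock;         /* spin_lock in the kernel */
        int c_mapcount;
};

/* Check-and-count must be one atomic step; splitting them re-opens the
 * race the patch closes. */
static int mmap_account(struct cii_sketch *cii, int *cfi_mapcount,
                        int mapping_matches)
{
        pthread_mutex_lock(&cii->c_lock);
        if (!mapping_matches) {
                pthread_mutex_unlock(&cii->c_lock);
                return -EBUSY;
        }
        cii->c_mapcount++;
        (*cfi_mapcount)++;
        pthread_mutex_unlock(&cii->c_lock);
        return 0;
}
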
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6526e6f21ecf..871b27715465 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -15,7 +15,8 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/unistd.h> 17#include <linux/unistd.h>
18#include <linux/smp_lock.h> 18#include <linux/mutex.h>
19#include <linux/spinlock.h>
19#include <linux/file.h> 20#include <linux/file.h>
20#include <linux/vfs.h> 21#include <linux/vfs.h>
21#include <linux/slab.h> 22#include <linux/slab.h>
@@ -27,10 +28,9 @@
27#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
28 29
29#include <linux/coda.h> 30#include <linux/coda.h>
30#include <linux/coda_linux.h>
31#include <linux/coda_psdev.h> 31#include <linux/coda_psdev.h>
32#include <linux/coda_fs_i.h> 32#include "coda_linux.h"
33#include <linux/coda_cache.h> 33#include "coda_cache.h"
34 34
35#include "coda_int.h" 35#include "coda_int.h"
36 36
@@ -44,21 +44,29 @@ static struct kmem_cache * coda_inode_cachep;
44static struct inode *coda_alloc_inode(struct super_block *sb) 44static struct inode *coda_alloc_inode(struct super_block *sb)
45{ 45{
46 struct coda_inode_info *ei; 46 struct coda_inode_info *ei;
47 ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); 47 ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
48 if (!ei) 48 if (!ei)
49 return NULL; 49 return NULL;
50 memset(&ei->c_fid, 0, sizeof(struct CodaFid)); 50 memset(&ei->c_fid, 0, sizeof(struct CodaFid));
51 ei->c_flags = 0; 51 ei->c_flags = 0;
52 ei->c_uid = 0; 52 ei->c_uid = 0;
53 ei->c_cached_perm = 0; 53 ei->c_cached_perm = 0;
54 spin_lock_init(&ei->c_lock);
54 return &ei->vfs_inode; 55 return &ei->vfs_inode;
55} 56}
56 57
57static void coda_destroy_inode(struct inode *inode) 58static void coda_i_callback(struct rcu_head *head)
58{ 59{
60 struct inode *inode = container_of(head, struct inode, i_rcu);
61 INIT_LIST_HEAD(&inode->i_dentry);
59 kmem_cache_free(coda_inode_cachep, ITOC(inode)); 62 kmem_cache_free(coda_inode_cachep, ITOC(inode));
60} 63}
61 64
65static void coda_destroy_inode(struct inode *inode)
66{
67 call_rcu(&inode->i_rcu, coda_i_callback);
68}
69
62static void init_once(void *foo) 70static void init_once(void *foo)
63{ 71{
64 struct coda_inode_info *ei = (struct coda_inode_info *) foo; 72 struct coda_inode_info *ei = (struct coda_inode_info *) foo;
@@ -143,7 +151,7 @@ static int get_device_index(struct coda_mount_data *data)
143static int coda_fill_super(struct super_block *sb, void *data, int silent) 151static int coda_fill_super(struct super_block *sb, void *data, int silent)
144{ 152{
145 struct inode *root = NULL; 153 struct inode *root = NULL;
146 struct venus_comm *vc = NULL; 154 struct venus_comm *vc;
147 struct CodaFid fid; 155 struct CodaFid fid;
148 int error; 156 int error;
149 int idx; 157 int idx;
@@ -157,21 +165,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
157 printk(KERN_INFO "coda_read_super: device index: %i\n", idx); 165 printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
158 166
159 vc = &coda_comms[idx]; 167 vc = &coda_comms[idx];
168 mutex_lock(&vc->vc_mutex);
169
160 if (!vc->vc_inuse) { 170 if (!vc->vc_inuse) {
161 printk("coda_read_super: No pseudo device\n"); 171 printk("coda_read_super: No pseudo device\n");
162 return -EINVAL; 172 error = -EINVAL;
173 goto unlock_out;
163 } 174 }
164 175
165 if ( vc->vc_sb ) { 176 if (vc->vc_sb) {
166 printk("coda_read_super: Device already mounted\n"); 177 printk("coda_read_super: Device already mounted\n");
167 return -EBUSY; 178 error = -EBUSY;
179 goto unlock_out;
168 } 180 }
169 181
170 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); 182 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
171 if (error) 183 if (error)
172 goto bdi_err; 184 goto unlock_out;
173 185
174 vc->vc_sb = sb; 186 vc->vc_sb = sb;
187 mutex_unlock(&vc->vc_mutex);
175 188
176 sb->s_fs_info = vc; 189 sb->s_fs_info = vc;
177 sb->s_flags |= MS_NOATIME; 190 sb->s_flags |= MS_NOATIME;
@@ -179,6 +192,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
179 sb->s_blocksize_bits = 12; 192 sb->s_blocksize_bits = 12;
180 sb->s_magic = CODA_SUPER_MAGIC; 193 sb->s_magic = CODA_SUPER_MAGIC;
181 sb->s_op = &coda_super_operations; 194 sb->s_op = &coda_super_operations;
195 sb->s_d_op = &coda_dentry_operations;
182 sb->s_bdi = &vc->bdi; 196 sb->s_bdi = &vc->bdi;
183 197
184 /* get root fid from Venus: this needs the root inode */ 198 /* get root fid from Venus: this needs the root inode */
@@ -200,26 +214,33 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
200 printk("coda_read_super: rootinode is %ld dev %s\n", 214 printk("coda_read_super: rootinode is %ld dev %s\n",
201 root->i_ino, root->i_sb->s_id); 215 root->i_ino, root->i_sb->s_id);
202 sb->s_root = d_alloc_root(root); 216 sb->s_root = d_alloc_root(root);
203 if (!sb->s_root) 217 if (!sb->s_root) {
218 error = -EINVAL;
204 goto error; 219 goto error;
205 return 0; 220 }
221 return 0;
206 222
207 error: 223error:
208 bdi_destroy(&vc->bdi);
209 bdi_err:
210 if (root) 224 if (root)
211 iput(root); 225 iput(root);
212 if (vc)
213 vc->vc_sb = NULL;
214 226
215 return -EINVAL; 227 mutex_lock(&vc->vc_mutex);
228 bdi_destroy(&vc->bdi);
229 vc->vc_sb = NULL;
230 sb->s_fs_info = NULL;
231unlock_out:
232 mutex_unlock(&vc->vc_mutex);
233 return error;
216} 234}
217 235
218static void coda_put_super(struct super_block *sb) 236static void coda_put_super(struct super_block *sb)
219{ 237{
220 bdi_destroy(&coda_vcp(sb)->bdi); 238 struct venus_comm *vcp = coda_vcp(sb);
221 coda_vcp(sb)->vc_sb = NULL; 239 mutex_lock(&vcp->vc_mutex);
240 bdi_destroy(&vcp->bdi);
241 vcp->vc_sb = NULL;
222 sb->s_fs_info = NULL; 242 sb->s_fs_info = NULL;
243 mutex_unlock(&vcp->vc_mutex);
223 244
224 printk("Coda: Bye bye.\n"); 245 printk("Coda: Bye bye.\n");
225} 246}
@@ -245,8 +266,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
245 struct coda_vattr vattr; 266 struct coda_vattr vattr;
246 int error; 267 int error;
247 268
248 lock_kernel();
249
250 memset(&vattr, 0, sizeof(vattr)); 269 memset(&vattr, 0, sizeof(vattr));
251 270
252 inode->i_ctime = CURRENT_TIME_SEC; 271 inode->i_ctime = CURRENT_TIME_SEC;
@@ -256,13 +275,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
256 /* Venus is responsible for truncating the container-file!!! */ 275 /* Venus is responsible for truncating the container-file!!! */
257 error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr); 276 error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
258 277
259 if ( !error ) { 278 if (!error) {
260 coda_vattr_to_iattr(inode, &vattr); 279 coda_vattr_to_iattr(inode, &vattr);
261 coda_cache_clear_inode(inode); 280 coda_cache_clear_inode(inode);
262 } 281 }
263
264 unlock_kernel();
265
266 return error; 282 return error;
267} 283}
268 284
@@ -276,12 +292,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
276{ 292{
277 int error; 293 int error;
278 294
279 lock_kernel();
280
281 error = venus_statfs(dentry, buf); 295 error = venus_statfs(dentry, buf);
282 296
283 unlock_kernel();
284
285 if (error) { 297 if (error) {
286 /* fake something like AFS does */ 298 /* fake something like AFS does */
287 buf->f_blocks = 9000000; 299 buf->f_blocks = 9000000;
@@ -301,16 +313,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
301 313
302/* init_coda: used by filesystems.c to register coda */ 314/* init_coda: used by filesystems.c to register coda */
303 315
304static int coda_get_sb(struct file_system_type *fs_type, 316static struct dentry *coda_mount(struct file_system_type *fs_type,
305 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 317 int flags, const char *dev_name, void *data)
306{ 318{
307 return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt); 319 return mount_nodev(fs_type, flags, data, coda_fill_super);
308} 320}
309 321
310struct file_system_type coda_fs_type = { 322struct file_system_type coda_fs_type = {
311 .owner = THIS_MODULE, 323 .owner = THIS_MODULE,
312 .name = "coda", 324 .name = "coda",
313 .get_sb = coda_get_sb, 325 .mount = coda_mount,
314 .kill_sb = kill_anon_super, 326 .kill_sb = kill_anon_super,
315 .fs_flags = FS_BINARY_MOUNTDATA, 327 .fs_flags = FS_BINARY_MOUNTDATA,
316}; 328};
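
coda_fill_super() and coda_put_super() now serialize on vc_mutex, turning "is the device in use / already mounted / claim it" into one atomic decision instead of three checks that used to rely on the BKL. The shape of that claim as a pthread sketch (field names follow the diff; the types are stand-ins):

#include <errno.h>
#include <pthread.h>

struct venus_comm_sketch {
        pthread_mutex_t vc_mutex;
        int vc_inuse;
        void *vc_sb;                    /* non-NULL while mounted */
};

static int claim_device(struct venus_comm_sketch *vc, void *sb)
{
        int error = 0;

        pthread_mutex_lock(&vc->vc_mutex);
        if (!vc->vc_inuse)
                error = -EINVAL;        /* no pseudo device */
        else if (vc->vc_sb)
                error = -EBUSY;         /* already mounted */
        else
                vc->vc_sb = sb;         /* claim it while still locked */
        pthread_mutex_unlock(&vc->vc_mutex);
        return error;
}
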
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ca25d96d45c9..6cbb3afb36dc 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -19,14 +19,12 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20 20
21#include <linux/coda.h> 21#include <linux/coda.h>
22#include <linux/coda_linux.h>
23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 22#include <linux/coda_psdev.h>
25 23
26#include <linux/smp_lock.h> 24#include "coda_linux.h"
27 25
28/* pioctl ops */ 26/* pioctl ops */
29static int coda_ioctl_permission(struct inode *inode, int mask); 27static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
30static long coda_pioctl(struct file *filp, unsigned int cmd, 28static long coda_pioctl(struct file *filp, unsigned int cmd,
31 unsigned long user_data); 29 unsigned long user_data);
32 30
@@ -39,11 +37,14 @@ const struct inode_operations coda_ioctl_inode_operations = {
39const struct file_operations coda_ioctl_operations = { 37const struct file_operations coda_ioctl_operations = {
40 .owner = THIS_MODULE, 38 .owner = THIS_MODULE,
41 .unlocked_ioctl = coda_pioctl, 39 .unlocked_ioctl = coda_pioctl,
40 .llseek = noop_llseek,
42}; 41};
43 42
44/* the coda pioctl inode ops */ 43/* the coda pioctl inode ops */
45static int coda_ioctl_permission(struct inode *inode, int mask) 44static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
46{ 45{
46 if (flags & IPERM_FLAG_RCU)
47 return -ECHILD;
47 return (mask & MAY_EXEC) ? -EACCES : 0; 48 return (mask & MAY_EXEC) ? -EACCES : 0;
48} 49}
49 50
@@ -57,13 +58,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
57 struct inode *target_inode = NULL; 58 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp; 59 struct coda_inode_info *cnp;
59 60
60 lock_kernel();
61
62 /* get the Pioctl data arguments from user space */ 61 /* get the Pioctl data arguments from user space */
63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 62 if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
64 error = -EINVAL; 63 return -EINVAL;
65 goto out;
66 }
67 64
68 /* 65 /*
69 * Look up the pathname. Note that the pathname is in 66 * Look up the pathname. Note that the pathname is in
@@ -75,13 +72,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
75 error = user_lpath(data.path, &path); 72 error = user_lpath(data.path, &path);
76 73
77 if (error) 74 if (error)
78 goto out; 75 return error;
79 else 76
80 target_inode = path.dentry->d_inode; 77 target_inode = path.dentry->d_inode;
81 78
82 /* return if it is not a Coda inode */ 79 /* return if it is not a Coda inode */
83 if (target_inode->i_sb != inode->i_sb) { 80 if (target_inode->i_sb != inode->i_sb) {
84 path_put(&path);
85 error = -EINVAL; 81 error = -EINVAL;
86 goto out; 82 goto out;
87 } 83 }
@@ -90,10 +86,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
90 cnp = ITOC(target_inode); 86 cnp = ITOC(target_inode);
91 87
92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 88 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
93
94 path_put(&path);
95
96out: 89out:
97 unlock_kernel(); 90 path_put(&path);
98 return error; 91 return error;
99} 92}
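
The pioctl rewrite funnels every post-lookup failure through the single "out:" label so path_put() runs exactly once on all paths, instead of being repeated before each early return. The same single-exit shape in plain C (free() stands in for path_put()):

#include <errno.h>
#include <stdlib.h>

static int pioctl_shape(int bad_inode)
{
        void *ref = malloc(1);          /* the looked-up path reference */
        int error = 0;

        if (!ref)
                return -ENOMEM;         /* nothing acquired yet */

        if (bad_inode) {
                error = -EINVAL;
                goto out;               /* cleanup is shared below */
        }
        /* ... the actual venus_pioctl() work would go here ... */
out:
        free(ref);                      /* path_put() in the kernel code */
        return error;
}
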
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 116af7546cf0..8f616e0e252c 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -35,7 +35,7 @@
35#include <linux/poll.h> 35#include <linux/poll.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/smp_lock.h> 38#include <linux/mutex.h>
39#include <linux/device.h> 39#include <linux/device.h>
40#include <asm/io.h> 40#include <asm/io.h>
41#include <asm/system.h> 41#include <asm/system.h>
@@ -43,10 +43,10 @@
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44 44
45#include <linux/coda.h> 45#include <linux/coda.h>
46#include <linux/coda_linux.h>
47#include <linux/coda_fs_i.h>
48#include <linux/coda_psdev.h> 46#include <linux/coda_psdev.h>
49 47
48#include "coda_linux.h"
49
50#include "coda_int.h" 50#include "coda_int.h"
51 51
52/* statistics */ 52/* statistics */
@@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
67 unsigned int mask = POLLOUT | POLLWRNORM; 67 unsigned int mask = POLLOUT | POLLWRNORM;
68 68
69 poll_wait(file, &vcp->vc_waitq, wait); 69 poll_wait(file, &vcp->vc_waitq, wait);
70 mutex_lock(&vcp->vc_mutex);
70 if (!list_empty(&vcp->vc_pending)) 71 if (!list_empty(&vcp->vc_pending))
71 mask |= POLLIN | POLLRDNORM; 72 mask |= POLLIN | POLLRDNORM;
73 mutex_unlock(&vcp->vc_mutex);
72 74
73 return mask; 75 return mask;
74} 76}
@@ -108,16 +110,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
108 return -EFAULT; 110 return -EFAULT;
109 111
110 if (DOWNCALL(hdr.opcode)) { 112 if (DOWNCALL(hdr.opcode)) {
111 struct super_block *sb = NULL; 113 union outputArgs *dcbuf;
112 union outputArgs *dcbuf;
113 int size = sizeof(*dcbuf); 114 int size = sizeof(*dcbuf);
114 115
115 sb = vcp->vc_sb;
116 if ( !sb ) {
117 count = nbytes;
118 goto out;
119 }
120
121 if ( nbytes < sizeof(struct coda_out_hdr) ) { 116 if ( nbytes < sizeof(struct coda_out_hdr) ) {
122 printk("coda_downcall opc %d uniq %d, not enough!\n", 117 printk("coda_downcall opc %d uniq %d, not enough!\n",
123 hdr.opcode, hdr.unique); 118 hdr.opcode, hdr.unique);
@@ -137,9 +132,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
137 } 132 }
138 133
139 /* what downcall errors does Venus handle ? */ 134 /* what downcall errors does Venus handle ? */
140 lock_kernel(); 135 error = coda_downcall(vcp, hdr.opcode, dcbuf);
141 error = coda_downcall(hdr.opcode, dcbuf, sb);
142 unlock_kernel();
143 136
144 CODA_FREE(dcbuf, nbytes); 137 CODA_FREE(dcbuf, nbytes);
145 if (error) { 138 if (error) {
@@ -152,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
152 } 145 }
153 146
154 /* Look for the message on the processing queue. */ 147 /* Look for the message on the processing queue. */
155 lock_kernel(); 148 mutex_lock(&vcp->vc_mutex);
156 list_for_each(lh, &vcp->vc_processing) { 149 list_for_each(lh, &vcp->vc_processing) {
157 tmp = list_entry(lh, struct upc_req , uc_chain); 150 tmp = list_entry(lh, struct upc_req , uc_chain);
158 if (tmp->uc_unique == hdr.unique) { 151 if (tmp->uc_unique == hdr.unique) {
@@ -161,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
161 break; 154 break;
162 } 155 }
163 } 156 }
164 unlock_kernel(); 157 mutex_unlock(&vcp->vc_mutex);
165 158
166 if (!req) { 159 if (!req) {
167 printk("psdev_write: msg (%d, %d) not found\n", 160 printk("psdev_write: msg (%d, %d) not found\n",
@@ -216,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
216 if (nbytes == 0) 209 if (nbytes == 0)
217 return 0; 210 return 0;
218 211
219 lock_kernel(); 212 mutex_lock(&vcp->vc_mutex);
220 213
221 add_wait_queue(&vcp->vc_waitq, &wait); 214 add_wait_queue(&vcp->vc_waitq, &wait);
222 set_current_state(TASK_INTERRUPTIBLE); 215 set_current_state(TASK_INTERRUPTIBLE);
@@ -230,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
230 retval = -ERESTARTSYS; 223 retval = -ERESTARTSYS;
231 break; 224 break;
232 } 225 }
226 mutex_unlock(&vcp->vc_mutex);
233 schedule(); 227 schedule();
228 mutex_lock(&vcp->vc_mutex);
234 } 229 }
235 230
236 set_current_state(TASK_RUNNING); 231 set_current_state(TASK_RUNNING);
@@ -263,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
263 CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); 258 CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
264 kfree(req); 259 kfree(req);
265out: 260out:
266 unlock_kernel(); 261 mutex_unlock(&vcp->vc_mutex);
267 return (count ? count : retval); 262 return (count ? count : retval);
268} 263}
269 264
@@ -276,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
276 if (idx < 0 || idx >= MAX_CODADEVS) 271 if (idx < 0 || idx >= MAX_CODADEVS)
277 return -ENODEV; 272 return -ENODEV;
278 273
279 lock_kernel();
280
281 err = -EBUSY; 274 err = -EBUSY;
282 vcp = &coda_comms[idx]; 275 vcp = &coda_comms[idx];
276 mutex_lock(&vcp->vc_mutex);
277
283 if (!vcp->vc_inuse) { 278 if (!vcp->vc_inuse) {
284 vcp->vc_inuse++; 279 vcp->vc_inuse++;
285 280
@@ -293,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
293 err = 0; 288 err = 0;
294 } 289 }
295 290
296 unlock_kernel(); 291 mutex_unlock(&vcp->vc_mutex);
297 return err; 292 return err;
298} 293}
299 294
@@ -308,7 +303,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
308 return -1; 303 return -1;
309 } 304 }
310 305
311 lock_kernel(); 306 mutex_lock(&vcp->vc_mutex);
312 307
313 /* Wakeup clients so they can return. */ 308 /* Wakeup clients so they can return. */
314 list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) { 309 list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
@@ -333,7 +328,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
333 328
334 file->private_data = NULL; 329 file->private_data = NULL;
335 vcp->vc_inuse--; 330 vcp->vc_inuse--;
336 unlock_kernel(); 331 mutex_unlock(&vcp->vc_mutex);
337 return 0; 332 return 0;
338} 333}
339 334
@@ -346,6 +341,7 @@ static const struct file_operations coda_psdev_fops = {
346 .unlocked_ioctl = coda_psdev_ioctl, 341 .unlocked_ioctl = coda_psdev_ioctl,
347 .open = coda_psdev_open, 342 .open = coda_psdev_open,
348 .release = coda_psdev_release, 343 .release = coda_psdev_release,
344 .llseek = noop_llseek,
349}; 345};
350 346
351static int init_coda_psdev(void) 347static int init_coda_psdev(void)
@@ -361,9 +357,11 @@ static int init_coda_psdev(void)
361 err = PTR_ERR(coda_psdev_class); 357 err = PTR_ERR(coda_psdev_class);
362 goto out_chrdev; 358 goto out_chrdev;
363 } 359 }
364 for (i = 0; i < MAX_CODADEVS; i++) 360 for (i = 0; i < MAX_CODADEVS; i++) {
361 mutex_init(&(&coda_comms[i])->vc_mutex);
365 device_create(coda_psdev_class, NULL, 362 device_create(coda_psdev_class, NULL,
366 MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); 363 MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
364 }
367 coda_sysctl_init(); 365 coda_sysctl_init();
368 goto out; 366 goto out;
369 367
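
coda_psdev_read() now holds vc_mutex across the wait loop but drops it around schedule(), since sleeping with the mutex held would deadlock the writer that is supposed to wake the reader. That unlock/sleep/relock dance is exactly what a condition-variable wait packages into one call; a minimal pthread analogue:

#include <pthread.h>

struct psdev_sketch {
        pthread_mutex_t lock;           /* plays the role of vc_mutex */
        pthread_cond_t nonempty;        /* plays the role of vc_waitq */
        int pending;                    /* queued upcall requests */
};

static void wait_for_request(struct psdev_sketch *q)
{
        pthread_mutex_lock(&q->lock);
        while (!q->pending)
                pthread_cond_wait(&q->nonempty, &q->lock); /* drops lock while asleep */
        q->pending--;                   /* consume one request, still locked */
        pthread_mutex_unlock(&q->lock);
}
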
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 4513b7258458..ab94ef63caef 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -14,12 +14,11 @@
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18 17
19#include <linux/coda.h> 18#include <linux/coda.h>
20#include <linux/coda_linux.h>
21#include <linux/coda_psdev.h> 19#include <linux/coda_psdev.h>
22#include <linux/coda_fs_i.h> 20
21#include "coda_linux.h"
23 22
24static int coda_symlink_filler(struct file *file, struct page *page) 23static int coda_symlink_filler(struct file *file, struct page *page)
25{ 24{
@@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page)
29 unsigned int len = PAGE_SIZE; 28 unsigned int len = PAGE_SIZE;
30 char *p = kmap(page); 29 char *p = kmap(page);
31 30
32 lock_kernel();
33 cii = ITOC(inode); 31 cii = ITOC(inode);
34 32
35 error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); 33 error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
36 unlock_kernel();
37 if (error) 34 if (error)
38 goto fail; 35 goto fail;
39 SetPageUptodate(page); 36 SetPageUptodate(page);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b8893ab6f9e6..9727e0c52579 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,15 +27,15 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/mutex.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
33 34
34#include <linux/coda.h> 35#include <linux/coda.h>
35#include <linux/coda_linux.h>
36#include <linux/coda_psdev.h> 36#include <linux/coda_psdev.h>
37#include <linux/coda_fs_i.h> 37#include "coda_linux.h"
38#include <linux/coda_cache.h> 38#include "coda_cache.h"
39 39
40#include "coda_int.h" 40#include "coda_int.h"
41 41
@@ -606,7 +606,8 @@ static void coda_unblock_signals(sigset_t *old)
606 (r)->uc_opcode != CODA_RELEASE) || \ 606 (r)->uc_opcode != CODA_RELEASE) || \
607 (r)->uc_flags & CODA_REQ_READ)) 607 (r)->uc_flags & CODA_REQ_READ))
608 608
609static inline void coda_waitfor_upcall(struct upc_req *req) 609static inline void coda_waitfor_upcall(struct venus_comm *vcp,
610 struct upc_req *req)
610{ 611{
611 DECLARE_WAITQUEUE(wait, current); 612 DECLARE_WAITQUEUE(wait, current);
612 unsigned long timeout = jiffies + coda_timeout * HZ; 613 unsigned long timeout = jiffies + coda_timeout * HZ;
@@ -639,10 +640,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
639 break; 640 break;
640 } 641 }
641 642
643 mutex_unlock(&vcp->vc_mutex);
642 if (blocked) 644 if (blocked)
643 schedule_timeout(HZ); 645 schedule_timeout(HZ);
644 else 646 else
645 schedule(); 647 schedule();
648 mutex_lock(&vcp->vc_mutex);
646 } 649 }
647 if (blocked) 650 if (blocked)
648 coda_unblock_signals(&old); 651 coda_unblock_signals(&old);
@@ -667,18 +670,23 @@ static int coda_upcall(struct venus_comm *vcp,
667{ 670{
668 union outputArgs *out; 671 union outputArgs *out;
669 union inputArgs *sig_inputArgs; 672 union inputArgs *sig_inputArgs;
670 struct upc_req *req, *sig_req; 673 struct upc_req *req = NULL, *sig_req;
671 int error = 0; 674 int error;
675
676 mutex_lock(&vcp->vc_mutex);
672 677
673 if (!vcp->vc_inuse) { 678 if (!vcp->vc_inuse) {
674 printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n"); 679 printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
675 return -ENXIO; 680 error = -ENXIO;
681 goto exit;
676 } 682 }
677 683
678 /* Format the request message. */ 684 /* Format the request message. */
679 req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); 685 req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
680 if (!req) 686 if (!req) {
681 return -ENOMEM; 687 error = -ENOMEM;
688 goto exit;
689 }
682 690
683 req->uc_data = (void *)buffer; 691 req->uc_data = (void *)buffer;
684 req->uc_flags = 0; 692 req->uc_flags = 0;
@@ -705,7 +713,7 @@ static int coda_upcall(struct venus_comm *vcp,
705 * ENODEV. */ 713 * ENODEV. */
706 714
707 /* Go to sleep. Wake up on signals only after the timeout. */ 715 /* Go to sleep. Wake up on signals only after the timeout. */
708 coda_waitfor_upcall(req); 716 coda_waitfor_upcall(vcp, req);
709 717
710 /* Op went through, interrupt or not... */ 718 /* Op went through, interrupt or not... */
711 if (req->uc_flags & CODA_REQ_WRITE) { 719 if (req->uc_flags & CODA_REQ_WRITE) {
@@ -759,6 +767,7 @@ static int coda_upcall(struct venus_comm *vcp,
759 767
760exit: 768exit:
761 kfree(req); 769 kfree(req);
770 mutex_unlock(&vcp->vc_mutex);
762 return error; 771 return error;
763} 772}
764 773
@@ -796,21 +805,24 @@ exit:
796 * 805 *
797 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ 806 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
798 807
799int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb) 808int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
800{ 809{
801 struct inode *inode = NULL; 810 struct inode *inode = NULL;
802 struct CodaFid *fid, *newfid; 811 struct CodaFid *fid = NULL, *newfid;
812 struct super_block *sb;
803 813
804 /* Handle invalidation requests. */ 814 /* Handle invalidation requests. */
805 if ( !sb || !sb->s_root) 815 mutex_lock(&vcp->vc_mutex);
806 return 0; 816 sb = vcp->vc_sb;
817 if (!sb || !sb->s_root)
818 goto unlock_out;
807 819
808 switch (opcode) { 820 switch (opcode) {
809 case CODA_FLUSH: 821 case CODA_FLUSH:
810 coda_cache_clear_all(sb); 822 coda_cache_clear_all(sb);
811 shrink_dcache_sb(sb); 823 shrink_dcache_sb(sb);
812 if (sb->s_root->d_inode) 824 if (sb->s_root->d_inode)
813 coda_flag_inode(sb->s_root->d_inode, C_FLUSH); 825 coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
814 break; 826 break;
815 827
816 case CODA_PURGEUSER: 828 case CODA_PURGEUSER:
@@ -819,45 +831,53 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
819 831
820 case CODA_ZAPDIR: 832 case CODA_ZAPDIR:
821 fid = &out->coda_zapdir.CodaFid; 833 fid = &out->coda_zapdir.CodaFid;
822 inode = coda_fid_to_inode(fid, sb);
823 if (inode) {
824 coda_flag_inode_children(inode, C_PURGE);
825 coda_flag_inode(inode, C_VATTR);
826 }
827 break; 834 break;
828 835
829 case CODA_ZAPFILE: 836 case CODA_ZAPFILE:
830 fid = &out->coda_zapfile.CodaFid; 837 fid = &out->coda_zapfile.CodaFid;
831 inode = coda_fid_to_inode(fid, sb);
832 if (inode)
833 coda_flag_inode(inode, C_VATTR);
834 break; 838 break;
835 839
836 case CODA_PURGEFID: 840 case CODA_PURGEFID:
837 fid = &out->coda_purgefid.CodaFid; 841 fid = &out->coda_purgefid.CodaFid;
842 break;
843
844 case CODA_REPLACE:
845 fid = &out->coda_replace.OldFid;
846 break;
847 }
848 if (fid)
838 inode = coda_fid_to_inode(fid, sb); 849 inode = coda_fid_to_inode(fid, sb);
839 if (inode) {
840 coda_flag_inode_children(inode, C_PURGE);
841 850
842 /* catch the dentries later if some are still busy */ 851unlock_out:
843 coda_flag_inode(inode, C_PURGE); 852 mutex_unlock(&vcp->vc_mutex);
844 d_prune_aliases(inode);
845 853
846 } 854 if (!inode)
855 return 0;
856
857 switch (opcode) {
858 case CODA_ZAPDIR:
859 coda_flag_inode_children(inode, C_PURGE);
860 coda_flag_inode(inode, C_VATTR);
861 break;
862
863 case CODA_ZAPFILE:
864 coda_flag_inode(inode, C_VATTR);
865 break;
866
867 case CODA_PURGEFID:
868 coda_flag_inode_children(inode, C_PURGE);
869
870 /* catch the dentries later if some are still busy */
871 coda_flag_inode(inode, C_PURGE);
872 d_prune_aliases(inode);
847 break; 873 break;
848 874
849 case CODA_REPLACE: 875 case CODA_REPLACE:
850 fid = &out->coda_replace.OldFid;
851 newfid = &out->coda_replace.NewFid; 876 newfid = &out->coda_replace.NewFid;
852 inode = coda_fid_to_inode(fid, sb); 877 coda_replace_fid(inode, fid, newfid);
853 if (inode)
854 coda_replace_fid(inode, fid, newfid);
855 break; 878 break;
856 } 879 }
857 880 iput(inode);
858 if (inode)
859 iput(inode);
860
861 return 0; 881 return 0;
862} 882}
863 883
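
The reworked coda_downcall() is two-phase: the fid-to-inode translation happens while vc_mutex pins vc_sb, and the flagging, d_prune_aliases() and iput() (which may sleep or take other locks) happen only after the mutex is dropped. Condensed control flow, as a kernel-style sketch (compilable only in-tree; names as in the diff):

/* Condensed from the new coda_downcall() above. */
static int downcall_shape(struct venus_comm *vcp, struct CodaFid *fid)
{
        struct inode *inode = NULL;
        struct super_block *sb;

        mutex_lock(&vcp->vc_mutex);
        sb = vcp->vc_sb;
        if (sb && sb->s_root && fid)
                inode = coda_fid_to_inode(fid, sb);     /* needs sb pinned */
        mutex_unlock(&vcp->vc_mutex);

        if (!inode)
                return 0;
        coda_flag_inode(inode, C_PURGE);        /* takes c_lock internally */
        d_prune_aliases(inode);                 /* no vc_mutex held: safe */
        iput(inode);                            /* may sleep */
        return 0;
}
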
diff --git a/fs/compat.c b/fs/compat.c
index 0644a154672b..f6fd0a00e6cc 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -29,8 +29,6 @@
29#include <linux/vfs.h> 29#include <linux/vfs.h>
30#include <linux/ioctl.h> 30#include <linux/ioctl.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/smb.h>
33#include <linux/smb_mount.h>
34#include <linux/ncp_mount.h> 32#include <linux/ncp_mount.h>
35#include <linux/nfs4_mount.h> 33#include <linux/nfs4_mount.h>
36#include <linux/syscalls.h> 34#include <linux/syscalls.h>
@@ -51,6 +49,7 @@
51#include <linux/eventpoll.h> 49#include <linux/eventpoll.h>
52#include <linux/fs_struct.h> 50#include <linux/fs_struct.h>
53#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/pagemap.h>
54 53
55#include <asm/uaccess.h> 54#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
@@ -258,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
258} 257}
259 258
260/* 259/*
261 * The following statfs calls are copies of code from fs/open.c and 260 * The following statfs calls are copies of code from fs/statfs.c and
262 * should be checked against those from time to time 261 * should be checked against those from time to time
263 */ 262 */
264asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
@@ -321,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
321 __put_user(kbuf->f_namelen, &ubuf->f_namelen) || 320 __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
322 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || 321 __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
323 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || 322 __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
324 __put_user(kbuf->f_frsize, &ubuf->f_frsize)) 323 __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
324 __put_user(kbuf->f_flags, &ubuf->f_flags) ||
325 __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
325 return -EFAULT; 326 return -EFAULT;
326 return 0; 327 return 0;
327} 328}
@@ -598,24 +599,22 @@ ssize_t compat_rw_copy_check_uvector(int type,
598 if (nr_segs > fast_segs) { 599 if (nr_segs > fast_segs) {
599 ret = -ENOMEM; 600 ret = -ENOMEM;
600 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 601 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
601 if (iov == NULL) { 602 if (iov == NULL)
602 *ret_pointer = fast_pointer;
603 goto out; 603 goto out;
604 }
605 } 604 }
606 *ret_pointer = iov; 605 *ret_pointer = iov;
607 606
608 /* 607 /*
609 * Single unix specification: 608 * Single unix specification:
610 * We should -EINVAL if an element length is not >= 0 and fitting an 609 * We should -EINVAL if an element length is not >= 0 and fitting an
611 * ssize_t. The total length is fitting an ssize_t 610 * ssize_t.
612 * 611 *
613 * Be careful here because iov_len is a size_t not an ssize_t 612 * In Linux, the total length is limited to MAX_RW_COUNT, there is
613 * no overflow possibility.
614 */ 614 */
615 tot_len = 0; 615 tot_len = 0;
616 ret = -EINVAL; 616 ret = -EINVAL;
617 for (seg = 0; seg < nr_segs; seg++) { 617 for (seg = 0; seg < nr_segs; seg++) {
618 compat_ssize_t tmp = tot_len;
619 compat_uptr_t buf; 618 compat_uptr_t buf;
620 compat_ssize_t len; 619 compat_ssize_t len;
621 620
@@ -626,13 +625,13 @@ ssize_t compat_rw_copy_check_uvector(int type,
626 } 625 }
627 if (len < 0) /* size_t not fitting in compat_ssize_t .. */ 626 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
628 goto out; 627 goto out;
629 tot_len += len;
630 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
631 goto out;
632 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { 628 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
633 ret = -EFAULT; 629 ret = -EFAULT;
634 goto out; 630 goto out;
635 } 631 }
632 if (len > MAX_RW_COUNT - tot_len)
633 len = MAX_RW_COUNT - tot_len;
634 tot_len += len;
636 iov->iov_base = compat_ptr(buf); 635 iov->iov_base = compat_ptr(buf);
637 iov->iov_len = (compat_size_t) len; 636 iov->iov_len = (compat_size_t) len;
638 uvector++; 637 uvector++;
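
The compat_rw_copy_check_uvector() change replaces the old "add, then test for signed wraparound" overflow check with a clamp: each segment is truncated so the running total can never pass MAX_RW_COUNT, matching the native rw_copy_check_uvector(). A runnable rendering of just that rule (the cap value here is illustrative, not the kernel's INT_MAX & PAGE_MASK):

#include <stdio.h>

#define MAX_RW_COUNT (1L << 30)         /* illustrative cap */

/* Clamp-then-add: tot_len can reach the cap but never pass it, so no
 * overflow test is needed afterwards. */
static long add_segment(long tot_len, long len)
{
        if (len > MAX_RW_COUNT - tot_len)
                len = MAX_RW_COUNT - tot_len;
        return tot_len + len;
}

int main(void)
{
        long tot = add_segment(0, MAX_RW_COUNT - 10);

        tot = add_segment(tot, 100);            /* clamped to the remaining 10 */
        printf("%d\n", tot == MAX_RW_COUNT);    /* prints 1 */
        return 0;
}
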
@@ -745,30 +744,6 @@ static void *do_ncp_super_data_conv(void *raw_data)
745 return raw_data; 744 return raw_data;
746} 745}
747 746
748struct compat_smb_mount_data {
749 compat_int_t version;
750 __compat_uid_t mounted_uid;
751 __compat_uid_t uid;
752 __compat_gid_t gid;
753 compat_mode_t file_mode;
754 compat_mode_t dir_mode;
755};
756
757static void *do_smb_super_data_conv(void *raw_data)
758{
759 struct smb_mount_data *s = raw_data;
760 struct compat_smb_mount_data *c_s = raw_data;
761
762 if (c_s->version != SMB_MOUNT_OLDVERSION)
763 goto out;
764 s->dir_mode = c_s->dir_mode;
765 s->file_mode = c_s->file_mode;
766 s->gid = c_s->gid;
767 s->uid = c_s->uid;
768 s->mounted_uid = c_s->mounted_uid;
769 out:
770 return raw_data;
771}
772 747
773struct compat_nfs_string { 748struct compat_nfs_string {
774 compat_uint_t len; 749 compat_uint_t len;
@@ -835,7 +810,6 @@ static int do_nfs4_super_data_conv(void *raw_data)
835 return 0; 810 return 0;
836} 811}
837 812
838#define SMBFS_NAME "smbfs"
839#define NCPFS_NAME "ncpfs" 813#define NCPFS_NAME "ncpfs"
840#define NFS4_NAME "nfs4" 814#define NFS4_NAME "nfs4"
841 815
@@ -870,9 +844,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
870 retval = -EINVAL; 844 retval = -EINVAL;
871 845
872 if (kernel_type && data_page) { 846 if (kernel_type && data_page) {
873 if (!strcmp(kernel_type, SMBFS_NAME)) { 847 if (!strcmp(kernel_type, NCPFS_NAME)) {
874 do_smb_super_data_conv((void *)data_page);
875 } else if (!strcmp(kernel_type, NCPFS_NAME)) {
876 do_ncp_super_data_conv((void *)data_page); 848 do_ncp_super_data_conv((void *)data_page);
877 } else if (!strcmp(kernel_type, NFS4_NAME)) { 849 } else if (!strcmp(kernel_type, NFS4_NAME)) {
878 if (do_nfs4_super_data_conv((void *) data_page)) 850 if (do_nfs4_super_data_conv((void *) data_page))
@@ -1378,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max)
1378 argv++; 1350 argv++;
1379 if (i++ >= max) 1351 if (i++ >= max)
1380 return -E2BIG; 1352 return -E2BIG;
1353
1354 if (fatal_signal_pending(current))
1355 return -ERESTARTNOHAND;
1356 cond_resched();
1381 } 1357 }
1382 } 1358 }
1383 return i; 1359 return i;
@@ -1419,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1419 while (len > 0) { 1395 while (len > 0) {
1420 int offset, bytes_to_copy; 1396 int offset, bytes_to_copy;
1421 1397
1398 if (fatal_signal_pending(current)) {
1399 ret = -ERESTARTNOHAND;
1400 goto out;
1401 }
1402 cond_resched();
1403
1422 offset = pos % PAGE_SIZE; 1404 offset = pos % PAGE_SIZE;
1423 if (offset == 0) 1405 if (offset == 0)
1424 offset = PAGE_SIZE; 1406 offset = PAGE_SIZE;
@@ -1435,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1435 if (!kmapped_page || kpos != (pos & PAGE_MASK)) { 1417 if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
1436 struct page *page; 1418 struct page *page;
1437 1419
1438#ifdef CONFIG_STACK_GROWSUP 1420 page = get_arg_page(bprm, pos, 1);
1439 ret = expand_stack_downwards(bprm->vma, pos); 1421 if (!page) {
1440 if (ret < 0) {
1441 /* We've exceed the stack rlimit. */
1442 ret = -E2BIG;
1443 goto out;
1444 }
1445#endif
1446 ret = get_user_pages(current, bprm->mm, pos,
1447 1, 1, 1, &page, NULL);
1448 if (ret <= 0) {
1449 /* We've exceed the stack rlimit. */
1450 ret = -E2BIG; 1422 ret = -E2BIG;
1451 goto out; 1423 goto out;
1452 } 1424 }
@@ -1567,8 +1539,10 @@ int compat_do_execve(char * filename,
1567 return retval; 1539 return retval;
1568 1540
1569out: 1541out:
1570 if (bprm->mm) 1542 if (bprm->mm) {
1543 acct_arg_size(bprm, 0);
1571 mmput(bprm->mm); 1544 mmput(bprm->mm);
1545 }
1572 1546
1573out_file: 1547out_file:
1574 if (bprm->file) { 1548 if (bprm->file) {
@@ -1963,7 +1937,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
1963} 1937}
1964#endif /* HAVE_SET_RESTORE_SIGMASK */ 1938#endif /* HAVE_SET_RESTORE_SIGMASK */
1965 1939
1966#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) 1940#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
1967/* Stuff for NFS server syscalls... */ 1941/* Stuff for NFS server syscalls... */
1968struct compat_nfsctl_svc { 1942struct compat_nfsctl_svc {
1969 u16 svc32_port; 1943 u16 svc32_port;
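
The iovec-validation hunk at the top of this file's changes replaces a hard failure with a clamp: once the running total reaches MAX_RW_COUNT, later segments are shortened so the request still proceeds. A minimal sketch of that loop, reusing the variable names from the hunk (an illustration, not the exact kernel function):

/*
 * Clamp the total size of a compat iovec to MAX_RW_COUNT by
 * shortening later segments instead of rejecting the request.
 * The access_ok() check from the real code is omitted for brevity.
 */
static ssize_t clamp_compat_iovec(struct iovec *iov,
				  const struct compat_iovec __user *uvector,
				  unsigned long nr_segs)
{
	ssize_t tot_len = 0;
	unsigned long seg;

	for (seg = 0; seg < nr_segs; seg++, uvector++, iov++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		if (__get_user(len, &uvector->iov_len) ||
		    __get_user(buf, &uvector->iov_base))
			return -EFAULT;
		if (len < 0)
			return -EINVAL;
		if (len > MAX_RW_COUNT - tot_len)
			len = MAX_RW_COUNT - tot_len;	/* the new clamp */
		tot_len += len;
		iov->iov_base = compat_ptr(buf);
		iov->iov_len = (compat_size_t) len;
	}
	return tot_len;
}
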
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 03e59aa318eb..61abb638b4bf 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,7 +19,6 @@
19#include <linux/compiler.h> 19#include <linux/compiler.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/smp_lock.h>
23#include <linux/ioctl.h> 22#include <linux/ioctl.h>
24#include <linux/if.h> 23#include <linux/if.h>
25#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
@@ -43,10 +42,9 @@
43#include <linux/tty.h> 42#include <linux/tty.h>
44#include <linux/vt_kern.h> 43#include <linux/vt_kern.h>
45#include <linux/fb.h> 44#include <linux/fb.h>
46#include <linux/videodev.h> 45#include <linux/videodev2.h>
47#include <linux/netdevice.h> 46#include <linux/netdevice.h>
48#include <linux/raw.h> 47#include <linux/raw.h>
49#include <linux/smb_fs.h>
50#include <linux/blkdev.h> 48#include <linux/blkdev.h>
51#include <linux/elevator.h> 49#include <linux/elevator.h>
52#include <linux/rtc.h> 50#include <linux/rtc.h>
@@ -558,25 +556,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
558 556
559#endif /* CONFIG_BLOCK */ 557#endif /* CONFIG_BLOCK */
560 558
561static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
562 compat_uid_t __user *argp)
563{
564 mm_segment_t old_fs = get_fs();
565 __kernel_uid_t kuid;
566 int err;
567
568 cmd = SMB_IOC_GETMOUNTUID;
569
570 set_fs(KERNEL_DS);
571 err = sys_ioctl(fd, cmd, (unsigned long)&kuid);
572 set_fs(old_fs);
573
574 if (err >= 0)
575 err = put_user(kuid, argp);
576
577 return err;
578}
579
580/* Bluetooth ioctls */ 559/* Bluetooth ioctls */
581#define HCIUARTSETPROTO _IOW('U', 200, int) 560#define HCIUARTSETPROTO _IOW('U', 200, int)
582#define HCIUARTGETPROTO _IOR('U', 201, int) 561#define HCIUARTGETPROTO _IOR('U', 201, int)
@@ -599,69 +578,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
599#define HIDPGETCONNLIST _IOR('H', 210, int) 578#define HIDPGETCONNLIST _IOR('H', 210, int)
600#define HIDPGETCONNINFO _IOR('H', 211, int) 579#define HIDPGETCONNINFO _IOR('H', 211, int)
601 580
602#ifdef CONFIG_BLOCK
603struct raw32_config_request
604{
605 compat_int_t raw_minor;
606 __u64 block_major;
607 __u64 block_minor;
608} __attribute__((packed));
609
610static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
611{
612 int ret;
613
614 if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
615 return -EFAULT;
616
617 ret = __get_user(req->raw_minor, &user_req->raw_minor);
618 ret |= __get_user(req->block_major, &user_req->block_major);
619 ret |= __get_user(req->block_minor, &user_req->block_minor);
620
621 return ret ? -EFAULT : 0;
622}
623
624static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
625{
626 int ret;
627
628 if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
629 return -EFAULT;
630
631 ret = __put_user(req->raw_minor, &user_req->raw_minor);
632 ret |= __put_user(req->block_major, &user_req->block_major);
633 ret |= __put_user(req->block_minor, &user_req->block_minor);
634
635 return ret ? -EFAULT : 0;
636}
637
638static int raw_ioctl(unsigned fd, unsigned cmd,
639 struct raw32_config_request __user *user_req)
640{
641 int ret;
642
643 switch (cmd) {
644 case RAW_SETBIND:
645 default: { /* RAW_GETBIND */
646 struct raw_config_request req;
647 mm_segment_t oldfs = get_fs();
648
649 if ((ret = get_raw32_request(&req, user_req)))
650 return ret;
651
652 set_fs(KERNEL_DS);
653 ret = sys_ioctl(fd,cmd,(unsigned long)&req);
654 set_fs(oldfs);
655
656 if ((!ret) && (cmd == RAW_GETBIND)) {
657 ret = set_raw32_request(&req, user_req);
658 }
659 break;
660 }
661 }
662 return ret;
663}
664#endif /* CONFIG_BLOCK */
665 581
666struct serial_struct32 { 582struct serial_struct32 {
667 compat_int_t type; 583 compat_int_t type;
@@ -920,6 +836,7 @@ COMPATIBLE_IOCTL(TCSETSW)
920COMPATIBLE_IOCTL(TCSETSF) 836COMPATIBLE_IOCTL(TCSETSF)
921COMPATIBLE_IOCTL(TIOCLINUX) 837COMPATIBLE_IOCTL(TIOCLINUX)
922COMPATIBLE_IOCTL(TIOCSBRK) 838COMPATIBLE_IOCTL(TIOCSBRK)
839COMPATIBLE_IOCTL(TIOCGDEV)
923COMPATIBLE_IOCTL(TIOCCBRK) 840COMPATIBLE_IOCTL(TIOCCBRK)
924COMPATIBLE_IOCTL(TIOCGSID) 841COMPATIBLE_IOCTL(TIOCGSID)
925COMPATIBLE_IOCTL(TIOCGICOUNT) 842COMPATIBLE_IOCTL(TIOCGICOUNT)
@@ -1265,8 +1182,6 @@ COMPATIBLE_IOCTL(OSS_GETVERSION)
1265/* Raw devices */ 1182/* Raw devices */
1266COMPATIBLE_IOCTL(RAW_SETBIND) 1183COMPATIBLE_IOCTL(RAW_SETBIND)
1267COMPATIBLE_IOCTL(RAW_GETBIND) 1184COMPATIBLE_IOCTL(RAW_GETBIND)
1268/* SMB ioctls which do not need any translations */
1269COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
1270/* Watchdog */ 1185/* Watchdog */
1271COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) 1186COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
1272COMPATIBLE_IOCTL(WDIOC_GETSTATUS) 1187COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -1523,15 +1438,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1523 case MTIOCGET32: 1438 case MTIOCGET32:
1524 case MTIOCPOS32: 1439 case MTIOCPOS32:
1525 return mt_ioctl_trans(fd, cmd, argp); 1440 return mt_ioctl_trans(fd, cmd, argp);
1526 /* Raw devices */
1527 case RAW_SETBIND:
1528 case RAW_GETBIND:
1529 return raw_ioctl(fd, cmd, argp);
1530#endif 1441#endif
1531 /* One SMB ioctl needs translations. */
1532#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
1533 case SMB_IOC_GETMOUNTUID_32:
1534 return do_smb_getmountuid(fd, cmd, argp);
1535 /* Serial */ 1442 /* Serial */
1536 case TIOCGSERIAL: 1443 case TIOCGSERIAL:
1537 case TIOCSSERIAL: 1444 case TIOCSSERIAL:
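
With smbfs support removed and the raw-device translation dropped, this file keeps shrinking toward two idioms. When an ioctl's argument layout is identical on 32- and 64-bit ABIs it gets a one-line COMPATIBLE_IOCTL() entry (as RAW_SETBIND/RAW_GETBIND now do); only a genuinely different layout still warrants a translation handler. A hedged sketch of both shapes, with all EXAMPLE_* names hypothetical:

/* Layout identical on both ABIs: whitelist and pass through. */
COMPATIBLE_IOCTL(EXAMPLE_IOC_GET)

/* Layout differs: copy in the 32-bit form, repack, forward. */
struct example32 {
	compat_int_t	minor;
	compat_uptr_t	buf;
};

struct example {
	int		minor;
	void __user	*buf;
};

static int example_ioctl_trans(unsigned int fd, unsigned int cmd,
			       struct example32 __user *uarg)
{
	struct example karg;
	compat_uptr_t buf;

	if (get_user(karg.minor, &uarg->minor) ||
	    get_user(buf, &uarg->buf))
		return -EFAULT;
	karg.buf = compat_ptr(buf);
	/* do_example_ioctl() is a hypothetical native handler */
	return do_example_ioctl(fd, EXAMPLE_IOC_SET, &karg);
}
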
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 13587cc97a0b..9febcdefdfdc 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,8 +1,8 @@
1config CONFIGFS_FS 1config CONFIGFS_FS
2 tristate "Userspace-driven configuration filesystem" 2 tristate "Userspace-driven configuration filesystem"
3 depends on SYSFS 3 select SYSFS
4 help 4 help
5 configfs is a ram-based filesystem that provides the converse 5 configfs is a RAM-based filesystem that provides the converse
6 of sysfs's functionality. Where sysfs is a filesystem-based 6 of sysfs's functionality. Where sysfs is a filesystem-based
7 view of kernel objects, configfs is a filesystem-based manager 7 view of kernel objects, configfs is a filesystem-based manager
8 of kernel objects, or config_items. 8 of kernel objects, or config_items.
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da6061a6df40..82bda8fdfc1c 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -90,6 +90,7 @@ extern const struct file_operations configfs_file_operations;
90extern const struct file_operations bin_fops; 90extern const struct file_operations bin_fops;
91extern const struct inode_operations configfs_dir_inode_operations; 91extern const struct inode_operations configfs_dir_inode_operations;
92extern const struct inode_operations configfs_symlink_inode_operations; 92extern const struct inode_operations configfs_symlink_inode_operations;
93extern const struct dentry_operations configfs_dentry_ops;
93 94
94extern int configfs_symlink(struct inode *dir, struct dentry *dentry, 95extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
95 const char *symname); 96 const char *symname);
@@ -120,7 +121,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
120{ 121{
121 struct config_item * item = NULL; 122 struct config_item * item = NULL;
122 123
123 spin_lock(&dcache_lock); 124 spin_lock(&dentry->d_lock);
124 if (!d_unhashed(dentry)) { 125 if (!d_unhashed(dentry)) {
125 struct configfs_dirent * sd = dentry->d_fsdata; 126 struct configfs_dirent * sd = dentry->d_fsdata;
126 if (sd->s_type & CONFIGFS_ITEM_LINK) { 127 if (sd->s_type & CONFIGFS_ITEM_LINK) {
@@ -129,7 +130,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
129 } else 130 } else
130 item = config_item_get(sd->s_element); 131 item = config_item_get(sd->s_element);
131 } 132 }
132 spin_unlock(&dcache_lock); 133 spin_unlock(&dentry->d_lock);
133 134
134 return item; 135 return item;
135} 136}
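
This small hunk is the recurring shape of the dcache_lock removal running through the whole series: state that used to be protected by the global lock (here, whether the dentry is hashed, plus its d_fsdata) is now guarded by dentry->d_lock alone. The idiom, reduced to a sketch:

/* Inspect per-dentry state and take a reference under d_lock,
 * with no global lock involved. */
struct config_item *item = NULL;

spin_lock(&dentry->d_lock);
if (!d_unhashed(dentry)) {
	struct configfs_dirent *sd = dentry->d_fsdata;
	item = config_item_get(sd->s_element);	/* ref taken under d_lock */
}
spin_unlock(&dentry->d_lock);
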
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 0b502f80c691..90ff3cb10de3 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,12 +67,12 @@ static void configfs_d_iput(struct dentry * dentry,
67 * We _must_ delete our dentries on last dput, as the chain-to-parent 67 * We _must_ delete our dentries on last dput, as the chain-to-parent
68 * behavior is required to clear the parents of default_groups. 68 * behavior is required to clear the parents of default_groups.
69 */ 69 */
70static int configfs_d_delete(struct dentry *dentry) 70static int configfs_d_delete(const struct dentry *dentry)
71{ 71{
72 return 1; 72 return 1;
73} 73}
74 74
75static const struct dentry_operations configfs_dentry_ops = { 75const struct dentry_operations configfs_dentry_ops = {
76 .d_iput = configfs_d_iput, 76 .d_iput = configfs_d_iput,
77 /* simple_delete_dentry() isn't exported */ 77 /* simple_delete_dentry() isn't exported */
78 .d_delete = configfs_d_delete, 78 .d_delete = configfs_d_delete,
@@ -232,10 +232,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
232 232
233 sd->s_mode = mode; 233 sd->s_mode = mode;
234 sd->s_dentry = dentry; 234 sd->s_dentry = dentry;
235 if (dentry) { 235 if (dentry)
236 dentry->d_fsdata = configfs_get(sd); 236 dentry->d_fsdata = configfs_get(sd);
237 dentry->d_op = &configfs_dentry_ops;
238 }
239 237
240 return 0; 238 return 0;
241} 239}
@@ -278,7 +276,6 @@ static int create_dir(struct config_item * k, struct dentry * p,
278 error = configfs_create(d, mode, init_dir); 276 error = configfs_create(d, mode, init_dir);
279 if (!error) { 277 if (!error) {
280 inc_nlink(p->d_inode); 278 inc_nlink(p->d_inode);
281 (d)->d_op = &configfs_dentry_ops;
282 } else { 279 } else {
283 struct configfs_dirent *sd = d->d_fsdata; 280 struct configfs_dirent *sd = d->d_fsdata;
284 if (sd) { 281 if (sd) {
@@ -371,9 +368,7 @@ int configfs_create_link(struct configfs_symlink *sl,
371 CONFIGFS_ITEM_LINK); 368 CONFIGFS_ITEM_LINK);
372 if (!err) { 369 if (!err) {
373 err = configfs_create(dentry, mode, init_symlink); 370 err = configfs_create(dentry, mode, init_symlink);
374 if (!err) 371 if (err) {
375 dentry->d_op = &configfs_dentry_ops;
376 else {
377 struct configfs_dirent *sd = dentry->d_fsdata; 372 struct configfs_dirent *sd = dentry->d_fsdata;
378 if (sd) { 373 if (sd) {
379 spin_lock(&configfs_dirent_lock); 374 spin_lock(&configfs_dirent_lock);
@@ -399,8 +394,7 @@ static void remove_dir(struct dentry * d)
399 if (d->d_inode) 394 if (d->d_inode)
400 simple_rmdir(parent->d_inode,d); 395 simple_rmdir(parent->d_inode,d);
401 396
402 pr_debug(" o %s removing done (%d)\n",d->d_name.name, 397 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
403 atomic_read(&d->d_count));
404 398
405 dput(parent); 399 dput(parent);
406} 400}
@@ -448,7 +442,6 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
448 return error; 442 return error;
449 } 443 }
450 444
451 dentry->d_op = &configfs_dentry_ops;
452 d_rehash(dentry); 445 d_rehash(dentry);
453 446
454 return 0; 447 return 0;
@@ -493,7 +486,10 @@ static struct dentry * configfs_lookup(struct inode *dir,
493 * If it doesn't exist and it isn't a NOT_PINNED item, 486 * If it doesn't exist and it isn't a NOT_PINNED item,
494 * it must be negative. 487 * it must be negative.
495 */ 488 */
496 return simple_lookup(dir, dentry, nd); 489 if (dentry->d_name.len > NAME_MAX)
490 return ERR_PTR(-ENAMETOOLONG);
491 d_add(dentry, NULL);
492 return NULL;
497 } 493 }
498 494
499out: 495out:
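
configfs_lookup() stops delegating to simple_lookup(), presumably because that helper installs its own dentry_operations, which would collide with the superblock-wide configfs_dentry_ops set in mount.c below. The open-coded replacement just instantiates a cached negative dentry:

/* Sketch: hash a NULL-inode dentry so repeated lookups of a missing
 * name are answered from the dcache; returning NULL tells the VFS
 * that the passed-in dentry was consumed. */
if (dentry->d_name.len > NAME_MAX)
	return ERR_PTR(-ENAMETOOLONG);
d_add(dentry, NULL);		/* negative dentry */
return NULL;
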
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6a..c83f4768eeaa 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
135{ 135{
136 struct inode * inode = new_inode(configfs_sb); 136 struct inode * inode = new_inode(configfs_sb);
137 if (inode) { 137 if (inode) {
138 inode->i_ino = get_next_ino();
138 inode->i_mapping->a_ops = &configfs_aops; 139 inode->i_mapping->a_ops = &configfs_aops;
139 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 140 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
140 inode->i_op = &configfs_inode_operations; 141 inode->i_op = &configfs_inode_operations;
@@ -249,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
249 struct dentry * dentry = sd->s_dentry; 250 struct dentry * dentry = sd->s_dentry;
250 251
251 if (dentry) { 252 if (dentry) {
252 spin_lock(&dcache_lock);
253 spin_lock(&dentry->d_lock); 253 spin_lock(&dentry->d_lock);
254 if (!(d_unhashed(dentry) && dentry->d_inode)) { 254 if (!(d_unhashed(dentry) && dentry->d_inode)) {
255 dget_locked(dentry); 255 dget_dlock(dentry);
256 __d_drop(dentry); 256 __d_drop(dentry);
257 spin_unlock(&dentry->d_lock); 257 spin_unlock(&dentry->d_lock);
258 spin_unlock(&dcache_lock);
259 simple_unlink(parent->d_inode, dentry); 258 simple_unlink(parent->d_inode, dentry);
260 } else { 259 } else
261 spin_unlock(&dentry->d_lock); 260 spin_unlock(&dentry->d_lock);
262 spin_unlock(&dcache_lock);
263 }
264 } 261 }
265} 262}
266 263
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8c8d64230c2d..ecc62178beda 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -101,19 +101,20 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
101 configfs_root_group.cg_item.ci_dentry = root; 101 configfs_root_group.cg_item.ci_dentry = root;
102 root->d_fsdata = &configfs_root; 102 root->d_fsdata = &configfs_root;
103 sb->s_root = root; 103 sb->s_root = root;
104 sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
104 return 0; 105 return 0;
105} 106}
106 107
107static int configfs_get_sb(struct file_system_type *fs_type, 108static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
108 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 109 int flags, const char *dev_name, void *data)
109{ 110{
110 return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt); 111 return mount_single(fs_type, flags, data, configfs_fill_super);
111} 112}
112 113
113static struct file_system_type configfs_fs_type = { 114static struct file_system_type configfs_fs_type = {
114 .owner = THIS_MODULE, 115 .owner = THIS_MODULE,
115 .name = "configfs", 116 .name = "configfs",
116 .get_sb = configfs_get_sb, 117 .mount = configfs_do_mount,
117 .kill_sb = kill_litter_super, 118 .kill_sb = kill_litter_super,
118}; 119};
119 120
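
Two API migrations land together in this file: sb->s_d_op makes d_alloc() install the dentry_operations automatically (which is why the per-dentry `d_op = &configfs_dentry_ops` assignments could be deleted from dir.c above), and .mount replaces .get_sb, returning the root dentry instead of filling in a vfsmount. A minimal conversion sketch for a hypothetical single-instance filesystem:

static const struct dentry_operations examplefs_dentry_ops = { };

static int examplefs_fill_super(struct super_block *sb, void *data,
				int silent)
{
	/* ... allocate the root inode and sb->s_root here ... */
	sb->s_d_op = &examplefs_dentry_ops;	/* inherited by d_alloc() */
	return 0;
}

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	return mount_single(fs_type, flags, data, examplefs_fill_super);
}

static struct file_system_type examplefs_fs_type = {
	.owner	 = THIS_MODULE,
	.name	 = "examplefs",
	.mount	 = examplefs_mount,
	.kill_sb = kill_litter_super,
};
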
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 1e7a33028d33..e141939080f0 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops;
34static DEFINE_MUTEX(read_mutex); 34static DEFINE_MUTEX(read_mutex);
35 35
36 36
37/* These two macros may change in future, to provide better st_ino 37/* These macros may change in future, to provide better st_ino semantics. */
38 semantics. */
39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1)
40#define OFFSET(x) ((x)->i_ino) 38#define OFFSET(x) ((x)->i_ino)
41 39
42static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode) 40static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset)
43{ 41{
42 if (!cino->offset)
43 return offset + 1;
44 if (!cino->size)
45 return offset + 1;
46
47 /*
48 * The file mode test fixes buggy mkcramfs implementations where
49 * cramfs_inode->offset is set to a non-zero value for entries
50 * that did not contain data, like device nodes and fifos.
51 */
52 switch (cino->mode & S_IFMT) {
53 case S_IFREG:
54 case S_IFDIR:
55 case S_IFLNK:
56 return cino->offset << 2;
57 default:
58 break;
59 }
60 return offset + 1;
61}
62
63static struct inode *get_cramfs_inode(struct super_block *sb,
64 struct cramfs_inode *cramfs_inode, unsigned int offset)
65{
66 struct inode *inode;
44 static struct timespec zerotime; 67 static struct timespec zerotime;
68
69 inode = iget_locked(sb, cramino(cramfs_inode, offset));
70 if (!inode)
71 return ERR_PTR(-ENOMEM);
72 if (!(inode->i_state & I_NEW))
73 return inode;
74
75 switch (cramfs_inode->mode & S_IFMT) {
76 case S_IFREG:
77 inode->i_fop = &generic_ro_fops;
78 inode->i_data.a_ops = &cramfs_aops;
79 break;
80 case S_IFDIR:
81 inode->i_op = &cramfs_dir_inode_operations;
82 inode->i_fop = &cramfs_directory_operations;
83 break;
84 case S_IFLNK:
85 inode->i_op = &page_symlink_inode_operations;
86 inode->i_data.a_ops = &cramfs_aops;
87 break;
88 default:
89 init_special_inode(inode, cramfs_inode->mode,
90 old_decode_dev(cramfs_inode->size));
91 }
92
45 inode->i_mode = cramfs_inode->mode; 93 inode->i_mode = cramfs_inode->mode;
46 inode->i_uid = cramfs_inode->uid; 94 inode->i_uid = cramfs_inode->uid;
47 inode->i_size = cramfs_inode->size;
48 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
49 inode->i_gid = cramfs_inode->gid; 95 inode->i_gid = cramfs_inode->gid;
96
97 /* if the lower 2 bits are zero, the inode contains data */
98 if (!(inode->i_ino & 3)) {
99 inode->i_size = cramfs_inode->size;
100 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
101 }
102
50 /* Struct copy intentional */ 103 /* Struct copy intentional */
51 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; 104 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
52 /* inode->i_nlink is left 1 - arguably wrong for directories, 105 /* inode->i_nlink is left 1 - arguably wrong for directories,
53 but it's the best we can do without reading the directory 106 but it's the best we can do without reading the directory
54 contents. 1 yields the right result in GNU find, even 107 contents. 1 yields the right result in GNU find, even
55 without -noleaf option. */ 108 without -noleaf option. */
56 if (S_ISREG(inode->i_mode)) {
57 inode->i_fop = &generic_ro_fops;
58 inode->i_data.a_ops = &cramfs_aops;
59 } else if (S_ISDIR(inode->i_mode)) {
60 inode->i_op = &cramfs_dir_inode_operations;
61 inode->i_fop = &cramfs_directory_operations;
62 } else if (S_ISLNK(inode->i_mode)) {
63 inode->i_op = &page_symlink_inode_operations;
64 inode->i_data.a_ops = &cramfs_aops;
65 } else {
66 init_special_inode(inode, inode->i_mode,
67 old_decode_dev(cramfs_inode->size));
68 }
69}
70 109
71static struct inode *get_cramfs_inode(struct super_block *sb, 110 unlock_new_inode(inode);
72 struct cramfs_inode * cramfs_inode) 111
73{
74 struct inode *inode;
75 if (CRAMINO(cramfs_inode) == 1) {
76 inode = new_inode(sb);
77 if (inode) {
78 inode->i_ino = 1;
79 setup_inode(inode, cramfs_inode);
80 }
81 } else {
82 inode = iget_locked(sb, CRAMINO(cramfs_inode));
83 if (inode && (inode->i_state & I_NEW)) {
84 setup_inode(inode, cramfs_inode);
85 unlock_new_inode(inode);
86 }
87 }
88 return inode; 112 return inode;
89} 113}
90 114
@@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
265 printk(KERN_ERR "cramfs: root is not a directory\n"); 289 printk(KERN_ERR "cramfs: root is not a directory\n");
266 goto out; 290 goto out;
267 } 291 }
292 /* correct strange, hard-coded permissions of mkcramfs */
293 super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
294
268 root_offset = super.root.offset << 2; 295 root_offset = super.root.offset << 2;
269 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { 296 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
270 sbi->size=super.size; 297 sbi->size=super.size;
@@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
289 316
290 /* Set it all up.. */ 317 /* Set it all up.. */
291 sb->s_op = &cramfs_ops; 318 sb->s_op = &cramfs_ops;
292 root = get_cramfs_inode(sb, &super.root); 319 root = get_cramfs_inode(sb, &super.root, 0);
293 if (!root) 320 if (!root)
294 goto out; 321 goto out;
295 sb->s_root = d_alloc_root(root); 322 sb->s_root = d_alloc_root(root);
@@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
365 */ 392 */
366 namelen = de->namelen << 2; 393 namelen = de->namelen << 2;
367 memcpy(buf, name, namelen); 394 memcpy(buf, name, namelen);
368 ino = CRAMINO(de); 395 ino = cramino(de, OFFSET(inode) + offset);
369 mode = de->mode; 396 mode = de->mode;
370 mutex_unlock(&read_mutex); 397 mutex_unlock(&read_mutex);
371 nextoffset = offset + sizeof(*de) + namelen; 398 nextoffset = offset + sizeof(*de) + namelen;
@@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
404 struct cramfs_inode *de; 431 struct cramfs_inode *de;
405 char *name; 432 char *name;
406 int namelen, retval; 433 int namelen, retval;
434 int dir_off = OFFSET(dir) + offset;
407 435
408 de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); 436 de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN);
409 name = (char *)(de+1); 437 name = (char *)(de+1);
410 438
411 /* Try to take advantage of sorted directories */ 439 /* Try to take advantage of sorted directories */
@@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
436 if (!retval) { 464 if (!retval) {
437 struct cramfs_inode entry = *de; 465 struct cramfs_inode entry = *de;
438 mutex_unlock(&read_mutex); 466 mutex_unlock(&read_mutex);
439 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry)); 467 d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off));
440 return NULL; 468 return NULL;
441 } 469 }
442 /* else (retval < 0) */ 470 /* else (retval < 0) */
@@ -533,17 +561,16 @@ static const struct super_operations cramfs_ops = {
533 .statfs = cramfs_statfs, 561 .statfs = cramfs_statfs,
534}; 562};
535 563
536static int cramfs_get_sb(struct file_system_type *fs_type, 564static struct dentry *cramfs_mount(struct file_system_type *fs_type,
537 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 565 int flags, const char *dev_name, void *data)
538{ 566{
539 return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super, 567 return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
540 mnt);
541} 568}
542 569
543static struct file_system_type cramfs_fs_type = { 570static struct file_system_type cramfs_fs_type = {
544 .owner = THIS_MODULE, 571 .owner = THIS_MODULE,
545 .name = "cramfs", 572 .name = "cramfs",
546 .get_sb = cramfs_get_sb, 573 .mount = cramfs_mount,
547 .kill_sb = kill_block_super, 574 .kill_sb = kill_block_super,
548 .fs_flags = FS_REQUIRES_DEV, 575 .fs_flags = FS_REQUIRES_DEV,
549}; 576};
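
The rewrite above folds cramfs's two inode paths (a special-cased new_inode() for the root, iget_locked() for everything else) into one iget_locked() flow, initializing an inode only when it comes back with I_NEW set. That pattern in general form, for a hypothetical read-only filesystem:

struct inode *examplefs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);		/* find in cache or allocate */
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))		/* cache hit: already set up */
		return inode;

	/* first use: fill in mode, ops and sizes from the on-disk inode;
	 * examplefs_read_inode() is a hypothetical helper */
	examplefs_read_inode(inode);

	unlock_new_inode(inode);		/* clears I_NEW, wakes waiters */
	return inode;
}
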
diff --git a/fs/dcache.c b/fs/dcache.c
index 83293be48149..2a6bd9a4ae97 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -33,20 +33,58 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
35#include <linux/hardirq.h> 35#include <linux/hardirq.h>
36#include <linux/bit_spinlock.h>
37#include <linux/rculist_bl.h>
36#include "internal.h" 38#include "internal.h"
37 39
40/*
41 * Usage:
42 * dcache->d_inode->i_lock protects:
43 * - i_dentry, d_alias, d_inode of aliases
44 * dcache_hash_bucket lock protects:
45 * - the dcache hash table
46 * s_anon bl list spinlock protects:
47 * - the s_anon list (see __d_drop)
48 * dcache_lru_lock protects:
49 * - the dcache lru lists and counters
50 * d_lock protects:
51 * - d_flags
52 * - d_name
53 * - d_lru
54 * - d_count
55 * - d_unhashed()
56 * - d_parent and d_subdirs
57 * - children's d_child and d_parent
58 * - d_alias, d_inode
59 *
60 * Ordering:
61 * dentry->d_inode->i_lock
62 * dentry->d_lock
63 * dcache_lru_lock
64 * dcache_hash_bucket lock
65 * s_anon lock
66 *
67 * If there is an ancestor relationship:
68 * dentry->d_parent->...->d_parent->d_lock
69 * ...
70 * dentry->d_parent->d_lock
71 * dentry->d_lock
72 *
73 * If no ancestor relationship:
74 * if (dentry1 < dentry2)
75 * dentry1->d_lock
76 * dentry2->d_lock
77 */
38int sysctl_vfs_cache_pressure __read_mostly = 100; 78int sysctl_vfs_cache_pressure __read_mostly = 100;
39EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); 79EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
40 80
41 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); 81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
42__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); 82__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
43 83
44EXPORT_SYMBOL(dcache_lock); 84EXPORT_SYMBOL(rename_lock);
45 85
46static struct kmem_cache *dentry_cache __read_mostly; 86static struct kmem_cache *dentry_cache __read_mostly;
47 87
48#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
49
50/* 88/*
51 * This is the single most critical data structure when it comes 89 * This is the single most critical data structure when it comes
52 * to the dcache: the hashtable for lookups. Somebody should try 90 * to the dcache: the hashtable for lookups. Somebody should try
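
The ordering comment above ends with the rule for dentries that share no ancestor relationship: take the locks in address order. A sketch of a helper enforcing that rule (the identical-dentry case included; this is an illustration, not the kernel's actual helper):

static void lock_two_dentries(struct dentry *d1, struct dentry *d2)
{
	if (d1 == d2) {
		spin_lock(&d1->d_lock);
		return;
	}
	/* address order makes the lock order globally consistent,
	 * so two tasks locking the same pair cannot deadlock */
	if (d1 < d2) {
		spin_lock(&d1->d_lock);
		spin_lock_nested(&d2->d_lock, DENTRY_D_LOCK_NESTED);
	} else {
		spin_lock(&d2->d_lock);
		spin_lock_nested(&d1->d_lock, DENTRY_D_LOCK_NESTED);
	}
}
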
@@ -60,56 +98,111 @@ static struct kmem_cache *dentry_cache __read_mostly;
60 98
61static unsigned int d_hash_mask __read_mostly; 99static unsigned int d_hash_mask __read_mostly;
62static unsigned int d_hash_shift __read_mostly; 100static unsigned int d_hash_shift __read_mostly;
63static struct hlist_head *dentry_hashtable __read_mostly; 101
102struct dcache_hash_bucket {
103 struct hlist_bl_head head;
104};
105static struct dcache_hash_bucket *dentry_hashtable __read_mostly;
106
107static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
108 unsigned long hash)
109{
110 hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
111 hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
112 return dentry_hashtable + (hash & D_HASHMASK);
113}
114
115static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
116{
117 bit_spin_lock(0, (unsigned long *)&b->head.first);
118}
119
120static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
121{
122 __bit_spin_unlock(0, (unsigned long *)&b->head.first);
123}
64 124
65/* Statistics gathering. */ 125/* Statistics gathering. */
66struct dentry_stat_t dentry_stat = { 126struct dentry_stat_t dentry_stat = {
67 .age_limit = 45, 127 .age_limit = 45,
68}; 128};
69 129
70static void __d_free(struct dentry *dentry) 130static DEFINE_PER_CPU(unsigned int, nr_dentry);
131
132#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
133static int get_nr_dentry(void)
134{
135 int i;
136 int sum = 0;
137 for_each_possible_cpu(i)
138 sum += per_cpu(nr_dentry, i);
139 return sum < 0 ? 0 : sum;
140}
141
142int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
143 size_t *lenp, loff_t *ppos)
71{ 144{
145 dentry_stat.nr_dentry = get_nr_dentry();
146 return proc_dointvec(table, write, buffer, lenp, ppos);
147}
148#endif
149
150static void __d_free(struct rcu_head *head)
151{
152 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
153
72 WARN_ON(!list_empty(&dentry->d_alias)); 154 WARN_ON(!list_empty(&dentry->d_alias));
73 if (dname_external(dentry)) 155 if (dname_external(dentry))
74 kfree(dentry->d_name.name); 156 kfree(dentry->d_name.name);
75 kmem_cache_free(dentry_cache, dentry); 157 kmem_cache_free(dentry_cache, dentry);
76} 158}
77 159
78static void d_callback(struct rcu_head *head)
79{
80 struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
81 __d_free(dentry);
82}
83
84/* 160/*
85 * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry 161 * no locks, please.
86 * inside dcache_lock.
87 */ 162 */
88static void d_free(struct dentry *dentry) 163static void d_free(struct dentry *dentry)
89{ 164{
165 BUG_ON(dentry->d_count);
166 this_cpu_dec(nr_dentry);
90 if (dentry->d_op && dentry->d_op->d_release) 167 if (dentry->d_op && dentry->d_op->d_release)
91 dentry->d_op->d_release(dentry); 168 dentry->d_op->d_release(dentry);
169
92 /* if dentry was never inserted into hash, immediate free is OK */ 170 /* if dentry was never inserted into hash, immediate free is OK */
93 if (hlist_unhashed(&dentry->d_hash)) 171 if (hlist_bl_unhashed(&dentry->d_hash))
94 __d_free(dentry); 172 __d_free(&dentry->d_u.d_rcu);
95 else 173 else
96 call_rcu(&dentry->d_u.d_rcu, d_callback); 174 call_rcu(&dentry->d_u.d_rcu, __d_free);
175}
176
177/**
178 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
179 * @dentry: the target dentry
180 * After this call, in-progress rcu-walk path lookup will fail. This
181 * should be called after unhashing, and after changing d_inode (if
182 * the dentry has not already been unhashed).
183 */
184static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
185{
186 assert_spin_locked(&dentry->d_lock);
187 /* Go through a barrier */
188 write_seqcount_barrier(&dentry->d_seq);
97} 189}
98 190
99/* 191/*
100 * Release the dentry's inode, using the filesystem 192 * Release the dentry's inode, using the filesystem
101 * d_iput() operation if defined. 193 * d_iput() operation if defined. Dentry has no refcount
194 * and is unhashed.
102 */ 195 */
103static void dentry_iput(struct dentry * dentry) 196static void dentry_iput(struct dentry * dentry)
104 __releases(dentry->d_lock) 197 __releases(dentry->d_lock)
105 __releases(dcache_lock) 198 __releases(dentry->d_inode->i_lock)
106{ 199{
107 struct inode *inode = dentry->d_inode; 200 struct inode *inode = dentry->d_inode;
108 if (inode) { 201 if (inode) {
109 dentry->d_inode = NULL; 202 dentry->d_inode = NULL;
110 list_del_init(&dentry->d_alias); 203 list_del_init(&dentry->d_alias);
111 spin_unlock(&dentry->d_lock); 204 spin_unlock(&dentry->d_lock);
112 spin_unlock(&dcache_lock); 205 spin_unlock(&inode->i_lock);
113 if (!inode->i_nlink) 206 if (!inode->i_nlink)
114 fsnotify_inoderemove(inode); 207 fsnotify_inoderemove(inode);
115 if (dentry->d_op && dentry->d_op->d_iput) 208 if (dentry->d_op && dentry->d_op->d_iput)
@@ -118,69 +211,191 @@ static void dentry_iput(struct dentry * dentry)
118 iput(inode); 211 iput(inode);
119 } else { 212 } else {
120 spin_unlock(&dentry->d_lock); 213 spin_unlock(&dentry->d_lock);
121 spin_unlock(&dcache_lock);
122 } 214 }
123} 215}
124 216
125/* 217/*
126 * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. 218 * Release the dentry's inode, using the filesystem
219 * d_iput() operation if defined. dentry remains in-use.
220 */
221static void dentry_unlink_inode(struct dentry * dentry)
222 __releases(dentry->d_lock)
223 __releases(dentry->d_inode->i_lock)
224{
225 struct inode *inode = dentry->d_inode;
226 dentry->d_inode = NULL;
227 list_del_init(&dentry->d_alias);
228 dentry_rcuwalk_barrier(dentry);
229 spin_unlock(&dentry->d_lock);
230 spin_unlock(&inode->i_lock);
231 if (!inode->i_nlink)
232 fsnotify_inoderemove(inode);
233 if (dentry->d_op && dentry->d_op->d_iput)
234 dentry->d_op->d_iput(dentry, inode);
235 else
236 iput(inode);
237}
238
239/*
240 * dentry_lru_(add|del|move_tail) must be called with d_lock held.
127 */ 241 */
128static void dentry_lru_add(struct dentry *dentry) 242static void dentry_lru_add(struct dentry *dentry)
129{ 243{
130 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 244 if (list_empty(&dentry->d_lru)) {
131 dentry->d_sb->s_nr_dentry_unused++; 245 spin_lock(&dcache_lru_lock);
132 dentry_stat.nr_unused++; 246 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
247 dentry->d_sb->s_nr_dentry_unused++;
248 dentry_stat.nr_unused++;
249 spin_unlock(&dcache_lru_lock);
250 }
133} 251}
134 252
135static void dentry_lru_add_tail(struct dentry *dentry) 253static void __dentry_lru_del(struct dentry *dentry)
136{ 254{
137 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 255 list_del_init(&dentry->d_lru);
138 dentry->d_sb->s_nr_dentry_unused++; 256 dentry->d_sb->s_nr_dentry_unused--;
139 dentry_stat.nr_unused++; 257 dentry_stat.nr_unused--;
140} 258}
141 259
142static void dentry_lru_del(struct dentry *dentry) 260static void dentry_lru_del(struct dentry *dentry)
143{ 261{
144 if (!list_empty(&dentry->d_lru)) { 262 if (!list_empty(&dentry->d_lru)) {
145 list_del(&dentry->d_lru); 263 spin_lock(&dcache_lru_lock);
146 dentry->d_sb->s_nr_dentry_unused--; 264 __dentry_lru_del(dentry);
147 dentry_stat.nr_unused--; 265 spin_unlock(&dcache_lru_lock);
148 } 266 }
149} 267}
150 268
151static void dentry_lru_del_init(struct dentry *dentry) 269static void dentry_lru_move_tail(struct dentry *dentry)
152{ 270{
153 if (likely(!list_empty(&dentry->d_lru))) { 271 spin_lock(&dcache_lru_lock);
154 list_del_init(&dentry->d_lru); 272 if (list_empty(&dentry->d_lru)) {
155 dentry->d_sb->s_nr_dentry_unused--; 273 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
156 dentry_stat.nr_unused--; 274 dentry->d_sb->s_nr_dentry_unused++;
275 dentry_stat.nr_unused++;
276 } else {
277 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
157 } 278 }
279 spin_unlock(&dcache_lru_lock);
158} 280}
159 281
160/** 282/**
161 * d_kill - kill dentry and return parent 283 * d_kill - kill dentry and return parent
162 * @dentry: dentry to kill 284 * @dentry: dentry to kill
285 * @parent: parent dentry
163 * 286 *
164 * The dentry must already be unhashed and removed from the LRU. 287 * The dentry must already be unhashed and removed from the LRU.
165 * 288 *
166 * If this is the root of the dentry tree, return NULL. 289 * If this is the root of the dentry tree, return NULL.
290 *
291 * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
292 * d_kill.
167 */ 293 */
168static struct dentry *d_kill(struct dentry *dentry) 294static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
169 __releases(dentry->d_lock) 295 __releases(dentry->d_lock)
170 __releases(dcache_lock) 296 __releases(parent->d_lock)
297 __releases(dentry->d_inode->i_lock)
171{ 298{
172 struct dentry *parent; 299 dentry->d_parent = NULL;
173
174 list_del(&dentry->d_u.d_child); 300 list_del(&dentry->d_u.d_child);
175 dentry_stat.nr_dentry--; /* For d_free, below */ 301 if (parent)
176 /*drops the locks, at that point nobody can reach this dentry */ 302 spin_unlock(&parent->d_lock);
177 dentry_iput(dentry); 303 dentry_iput(dentry);
304 /*
305 * dentry_iput drops the locks, at which point nobody (except
306 * transient RCU lookups) can reach this dentry.
307 */
308 d_free(dentry);
309 return parent;
310}
311
312/**
313 * d_drop - drop a dentry
314 * @dentry: dentry to drop
315 *
316 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
317 * be found through a VFS lookup any more. Note that this is different from
318 * deleting the dentry - d_delete will try to mark the dentry negative if
319 * possible, giving a successful _negative_ lookup, while d_drop will
320 * just make the cache lookup fail.
321 *
322 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
323 * reason (NFS timeouts or autofs deletes).
324 *
325 * __d_drop requires dentry->d_lock.
326 */
327void __d_drop(struct dentry *dentry)
328{
329 if (!(dentry->d_flags & DCACHE_UNHASHED)) {
330 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) {
331 bit_spin_lock(0,
332 (unsigned long *)&dentry->d_sb->s_anon.first);
333 dentry->d_flags |= DCACHE_UNHASHED;
334 hlist_bl_del_init(&dentry->d_hash);
335 __bit_spin_unlock(0,
336 (unsigned long *)&dentry->d_sb->s_anon.first);
337 } else {
338 struct dcache_hash_bucket *b;
339 b = d_hash(dentry->d_parent, dentry->d_name.hash);
340 spin_lock_bucket(b);
341 /*
342 * We may not actually need to put DCACHE_UNHASHED
343 * manipulations under the hash lock, but follow
344 * the principle of least surprise.
345 */
346 dentry->d_flags |= DCACHE_UNHASHED;
347 hlist_bl_del_rcu(&dentry->d_hash);
348 spin_unlock_bucket(b);
349 dentry_rcuwalk_barrier(dentry);
350 }
351 }
352}
353EXPORT_SYMBOL(__d_drop);
354
355void d_drop(struct dentry *dentry)
356{
357 spin_lock(&dentry->d_lock);
358 __d_drop(dentry);
359 spin_unlock(&dentry->d_lock);
360}
361EXPORT_SYMBOL(d_drop);
362
363/*
364 * Finish off a dentry we've decided to kill.
365 * dentry->d_lock must be held, returns with it unlocked.
366 * If ref is non-zero, then decrement the refcount too.
367 * Returns dentry requiring refcount drop, or NULL if we're done.
368 */
369static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
370 __releases(dentry->d_lock)
371{
372 struct inode *inode;
373 struct dentry *parent;
374
375 inode = dentry->d_inode;
376 if (inode && !spin_trylock(&inode->i_lock)) {
377relock:
378 spin_unlock(&dentry->d_lock);
379 cpu_relax();
380 return dentry; /* try again with same dentry */
381 }
178 if (IS_ROOT(dentry)) 382 if (IS_ROOT(dentry))
179 parent = NULL; 383 parent = NULL;
180 else 384 else
181 parent = dentry->d_parent; 385 parent = dentry->d_parent;
182 d_free(dentry); 386 if (parent && !spin_trylock(&parent->d_lock)) {
183 return parent; 387 if (inode)
388 spin_unlock(&inode->i_lock);
389 goto relock;
390 }
391
392 if (ref)
393 dentry->d_count--;
394 /* if dentry was on the d_lru list delete it from there */
395 dentry_lru_del(dentry);
396 /* if it was on the hash then remove it */
397 __d_drop(dentry);
398 return d_kill(dentry, parent);
184} 399}
185 400
186/* 401/*
@@ -208,52 +423,42 @@ static struct dentry *d_kill(struct dentry *dentry)
208 * call the dentry unlink method as well as removing it from the queues and 423 * call the dentry unlink method as well as removing it from the queues and
209 * releasing its resources. If the parent dentries were scheduled for release 424 * releasing its resources. If the parent dentries were scheduled for release
210 * they too may now get deleted. 425 * they too may now get deleted.
211 *
212 * no dcache lock, please.
213 */ 426 */
214
215void dput(struct dentry *dentry) 427void dput(struct dentry *dentry)
216{ 428{
217 if (!dentry) 429 if (!dentry)
218 return; 430 return;
219 431
220repeat: 432repeat:
221 if (atomic_read(&dentry->d_count) == 1) 433 if (dentry->d_count == 1)
222 might_sleep(); 434 might_sleep();
223 if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
224 return;
225
226 spin_lock(&dentry->d_lock); 435 spin_lock(&dentry->d_lock);
227 if (atomic_read(&dentry->d_count)) { 436 BUG_ON(!dentry->d_count);
437 if (dentry->d_count > 1) {
438 dentry->d_count--;
228 spin_unlock(&dentry->d_lock); 439 spin_unlock(&dentry->d_lock);
229 spin_unlock(&dcache_lock);
230 return; 440 return;
231 } 441 }
232 442
233 /* 443 if (dentry->d_flags & DCACHE_OP_DELETE) {
234 * AV: ->d_delete() is _NOT_ allowed to block now.
235 */
236 if (dentry->d_op && dentry->d_op->d_delete) {
237 if (dentry->d_op->d_delete(dentry)) 444 if (dentry->d_op->d_delete(dentry))
238 goto unhash_it; 445 goto kill_it;
239 } 446 }
447
240 /* Unreachable? Get rid of it */ 448 /* Unreachable? Get rid of it */
241 if (d_unhashed(dentry)) 449 if (d_unhashed(dentry))
242 goto kill_it; 450 goto kill_it;
243 if (list_empty(&dentry->d_lru)) { 451
244 dentry->d_flags |= DCACHE_REFERENCED; 452 /* Otherwise leave it cached and ensure it's on the LRU */
245 dentry_lru_add(dentry); 453 dentry->d_flags |= DCACHE_REFERENCED;
246 } 454 dentry_lru_add(dentry);
247 spin_unlock(&dentry->d_lock); 455
248 spin_unlock(&dcache_lock); 456 dentry->d_count--;
457 spin_unlock(&dentry->d_lock);
249 return; 458 return;
250 459
251unhash_it:
252 __d_drop(dentry);
253kill_it: 460kill_it:
254 /* if dentry was on the d_lru list delete it from there */ 461 dentry = dentry_kill(dentry, 1);
255 dentry_lru_del(dentry);
256 dentry = d_kill(dentry);
257 if (dentry) 462 if (dentry)
258 goto repeat; 463 goto repeat;
259} 464}
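
Note the DCACHE_OP_DELETE test in the new dput(): instead of two dependent pointer loads (d_op, then d_op->d_delete) on a hot path, the presence of the hook is cached as a flag bit when the operations are installed. A sketch of how such a flag would be maintained (the helper name is hypothetical):

static void example_set_d_op(struct dentry *dentry,
			     const struct dentry_operations *op)
{
	dentry->d_op = op;
	if (op && op->d_delete)
		dentry->d_flags |= DCACHE_OP_DELETE;
	else
		dentry->d_flags &= ~DCACHE_OP_DELETE;
}
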
@@ -276,9 +481,9 @@ int d_invalidate(struct dentry * dentry)
276 /* 481 /*
277 * If it's already been dropped, return OK. 482 * If it's already been dropped, return OK.
278 */ 483 */
279 spin_lock(&dcache_lock); 484 spin_lock(&dentry->d_lock);
280 if (d_unhashed(dentry)) { 485 if (d_unhashed(dentry)) {
281 spin_unlock(&dcache_lock); 486 spin_unlock(&dentry->d_lock);
282 return 0; 487 return 0;
283 } 488 }
284 /* 489 /*
@@ -286,9 +491,9 @@ int d_invalidate(struct dentry * dentry)
286 * to get rid of unused child entries. 491 * to get rid of unused child entries.
287 */ 492 */
288 if (!list_empty(&dentry->d_subdirs)) { 493 if (!list_empty(&dentry->d_subdirs)) {
289 spin_unlock(&dcache_lock); 494 spin_unlock(&dentry->d_lock);
290 shrink_dcache_parent(dentry); 495 shrink_dcache_parent(dentry);
291 spin_lock(&dcache_lock); 496 spin_lock(&dentry->d_lock);
292 } 497 }
293 498
294 /* 499 /*
@@ -301,36 +506,61 @@ int d_invalidate(struct dentry * dentry)
301 * we might still populate it if it was a 506 * we might still populate it if it was a
302 * working directory or similar). 507 * working directory or similar).
303 */ 508 */
304 spin_lock(&dentry->d_lock); 509 if (dentry->d_count > 1) {
305 if (atomic_read(&dentry->d_count) > 1) {
306 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { 510 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
307 spin_unlock(&dentry->d_lock); 511 spin_unlock(&dentry->d_lock);
308 spin_unlock(&dcache_lock);
309 return -EBUSY; 512 return -EBUSY;
310 } 513 }
311 } 514 }
312 515
313 __d_drop(dentry); 516 __d_drop(dentry);
314 spin_unlock(&dentry->d_lock); 517 spin_unlock(&dentry->d_lock);
315 spin_unlock(&dcache_lock);
316 return 0; 518 return 0;
317} 519}
318EXPORT_SYMBOL(d_invalidate); 520EXPORT_SYMBOL(d_invalidate);
319 521
320/* This should be called _only_ with dcache_lock held */ 522/* This must be called with d_lock held */
523static inline void __dget_dlock(struct dentry *dentry)
524{
525 dentry->d_count++;
526}
321 527
322static inline struct dentry * __dget_locked(struct dentry *dentry) 528static inline void __dget(struct dentry *dentry)
323{ 529{
324 atomic_inc(&dentry->d_count); 530 spin_lock(&dentry->d_lock);
325 dentry_lru_del_init(dentry); 531 __dget_dlock(dentry);
326 return dentry; 532 spin_unlock(&dentry->d_lock);
327} 533}
328 534
329struct dentry * dget_locked(struct dentry *dentry) 535struct dentry *dget_parent(struct dentry *dentry)
330{ 536{
331 return __dget_locked(dentry); 537 struct dentry *ret;
538
539repeat:
540 /*
541 * Don't need rcu_dereference because we re-check it was correct under
542 * the lock.
543 */
544 rcu_read_lock();
545 ret = dentry->d_parent;
546 if (!ret) {
547 rcu_read_unlock();
548 goto out;
549 }
550 spin_lock(&ret->d_lock);
551 if (unlikely(ret != dentry->d_parent)) {
552 spin_unlock(&ret->d_lock);
553 rcu_read_unlock();
554 goto repeat;
555 }
556 rcu_read_unlock();
557 BUG_ON(!ret->d_count);
558 ret->d_count++;
559 spin_unlock(&ret->d_lock);
560out:
561 return ret;
332} 562}
333EXPORT_SYMBOL(dget_locked); 563EXPORT_SYMBOL(dget_parent);
334 564
335/** 565/**
336 * d_find_alias - grab a hashed alias of inode 566 * d_find_alias - grab a hashed alias of inode
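
dget_parent() above is a textbook optimistic-read loop: sample d_parent under rcu_read_lock(), lock the candidate, and re-check that d_parent still points at it, retrying if a concurrent rename moved the dentry (RCU keeps the candidate's memory valid while it is being locked). The pattern generalized to a hypothetical RCU-freed node type:

struct node {
	struct node __rcu *parent;
	spinlock_t lock;
	int refcount;
};

static struct node *get_stable_parent(struct node *n)
{
	struct node *parent;

repeat:
	rcu_read_lock();
	parent = rcu_dereference(n->parent);	/* may change under us */
	if (!parent) {
		rcu_read_unlock();
		return NULL;
	}
	spin_lock(&parent->lock);	/* safe: RCU keeps parent alive */
	if (parent != n->parent) {	/* re-check under the lock */
		spin_unlock(&parent->lock);
		rcu_read_unlock();
		goto repeat;
	}
	rcu_read_unlock();
	parent->refcount++;		/* pin while locked */
	spin_unlock(&parent->lock);
	return parent;
}
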
@@ -348,42 +578,51 @@ EXPORT_SYMBOL(dget_locked);
348 * any other hashed alias over that one unless @want_discon is set, 578 * any other hashed alias over that one unless @want_discon is set,
349 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. 579 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
350 */ 580 */
351 581static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
352static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
353{ 582{
354 struct list_head *head, *next, *tmp; 583 struct dentry *alias, *discon_alias;
355 struct dentry *alias, *discon_alias=NULL;
356 584
357 head = &inode->i_dentry; 585again:
358 next = inode->i_dentry.next; 586 discon_alias = NULL;
359 while (next != head) { 587 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
360 tmp = next; 588 spin_lock(&alias->d_lock);
361 next = tmp->next;
362 prefetch(next);
363 alias = list_entry(tmp, struct dentry, d_alias);
364 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { 589 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
365 if (IS_ROOT(alias) && 590 if (IS_ROOT(alias) &&
366 (alias->d_flags & DCACHE_DISCONNECTED)) 591 (alias->d_flags & DCACHE_DISCONNECTED)) {
367 discon_alias = alias; 592 discon_alias = alias;
368 else if (!want_discon) { 593 } else if (!want_discon) {
369 __dget_locked(alias); 594 __dget_dlock(alias);
595 spin_unlock(&alias->d_lock);
596 return alias;
597 }
598 }
599 spin_unlock(&alias->d_lock);
600 }
601 if (discon_alias) {
602 alias = discon_alias;
603 spin_lock(&alias->d_lock);
604 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
605 if (IS_ROOT(alias) &&
606 (alias->d_flags & DCACHE_DISCONNECTED)) {
607 __dget_dlock(alias);
608 spin_unlock(&alias->d_lock);
370 return alias; 609 return alias;
371 } 610 }
372 } 611 }
612 spin_unlock(&alias->d_lock);
613 goto again;
373 } 614 }
374 if (discon_alias) 615 return NULL;
375 __dget_locked(discon_alias);
376 return discon_alias;
377} 616}
378 617
379struct dentry * d_find_alias(struct inode *inode) 618struct dentry *d_find_alias(struct inode *inode)
380{ 619{
381 struct dentry *de = NULL; 620 struct dentry *de = NULL;
382 621
383 if (!list_empty(&inode->i_dentry)) { 622 if (!list_empty(&inode->i_dentry)) {
384 spin_lock(&dcache_lock); 623 spin_lock(&inode->i_lock);
385 de = __d_find_alias(inode, 0); 624 de = __d_find_alias(inode, 0);
386 spin_unlock(&dcache_lock); 625 spin_unlock(&inode->i_lock);
387 } 626 }
388 return de; 627 return de;
389} 628}
@@ -397,132 +636,153 @@ void d_prune_aliases(struct inode *inode)
397{ 636{
398 struct dentry *dentry; 637 struct dentry *dentry;
399restart: 638restart:
400 spin_lock(&dcache_lock); 639 spin_lock(&inode->i_lock);
401 list_for_each_entry(dentry, &inode->i_dentry, d_alias) { 640 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
402 spin_lock(&dentry->d_lock); 641 spin_lock(&dentry->d_lock);
403 if (!atomic_read(&dentry->d_count)) { 642 if (!dentry->d_count) {
404 __dget_locked(dentry); 643 __dget_dlock(dentry);
405 __d_drop(dentry); 644 __d_drop(dentry);
406 spin_unlock(&dentry->d_lock); 645 spin_unlock(&dentry->d_lock);
407 spin_unlock(&dcache_lock); 646 spin_unlock(&inode->i_lock);
408 dput(dentry); 647 dput(dentry);
409 goto restart; 648 goto restart;
410 } 649 }
411 spin_unlock(&dentry->d_lock); 650 spin_unlock(&dentry->d_lock);
412 } 651 }
413 spin_unlock(&dcache_lock); 652 spin_unlock(&inode->i_lock);
414} 653}
415EXPORT_SYMBOL(d_prune_aliases); 654EXPORT_SYMBOL(d_prune_aliases);
416 655
417/* 656/*
418 * Throw away a dentry - free the inode, dput the parent. This requires that 657 * Try to throw away a dentry - free the inode, dput the parent.
419 * the LRU list has already been removed. 658 * Requires dentry->d_lock is held, and dentry->d_count == 0.
659 * Releases dentry->d_lock.
420 * 660 *
421 * Try to prune ancestors as well. This is necessary to prevent 661 * This may fail if locks cannot be acquired; no problem, just try again.
422 * quadratic behavior of shrink_dcache_parent(), but is also expected
423 * to be beneficial in reducing dentry cache fragmentation.
424 */ 662 */
425static void prune_one_dentry(struct dentry * dentry) 663static void try_prune_one_dentry(struct dentry *dentry)
426 __releases(dentry->d_lock) 664 __releases(dentry->d_lock)
427 __releases(dcache_lock)
428 __acquires(dcache_lock)
429{ 665{
430 __d_drop(dentry); 666 struct dentry *parent;
431 dentry = d_kill(dentry);
432 667
668 parent = dentry_kill(dentry, 0);
433 /* 669 /*
434 * Prune ancestors. Locking is simpler than in dput(), 670 * If dentry_kill returns NULL, we have nothing more to do.
435 * because dcache_lock needs to be taken anyway. 671 * If it returns the same dentry, trylocks failed. In either
672 * case, just loop again.
673 *
674 * Otherwise, we need to prune ancestors too. This is necessary
675 * to prevent quadratic behavior of shrink_dcache_parent(), but
676 * is also expected to be beneficial in reducing dentry cache
677 * fragmentation.
436 */ 678 */
437 spin_lock(&dcache_lock); 679 if (!parent)
680 return;
681 if (parent == dentry)
682 return;
683
684 /* Prune ancestors. */
685 dentry = parent;
438 while (dentry) { 686 while (dentry) {
439 if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) 687 spin_lock(&dentry->d_lock);
688 if (dentry->d_count > 1) {
689 dentry->d_count--;
690 spin_unlock(&dentry->d_lock);
440 return; 691 return;
441 692 }
442 if (dentry->d_op && dentry->d_op->d_delete) 693 dentry = dentry_kill(dentry, 1);
443 dentry->d_op->d_delete(dentry);
444 dentry_lru_del_init(dentry);
445 __d_drop(dentry);
446 dentry = d_kill(dentry);
447 spin_lock(&dcache_lock);
448 } 694 }
449} 695}
450 696
451/* 697static void shrink_dentry_list(struct list_head *list)
452 * Shrink the dentry LRU on a given superblock.
453 * @sb : superblock to shrink dentry LRU.
454 * @count: If count is NULL, we prune all dentries on superblock.
455 * @flags: If flags is non-zero, we need to do special processing based on
456 * which flags are set. This means we don't need to maintain multiple
457 * similar copies of this loop.
458 */
459static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
460{ 698{
461 LIST_HEAD(referenced);
462 LIST_HEAD(tmp);
463 struct dentry *dentry; 699 struct dentry *dentry;
464 int cnt = 0;
465
466 BUG_ON(!sb);
467 BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
468 spin_lock(&dcache_lock);
469 if (count != NULL)
470 /* called from prune_dcache() and shrink_dcache_parent() */
471 cnt = *count;
472restart:
473 if (count == NULL)
474 list_splice_init(&sb->s_dentry_lru, &tmp);
475 else {
476 while (!list_empty(&sb->s_dentry_lru)) {
477 dentry = list_entry(sb->s_dentry_lru.prev,
478 struct dentry, d_lru);
479 BUG_ON(dentry->d_sb != sb);
480 700
481 spin_lock(&dentry->d_lock); 701 rcu_read_lock();
482 /* 702 for (;;) {
483 * If we are honouring the DCACHE_REFERENCED flag and 703 dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
484 * the dentry has this flag set, don't free it. Clear 704 if (&dentry->d_lru == list)
485 * the flag and put it back on the LRU. 705 break; /* empty */
486 */
487 if ((flags & DCACHE_REFERENCED)
488 && (dentry->d_flags & DCACHE_REFERENCED)) {
489 dentry->d_flags &= ~DCACHE_REFERENCED;
490 list_move(&dentry->d_lru, &referenced);
491 spin_unlock(&dentry->d_lock);
492 } else {
493 list_move_tail(&dentry->d_lru, &tmp);
494 spin_unlock(&dentry->d_lock);
495 cnt--;
496 if (!cnt)
497 break;
498 }
499 cond_resched_lock(&dcache_lock);
500 }
501 }
502 while (!list_empty(&tmp)) {
503 dentry = list_entry(tmp.prev, struct dentry, d_lru);
504 dentry_lru_del_init(dentry);
505 spin_lock(&dentry->d_lock); 706 spin_lock(&dentry->d_lock);
707 if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
708 spin_unlock(&dentry->d_lock);
709 continue;
710 }
711
506 /* 712 /*
507 * We found an inuse dentry which was not removed from 713 * We found an inuse dentry which was not removed from
508 * the LRU because of laziness during lookup. Do not free 714 * the LRU because of laziness during lookup. Do not free
509 * it - just keep it off the LRU list. 715 * it - just keep it off the LRU list.
510 */ 716 */
511 if (atomic_read(&dentry->d_count)) { 717 if (dentry->d_count) {
718 dentry_lru_del(dentry);
512 spin_unlock(&dentry->d_lock); 719 spin_unlock(&dentry->d_lock);
513 continue; 720 continue;
514 } 721 }
515 prune_one_dentry(dentry); 722
516 /* dentry->d_lock was dropped in prune_one_dentry() */ 723 rcu_read_unlock();
517 cond_resched_lock(&dcache_lock); 724
518 } 725 try_prune_one_dentry(dentry);
519 if (count == NULL && !list_empty(&sb->s_dentry_lru)) 726
520 goto restart; 727 rcu_read_lock();
521 if (count != NULL) 728 }
522 *count = cnt; 729 rcu_read_unlock();
730}
731
732/**
733 * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
734 * @sb: superblock to shrink dentry LRU.
735 * @count: number of entries to prune
736 * @flags: flags to control the dentry processing
737 *
738 * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
739 */
740static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
741{
742 /* called from prune_dcache() and shrink_dcache_parent() */
743 struct dentry *dentry;
744 LIST_HEAD(referenced);
745 LIST_HEAD(tmp);
746 int cnt = *count;
747
748relock:
749 spin_lock(&dcache_lru_lock);
750 while (!list_empty(&sb->s_dentry_lru)) {
751 dentry = list_entry(sb->s_dentry_lru.prev,
752 struct dentry, d_lru);
753 BUG_ON(dentry->d_sb != sb);
754
755 if (!spin_trylock(&dentry->d_lock)) {
756 spin_unlock(&dcache_lru_lock);
757 cpu_relax();
758 goto relock;
759 }
760
761 /*
762 * If we are honouring the DCACHE_REFERENCED flag and the
763 * dentry has this flag set, don't free it. Clear the flag
764 * and put it back on the LRU.
765 */
766 if (flags & DCACHE_REFERENCED &&
767 dentry->d_flags & DCACHE_REFERENCED) {
768 dentry->d_flags &= ~DCACHE_REFERENCED;
769 list_move(&dentry->d_lru, &referenced);
770 spin_unlock(&dentry->d_lock);
771 } else {
772 list_move_tail(&dentry->d_lru, &tmp);
773 spin_unlock(&dentry->d_lock);
774 if (!--cnt)
775 break;
776 }
777 cond_resched_lock(&dcache_lru_lock);
778 }
523 if (!list_empty(&referenced)) 779 if (!list_empty(&referenced))
524 list_splice(&referenced, &sb->s_dentry_lru); 780 list_splice(&referenced, &sb->s_dentry_lru);
525 spin_unlock(&dcache_lock); 781 spin_unlock(&dcache_lru_lock);
782
783 shrink_dentry_list(&tmp);
784
785 *count = cnt;
526} 786}
527 787
528/** 788/**
@@ -544,7 +804,6 @@ static void prune_dcache(int count)
544 804
545 if (unused == 0 || count == 0) 805 if (unused == 0 || count == 0)
546 return; 806 return;
547 spin_lock(&dcache_lock);
548 if (count >= unused) 807 if (count >= unused)
549 prune_ratio = 1; 808 prune_ratio = 1;
550 else 809 else
@@ -581,11 +840,9 @@ static void prune_dcache(int count)
581 if (down_read_trylock(&sb->s_umount)) { 840 if (down_read_trylock(&sb->s_umount)) {
582 if ((sb->s_root != NULL) && 841 if ((sb->s_root != NULL) &&
583 (!list_empty(&sb->s_dentry_lru))) { 842 (!list_empty(&sb->s_dentry_lru))) {
584 spin_unlock(&dcache_lock);
585 __shrink_dcache_sb(sb, &w_count, 843 __shrink_dcache_sb(sb, &w_count,
586 DCACHE_REFERENCED); 844 DCACHE_REFERENCED);
587 pruned -= w_count; 845 pruned -= w_count;
588 spin_lock(&dcache_lock);
589 } 846 }
590 up_read(&sb->s_umount); 847 up_read(&sb->s_umount);
591 } 848 }
@@ -601,20 +858,27 @@ static void prune_dcache(int count)
601 if (p) 858 if (p)
602 __put_super(p); 859 __put_super(p);
603 spin_unlock(&sb_lock); 860 spin_unlock(&sb_lock);
604 spin_unlock(&dcache_lock);
605} 861}
606 862
607/** 863/**
608 * shrink_dcache_sb - shrink dcache for a superblock 864 * shrink_dcache_sb - shrink dcache for a superblock
609 * @sb: superblock 865 * @sb: superblock
610 * 866 *
611 * Shrink the dcache for the specified super block. This 867 * Shrink the dcache for the specified super block. This is used to free
612 * is used to free the dcache before unmounting a file 868 * the dcache before unmounting a file system.
613 * system
614 */ 869 */
615void shrink_dcache_sb(struct super_block * sb) 870void shrink_dcache_sb(struct super_block *sb)
616{ 871{
617 __shrink_dcache_sb(sb, NULL, 0); 872 LIST_HEAD(tmp);
873
874 spin_lock(&dcache_lru_lock);
875 while (!list_empty(&sb->s_dentry_lru)) {
876 list_splice_init(&sb->s_dentry_lru, &tmp);
877 spin_unlock(&dcache_lru_lock);
878 shrink_dentry_list(&tmp);
879 spin_lock(&dcache_lru_lock);
880 }
881 spin_unlock(&dcache_lru_lock);
618} 882}
619EXPORT_SYMBOL(shrink_dcache_sb); 883EXPORT_SYMBOL(shrink_dcache_sb);
620 884
@@ -631,10 +895,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
631 BUG_ON(!IS_ROOT(dentry)); 895 BUG_ON(!IS_ROOT(dentry));
632 896
633 /* detach this root from the system */ 897 /* detach this root from the system */
634 spin_lock(&dcache_lock); 898 spin_lock(&dentry->d_lock);
635 dentry_lru_del_init(dentry); 899 dentry_lru_del(dentry);
636 __d_drop(dentry); 900 __d_drop(dentry);
637 spin_unlock(&dcache_lock); 901 spin_unlock(&dentry->d_lock);
638 902
639 for (;;) { 903 for (;;) {
640 /* descend to the first leaf in the current subtree */ 904 /* descend to the first leaf in the current subtree */
@@ -643,14 +907,16 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
643 907
644 /* this is a branch with children - detach all of them 908 /* this is a branch with children - detach all of them
645 * from the system in one go */ 909 * from the system in one go */
646 spin_lock(&dcache_lock); 910 spin_lock(&dentry->d_lock);
647 list_for_each_entry(loop, &dentry->d_subdirs, 911 list_for_each_entry(loop, &dentry->d_subdirs,
648 d_u.d_child) { 912 d_u.d_child) {
649 dentry_lru_del_init(loop); 913 spin_lock_nested(&loop->d_lock,
914 DENTRY_D_LOCK_NESTED);
915 dentry_lru_del(loop);
650 __d_drop(loop); 916 __d_drop(loop);
651 cond_resched_lock(&dcache_lock); 917 spin_unlock(&loop->d_lock);
652 } 918 }
653 spin_unlock(&dcache_lock); 919 spin_unlock(&dentry->d_lock);
654 920
655 /* move to the first child */ 921 /* move to the first child */
656 dentry = list_entry(dentry->d_subdirs.next, 922 dentry = list_entry(dentry->d_subdirs.next,
@@ -662,7 +928,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
662 do { 928 do {
663 struct inode *inode; 929 struct inode *inode;
664 930
665 if (atomic_read(&dentry->d_count) != 0) { 931 if (dentry->d_count != 0) {
666 printk(KERN_ERR 932 printk(KERN_ERR
667 "BUG: Dentry %p{i=%lx,n=%s}" 933 "BUG: Dentry %p{i=%lx,n=%s}"
668 " still in use (%d)" 934 " still in use (%d)"
@@ -671,20 +937,23 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
671 dentry->d_inode ? 937 dentry->d_inode ?
672 dentry->d_inode->i_ino : 0UL, 938 dentry->d_inode->i_ino : 0UL,
673 dentry->d_name.name, 939 dentry->d_name.name,
674 atomic_read(&dentry->d_count), 940 dentry->d_count,
675 dentry->d_sb->s_type->name, 941 dentry->d_sb->s_type->name,
676 dentry->d_sb->s_id); 942 dentry->d_sb->s_id);
677 BUG(); 943 BUG();
678 } 944 }
679 945
680 if (IS_ROOT(dentry)) 946 if (IS_ROOT(dentry)) {
681 parent = NULL; 947 parent = NULL;
682 else { 948 list_del(&dentry->d_u.d_child);
949 } else {
683 parent = dentry->d_parent; 950 parent = dentry->d_parent;
684 atomic_dec(&parent->d_count); 951 spin_lock(&parent->d_lock);
952 parent->d_count--;
953 list_del(&dentry->d_u.d_child);
954 spin_unlock(&parent->d_lock);
685 } 955 }
686 956
687 list_del(&dentry->d_u.d_child);
688 detached++; 957 detached++;
689 958
690 inode = dentry->d_inode; 959 inode = dentry->d_inode;
@@ -703,26 +972,18 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
703 * otherwise we ascend to the parent and move to the 972 * otherwise we ascend to the parent and move to the
704 * next sibling if there is one */ 973 * next sibling if there is one */
705 if (!parent) 974 if (!parent)
706 goto out; 975 return;
707
708 dentry = parent; 976 dentry = parent;
709
710 } while (list_empty(&dentry->d_subdirs)); 977 } while (list_empty(&dentry->d_subdirs));
711 978
712 dentry = list_entry(dentry->d_subdirs.next, 979 dentry = list_entry(dentry->d_subdirs.next,
713 struct dentry, d_u.d_child); 980 struct dentry, d_u.d_child);
714 } 981 }
715out:
716 /* several dentries were freed, need to correct nr_dentry */
717 spin_lock(&dcache_lock);
718 dentry_stat.nr_dentry -= detached;
719 spin_unlock(&dcache_lock);
720} 982}
721 983
722/* 984/*
723 * destroy the dentries attached to a superblock on unmounting 985 * destroy the dentries attached to a superblock on unmounting
724 * - we don't need to use dentry->d_lock, and only need dcache_lock when 986 * - we don't need dentry->d_lock for exclusion (nothing else can reach these dentries) because:
725 * removing the dentry from the system lists and hashes because:
726 * - the superblock is detached from all mountings and open files, so the 987 * - the superblock is detached from all mountings and open files, so the
727 * dentry trees will not be rearranged by the VFS 988 * dentry trees will not be rearranged by the VFS
728 * - s_umount is write-locked, so the memory pressure shrinker will ignore 989 * - s_umount is write-locked, so the memory pressure shrinker will ignore
@@ -739,11 +1000,13 @@ void shrink_dcache_for_umount(struct super_block *sb)
739 1000
740 dentry = sb->s_root; 1001 dentry = sb->s_root;
741 sb->s_root = NULL; 1002 sb->s_root = NULL;
742 atomic_dec(&dentry->d_count); 1003 spin_lock(&dentry->d_lock);
1004 dentry->d_count--;
1005 spin_unlock(&dentry->d_lock);
743 shrink_dcache_for_umount_subtree(dentry); 1006 shrink_dcache_for_umount_subtree(dentry);
744 1007
745 while (!hlist_empty(&sb->s_anon)) { 1008 while (!hlist_bl_empty(&sb->s_anon)) {
746 dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash); 1009 dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
747 shrink_dcache_for_umount_subtree(dentry); 1010 shrink_dcache_for_umount_subtree(dentry);
748 } 1011 }
749} 1012}
@@ -761,15 +1024,20 @@ void shrink_dcache_for_umount(struct super_block *sb)
761 * Return true if the parent or its subdirectories contain 1024 * Return true if the parent or its subdirectories contain
762 * a mount point 1025 * a mount point
763 */ 1026 */
764
765int have_submounts(struct dentry *parent) 1027int have_submounts(struct dentry *parent)
766{ 1028{
767 struct dentry *this_parent = parent; 1029 struct dentry *this_parent;
768 struct list_head *next; 1030 struct list_head *next;
1031 unsigned seq;
1032 int locked = 0;
1033
1034 seq = read_seqbegin(&rename_lock);
1035again:
1036 this_parent = parent;
769 1037
770 spin_lock(&dcache_lock);
771 if (d_mountpoint(parent)) 1038 if (d_mountpoint(parent))
772 goto positive; 1039 goto positive;
1040 spin_lock(&this_parent->d_lock);
773repeat: 1041repeat:
774 next = this_parent->d_subdirs.next; 1042 next = this_parent->d_subdirs.next;
775resume: 1043resume:
@@ -777,27 +1045,65 @@ resume:
777 struct list_head *tmp = next; 1045 struct list_head *tmp = next;
778 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 1046 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
779 next = tmp->next; 1047 next = tmp->next;
1048
1049 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
780 /* Have we found a mount point ? */ 1050 /* Have we found a mount point ? */
781 if (d_mountpoint(dentry)) 1051 if (d_mountpoint(dentry)) {
1052 spin_unlock(&dentry->d_lock);
1053 spin_unlock(&this_parent->d_lock);
782 goto positive; 1054 goto positive;
1055 }
783 if (!list_empty(&dentry->d_subdirs)) { 1056 if (!list_empty(&dentry->d_subdirs)) {
1057 spin_unlock(&this_parent->d_lock);
1058 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
784 this_parent = dentry; 1059 this_parent = dentry;
1060 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
785 goto repeat; 1061 goto repeat;
786 } 1062 }
1063 spin_unlock(&dentry->d_lock);
787 } 1064 }
788 /* 1065 /*
789 * All done at this level ... ascend and resume the search. 1066 * All done at this level ... ascend and resume the search.
790 */ 1067 */
791 if (this_parent != parent) { 1068 if (this_parent != parent) {
792 next = this_parent->d_u.d_child.next; 1069 struct dentry *tmp;
793 this_parent = this_parent->d_parent; 1070 struct dentry *child;
1071
1072 tmp = this_parent->d_parent;
1073 rcu_read_lock();
1074 spin_unlock(&this_parent->d_lock);
1075 child = this_parent;
1076 this_parent = tmp;
1077 spin_lock(&this_parent->d_lock);
1078 /* might go back up the wrong parent if we have had a rename
1079 * or deletion */
1080 if (this_parent != child->d_parent ||
1081 (!locked && read_seqretry(&rename_lock, seq))) {
1082 spin_unlock(&this_parent->d_lock);
1083 rcu_read_unlock();
1084 goto rename_retry;
1085 }
1086 rcu_read_unlock();
1087 next = child->d_u.d_child.next;
794 goto resume; 1088 goto resume;
795 } 1089 }
796 spin_unlock(&dcache_lock); 1090 spin_unlock(&this_parent->d_lock);
1091 if (!locked && read_seqretry(&rename_lock, seq))
1092 goto rename_retry;
1093 if (locked)
1094 write_sequnlock(&rename_lock);
797 return 0; /* No mount points found in tree */ 1095 return 0; /* No mount points found in tree */
798positive: 1096positive:
799 spin_unlock(&dcache_lock); 1097 if (!locked && read_seqretry(&rename_lock, seq))
1098 goto rename_retry;
1099 if (locked)
1100 write_sequnlock(&rename_lock);
800 return 1; 1101 return 1;
1102
1103rename_retry:
1104 locked = 1;
1105 write_seqlock(&rename_lock);
1106 goto again;
801} 1107}
802EXPORT_SYMBOL(have_submounts); 1108EXPORT_SYMBOL(have_submounts);
803 1109
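have_submounts() above (and select_parent() further down) guard their tree walks against concurrent renames with the same two-phase scheme: walk under a rename_lock read sequence first, and only if the sequence changed redo the walk holding rename_lock for write, which excludes renames outright. The skeleton, with walk_tree() as a hypothetical stand-in for the traversal body:

        #include <linux/seqlock.h>

        static int walk_with_rename_retry(struct dentry *parent)
        {
                unsigned seq;
                int locked = 0;
                int ret;

                seq = read_seqbegin(&rename_lock);
        again:
                ret = walk_tree(parent);        /* hypothetical traversal */
                if (!locked && read_seqretry(&rename_lock, seq)) {
                        /* a rename raced with the lockless walk: redo it
                         * under the write lock, which cannot be disturbed */
                        locked = 1;
                        write_seqlock(&rename_lock);
                        goto again;
                }
                if (locked)
                        write_sequnlock(&rename_lock);
                return ret;
        }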
@@ -817,11 +1123,16 @@ EXPORT_SYMBOL(have_submounts);
817 */ 1123 */
818static int select_parent(struct dentry * parent) 1124static int select_parent(struct dentry * parent)
819{ 1125{
820 struct dentry *this_parent = parent; 1126 struct dentry *this_parent;
821 struct list_head *next; 1127 struct list_head *next;
1128 unsigned seq;
822 int found = 0; 1129 int found = 0;
1130 int locked = 0;
823 1131
824 spin_lock(&dcache_lock); 1132 seq = read_seqbegin(&rename_lock);
1133again:
1134 this_parent = parent;
1135 spin_lock(&this_parent->d_lock);
825repeat: 1136repeat:
826 next = this_parent->d_subdirs.next; 1137 next = this_parent->d_subdirs.next;
827resume: 1138resume:
@@ -830,14 +1141,17 @@ resume:
830 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 1141 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
831 next = tmp->next; 1142 next = tmp->next;
832 1143
833 dentry_lru_del_init(dentry); 1144 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1145
834 /* 1146 /*
835 * move only zero ref count dentries to the end 1147 * move only zero ref count dentries to the end
836 * of the unused list for prune_dcache 1148 * of the unused list for prune_dcache
837 */ 1149 */
838 if (!atomic_read(&dentry->d_count)) { 1150 if (!dentry->d_count) {
839 dentry_lru_add_tail(dentry); 1151 dentry_lru_move_tail(dentry);
840 found++; 1152 found++;
1153 } else {
1154 dentry_lru_del(dentry);
841 } 1155 }
842 1156
843 /* 1157 /*
@@ -845,28 +1159,63 @@ resume:
845 * ensures forward progress). We'll be coming back to find 1159 * ensures forward progress). We'll be coming back to find
846 * the rest. 1160 * the rest.
847 */ 1161 */
848 if (found && need_resched()) 1162 if (found && need_resched()) {
1163 spin_unlock(&dentry->d_lock);
849 goto out; 1164 goto out;
1165 }
850 1166
851 /* 1167 /*
852 * Descend a level if the d_subdirs list is non-empty. 1168 * Descend a level if the d_subdirs list is non-empty.
853 */ 1169 */
854 if (!list_empty(&dentry->d_subdirs)) { 1170 if (!list_empty(&dentry->d_subdirs)) {
1171 spin_unlock(&this_parent->d_lock);
1172 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
855 this_parent = dentry; 1173 this_parent = dentry;
1174 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
856 goto repeat; 1175 goto repeat;
857 } 1176 }
1177
1178 spin_unlock(&dentry->d_lock);
858 } 1179 }
859 /* 1180 /*
860 * All done at this level ... ascend and resume the search. 1181 * All done at this level ... ascend and resume the search.
861 */ 1182 */
862 if (this_parent != parent) { 1183 if (this_parent != parent) {
863 next = this_parent->d_u.d_child.next; 1184 struct dentry *tmp;
864 this_parent = this_parent->d_parent; 1185 struct dentry *child;
1186
1187 tmp = this_parent->d_parent;
1188 rcu_read_lock();
1189 spin_unlock(&this_parent->d_lock);
1190 child = this_parent;
1191 this_parent = tmp;
1192 spin_lock(&this_parent->d_lock);
1193 /* might go back up the wrong parent if we have had a rename
1194 * or deletion */
1195 if (this_parent != child->d_parent ||
1196 (!locked && read_seqretry(&rename_lock, seq))) {
1197 spin_unlock(&this_parent->d_lock);
1198 rcu_read_unlock();
1199 goto rename_retry;
1200 }
1201 rcu_read_unlock();
1202 next = child->d_u.d_child.next;
865 goto resume; 1203 goto resume;
866 } 1204 }
867out: 1205out:
868 spin_unlock(&dcache_lock); 1206 spin_unlock(&this_parent->d_lock);
1207 if (!locked && read_seqretry(&rename_lock, seq))
1208 goto rename_retry;
1209 if (locked)
1210 write_sequnlock(&rename_lock);
869 return found; 1211 return found;
1212
1213rename_retry:
1214 if (found)
1215 return found;
1216 locked = 1;
1217 write_seqlock(&rename_lock);
1218 goto again;
870} 1219}
871 1220
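Ascending out of a subtree in these walkers needs a lock handoff from child to parent without ever holding both in child-before-parent order, which would invert the lock hierarchy. RCU pins the parent across the unlocked window, and d_parent is re-checked afterwards. Roughly, assuming child->d_lock is held on entry (sketch only):

        #include <linux/dcache.h>
        #include <linux/rcupdate.h>

        static struct dentry *ascend_one_level(struct dentry *child)
        {
                struct dentry *parent = child->d_parent;

                rcu_read_lock();                /* keep parent's memory alive */
                spin_unlock(&child->d_lock);
                spin_lock(&parent->d_lock);
                if (parent != child->d_parent) {
                        /* raced with rename/deletion: caller restarts the walk */
                        spin_unlock(&parent->d_lock);
                        rcu_read_unlock();
                        return NULL;
                }
                rcu_read_unlock();
                return parent;
        }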
872/** 1221/**
@@ -905,6 +1254,7 @@ static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
905 return -1; 1254 return -1;
906 prune_dcache(nr); 1255 prune_dcache(nr);
907 } 1256 }
1257
908 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 1258 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
909} 1259}
910 1260
@@ -948,37 +1298,54 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
948 memcpy(dname, name->name, name->len); 1298 memcpy(dname, name->name, name->len);
949 dname[name->len] = 0; 1299 dname[name->len] = 0;
950 1300
951 atomic_set(&dentry->d_count, 1); 1301 dentry->d_count = 1;
952 dentry->d_flags = DCACHE_UNHASHED; 1302 dentry->d_flags = DCACHE_UNHASHED;
953 spin_lock_init(&dentry->d_lock); 1303 spin_lock_init(&dentry->d_lock);
1304 seqcount_init(&dentry->d_seq);
954 dentry->d_inode = NULL; 1305 dentry->d_inode = NULL;
955 dentry->d_parent = NULL; 1306 dentry->d_parent = NULL;
956 dentry->d_sb = NULL; 1307 dentry->d_sb = NULL;
957 dentry->d_op = NULL; 1308 dentry->d_op = NULL;
958 dentry->d_fsdata = NULL; 1309 dentry->d_fsdata = NULL;
959 dentry->d_mounted = 0; 1310 INIT_HLIST_BL_NODE(&dentry->d_hash);
960 INIT_HLIST_NODE(&dentry->d_hash);
961 INIT_LIST_HEAD(&dentry->d_lru); 1311 INIT_LIST_HEAD(&dentry->d_lru);
962 INIT_LIST_HEAD(&dentry->d_subdirs); 1312 INIT_LIST_HEAD(&dentry->d_subdirs);
963 INIT_LIST_HEAD(&dentry->d_alias); 1313 INIT_LIST_HEAD(&dentry->d_alias);
1314 INIT_LIST_HEAD(&dentry->d_u.d_child);
964 1315
965 if (parent) { 1316 if (parent) {
966 dentry->d_parent = dget(parent); 1317 spin_lock(&parent->d_lock);
1318 /*
1319 * don't need child lock because it is not subject
1320 * to concurrency here
1321 */
1322 __dget_dlock(parent);
1323 dentry->d_parent = parent;
967 dentry->d_sb = parent->d_sb; 1324 dentry->d_sb = parent->d_sb;
968 } else { 1325 d_set_d_op(dentry, dentry->d_sb->s_d_op);
969 INIT_LIST_HEAD(&dentry->d_u.d_child); 1326 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
1327 spin_unlock(&parent->d_lock);
970 } 1328 }
971 1329
972 spin_lock(&dcache_lock); 1330 this_cpu_inc(nr_dentry);
973 if (parent)
974 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
975 dentry_stat.nr_dentry++;
976 spin_unlock(&dcache_lock);
977 1331
978 return dentry; 1332 return dentry;
979} 1333}
980EXPORT_SYMBOL(d_alloc); 1334EXPORT_SYMBOL(d_alloc);
981 1335
1336struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
1337{
1338 struct dentry *dentry = d_alloc(NULL, name);
1339 if (dentry) {
1340 dentry->d_sb = sb;
1341 d_set_d_op(dentry, dentry->d_sb->s_d_op);
1342 dentry->d_parent = dentry;
1343 dentry->d_flags |= DCACHE_DISCONNECTED;
1344 }
1345 return dentry;
1346}
1347EXPORT_SYMBOL(d_alloc_pseudo);
1348
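Note the accounting change in d_alloc(): dentry_stat.nr_dentry under dcache_lock becomes this_cpu_inc(nr_dentry), a per-cpu counter with no shared cacheline. Readers sum the per-cpu values and tolerate the slight skew; the sum can transiently go negative when frees on one CPU outpace the allocations it observed elsewhere, hence the clamp. A sketch of the read side (this series adds an equivalent get_nr_dentry() helper earlier in the file):

        #include <linux/percpu.h>

        static DEFINE_PER_CPU(unsigned int, nr_dentry);

        static int get_nr_dentry(void)
        {
                int i;
                int sum = 0;

                for_each_possible_cpu(i)
                        sum += per_cpu(nr_dentry, i);
                return sum < 0 ? 0 : sum;
        }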
982struct dentry *d_alloc_name(struct dentry *parent, const char *name) 1349struct dentry *d_alloc_name(struct dentry *parent, const char *name)
983{ 1350{
984 struct qstr q; 1351 struct qstr q;
@@ -990,12 +1357,39 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
990} 1357}
991EXPORT_SYMBOL(d_alloc_name); 1358EXPORT_SYMBOL(d_alloc_name);
992 1359
993/* the caller must hold dcache_lock */ 1360void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
1361{
1362 WARN_ON_ONCE(dentry->d_op);
1363 WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
1364 DCACHE_OP_COMPARE |
1365 DCACHE_OP_REVALIDATE |
1366 DCACHE_OP_DELETE));
1367 dentry->d_op = op;
1368 if (!op)
1369 return;
1370 if (op->d_hash)
1371 dentry->d_flags |= DCACHE_OP_HASH;
1372 if (op->d_compare)
1373 dentry->d_flags |= DCACHE_OP_COMPARE;
1374 if (op->d_revalidate)
1375 dentry->d_flags |= DCACHE_OP_REVALIDATE;
1376 if (op->d_delete)
1377 dentry->d_flags |= DCACHE_OP_DELETE;
1378
1379}
1380EXPORT_SYMBOL(d_set_d_op);
1381
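d_set_d_op() mirrors the presence of each method into a DCACHE_OP_* bit so the hot paths (__d_lookup_rcu, __d_lookup, d_hash_and_lookup below) test d_flags, which they load anyway, instead of chasing d_op pointers. Filesystems set sb->s_d_op once at mount and d_alloc() propagates it. A hypothetical filesystem wires this up roughly as follows, using the d_revalidate signature of this kernel:

        #include <linux/dcache.h>
        #include <linux/errno.h>
        #include <linux/fs.h>
        #include <linux/namei.h>

        static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
        {
                if (nd && (nd->flags & LOOKUP_RCU))
                        return -ECHILD;         /* can't block in rcu-walk: drop out */
                return 1;                       /* sketch: dentry always valid */
        }

        static const struct dentry_operations example_dops = {
                .d_revalidate   = example_d_revalidate,
        };

        static int example_fill_super(struct super_block *sb)
        {
                sb->s_d_op = &example_dops;     /* inherited by every d_alloc() */
                /* ... */
                return 0;
        }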
994static void __d_instantiate(struct dentry *dentry, struct inode *inode) 1382static void __d_instantiate(struct dentry *dentry, struct inode *inode)
995{ 1383{
996 if (inode) 1384 spin_lock(&dentry->d_lock);
1385 if (inode) {
1386 if (unlikely(IS_AUTOMOUNT(inode)))
1387 dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
997 list_add(&dentry->d_alias, &inode->i_dentry); 1388 list_add(&dentry->d_alias, &inode->i_dentry);
1389 }
998 dentry->d_inode = inode; 1390 dentry->d_inode = inode;
1391 dentry_rcuwalk_barrier(dentry);
1392 spin_unlock(&dentry->d_lock);
999 fsnotify_d_instantiate(dentry, inode); 1393 fsnotify_d_instantiate(dentry, inode);
1000} 1394}
1001 1395
@@ -1017,9 +1411,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1017void d_instantiate(struct dentry *entry, struct inode * inode) 1411void d_instantiate(struct dentry *entry, struct inode * inode)
1018{ 1412{
1019 BUG_ON(!list_empty(&entry->d_alias)); 1413 BUG_ON(!list_empty(&entry->d_alias));
1020 spin_lock(&dcache_lock); 1414 if (inode)
1415 spin_lock(&inode->i_lock);
1021 __d_instantiate(entry, inode); 1416 __d_instantiate(entry, inode);
1022 spin_unlock(&dcache_lock); 1417 if (inode)
1418 spin_unlock(&inode->i_lock);
1023 security_d_instantiate(entry, inode); 1419 security_d_instantiate(entry, inode);
1024} 1420}
1025EXPORT_SYMBOL(d_instantiate); 1421EXPORT_SYMBOL(d_instantiate);
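With dcache_lock gone, inode->i_lock is what protects the inode's alias list (i_dentry), which is why d_instantiate() and the other instantiation paths in this patch now bracket __d_instantiate() with it. In the new hierarchy i_lock nests outside d_lock (d_delete() below must trylock i_lock for exactly this reason), so taking a reference on an alias from under i_lock is safe. A hypothetical helper following that rule:

        static struct dentry *get_first_alias(struct inode *inode)
        {
                struct dentry *alias = NULL;

                spin_lock(&inode->i_lock);
                if (!list_empty(&inode->i_dentry)) {
                        alias = list_first_entry(&inode->i_dentry,
                                                 struct dentry, d_alias);
                        dget(alias);    /* takes d_lock, nested inside i_lock */
                }
                spin_unlock(&inode->i_lock);
                return alias;
        }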
@@ -1056,15 +1452,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
1056 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 1452 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
1057 struct qstr *qstr = &alias->d_name; 1453 struct qstr *qstr = &alias->d_name;
1058 1454
1455 /*
1456 * Don't need alias->d_lock here, because aliases with
1457 * d_parent == entry->d_parent are not subject to name or
1458 * parent changes, because the parent inode i_mutex is held.
1459 */
1059 if (qstr->hash != hash) 1460 if (qstr->hash != hash)
1060 continue; 1461 continue;
1061 if (alias->d_parent != entry->d_parent) 1462 if (alias->d_parent != entry->d_parent)
1062 continue; 1463 continue;
1063 if (qstr->len != len) 1464 if (dentry_cmp(qstr->name, qstr->len, name, len))
1064 continue; 1465 continue;
1065 if (memcmp(qstr->name, name, len)) 1466 __dget(alias);
1066 continue;
1067 dget_locked(alias);
1068 return alias; 1467 return alias;
1069 } 1468 }
1070 1469
@@ -1078,9 +1477,11 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
1078 1477
1079 BUG_ON(!list_empty(&entry->d_alias)); 1478 BUG_ON(!list_empty(&entry->d_alias));
1080 1479
1081 spin_lock(&dcache_lock); 1480 if (inode)
1481 spin_lock(&inode->i_lock);
1082 result = __d_instantiate_unique(entry, inode); 1482 result = __d_instantiate_unique(entry, inode);
1083 spin_unlock(&dcache_lock); 1483 if (inode)
1484 spin_unlock(&inode->i_lock);
1084 1485
1085 if (!result) { 1486 if (!result) {
1086 security_d_instantiate(entry, inode); 1487 security_d_instantiate(entry, inode);
@@ -1113,6 +1514,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1113 res = d_alloc(NULL, &name); 1514 res = d_alloc(NULL, &name);
1114 if (res) { 1515 if (res) {
1115 res->d_sb = root_inode->i_sb; 1516 res->d_sb = root_inode->i_sb;
1517 d_set_d_op(res, res->d_sb->s_d_op);
1116 res->d_parent = res; 1518 res->d_parent = res;
1117 d_instantiate(res, root_inode); 1519 d_instantiate(res, root_inode);
1118 } 1520 }
@@ -1121,14 +1523,6 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1121} 1523}
1122EXPORT_SYMBOL(d_alloc_root); 1524EXPORT_SYMBOL(d_alloc_root);
1123 1525
1124static inline struct hlist_head *d_hash(struct dentry *parent,
1125 unsigned long hash)
1126{
1127 hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
1128 hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
1129 return dentry_hashtable + (hash & D_HASHMASK);
1130}
1131
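The open-coded d_hash() deleted here is not gone: earlier in this series it is reintroduced returning a dcache_hash_bucket, an hlist_bl whose head pointer doubles as a per-bucket bit spinlock. This is the spin_lock_bucket()/spin_unlock_bucket() pair used by __d_rehash() below, and the b->head that __d_lookup and __d_lookup_rcu walk. Reconstructed from the rest of the series, the replacement is approximately:

        #include <linux/bit_spinlock.h>
        #include <linux/list_bl.h>

        struct dcache_hash_bucket {
                struct hlist_bl_head head;
        };

        static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
                                                        unsigned long hash)
        {
                hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
                hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
                return dentry_hashtable + (hash & D_HASHMASK);
        }

        static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
        {
                bit_spin_lock(0, (unsigned long *)&b->head.first);
        }

        static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
        {
                __bit_spin_unlock(0, (unsigned long *)&b->head.first);
        }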
1132/** 1526/**
1133 * d_obtain_alias - find or allocate a dentry for a given inode 1527 * d_obtain_alias - find or allocate a dentry for a given inode
1134 * @inode: inode to allocate the dentry for 1528 * @inode: inode to allocate the dentry for
@@ -1169,10 +1563,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
1169 } 1563 }
1170 tmp->d_parent = tmp; /* make sure dput doesn't croak */ 1564 tmp->d_parent = tmp; /* make sure dput doesn't croak */
1171 1565
1172 spin_lock(&dcache_lock); 1566
1567 spin_lock(&inode->i_lock);
1173 res = __d_find_alias(inode, 0); 1568 res = __d_find_alias(inode, 0);
1174 if (res) { 1569 if (res) {
1175 spin_unlock(&dcache_lock); 1570 spin_unlock(&inode->i_lock);
1176 dput(tmp); 1571 dput(tmp);
1177 goto out_iput; 1572 goto out_iput;
1178 } 1573 }
@@ -1180,14 +1575,17 @@ struct dentry *d_obtain_alias(struct inode *inode)
1180 /* attach a disconnected dentry */ 1575 /* attach a disconnected dentry */
1181 spin_lock(&tmp->d_lock); 1576 spin_lock(&tmp->d_lock);
1182 tmp->d_sb = inode->i_sb; 1577 tmp->d_sb = inode->i_sb;
1578 d_set_d_op(tmp, tmp->d_sb->s_d_op);
1183 tmp->d_inode = inode; 1579 tmp->d_inode = inode;
1184 tmp->d_flags |= DCACHE_DISCONNECTED; 1580 tmp->d_flags |= DCACHE_DISCONNECTED;
1185 tmp->d_flags &= ~DCACHE_UNHASHED;
1186 list_add(&tmp->d_alias, &inode->i_dentry); 1581 list_add(&tmp->d_alias, &inode->i_dentry);
1187 hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon); 1582 bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1583 tmp->d_flags &= ~DCACHE_UNHASHED;
1584 hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
1585 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1188 spin_unlock(&tmp->d_lock); 1586 spin_unlock(&tmp->d_lock);
1587 spin_unlock(&inode->i_lock);
1189 1588
1190 spin_unlock(&dcache_lock);
1191 return tmp; 1589 return tmp;
1192 1590
1193 out_iput: 1591 out_iput:
@@ -1217,18 +1615,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1217 struct dentry *new = NULL; 1615 struct dentry *new = NULL;
1218 1616
1219 if (inode && S_ISDIR(inode->i_mode)) { 1617 if (inode && S_ISDIR(inode->i_mode)) {
1220 spin_lock(&dcache_lock); 1618 spin_lock(&inode->i_lock);
1221 new = __d_find_alias(inode, 1); 1619 new = __d_find_alias(inode, 1);
1222 if (new) { 1620 if (new) {
1223 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1621 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1224 spin_unlock(&dcache_lock); 1622 spin_unlock(&inode->i_lock);
1225 security_d_instantiate(new, inode); 1623 security_d_instantiate(new, inode);
1226 d_move(new, dentry); 1624 d_move(new, dentry);
1227 iput(inode); 1625 iput(inode);
1228 } else { 1626 } else {
1229 /* already taking dcache_lock, so d_add() by hand */ 1627 /* already taking inode->i_lock, so d_add() by hand */
1230 __d_instantiate(dentry, inode); 1628 __d_instantiate(dentry, inode);
1231 spin_unlock(&dcache_lock); 1629 spin_unlock(&inode->i_lock);
1232 security_d_instantiate(dentry, inode); 1630 security_d_instantiate(dentry, inode);
1233 d_rehash(dentry); 1631 d_rehash(dentry);
1234 } 1632 }
@@ -1301,10 +1699,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1301 * Negative dentry: instantiate it unless the inode is a directory and 1699 * Negative dentry: instantiate it unless the inode is a directory and
1302 * already has a dentry. 1700 * already has a dentry.
1303 */ 1701 */
1304 spin_lock(&dcache_lock); 1702 spin_lock(&inode->i_lock);
1305 if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { 1703 if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
1306 __d_instantiate(found, inode); 1704 __d_instantiate(found, inode);
1307 spin_unlock(&dcache_lock); 1705 spin_unlock(&inode->i_lock);
1308 security_d_instantiate(found, inode); 1706 security_d_instantiate(found, inode);
1309 return found; 1707 return found;
1310 } 1708 }
@@ -1314,8 +1712,8 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1314 * reference to it, move it in place and use it. 1712 * reference to it, move it in place and use it.
1315 */ 1713 */
1316 new = list_entry(inode->i_dentry.next, struct dentry, d_alias); 1714 new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
1317 dget_locked(new); 1715 __dget(new);
1318 spin_unlock(&dcache_lock); 1716 spin_unlock(&inode->i_lock);
1319 security_d_instantiate(found, inode); 1717 security_d_instantiate(found, inode);
1320 d_move(new, found); 1718 d_move(new, found);
1321 iput(inode); 1719 iput(inode);
@@ -1329,6 +1727,112 @@ err_out:
1329EXPORT_SYMBOL(d_add_ci); 1727EXPORT_SYMBOL(d_add_ci);
1330 1728
1331/** 1729/**
1730 * __d_lookup_rcu - search for a dentry (racy, store-free)
1731 * @parent: parent dentry
1732 * @name: qstr of name we wish to find
1733 * @seq: returns d_seq value at the point where the dentry was found
1734 * @inode: returns dentry->d_inode when the inode was found valid.
1735 * Returns: dentry, or NULL
1736 *
1737 * __d_lookup_rcu is the dcache lookup function for the rcu-walk name
1738 * resolution (store-free path walking) design described in
1739 * Documentation/filesystems/path-lookup.txt.
1740 *
1741 * This is not to be used outside core vfs.
1742 *
1743 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
1744 * held, and rcu_read_lock held. The returned dentry must not be stored
1745 * (e.g. into a struct path) without taking d_lock and checking the d_seq
1746 * sequence count against @seq returned here.
1747 *
1748 * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
1749 * function.
1750 *
1751 * Alternatively, __d_lookup_rcu may be called again to look up the child of
1752 * the returned dentry, so long as its parent's seqlock is checked after the
1753 * child is looked up. Thus, an interlocking stepping of sequence lock checks
1754 * is formed, giving integrity down the path walk.
1755 */
1756struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1757 unsigned *seq, struct inode **inode)
1758{
1759 unsigned int len = name->len;
1760 unsigned int hash = name->hash;
1761 const unsigned char *str = name->name;
1762 struct dcache_hash_bucket *b = d_hash(parent, hash);
1763 struct hlist_bl_node *node;
1764 struct dentry *dentry;
1765
1766 /*
1767 * Note: There is significant duplication with __d_lookup which is
1768 * required to prevent single threaded performance regressions
1769 * especially on architectures where smp_rmb (in seqcounts) are costly.
1770 * Keep the two functions in sync.
1771 */
1772
1773 /*
1774 * The hash list is protected using RCU.
1775 *
1776 * Carefully use d_seq when comparing a candidate dentry, to avoid
1777 * races with d_move().
1778 *
1779 * It is possible that concurrent renames can mess up our list
1780 * walk here and cause us to miss our dentry, giving a
1781 * false-negative result. d_lookup() protects against concurrent
1782 * renames using rename_lock seqlock.
1783 *
1784 * See Documentation/filesystems/path-lookup.txt for more details.
1785 */
1786 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1787 struct inode *i;
1788 const char *tname;
1789 int tlen;
1790
1791 if (dentry->d_name.hash != hash)
1792 continue;
1793
1794seqretry:
1795 *seq = read_seqcount_begin(&dentry->d_seq);
1796 if (dentry->d_parent != parent)
1797 continue;
1798 if (d_unhashed(dentry))
1799 continue;
1800 tlen = dentry->d_name.len;
1801 tname = dentry->d_name.name;
1802 i = dentry->d_inode;
1803 prefetch(tname);
1804 if (i)
1805 prefetch(i);
1806 /*
1807 * This seqcount check is required to ensure name and
1808 * len are loaded atomically, so as not to walk off the
1809 * edge of memory when walking. If we could load this
1810 * atomically some other way, we could drop this check.
1811 */
1812 if (read_seqcount_retry(&dentry->d_seq, *seq))
1813 goto seqretry;
1814 if (parent->d_flags & DCACHE_OP_COMPARE) {
1815 if (parent->d_op->d_compare(parent, *inode,
1816 dentry, i,
1817 tlen, tname, name))
1818 continue;
1819 } else {
1820 if (dentry_cmp(tname, tlen, str, len))
1821 continue;
1822 }
1823 /*
1824 * No extra seqcount check is required after the name
1825 * compare. The caller must perform a seqcount check in
1826 * order to do anything useful with the returned dentry
1827 * anyway.
1828 */
1829 *inode = i;
1830 return dentry;
1831 }
1832 return NULL;
1833}
1834
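The doc comment above points at __d_rcu_to_refcount() as the way to turn a lookup result into a real reference. Schematically, a caller validates d_seq under d_lock before trusting anything it read, and falls back to the locked d_lookup() path on failure. A sketch with the helper's logic inlined:

        static struct dentry *lookup_and_ref(struct dentry *parent, struct qstr *name)
        {
                struct inode *inode = NULL;
                struct dentry *dentry;
                unsigned seq;

                rcu_read_lock();
                dentry = __d_lookup_rcu(parent, name, &seq, &inode);
                if (dentry) {
                        spin_lock(&dentry->d_lock);
                        if (read_seqcount_retry(&dentry->d_seq, seq)) {
                                /* raced with rename: caller retries via d_lookup() */
                                spin_unlock(&dentry->d_lock);
                                dentry = NULL;
                        } else {
                                dentry->d_count++;      /* now a real reference */
                                spin_unlock(&dentry->d_lock);
                        }
                }
                rcu_read_unlock();
                return dentry;
        }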
1835/**
1332 * d_lookup - search for a dentry 1836 * d_lookup - search for a dentry
1333 * @parent: parent dentry 1837 * @parent: parent dentry
1334 * @name: qstr of name we wish to find 1838 * @name: qstr of name we wish to find
@@ -1339,10 +1843,10 @@ EXPORT_SYMBOL(d_add_ci);
1339 * dentry is returned. The caller must use dput to free the entry when it has 1843 * dentry is returned. The caller must use dput to free the entry when it has
1340 * finished using it. %NULL is returned if the dentry does not exist. 1844 * finished using it. %NULL is returned if the dentry does not exist.
1341 */ 1845 */
1342struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1846struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
1343{ 1847{
1344 struct dentry * dentry = NULL; 1848 struct dentry *dentry;
1345 unsigned long seq; 1849 unsigned seq;
1346 1850
1347 do { 1851 do {
1348 seq = read_seqbegin(&rename_lock); 1852 seq = read_seqbegin(&rename_lock);
@@ -1354,7 +1858,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1354} 1858}
1355EXPORT_SYMBOL(d_lookup); 1859EXPORT_SYMBOL(d_lookup);
1356 1860
1357/* 1861/**
1358 * __d_lookup - search for a dentry (racy) 1862 * __d_lookup - search for a dentry (racy)
1359 * @parent: parent dentry 1863 * @parent: parent dentry
1360 * @name: qstr of name we wish to find 1864 * @name: qstr of name we wish to find
@@ -1369,17 +1873,24 @@ EXPORT_SYMBOL(d_lookup);
1369 * 1873 *
1370 * __d_lookup callers must be commented. 1874 * __d_lookup callers must be commented.
1371 */ 1875 */
1372struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1876struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1373{ 1877{
1374 unsigned int len = name->len; 1878 unsigned int len = name->len;
1375 unsigned int hash = name->hash; 1879 unsigned int hash = name->hash;
1376 const unsigned char *str = name->name; 1880 const unsigned char *str = name->name;
1377 struct hlist_head *head = d_hash(parent,hash); 1881 struct dcache_hash_bucket *b = d_hash(parent, hash);
1882 struct hlist_bl_node *node;
1378 struct dentry *found = NULL; 1883 struct dentry *found = NULL;
1379 struct hlist_node *node;
1380 struct dentry *dentry; 1884 struct dentry *dentry;
1381 1885
1382 /* 1886 /*
1887 * Note: There is significant duplication with __d_lookup_rcu which is
1888 * required to prevent single threaded performance regressions
1889 * especially on architectures where smp_rmb (in seqcounts) are costly.
1890 * Keep the two functions in sync.
1891 */
1892
1893 /*
1383 * The hash list is protected using RCU. 1894 * The hash list is protected using RCU.
1384 * 1895 *
1385 * Take d_lock when comparing a candidate dentry, to avoid races 1896 * Take d_lock when comparing a candidate dentry, to avoid races
@@ -1394,25 +1905,16 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1394 */ 1905 */
1395 rcu_read_lock(); 1906 rcu_read_lock();
1396 1907
1397 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1908 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1398 struct qstr *qstr; 1909 const char *tname;
1910 int tlen;
1399 1911
1400 if (dentry->d_name.hash != hash) 1912 if (dentry->d_name.hash != hash)
1401 continue; 1913 continue;
1402 if (dentry->d_parent != parent)
1403 continue;
1404 1914
1405 spin_lock(&dentry->d_lock); 1915 spin_lock(&dentry->d_lock);
1406
1407 /*
1408 * Recheck the dentry after taking the lock - d_move may have
1409 * changed things. Don't bother checking the hash because
1410 * we're about to compare the whole name anyway.
1411 */
1412 if (dentry->d_parent != parent) 1916 if (dentry->d_parent != parent)
1413 goto next; 1917 goto next;
1414
1415 /* non-existing due to RCU? */
1416 if (d_unhashed(dentry)) 1918 if (d_unhashed(dentry))
1417 goto next; 1919 goto next;
1418 1920
@@ -1420,18 +1922,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1420 * It is safe to compare names since d_move() cannot 1922 * It is safe to compare names since d_move() cannot
1421 * change the qstr (protected by d_lock). 1923 * change the qstr (protected by d_lock).
1422 */ 1924 */
1423 qstr = &dentry->d_name; 1925 tlen = dentry->d_name.len;
1424 if (parent->d_op && parent->d_op->d_compare) { 1926 tname = dentry->d_name.name;
1425 if (parent->d_op->d_compare(parent, qstr, name)) 1927 if (parent->d_flags & DCACHE_OP_COMPARE) {
1928 if (parent->d_op->d_compare(parent, parent->d_inode,
1929 dentry, dentry->d_inode,
1930 tlen, tname, name))
1426 goto next; 1931 goto next;
1427 } else { 1932 } else {
1428 if (qstr->len != len) 1933 if (dentry_cmp(tname, tlen, str, len))
1429 goto next;
1430 if (memcmp(qstr->name, str, len))
1431 goto next; 1934 goto next;
1432 } 1935 }
1433 1936
1434 atomic_inc(&dentry->d_count); 1937 dentry->d_count++;
1435 found = dentry; 1938 found = dentry;
1436 spin_unlock(&dentry->d_lock); 1939 spin_unlock(&dentry->d_lock);
1437 break; 1940 break;
@@ -1460,8 +1963,8 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
1460 * routine may choose to leave the hash value unchanged. 1963 * routine may choose to leave the hash value unchanged.
1461 */ 1964 */
1462 name->hash = full_name_hash(name->name, name->len); 1965 name->hash = full_name_hash(name->name, name->len);
1463 if (dir->d_op && dir->d_op->d_hash) { 1966 if (dir->d_flags & DCACHE_OP_HASH) {
1464 if (dir->d_op->d_hash(dir, name) < 0) 1967 if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
1465 goto out; 1968 goto out;
1466 } 1969 }
1467 dentry = d_lookup(dir, name); 1970 dentry = d_lookup(dir, name);
@@ -1470,41 +1973,32 @@ out:
1470} 1973}
1471 1974
1472/** 1975/**
1473 * d_validate - verify dentry provided from insecure source 1976 * d_validate - verify dentry provided from insecure source (deprecated)
1474 * @dentry: The dentry alleged to be valid child of @dparent 1977 * @dentry: The dentry alleged to be valid child of @dparent
1475 * @dparent: The parent dentry (known to be valid) 1978 * @dparent: The parent dentry (known to be valid)
1476 * 1979 *
1477 * An insecure source has sent us a dentry, here we verify it and dget() it. 1980 * An insecure source has sent us a dentry, here we verify it and dget() it.
1478 * This is used by ncpfs in its readdir implementation. 1981 * This is used by ncpfs in its readdir implementation.
1479 * Zero is returned if the dentry is invalid. 1982 * Zero is returned if the dentry is invalid.
1983 *
1984 * This function is slow for big directories, and deprecated, do not use it.
1480 */ 1985 */
1481
1482int d_validate(struct dentry *dentry, struct dentry *dparent) 1986int d_validate(struct dentry *dentry, struct dentry *dparent)
1483{ 1987{
1484 struct hlist_head *base; 1988 struct dentry *child;
1485 struct hlist_node *lhp;
1486
1487 /* Check whether the ptr might be valid at all.. */
1488 if (!kmem_ptr_validate(dentry_cache, dentry))
1489 goto out;
1490
1491 if (dentry->d_parent != dparent)
1492 goto out;
1493 1989
1494 spin_lock(&dcache_lock); 1990 spin_lock(&dparent->d_lock);
1495 base = d_hash(dparent, dentry->d_name.hash); 1991 list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
1496 hlist_for_each(lhp,base) { 1992 if (dentry == child) {
1497 /* hlist_for_each_entry_rcu() not required for d_hash list 1993 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1498 * as it is parsed under dcache_lock 1994 __dget_dlock(dentry);
1499 */ 1995 spin_unlock(&dentry->d_lock);
1500 if (dentry == hlist_entry(lhp, struct dentry, d_hash)) { 1996 spin_unlock(&dparent->d_lock);
1501 __dget_locked(dentry);
1502 spin_unlock(&dcache_lock);
1503 return 1; 1997 return 1;
1504 } 1998 }
1505 } 1999 }
1506 spin_unlock(&dcache_lock); 2000 spin_unlock(&dparent->d_lock);
1507out: 2001
1508 return 0; 2002 return 0;
1509} 2003}
1510EXPORT_SYMBOL(d_validate); 2004EXPORT_SYMBOL(d_validate);
@@ -1532,16 +2026,23 @@ EXPORT_SYMBOL(d_validate);
1532 2026
1533void d_delete(struct dentry * dentry) 2027void d_delete(struct dentry * dentry)
1534{ 2028{
2029 struct inode *inode;
1535 int isdir = 0; 2030 int isdir = 0;
1536 /* 2031 /*
1537 * Are we the only user? 2032 * Are we the only user?
1538 */ 2033 */
1539 spin_lock(&dcache_lock); 2034again:
1540 spin_lock(&dentry->d_lock); 2035 spin_lock(&dentry->d_lock);
1541 isdir = S_ISDIR(dentry->d_inode->i_mode); 2036 inode = dentry->d_inode;
1542 if (atomic_read(&dentry->d_count) == 1) { 2037 isdir = S_ISDIR(inode->i_mode);
2038 if (dentry->d_count == 1) {
2039 if (inode && !spin_trylock(&inode->i_lock)) {
2040 spin_unlock(&dentry->d_lock);
2041 cpu_relax();
2042 goto again;
2043 }
1543 dentry->d_flags &= ~DCACHE_CANT_MOUNT; 2044 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1544 dentry_iput(dentry); 2045 dentry_unlink_inode(dentry);
1545 fsnotify_nameremove(dentry, isdir); 2046 fsnotify_nameremove(dentry, isdir);
1546 return; 2047 return;
1547 } 2048 }
@@ -1550,17 +2051,18 @@ void d_delete(struct dentry * dentry)
1550 __d_drop(dentry); 2051 __d_drop(dentry);
1551 2052
1552 spin_unlock(&dentry->d_lock); 2053 spin_unlock(&dentry->d_lock);
1553 spin_unlock(&dcache_lock);
1554 2054
1555 fsnotify_nameremove(dentry, isdir); 2055 fsnotify_nameremove(dentry, isdir);
1556} 2056}
1557EXPORT_SYMBOL(d_delete); 2057EXPORT_SYMBOL(d_delete);
1558 2058
1559static void __d_rehash(struct dentry * entry, struct hlist_head *list) 2059static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b)
1560{ 2060{
1561 2061 BUG_ON(!d_unhashed(entry));
2062 spin_lock_bucket(b);
1562 entry->d_flags &= ~DCACHE_UNHASHED; 2063 entry->d_flags &= ~DCACHE_UNHASHED;
1563 hlist_add_head_rcu(&entry->d_hash, list); 2064 hlist_bl_add_head_rcu(&entry->d_hash, &b->head);
2065 spin_unlock_bucket(b);
1564} 2066}
1565 2067
1566static void _d_rehash(struct dentry * entry) 2068static void _d_rehash(struct dentry * entry)
@@ -1577,25 +2079,39 @@ static void _d_rehash(struct dentry * entry)
1577 2079
1578void d_rehash(struct dentry * entry) 2080void d_rehash(struct dentry * entry)
1579{ 2081{
1580 spin_lock(&dcache_lock);
1581 spin_lock(&entry->d_lock); 2082 spin_lock(&entry->d_lock);
1582 _d_rehash(entry); 2083 _d_rehash(entry);
1583 spin_unlock(&entry->d_lock); 2084 spin_unlock(&entry->d_lock);
1584 spin_unlock(&dcache_lock);
1585} 2085}
1586EXPORT_SYMBOL(d_rehash); 2086EXPORT_SYMBOL(d_rehash);
1587 2087
1588/* 2088/**
1589 * When switching names, the actual string doesn't strictly have to 2089 * dentry_update_name_case - update case insensitive dentry with a new name
1590 * be preserved in the target - because we're dropping the target 2090 * @dentry: dentry to be updated
1591 * anyway. As such, we can just do a simple memcpy() to copy over 2091 * @name: new name
1592 * the new name before we switch.
1593 * 2092 *
1594 * Note that we have to be a lot more careful about getting the hash 2093 * Update a case-insensitive dentry with the new case of its name.
1595 * switched - we have to switch the hash value properly even if it 2094 *
1596 * then no longer matches the actual (corrupted) string of the target. 2095 * dentry must have been returned by d_lookup with name @name. Old and new
1597 * The hash value has to match the hash queue that the dentry is on.. 2096 * name lengths must match (ie. no d_compare which allows mismatched name
2097 * lengths).
2098 *
2099 * Parent inode i_mutex must be held over d_lookup and into this call (to
2100 * keep renames and concurrent inserts, and readdir(2) away).
1598 */ 2101 */
2102void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2103{
2104 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2105 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
2106
2107 spin_lock(&dentry->d_lock);
2108 write_seqcount_begin(&dentry->d_seq);
2109 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
2110 write_seqcount_end(&dentry->d_seq);
2111 spin_unlock(&dentry->d_lock);
2112}
2113EXPORT_SYMBOL(dentry_update_name_case);
2114
1599static void switch_names(struct dentry *dentry, struct dentry *target) 2115static void switch_names(struct dentry *dentry, struct dentry *target)
1600{ 2116{
1601 if (dname_external(target)) { 2117 if (dname_external(target)) {
@@ -1637,54 +2153,84 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1637 swap(dentry->d_name.len, target->d_name.len); 2153 swap(dentry->d_name.len, target->d_name.len);
1638} 2154}
1639 2155
2156static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
2157{
2158 /*
2159 * XXXX: do we really need to take target->d_lock?
2160 */
2161 if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
2162 spin_lock(&target->d_parent->d_lock);
2163 else {
2164 if (d_ancestor(dentry->d_parent, target->d_parent)) {
2165 spin_lock(&dentry->d_parent->d_lock);
2166 spin_lock_nested(&target->d_parent->d_lock,
2167 DENTRY_D_LOCK_NESTED);
2168 } else {
2169 spin_lock(&target->d_parent->d_lock);
2170 spin_lock_nested(&dentry->d_parent->d_lock,
2171 DENTRY_D_LOCK_NESTED);
2172 }
2173 }
2174 if (target < dentry) {
2175 spin_lock_nested(&target->d_lock, 2);
2176 spin_lock_nested(&dentry->d_lock, 3);
2177 } else {
2178 spin_lock_nested(&dentry->d_lock, 2);
2179 spin_lock_nested(&target->d_lock, 3);
2180 }
2181}
2182
2183static void dentry_unlock_parents_for_move(struct dentry *dentry,
2184 struct dentry *target)
2185{
2186 if (target->d_parent != dentry->d_parent)
2187 spin_unlock(&dentry->d_parent->d_lock);
2188 if (target->d_parent != target)
2189 spin_unlock(&target->d_parent->d_lock);
2190}
2191
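dentry_lock_for_move() above fixes a total order on d_lock acquisition for rename: an ancestor's lock before a descendant's, and between unrelated dentries, lower kernel address first. The literal subclass numbers 2 and 3 exist because the two parents already occupy lockdep subclasses 0 and 1 (DENTRY_D_LOCK_NESTED). For just two unrelated peers the idiom reduces to this hypothetical helper:

        static void lock_peers(struct dentry *a, struct dentry *b)
        {
                if (a < b) {
                        spin_lock(&a->d_lock);
                        spin_lock_nested(&b->d_lock, DENTRY_D_LOCK_NESTED);
                } else {
                        spin_lock(&b->d_lock);
                        spin_lock_nested(&a->d_lock, DENTRY_D_LOCK_NESTED);
                }
        }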
1640/* 2192/*
1641 * We cannibalize "target" when moving dentry on top of it, 2193 * When switching names, the actual string doesn't strictly have to
1642 * because it's going to be thrown away anyway. We could be more 2194 * be preserved in the target - because we're dropping the target
1643 * polite about it, though. 2195 * anyway. As such, we can just do a simple memcpy() to copy over
1644 * 2196 * the new name before we switch.
1645 * This forceful removal will result in ugly /proc output if 2197 *
1646 * somebody holds a file open that got deleted due to a rename. 2198 * Note that we have to be a lot more careful about getting the hash
1647 * We could be nicer about the deleted file, and let it show 2199 * switched - we have to switch the hash value properly even if it
1648 * up under the name it had before it was deleted rather than 2200 * then no longer matches the actual (corrupted) string of the target.
1649 * under the original name of the file that was moved on top of it. 2201 * The hash value has to match the hash queue that the dentry is on..
1650 */ 2202 */
1651
1652/* 2203/*
1653 * d_move_locked - move a dentry 2204 * d_move - move a dentry
1654 * @dentry: entry to move 2205 * @dentry: entry to move
1655 * @target: new dentry 2206 * @target: new dentry
1656 * 2207 *
1657 * Update the dcache to reflect the move of a file name. Negative 2208 * Update the dcache to reflect the move of a file name. Negative
1658 * dcache entries should not be moved in this way. 2209 * dcache entries should not be moved in this way.
1659 */ 2210 */
1660static void d_move_locked(struct dentry * dentry, struct dentry * target) 2211void d_move(struct dentry * dentry, struct dentry * target)
1661{ 2212{
1662 struct hlist_head *list;
1663
1664 if (!dentry->d_inode) 2213 if (!dentry->d_inode)
1665 printk(KERN_WARNING "VFS: moving negative dcache entry\n"); 2214 printk(KERN_WARNING "VFS: moving negative dcache entry\n");
1666 2215
2216 BUG_ON(d_ancestor(dentry, target));
2217 BUG_ON(d_ancestor(target, dentry));
2218
1667 write_seqlock(&rename_lock); 2219 write_seqlock(&rename_lock);
1668 /*
1669 * XXXX: do we really need to take target->d_lock?
1670 */
1671 if (target < dentry) {
1672 spin_lock(&target->d_lock);
1673 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1674 } else {
1675 spin_lock(&dentry->d_lock);
1676 spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
1677 }
1678 2220
1679 /* Move the dentry to the target hash queue, if on different bucket */ 2221 dentry_lock_for_move(dentry, target);
1680 if (d_unhashed(dentry))
1681 goto already_unhashed;
1682 2222
1683 hlist_del_rcu(&dentry->d_hash); 2223 write_seqcount_begin(&dentry->d_seq);
2224 write_seqcount_begin(&target->d_seq);
1684 2225
1685already_unhashed: 2226 /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
1686 list = d_hash(target->d_parent, target->d_name.hash); 2227
1687 __d_rehash(dentry, list); 2228 /*
2229 * Move the dentry to the target hash queue. Don't bother checking
2230 * for the same hash queue because of how unlikely it is.
2231 */
2232 __d_drop(dentry);
2233 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
1688 2234
1689 /* Unhash the target: dput() will then get rid of it */ 2235 /* Unhash the target: dput() will then get rid of it */
1690 __d_drop(target); 2236 __d_drop(target);
@@ -1709,27 +2255,16 @@ already_unhashed:
1709 } 2255 }
1710 2256
1711 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); 2257 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2258
2259 write_seqcount_end(&target->d_seq);
2260 write_seqcount_end(&dentry->d_seq);
2261
2262 dentry_unlock_parents_for_move(dentry, target);
1712 spin_unlock(&target->d_lock); 2263 spin_unlock(&target->d_lock);
1713 fsnotify_d_move(dentry); 2264 fsnotify_d_move(dentry);
1714 spin_unlock(&dentry->d_lock); 2265 spin_unlock(&dentry->d_lock);
1715 write_sequnlock(&rename_lock); 2266 write_sequnlock(&rename_lock);
1716} 2267}
1717
1718/**
1719 * d_move - move a dentry
1720 * @dentry: entry to move
1721 * @target: new dentry
1722 *
1723 * Update the dcache to reflect the move of a file name. Negative
1724 * dcache entries should not be moved in this way.
1725 */
1726
1727void d_move(struct dentry * dentry, struct dentry * target)
1728{
1729 spin_lock(&dcache_lock);
1730 d_move_locked(dentry, target);
1731 spin_unlock(&dcache_lock);
1732}
1733EXPORT_SYMBOL(d_move); 2268EXPORT_SYMBOL(d_move);
1734 2269
1735/** 2270/**
@@ -1755,13 +2290,13 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
1755 * This helper attempts to cope with remotely renamed directories 2290 * This helper attempts to cope with remotely renamed directories
1756 * 2291 *
1757 * It assumes that the caller is already holding 2292 * It assumes that the caller is already holding
1758 * dentry->d_parent->d_inode->i_mutex and the dcache_lock 2293 * dentry->d_parent->d_inode->i_mutex and the inode->i_lock
1759 * 2294 *
1760 * Note: If ever the locking in lock_rename() changes, then please 2295 * Note: If ever the locking in lock_rename() changes, then please
1761 * remember to update this too... 2296 * remember to update this too...
1762 */ 2297 */
1763static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) 2298static struct dentry *__d_unalias(struct inode *inode,
1764 __releases(dcache_lock) 2299 struct dentry *dentry, struct dentry *alias)
1765{ 2300{
1766 struct mutex *m1 = NULL, *m2 = NULL; 2301 struct mutex *m1 = NULL, *m2 = NULL;
1767 struct dentry *ret; 2302 struct dentry *ret;
@@ -1784,10 +2319,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
1784 goto out_err; 2319 goto out_err;
1785 m2 = &alias->d_parent->d_inode->i_mutex; 2320 m2 = &alias->d_parent->d_inode->i_mutex;
1786out_unalias: 2321out_unalias:
1787 d_move_locked(alias, dentry); 2322 d_move(alias, dentry);
1788 ret = alias; 2323 ret = alias;
1789out_err: 2324out_err:
1790 spin_unlock(&dcache_lock); 2325 spin_unlock(&inode->i_lock);
1791 if (m2) 2326 if (m2)
1792 mutex_unlock(m2); 2327 mutex_unlock(m2);
1793 if (m1) 2328 if (m1)
@@ -1798,17 +2333,23 @@ out_err:
1798/* 2333/*
1799 * Prepare an anonymous dentry for life in the superblock's dentry tree as a 2334 * Prepare an anonymous dentry for life in the superblock's dentry tree as a
1800 * named dentry in place of the dentry to be replaced. 2335 * named dentry in place of the dentry to be replaced.
2336 * returns with anon->d_lock held!
1801 */ 2337 */
1802static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) 2338static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1803{ 2339{
1804 struct dentry *dparent, *aparent; 2340 struct dentry *dparent, *aparent;
1805 2341
1806 switch_names(dentry, anon); 2342 dentry_lock_for_move(anon, dentry);
1807 swap(dentry->d_name.hash, anon->d_name.hash); 2343
2344 write_seqcount_begin(&dentry->d_seq);
2345 write_seqcount_begin(&anon->d_seq);
1808 2346
1809 dparent = dentry->d_parent; 2347 dparent = dentry->d_parent;
1810 aparent = anon->d_parent; 2348 aparent = anon->d_parent;
1811 2349
2350 switch_names(dentry, anon);
2351 swap(dentry->d_name.hash, anon->d_name.hash);
2352
1812 dentry->d_parent = (aparent == anon) ? dentry : aparent; 2353 dentry->d_parent = (aparent == anon) ? dentry : aparent;
1813 list_del(&dentry->d_u.d_child); 2354 list_del(&dentry->d_u.d_child);
1814 if (!IS_ROOT(dentry)) 2355 if (!IS_ROOT(dentry))
@@ -1823,6 +2364,13 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1823 else 2364 else
1824 INIT_LIST_HEAD(&anon->d_u.d_child); 2365 INIT_LIST_HEAD(&anon->d_u.d_child);
1825 2366
2367 write_seqcount_end(&dentry->d_seq);
2368 write_seqcount_end(&anon->d_seq);
2369
2370 dentry_unlock_parents_for_move(anon, dentry);
2371 spin_unlock(&dentry->d_lock);
2372
2373 /* anon->d_lock still locked, returns locked */
1826 anon->d_flags &= ~DCACHE_DISCONNECTED; 2374 anon->d_flags &= ~DCACHE_DISCONNECTED;
1827} 2375}
1828 2376
@@ -1840,14 +2388,15 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1840 2388
1841 BUG_ON(!d_unhashed(dentry)); 2389 BUG_ON(!d_unhashed(dentry));
1842 2390
1843 spin_lock(&dcache_lock);
1844
1845 if (!inode) { 2391 if (!inode) {
1846 actual = dentry; 2392 actual = dentry;
1847 __d_instantiate(dentry, NULL); 2393 __d_instantiate(dentry, NULL);
1848 goto found_lock; 2394 d_rehash(actual);
2395 goto out_nolock;
1849 } 2396 }
1850 2397
2398 spin_lock(&inode->i_lock);
2399
1851 if (S_ISDIR(inode->i_mode)) { 2400 if (S_ISDIR(inode->i_mode)) {
1852 struct dentry *alias; 2401 struct dentry *alias;
1853 2402
@@ -1858,13 +2407,12 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1858 /* Is this an anonymous mountpoint that we could splice 2407 /* Is this an anonymous mountpoint that we could splice
1859 * into our tree? */ 2408 * into our tree? */
1860 if (IS_ROOT(alias)) { 2409 if (IS_ROOT(alias)) {
1861 spin_lock(&alias->d_lock);
1862 __d_materialise_dentry(dentry, alias); 2410 __d_materialise_dentry(dentry, alias);
1863 __d_drop(alias); 2411 __d_drop(alias);
1864 goto found; 2412 goto found;
1865 } 2413 }
1866 /* Nope, but we must(!) avoid directory aliasing */ 2414 /* Nope, but we must(!) avoid directory aliasing */
1867 actual = __d_unalias(dentry, alias); 2415 actual = __d_unalias(inode, dentry, alias);
1868 if (IS_ERR(actual)) 2416 if (IS_ERR(actual))
1869 dput(alias); 2417 dput(alias);
1870 goto out_nolock; 2418 goto out_nolock;
@@ -1875,15 +2423,14 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1875 actual = __d_instantiate_unique(dentry, inode); 2423 actual = __d_instantiate_unique(dentry, inode);
1876 if (!actual) 2424 if (!actual)
1877 actual = dentry; 2425 actual = dentry;
1878 else if (unlikely(!d_unhashed(actual))) 2426 else
1879 goto shouldnt_be_hashed; 2427 BUG_ON(!d_unhashed(actual));
1880 2428
1881found_lock:
1882 spin_lock(&actual->d_lock); 2429 spin_lock(&actual->d_lock);
1883found: 2430found:
1884 _d_rehash(actual); 2431 _d_rehash(actual);
1885 spin_unlock(&actual->d_lock); 2432 spin_unlock(&actual->d_lock);
1886 spin_unlock(&dcache_lock); 2433 spin_unlock(&inode->i_lock);
1887out_nolock: 2434out_nolock:
1888 if (actual == dentry) { 2435 if (actual == dentry) {
1889 security_d_instantiate(dentry, inode); 2436 security_d_instantiate(dentry, inode);
@@ -1892,10 +2439,6 @@ out_nolock:
1892 2439
1893 iput(inode); 2440 iput(inode);
1894 return actual; 2441 return actual;
1895
1896shouldnt_be_hashed:
1897 spin_unlock(&dcache_lock);
1898 BUG();
1899} 2442}
1900EXPORT_SYMBOL_GPL(d_materialise_unique); 2443EXPORT_SYMBOL_GPL(d_materialise_unique);
1901 2444
@@ -1915,14 +2458,13 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1915} 2458}
1916 2459
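prepend_name() and its callers build paths right to left: the cursor starts at the end of the caller's buffer and each component is copied in front of what is already there, so no second pass or reversal is needed. For orientation, the underlying prepend() primitive already in this file amounts to:

        static int prepend(char **buffer, int *buflen, const char *str, int namelen)
        {
                *buflen -= namelen;
                if (*buflen < 0)
                        return -ENAMETOOLONG;
                *buffer -= namelen;
                memcpy(*buffer, str, namelen);
                return 0;
        }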
1917/** 2460/**
1918 * Prepend path string to a buffer 2461 * prepend_path - Prepend path string to a buffer
1919 *
1920 * @path: the dentry/vfsmount to report 2462 * @path: the dentry/vfsmount to report
1921 * @root: root vfsmnt/dentry (may be modified by this function) 2463 * @root: root vfsmnt/dentry (may be modified by this function)
1922 * @buffer: pointer to the end of the buffer 2464 * @buffer: pointer to the end of the buffer
1923 * @buflen: pointer to buffer length 2465 * @buflen: pointer to buffer length
1924 * 2466 *
1925 * Caller holds the dcache_lock. 2467 * Caller holds the rename_lock.
1926 * 2468 *
1927 * If path is not reachable from the supplied root, then the value of 2469 * If path is not reachable from the supplied root, then the value of
1928 * root is changed (without modifying refcounts). 2470 * root is changed (without modifying refcounts).
@@ -1950,7 +2492,9 @@ static int prepend_path(const struct path *path, struct path *root,
1950 } 2492 }
1951 parent = dentry->d_parent; 2493 parent = dentry->d_parent;
1952 prefetch(parent); 2494 prefetch(parent);
2495 spin_lock(&dentry->d_lock);
1953 error = prepend_name(buffer, buflen, &dentry->d_name); 2496 error = prepend_name(buffer, buflen, &dentry->d_name);
2497 spin_unlock(&dentry->d_lock);
1954 if (!error) 2498 if (!error)
1955 error = prepend(buffer, buflen, "/", 1); 2499 error = prepend(buffer, buflen, "/", 1);
1956 if (error) 2500 if (error)
@@ -1994,7 +2538,7 @@ global_root:
1994 * Returns a pointer into the buffer or an error code if the 2538 * Returns a pointer into the buffer or an error code if the
1995 * path was too long. 2539 * path was too long.
1996 * 2540 *
1997 * "buflen" should be positive. Caller holds the dcache_lock. 2541 * "buflen" should be positive.
1998 * 2542 *
1999 * If path is not reachable from the supplied root, then the value of 2543 * If path is not reachable from the supplied root, then the value of
2000 * root is changed (without modifying refcounts). 2544 * root is changed (without modifying refcounts).
@@ -2006,10 +2550,12 @@ char *__d_path(const struct path *path, struct path *root,
2006 int error; 2550 int error;
2007 2551
2008 prepend(&res, &buflen, "\0", 1); 2552 prepend(&res, &buflen, "\0", 1);
2553 write_seqlock(&rename_lock);
2009 error = prepend_path(path, root, &res, &buflen); 2554 error = prepend_path(path, root, &res, &buflen);
2555 write_sequnlock(&rename_lock);
2556
2010 if (error) 2557 if (error)
2011 return ERR_PTR(error); 2558 return ERR_PTR(error);
2012
2013 return res; 2559 return res;
2014} 2560}
2015 2561
@@ -2068,12 +2614,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
2068 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2614 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2069 2615
2070 get_fs_root(current->fs, &root); 2616 get_fs_root(current->fs, &root);
2071 spin_lock(&dcache_lock); 2617 write_seqlock(&rename_lock);
2072 tmp = root; 2618 tmp = root;
2073 error = path_with_deleted(path, &tmp, &res, &buflen); 2619 error = path_with_deleted(path, &tmp, &res, &buflen);
2074 if (error) 2620 if (error)
2075 res = ERR_PTR(error); 2621 res = ERR_PTR(error);
2076 spin_unlock(&dcache_lock); 2622 write_sequnlock(&rename_lock);
2077 path_put(&root); 2623 path_put(&root);
2078 return res; 2624 return res;
2079} 2625}
@@ -2099,12 +2645,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2099 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2645 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2100 2646
2101 get_fs_root(current->fs, &root); 2647 get_fs_root(current->fs, &root);
2102 spin_lock(&dcache_lock); 2648 write_seqlock(&rename_lock);
2103 tmp = root; 2649 tmp = root;
2104 error = path_with_deleted(path, &tmp, &res, &buflen); 2650 error = path_with_deleted(path, &tmp, &res, &buflen);
2105 if (!error && !path_equal(&tmp, &root)) 2651 if (!error && !path_equal(&tmp, &root))
2106 error = prepend_unreachable(&res, &buflen); 2652 error = prepend_unreachable(&res, &buflen);
2107 spin_unlock(&dcache_lock); 2653 write_sequnlock(&rename_lock);
2108 path_put(&root); 2654 path_put(&root);
2109 if (error) 2655 if (error)
2110 res = ERR_PTR(error); 2656 res = ERR_PTR(error);
@@ -2136,7 +2682,7 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
2136/* 2682/*
2137 * Write full pathname from the root of the filesystem into the buffer. 2683 * Write full pathname from the root of the filesystem into the buffer.
2138 */ 2684 */
2139char *__dentry_path(struct dentry *dentry, char *buf, int buflen) 2685static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2140{ 2686{
2141 char *end = buf + buflen; 2687 char *end = buf + buflen;
2142 char *retval; 2688 char *retval;
@@ -2150,10 +2696,13 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2150 2696
2151 while (!IS_ROOT(dentry)) { 2697 while (!IS_ROOT(dentry)) {
2152 struct dentry *parent = dentry->d_parent; 2698 struct dentry *parent = dentry->d_parent;
2699 int error;
2153 2700
2154 prefetch(parent); 2701 prefetch(parent);
2155 if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || 2702 spin_lock(&dentry->d_lock);
2156 (prepend(&end, &buflen, "/", 1) != 0)) 2703 error = prepend_name(&end, &buflen, &dentry->d_name);
2704 spin_unlock(&dentry->d_lock);
2705 if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
2157 goto Elong; 2706 goto Elong;
2158 2707
2159 retval = end; 2708 retval = end;
@@ -2163,14 +2712,25 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2163Elong: 2712Elong:
2164 return ERR_PTR(-ENAMETOOLONG); 2713 return ERR_PTR(-ENAMETOOLONG);
2165} 2714}
2166EXPORT_SYMBOL(__dentry_path); 2715
2716char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
2717{
2718 char *retval;
2719
2720 write_seqlock(&rename_lock);
2721 retval = __dentry_path(dentry, buf, buflen);
2722 write_sequnlock(&rename_lock);
2723
2724 return retval;
2725}
2726EXPORT_SYMBOL(dentry_path_raw);
2167 2727
2168char *dentry_path(struct dentry *dentry, char *buf, int buflen) 2728char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2169{ 2729{
2170 char *p = NULL; 2730 char *p = NULL;
2171 char *retval; 2731 char *retval;
2172 2732
2173 spin_lock(&dcache_lock); 2733 write_seqlock(&rename_lock);
2174 if (d_unlinked(dentry)) { 2734 if (d_unlinked(dentry)) {
2175 p = buf + buflen; 2735 p = buf + buflen;
2176 if (prepend(&p, &buflen, "//deleted", 10) != 0) 2736 if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2178,12 +2738,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2178 buflen++; 2738 buflen++;
2179 } 2739 }
2180 retval = __dentry_path(dentry, buf, buflen); 2740 retval = __dentry_path(dentry, buf, buflen);
2181 spin_unlock(&dcache_lock); 2741 write_sequnlock(&rename_lock);
2182 if (!IS_ERR(retval) && p) 2742 if (!IS_ERR(retval) && p)
2183 *p = '/'; /* restore '/' overridden with '\0' */ 2743 *p = '/'; /* restore '/' overridden with '\0' */
2184 return retval; 2744 return retval;
2185Elong: 2745Elong:
2186 spin_unlock(&dcache_lock);
2187 return ERR_PTR(-ENAMETOOLONG); 2746 return ERR_PTR(-ENAMETOOLONG);
2188} 2747}
2189 2748
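Note: the new dentry_path_raw() export returns a filesystem-relative path without appending "//deleted" for unlinked dentries, and takes the seqlock itself. A hypothetical caller (sketch; example_log_dentry is not part of this patch) might use it for debug output:

static void example_log_dentry(struct dentry *dentry)
{
        char *buf = (char *)__get_free_page(GFP_KERNEL);
        char *path;

        if (!buf)
                return;
        path = dentry_path_raw(dentry, buf, PAGE_SIZE);
        if (!IS_ERR(path))
                printk(KERN_DEBUG "dentry at %s\n", path);
        free_page((unsigned long)buf);
}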
@@ -2217,7 +2776,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2217 get_fs_root_and_pwd(current->fs, &root, &pwd); 2776 get_fs_root_and_pwd(current->fs, &root, &pwd);
2218 2777
2219 error = -ENOENT; 2778 error = -ENOENT;
2220 spin_lock(&dcache_lock); 2779 write_seqlock(&rename_lock);
2221 if (!d_unlinked(pwd.dentry)) { 2780 if (!d_unlinked(pwd.dentry)) {
2222 unsigned long len; 2781 unsigned long len;
2223 struct path tmp = root; 2782 struct path tmp = root;
@@ -2226,7 +2785,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2226 2785
2227 prepend(&cwd, &buflen, "\0", 1); 2786 prepend(&cwd, &buflen, "\0", 1);
2228 error = prepend_path(&pwd, &tmp, &cwd, &buflen); 2787 error = prepend_path(&pwd, &tmp, &cwd, &buflen);
2229 spin_unlock(&dcache_lock); 2788 write_sequnlock(&rename_lock);
2230 2789
2231 if (error) 2790 if (error)
2232 goto out; 2791 goto out;
@@ -2245,8 +2804,9 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2245 if (copy_to_user(buf, cwd, len)) 2804 if (copy_to_user(buf, cwd, len))
2246 error = -EFAULT; 2805 error = -EFAULT;
2247 } 2806 }
2248 } else 2807 } else {
2249 spin_unlock(&dcache_lock); 2808 write_sequnlock(&rename_lock);
2809 }
2250 2810
2251out: 2811out:
2252 path_put(&pwd); 2812 path_put(&pwd);
@@ -2274,25 +2834,25 @@ out:
2274int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) 2834int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2275{ 2835{
2276 int result; 2836 int result;
2277 unsigned long seq; 2837 unsigned seq;
2278 2838
2279 if (new_dentry == old_dentry) 2839 if (new_dentry == old_dentry)
2280 return 1; 2840 return 1;
2281 2841
2282 /*
2283 * Need rcu_readlock to protect against the d_parent trashing
2284 * due to d_move
2285 */
2286 rcu_read_lock();
2287 do { 2842 do {
2288 /* for restarting inner loop in case of seq retry */ 2843 /* for restarting inner loop in case of seq retry */
2289 seq = read_seqbegin(&rename_lock); 2844 seq = read_seqbegin(&rename_lock);
2845 /*
2846 * Need rcu_read_lock to protect against the d_parent trashing
2847 * due to d_move
2848 */
2849 rcu_read_lock();
2290 if (d_ancestor(old_dentry, new_dentry)) 2850 if (d_ancestor(old_dentry, new_dentry))
2291 result = 1; 2851 result = 1;
2292 else 2852 else
2293 result = 0; 2853 result = 0;
2854 rcu_read_unlock();
2294 } while (read_seqretry(&rename_lock, seq)); 2855 } while (read_seqretry(&rename_lock, seq));
2295 rcu_read_unlock();
2296 2856
2297 return result; 2857 return result;
2298} 2858}
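Note the reordering above: rcu_read_lock() now sits inside the retry loop, so each pass gets a fresh RCU read-side section aligned with exactly one seqlock sample. A sketch of that combined pattern (walk_ancestors() is a hypothetical stand-in for d_ancestor()):

static int example_is_descendant(struct dentry *child, struct dentry *root)
{
        unsigned seq;
        int result;

        do {
                seq = read_seqbegin(&rename_lock);
                rcu_read_lock();        /* keeps the d_parent chain valid */
                result = walk_ancestors(child, root);
                rcu_read_unlock();
        } while (read_seqretry(&rename_lock, seq));

        return result;
}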
@@ -2324,10 +2884,15 @@ EXPORT_SYMBOL(path_is_under);
2324 2884
2325void d_genocide(struct dentry *root) 2885void d_genocide(struct dentry *root)
2326{ 2886{
2327 struct dentry *this_parent = root; 2887 struct dentry *this_parent;
2328 struct list_head *next; 2888 struct list_head *next;
2889 unsigned seq;
2890 int locked = 0;
2329 2891
2330 spin_lock(&dcache_lock); 2892 seq = read_seqbegin(&rename_lock);
2893again:
2894 this_parent = root;
2895 spin_lock(&this_parent->d_lock);
2331repeat: 2896repeat:
2332 next = this_parent->d_subdirs.next; 2897 next = this_parent->d_subdirs.next;
2333resume: 2898resume:
@@ -2335,21 +2900,62 @@ resume:
2335 struct list_head *tmp = next; 2900 struct list_head *tmp = next;
2336 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 2901 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
2337 next = tmp->next; 2902 next = tmp->next;
2338 if (d_unhashed(dentry)||!dentry->d_inode) 2903
2904 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
2905 if (d_unhashed(dentry) || !dentry->d_inode) {
2906 spin_unlock(&dentry->d_lock);
2339 continue; 2907 continue;
2908 }
2340 if (!list_empty(&dentry->d_subdirs)) { 2909 if (!list_empty(&dentry->d_subdirs)) {
2910 spin_unlock(&this_parent->d_lock);
2911 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
2341 this_parent = dentry; 2912 this_parent = dentry;
2913 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
2342 goto repeat; 2914 goto repeat;
2343 } 2915 }
2344 atomic_dec(&dentry->d_count); 2916 if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
2917 dentry->d_flags |= DCACHE_GENOCIDE;
2918 dentry->d_count--;
2919 }
2920 spin_unlock(&dentry->d_lock);
2345 } 2921 }
2346 if (this_parent != root) { 2922 if (this_parent != root) {
2347 next = this_parent->d_u.d_child.next; 2923 struct dentry *tmp;
2348 atomic_dec(&this_parent->d_count); 2924 struct dentry *child;
2349 this_parent = this_parent->d_parent; 2925
2926 tmp = this_parent->d_parent;
2927 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2928 this_parent->d_flags |= DCACHE_GENOCIDE;
2929 this_parent->d_count--;
2930 }
2931 rcu_read_lock();
2932 spin_unlock(&this_parent->d_lock);
2933 child = this_parent;
2934 this_parent = tmp;
2935 spin_lock(&this_parent->d_lock);
2936 /* might go back up the wrong parent if we have had a rename
2937 * or deletion */
2938 if (this_parent != child->d_parent ||
2939 (!locked && read_seqretry(&rename_lock, seq))) {
2940 spin_unlock(&this_parent->d_lock);
2941 rcu_read_unlock();
2942 goto rename_retry;
2943 }
2944 rcu_read_unlock();
2945 next = child->d_u.d_child.next;
2350 goto resume; 2946 goto resume;
2351 } 2947 }
2352 spin_unlock(&dcache_lock); 2948 spin_unlock(&this_parent->d_lock);
2949 if (!locked && read_seqretry(&rename_lock, seq))
2950 goto rename_retry;
2951 if (locked)
2952 write_sequnlock(&rename_lock);
2953 return;
2954
2955rename_retry:
2956 locked = 1;
2957 write_seqlock(&rename_lock);
2958 goto again;
2353} 2959}
2354 2960
2355/** 2961/**
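Note: the rewritten d_genocide walks the tree under per-dentry d_lock, validating against rename_lock; if a concurrent rename invalidates the walk, it restarts holding the write side so the second pass cannot race. A sketch of that optimistic-then-locked retry (do_walk() is hypothetical):

static void example_walk_tree(struct dentry *root)
{
        unsigned seq;
        int locked = 0;

        seq = read_seqbegin(&rename_lock);
again:
        do_walk(root);                  /* hypothetical tree walk */
        if (!locked && read_seqretry(&rename_lock, seq)) {
                locked = 1;
                write_seqlock(&rename_lock);    /* block renames, redo walk */
                goto again;
        }
        if (locked)
                write_sequnlock(&rename_lock);
}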
@@ -2403,7 +3009,7 @@ static void __init dcache_init_early(void)
2403 3009
2404 dentry_hashtable = 3010 dentry_hashtable =
2405 alloc_large_system_hash("Dentry cache", 3011 alloc_large_system_hash("Dentry cache",
2406 sizeof(struct hlist_head), 3012 sizeof(struct dcache_hash_bucket),
2407 dhash_entries, 3013 dhash_entries,
2408 13, 3014 13,
2409 HASH_EARLY, 3015 HASH_EARLY,
@@ -2412,7 +3018,7 @@ static void __init dcache_init_early(void)
2412 0); 3018 0);
2413 3019
2414 for (loop = 0; loop < (1 << d_hash_shift); loop++) 3020 for (loop = 0; loop < (1 << d_hash_shift); loop++)
2415 INIT_HLIST_HEAD(&dentry_hashtable[loop]); 3021 INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
2416} 3022}
2417 3023
2418static void __init dcache_init(void) 3024static void __init dcache_init(void)
@@ -2435,7 +3041,7 @@ static void __init dcache_init(void)
2435 3041
2436 dentry_hashtable = 3042 dentry_hashtable =
2437 alloc_large_system_hash("Dentry cache", 3043 alloc_large_system_hash("Dentry cache",
2438 sizeof(struct hlist_head), 3044 sizeof(struct dcache_hash_bucket),
2439 dhash_entries, 3045 dhash_entries,
2440 13, 3046 13,
2441 0, 3047 0,
@@ -2444,7 +3050,7 @@ static void __init dcache_init(void)
2444 0); 3050 0);
2445 3051
2446 for (loop = 0; loop < (1 << d_hash_shift); loop++) 3052 for (loop = 0; loop < (1 << d_hash_shift); loop++)
2447 INIT_HLIST_HEAD(&dentry_hashtable[loop]); 3053 INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
2448} 3054}
2449 3055
2450/* SLAB cache for __getname() consumers */ 3056/* SLAB cache for __getname() consumers */
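Note: the hash setup now sizes buckets as struct dcache_hash_bucket and initializes hlist_bl heads. The struct definition is not in this hunk, but the INIT_HLIST_BL_HEAD calls suggest it wraps an hlist_bl_head, whose low pointer bit doubles as a per-bucket spinlock. A sketch of that bit-locked list usage (an assumption, not the dcache code itself):

#include <linux/list_bl.h>

struct example_bucket {
        struct hlist_bl_head head;
};

static void example_insert(struct example_bucket *b,
                           struct hlist_bl_node *n)
{
        hlist_bl_lock(&b->head);        /* spins on bit 0 of the pointer */
        hlist_bl_add_head(n, &b->head);
        hlist_bl_unlock(&b->head);
}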
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 0210898458b2..89d394d8fe24 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -43,6 +43,7 @@ const struct file_operations debugfs_file_operations = {
43 .read = default_read_file, 43 .read = default_read_file,
44 .write = default_write_file, 44 .write = default_write_file,
45 .open = default_open, 45 .open = default_open,
46 .llseek = noop_llseek,
46}; 47};
47 48
48static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) 49static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -454,6 +455,7 @@ static const struct file_operations fops_bool = {
454 .read = read_file_bool, 455 .read = read_file_bool,
455 .write = write_file_bool, 456 .write = write_file_bool,
456 .open = default_open, 457 .open = default_open,
458 .llseek = default_llseek,
457}; 459};
458 460
459/** 461/**
@@ -498,6 +500,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
498static const struct file_operations fops_blob = { 500static const struct file_operations fops_blob = {
499 .read = read_file_blob, 501 .read = read_file_blob,
500 .open = default_open, 502 .open = default_open,
503 .llseek = default_llseek,
501}; 504};
502 505
503/** 506/**
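Note: these fops additions are part of the BKL-removal fallout: seek behavior is no longer implicit, so every file_operations must choose an explicit .llseek. noop_llseek reports success without moving f_pos; default_llseek does normal offset arithmetic. A minimal sketch (example_read is hypothetical):

static const struct file_operations example_fops = {
        .owner  = THIS_MODULE,
        .read   = example_read,         /* hypothetical handler */
        .llseek = noop_llseek,          /* "succeed but don't move f_pos" */
};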
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbcac..37a8ca7c1222 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
40 struct inode *inode = new_inode(sb); 40 struct inode *inode = new_inode(sb);
41 41
42 if (inode) { 42 if (inode) {
43 inode->i_ino = get_next_ino();
43 inode->i_mode = mode; 44 inode->i_mode = mode;
44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
45 switch (mode & S_IFMT) { 46 switch (mode & S_IFMT) {
@@ -134,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
134 return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); 135 return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
135} 136}
136 137
137static int debug_get_sb(struct file_system_type *fs_type, 138static struct dentry *debug_mount(struct file_system_type *fs_type,
138 int flags, const char *dev_name, 139 int flags, const char *dev_name,
139 void *data, struct vfsmount *mnt) 140 void *data)
140{ 141{
141 return get_sb_single(fs_type, flags, data, debug_fill_super, mnt); 142 return mount_single(fs_type, flags, data, debug_fill_super);
142} 143}
143 144
144static struct file_system_type debug_fs_type = { 145static struct file_system_type debug_fs_type = {
145 .owner = THIS_MODULE, 146 .owner = THIS_MODULE,
146 .name = "debugfs", 147 .name = "debugfs",
147 .get_sb = debug_get_sb, 148 .mount = debug_mount,
148 .kill_sb = kill_litter_super, 149 .kill_sb = kill_litter_super,
149}; 150};
150 151
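Note: the .get_sb -> .mount conversion changes the contract: instead of filling in a struct vfsmount and returning an int, the callback returns the root dentry or an ERR_PTR. A sketch of the converted shape for a hypothetical single-instance filesystem (example_fill_super is assumed, not shown):

static struct dentry *example_mount(struct file_system_type *fs_type,
                                    int flags, const char *dev_name,
                                    void *data)
{
        /* mount_single() returns the root dentry or an ERR_PTR */
        return mount_single(fs_type, flags, data, example_fill_super);
}

static struct file_system_type example_fs_type = {
        .owner   = THIS_MODULE,
        .name    = "examplefs",
        .mount   = example_mount,
        .kill_sb = kill_litter_super,
};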
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8b3ffd5b5235..1bb547c9cad6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
331} 331}
332 332
333/* 333/*
334 * devpts_get_sb() 334 * devpts_mount()
335 * 335 *
336 * If the '-o newinstance' mount option was specified, mount a new 336 * If the '-o newinstance' mount option was specified, mount a new
337 * (private) instance of devpts. PTYs created in this instance are 337 * (private) instance of devpts. PTYs created in this instance are
@@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
345 * semantics in devpts while preserving backward compatibility of the 345 * semantics in devpts while preserving backward compatibility of the
346 * current 'single-namespace' semantics. i.e all mounts of devpts 346 * current 'single-namespace' semantics. i.e all mounts of devpts
347 * without the 'newinstance' mount option should bind to the initial 347 * without the 'newinstance' mount option should bind to the initial
348 * kernel mount, like get_sb_single(). 348 * kernel mount, like mount_single().
349 * 349 *
350 * Mounts with 'newinstance' option create a new, private namespace. 350 * Mounts with 'newinstance' option create a new, private namespace.
351 * 351 *
352 * NOTE: 352 * NOTE:
353 * 353 *
354 * For single-mount semantics, devpts cannot use get_sb_single(), 354 * For single-mount semantics, devpts cannot use mount_single(),
355 * because get_sb_single()/sget() find and use the super-block from 355 * because mount_single()/sget() find and use the super-block from
356 * the most recent mount of devpts. But that recent mount may be a 356 * the most recent mount of devpts. But that recent mount may be a
357 * 'newinstance' mount and get_sb_single() would pick the newinstance 357 * 'newinstance' mount and mount_single() would pick the newinstance
358 * super-block instead of the initial super-block. 358 * super-block instead of the initial super-block.
359 */ 359 */
360static int devpts_get_sb(struct file_system_type *fs_type, 360static struct dentry *devpts_mount(struct file_system_type *fs_type,
361 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 361 int flags, const char *dev_name, void *data)
362{ 362{
363 int error; 363 int error;
364 struct pts_mount_opts opts; 364 struct pts_mount_opts opts;
@@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
366 366
367 error = parse_mount_options(data, PARSE_MOUNT, &opts); 367 error = parse_mount_options(data, PARSE_MOUNT, &opts);
368 if (error) 368 if (error)
369 return error; 369 return ERR_PTR(error);
370 370
371 if (opts.newinstance) 371 if (opts.newinstance)
372 s = sget(fs_type, NULL, set_anon_super, NULL); 372 s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
374 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); 374 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
375 375
376 if (IS_ERR(s)) 376 if (IS_ERR(s))
377 return PTR_ERR(s); 377 return ERR_CAST(s);
378 378
379 if (!s->s_root) { 379 if (!s->s_root) {
380 s->s_flags = flags; 380 s->s_flags = flags;
@@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type,
390 if (error) 390 if (error)
391 goto out_undo_sget; 391 goto out_undo_sget;
392 392
393 simple_set_mnt(mnt, s); 393 return dget(s->s_root);
394
395 return 0;
396 394
397out_undo_sget: 395out_undo_sget:
398 deactivate_locked_super(s); 396 deactivate_locked_super(s);
399 return error; 397 return ERR_PTR(error);
400} 398}
401 399
402#else 400#else
@@ -404,10 +402,10 @@ out_undo_sget:
404 * This supports only the legacy single-instance semantics (no 402 * This supports only the legacy single-instance semantics (no
405 * multiple-instance semantics) 403 * multiple-instance semantics)
406 */ 404 */
407static int devpts_get_sb(struct file_system_type *fs_type, int flags, 405static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
408 const char *dev_name, void *data, struct vfsmount *mnt) 406 const char *dev_name, void *data)
409{ 407{
410 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); 408 return mount_single(fs_type, flags, data, devpts_fill_super);
411} 409}
412#endif 410#endif
413 411
@@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb)
421 419
422static struct file_system_type devpts_fs_type = { 420static struct file_system_type devpts_fs_type = {
423 .name = "devpts", 421 .name = "devpts",
424 .get_sb = devpts_get_sb, 422 .mount = devpts_mount,
425 .kill_sb = devpts_kill_sb, 423 .kill_sb = devpts_kill_sb,
426}; 424};
427 425
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 48d74c7391d1..b044705eedd4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
218 * filesystems can use it to hold additional state between get_block calls and 218 * filesystems can use it to hold additional state between get_block calls and
219 * dio_complete. 219 * dio_complete.
220 */ 220 */
221static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) 221static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
222{ 222{
223 ssize_t transferred = 0; 223 ssize_t transferred = 0;
224 224
@@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error)
325} 325}
326EXPORT_SYMBOL_GPL(dio_end_io); 326EXPORT_SYMBOL_GPL(dio_end_io);
327 327
328static int 328static void
329dio_bio_alloc(struct dio *dio, struct block_device *bdev, 329dio_bio_alloc(struct dio *dio, struct block_device *bdev,
330 sector_t first_sector, int nr_vecs) 330 sector_t first_sector, int nr_vecs)
331{ 331{
332 struct bio *bio; 332 struct bio *bio;
333 333
334 /*
335 * bio_alloc() is guaranteed to return a bio when called with
336 * __GFP_WAIT and we request a valid number of vectors.
337 */
334 bio = bio_alloc(GFP_KERNEL, nr_vecs); 338 bio = bio_alloc(GFP_KERNEL, nr_vecs);
335 339
336 bio->bi_bdev = bdev; 340 bio->bi_bdev = bdev;
@@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
342 346
343 dio->bio = bio; 347 dio->bio = bio;
344 dio->logical_offset_in_bio = dio->cur_page_fs_offset; 348 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
345 return 0;
346} 349}
347 350
348/* 351/*
@@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector)
583 goto out; 586 goto out;
584 sector = start_sector << (dio->blkbits - 9); 587 sector = start_sector << (dio->blkbits - 9);
585 nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); 588 nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
589 nr_pages = min(nr_pages, BIO_MAX_PAGES);
586 BUG_ON(nr_pages <= 0); 590 BUG_ON(nr_pages <= 0);
587 ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); 591 dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
588 dio->boundary = 0; 592 dio->boundary = 0;
589out: 593out:
590 return ret; 594 return ret;
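Note: dio_bio_alloc can drop its return value because bio_alloc() with __GFP_WAIT (implied by GFP_KERNEL) and a vector count clamped to BIO_MAX_PAGES sleeps until it succeeds rather than returning NULL; the new min() at the call site preserves that precondition. A sketch of the invariant (example_alloc_bio is illustrative):

static struct bio *example_alloc_bio(struct block_device *bdev,
                                     sector_t sector, int nr)
{
        struct bio *bio;

        nr = min(nr, BIO_MAX_PAGES);            /* keep bio_alloc's precondition */
        bio = bio_alloc(GFP_KERNEL, nr);        /* cannot fail here */
        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        return bio;
}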
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 2dbb422e8116..1897eb1b4b6a 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,8 +1,7 @@
1menuconfig DLM 1menuconfig DLM
2 tristate "Distributed Lock Manager (DLM)" 2 tristate "Distributed Lock Manager (DLM)"
3 depends on EXPERIMENTAL && INET 3 depends on EXPERIMENTAL && INET
4 depends on SYSFS && (IPV6 || IPV6=n) 4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
5 select CONFIGFS_FS
6 select IP_SCTP 5 select IP_SCTP
7 help 6 help
8 A general purpose distributed lock manager for kernel or userspace 7 A general purpose distributed lock manager for kernel or userspace
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index c6cf25158746..6b42ba807dfd 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -643,7 +643,8 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
643static const struct file_operations waiters_fops = { 643static const struct file_operations waiters_fops = {
644 .owner = THIS_MODULE, 644 .owner = THIS_MODULE,
645 .open = waiters_open, 645 .open = waiters_open,
646 .read = waiters_read 646 .read = waiters_read,
647 .llseek = default_llseek,
647}; 648};
648 649
649void dlm_delete_debug_file(struct dlm_ls *ls) 650void dlm_delete_debug_file(struct dlm_ls *ls)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 031dbe3a15ca..64e5f3efdd81 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1846,6 +1846,9 @@ static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1846 struct dlm_lkb *gr; 1846 struct dlm_lkb *gr;
1847 1847
1848 list_for_each_entry(gr, head, lkb_statequeue) { 1848 list_for_each_entry(gr, head, lkb_statequeue) {
1849 /* skip self when sending basts to convertqueue */
1850 if (gr == lkb)
1851 continue;
1849 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) { 1852 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
1850 queue_bast(r, gr, lkb->lkb_rqmode); 1853 queue_bast(r, gr, lkb->lkb_rqmode);
1851 gr->lkb_highbast = lkb->lkb_rqmode; 1854 gr->lkb_highbast = lkb->lkb_rqmode;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622a..9c64ae9e4c1a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,9 @@
63#define NEEDED_RMEM (4*1024*1024) 63#define NEEDED_RMEM (4*1024*1024)
64#define CONN_HASH_SIZE 32 64#define CONN_HASH_SIZE 32
65 65
66/* Number of messages to send before rescheduling */
67#define MAX_SEND_MSG_COUNT 25
68
66struct cbuf { 69struct cbuf {
67 unsigned int base; 70 unsigned int base;
68 unsigned int len; 71 unsigned int len;
@@ -108,6 +111,7 @@ struct connection {
108#define CF_INIT_PENDING 4 111#define CF_INIT_PENDING 4
109#define CF_IS_OTHERCON 5 112#define CF_IS_OTHERCON 5
110#define CF_CLOSE 6 113#define CF_CLOSE 6
114#define CF_APP_LIMITED 7
111 struct list_head writequeue; /* List of outgoing writequeue_entries */ 115 struct list_head writequeue; /* List of outgoing writequeue_entries */
112 spinlock_t writequeue_lock; 116 spinlock_t writequeue_lock;
113 int (*rx_action) (struct connection *); /* What to do when active */ 117 int (*rx_action) (struct connection *); /* What to do when active */
@@ -295,7 +299,17 @@ static void lowcomms_write_space(struct sock *sk)
295{ 299{
296 struct connection *con = sock2con(sk); 300 struct connection *con = sock2con(sk);
297 301
298 if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags)) 302 if (!con)
303 return;
304
305 clear_bit(SOCK_NOSPACE, &con->sock->flags);
306
307 if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
308 con->sock->sk->sk_write_pending--;
309 clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
310 }
311
312 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
299 queue_work(send_workqueue, &con->swork); 313 queue_work(send_workqueue, &con->swork);
300} 314}
301 315
@@ -915,6 +929,7 @@ static void tcp_connect_to_sock(struct connection *con)
915 struct sockaddr_storage saddr, src_addr; 929 struct sockaddr_storage saddr, src_addr;
916 int addr_len; 930 int addr_len;
917 struct socket *sock = NULL; 931 struct socket *sock = NULL;
932 int one = 1;
918 933
919 if (con->nodeid == 0) { 934 if (con->nodeid == 0) {
920 log_print("attempt to connect sock 0 foiled"); 935 log_print("attempt to connect sock 0 foiled");
@@ -960,6 +975,11 @@ static void tcp_connect_to_sock(struct connection *con)
960 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); 975 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
961 976
962 log_print("connecting to %d", con->nodeid); 977 log_print("connecting to %d", con->nodeid);
978
979 /* Turn off Nagle's algorithm */
980 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
981 sizeof(one));
982
963 result = 983 result =
964 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, 984 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
965 O_NONBLOCK); 985 O_NONBLOCK);
@@ -1011,6 +1031,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
1011 goto create_out; 1031 goto create_out;
1012 } 1032 }
1013 1033
1034 /* Turn off Nagle's algorithm */
1035 kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
1036 sizeof(one));
1037
1014 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, 1038 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
1015 (char *)&one, sizeof(one)); 1039 (char *)&one, sizeof(one));
1016 1040
@@ -1297,6 +1321,7 @@ static void send_to_sock(struct connection *con)
1297 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1321 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1298 struct writequeue_entry *e; 1322 struct writequeue_entry *e;
1299 int len, offset; 1323 int len, offset;
1324 int count = 0;
1300 1325
1301 mutex_lock(&con->sock_mutex); 1326 mutex_lock(&con->sock_mutex);
1302 if (con->sock == NULL) 1327 if (con->sock == NULL)
@@ -1319,14 +1344,27 @@ static void send_to_sock(struct connection *con)
1319 ret = kernel_sendpage(con->sock, e->page, offset, len, 1344 ret = kernel_sendpage(con->sock, e->page, offset, len,
1320 msg_flags); 1345 msg_flags);
1321 if (ret == -EAGAIN || ret == 0) { 1346 if (ret == -EAGAIN || ret == 0) {
1347 if (ret == -EAGAIN &&
1348 test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
1349 !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
1350 /* Notify TCP that we're limited by the
1351 * application window size.
1352 */
1353 set_bit(SOCK_NOSPACE, &con->sock->flags);
1354 con->sock->sk->sk_write_pending++;
1355 }
1322 cond_resched(); 1356 cond_resched();
1323 goto out; 1357 goto out;
1324 } 1358 }
1325 if (ret <= 0) 1359 if (ret <= 0)
1326 goto send_error; 1360 goto send_error;
1327 } 1361 }
1328 /* Don't starve people filling buffers */ 1362
1363 /* Don't starve people filling buffers */
1364 if (++count >= MAX_SEND_MSG_COUNT) {
1329 cond_resched(); 1365 cond_resched();
1366 count = 0;
1367 }
1330 1368
1331 spin_lock(&con->writequeue_lock); 1369 spin_lock(&con->writequeue_lock);
1332 e->offset += ret; 1370 e->offset += ret;
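Note: MAX_SEND_MSG_COUNT turns the unconditional cond_resched() into a batched one: the sender now yields only once every 25 messages, cutting scheduling overhead without starving other tasks. A sketch of the pattern (example_queue and example_send_one are hypothetical):

#define EXAMPLE_BATCH 25

struct example_queue;                                   /* hypothetical */
extern int example_send_one(struct example_queue *q);   /* hypothetical */

static void example_drain(struct example_queue *q)
{
        int count = 0;

        while (example_send_one(q) > 0) {
                if (++count >= EXAMPLE_BATCH) {
                        cond_resched();         /* yield once per batch */
                        count = 0;
                }
        }
}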
@@ -1430,20 +1468,19 @@ static void work_stop(void)
1430 1468
1431static int work_start(void) 1469static int work_start(void)
1432{ 1470{
1433 int error; 1471 recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
1434 recv_workqueue = create_workqueue("dlm_recv"); 1472 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1435 error = IS_ERR(recv_workqueue); 1473 if (!recv_workqueue) {
1436 if (error) { 1474 log_print("can't start dlm_recv");
1437 log_print("can't start dlm_recv %d", error); 1475 return -ENOMEM;
1438 return error;
1439 } 1476 }
1440 1477
1441 send_workqueue = create_singlethread_workqueue("dlm_send"); 1478 send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
1442 error = IS_ERR(send_workqueue); 1479 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1443 if (error) { 1480 if (!send_workqueue) {
1444 log_print("can't start dlm_send %d", error); 1481 log_print("can't start dlm_send");
1445 destroy_workqueue(recv_workqueue); 1482 destroy_workqueue(recv_workqueue);
1446 return error; 1483 return -ENOMEM;
1447 } 1484 }
1448 1485
1449 return 0; 1486 return 0;
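Note: the old checks were buggy: create_workqueue() returns NULL on failure, never an ERR_PTR, so IS_ERR() could not detect the error. The replacement tests the pointer directly and returns -ENOMEM. A sketch of the corrected pattern:

static struct workqueue_struct *example_wq;

static int example_start(void)
{
        example_wq = alloc_workqueue("example", WQ_MEM_RECLAIM, 0);
        if (!example_wq)                /* NULL, not ERR_PTR, on failure */
                return -ENOMEM;
        return 0;
}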
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d45c02db6943..30d8b85febbf 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -412,7 +412,8 @@ static const struct file_operations dev_fops = {
412 .read = dev_read, 412 .read = dev_read,
413 .write = dev_write, 413 .write = dev_write,
414 .poll = dev_poll, 414 .poll = dev_poll,
415 .owner = THIS_MODULE 415 .owner = THIS_MODULE,
416 .llseek = noop_llseek,
416}; 417};
417 418
418static struct miscdevice plock_dev_misc = { 419static struct miscdevice plock_dev_misc = {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b6272853130c..66d6c16bf440 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1009,6 +1009,7 @@ static const struct file_operations device_fops = {
1009 .write = device_write, 1009 .write = device_write,
1010 .poll = device_poll, 1010 .poll = device_poll,
1011 .owner = THIS_MODULE, 1011 .owner = THIS_MODULE,
1012 .llseek = noop_llseek,
1012}; 1013};
1013 1014
1014static const struct file_operations ctl_device_fops = { 1015static const struct file_operations ctl_device_fops = {
@@ -1017,6 +1018,7 @@ static const struct file_operations ctl_device_fops = {
1017 .read = device_read, 1018 .read = device_read,
1018 .write = device_write, 1019 .write = device_write,
1019 .owner = THIS_MODULE, 1020 .owner = THIS_MODULE,
1021 .llseek = noop_llseek,
1020}; 1022};
1021 1023
1022static struct miscdevice ctl_device = { 1024static struct miscdevice ctl_device = {
@@ -1029,6 +1031,7 @@ static const struct file_operations monitor_device_fops = {
1029 .open = monitor_device_open, 1031 .open = monitor_device_open,
1030 .release = monitor_device_close, 1032 .release = monitor_device_close,
1031 .owner = THIS_MODULE, 1033 .owner = THIS_MODULE,
1034 .llseek = noop_llseek,
1032}; 1035};
1033 1036
1034static struct miscdevice monitor_device = { 1037static struct miscdevice monitor_device = {
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cbadc1bee6e7..bfd8b680e648 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
348 BUG_ON(!crypt_stat || !crypt_stat->tfm 348 BUG_ON(!crypt_stat || !crypt_stat->tfm
349 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); 349 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
350 if (unlikely(ecryptfs_verbosity > 0)) { 350 if (unlikely(ecryptfs_verbosity > 0)) {
351 ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n", 351 ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
352 crypt_stat->key_size); 352 crypt_stat->key_size);
353 ecryptfs_dump_hex(crypt_stat->key, 353 ecryptfs_dump_hex(crypt_stat->key,
354 crypt_stat->key_size); 354 crypt_stat->key_size);
@@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
413 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 413 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
414 (extent_base + extent_offset)); 414 (extent_base + extent_offset));
415 if (rc) { 415 if (rc) {
416 ecryptfs_printk(KERN_ERR, "Error attempting to " 416 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
417 "derive IV for extent [0x%.16x]; " 417 "extent [0x%.16llx]; rc = [%d]\n",
418 "rc = [%d]\n", (extent_base + extent_offset), 418 (unsigned long long)(extent_base + extent_offset), rc);
419 rc);
420 goto out; 419 goto out;
421 } 420 }
422 if (unlikely(ecryptfs_verbosity > 0)) { 421 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
443 } 442 }
444 rc = 0; 443 rc = 0;
445 if (unlikely(ecryptfs_verbosity > 0)) { 444 if (unlikely(ecryptfs_verbosity > 0)) {
446 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; " 445 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
447 "rc = [%d]\n", (extent_base + extent_offset), 446 "rc = [%d]\n",
448 rc); 447 (unsigned long long)(extent_base + extent_offset), rc);
449 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " 448 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
450 "encryption:\n"); 449 "encryption:\n");
451 ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8); 450 ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
@@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
540 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 539 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
541 (extent_base + extent_offset)); 540 (extent_base + extent_offset));
542 if (rc) { 541 if (rc) {
543 ecryptfs_printk(KERN_ERR, "Error attempting to " 542 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
544 "derive IV for extent [0x%.16x]; " 543 "extent [0x%.16llx]; rc = [%d]\n",
545 "rc = [%d]\n", (extent_base + extent_offset), 544 (unsigned long long)(extent_base + extent_offset), rc);
546 rc);
547 goto out; 545 goto out;
548 } 546 }
549 if (unlikely(ecryptfs_verbosity > 0)) { 547 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
571 } 569 }
572 rc = 0; 570 rc = 0;
573 if (unlikely(ecryptfs_verbosity > 0)) { 571 if (unlikely(ecryptfs_verbosity > 0)) {
574 ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; " 572 ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
575 "rc = [%d]\n", (extent_base + extent_offset), 573 "rc = [%d]\n",
576 rc); 574 (unsigned long long)(extent_base + extent_offset), rc);
577 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " 575 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
578 "decryption:\n"); 576 "decryption:\n");
579 ecryptfs_dump_hex((char *)(page_address(page) 577 ecryptfs_dump_hex((char *)(page_address(page)
@@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
780 } 778 }
781 ecryptfs_printk(KERN_DEBUG, 779 ecryptfs_printk(KERN_DEBUG,
782 "Initializing cipher [%s]; strlen = [%d]; " 780 "Initializing cipher [%s]; strlen = [%d]; "
783 "key_size_bits = [%d]\n", 781 "key_size_bits = [%zd]\n",
784 crypt_stat->cipher, (int)strlen(crypt_stat->cipher), 782 crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
785 crypt_stat->key_size << 3); 783 crypt_stat->key_size << 3);
786 if (crypt_stat->tfm) { 784 if (crypt_stat->tfm) {
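Note: the format-string fixes follow two portable-printk rules: size_t takes %zd/%zu, and 64-bit offsets built from loff_t are cast to unsigned long long for %llx, so the code prints identically on 32- and 64-bit builds. A compact sketch:

static void example_log(size_t key_size, loff_t extent)
{
        printk(KERN_DEBUG "key size [%zd]\n", key_size); /* size_t: %zd/%zu */
        printk(KERN_DEBUG "extent [0x%.16llx]\n",
               (unsigned long long)extent);      /* cast 64-bit offsets */
}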
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 906e803f7f79..6fc4f319b550 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -44,12 +44,17 @@
44 */ 44 */
45static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) 45static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
46{ 46{
47 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 47 struct dentry *lower_dentry;
48 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 48 struct vfsmount *lower_mnt;
49 struct dentry *dentry_save; 49 struct dentry *dentry_save;
50 struct vfsmount *vfsmount_save; 50 struct vfsmount *vfsmount_save;
51 int rc = 1; 51 int rc = 1;
52 52
53 if (nd->flags & LOOKUP_RCU)
54 return -ECHILD;
55
56 lower_dentry = ecryptfs_dentry_to_lower(dentry);
57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
53 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) 58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
54 goto out; 59 goto out;
55 dentry_save = nd->path.dentry; 60 dentry_save = nd->path.dentry;
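Note: deferring the lower-dentry lookups until after the LOOKUP_RCU check matters: under RCU-walk, d_revalidate must not sleep or take references, so stacked filesystems simply return -ECHILD and let the VFS retry in ref-walk mode. The shape of that bail-out (sketch):

static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
        if (nd->flags & LOOKUP_RCU)
                return -ECHILD; /* can't sleep; VFS retries in ref-walk */

        /* ... normal, sleepable revalidation ... */
        return 1;
}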
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 0032a9f5a3a9..dbc84ed96336 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key)
192 (((struct user_key_payload*)key->payload.data)->data); 192 (((struct user_key_payload*)key->payload.data)->data);
193} 193}
194 194
195#define ECRYPTFS_SUPER_MAGIC 0xf15f
196#define ECRYPTFS_MAX_KEYSET_SIZE 1024 195#define ECRYPTFS_MAX_KEYSET_SIZE 1024
197#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 196#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
198#define ECRYPTFS_MAX_NUM_ENC_KEYS 64 197#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
@@ -377,6 +376,7 @@ struct ecryptfs_mount_crypt_stat {
377#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 376#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010
378#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 377#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020
379#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 378#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040
379#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080
380 u32 flags; 380 u32 flags;
381 struct list_head global_auth_tok_list; 381 struct list_head global_auth_tok_list;
382 struct mutex global_auth_tok_list_mutex; 382 struct mutex global_auth_tok_list_mutex;
@@ -477,7 +477,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
477static inline struct ecryptfs_file_info * 477static inline struct ecryptfs_file_info *
478ecryptfs_file_to_private(struct file *file) 478ecryptfs_file_to_private(struct file *file)
479{ 479{
480 return (struct ecryptfs_file_info *)file->private_data; 480 return file->private_data;
481} 481}
482 482
483static inline void 483static inline void
@@ -583,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
583 583
584#define ecryptfs_printk(type, fmt, arg...) \ 584#define ecryptfs_printk(type, fmt, arg...) \
585 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); 585 __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
586__attribute__ ((format(printf, 1, 2)))
586void __ecryptfs_printk(const char *fmt, ...); 587void __ecryptfs_printk(const char *fmt, ...);
587 588
588extern const struct file_operations ecryptfs_main_fops; 589extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 622c95140802..81e10e6a9443 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/compat.h> 32#include <linux/compat.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/smp_lock.h>
35#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
36 35
37/** 36/**
@@ -48,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
48 const struct iovec *iov, 47 const struct iovec *iov,
49 unsigned long nr_segs, loff_t pos) 48 unsigned long nr_segs, loff_t pos)
50{ 49{
51 int rc; 50 ssize_t rc;
52 struct dentry *lower_dentry; 51 struct dentry *lower_dentry;
53 struct vfsmount *lower_vfsmount; 52 struct vfsmount *lower_vfsmount;
54 struct file *file = iocb->ki_filp; 53 struct file *file = iocb->ki_filp;
@@ -192,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
192 | ECRYPTFS_ENCRYPTED); 191 | ECRYPTFS_ENCRYPTED);
193 } 192 }
194 mutex_unlock(&crypt_stat->cs_mutex); 193 mutex_unlock(&crypt_stat->cs_mutex);
195 if (!ecryptfs_inode_to_private(inode)->lower_file) { 194 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
196 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 195 if (rc) {
197 if (rc) { 196 printk(KERN_ERR "%s: Error attempting to initialize "
198 printk(KERN_ERR "%s: Error attempting to initialize " 197 "the persistent file for the dentry with name "
199 "the persistent file for the dentry with name " 198 "[%s]; rc = [%d]\n", __func__,
200 "[%s]; rc = [%d]\n", __func__, 199 ecryptfs_dentry->d_name.name, rc);
201 ecryptfs_dentry->d_name.name, rc); 200 goto out_free;
202 goto out_free;
203 }
204 } 201 }
205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) 202 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
206 && !(file->f_flags & O_RDONLY)) { 203 == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
207 rc = -EPERM; 204 rc = -EPERM;
208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " 205 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
209 "file must hence be opened RO\n", __func__); 206 "file must hence be opened RO\n", __func__);
@@ -244,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
244 } 241 }
245 } 242 }
246 mutex_unlock(&crypt_stat->cs_mutex); 243 mutex_unlock(&crypt_stat->cs_mutex);
247 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] " 244 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
248 "size: [0x%.16x]\n", inode, inode->i_ino, 245 "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
249 i_size_read(inode)); 246 (unsigned long long)i_size_read(inode));
250 goto out; 247 goto out;
251out_free: 248out_free:
252 kmem_cache_free(ecryptfs_file_info_cache, 249 kmem_cache_free(ecryptfs_file_info_cache,
@@ -284,11 +281,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
284 int rc = 0; 281 int rc = 0;
285 struct file *lower_file = NULL; 282 struct file *lower_file = NULL;
286 283
287 lock_kernel();
288 lower_file = ecryptfs_file_to_lower(file); 284 lower_file = ecryptfs_file_to_lower(file);
289 if (lower_file->f_op && lower_file->f_op->fasync) 285 if (lower_file->f_op && lower_file->f_op->fasync)
290 rc = lower_file->f_op->fasync(fd, lower_file, flag); 286 rc = lower_file->f_op->fasync(fd, lower_file, flag);
291 unlock_kernel();
292 return rc; 287 return rc;
293} 288}
294 289
@@ -332,6 +327,7 @@ const struct file_operations ecryptfs_dir_fops = {
332 .fsync = ecryptfs_fsync, 327 .fsync = ecryptfs_fsync,
333 .fasync = ecryptfs_fasync, 328 .fasync = ecryptfs_fasync,
334 .splice_read = generic_file_splice_read, 329 .splice_read = generic_file_splice_read,
330 .llseek = default_llseek,
335}; 331};
336 332
337const struct file_operations ecryptfs_main_fops = { 333const struct file_operations ecryptfs_main_fops = {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3fbc94203380..bd33f87a1907 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -32,6 +32,7 @@
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/xattr.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
@@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
70 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 71 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
71 struct dentry *dentry_save; 72 struct dentry *dentry_save;
72 struct vfsmount *vfsmount_save; 73 struct vfsmount *vfsmount_save;
74 unsigned int flags_save;
73 int rc; 75 int rc;
74 76
75 dentry_save = nd->path.dentry; 77 dentry_save = nd->path.dentry;
76 vfsmount_save = nd->path.mnt; 78 vfsmount_save = nd->path.mnt;
79 flags_save = nd->flags;
77 nd->path.dentry = lower_dentry; 80 nd->path.dentry = lower_dentry;
78 nd->path.mnt = lower_mnt; 81 nd->path.mnt = lower_mnt;
82 nd->flags &= ~LOOKUP_OPEN;
79 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); 83 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
80 nd->path.dentry = dentry_save; 84 nd->path.dentry = dentry_save;
81 nd->path.mnt = vfsmount_save; 85 nd->path.mnt = vfsmount_save;
86 nd->flags = flags_save;
82 return rc; 87 return rc;
83} 88}
84 89
@@ -180,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
180 "context; rc = [%d]\n", rc); 185 "context; rc = [%d]\n", rc);
181 goto out; 186 goto out;
182 } 187 }
183 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 188 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
184 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 189 if (rc) {
185 if (rc) { 190 printk(KERN_ERR "%s: Error attempting to initialize "
186 printk(KERN_ERR "%s: Error attempting to initialize " 191 "the persistent file for the dentry with name "
187 "the persistent file for the dentry with name " 192 "[%s]; rc = [%d]\n", __func__,
188 "[%s]; rc = [%d]\n", __func__, 193 ecryptfs_dentry->d_name.name, rc);
189 ecryptfs_dentry->d_name.name, rc); 194 goto out;
190 goto out;
191 }
192 } 195 }
193 rc = ecryptfs_write_metadata(ecryptfs_dentry); 196 rc = ecryptfs_write_metadata(ecryptfs_dentry);
194 if (rc) { 197 if (rc) {
@@ -255,7 +258,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
255 ecryptfs_dentry->d_parent)); 258 ecryptfs_dentry->d_parent));
256 lower_inode = lower_dentry->d_inode; 259 lower_inode = lower_dentry->d_inode;
257 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); 260 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
258 BUG_ON(!atomic_read(&lower_dentry->d_count)); 261 BUG_ON(!lower_dentry->d_count);
259 ecryptfs_set_dentry_private(ecryptfs_dentry, 262 ecryptfs_set_dentry_private(ecryptfs_dentry,
260 kmem_cache_alloc(ecryptfs_dentry_info_cache, 263 kmem_cache_alloc(ecryptfs_dentry_info_cache,
261 GFP_KERNEL)); 264 GFP_KERNEL));
@@ -297,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
297 rc = -ENOMEM; 300 rc = -ENOMEM;
298 goto out; 301 goto out;
299 } 302 }
300 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { 303 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
301 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 304 if (rc) {
302 if (rc) { 305 printk(KERN_ERR "%s: Error attempting to initialize "
303 printk(KERN_ERR "%s: Error attempting to initialize " 306 "the persistent file for the dentry with name "
304 "the persistent file for the dentry with name " 307 "[%s]; rc = [%d]\n", __func__,
305 "[%s]; rc = [%d]\n", __func__, 308 ecryptfs_dentry->d_name.name, rc);
306 ecryptfs_dentry->d_name.name, rc); 309 goto out_free_kmem;
307 goto out_free_kmem;
308 }
309 } 310 }
310 crypt_stat = &ecryptfs_inode_to_private( 311 crypt_stat = &ecryptfs_inode_to_private(
311 ecryptfs_dentry->d_inode)->crypt_stat; 312 ecryptfs_dentry->d_inode)->crypt_stat;
@@ -436,7 +437,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
436 struct qstr lower_name; 437 struct qstr lower_name;
437 int rc = 0; 438 int rc = 0;
438 439
439 ecryptfs_dentry->d_op = &ecryptfs_dops;
440 if ((ecryptfs_dentry->d_name.len == 1 440 if ((ecryptfs_dentry->d_name.len == 1
441 && !strcmp(ecryptfs_dentry->d_name.name, ".")) 441 && !strcmp(ecryptfs_dentry->d_name.name, "."))
442 || (ecryptfs_dentry->d_name.len == 2 442 || (ecryptfs_dentry->d_name.len == 2
@@ -449,7 +449,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
449 lower_name.hash = ecryptfs_dentry->d_name.hash; 449 lower_name.hash = ecryptfs_dentry->d_name.hash;
450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
452 &lower_name); 452 lower_dir_dentry->d_inode, &lower_name);
453 if (rc < 0) 453 if (rc < 0)
454 goto out_d_drop; 454 goto out_d_drop;
455 } 455 }
@@ -484,7 +484,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len); 484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
487 &lower_name); 487 lower_dir_dentry->d_inode, &lower_name);
488 if (rc < 0) 488 if (rc < 0)
489 goto out_d_drop; 489 goto out_d_drop;
490 } 490 }
@@ -975,8 +975,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
975} 975}
976 976
977static int 977static int
978ecryptfs_permission(struct inode *inode, int mask) 978ecryptfs_permission(struct inode *inode, int mask, unsigned int flags)
979{ 979{
980 if (flags & IPERM_FLAG_RCU)
981 return -ECHILD;
980 return inode_permission(ecryptfs_inode_to_lower(inode), mask); 982 return inode_permission(ecryptfs_inode_to_lower(inode), mask);
981} 983}
982 984
@@ -1108,10 +1110,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1108 rc = -EOPNOTSUPP; 1110 rc = -EOPNOTSUPP;
1109 goto out; 1111 goto out;
1110 } 1112 }
1111 mutex_lock(&lower_dentry->d_inode->i_mutex); 1113
1112 rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value, 1114 rc = vfs_setxattr(lower_dentry, name, value, size, flags);
1113 size, flags);
1114 mutex_unlock(&lower_dentry->d_inode->i_mutex);
1115out: 1115out:
1116 return rc; 1116 return rc;
1117} 1117}
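Note: switching to vfs_setxattr() is more than a cleanup: the helper takes the lower inode's i_mutex and runs the security hooks itself, both of which the open-coded ->setxattr path had to get right by hand. A sketch of the delegated form:

static int example_set_lower_xattr(struct dentry *lower_dentry,
                                   const char *name, const void *value,
                                   size_t size, int flags)
{
        /* i_mutex and LSM checks are handled inside vfs_setxattr() */
        return vfs_setxattr(lower_dentry, name, value, size, flags);
}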
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 73811cfa2ea4..c1436cff6f2d 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -59,7 +59,7 @@ static int process_request_key_err(long err_code)
59 break; 59 break;
60 default: 60 default:
61 ecryptfs_printk(KERN_WARNING, "Unknown error code: " 61 ecryptfs_printk(KERN_WARNING, "Unknown error code: "
62 "[0x%.16x]\n", err_code); 62 "[0x%.16lx]\n", err_code);
63 rc = -EINVAL; 63 rc = -EINVAL;
64 } 64 }
65 return rc; 65 return rc;
@@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
130 } else { 130 } else {
131 rc = -EINVAL; 131 rc = -EINVAL;
132 ecryptfs_printk(KERN_WARNING, 132 ecryptfs_printk(KERN_WARNING,
133 "Unsupported packet size: [%d]\n", size); 133 "Unsupported packet size: [%zd]\n", size);
134 } 134 }
135 return rc; 135 return rc;
136} 136}
@@ -446,6 +446,7 @@ out:
446 */ 446 */
447static int 447static int
448ecryptfs_find_auth_tok_for_sig( 448ecryptfs_find_auth_tok_for_sig(
449 struct key **auth_tok_key,
449 struct ecryptfs_auth_tok **auth_tok, 450 struct ecryptfs_auth_tok **auth_tok,
450 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 451 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
451 char *sig) 452 char *sig)
@@ -453,12 +454,21 @@ ecryptfs_find_auth_tok_for_sig(
453 struct ecryptfs_global_auth_tok *global_auth_tok; 454 struct ecryptfs_global_auth_tok *global_auth_tok;
454 int rc = 0; 455 int rc = 0;
455 456
457 (*auth_tok_key) = NULL;
456 (*auth_tok) = NULL; 458 (*auth_tok) = NULL;
457 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, 459 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
458 mount_crypt_stat, sig)) { 460 mount_crypt_stat, sig)) {
459 struct key *auth_tok_key;
460 461
461 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, 462 /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
463 * mount_crypt_stat structure, we prevent the use of auth toks that
464 * were not inserted through the ecryptfs_add_global_auth_tok
465 * function.
466 */
467 if (mount_crypt_stat->flags
468 & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
469 return -EINVAL;
470
471 rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
462 sig); 472 sig);
463 } else 473 } else
464 (*auth_tok) = global_auth_tok->global_auth_tok; 474 (*auth_tok) = global_auth_tok->global_auth_tok;
@@ -509,6 +519,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
509 char *filename, size_t filename_size) 519 char *filename, size_t filename_size)
510{ 520{
511 struct ecryptfs_write_tag_70_packet_silly_stack *s; 521 struct ecryptfs_write_tag_70_packet_silly_stack *s;
522 struct key *auth_tok_key = NULL;
512 int rc = 0; 523 int rc = 0;
513 524
514 s = kmalloc(sizeof(*s), GFP_KERNEL); 525 s = kmalloc(sizeof(*s), GFP_KERNEL);
@@ -606,6 +617,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
606 } 617 }
607 dest[s->i++] = s->cipher_code; 618 dest[s->i++] = s->cipher_code;
608 rc = ecryptfs_find_auth_tok_for_sig( 619 rc = ecryptfs_find_auth_tok_for_sig(
620 &auth_tok_key,
609 &s->auth_tok, mount_crypt_stat, 621 &s->auth_tok, mount_crypt_stat,
610 mount_crypt_stat->global_default_fnek_sig); 622 mount_crypt_stat->global_default_fnek_sig);
611 if (rc) { 623 if (rc) {
@@ -753,6 +765,8 @@ out_free_unlock:
753out_unlock: 765out_unlock:
754 mutex_unlock(s->tfm_mutex); 766 mutex_unlock(s->tfm_mutex);
755out: 767out:
768 if (auth_tok_key)
769 key_put(auth_tok_key);
756 kfree(s); 770 kfree(s);
757 return rc; 771 return rc;
758} 772}
@@ -798,6 +812,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
798 char *data, size_t max_packet_size) 812 char *data, size_t max_packet_size)
799{ 813{
800 struct ecryptfs_parse_tag_70_packet_silly_stack *s; 814 struct ecryptfs_parse_tag_70_packet_silly_stack *s;
815 struct key *auth_tok_key = NULL;
801 int rc = 0; 816 int rc = 0;
802 817
803 (*packet_size) = 0; 818 (*packet_size) = 0;
@@ -910,7 +925,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
910 * >= ECRYPTFS_MAX_IV_BYTES. */ 925 * >= ECRYPTFS_MAX_IV_BYTES. */
911 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); 926 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
912 s->desc.info = s->iv; 927 s->desc.info = s->iv;
913 rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, 928 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
929 &s->auth_tok, mount_crypt_stat,
914 s->fnek_sig_hex); 930 s->fnek_sig_hex);
915 if (rc) { 931 if (rc) {
916 printk(KERN_ERR "%s: Error attempting to find auth tok for " 932 printk(KERN_ERR "%s: Error attempting to find auth tok for "
@@ -986,6 +1002,8 @@ out:
986 (*filename_size) = 0; 1002 (*filename_size) = 0;
987 (*filename) = NULL; 1003 (*filename) = NULL;
988 } 1004 }
1005 if (auth_tok_key)
1006 key_put(auth_tok_key);
989 kfree(s); 1007 kfree(s);
990 return rc; 1008 return rc;
991} 1009}
@@ -1557,14 +1575,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1557 ECRYPTFS_VERSION_MAJOR, 1575 ECRYPTFS_VERSION_MAJOR,
1558 ECRYPTFS_VERSION_MINOR); 1576 ECRYPTFS_VERSION_MINOR);
1559 rc = -EINVAL; 1577 rc = -EINVAL;
1560 goto out; 1578 goto out_release_key;
1561 } 1579 }
1562 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD 1580 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
1563 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { 1581 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
1564 printk(KERN_ERR "Invalid auth_tok structure " 1582 printk(KERN_ERR "Invalid auth_tok structure "
1565 "returned from key query\n"); 1583 "returned from key query\n");
1566 rc = -EINVAL; 1584 rc = -EINVAL;
1567 goto out; 1585 goto out_release_key;
1586 }
1587out_release_key:
1588 if (rc) {
1589 key_put(*auth_tok_key);
1590 (*auth_tok_key) = NULL;
1568 } 1591 }
1569out: 1592out:
1570 return rc; 1593 return rc;
@@ -1649,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1649 auth_tok->session_key.decrypted_key_size); 1672 auth_tok->session_key.decrypted_key_size);
1650 crypt_stat->flags |= ECRYPTFS_KEY_VALID; 1673 crypt_stat->flags |= ECRYPTFS_KEY_VALID;
1651 if (unlikely(ecryptfs_verbosity > 0)) { 1674 if (unlikely(ecryptfs_verbosity > 0)) {
1652 ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n", 1675 ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
1653 crypt_stat->key_size); 1676 crypt_stat->key_size);
1654 ecryptfs_dump_hex(crypt_stat->key, 1677 ecryptfs_dump_hex(crypt_stat->key,
1655 crypt_stat->key_size); 1678 crypt_stat->key_size);
@@ -1688,6 +1711,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1688 struct ecryptfs_auth_tok_list_item *auth_tok_list_item; 1711 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
1689 size_t tag_11_contents_size; 1712 size_t tag_11_contents_size;
1690 size_t tag_11_packet_size; 1713 size_t tag_11_packet_size;
1714 struct key *auth_tok_key = NULL;
1691 int rc = 0; 1715 int rc = 0;
1692 1716
1693 INIT_LIST_HEAD(&auth_tok_list); 1717 INIT_LIST_HEAD(&auth_tok_list);
@@ -1730,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1730 if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { 1754 if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
1731 ecryptfs_printk(KERN_ERR, "Expected " 1755 ecryptfs_printk(KERN_ERR, "Expected "
1732 "signature of size [%d]; " 1756 "signature of size [%d]; "
1733 "read size [%d]\n", 1757 "read size [%zd]\n",
1734 ECRYPTFS_SIG_SIZE, 1758 ECRYPTFS_SIG_SIZE,
1735 tag_11_contents_size); 1759 tag_11_contents_size);
1736 rc = -EIO; 1760 rc = -EIO;
@@ -1763,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1763 goto out_wipe_list; 1787 goto out_wipe_list;
1764 break; 1788 break;
1765 default: 1789 default:
1766 ecryptfs_printk(KERN_DEBUG, "No packet at offset " 1790 ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
1767 "[%d] of the file header; hex value of " 1791 "of the file header; hex value of "
1768 "character is [0x%.2x]\n", i, src[i]); 1792 "character is [0x%.2x]\n", i, src[i]);
1769 next_packet_is_auth_tok_packet = 0; 1793 next_packet_is_auth_tok_packet = 0;
1770 } 1794 }
@@ -1784,6 +1808,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1784 * just one will be sufficient to decrypt to get the FEK. */ 1808 * just one will be sufficient to decrypt to get the FEK. */
1785find_next_matching_auth_tok: 1809find_next_matching_auth_tok:
1786 found_auth_tok = 0; 1810 found_auth_tok = 0;
1811 if (auth_tok_key) {
1812 key_put(auth_tok_key);
1813 auth_tok_key = NULL;
1814 }
1787 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { 1815 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
1788 candidate_auth_tok = &auth_tok_list_item->auth_tok; 1816 candidate_auth_tok = &auth_tok_list_item->auth_tok;
1789 if (unlikely(ecryptfs_verbosity > 0)) { 1817 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1800,10 +1828,11 @@ find_next_matching_auth_tok:
1800 rc = -EINVAL; 1828 rc = -EINVAL;
1801 goto out_wipe_list; 1829 goto out_wipe_list;
1802 } 1830 }
1803 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, 1831 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
1832 &matching_auth_tok,
1804 crypt_stat->mount_crypt_stat, 1833 crypt_stat->mount_crypt_stat,
1805 candidate_auth_tok_sig); 1834 candidate_auth_tok_sig);
1806 if (matching_auth_tok) { 1835 if (!rc) {
1807 found_auth_tok = 1; 1836 found_auth_tok = 1;
1808 goto found_matching_auth_tok; 1837 goto found_matching_auth_tok;
1809 } 1838 }
@@ -1835,8 +1864,8 @@ found_matching_auth_tok:
1835 "session key for authentication token with sig " 1864 "session key for authentication token with sig "
1836 "[%.*s]; rc = [%d]. Removing auth tok " 1865 "[%.*s]; rc = [%d]. Removing auth tok "
1837 "candidate from the list and searching for " 1866 "candidate from the list and searching for "
1838 "the next match.\n", candidate_auth_tok_sig, 1867 "the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
1839 ECRYPTFS_SIG_SIZE_HEX, rc); 1868 candidate_auth_tok_sig, rc);
1840 list_for_each_entry_safe(auth_tok_list_item, 1869 list_for_each_entry_safe(auth_tok_list_item,
1841 auth_tok_list_item_tmp, 1870 auth_tok_list_item_tmp,
1842 &auth_tok_list, list) { 1871 &auth_tok_list, list) {
@@ -1866,6 +1895,8 @@ found_matching_auth_tok:
1866out_wipe_list: 1895out_wipe_list:
1867 wipe_auth_tok_list(&auth_tok_list); 1896 wipe_auth_tok_list(&auth_tok_list);
1868out: 1897out:
1898 if (auth_tok_key)
1899 key_put(auth_tok_key);
1869 return rc; 1900 return rc;
1870} 1901}
1871 1902
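
ecryptfs_parse_packet_set() now tracks auth_tok_key across the candidate loop: the held key pins the auth token's memory while it is in use, is dropped before each re-search, and is dropped once more on the way out. The shape of that lifetime, reduced to control flow; find_tok(), try_decrypt(), and remove_candidate() are hypothetical stand-ins for the real helpers:

static int walk_candidates_sketch(const char *sig)
{
	struct key *auth_tok_key = NULL;
	struct ecryptfs_auth_tok *tok;
	int rc;

find_next:
	if (auth_tok_key) {
		key_put(auth_tok_key);	/* finished with the previous candidate's key */
		auth_tok_key = NULL;
	}
	rc = find_tok(&auth_tok_key, &tok, sig);	/* returns with a key reference held */
	if (rc)
		goto out;
	if (!try_decrypt(tok)) {	/* tok stays valid: the key reference pins it */
		remove_candidate(sig);	/* drop the failed candidate and retry */
		goto find_next;
	}
	rc = 0;
out:
	if (auth_tok_key)
		key_put(auth_tok_key);
	return rc;
}
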
@@ -2137,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2137 if (encrypted_session_key_valid) { 2168 if (encrypted_session_key_valid) {
2138 ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; " 2169 ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
2139 "using auth_tok->session_key.encrypted_key, " 2170 "using auth_tok->session_key.encrypted_key, "
2140 "where key_rec->enc_key_size = [%d]\n", 2171 "where key_rec->enc_key_size = [%zd]\n",
2141 key_rec->enc_key_size); 2172 key_rec->enc_key_size);
2142 memcpy(key_rec->enc_key, 2173 memcpy(key_rec->enc_key,
2143 auth_tok->session_key.encrypted_key, 2174 auth_tok->session_key.encrypted_key,
@@ -2167,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2167 if (rc < 1 || rc > 2) { 2198 if (rc < 1 || rc > 2) {
2168 ecryptfs_printk(KERN_ERR, "Error generating scatterlist " 2199 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
2169 "for crypt_stat session key; expected rc = 1; " 2200 "for crypt_stat session key; expected rc = 1; "
2170 "got rc = [%d]. key_rec->enc_key_size = [%d]\n", 2201 "got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
2171 rc, key_rec->enc_key_size); 2202 rc, key_rec->enc_key_size);
2172 rc = -ENOMEM; 2203 rc = -ENOMEM;
2173 goto out; 2204 goto out;
@@ -2178,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2178 ecryptfs_printk(KERN_ERR, "Error generating scatterlist " 2209 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
2179 "for crypt_stat encrypted session key; " 2210 "for crypt_stat encrypted session key; "
2180 "expected rc = 1; got rc = [%d]. " 2211 "expected rc = 1; got rc = [%d]. "
2181 "key_rec->enc_key_size = [%d]\n", rc, 2212 "key_rec->enc_key_size = [%zd]\n", rc,
2182 key_rec->enc_key_size); 2213 key_rec->enc_key_size);
2183 rc = -ENOMEM; 2214 rc = -ENOMEM;
2184 goto out; 2215 goto out;
@@ -2193,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2193 goto out; 2224 goto out;
2194 } 2225 }
2195 rc = 0; 2226 rc = 0;
2196 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n", 2227 ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
2197 crypt_stat->key_size); 2228 crypt_stat->key_size);
2198 rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, 2229 rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
2199 (*key_rec).enc_key_size); 2230 (*key_rec).enc_key_size);
@@ -2204,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2204 } 2235 }
2205 ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); 2236 ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
2206 if (ecryptfs_verbosity > 0) { 2237 if (ecryptfs_verbosity > 0) {
2207 ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n", 2238 ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
2208 key_rec->enc_key_size); 2239 key_rec->enc_key_size);
2209 ecryptfs_dump_hex(key_rec->enc_key, 2240 ecryptfs_dump_hex(key_rec->enc_key,
2210 key_rec->enc_key_size); 2241 key_rec->enc_key_size);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb20..758323a0f09a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/magic.h>
39#include "ecryptfs_kernel.h" 40#include "ecryptfs_kernel.h"
40 41
41/** 42/**
@@ -141,25 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
141 return rc; 142 return rc;
142} 143}
143 144
144/** 145static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
145 * ecryptfs_interpose 146 struct super_block *sb)
146 * @lower_dentry: Existing dentry in the lower filesystem
147 * @dentry: ecryptfs' dentry
148 * @sb: ecryptfs's super_block
149 * @flags: flags to govern behavior of interpose procedure
150 *
151 * Interposes upper and lower dentries.
152 *
153 * Returns zero on success; non-zero otherwise
154 */
155int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
156 struct super_block *sb, u32 flags)
157{ 147{
158 struct inode *lower_inode;
159 struct inode *inode; 148 struct inode *inode;
160 int rc = 0; 149 int rc = 0;
161 150
162 lower_inode = lower_dentry->d_inode;
163 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { 151 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
164 rc = -EXDEV; 152 rc = -EXDEV;
165 goto out; 153 goto out;
@@ -189,17 +177,38 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
189 if (special_file(lower_inode->i_mode)) 177 if (special_file(lower_inode->i_mode))
190 init_special_inode(inode, lower_inode->i_mode, 178 init_special_inode(inode, lower_inode->i_mode,
191 lower_inode->i_rdev); 179 lower_inode->i_rdev);
192 dentry->d_op = &ecryptfs_dops;
193 fsstack_copy_attr_all(inode, lower_inode); 180 fsstack_copy_attr_all(inode, lower_inode);
194 /* This size will be overwritten for real files w/ headers and 181 /* This size will be overwritten for real files w/ headers and
195 * other metadata */ 182 * other metadata */
196 fsstack_copy_inode_size(inode, lower_inode); 183 fsstack_copy_inode_size(inode, lower_inode);
184 return inode;
185out:
186 return ERR_PTR(rc);
187}
188
189/**
190 * ecryptfs_interpose
191 * @lower_dentry: Existing dentry in the lower filesystem
192 * @dentry: ecryptfs' dentry
193 * @sb: ecryptfs's super_block
194 * @flags: flags to govern behavior of interpose procedure
195 *
196 * Interposes upper and lower dentries.
197 *
198 * Returns zero on success; non-zero otherwise
199 */
200int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
201 struct super_block *sb, u32 flags)
202{
203 struct inode *lower_inode = lower_dentry->d_inode;
204 struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
205 if (IS_ERR(inode))
206 return PTR_ERR(inode);
197 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) 207 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
198 d_add(dentry, inode); 208 d_add(dentry, inode);
199 else 209 else
200 d_instantiate(dentry, inode); 210 d_instantiate(dentry, inode);
201out: 211 return 0;
202 return rc;
203} 212}
204 213
205enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, 214enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
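
Splitting ecryptfs_get_inode() out of ecryptfs_interpose() leans on the standard ERR_PTR encoding, so one pointer return carries either an inode or an errno. The idiom, stripped to essentials; lower_sb_of() and build_upper_inode() are placeholders for the real work:

static struct inode *get_upper_inode(struct inode *lower, struct super_block *sb)
{
	if (lower->i_sb != lower_sb_of(sb))	/* placeholder accessor */
		return ERR_PTR(-EXDEV);		/* the errno travels inside the pointer */
	return build_upper_inode(lower, sb);
}

/* every caller decodes the same way */
	inode = get_upper_inode(lower_inode, sb);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	d_instantiate(dentry, inode);
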
@@ -208,7 +217,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 217 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 218 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 219 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_unlink_sigs, ecryptfs_opt_err }; 220 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
221 ecryptfs_opt_err };
212 222
213static const match_table_t tokens = { 223static const match_table_t tokens = {
214 {ecryptfs_opt_sig, "sig=%s"}, 224 {ecryptfs_opt_sig, "sig=%s"},
@@ -223,6 +233,7 @@ static const match_table_t tokens = {
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, 233 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 234 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
225 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, 235 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
236 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
226 {ecryptfs_opt_err, NULL} 237 {ecryptfs_opt_err, NULL}
227}; 238};
228 239
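
Adding a mount option here touches three places that must stay in sync: the enum, the tokens[] table, and the switch in ecryptfs_parse_options(). The parsing loop itself is the stock linux/parser.h pattern; a sketch of the loop consuming the new option:

	substring_t args[MAX_OPT_ARGS];
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;

		if (!*p)
			continue;
		token = match_token(p, tokens, args);	/* looks p up in the table above */
		switch (token) {
		case ecryptfs_opt_mount_auth_tok_only:
			mount_crypt_stat->flags |=
				ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
			break;
		default:
			printk(KERN_WARNING "Unrecognized mount option [%s]\n", p);
		}
	}
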
@@ -406,6 +417,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
406 case ecryptfs_opt_unlink_sigs: 417 case ecryptfs_opt_unlink_sigs:
407 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; 418 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
408 break; 419 break;
420 case ecryptfs_opt_mount_auth_tok_only:
421 mount_crypt_stat->flags |=
422 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
423 break;
409 case ecryptfs_opt_err: 424 case ecryptfs_opt_err:
410 default: 425 default:
411 printk(KERN_WARNING 426 printk(KERN_WARNING
@@ -486,68 +501,21 @@ struct kmem_cache *ecryptfs_sb_info_cache;
486static struct file_system_type ecryptfs_fs_type; 501static struct file_system_type ecryptfs_fs_type;
487 502
488/** 503/**
489 * ecryptfs_read_super
490 * @sb: The ecryptfs super block
491 * @dev_name: The path to mount over
492 *
493 * Read the super block of the lower filesystem, and use
494 * ecryptfs_interpose to create our initial inode and super block
495 * struct.
496 */
497static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
498{
499 struct path path;
500 int rc;
501
502 rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
503 if (rc) {
504 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
505 goto out;
506 }
507 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
508 rc = -EINVAL;
509 printk(KERN_ERR "Mount on filesystem of type "
510 "eCryptfs explicitly disallowed due to "
511 "known incompatibilities\n");
512 goto out_free;
513 }
514 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
515 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
516 sb->s_blocksize = path.dentry->d_sb->s_blocksize;
517 ecryptfs_set_dentry_lower(sb->s_root, path.dentry);
518 ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt);
519 rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0);
520 if (rc)
521 goto out_free;
522 rc = 0;
523 goto out;
524out_free:
525 path_put(&path);
526out:
527 return rc;
528}
529
530/**
531 * ecryptfs_get_sb 504 * ecryptfs_get_sb
532 * @fs_type 505 * @fs_type
533 * @flags 506 * @flags
534 * @dev_name: The path to mount over 507 * @dev_name: The path to mount over
535 * @raw_data: The options passed into the kernel 508 * @raw_data: The options passed into the kernel
536 *
537 * The whole ecryptfs_get_sb process is broken into 3 functions:
538 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
539 * ecryptfs_read_super(): this accesses the lower filesystem and uses
540 * ecryptfs_interpose to perform most of the linking
541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
542 */ 509 */
543static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, 510static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
544 const char *dev_name, void *raw_data, 511 const char *dev_name, void *raw_data)
545 struct vfsmount *mnt)
546{ 512{
547 struct super_block *s; 513 struct super_block *s;
548 struct ecryptfs_sb_info *sbi; 514 struct ecryptfs_sb_info *sbi;
549 struct ecryptfs_dentry_info *root_info; 515 struct ecryptfs_dentry_info *root_info;
550 const char *err = "Getting sb failed"; 516 const char *err = "Getting sb failed";
517 struct inode *inode;
518 struct path path;
551 int rc; 519 int rc;
552 520
553 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); 521 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -570,10 +538,8 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
570 538
571 s->s_flags = flags; 539 s->s_flags = flags;
572 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); 540 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
573 if (rc) { 541 if (rc)
574 deactivate_locked_super(s); 542 goto out1;
575 goto out;
576 }
577 543
578 ecryptfs_set_superblock_private(s, sbi); 544 ecryptfs_set_superblock_private(s, sbi);
579 s->s_bdi = &sbi->bdi; 545 s->s_bdi = &sbi->bdi;
@@ -581,42 +547,62 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
581 /* ->kill_sb() will take care of sbi after that point */ 547 /* ->kill_sb() will take care of sbi after that point */
582 sbi = NULL; 548 sbi = NULL;
583 s->s_op = &ecryptfs_sops; 549 s->s_op = &ecryptfs_sops;
550 s->s_d_op = &ecryptfs_dops;
584 551
585 rc = -ENOMEM; 552 err = "Reading sb failed";
586 s->s_root = d_alloc(NULL, &(const struct qstr) { 553 rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
587 .hash = 0,.name = "/",.len = 1}); 554 if (rc) {
555 ecryptfs_printk(KERN_WARNING, "kern_path() failed\n");
556 goto out1;
557 }
558 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
559 rc = -EINVAL;
560 printk(KERN_ERR "Mount on filesystem of type "
561 "eCryptfs explicitly disallowed due to "
562 "known incompatibilities\n");
563 goto out_free;
564 }
565 ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
566 s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
567 s->s_blocksize = path.dentry->d_sb->s_blocksize;
568 s->s_magic = ECRYPTFS_SUPER_MAGIC;
569
570 inode = ecryptfs_get_inode(path.dentry->d_inode, s);
571 rc = PTR_ERR(inode);
572 if (IS_ERR(inode))
573 goto out_free;
574
575 s->s_root = d_alloc_root(inode);
588 if (!s->s_root) { 576 if (!s->s_root) {
589 deactivate_locked_super(s); 577 iput(inode);
590 goto out; 578 rc = -ENOMEM;
579 goto out_free;
591 } 580 }
592 s->s_root->d_op = &ecryptfs_dops;
593 s->s_root->d_sb = s;
594 s->s_root->d_parent = s->s_root;
595 581
582 rc = -ENOMEM;
596 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); 583 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
597 if (!root_info) { 584 if (!root_info)
598 deactivate_locked_super(s); 585 goto out_free;
599 goto out; 586
600 }
601 /* ->kill_sb() will take care of root_info */ 587 /* ->kill_sb() will take care of root_info */
602 ecryptfs_set_dentry_private(s->s_root, root_info); 588 ecryptfs_set_dentry_private(s->s_root, root_info);
589 ecryptfs_set_dentry_lower(s->s_root, path.dentry);
590 ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt);
591
603 s->s_flags |= MS_ACTIVE; 592 s->s_flags |= MS_ACTIVE;
604 rc = ecryptfs_read_super(s, dev_name); 593 return dget(s->s_root);
605 if (rc) {
606 deactivate_locked_super(s);
607 err = "Reading sb failed";
608 goto out;
609 }
610 simple_set_mnt(mnt, s);
611 return 0;
612 594
595out_free:
596 path_put(&path);
597out1:
598 deactivate_locked_super(s);
613out: 599out:
614 if (sbi) { 600 if (sbi) {
615 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); 601 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi); 602 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 } 603 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc); 604 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
619 return rc; 605 return ERR_PTR(rc);
620} 606}
621 607
622/** 608/**
@@ -639,7 +625,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
639static struct file_system_type ecryptfs_fs_type = { 625static struct file_system_type ecryptfs_fs_type = {
640 .owner = THIS_MODULE, 626 .owner = THIS_MODULE,
641 .name = "ecryptfs", 627 .name = "ecryptfs",
642 .get_sb = ecryptfs_get_sb, 628 .mount = ecryptfs_mount,
643 .kill_sb = ecryptfs_kill_block_super, 629 .kill_sb = ecryptfs_kill_block_super,
644 .fs_flags = 0 630 .fs_flags = 0
645}; 631};
@@ -824,9 +810,10 @@ static int __init ecryptfs_init(void)
824 ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " 810 ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
825 "larger than the host's page size, and so " 811 "larger than the host's page size, and so "
826 "eCryptfs cannot run on this system. The " 812 "eCryptfs cannot run on this system. The "
827 "default eCryptfs extent size is [%d] bytes; " 813 "default eCryptfs extent size is [%u] bytes; "
828 "the page size is [%d] bytes.\n", 814 "the page size is [%lu] bytes.\n",
829 ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE); 815 ECRYPTFS_DEFAULT_EXTENT_SIZE,
816 (unsigned long)PAGE_CACHE_SIZE);
830 goto out; 817 goto out;
831 } 818 }
832 rc = ecryptfs_init_kmem_caches(); 819 rc = ecryptfs_init_kmem_caches();
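
The ->get_sb to ->mount conversion above changes the contract: instead of filling a vfsmount via simple_set_mnt(), the method returns the root dentry (or an ERR_PTR) and the VFS builds the mount. eCryptfs needs the hand-rolled sget()-style setup shown, but for most filesystems the conversion is a one-line switch to a helper. A sketch, assuming a hypothetical foo filesystem:

static struct dentry *foo_mount(struct file_system_type *fs_type, int flags,
				const char *dev_name, void *data)
{
	/* mount_nodev()/mount_bdev()/mount_single() wrap the common
	 * fill_super + dget(s->s_root) sequence seen above */
	return mount_nodev(fs_type, flags, data, foo_fill_super);
}

static struct file_system_type foo_fs_type = {
	.owner	 = THIS_MODULE,
	.name	 = "foo",
	.mount	 = foo_mount,
	.kill_sb = kill_anon_super,
};
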
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 00208c3d7e92..940a82e63dc3 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -482,6 +482,7 @@ static const struct file_operations ecryptfs_miscdev_fops = {
482 .read = ecryptfs_miscdev_read, 482 .read = ecryptfs_miscdev_read,
483 .write = ecryptfs_miscdev_write, 483 .write = ecryptfs_miscdev_write,
484 .release = ecryptfs_miscdev_release, 484 .release = ecryptfs_miscdev_release,
485 .llseek = noop_llseek,
485}; 486};
486 487
487static struct miscdevice ecryptfs_miscdev = { 488static struct miscdevice ecryptfs_miscdev = {
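
The .llseek = noop_llseek additions scattered through this series make the seek behaviour of each fops table explicit: a NULL .llseek falls back to default_llseek() (which historically took the BKL), while noop_llseek() "succeeds" without moving f_pos, which is the right semantics for device-like files where position is meaningless. The annotation is one line per table; a sketch with placeholder foo_* names:

static const struct file_operations foo_dev_fops = {
	.owner	= THIS_MODULE,
	.read	= foo_read,
	.write	= foo_write,
	.llseek	= noop_llseek,	/* seeking this device is meaningless */
};
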
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b1d82756544b..cc64fca89f8d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
65 rc = ecryptfs_encrypt_page(page); 65 rc = ecryptfs_encrypt_page(page);
66 if (rc) { 66 if (rc) {
67 ecryptfs_printk(KERN_WARNING, "Error encrypting " 67 ecryptfs_printk(KERN_WARNING, "Error encrypting "
68 "page (upper index [0x%.16x])\n", page->index); 68 "page (upper index [0x%.16lx])\n", page->index);
69 ClearPageUptodate(page); 69 ClearPageUptodate(page);
70 goto out; 70 goto out;
71 } 71 }
@@ -237,7 +237,7 @@ out:
237 ClearPageUptodate(page); 237 ClearPageUptodate(page);
238 else 238 else
239 SetPageUptodate(page); 239 SetPageUptodate(page);
240 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n", 240 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
241 page->index); 241 page->index);
242 unlock_page(page); 242 unlock_page(page);
243 return rc; 243 return rc;
@@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file,
290 return -ENOMEM; 290 return -ENOMEM;
291 *pagep = page; 291 *pagep = page;
292 292
293 prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
293 if (!PageUptodate(page)) { 294 if (!PageUptodate(page)) {
294 struct ecryptfs_crypt_stat *crypt_stat = 295 struct ecryptfs_crypt_stat *crypt_stat =
295 &ecryptfs_inode_to_private(mapping->host)->crypt_stat; 296 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
@@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file,
335 SetPageUptodate(page); 336 SetPageUptodate(page);
336 } 337 }
337 } else { 338 } else {
338 rc = ecryptfs_decrypt_page(page); 339 if (prev_page_end_size
339 if (rc) { 340 >= i_size_read(page->mapping->host)) {
340 printk(KERN_ERR "%s: Error decrypting page " 341 zero_user(page, 0, PAGE_CACHE_SIZE);
341 "at index [%ld]; rc = [%d]\n", 342 } else {
342 __func__, page->index, rc); 343 rc = ecryptfs_decrypt_page(page);
343 ClearPageUptodate(page); 344 if (rc) {
344 goto out; 345 printk(KERN_ERR "%s: Error decrypting "
346 "page at index [%ld]; "
347 "rc = [%d]\n",
348 __func__, page->index, rc);
349 ClearPageUptodate(page);
350 goto out;
351 }
345 } 352 }
346 SetPageUptodate(page); 353 SetPageUptodate(page);
347 } 354 }
348 } 355 }
349 prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
350 /* If creating a page or more of holes, zero them out via truncate. 356 /* If creating a page or more of holes, zero them out via truncate.
351 * Note, this will increase i_size. */ 357 * Note, this will increase i_size. */
352 if (index != 0) { 358 if (index != 0) {
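
The write_begin change above is an optimization with a correctness edge: a page that lies entirely beyond i_size contains nothing decryptable, so running it through ecryptfs_decrypt_page() wastes work on garbage; zero-filling is the correct content for a hole. That is also why prev_page_end_size (the start offset of this page) now gets computed before the branch that needs it. The test in isolation, as a sketch:

	loff_t page_start = (loff_t)index << PAGE_CACHE_SHIFT;
	int rc = 0;

	if (page_start >= i_size_read(mapping->host))
		zero_user(page, 0, PAGE_CACHE_SIZE);	/* wholly past EOF: a hole */
	else
		rc = ecryptfs_decrypt_page(page);	/* only decrypt data that exists */

	if (rc)
		ClearPageUptodate(page);
	else
		SetPageUptodate(page);
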
@@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file,
488 } else 494 } else
489 ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); 495 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
490 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 496 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
491 "(page w/ index = [0x%.16x], to = [%d])\n", index, to); 497 "(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
492 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 498 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
493 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, 499 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
494 to); 500 to);
@@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file,
503 rc = fill_zeros_to_end_of_page(page, to); 509 rc = fill_zeros_to_end_of_page(page, to);
504 if (rc) { 510 if (rc) {
505 ecryptfs_printk(KERN_WARNING, "Error attempting to fill " 511 ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
506 "zeros in page with index = [0x%.16x]\n", index); 512 "zeros in page with index = [0x%.16lx]\n", index);
507 goto out; 513 goto out;
508 } 514 }
509 rc = ecryptfs_encrypt_page(page); 515 rc = ecryptfs_encrypt_page(page);
510 if (rc) { 516 if (rc) {
511 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " 517 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
512 "index [0x%.16x])\n", index); 518 "index [0x%.16lx])\n", index);
513 goto out; 519 goto out;
514 } 520 }
515 if (pos + copied > i_size_read(ecryptfs_inode)) { 521 if (pos + copied > i_size_read(ecryptfs_inode)) {
516 i_size_write(ecryptfs_inode, pos + copied); 522 i_size_write(ecryptfs_inode, pos + copied);
517 ecryptfs_printk(KERN_DEBUG, "Expanded file size to " 523 ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
518 "[0x%.16x]\n", i_size_read(ecryptfs_inode)); 524 "[0x%.16llx]\n",
525 (unsigned long long)i_size_read(ecryptfs_inode));
519 } 526 }
520 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 527 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
521 if (rc) 528 if (rc)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index f7fc286a3aa9..3042fe123a34 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -28,7 +28,6 @@
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/seq_file.h> 30#include <linux/seq_file.h>
31#include <linux/smp_lock.h>
32#include <linux/file.h> 31#include <linux/file.h>
33#include <linux/crypto.h> 32#include <linux/crypto.h>
34#include "ecryptfs_kernel.h" 33#include "ecryptfs_kernel.h"
@@ -63,6 +62,16 @@ out:
63 return inode; 62 return inode;
64} 63}
65 64
65static void ecryptfs_i_callback(struct rcu_head *head)
66{
67 struct inode *inode = container_of(head, struct inode, i_rcu);
68 struct ecryptfs_inode_info *inode_info;
69 inode_info = ecryptfs_inode_to_private(inode);
70
71 INIT_LIST_HEAD(&inode->i_dentry);
72 kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
73}
74
66/** 75/**
67 * ecryptfs_destroy_inode 76 * ecryptfs_destroy_inode
68 * @inode: The ecryptfs inode 77 * @inode: The ecryptfs inode
@@ -89,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
89 } 98 }
90 } 99 }
91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 100 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
92 kmem_cache_free(ecryptfs_inode_info_cache, inode_info); 101 call_rcu(&inode->i_rcu, ecryptfs_i_callback);
93} 102}
94 103
95/** 104/**
@@ -180,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
180 seq_printf(m, ",ecryptfs_encrypted_view"); 189 seq_printf(m, ",ecryptfs_encrypted_view");
181 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) 190 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
182 seq_printf(m, ",ecryptfs_unlink_sigs"); 191 seq_printf(m, ",ecryptfs_unlink_sigs");
192 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
193 seq_printf(m, ",ecryptfs_mount_auth_tok_only");
183 194
184 return 0; 195 return 0;
185} 196}
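
The i_callback conversion here (and in efs below) is groundwork for RCU-walk path lookup: once inodes can be inspected under rcu_read_lock(), their memory must stay valid for a grace period after ->destroy_inode(), so the final kmem_cache_free() moves into a call_rcu() callback. In this kernel i_rcu shares storage with i_dentry, which is why the callback re-initializes the list before freeing. The per-filesystem boilerplate is uniform; a sketch with placeholder foo_* names:

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* i_rcu clobbered the union member */
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static void foo_destroy_inode(struct inode *inode)
{
	/* defer the free until after the current RCU grace period */
	call_rcu(&inode->i_rcu, foo_i_callback);
}
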
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f04942810818..0f31acb0131c 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -20,16 +20,16 @@
20static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); 20static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
21static int efs_fill_super(struct super_block *s, void *d, int silent); 21static int efs_fill_super(struct super_block *s, void *d, int silent);
22 22
23static int efs_get_sb(struct file_system_type *fs_type, 23static struct dentry *efs_mount(struct file_system_type *fs_type,
24 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 24 int flags, const char *dev_name, void *data)
25{ 25{
26 return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt); 26 return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
27} 27}
28 28
29static struct file_system_type efs_fs_type = { 29static struct file_system_type efs_fs_type = {
30 .owner = THIS_MODULE, 30 .owner = THIS_MODULE,
31 .name = "efs", 31 .name = "efs",
32 .get_sb = efs_get_sb, 32 .mount = efs_mount,
33 .kill_sb = kill_block_super, 33 .kill_sb = kill_block_super,
34 .fs_flags = FS_REQUIRES_DEV, 34 .fs_flags = FS_REQUIRES_DEV,
35}; 35};
@@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb)
65 return &ei->vfs_inode; 65 return &ei->vfs_inode;
66} 66}
67 67
68static void efs_destroy_inode(struct inode *inode) 68static void efs_i_callback(struct rcu_head *head)
69{ 69{
70 struct inode *inode = container_of(head, struct inode, i_rcu);
71 INIT_LIST_HEAD(&inode->i_dentry);
70 kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); 72 kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
71} 73}
72 74
75static void efs_destroy_inode(struct inode *inode)
76{
77 call_rcu(&inode->i_rcu, efs_i_callback);
78}
79
73static void init_once(void *foo) 80static void init_once(void *foo)
74{ 81{
75 struct efs_inode_info *ei = (struct efs_inode_info *) foo; 82 struct efs_inode_info *ei = (struct efs_inode_info *) foo;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6bd3f76fdf88..e0194b3e14d6 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -293,6 +293,7 @@ static const struct file_operations eventfd_fops = {
293 .poll = eventfd_poll, 293 .poll = eventfd_poll,
294 .read = eventfd_read, 294 .read = eventfd_read,
295 .write = eventfd_write, 295 .write = eventfd_write,
296 .llseek = noop_llseek,
296}; 297};
297 298
298/** 299/**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3817149919cb..267d0ada4541 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -77,9 +77,6 @@
77/* Maximum number of nesting allowed inside epoll sets */ 77/* Maximum number of nesting allowed inside epoll sets */
78#define EP_MAX_NESTS 4 78#define EP_MAX_NESTS 4
79 79
80/* Maximum msec timeout value storeable in a long int */
81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
82
83#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 80#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
84 81
85#define EP_UNACTIVE_PTR ((void *) -1L) 82#define EP_UNACTIVE_PTR ((void *) -1L)
@@ -220,7 +217,7 @@ struct ep_send_events_data {
220 * Configuration options available inside /proc/sys/fs/epoll/ 217 * Configuration options available inside /proc/sys/fs/epoll/
221 */ 218 */
222/* Maximum number of epoll watched descriptors, per user */ 219/* Maximum number of epoll watched descriptors, per user */
223static int max_user_watches __read_mostly; 220static long max_user_watches __read_mostly;
224 221
225/* 222/*
226 * This mutex is used to serialize ep_free() and eventpoll_release_file(). 223 * This mutex is used to serialize ep_free() and eventpoll_release_file().
@@ -243,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly;
243 240
244#include <linux/sysctl.h> 241#include <linux/sysctl.h>
245 242
246static int zero; 243static long zero;
244static long long_max = LONG_MAX;
247 245
248ctl_table epoll_table[] = { 246ctl_table epoll_table[] = {
249 { 247 {
250 .procname = "max_user_watches", 248 .procname = "max_user_watches",
251 .data = &max_user_watches, 249 .data = &max_user_watches,
252 .maxlen = sizeof(int), 250 .maxlen = sizeof(max_user_watches),
253 .mode = 0644, 251 .mode = 0644,
254 .proc_handler = proc_dointvec_minmax, 252 .proc_handler = proc_doulongvec_minmax,
255 .extra1 = &zero, 253 .extra1 = &zero,
254 .extra2 = &long_max,
256 }, 255 },
257 { } 256 { }
258}; 257};
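
Widening max_user_watches from int to long has to happen in lock-step across the sysctl plumbing: proc_doulongvec_minmax() dereferences .data, .extra1, and .extra2 as longs, so a stale int in any slot would read or write the wrong width. The same invariant is called out below in comments on the entry the hunk above produces:

static long zero;			/* bounds are dereferenced as long */
static long long_max = LONG_MAX;

ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,		/* now a long */
		.maxlen		= sizeof(max_user_watches),	/* tracks the type */
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,	/* long-aware handler */
		.extra1		= &zero,
		.extra2		= &long_max,
	},
	{ }
};

The matching per-user counter moves to atomic_long_t as well, read and updated with the atomic_long_* variants in the later hunks.
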
@@ -564,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
564 /* At this point it is safe to free the eventpoll item */ 563 /* At this point it is safe to free the eventpoll item */
565 kmem_cache_free(epi_cache, epi); 564 kmem_cache_free(epi_cache, epi);
566 565
567 atomic_dec(&ep->user->epoll_watches); 566 atomic_long_dec(&ep->user->epoll_watches);
568 567
569 return 0; 568 return 0;
570} 569}
@@ -674,7 +673,8 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
674/* File callbacks that implement the eventpoll file behaviour */ 673/* File callbacks that implement the eventpoll file behaviour */
675static const struct file_operations eventpoll_fops = { 674static const struct file_operations eventpoll_fops = {
676 .release = ep_eventpoll_release, 675 .release = ep_eventpoll_release,
677 .poll = ep_eventpoll_poll 676 .poll = ep_eventpoll_poll,
677 .llseek = noop_llseek,
678}; 678};
679 679
680/* Fast test to see if the file is an eventpoll file */ 680/* Fast test to see if the file is an eventpoll file */
@@ -900,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
900{ 900{
901 int error, revents, pwake = 0; 901 int error, revents, pwake = 0;
902 unsigned long flags; 902 unsigned long flags;
903 long user_watches;
903 struct epitem *epi; 904 struct epitem *epi;
904 struct ep_pqueue epq; 905 struct ep_pqueue epq;
905 906
906 if (unlikely(atomic_read(&ep->user->epoll_watches) >= 907 user_watches = atomic_long_read(&ep->user->epoll_watches);
907 max_user_watches)) 908 if (unlikely(user_watches >= max_user_watches))
908 return -ENOSPC; 909 return -ENOSPC;
909 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) 910 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
910 return -ENOMEM; 911 return -ENOMEM;
@@ -968,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
968 969
969 spin_unlock_irqrestore(&ep->lock, flags); 970 spin_unlock_irqrestore(&ep->lock, flags);
970 971
971 atomic_inc(&ep->user->epoll_watches); 972 atomic_long_inc(&ep->user->epoll_watches);
972 973
973 /* We have to call this outside the lock */ 974 /* We have to call this outside the lock */
974 if (pwake) 975 if (pwake)
@@ -1113,21 +1114,35 @@ static int ep_send_events(struct eventpoll *ep,
1113 return ep_scan_ready_list(ep, ep_send_events_proc, &esed); 1114 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1114} 1115}
1115 1116
1117static inline struct timespec ep_set_mstimeout(long ms)
1118{
1119 struct timespec now, ts = {
1120 .tv_sec = ms / MSEC_PER_SEC,
1121 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1122 };
1123
1124 ktime_get_ts(&now);
1125 return timespec_add_safe(now, ts);
1126}
1127
1116static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1128static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1117 int maxevents, long timeout) 1129 int maxevents, long timeout)
1118{ 1130{
1119 int res, eavail; 1131 int res, eavail, timed_out = 0;
1120 unsigned long flags; 1132 unsigned long flags;
1121 long jtimeout; 1133 long slack;
1122 wait_queue_t wait; 1134 wait_queue_t wait;
1135 ktime_t expires, *to = NULL;
1123 1136
1124 /* 1137 if (timeout > 0) {
1125 * Calculate the timeout by checking for the "infinite" value (-1) 1138 struct timespec end_time = ep_set_mstimeout(timeout);
1126 * and the overflow condition. The passed timeout is in milliseconds, 1139
1127 * that why (t * HZ) / 1000. 1140 slack = select_estimate_accuracy(&end_time);
1128 */ 1141 to = &expires;
1129 jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ? 1142 *to = timespec_to_ktime(end_time);
1130 MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000; 1143 } else if (timeout == 0) {
1144 timed_out = 1;
1145 }
1131 1146
1132retry: 1147retry:
1133 spin_lock_irqsave(&ep->lock, flags); 1148 spin_lock_irqsave(&ep->lock, flags);
@@ -1149,7 +1164,7 @@ retry:
1149 * to TASK_INTERRUPTIBLE before doing the checks. 1164 * to TASK_INTERRUPTIBLE before doing the checks.
1150 */ 1165 */
1151 set_current_state(TASK_INTERRUPTIBLE); 1166 set_current_state(TASK_INTERRUPTIBLE);
1152 if (!list_empty(&ep->rdllist) || !jtimeout) 1167 if (!list_empty(&ep->rdllist) || timed_out)
1153 break; 1168 break;
1154 if (signal_pending(current)) { 1169 if (signal_pending(current)) {
1155 res = -EINTR; 1170 res = -EINTR;
@@ -1157,7 +1172,9 @@ retry:
1157 } 1172 }
1158 1173
1159 spin_unlock_irqrestore(&ep->lock, flags); 1174 spin_unlock_irqrestore(&ep->lock, flags);
1160 jtimeout = schedule_timeout(jtimeout); 1175 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
1176 timed_out = 1;
1177
1161 spin_lock_irqsave(&ep->lock, flags); 1178 spin_lock_irqsave(&ep->lock, flags);
1162 } 1179 }
1163 __remove_wait_queue(&ep->wq, &wait); 1180 __remove_wait_queue(&ep->wq, &wait);
@@ -1175,7 +1192,7 @@ retry:
1175 * more luck. 1192 * more luck.
1176 */ 1193 */
1177 if (!res && eavail && 1194 if (!res && eavail &&
1178 !(res = ep_send_events(ep, events, maxevents)) && jtimeout) 1195 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1179 goto retry; 1196 goto retry;
1180 1197
1181 return res; 1198 return res;
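
The ep_poll() rewrite replaces jiffies arithmetic with an absolute hrtimer deadline: the millisecond timeout becomes a timespec added overflow-safely to the current time via timespec_add_safe(), select_estimate_accuracy() picks a wakeup slack, and schedule_hrtimeout_range() returns 0 exactly when the deadline fired. The calling convention, reduced to its moving parts:

	ktime_t expires, *to = NULL;	/* NULL deadline: wait indefinitely */
	long slack = 0;
	int timed_out = (timeout == 0);	/* 0 ms: poll once, never sleep */

	if (timeout > 0) {
		struct timespec end = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end);
		expires = timespec_to_ktime(end);
		to = &expires;
	}

	/* inside the wait loop: a 0 return means the deadline fired */
	if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
		timed_out = 1;
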
@@ -1422,6 +1439,7 @@ static int __init eventpoll_init(void)
1422 */ 1439 */
1423 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / 1440 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
1424 EP_ITEM_COST; 1441 EP_ITEM_COST;
1442 BUG_ON(max_user_watches < 0);
1425 1443
1426 /* Initialize the structure used to perform safe poll wait head wake ups */ 1444 /* Initialize the structure used to perform safe poll wait head wake ups */
1427 ep_nested_calls_init(&poll_safewake_ncalls); 1445 ep_nested_calls_init(&poll_safewake_ncalls);
diff --git a/fs/exec.c b/fs/exec.c
index 6d2b6f936858..52a447d9b6ab 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -54,6 +54,7 @@
54#include <linux/fsnotify.h> 54#include <linux/fsnotify.h>
55#include <linux/fs_struct.h> 55#include <linux/fs_struct.h>
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
57 58
58#include <asm/uaccess.h> 59#include <asm/uaccess.h>
59#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
@@ -65,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core";
65unsigned int core_pipe_limit; 66unsigned int core_pipe_limit;
66int suid_dumpable = 0; 67int suid_dumpable = 0;
67 68
69struct core_name {
70 char *corename;
71 int used, size;
72};
73static atomic_t call_count = ATOMIC_INIT(1);
74
68/* The maximal length of core_pattern is also specified in sysctl.c */ 75/* The maximal length of core_pattern is also specified in sysctl.c */
69 76
70static LIST_HEAD(formats); 77static LIST_HEAD(formats);
@@ -113,7 +120,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
113 goto out; 120 goto out;
114 121
115 file = do_filp_open(AT_FDCWD, tmp, 122 file = do_filp_open(AT_FDCWD, tmp,
116 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, 123 O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
117 MAY_READ | MAY_EXEC | MAY_OPEN); 124 MAY_READ | MAY_EXEC | MAY_OPEN);
118 putname(tmp); 125 putname(tmp);
119 error = PTR_ERR(file); 126 error = PTR_ERR(file);
@@ -157,7 +164,26 @@ out:
157 164
158#ifdef CONFIG_MMU 165#ifdef CONFIG_MMU
159 166
160static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 167void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
168{
169 struct mm_struct *mm = current->mm;
170 long diff = (long)(pages - bprm->vma_pages);
171
172 if (!mm || !diff)
173 return;
174
175 bprm->vma_pages = pages;
176
177#ifdef SPLIT_RSS_COUNTING
178 add_mm_counter(mm, MM_ANONPAGES, diff);
179#else
180 spin_lock(&mm->page_table_lock);
181 add_mm_counter(mm, MM_ANONPAGES, diff);
182 spin_unlock(&mm->page_table_lock);
183#endif
184}
185
186struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
161 int write) 187 int write)
162{ 188{
163 struct page *page; 189 struct page *page;
@@ -179,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
179 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; 205 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
180 struct rlimit *rlim; 206 struct rlimit *rlim;
181 207
208 acct_arg_size(bprm, size / PAGE_SIZE);
209
182 /* 210 /*
183 * We've historically supported up to 32 pages (ARG_MAX) 211 * We've historically supported up to 32 pages (ARG_MAX)
184 * of argument strings even with small stacks 212 * of argument strings even with small stacks
@@ -247,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
247 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; 275 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 276 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain); 277 INIT_LIST_HEAD(&vma->anon_vma_chain);
278
279 err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
280 if (err)
281 goto err;
282
250 err = insert_vm_struct(mm, vma); 283 err = insert_vm_struct(mm, vma);
251 if (err) 284 if (err)
252 goto err; 285 goto err;
@@ -269,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
269 302
270#else 303#else
271 304
272static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 305void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
306{
307}
308
309struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
273 int write) 310 int write)
274{ 311{
275 struct page *page; 312 struct page *page;
@@ -686,7 +723,7 @@ struct file *open_exec(const char *name)
686 int err; 723 int err;
687 724
688 file = do_filp_open(AT_FDCWD, name, 725 file = do_filp_open(AT_FDCWD, name,
689 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, 726 O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
690 MAY_EXEC | MAY_OPEN); 727 MAY_EXEC | MAY_OPEN);
691 if (IS_ERR(file)) 728 if (IS_ERR(file))
692 goto out; 729 goto out;
@@ -759,6 +796,10 @@ static int exec_mmap(struct mm_struct *mm)
759 tsk->mm = mm; 796 tsk->mm = mm;
760 tsk->active_mm = mm; 797 tsk->active_mm = mm;
761 activate_mm(active_mm, mm); 798 activate_mm(active_mm, mm);
799 if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
800 atomic_dec(&old_mm->oom_disable_count);
801 atomic_inc(&tsk->mm->oom_disable_count);
802 }
762 task_unlock(tsk); 803 task_unlock(tsk);
763 arch_pick_mmap_layout(mm); 804 arch_pick_mmap_layout(mm);
764 if (old_mm) { 805 if (old_mm) {
@@ -992,13 +1033,14 @@ int flush_old_exec(struct linux_binprm * bprm)
992 /* 1033 /*
993 * Release all of the old mmap stuff 1034 * Release all of the old mmap stuff
994 */ 1035 */
1036 acct_arg_size(bprm, 0);
995 retval = exec_mmap(bprm->mm); 1037 retval = exec_mmap(bprm->mm);
996 if (retval) 1038 if (retval)
997 goto out; 1039 goto out;
998 1040
999 bprm->mm = NULL; /* We're using it now */ 1041 bprm->mm = NULL; /* We're using it now */
1000 1042
1001 current->flags &= ~PF_RANDOMIZE; 1043 current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
1002 flush_thread(); 1044 flush_thread();
1003 current->personality &= ~bprm->per_clear; 1045 current->personality &= ~bprm->per_clear;
1004 1046
@@ -1078,14 +1120,14 @@ EXPORT_SYMBOL(setup_new_exec);
1078 */ 1120 */
1079int prepare_bprm_creds(struct linux_binprm *bprm) 1121int prepare_bprm_creds(struct linux_binprm *bprm)
1080{ 1122{
1081 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1123 if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1082 return -ERESTARTNOINTR; 1124 return -ERESTARTNOINTR;
1083 1125
1084 bprm->cred = prepare_exec_creds(); 1126 bprm->cred = prepare_exec_creds();
1085 if (likely(bprm->cred)) 1127 if (likely(bprm->cred))
1086 return 0; 1128 return 0;
1087 1129
1088 mutex_unlock(&current->cred_guard_mutex); 1130 mutex_unlock(&current->signal->cred_guard_mutex);
1089 return -ENOMEM; 1131 return -ENOMEM;
1090} 1132}
1091 1133
@@ -1093,7 +1135,7 @@ void free_bprm(struct linux_binprm *bprm)
1093{ 1135{
1094 free_arg_pages(bprm); 1136 free_arg_pages(bprm);
1095 if (bprm->cred) { 1137 if (bprm->cred) {
1096 mutex_unlock(&current->cred_guard_mutex); 1138 mutex_unlock(&current->signal->cred_guard_mutex);
1097 abort_creds(bprm->cred); 1139 abort_creds(bprm->cred);
1098 } 1140 }
1099 kfree(bprm); 1141 kfree(bprm);
@@ -1114,13 +1156,13 @@ void install_exec_creds(struct linux_binprm *bprm)
1114 * credentials; any time after this it may be unlocked. 1156 * credentials; any time after this it may be unlocked.
1115 */ 1157 */
1116 security_bprm_committed_creds(bprm); 1158 security_bprm_committed_creds(bprm);
1117 mutex_unlock(&current->cred_guard_mutex); 1159 mutex_unlock(&current->signal->cred_guard_mutex);
1118} 1160}
1119EXPORT_SYMBOL(install_exec_creds); 1161EXPORT_SYMBOL(install_exec_creds);
1120 1162
1121/* 1163/*
1122 * determine how safe it is to execute the proposed program 1164 * determine how safe it is to execute the proposed program
1123 * - the caller must hold current->cred_guard_mutex to protect against 1165 * - the caller must hold ->cred_guard_mutex to protect against
1124 * PTRACE_ATTACH 1166 * PTRACE_ATTACH
1125 */ 1167 */
1126int check_unsafe_exec(struct linux_binprm *bprm) 1168int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1401,7 +1443,6 @@ int do_execve(const char * filename,
1401 if (retval < 0) 1443 if (retval < 0)
1402 goto out; 1444 goto out;
1403 1445
1404 current->flags &= ~PF_KTHREAD;
1405 retval = search_binary_handler(bprm,regs); 1446 retval = search_binary_handler(bprm,regs);
1406 if (retval < 0) 1447 if (retval < 0)
1407 goto out; 1448 goto out;
@@ -1416,8 +1457,10 @@ int do_execve(const char * filename,
1416 return retval; 1457 return retval;
1417 1458
1418out: 1459out:
1419 if (bprm->mm) 1460 if (bprm->mm) {
1420 mmput (bprm->mm); 1461 acct_arg_size(bprm, 0);
1462 mmput(bprm->mm);
1463 }
1421 1464
1422out_file: 1465out_file:
1423 if (bprm->file) { 1466 if (bprm->file) {
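
The acct_arg_size() calls introduced earlier pair up deliberately: get_arg_page() charges the current argv footprint to MM_ANONPAGES each time the temporary exec stack grows, and every path that disposes of bprm->mm, whether exec_mmap() succeeded or the error path above is about to mmput() it, resets the charge to zero first so the counter cannot leak. The pairing, in outline:

	/* growth: called from get_arg_page() with the stack's current size */
	acct_arg_size(bprm, size / PAGE_SIZE);

	/* teardown: always zero the charge before the mm goes away */
	acct_arg_size(bprm, 0);
	mmput(bprm->mm);
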
@@ -1454,127 +1497,148 @@ void set_binfmt(struct linux_binfmt *new)
1454 1497
1455EXPORT_SYMBOL(set_binfmt); 1498EXPORT_SYMBOL(set_binfmt);
1456 1499
1500static int expand_corename(struct core_name *cn)
1501{
1502 char *old_corename = cn->corename;
1503
1504 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
1505 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
1506
1507 if (!cn->corename) {
1508 kfree(old_corename);
1509 return -ENOMEM;
1510 }
1511
1512 return 0;
1513}
1514
1515static int cn_printf(struct core_name *cn, const char *fmt, ...)
1516{
1517 char *cur;
1518 int need;
1519 int ret;
1520 va_list arg;
1521
1522 va_start(arg, fmt);
1523 need = vsnprintf(NULL, 0, fmt, arg);
1524 va_end(arg);
1525
1526 if (likely(need < cn->size - cn->used - 1))
1527 goto out_printf;
1528
1529 ret = expand_corename(cn);
1530 if (ret)
1531 goto expand_fail;
1532
1533out_printf:
1534 cur = cn->corename + cn->used;
1535 va_start(arg, fmt);
1536 vsnprintf(cur, need + 1, fmt, arg);
1537 va_end(arg);
1538 cn->used += need;
1539 return 0;
1540
1541expand_fail:
1542 return ret;
1543}
1544
1457/* format_corename will inspect the pattern parameter, and output a 1545/* format_corename will inspect the pattern parameter, and output a
1458 * name into corename, which must have space for at least 1546 * name into corename, which must have space for at least
1459 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 1547 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1460 */ 1548 */
1461static int format_corename(char *corename, long signr) 1549static int format_corename(struct core_name *cn, long signr)
1462{ 1550{
1463 const struct cred *cred = current_cred(); 1551 const struct cred *cred = current_cred();
1464 const char *pat_ptr = core_pattern; 1552 const char *pat_ptr = core_pattern;
1465 int ispipe = (*pat_ptr == '|'); 1553 int ispipe = (*pat_ptr == '|');
1466 char *out_ptr = corename;
1467 char *const out_end = corename + CORENAME_MAX_SIZE;
1468 int rc;
1469 int pid_in_pattern = 0; 1554 int pid_in_pattern = 0;
1555 int err = 0;
1556
1557 cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
1558 cn->corename = kmalloc(cn->size, GFP_KERNEL);
1559 cn->used = 0;
1560
1561 if (!cn->corename)
1562 return -ENOMEM;
1470 1563
1471 /* Repeat as long as we have more pattern to process and more output 1564 /* Repeat as long as we have more pattern to process and more output
1472 space */ 1565 space */
1473 while (*pat_ptr) { 1566 while (*pat_ptr) {
1474 if (*pat_ptr != '%') { 1567 if (*pat_ptr != '%') {
1475 if (out_ptr == out_end) 1568 if (*pat_ptr == 0)
1476 goto out; 1569 goto out;
1477 *out_ptr++ = *pat_ptr++; 1570 err = cn_printf(cn, "%c", *pat_ptr++);
1478 } else { 1571 } else {
1479 switch (*++pat_ptr) { 1572 switch (*++pat_ptr) {
1573 /* single % at the end, drop that */
1480 case 0: 1574 case 0:
1481 goto out; 1575 goto out;
1482 /* Double percent, output one percent */ 1576 /* Double percent, output one percent */
1483 case '%': 1577 case '%':
1484 if (out_ptr == out_end) 1578 err = cn_printf(cn, "%c", '%');
1485 goto out;
1486 *out_ptr++ = '%';
1487 break; 1579 break;
1488 /* pid */ 1580 /* pid */
1489 case 'p': 1581 case 'p':
1490 pid_in_pattern = 1; 1582 pid_in_pattern = 1;
1491 rc = snprintf(out_ptr, out_end - out_ptr, 1583 err = cn_printf(cn, "%d",
1492 "%d", task_tgid_vnr(current)); 1584 task_tgid_vnr(current));
1493 if (rc > out_end - out_ptr)
1494 goto out;
1495 out_ptr += rc;
1496 break; 1585 break;
1497 /* uid */ 1586 /* uid */
1498 case 'u': 1587 case 'u':
1499 rc = snprintf(out_ptr, out_end - out_ptr, 1588 err = cn_printf(cn, "%d", cred->uid);
1500 "%d", cred->uid);
1501 if (rc > out_end - out_ptr)
1502 goto out;
1503 out_ptr += rc;
1504 break; 1589 break;
1505 /* gid */ 1590 /* gid */
1506 case 'g': 1591 case 'g':
1507 rc = snprintf(out_ptr, out_end - out_ptr, 1592 err = cn_printf(cn, "%d", cred->gid);
1508 "%d", cred->gid);
1509 if (rc > out_end - out_ptr)
1510 goto out;
1511 out_ptr += rc;
1512 break; 1593 break;
1513 /* signal that caused the coredump */ 1594 /* signal that caused the coredump */
1514 case 's': 1595 case 's':
1515 rc = snprintf(out_ptr, out_end - out_ptr, 1596 err = cn_printf(cn, "%ld", signr);
1516 "%ld", signr);
1517 if (rc > out_end - out_ptr)
1518 goto out;
1519 out_ptr += rc;
1520 break; 1597 break;
1521 /* UNIX time of coredump */ 1598 /* UNIX time of coredump */
1522 case 't': { 1599 case 't': {
1523 struct timeval tv; 1600 struct timeval tv;
1524 do_gettimeofday(&tv); 1601 do_gettimeofday(&tv);
1525 rc = snprintf(out_ptr, out_end - out_ptr, 1602 err = cn_printf(cn, "%lu", tv.tv_sec);
1526 "%lu", tv.tv_sec);
1527 if (rc > out_end - out_ptr)
1528 goto out;
1529 out_ptr += rc;
1530 break; 1603 break;
1531 } 1604 }
1532 /* hostname */ 1605 /* hostname */
1533 case 'h': 1606 case 'h':
1534 down_read(&uts_sem); 1607 down_read(&uts_sem);
1535 rc = snprintf(out_ptr, out_end - out_ptr, 1608 err = cn_printf(cn, "%s",
1536 "%s", utsname()->nodename); 1609 utsname()->nodename);
1537 up_read(&uts_sem); 1610 up_read(&uts_sem);
1538 if (rc > out_end - out_ptr)
1539 goto out;
1540 out_ptr += rc;
1541 break; 1611 break;
1542 /* executable */ 1612 /* executable */
1543 case 'e': 1613 case 'e':
1544 rc = snprintf(out_ptr, out_end - out_ptr, 1614 err = cn_printf(cn, "%s", current->comm);
1545 "%s", current->comm);
1546 if (rc > out_end - out_ptr)
1547 goto out;
1548 out_ptr += rc;
1549 break; 1615 break;
1550 /* core limit size */ 1616 /* core limit size */
1551 case 'c': 1617 case 'c':
1552 rc = snprintf(out_ptr, out_end - out_ptr, 1618 err = cn_printf(cn, "%lu",
1553 "%lu", rlimit(RLIMIT_CORE)); 1619 rlimit(RLIMIT_CORE));
1554 if (rc > out_end - out_ptr)
1555 goto out;
1556 out_ptr += rc;
1557 break; 1620 break;
1558 default: 1621 default:
1559 break; 1622 break;
1560 } 1623 }
1561 ++pat_ptr; 1624 ++pat_ptr;
1562 } 1625 }
1626
1627 if (err)
1628 return err;
1563 } 1629 }
1630
1564 /* Backward compatibility with core_uses_pid: 1631 /* Backward compatibility with core_uses_pid:
1565 * 1632 *
1566 * If core_pattern does not include a %p (as is the default) 1633 * If core_pattern does not include a %p (as is the default)
1567 * and core_uses_pid is set, then .%pid will be appended to 1634 * and core_uses_pid is set, then .%pid will be appended to
1568 * the filename. Do not do this for piped commands. */ 1635 * the filename. Do not do this for piped commands. */
1569 if (!ispipe && !pid_in_pattern && core_uses_pid) { 1636 if (!ispipe && !pid_in_pattern && core_uses_pid) {
1570 rc = snprintf(out_ptr, out_end - out_ptr, 1637 err = cn_printf(cn, ".%d", task_tgid_vnr(current));
1571 ".%d", task_tgid_vnr(current)); 1638 if (err)
1572 if (rc > out_end - out_ptr) 1639 return err;
1573 goto out;
1574 out_ptr += rc;
1575 } 1640 }
1576out: 1641out:
1577 *out_ptr = 0;
1578 return ispipe; 1642 return ispipe;
1579} 1643}
1580 1644
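
The corename rewrite swaps a fixed CORENAME_MAX_SIZE buffer and a dozen copies of the snprintf/overflow-check/advance dance for one helper built on a C99 guarantee: vsnprintf(NULL, 0, ...) writes nothing and returns the length the output would need. Measure, grow the buffer with krealloc() if needed, then format for real. The core of the helper, as a sketch:

static int cn_printf_sketch(struct core_name *cn, const char *fmt, ...)
{
	va_list ap;
	int need;

	va_start(ap, fmt);
	need = vsnprintf(NULL, 0, fmt, ap);	/* pass 1: measure only */
	va_end(ap);

	if (need + 1 > cn->size - cn->used && expand_corename(cn))
		return -ENOMEM;			/* expand_corename() krealloc()s */

	va_start(ap, fmt);			/* restart: a va_list is consumed */
	vsnprintf(cn->corename + cn->used, need + 1, fmt, ap);
	va_end(ap);
	cn->used += need;
	return 0;
}
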
@@ -1851,7 +1915,7 @@ static int umh_pipe_setup(struct subprocess_info *info)
1851void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1915void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1852{ 1916{
1853 struct core_state core_state; 1917 struct core_state core_state;
1854 char corename[CORENAME_MAX_SIZE + 1]; 1918 struct core_name cn;
1855 struct mm_struct *mm = current->mm; 1919 struct mm_struct *mm = current->mm;
1856 struct linux_binfmt * binfmt; 1920 struct linux_binfmt * binfmt;
1857 const struct cred *old_cred; 1921 const struct cred *old_cred;
@@ -1906,7 +1970,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1906 */ 1970 */
1907 clear_thread_flag(TIF_SIGPENDING); 1971 clear_thread_flag(TIF_SIGPENDING);
1908 1972
1909 ispipe = format_corename(corename, signr); 1973 ispipe = format_corename(&cn, signr);
1974
1975 if (ispipe == -ENOMEM) {
1976 printk(KERN_WARNING "format_corename failed\n");
1977 printk(KERN_WARNING "Aborting core\n");
1978 goto fail_corename;
1979 }
1910 1980
1911 if (ispipe) { 1981 if (ispipe) {
1912 int dump_count; 1982 int dump_count;
@@ -1943,7 +2013,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1943 goto fail_dropcount; 2013 goto fail_dropcount;
1944 } 2014 }
1945 2015
1946 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); 2016 helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
1947 if (!helper_argv) { 2017 if (!helper_argv) {
1948 printk(KERN_WARNING "%s failed to allocate memory\n", 2018 printk(KERN_WARNING "%s failed to allocate memory\n",
1949 __func__); 2019 __func__);
@@ -1956,7 +2026,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1956 argv_free(helper_argv); 2026 argv_free(helper_argv);
1957 if (retval) { 2027 if (retval) {
1958 printk(KERN_INFO "Core dump to %s pipe failed\n", 2028 printk(KERN_INFO "Core dump to %s pipe failed\n",
1959 corename); 2029 cn.corename);
1960 goto close_fail; 2030 goto close_fail;
1961 } 2031 }
1962 } else { 2032 } else {
@@ -1965,7 +2035,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1965 if (cprm.limit < binfmt->min_coredump) 2035 if (cprm.limit < binfmt->min_coredump)
1966 goto fail_unlock; 2036 goto fail_unlock;
1967 2037
1968 cprm.file = filp_open(corename, 2038 cprm.file = filp_open(cn.corename,
1969 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 2039 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1970 0600); 2040 0600);
1971 if (IS_ERR(cprm.file)) 2041 if (IS_ERR(cprm.file))
@@ -2007,6 +2077,8 @@ fail_dropcount:
2007 if (ispipe) 2077 if (ispipe)
2008 atomic_dec(&core_dump_count); 2078 atomic_dec(&core_dump_count);
2009fail_unlock: 2079fail_unlock:
2080 kfree(cn.corename);
2081fail_corename:
2010 coredump_finish(mm); 2082 coredump_finish(mm);
2011 revert_creds(old_cred); 2083 revert_creds(old_cred);
2012fail_creds: 2084fail_creds:
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d91e9d829bc1..dcc941d82d67 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
420 err = exofs_write_begin(NULL, page->mapping, pos, len, 420 err = exofs_write_begin(NULL, page->mapping, pos, len,
421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
422 if (err) 422 if (err)
423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n", 423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
424 err); 424 err);
425 425
426 de->inode_no = cpu_to_le64(inode->i_ino); 426 de->inode_no = cpu_to_le64(inode->i_ino);
@@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, 556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
557 &page, NULL); 557 &page, NULL);
558 if (err) 558 if (err)
559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n", 559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
560 err); 560 err);
561 if (pde) 561 if (pde)
562 pde->rec_len = cpu_to_le16(to - from); 562 pde->rec_len = cpu_to_le16(to - from);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 68cb23e3bb98..b905c79b4f0a 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync)
46{ 46{
47 int ret; 47 int ret;
48 struct inode *inode = filp->f_mapping->host; 48 struct inode *inode = filp->f_mapping->host;
49 struct writeback_control wbc = {
50 .sync_mode = WB_SYNC_ALL,
51 .nr_to_write = 0, /* metadata-only; caller takes care of data */
52 };
53 struct super_block *sb; 49 struct super_block *sb;
54 50
55 if (!(inode->i_state & I_DIRTY)) 51 if (!(inode->i_state & I_DIRTY))
@@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync)
57 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 53 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
58 return 0; 54 return 0;
59 55
60 ret = sync_inode(inode, &wbc); 56 ret = sync_inode_metadata(inode, 1);
61 57
62 /* This is a good place to write the sb */ 58 /* This is a good place to write the sb */
63 /* TODO: Schedule an sb-sync on create */ 59 /* TODO: Schedule an sb-sync on create */
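
sync_inode_metadata(inode, wait) is the dedicated helper for what the deleted writeback_control spelled out by hand: a WB_SYNC_ALL pass with nr_to_write = 0, i.e. flush the inode's metadata and none of its data pages. Before and after, side by side:

	/* before: hand-rolled metadata-only sync */
	struct writeback_control wbc = {
		.sync_mode   = WB_SYNC_ALL,
		.nr_to_write = 0,	/* metadata only; data handled by the caller */
	};
	ret = sync_inode(inode, &wbc);

	/* after: same intent, one call */
	ret = sync_inode_metadata(inode, 1 /* wait */);
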
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 3eadd97324b1..a7555238c41a 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -185,7 +185,7 @@ static void update_write_page(struct page *page, int ret)
185/* Called at the end of reads, to optionally unlock pages and update their 185/* Called at the end of reads, to optionally unlock pages and update their
186 * status. 186 * status.
187 */ 187 */
188static int __readpages_done(struct page_collect *pcol, bool do_unlock) 188static int __readpages_done(struct page_collect *pcol)
189{ 189{
190 int i; 190 int i;
191 u64 resid; 191 u64 resid;
@@ -221,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
221 page_stat ? "bad_bytes" : "good_bytes"); 221 page_stat ? "bad_bytes" : "good_bytes");
222 222
223 ret = update_read_page(page, page_stat); 223 ret = update_read_page(page, page_stat);
224 if (do_unlock) 224 if (!pcol->read_4_write)
225 unlock_page(page); 225 unlock_page(page);
226 length += PAGE_SIZE; 226 length += PAGE_SIZE;
227 } 227 }
@@ -236,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
236{ 236{
237 struct page_collect *pcol = p; 237 struct page_collect *pcol = p;
238 238
239 __readpages_done(pcol, true); 239 __readpages_done(pcol);
240 atomic_dec(&pcol->sbi->s_curr_pending); 240 atomic_dec(&pcol->sbi->s_curr_pending);
241 kfree(pcol); 241 kfree(pcol);
242} 242}
@@ -257,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
257 } 257 }
258} 258}
259 259
260static int read_exec(struct page_collect *pcol, bool is_sync) 260static int read_exec(struct page_collect *pcol)
261{ 261{
262 struct exofs_i_info *oi = exofs_i(pcol->inode); 262 struct exofs_i_info *oi = exofs_i(pcol->inode);
263 struct exofs_io_state *ios = pcol->ios; 263 struct exofs_io_state *ios = pcol->ios;
@@ -267,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
267 if (!pcol->pages) 267 if (!pcol->pages)
268 return 0; 268 return 0;
269 269
270 /* see comment in _readpage() about sync reads */
271 WARN_ON(is_sync && (pcol->nr_pages != 1));
272
273 ios->pages = pcol->pages; 270 ios->pages = pcol->pages;
274 ios->nr_pages = pcol->nr_pages; 271 ios->nr_pages = pcol->nr_pages;
275 ios->length = pcol->length; 272 ios->length = pcol->length;
276 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; 273 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
277 274
278 if (is_sync) { 275 if (pcol->read_4_write) {
279 exofs_oi_read(oi, pcol->ios); 276 exofs_oi_read(oi, pcol->ios);
280 return __readpages_done(pcol, false); 277 return __readpages_done(pcol);
281 } 278 }
282 279
283 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 280 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -303,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
303 return 0; 300 return 0;
304 301
305err: 302err:
306 if (!is_sync) 303 if (!pcol->read_4_write)
307 _unlock_pcol_pages(pcol, ret, READ); 304 _unlock_pcol_pages(pcol, ret, READ);
308 305
309 pcol_free(pcol); 306 pcol_free(pcol);
@@ -356,7 +353,7 @@ static int readpage_strip(void *data, struct page *page)
356 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 353 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
357 " splitting\n", inode->i_ino, page->index); 354 " splitting\n", inode->i_ino, page->index);
358 355
359 return read_exec(pcol, false); 356 return read_exec(pcol);
360 } 357 }
361 358
362try_again: 359try_again:
@@ -366,7 +363,7 @@ try_again:
366 } else if (unlikely((pcol->pg_first + pcol->nr_pages) != 363 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
367 page->index)) { 364 page->index)) {
368 /* Discontinuity detected, split the request */ 365 /* Discontinuity detected, split the request */
369 ret = read_exec(pcol, false); 366 ret = read_exec(pcol);
370 if (unlikely(ret)) 367 if (unlikely(ret))
371 goto fail; 368 goto fail;
372 goto try_again; 369 goto try_again;
@@ -391,7 +388,7 @@ try_again:
391 page, len, pcol->nr_pages, pcol->length); 388 page, len, pcol->nr_pages, pcol->length);
392 389
393 /* split the request, and start again with current page */ 390 /* split the request, and start again with current page */
394 ret = read_exec(pcol, false); 391 ret = read_exec(pcol);
395 if (unlikely(ret)) 392 if (unlikely(ret))
396 goto fail; 393 goto fail;
397 394
@@ -420,27 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
420 return ret; 417 return ret;
421 } 418 }
422 419
423 return read_exec(&pcol, false); 420 return read_exec(&pcol);
424} 421}
425 422
426static int _readpage(struct page *page, bool is_sync) 423static int _readpage(struct page *page, bool read_4_write)
427{ 424{
428 struct page_collect pcol; 425 struct page_collect pcol;
429 int ret; 426 int ret;
430 427
431 _pcol_init(&pcol, 1, page->mapping->host); 428 _pcol_init(&pcol, 1, page->mapping->host);
432 429
433 /* readpage_strip might call read_exec(,is_sync==false) at several 430 pcol.read_4_write = read_4_write;
434 * places but not if we have a single page.
435 */
436 pcol.read_4_write = is_sync;
437 ret = readpage_strip(&pcol, page); 431 ret = readpage_strip(&pcol, page);
438 if (ret) { 432 if (ret) {
439 EXOFS_ERR("_readpage => %d\n", ret); 433 EXOFS_ERR("_readpage => %d\n", ret);
440 return ret; 434 return ret;
441 } 435 }
442 436
443 return read_exec(&pcol, is_sync); 437 return read_exec(&pcol);
444} 438}
445 439
446/* 440/*
@@ -511,7 +505,7 @@ static int write_exec(struct page_collect *pcol)
511 505
512 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
513 if (!pcol_copy) { 507 if (!pcol_copy) {
514 EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); 508 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
515 ret = -ENOMEM; 509 ret = -ENOMEM;
516 goto err; 510 goto err;
517 } 511 }
@@ -527,7 +521,7 @@ static int write_exec(struct page_collect *pcol)
527 521
528 ret = exofs_oi_write(oi, ios); 522 ret = exofs_oi_write(oi, ios);
529 if (unlikely(ret)) { 523 if (unlikely(ret)) {
530 EXOFS_ERR("write_exec: exofs_oi_write() Faild\n"); 524 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
531 goto err; 525 goto err;
532 } 526 }
533 527
@@ -628,7 +622,7 @@ try_again:
628 /* split the request, next loop will start again */ 622 /* split the request, next loop will start again */
629 ret = write_exec(pcol); 623 ret = write_exec(pcol);
630 if (unlikely(ret)) { 624 if (unlikely(ret)) {
631 EXOFS_DBGMSG("write_exec faild => %d", ret); 625 EXOFS_DBGMSG("write_exec failed => %d", ret);
632 goto fail; 626 goto fail;
633 } 627 }
634 628
@@ -719,7 +713,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
719 ret = simple_write_begin(file, mapping, pos, len, flags, pagep, 713 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
720 fsdata); 714 fsdata);
721 if (ret) { 715 if (ret) {
722 EXOFS_DBGMSG("simple_write_begin faild\n"); 716 EXOFS_DBGMSG("simple_write_begin failed\n");
723 goto out; 717 goto out;
724 } 718 }
725 719
@@ -732,7 +726,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
732 if (ret) { 726 if (ret) {
733 /*SetPageError was done by _readpage. Is it ok?*/ 727 /*SetPageError was done by _readpage. Is it ok?*/
734 unlock_page(page); 728 unlock_page(page);
735 EXOFS_DBGMSG("__readpage_filler faild\n"); 729 EXOFS_DBGMSG("__readpage_filler failed\n");
736 } 730 }
737 } 731 }
738out: 732out:
@@ -1072,8 +1066,10 @@ bad_inode:
1072int __exofs_wait_obj_created(struct exofs_i_info *oi) 1066int __exofs_wait_obj_created(struct exofs_i_info *oi)
1073{ 1067{
1074 if (!obj_created(oi)) { 1068 if (!obj_created(oi)) {
1069 EXOFS_DBGMSG("!obj_created\n");
1075 BUG_ON(!obj_2bcreated(oi)); 1070 BUG_ON(!obj_2bcreated(oi));
1076 wait_event(oi->i_wq, obj_created(oi)); 1071 wait_event(oi->i_wq, obj_created(oi));
1072 EXOFS_DBGMSG("wait_event done\n");
1077 } 1073 }
1078 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1074 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1079} 1075}
@@ -1095,7 +1091,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
1095 atomic_dec(&sbi->s_curr_pending); 1091 atomic_dec(&sbi->s_curr_pending);
1096 1092
1097 if (unlikely(ret)) { 1093 if (unlikely(ret)) {
1098 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", 1094 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
1099 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1095 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
1100 /*TODO: When FS is corrupted creation can fail, object already 1096 /*TODO: When FS is corrupted creation can fail, object already
1101 * exist. Get rid of this asynchronous creation, if exist 1097 * exist. Get rid of this asynchronous creation, if exist
@@ -1107,7 +1103,6 @@ static void create_done(struct exofs_io_state *ios, void *p)
1107 1103
1108 set_obj_created(oi); 1104 set_obj_created(oi);
1109 1105
1110 atomic_dec(&inode->i_count);
1111 wake_up(&oi->i_wq); 1106 wake_up(&oi->i_wq);
1112} 1107}
1113 1108
@@ -1157,17 +1152,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1157 ios->obj.id = exofs_oi_objno(oi); 1152 ios->obj.id = exofs_oi_objno(oi);
1158 exofs_make_credential(oi->i_cred, &ios->obj); 1153 exofs_make_credential(oi->i_cred, &ios->obj);
1159 1154
1160 /* increment the refcount so that the inode will still be around when we
1161 * reach the callback
1162 */
1163 atomic_inc(&inode->i_count);
1164
1165 ios->done = create_done; 1155 ios->done = create_done;
1166 ios->private = inode; 1156 ios->private = inode;
1167 ios->cred = oi->i_cred; 1157 ios->cred = oi->i_cred;
1168 ret = exofs_sbi_create(ios); 1158 ret = exofs_sbi_create(ios);
1169 if (ret) { 1159 if (ret) {
1170 atomic_dec(&inode->i_count);
1171 exofs_put_io_state(ios); 1160 exofs_put_io_state(ios);
1172 return ERR_PTR(ret); 1161 return ERR_PTR(ret);
1173 } 1162 }
@@ -1215,7 +1204,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1215 1204
1216 args = kzalloc(sizeof(*args), GFP_KERNEL); 1205 args = kzalloc(sizeof(*args), GFP_KERNEL);
1217 if (!args) { 1206 if (!args) {
1218 EXOFS_DBGMSG("Faild kzalloc of args\n"); 1207 EXOFS_DBGMSG("Failed kzalloc of args\n");
1219 return -ENOMEM; 1208 return -ENOMEM;
1220 } 1209 }
1221 1210
@@ -1257,12 +1246,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1257 ios->out_attr_len = 1; 1246 ios->out_attr_len = 1;
1258 ios->out_attr = &attr; 1247 ios->out_attr = &attr;
1259 1248
1260 if (!obj_created(oi)) { 1249 wait_obj_created(oi);
1261 EXOFS_DBGMSG("!obj_created\n");
1262 BUG_ON(!obj_2bcreated(oi));
1263 wait_event(oi->i_wq, obj_created(oi));
1264 EXOFS_DBGMSG("wait_event done\n");
1265 }
1266 1250
1267 if (!do_sync) { 1251 if (!do_sync) {
1268 args->sbi = sbi; 1252 args->sbi = sbi;
@@ -1325,12 +1309,12 @@ void exofs_evict_inode(struct inode *inode)
1325 inode->i_size = 0; 1309 inode->i_size = 0;
1326 end_writeback(inode); 1310 end_writeback(inode);
1327 1311
1328 /* if we are deleting an obj that hasn't been created yet, wait */ 1312 /* if we are deleting an obj that hasn't been created yet, wait.
1329 if (!obj_created(oi)) { 1313 * This also makes sure that create_done cannot be called with an
1330 BUG_ON(!obj_2bcreated(oi)); 1314 * already evicted inode.
1331 wait_event(oi->i_wq, obj_created(oi)); 1315 */
1332 /* ignore the error attempt a remove anyway */ 1316 wait_obj_created(oi);
1333 } 1317 /* ignore the error, attempt a remove anyway */
1334 1318
1335 /* Now Remove the OSD objects */ 1319 /* Now Remove the OSD objects */
1336 ret = exofs_get_io_state(&sbi->layout, &ios); 1320 ret = exofs_get_io_state(&sbi->layout, &ios);
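The create_done()/__exofs_wait_obj_created() pair above is a completion-style handshake: the asynchronous creation callback sets a flag and wakes a waitqueue, while every path that needs the object sleeps until the flag is set. A self-contained sketch of the pattern (names are illustrative, not the exofs ones):

#include <linux/bitops.h>
#include <linux/wait.h>

struct obj_state {
	unsigned long flags;		/* bit 0: OBJ_CREATED */
	wait_queue_head_t wq;
};

#define OBJ_CREATED 0

static void obj_state_init(struct obj_state *st)
{
	st->flags = 0;
	init_waitqueue_head(&st->wq);
}

/* async completion path: mark created, then wake all sleepers */
static void obj_create_done(struct obj_state *st)
{
	set_bit(OBJ_CREATED, &st->flags);
	wake_up(&st->wq);
}

/* callers that need the object: sleep until the flag is set */
static void obj_wait_created(struct obj_state *st)
{
	wait_event(st->wq, test_bit(OBJ_CREATED, &st->flags));
}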
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 6550bf70e41d..f74a2ec027a6 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -55,7 +55,7 @@ int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
55 55
56 ret = osd_finalize_request(or, 0, cred, NULL); 56 ret = osd_finalize_request(or, 0, cred, NULL);
57 if (unlikely(ret)) { 57 if (unlikely(ret)) {
58 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); 58 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
59 goto out; 59 goto out;
60 } 60 }
61 61
@@ -79,7 +79,7 @@ int exofs_get_io_state(struct exofs_layout *layout,
79 */ 79 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); 80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) { 81 if (unlikely(!ios)) {
82 EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", 82 EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs)); 83 exofs_io_state_size(layout->s_numdevs));
84 *pios = NULL; 84 *pios = NULL;
85 return -ENOMEM; 85 return -ENOMEM;
@@ -172,7 +172,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
172 172
173 ret = osd_finalize_request(or, 0, ios->cred, NULL); 173 ret = osd_finalize_request(or, 0, ios->cred, NULL);
174 if (unlikely(ret)) { 174 if (unlikely(ret)) {
175 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", 175 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret); 176 ret);
177 return ret; 177 return ret;
178 } 178 }
@@ -361,7 +361,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
361 361
362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
363 if (unlikely(!per_dev->bio)) { 363 if (unlikely(!per_dev->bio)) {
364 EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", 364 EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
365 bio_size); 365 bio_size);
366 return -ENOMEM; 366 return -ENOMEM;
367 } 367 }
@@ -564,7 +564,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
564 master_dev->bio->bi_max_vecs); 564 master_dev->bio->bi_max_vecs);
565 if (unlikely(!bio)) { 565 if (unlikely(!bio)) {
566 EXOFS_DBGMSG( 566 EXOFS_DBGMSG(
567 "Faild to allocate BIO size=%u\n", 567 "Failed to allocate BIO size=%u\n",
568 master_dev->bio->bi_max_vecs); 568 master_dev->bio->bi_max_vecs);
569 ret = -ENOMEM; 569 ret = -ENOMEM;
570 goto out; 570 goto out;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c236863..264e95d02830 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
153 153
154 inode->i_ctime = CURRENT_TIME; 154 inode->i_ctime = CURRENT_TIME;
155 inode_inc_link_count(inode); 155 inode_inc_link_count(inode);
156 atomic_inc(&inode->i_count); 156 ihold(inode);
157 157
158 return exofs_add_nondir(dentry, inode); 158 return exofs_add_nondir(dentry, inode);
159} 159}
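ihold() replaces direct atomic_inc(&inode->i_count) so that i_count is no longer touched outside the VFS. In the series that introduced it, the helper is essentially the following (a sketch, not the exofs code):

void ihold(struct inode *inode)
{
	/* must only be called on an inode that already holds a reference */
	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}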
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 047e92fa3af8..8c6c4669b381 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
150 return &oi->vfs_inode; 150 return &oi->vfs_inode;
151} 151}
152 152
153static void exofs_i_callback(struct rcu_head *head)
154{
155 struct inode *inode = container_of(head, struct inode, i_rcu);
156 INIT_LIST_HEAD(&inode->i_dentry);
157 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
158}
159
153/* 160/*
154 * Remove an inode from the cache 161 * Remove an inode from the cache
155 */ 162 */
156static void exofs_destroy_inode(struct inode *inode) 163static void exofs_destroy_inode(struct inode *inode)
157{ 164{
158 kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); 165 call_rcu(&inode->i_rcu, exofs_i_callback);
159} 166}
160 167
161/* 168/*
@@ -659,19 +666,19 @@ free_bdi:
659/* 666/*
660 * Set up the superblock (calls exofs_fill_super eventually) 667 * Set up the superblock (calls exofs_fill_super eventually)
661 */ 668 */
662static int exofs_get_sb(struct file_system_type *type, 669static struct dentry *exofs_mount(struct file_system_type *type,
663 int flags, const char *dev_name, 670 int flags, const char *dev_name,
664 void *data, struct vfsmount *mnt) 671 void *data)
665{ 672{
666 struct exofs_mountopt opts; 673 struct exofs_mountopt opts;
667 int ret; 674 int ret;
668 675
669 ret = parse_options(data, &opts); 676 ret = parse_options(data, &opts);
670 if (ret) 677 if (ret)
671 return ret; 678 return ERR_PTR(ret);
672 679
673 opts.dev_name = dev_name; 680 opts.dev_name = dev_name;
674 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt); 681 return mount_nodev(type, flags, &opts, exofs_fill_super);
675} 682}
676 683
677/* 684/*
@@ -809,7 +816,7 @@ static const struct export_operations exofs_export_ops = {
809static struct file_system_type exofs_type = { 816static struct file_system_type exofs_type = {
810 .owner = THIS_MODULE, 817 .owner = THIS_MODULE,
811 .name = "exofs", 818 .name = "exofs",
812 .get_sb = exofs_get_sb, 819 .mount = exofs_mount,
813 .kill_sb = generic_shutdown_super, 820 .kill_sb = generic_shutdown_super,
814}; 821};
815 822
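The exofs_get_sb to exofs_mount change follows the generic pattern of the .get_sb to .mount conversion: the method now returns the root dentry (or an ERR_PTR) instead of filling in a vfsmount, and get_sb_nodev/get_sb_bdev become mount_nodev/mount_bdev. A skeleton of the conversion for a hypothetical filesystem (examplefs and examplefs_parse_options are illustrative names, not a real API):

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	/* parse options first; errors become ERR_PTR() values */
	int err = examplefs_parse_options(data);

	if (err)
		return ERR_PTR(err);

	return mount_bdev(fs_type, flags, dev_name, data,
			  examplefs_fill_super);
}

static struct file_system_type examplefs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};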
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a63..4b6825740dd5 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -43,24 +43,26 @@ find_acceptable_alias(struct dentry *result,
43 void *context) 43 void *context)
44{ 44{
45 struct dentry *dentry, *toput = NULL; 45 struct dentry *dentry, *toput = NULL;
46 struct inode *inode;
46 47
47 if (acceptable(context, result)) 48 if (acceptable(context, result))
48 return result; 49 return result;
49 50
50 spin_lock(&dcache_lock); 51 inode = result->d_inode;
51 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { 52 spin_lock(&inode->i_lock);
52 dget_locked(dentry); 53 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
53 spin_unlock(&dcache_lock); 54 dget(dentry);
55 spin_unlock(&inode->i_lock);
54 if (toput) 56 if (toput)
55 dput(toput); 57 dput(toput);
56 if (dentry != result && acceptable(context, dentry)) { 58 if (dentry != result && acceptable(context, dentry)) {
57 dput(result); 59 dput(result);
58 return dentry; 60 return dentry;
59 } 61 }
60 spin_lock(&dcache_lock); 62 spin_lock(&inode->i_lock);
61 toput = dentry; 63 toput = dentry;
62 } 64 }
63 spin_unlock(&dcache_lock); 65 spin_unlock(&inode->i_lock);
64 66
65 if (toput) 67 if (toput)
66 dput(toput); 68 dput(toput);
@@ -74,21 +76,20 @@ static struct dentry *
74find_disconnected_root(struct dentry *dentry) 76find_disconnected_root(struct dentry *dentry)
75{ 77{
76 dget(dentry); 78 dget(dentry);
77 spin_lock(&dentry->d_lock); 79 while (!IS_ROOT(dentry)) {
78 while (!IS_ROOT(dentry) && 80 struct dentry *parent = dget_parent(dentry);
79 (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) { 81
80 struct dentry *parent = dentry->d_parent; 82 if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
81 dget(parent); 83 dput(parent);
82 spin_unlock(&dentry->d_lock); 84 break;
85 }
86
83 dput(dentry); 87 dput(dentry);
84 dentry = parent; 88 dentry = parent;
85 spin_lock(&dentry->d_lock);
86 } 89 }
87 spin_unlock(&dentry->d_lock);
88 return dentry; 90 return dentry;
89} 91}
90 92
91
92/* 93/*
93 * Make sure target_dir is fully connected to the dentry tree. 94 * Make sure target_dir is fully connected to the dentry tree.
94 * 95 *
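The rewritten find_disconnected_root() shows the lock-light way to walk toward the root: instead of holding d_lock across iterations, each step takes a stable reference on the parent with dget_parent() and drops the child. The same pattern in isolation (a sketch; the predicate name is illustrative):

/* Walk up from dentry while the parent satisfies keep_climbing();
 * returns a referenced dentry that the caller must dput(). */
static struct dentry *climb_while(struct dentry *dentry,
				  bool (*keep_climbing)(struct dentry *))
{
	dget(dentry);
	while (!IS_ROOT(dentry)) {
		struct dentry *parent = dget_parent(dentry);

		if (!keep_climbing(parent)) {
			dput(parent);
			break;
		}
		dput(dentry);
		dentry = parent;
	}
	return dentry;
}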
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 2bcc0431bada..7b4180554a62 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -232,10 +232,17 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
232} 232}
233 233
234int 234int
235ext2_check_acl(struct inode *inode, int mask) 235ext2_check_acl(struct inode *inode, int mask, unsigned int flags)
236{ 236{
237 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); 237 struct posix_acl *acl;
238
239 if (flags & IPERM_FLAG_RCU) {
240 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
241 return -ECHILD;
242 return -EAGAIN;
243 }
238 244
245 acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
239 if (IS_ERR(acl)) 246 if (IS_ERR(acl))
240 return PTR_ERR(acl); 247 return PTR_ERR(acl);
241 if (acl) { 248 if (acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 3ff6cbb9ac44..c939b7b12099 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size)
54#ifdef CONFIG_EXT2_FS_POSIX_ACL 54#ifdef CONFIG_EXT2_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext2_check_acl (struct inode *, int); 57extern int ext2_check_acl (struct inode *, int, unsigned int);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_acl_chmod (struct inode *);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
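Both the ext2 and (below) ext3 variants of check_acl grow the same RCU-walk protocol: when called with IPERM_FLAG_RCU the function may not block, so it can only answer from the ACL cache. It returns -ECHILD to drop out of RCU walk when an ACL may exist, or -EAGAIN when the cache says there is no ACL, letting the ordinary mode bits decide. The shape of the pattern, filesystem-independent (a sketch built around the same generic helpers the hunk uses; examplefs_get_acl is an illustrative name):

static int examplefs_check_acl(struct inode *inode, int mask,
			       unsigned int flags)
{
	struct posix_acl *acl;

	if (flags & IPERM_FLAG_RCU) {
		/* cannot sleep: only the negative-cache lookup is allowed */
		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
			return -ECHILD;	/* unknown, retry in ref-walk mode */
		return -EAGAIN;		/* no ACL cached, use mode bits */
	}

	acl = examplefs_get_acl(inode, ACL_TYPE_ACCESS);	/* may block */
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (acl) {
		int error = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
	return -EAGAIN;
}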
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index c6c684b44ea1..0d06f4e75699 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -646,10 +646,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
646 return here; 646 return here;
647} 647}
648 648
649/* 649/**
650 * ext2_try_to_allocate() 650 * ext2_try_to_allocate()
651 * @sb: superblock 651 * @sb: superblock
652 * @handle: handle to this transaction
653 * @group: given allocation block group 652 * @group: given allocation block group
654 * @bitmap_bh: bufferhead holds the block bitmap 653 * @bitmap_bh: bufferhead holds the block bitmap
655 * @grp_goal: given target block within the group 654 * @grp_goal: given target block within the group
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 764109886ec0..47cda410b548 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -28,21 +28,30 @@
28 28
29typedef struct ext2_dir_entry_2 ext2_dirent; 29typedef struct ext2_dir_entry_2 ext2_dirent;
30 30
31/*
32 * Tests against MAX_REC_LEN etc were put in place for 64k block
33 * sizes; if that is not possible on this arch, we can skip
34 * those tests and speed things up.
35 */
31static inline unsigned ext2_rec_len_from_disk(__le16 dlen) 36static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
32{ 37{
33 unsigned len = le16_to_cpu(dlen); 38 unsigned len = le16_to_cpu(dlen);
34 39
40#if (PAGE_CACHE_SIZE >= 65536)
35 if (len == EXT2_MAX_REC_LEN) 41 if (len == EXT2_MAX_REC_LEN)
36 return 1 << 16; 42 return 1 << 16;
43#endif
37 return len; 44 return len;
38} 45}
39 46
40static inline __le16 ext2_rec_len_to_disk(unsigned len) 47static inline __le16 ext2_rec_len_to_disk(unsigned len)
41{ 48{
49#if (PAGE_CACHE_SIZE >= 65536)
42 if (len == (1 << 16)) 50 if (len == (1 << 16))
43 return cpu_to_le16(EXT2_MAX_REC_LEN); 51 return cpu_to_le16(EXT2_MAX_REC_LEN);
44 else 52 else
45 BUG_ON(len > (1 << 16)); 53 BUG_ON(len > (1 << 16));
54#endif
46 return cpu_to_le16(len); 55 return cpu_to_le16(len);
47} 56}
48 57
@@ -98,7 +107,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
98 if (IS_DIRSYNC(dir)) { 107 if (IS_DIRSYNC(dir)) {
99 err = write_one_page(page, 1); 108 err = write_one_page(page, 1);
100 if (!err) 109 if (!err)
101 err = ext2_sync_inode(dir); 110 err = sync_inode_metadata(dir, 1);
102 } else { 111 } else {
103 unlock_page(page); 112 unlock_page(page);
104 } 113 }
@@ -129,15 +138,15 @@ static void ext2_check_page(struct page *page, int quiet)
129 p = (ext2_dirent *)(kaddr + offs); 138 p = (ext2_dirent *)(kaddr + offs);
130 rec_len = ext2_rec_len_from_disk(p->rec_len); 139 rec_len = ext2_rec_len_from_disk(p->rec_len);
131 140
132 if (rec_len < EXT2_DIR_REC_LEN(1)) 141 if (unlikely(rec_len < EXT2_DIR_REC_LEN(1)))
133 goto Eshort; 142 goto Eshort;
134 if (rec_len & 3) 143 if (unlikely(rec_len & 3))
135 goto Ealign; 144 goto Ealign;
136 if (rec_len < EXT2_DIR_REC_LEN(p->name_len)) 145 if (unlikely(rec_len < EXT2_DIR_REC_LEN(p->name_len)))
137 goto Enamelen; 146 goto Enamelen;
138 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) 147 if (unlikely(((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)))
139 goto Espan; 148 goto Espan;
140 if (le32_to_cpu(p->inode) > max_inumber) 149 if (unlikely(le32_to_cpu(p->inode) > max_inumber))
141 goto Einumber; 150 goto Einumber;
142 } 151 }
143 if (offs != limit) 152 if (offs != limit)
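The PAGE_CACHE_SIZE >= 65536 guards matter because a 64 KiB directory block cannot store its own length in the 16-bit rec_len field, so EXT2_MAX_REC_LEN serves as an escape value; on architectures with smaller pages the checks are dead code and can be compiled out. A runnable userspace round-trip of the encoding (little-endian assumed for brevity; EXT2_MAX_REC_LEN is 65535, as in the kernel headers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXT2_MAX_REC_LEN ((1 << 16) - 1)	/* 65535 */

static unsigned rec_len_from_disk(uint16_t dlen)
{
	return dlen == EXT2_MAX_REC_LEN ? (1 << 16) : dlen;
}

static uint16_t rec_len_to_disk(unsigned len)
{
	assert(len <= (1 << 16));
	return len == (1 << 16) ? EXT2_MAX_REC_LEN : (uint16_t)len;
}

int main(void)
{
	unsigned lens[] = { 12, 4096, 65532, 1 << 16 };

	for (unsigned i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		printf("%u -> 0x%04x -> %u\n", lens[i],
		       rec_len_to_disk(lens[i]),
		       rec_len_from_disk(rec_len_to_disk(lens[i])));
	return 0;
}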
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 416daa62242c..6346a2acf326 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
120extern struct inode *ext2_iget (struct super_block *, unsigned long); 120extern struct inode *ext2_iget (struct super_block *, unsigned long);
121extern int ext2_write_inode (struct inode *, struct writeback_control *); 121extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_evict_inode(struct inode *); 122extern void ext2_evict_inode(struct inode *);
123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 123extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern int ext2_setattr (struct dentry *, struct iattr *); 124extern int ext2_setattr (struct dentry *, struct iattr *);
126extern void ext2_set_inode_flags(struct inode *inode); 125extern void ext2_set_inode_flags(struct inode *inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 940c96168868..40ad210a5049 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -458,7 +458,7 @@ failed_out:
458 * the same format as ext2_get_branch() would do. We are calling it after 458 * the same format as ext2_get_branch() would do. We are calling it after
459 * we had read the existing part of chain and partial points to the last 459 * we had read the existing part of chain and partial points to the last
460 * triple of that (one with zero ->key). Upon the exit we have the same 460 * triple of that (one with zero ->key). Upon the exit we have the same
461 * picture as after the successful ext2_get_block(), excpet that in one 461 * picture as after the successful ext2_get_block(), except that in one
462 * place chain is disconnected - *branch->p is still zero (we did not 462 * place chain is disconnected - *branch->p is still zero (we did not
463 * set the last link), but branch->key contains the number that should 463 * set the last link), but branch->key contains the number that should
464 * be placed into *branch->p to fill that gap. 464 * be placed into *branch->p to fill that gap.
@@ -662,7 +662,7 @@ static int ext2_get_blocks(struct inode *inode,
662 mutex_lock(&ei->truncate_mutex); 662 mutex_lock(&ei->truncate_mutex);
663 /* 663 /*
664 * If the indirect block is missing while we are reading 664 * If the indirect block is missing while we are reading
665 * the chain(ext3_get_branch() returns -EAGAIN err), or 665 * the chain(ext2_get_branch() returns -EAGAIN err), or
666 * if the chain has been changed after we grab the semaphore, 666 * if the chain has been changed after we grab the semaphore,
667 * (either because another process truncated this branch, or 667 * (either because another process truncated this branch, or
668 * another get_block allocated this branch) re-grab the chain to see if 668 * another get_block allocated this branch) re-grab the chain to see if
@@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1204 if (inode_needs_sync(inode)) { 1204 if (inode_needs_sync(inode)) {
1205 sync_mapping_buffers(inode->i_mapping); 1205 sync_mapping_buffers(inode->i_mapping);
1206 ext2_sync_inode (inode); 1206 sync_inode_metadata(inode, 1);
1207 } else { 1207 } else {
1208 mark_inode_dirty(inode); 1208 mark_inode_dirty(inode);
1209 } 1209 }
@@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1524} 1524}
1525 1525
1526int ext2_sync_inode(struct inode *inode)
1527{
1528 struct writeback_control wbc = {
1529 .sync_mode = WB_SYNC_ALL,
1530 .nr_to_write = 0, /* sys_fsync did this */
1531 };
1532 return sync_inode(inode, &wbc);
1533}
1534
1535int ext2_setattr(struct dentry *dentry, struct iattr *iattr) 1526int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1536{ 1527{
1537 struct inode *inode = dentry->d_inode; 1528 struct inode *inode = dentry->d_inode;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f2..2e1d8341d827 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -67,7 +67,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
67 inode = NULL; 67 inode = NULL;
68 if (ino) { 68 if (ino) {
69 inode = ext2_iget(dir->i_sb, ino); 69 inode = ext2_iget(dir->i_sb, ino);
70 if (unlikely(IS_ERR(inode))) { 70 if (IS_ERR(inode)) {
71 if (PTR_ERR(inode) == -ESTALE) { 71 if (PTR_ERR(inode) == -ESTALE) {
72 ext2_error(dir->i_sb, __func__, 72 ext2_error(dir->i_sb, __func__,
73 "deleted inode referenced: %lu", 73 "deleted inode referenced: %lu",
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
206 206
207 inode->i_ctime = CURRENT_TIME_SEC; 207 inode->i_ctime = CURRENT_TIME_SEC;
208 inode_inc_link_count(inode); 208 inode_inc_link_count(inode);
209 atomic_inc(&inode->i_count); 209 ihold(inode);
210 210
211 err = ext2_add_link(dentry, inode); 211 err = ext2_add_link(dentry, inode);
212 if (!err) { 212 if (!err) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1ec602673ea8..7731695e65d9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -43,9 +43,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data);
43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 43static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
44static int ext2_sync_fs(struct super_block *sb, int wait); 44static int ext2_sync_fs(struct super_block *sb, int wait);
45 45
46void ext2_error (struct super_block * sb, const char * function, 46void ext2_error(struct super_block *sb, const char *function,
47 const char * fmt, ...) 47 const char *fmt, ...)
48{ 48{
49 struct va_format vaf;
49 va_list args; 50 va_list args;
50 struct ext2_sb_info *sbi = EXT2_SB(sb); 51 struct ext2_sb_info *sbi = EXT2_SB(sb);
51 struct ext2_super_block *es = sbi->s_es; 52 struct ext2_super_block *es = sbi->s_es;
@@ -59,9 +60,13 @@ void ext2_error (struct super_block * sb, const char * function,
59 } 60 }
60 61
61 va_start(args, fmt); 62 va_start(args, fmt);
62 printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function); 63
63 vprintk(fmt, args); 64 vaf.fmt = fmt;
64 printk("\n"); 65 vaf.va = &args;
66
67 printk(KERN_CRIT "EXT2-fs (%s): error: %s: %pV\n",
68 sb->s_id, function, &vaf);
69
65 va_end(args); 70 va_end(args);
66 71
67 if (test_opt(sb, ERRORS_PANIC)) 72 if (test_opt(sb, ERRORS_PANIC))
@@ -76,12 +81,16 @@ void ext2_error (struct super_block * sb, const char * function,
76void ext2_msg(struct super_block *sb, const char *prefix, 81void ext2_msg(struct super_block *sb, const char *prefix,
77 const char *fmt, ...) 82 const char *fmt, ...)
78{ 83{
84 struct va_format vaf;
79 va_list args; 85 va_list args;
80 86
81 va_start(args, fmt); 87 va_start(args, fmt);
82 printk("%sEXT2-fs (%s): ", prefix, sb->s_id); 88
83 vprintk(fmt, args); 89 vaf.fmt = fmt;
84 printk("\n"); 90 vaf.va = &args;
91
92 printk("%sEXT2-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
93
85 va_end(args); 94 va_end(args);
86} 95}
87 96
@@ -161,11 +170,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
161 return &ei->vfs_inode; 170 return &ei->vfs_inode;
162} 171}
163 172
164static void ext2_destroy_inode(struct inode *inode) 173static void ext2_i_callback(struct rcu_head *head)
165{ 174{
175 struct inode *inode = container_of(head, struct inode, i_rcu);
176 INIT_LIST_HEAD(&inode->i_dentry);
166 kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); 177 kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
167} 178}
168 179
180static void ext2_destroy_inode(struct inode *inode)
181{
182 call_rcu(&inode->i_rcu, ext2_i_callback);
183}
184
169static void init_once(void *foo) 185static void init_once(void *foo)
170{ 186{
171 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; 187 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
@@ -747,15 +763,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
747 __le32 features; 763 __le32 features;
748 int err; 764 int err;
749 765
766 err = -ENOMEM;
750 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 767 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
751 if (!sbi) 768 if (!sbi)
752 return -ENOMEM; 769 goto failed_unlock;
753 770
754 sbi->s_blockgroup_lock = 771 sbi->s_blockgroup_lock =
755 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 772 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
756 if (!sbi->s_blockgroup_lock) { 773 if (!sbi->s_blockgroup_lock) {
757 kfree(sbi); 774 kfree(sbi);
758 return -ENOMEM; 775 goto failed_unlock;
759 } 776 }
760 sb->s_fs_info = sbi; 777 sb->s_fs_info = sbi;
761 sbi->s_sb_block = sb_block; 778 sbi->s_sb_block = sb_block;
@@ -1107,6 +1124,7 @@ failed_sbi:
1107 sb->s_fs_info = NULL; 1124 sb->s_fs_info = NULL;
1108 kfree(sbi->s_blockgroup_lock); 1125 kfree(sbi->s_blockgroup_lock);
1109 kfree(sbi); 1126 kfree(sbi);
1127failed_unlock:
1110 return ret; 1128 return ret;
1111} 1129}
1112 1130
@@ -1219,9 +1237,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1219 } 1237 }
1220 1238
1221 es = sbi->s_es; 1239 es = sbi->s_es;
1222 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1240 if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
1223 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1224 invalidate_inodes(sb)) {
1225 ext2_msg(sb, KERN_WARNING, "warning: refusing change of " 1241 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1226 "xip flag with busy inodes while remounting"); 1242 "xip flag with busy inodes while remounting");
1227 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1243 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
@@ -1356,10 +1372,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1356 return 0; 1372 return 0;
1357} 1373}
1358 1374
1359static int ext2_get_sb(struct file_system_type *fs_type, 1375static struct dentry *ext2_mount(struct file_system_type *fs_type,
1360 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1376 int flags, const char *dev_name, void *data)
1361{ 1377{
1362 return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt); 1378 return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
1363} 1379}
1364 1380
1365#ifdef CONFIG_QUOTA 1381#ifdef CONFIG_QUOTA
@@ -1473,7 +1489,7 @@ out:
1473static struct file_system_type ext2_fs_type = { 1489static struct file_system_type ext2_fs_type = {
1474 .owner = THIS_MODULE, 1490 .owner = THIS_MODULE,
1475 .name = "ext2", 1491 .name = "ext2",
1476 .get_sb = ext2_get_sb, 1492 .mount = ext2_mount,
1477 .kill_sb = kill_block_super, 1493 .kill_sb = kill_block_super,
1478 .fs_flags = FS_REQUIRES_DEV, 1494 .fs_flags = FS_REQUIRES_DEV,
1479}; 1495};
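The ext2_error/ext2_msg rewrite is the standard conversion to %pV: rather than splitting one message across several printk calls, which can interleave with other CPUs' output, the va_list is wrapped in a struct va_format and emitted in a single printk. The core of the pattern (a sketch of a generic helper; the prefix string is illustrative):

#include <linux/kernel.h>

static void examplefs_msg(const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* one printk, so the line cannot be interleaved mid-message */
	printk("%sEXAMPLEFS: %pV\n", prefix, &vaf);
	va_end(args);
}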
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8c29ae15129e..c2e4dce984d2 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -199,14 +199,6 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
199 goto found; 199 goto found;
200 entry = next; 200 entry = next;
201 } 201 }
202 /* Check the remaining name entries */
203 while (!IS_LAST_ENTRY(entry)) {
204 struct ext2_xattr_entry *next =
205 EXT2_XATTR_NEXT(entry);
206 if ((char *)next >= end)
207 goto bad_block;
208 entry = next;
209 }
210 if (ext2_xattr_cache_insert(bh)) 202 if (ext2_xattr_cache_insert(bh))
211 ea_idebug(inode, "cache insert failed"); 203 ea_idebug(inode, "cache insert failed");
212 error = -ENODATA; 204 error = -ENODATA;
@@ -355,7 +347,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
355/* 347/*
356 * ext2_xattr_set() 348 * ext2_xattr_set()
357 * 349 *
358 * Create, replace or remove an extended attribute for this inode. Buffer 350 * Create, replace or remove an extended attribute for this inode. Value
359 * is NULL to remove an existing extended attribute, and non-NULL to 351 * is NULL to remove an existing extended attribute, and non-NULL to
360 * either replace an existing extended attribute, or create a new extended 352 * either replace an existing extended attribute, or create a new extended
361 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 353 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
@@ -699,7 +691,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
699 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; 691 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
700 inode->i_ctime = CURRENT_TIME_SEC; 692 inode->i_ctime = CURRENT_TIME_SEC;
701 if (IS_SYNC(inode)) { 693 if (IS_SYNC(inode)) {
702 error = ext2_sync_inode (inode); 694 error = sync_inode_metadata(inode, 1);
703 /* In case sync failed due to ENOSPC the inode was actually 695 /* In case sync failed due to ENOSPC the inode was actually
704 * written (only some dirty data were not) so we just proceed 696 * written (only some dirty data were not) so we just proceed
705 * as if nothing happened and cleanup the unused block */ 697 * as if nothing happened and cleanup the unused block */
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 8a11fe212183..e4fa49e6c539 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -240,10 +240,17 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
240} 240}
241 241
242int 242int
243ext3_check_acl(struct inode *inode, int mask) 243ext3_check_acl(struct inode *inode, int mask, unsigned int flags)
244{ 244{
245 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); 245 struct posix_acl *acl;
246
247 if (flags & IPERM_FLAG_RCU) {
248 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
249 return -ECHILD;
250 return -EAGAIN;
251 }
246 252
253 acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
247 if (IS_ERR(acl)) 254 if (IS_ERR(acl))
248 return PTR_ERR(acl); 255 return PTR_ERR(acl);
249 if (acl) { 256 if (acl) {
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 597334626de9..5faf8048e906 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size)
54#ifdef CONFIG_EXT3_FS_POSIX_ACL 54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext3_check_acl (struct inode *, int); 57extern int ext3_check_acl (struct inode *, int, unsigned int);
58extern int ext3_acl_chmod (struct inode *); 58extern int ext3_acl_chmod (struct inode *);
59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); 59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
60 60
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 4a32511f4ded..045995c8ce5a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,7 @@
20#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
21#include <linux/quotaops.h> 21#include <linux/quotaops.h>
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/blkdev.h>
23 24
24/* 25/*
25 * balloc.c contains the blocks allocation and deallocation routines 26 * balloc.c contains the blocks allocation and deallocation routines
@@ -39,6 +40,21 @@
39 40
40#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 41#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
41 42
43/*
44 * Calculate the block group number and offset, given a block number
45 */
46static void ext3_get_group_no_and_offset(struct super_block *sb,
47 ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
48{
49 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
50
51 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
52 if (offsetp)
53 *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
54 if (blockgrpp)
55 *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
56}
57
42/** 58/**
43 * ext3_get_group_desc() -- load group descriptor from disk 59 * ext3_get_group_desc() -- load group descriptor from disk
44 * @sb: super block 60 * @sb: super block
@@ -792,9 +808,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
792 if (here < 0) 808 if (here < 0)
793 here = 0; 809 here = 0;
794 810
795 p = ((char *)bh->b_data) + (here >> 3); 811 p = bh->b_data + (here >> 3);
796 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); 812 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
797 next = (r - ((char *)bh->b_data)) << 3; 813 next = (r - bh->b_data) << 3;
798 814
799 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) 815 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
800 return next; 816 return next;
@@ -810,8 +826,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
810 826
811/** 827/**
812 * claim_block() 828 * claim_block()
829 * @lock: the spin lock for this block group
813 * @block: the free block (group relative) to allocate 830 * @block: the free block (group relative) to allocate
814 * @bh: the bufferhead containts the block group bitmap 831 * @bh: the buffer_head contains the block group bitmap
815 * 832 *
816 * We think we can allocate this block in this bitmap. Try to set the bit. 833 * We think we can allocate this block in this bitmap. Try to set the bit.
817 * If that succeeds then check that nobody has allocated and then freed the 834 * If that succeeds then check that nobody has allocated and then freed the
@@ -956,9 +973,11 @@ fail_access:
956 * but we will shift to the place where start_block is, 973 * but we will shift to the place where start_block is,
957 * then start from there, when looking for a reservable space. 974 * then start from there, when looking for a reservable space.
958 * 975 *
959 * @size: the target new reservation window size 976 * @my_rsv: the reservation window
977 *
978 * @sb: the super block
960 * 979 *
961 * @group_first_block: the first block we consider to start 980 * @start_block: the first block we consider to start
962 * the real search from 981 * the real search from
963 * 982 *
964 * @last_block: 983 * @last_block:
@@ -1084,7 +1103,7 @@ static int find_next_reservable_window(
1084 * 1103 *
1085 * failed: we failed to find a reservation window in this group 1104 * failed: we failed to find a reservation window in this group
1086 * 1105 *
1087 * @rsv: the reservation 1106 * @my_rsv: the reservation window
1088 * 1107 *
1089 * @grp_goal: The goal (group-relative). It is where the search for a 1108 * @grp_goal: The goal (group-relative). It is where the search for a
1090 * free reservable space should start from. 1109 * free reservable space should start from.
@@ -1273,8 +1292,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1273 * @group: given allocation block group 1292 * @group: given allocation block group
1274 * @bitmap_bh: bufferhead holds the block bitmap 1293 * @bitmap_bh: bufferhead holds the block bitmap
1275 * @grp_goal: given target block within the group 1294 * @grp_goal: given target block within the group
1276 * @count: target number of blocks to allocate
1277 * @my_rsv: reservation window 1295 * @my_rsv: reservation window
1296 * @count: target number of blocks to allocate
1278 * @errp: pointer to store the error code 1297 * @errp: pointer to store the error code
1279 * 1298 *
1280 * This is the main function used to allocate a new block and its reservation 1299 * This is the main function used to allocate a new block and its reservation
@@ -1882,3 +1901,253 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1882 return ext3_bg_num_gdb_meta(sb,group); 1901 return ext3_bg_num_gdb_meta(sb,group);
1883 1902
1884} 1903}
1904
1905/**
1906 * ext3_trim_all_free -- function to trim all free space in alloc. group
1907 * @sb: super block for file system
1908 * @group: allocation group to trim
1909 * @start: first group block to examine
1910 * @max: last group block to examine
1911 * @gdp: allocation group description structure
1912 * @minblocks: minimum extent block count
1913 *
1914 * ext3_trim_all_free walks through the group's block bitmap searching for
1915 * free blocks. When a free block is found, it tries to allocate this block and
1916 * consecutive free blocks to get the biggest free extent possible, until it
1917 * reaches any used block. It then issues a TRIM command on this extent and
1918 * frees the extent in the block bitmap. This is done until the whole group is scanned.
1919 */
1920ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1921 ext3_grpblk_t start, ext3_grpblk_t max,
1922 ext3_grpblk_t minblocks)
1923{
1924 handle_t *handle;
1925 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
1926 ext3_fsblk_t discard_block;
1927 struct ext3_sb_info *sbi;
1928 struct buffer_head *gdp_bh, *bitmap_bh = NULL;
1929 struct ext3_group_desc *gdp;
1930 int err = 0, ret = 0;
1931
1932 /*
1933 * We will update one block bitmap, and one group descriptor
1934 */
1935 handle = ext3_journal_start_sb(sb, 2);
1936 if (IS_ERR(handle))
1937 return PTR_ERR(handle);
1938
1939 bitmap_bh = read_block_bitmap(sb, group);
1940 if (!bitmap_bh) {
1941 err = -EIO;
1942 goto err_out;
1943 }
1944
1945 BUFFER_TRACE(bitmap_bh, "getting undo access");
1946 err = ext3_journal_get_undo_access(handle, bitmap_bh);
1947 if (err)
1948 goto err_out;
1949
1950 gdp = ext3_get_group_desc(sb, group, &gdp_bh);
1951 if (!gdp) {
1952 err = -EIO;
1953 goto err_out;
1954 }
1955
1956 BUFFER_TRACE(gdp_bh, "get_write_access");
1957 err = ext3_journal_get_write_access(handle, gdp_bh);
1958 if (err)
1959 goto err_out;
1960
1961 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1962 sbi = EXT3_SB(sb);
1963
1964 /* Walk through the whole group */
1965 while (start < max) {
1966 start = bitmap_search_next_usable_block(start, bitmap_bh, max);
1967 if (start < 0)
1968 break;
1969 next = start;
1970
1971 /*
1972 * Allocate contiguous free extents by setting bits in the
1973 * block bitmap
1974 */
1975 while (next < max
1976 && claim_block(sb_bgl_lock(sbi, group),
1977 next, bitmap_bh)) {
1978 next++;
1979 }
1980
1981 /* We did not claim any blocks */
1982 if (next == start)
1983 continue;
1984
1985 discard_block = (ext3_fsblk_t)start +
1986 ext3_group_first_block_no(sb, group);
1987
1988 /* Update counters */
1989 spin_lock(sb_bgl_lock(sbi, group));
1990 le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993
1994 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks)
1996 goto free_extent;
1997
1998 /* Send the TRIM command down to the device */
1999 err = sb_issue_discard(sb, discard_block, next - start,
2000 GFP_NOFS, 0);
2001 count += (next - start);
2002free_extent:
2003 freed = 0;
2004
2005 /*
2006 * Clear bits in the bitmap
2007 */
2008 for (bit = start; bit < next; bit++) {
2009 BUFFER_TRACE(bitmap_bh, "clear bit");
2010 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
2011 bit, bitmap_bh->b_data)) {
2012 ext3_error(sb, __func__,
2013 "bit already cleared for block "E3FSBLK,
2014 (unsigned long)bit);
2015 BUFFER_TRACE(bitmap_bh, "bit already cleared");
2016 } else {
2017 freed++;
2018 }
2019 }
2020
2021 /* Update counters */
2022 spin_lock(sb_bgl_lock(sbi, group));
2023 le16_add_cpu(&gdp->bg_free_blocks_count, freed);
2024 spin_unlock(sb_bgl_lock(sbi, group));
2025 percpu_counter_add(&sbi->s_freeblocks_counter, freed);
2026
2027 start = next;
2028 if (err < 0) {
2029 if (err != -EOPNOTSUPP)
2030 ext3_warning(sb, __func__, "Discard command "
2031 "returned error %d\n", err);
2032 break;
2033 }
2034
2035 if (fatal_signal_pending(current)) {
2036 err = -ERESTARTSYS;
2037 break;
2038 }
2039
2040 cond_resched();
2041
2042 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks)
2044 break;
2045 }
2046
2047 /* We dirtied the bitmap block */
2048 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2049 ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
2050 if (!err)
2051 err = ret;
2052
2053 /* And the group descriptor block */
2054 BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
2055 ret = ext3_journal_dirty_metadata(handle, gdp_bh);
2056 if (!err)
2057 err = ret;
2058
2059 ext3_debug("trimmed %d blocks in the group %d\n",
2060 count, group);
2061
2062err_out:
2063 if (err)
2064 count = err;
2065 ext3_journal_stop(handle);
2066 brelse(bitmap_bh);
2067
2068 return count;
2069}
2070
2071/**
2072 * ext3_trim_fs() -- trim ioctl handle function
2073 * @sb: superblock for filesystem
2074 * @range: fstrim_range structure describing the region to trim:
2075 * start is the first Byte to trim, len the number of Bytes to
2076 * trim from start, and minlen the minimum extent length in Bytes
2077 *
2078 * ext3_trim_fs goes through all allocation groups containing Bytes from
2079 * start to start+len. For each such group the ext3_trim_all_free function
2080 * is invoked to trim all free space.
2081 */
2082int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2083{
2084 ext3_grpblk_t last_block, first_block, free_blocks;
2085 unsigned long first_group, last_group;
2086 unsigned long group, ngroups;
2087 struct ext3_group_desc *gdp;
2088 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2089 uint64_t start, len, minlen, trimmed;
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0;
2092
2093 start = range->start >> sb->s_blocksize_bits;
2094 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0;
2097
2098 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2099 return -EINVAL;
2100 if (start >= max_blks)
2101 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks)
2107 len = max_blks - start;
2108
2109 ngroups = EXT3_SB(sb)->s_groups_count;
2110 smp_rmb();
2111
2112 /* Determine first and last group to examine based on start and len */
2113 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
2114 &first_group, &first_block);
2115 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
2116 &last_group, &last_block);
2117 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
2118 last_block = EXT3_BLOCKS_PER_GROUP(sb);
2119
2120 if (first_group > last_group)
2121 return -EINVAL;
2122
2123 for (group = first_group; group <= last_group; group++) {
2124 gdp = ext3_get_group_desc(sb, group, NULL);
2125 if (!gdp)
2126 break;
2127
2128 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
2129 if (free_blocks < minlen)
2130 continue;
2131
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb))
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
2134 else
2135 last_block = first_block + len;
2136
2137 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen);
2139 if (ret < 0)
2140 break;
2141
2142 trimmed += ret;
2143 first_block = 0;
2144 }
2145
2146 if (ret >= 0)
2147 ret = 0;
2148
2149out:
2150 range->len = trimmed * sb->s_blocksize;
2151
2152 return ret;
2153}
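ext3_get_group_no_and_offset() above is plain div/mod arithmetic relative to s_first_data_block. A runnable userspace check of the same computation (the constants model a 1 KiB-block filesystem, where the first data block is 1 and there are 8192 blocks per group):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t first_data_block = 1;	/* 1 KiB blocks */
	const uint32_t blocks_per_group = 8192;
	uint64_t blocknr = 123456;

	uint64_t rel = blocknr - first_data_block;
	unsigned long group = rel / blocks_per_group;
	uint32_t offset = rel % blocks_per_group;

	printf("block %llu -> group %lu, offset %u\n",
	       (unsigned long long)blocknr, group, offset);
	/* prints: block 123456 -> group 15, offset 575 */
	return 0;
}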
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e2e72c367cf6..34f0a072b935 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -69,25 +69,26 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
69 const char * error_msg = NULL; 69 const char * error_msg = NULL;
70 const int rlen = ext3_rec_len_from_disk(de->rec_len); 70 const int rlen = ext3_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT3_DIR_REC_LEN(1)) 72 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
73 error_msg = "rec_len is smaller than minimal"; 73 error_msg = "rec_len is smaller than minimal";
74 else if (rlen % 4 != 0) 74 else if (unlikely(rlen % 4 != 0))
75 error_msg = "rec_len % 4 != 0"; 75 error_msg = "rec_len % 4 != 0";
76 else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) 76 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
77 error_msg = "rec_len is too small for name_len"; 77 error_msg = "rec_len is too small for name_len";
78 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 78 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
79 error_msg = "directory entry across blocks"; 79 error_msg = "directory entry across blocks";
80 else if (le32_to_cpu(de->inode) > 80 else if (unlikely(le32_to_cpu(de->inode) >
81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) 81 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (unlikely(error_msg != NULL))
85 ext3_error (dir->i_sb, function, 85 ext3_error (dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 (unsigned long) le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91
91 return error_msg == NULL ? 1 : 0; 92 return error_msg == NULL ? 1 : 0;
92} 93}
93 94
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a6..09b13bb34c94 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
90 * storage 90 * storage
91 */ 91 */
92 if (needs_barrier) 92 if (needs_barrier)
93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
94 BLKDEV_IFL_WAIT);
95 return ret; 94 return ret;
96} 95}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4ab72db3559e..9724aef22460 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -570,9 +570,14 @@ got:
570 ei->i_state_flags = 0; 570 ei->i_state_flags = 0;
571 ext3_set_inode_state(inode, EXT3_STATE_NEW); 571 ext3_set_inode_state(inode, EXT3_STATE_NEW);
572 572
573 ei->i_extra_isize = 573 /* See comment in ext3_iget for explanation */
574 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 574 if (ino >= EXT3_FIRST_INO(sb) + 1 &&
575 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 575 EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
576 ei->i_extra_isize =
577 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
578 } else {
579 ei->i_extra_isize = 0;
580 }
576 581
577 ret = inode; 582 ret = inode;
578 dquot_initialize(inode); 583 dquot_initialize(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4cda79..ae94f6d949f5 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -498,7 +498,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
498} 498}
499 499
500/** 500/**
501 * ext3_blks_to_allocate: Look up the block map and count the number 501 * ext3_blks_to_allocate - Look up the block map and count the number
502 * of direct blocks need to be allocated for the given branch. 502 * of direct blocks need to be allocated for the given branch.
503 * 503 *
504 * @branch: chain of indirect blocks 504 * @branch: chain of indirect blocks
@@ -536,14 +536,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
536} 536}
537 537
538/** 538/**
539 * ext3_alloc_blocks: multiple allocate blocks needed for a branch 539 * ext3_alloc_blocks - multiple allocate blocks needed for a branch
540 * @handle: handle for this transaction
541 * @inode: owner
542 * @goal: preferred place for allocation
540 * @indirect_blks: the number of blocks need to allocate for indirect 543 * @indirect_blks: the number of blocks need to allocate for indirect
541 * blocks 544 * blocks
542 * 545 * @blks: number of blocks need to allocated for direct blocks
543 * @new_blocks: on return it will store the new block numbers for 546 * @new_blocks: on return it will store the new block numbers for
544 * the indirect blocks(if needed) and the first direct block, 547 * the indirect blocks(if needed) and the first direct block,
545 * @blks: on return it will store the total number of allocated 548 * @err: here we store the error value
546 * direct blocks 549 *
550 * return the number of direct blocks allocated
547 */ 551 */
548static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, 552static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
549 ext3_fsblk_t goal, int indirect_blks, int blks, 553 ext3_fsblk_t goal, int indirect_blks, int blks,
@@ -598,9 +602,11 @@ failed_out:
598 602
599/** 603/**
600 * ext3_alloc_branch - allocate and set up a chain of blocks. 604 * ext3_alloc_branch - allocate and set up a chain of blocks.
605 * @handle: handle for this transaction
601 * @inode: owner 606 * @inode: owner
602 * @indirect_blks: number of allocated indirect blocks 607 * @indirect_blks: number of allocated indirect blocks
603 * @blks: number of allocated direct blocks 608 * @blks: number of allocated direct blocks
609 * @goal: preferred place for allocation
604 * @offsets: offsets (in the blocks) to store the pointers to next. 610 * @offsets: offsets (in the blocks) to store the pointers to next.
605 * @branch: place to store the chain in. 611 * @branch: place to store the chain in.
606 * 612 *
@@ -700,10 +706,9 @@ failed:
700 706
701/** 707/**
702 * ext3_splice_branch - splice the allocated branch onto inode. 708 * ext3_splice_branch - splice the allocated branch onto inode.
709 * @handle: handle for this transaction
703 * @inode: owner 710 * @inode: owner
704 * @block: (logical) number of block we are adding 711 * @block: (logical) number of block we are adding
705 * @chain: chain of indirect blocks (with a missing link - see
706 * ext3_alloc_branch)
707 * @where: location of missing link 712 * @where: location of missing link
708 * @num: number of indirect blocks we are adding 713 * @num: number of indirect blocks we are adding
709 * @blks: number of direct blocks we are adding 714 * @blks: number of direct blocks we are adding
@@ -1696,8 +1701,8 @@ static int ext3_journalled_writepage(struct page *page,
1696 * doesn't seem much point in redirtying the page here. 1701 * doesn't seem much point in redirtying the page here.
1697 */ 1702 */
1698 ClearPageChecked(page); 1703 ClearPageChecked(page);
1699 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 1704 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
1700 ext3_get_block); 1705 ext3_get_block);
1701 if (ret != 0) { 1706 if (ret != 0) {
1702 ext3_journal_stop(handle); 1707 ext3_journal_stop(handle);
1703 goto out_unlock; 1708 goto out_unlock;
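block_prepare_write() is gone; __block_write_begin() is its replacement from the write_begin/write_end consolidation in fs/buffer.c, and it takes the same (page, offset, length, get_block) arguments, so this is a rename at the call site. The declaration assumed here is roughly:

int __block_write_begin(struct page *page, loff_t pos, unsigned len,
			get_block_t *get_block);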
@@ -2140,13 +2145,15 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2140 if (try_to_extend_transaction(handle, inode)) { 2145 if (try_to_extend_transaction(handle, inode)) {
2141 if (bh) { 2146 if (bh) {
2142 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2147 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2143 ext3_journal_dirty_metadata(handle, bh); 2148 if (ext3_journal_dirty_metadata(handle, bh))
2149 return;
2144 } 2150 }
2145 ext3_mark_inode_dirty(handle, inode); 2151 ext3_mark_inode_dirty(handle, inode);
2146 truncate_restart_transaction(handle, inode); 2152 truncate_restart_transaction(handle, inode);
2147 if (bh) { 2153 if (bh) {
2148 BUFFER_TRACE(bh, "retaking write access"); 2154 BUFFER_TRACE(bh, "retaking write access");
2149 ext3_journal_get_write_access(handle, bh); 2155 if (ext3_journal_get_write_access(handle, bh))
2156 return;
2150 } 2157 }
2151 } 2158 }
2152 2159
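This hunk introduces the pattern most of the remaining ext3 changes repeat: the return values of ext3_journal_get_write_access() and ext3_journal_dirty_metadata() are checked instead of ignored, because a failure there usually means the journal has been aborted and further metadata updates can never commit. Distilled, for a hypothetical buffer bh:

err = ext3_journal_get_write_access(handle, bh);
if (err)
	return err;			/* journal likely aborted, stop here */
/* ... modify bh->b_data under the running handle ... */
err = ext3_journal_dirty_metadata(handle, bh);
if (err)
	ext3_std_error(sb, err);	/* log it and honor the errors= policy */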
@@ -2530,7 +2537,6 @@ void ext3_truncate(struct inode *inode)
2530 */ 2537 */
2531 } else { 2538 } else {
2532 /* Shared branch grows from an indirect block */ 2539 /* Shared branch grows from an indirect block */
2533 BUFFER_TRACE(partial->bh, "get_write_access");
2534 ext3_free_branches(handle, inode, partial->bh, 2540 ext3_free_branches(handle, inode, partial->bh,
2535 partial->p, 2541 partial->p,
2536 partial->p+1, (chain+n-1) - partial); 2542 partial->p+1, (chain+n-1) - partial);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 88974814783a..fc080dd561f7 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -276,7 +276,29 @@ group_add_out:
276 mnt_drop_write(filp->f_path.mnt); 276 mnt_drop_write(filp->f_path.mnt);
277 return err; 277 return err;
278 } 278 }
279 case FITRIM: {
279 280
281 struct super_block *sb = inode->i_sb;
282 struct fstrim_range range;
283 int ret = 0;
284
285 if (!capable(CAP_SYS_ADMIN))
286 return -EPERM;
287
288 if (copy_from_user(&range, (struct fstrim_range *)arg,
289 sizeof(range)))
290 return -EFAULT;
291
292 ret = ext3_trim_fs(sb, &range);
293 if (ret < 0)
294 return ret;
295
296 if (copy_to_user((struct fstrim_range *)arg, &range,
297 sizeof(range)))
298 return -EFAULT;
299
300 return 0;
301 }
280 302
281 default: 303 default:
282 return -ENOTTY; 304 return -ENOTTY;
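FITRIM is the new batched-discard entry point: userspace hands in an fstrim_range and the filesystem discards its free extents in that range, writing the number of trimmed bytes back into range.len; ext3_trim_fs() itself is implemented elsewhere in this series. A minimal caller, assuming a filesystem mounted at /mnt and CAP_SYS_ADMIN:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = UINT64_MAX,	/* whole filesystem */
		.minlen = 0,		/* no minimum extent size */
	};
	int fd = open("/mnt", O_RDONLY);

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}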
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2b35ddb70d65..b27ba71810ec 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -858,6 +858,7 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 858 struct buffer_head * bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 859 struct buffer_head * bh, *ret = NULL;
860 unsigned long start, block, b; 860 unsigned long start, block, b;
861 const u8 *name = entry->name;
861 int ra_max = 0; /* Number of bh's in the readahead 862 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 863 buffer, bh_use[] */
863 int ra_ptr = 0; /* Current index into readahead 864 int ra_ptr = 0; /* Current index into readahead
@@ -871,6 +872,16 @@ static struct buffer_head *ext3_find_entry(struct inode *dir,
871 namelen = entry->len; 872 namelen = entry->len;
872 if (namelen > EXT3_NAME_LEN) 873 if (namelen > EXT3_NAME_LEN)
873 return NULL; 874 return NULL;
875 if ((namelen <= 2) && (name[0] == '.') &&
876 (name[1] == '.' || name[1] == 0)) {
877 /*
878 * "." or ".." will only be in the first block
879 * NFS may look up ".."; "." should be handled by the VFS
880 */
881 block = start = 0;
882 nblocks = 1;
883 goto restart;
884 }
874 if (is_dx(dir)) { 885 if (is_dx(dir)) {
875 bh = ext3_dx_find_entry(dir, entry, res_dir, &err); 886 bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
876 /* 887 /*
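The fast path works because every ext3 directory is created with "." and ".." as the first two entries of block 0 (see the ext3_mkdir() hunks later in this diff), so neither name can live anywhere else; and NFS really does send ".." lookups that would otherwise go through the htree probe. A hypothetical helper equivalent to the added test:

static int is_dot_or_dotdot(const struct qstr *entry)
{
	const u8 *name = entry->name;

	return entry->len <= 2 && name[0] == '.' &&
	       (name[1] == '.' || name[1] == 0);
}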
@@ -961,55 +972,35 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
961 struct qstr *entry, struct ext3_dir_entry_2 **res_dir, 972 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
962 int *err) 973 int *err)
963{ 974{
964 struct super_block * sb; 975 struct super_block *sb = dir->i_sb;
965 struct dx_hash_info hinfo; 976 struct dx_hash_info hinfo;
966 u32 hash;
967 struct dx_frame frames[2], *frame; 977 struct dx_frame frames[2], *frame;
968 struct ext3_dir_entry_2 *de, *top;
969 struct buffer_head *bh; 978 struct buffer_head *bh;
970 unsigned long block; 979 unsigned long block;
971 int retval; 980 int retval;
972 int namelen = entry->len;
973 const u8 *name = entry->name;
974 981
975 sb = dir->i_sb; 982 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
976 /* NFS may look up ".." - look at dx_root directory block */ 983 return NULL;
977 if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
978 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
979 return NULL;
980 } else {
981 frame = frames;
982 frame->bh = NULL; /* for dx_release() */
983 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
984 dx_set_block(frame->at, 0); /* dx_root block is 0 */
985 }
986 hash = hinfo.hash;
987 do { 984 do {
988 block = dx_get_block(frame->at); 985 block = dx_get_block(frame->at);
989 if (!(bh = ext3_bread (NULL,dir, block, 0, err))) 986 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
990 goto errout; 987 goto errout;
991 de = (struct ext3_dir_entry_2 *) bh->b_data;
992 top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
993 EXT3_DIR_REC_LEN(0));
994 for (; de < top; de = ext3_next_entry(de)) {
995 int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
996 + ((char *) de - bh->b_data);
997
998 if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
999 brelse(bh);
1000 *err = ERR_BAD_DX_DIR;
1001 goto errout;
1002 }
1003 988
1004 if (ext3_match(namelen, name, de)) { 989 retval = search_dirblock(bh, dir, entry,
1005 *res_dir = de; 990 block << EXT3_BLOCK_SIZE_BITS(sb),
1006 dx_release(frames); 991 res_dir);
1007 return bh; 992 if (retval == 1) {
1008 } 993 dx_release(frames);
994 return bh;
1009 } 995 }
1010 brelse (bh); 996 brelse(bh);
997 if (retval == -1) {
998 *err = ERR_BAD_DX_DIR;
999 goto errout;
1000 }
1001
1011 /* Check to see if we should continue to search */ 1002 /* Check to see if we should continue to search */
1012 retval = ext3_htree_next_block(dir, hash, frame, 1003 retval = ext3_htree_next_block(dir, hinfo.hash, frame,
1013 frames, NULL); 1004 frames, NULL);
1014 if (retval < 0) { 1005 if (retval < 0) {
1015 ext3_warning(sb, __func__, 1006 ext3_warning(sb, __func__,
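The open-coded entry scan is replaced by search_dirblock(), the helper ext3_find_entry() already uses for the linear case. From this call site its contract appears to be: 1 means found (*res_dir is set and the caller keeps the pinned bh), 0 means not in this block, and -1 means the block failed ext3_check_dir_entry(). A sketch of the loop body, with dir_offset standing in for the computed block offset:

retval = search_dirblock(bh, dir, entry, dir_offset, res_dir);
if (retval == 1)		/* found: return the pinned buffer */
	return bh;
brelse(bh);
if (retval == -1) {		/* corrupt leaf block */
	*err = ERR_BAD_DX_DIR;
	return NULL;
}
/* retval == 0: keep walking via ext3_htree_next_block() */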
@@ -1047,7 +1038,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1047 return ERR_PTR(-EIO); 1038 return ERR_PTR(-EIO);
1048 } 1039 }
1049 inode = ext3_iget(dir->i_sb, ino); 1040 inode = ext3_iget(dir->i_sb, ino);
1050 if (unlikely(IS_ERR(inode))) { 1041 if (IS_ERR(inode)) {
1051 if (PTR_ERR(inode) == -ESTALE) { 1042 if (PTR_ERR(inode) == -ESTALE) {
1052 ext3_error(dir->i_sb, __func__, 1043 ext3_error(dir->i_sb, __func__,
1053 "deleted inode referenced: %lu", 1044 "deleted inode referenced: %lu",
@@ -1607,7 +1598,9 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1607 if (err) 1598 if (err)
1608 goto journal_error; 1599 goto journal_error;
1609 } 1600 }
1610 ext3_journal_dirty_metadata(handle, frames[0].bh); 1601 err = ext3_journal_dirty_metadata(handle, frames[0].bh);
1602 if (err)
1603 goto journal_error;
1611 } 1604 }
1612 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1613 if (!de) 1606 if (!de)
@@ -1644,8 +1637,13 @@ static int ext3_delete_entry (handle_t *handle,
1644 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) 1637 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1645 return -EIO; 1638 return -EIO;
1646 if (de == de_del) { 1639 if (de == de_del) {
1640 int err;
1641
1647 BUFFER_TRACE(bh, "get_write_access"); 1642 BUFFER_TRACE(bh, "get_write_access");
1648 ext3_journal_get_write_access(handle, bh); 1643 err = ext3_journal_get_write_access(handle, bh);
1644 if (err)
1645 goto journal_error;
1646
1649 if (pde) 1647 if (pde)
1650 pde->rec_len = ext3_rec_len_to_disk( 1648 pde->rec_len = ext3_rec_len_to_disk(
1651 ext3_rec_len_from_disk(pde->rec_len) + 1649 ext3_rec_len_from_disk(pde->rec_len) +
@@ -1654,7 +1652,12 @@ static int ext3_delete_entry (handle_t *handle,
1654 de->inode = 0; 1652 de->inode = 0;
1655 dir->i_version++; 1653 dir->i_version++;
1656 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 1654 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1657 ext3_journal_dirty_metadata(handle, bh); 1655 err = ext3_journal_dirty_metadata(handle, bh);
1656 if (err) {
1657journal_error:
1658 ext3_std_error(dir->i_sb, err);
1659 return err;
1660 }
1658 return 0; 1661 return 0;
1659 } 1662 }
1660 i += ext3_rec_len_from_disk(de->rec_len); 1663 i += ext3_rec_len_from_disk(de->rec_len);
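Note the placement of the new journal_error label: it sits inside the if (err) body, and the goto after a failed ext3_journal_get_write_access() jumps into that block, which is legal if unusual C. Distilled:

err = ext3_journal_get_write_access(handle, bh);
if (err)
	goto journal_error;
/* ... clear the entry and bump dir->i_version ... */
err = ext3_journal_dirty_metadata(handle, bh);
if (err) {
journal_error:
	ext3_std_error(dir->i_sb, err);
	return err;
}
return 0;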
@@ -1762,7 +1765,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1762{ 1765{
1763 handle_t *handle; 1766 handle_t *handle;
1764 struct inode * inode; 1767 struct inode * inode;
1765 struct buffer_head * dir_block; 1768 struct buffer_head * dir_block = NULL;
1766 struct ext3_dir_entry_2 * de; 1769 struct ext3_dir_entry_2 * de;
1767 int err, retries = 0; 1770 int err, retries = 0;
1768 1771
@@ -1790,15 +1793,14 @@ retry:
1790 inode->i_fop = &ext3_dir_operations; 1793 inode->i_fop = &ext3_dir_operations;
1791 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1794 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1792 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1795 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1793 if (!dir_block) { 1796 if (!dir_block)
1794 drop_nlink(inode); /* is this nlink == 0? */ 1797 goto out_clear_inode;
1795 unlock_new_inode(inode); 1798
1796 ext3_mark_inode_dirty(handle, inode);
1797 iput (inode);
1798 goto out_stop;
1799 }
1800 BUFFER_TRACE(dir_block, "get_write_access"); 1799 BUFFER_TRACE(dir_block, "get_write_access");
1801 ext3_journal_get_write_access(handle, dir_block); 1800 err = ext3_journal_get_write_access(handle, dir_block);
1801 if (err)
1802 goto out_clear_inode;
1803
1802 de = (struct ext3_dir_entry_2 *) dir_block->b_data; 1804 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1803 de->inode = cpu_to_le32(inode->i_ino); 1805 de->inode = cpu_to_le32(inode->i_ino);
1804 de->name_len = 1; 1806 de->name_len = 1;
@@ -1814,11 +1816,16 @@ retry:
1814 ext3_set_de_type(dir->i_sb, de, S_IFDIR); 1816 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1815 inode->i_nlink = 2; 1817 inode->i_nlink = 2;
1816 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); 1818 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1817 ext3_journal_dirty_metadata(handle, dir_block); 1819 err = ext3_journal_dirty_metadata(handle, dir_block);
1818 brelse (dir_block); 1820 if (err)
1819 ext3_mark_inode_dirty(handle, inode); 1821 goto out_clear_inode;
1820 err = ext3_add_entry (handle, dentry, inode); 1822
1823 err = ext3_mark_inode_dirty(handle, inode);
1824 if (!err)
1825 err = ext3_add_entry (handle, dentry, inode);
1826
1821 if (err) { 1827 if (err) {
1828out_clear_inode:
1822 inode->i_nlink = 0; 1829 inode->i_nlink = 0;
1823 unlock_new_inode(inode); 1830 unlock_new_inode(inode);
1824 ext3_mark_inode_dirty(handle, inode); 1831 ext3_mark_inode_dirty(handle, inode);
@@ -1827,10 +1834,14 @@ retry:
1827 } 1834 }
1828 inc_nlink(dir); 1835 inc_nlink(dir);
1829 ext3_update_dx_flag(dir); 1836 ext3_update_dx_flag(dir);
1830 ext3_mark_inode_dirty(handle, dir); 1837 err = ext3_mark_inode_dirty(handle, dir);
1838 if (err)
1839 goto out_clear_inode;
1840
1831 d_instantiate(dentry, inode); 1841 d_instantiate(dentry, inode);
1832 unlock_new_inode(inode); 1842 unlock_new_inode(inode);
1833out_stop: 1843out_stop:
1844 brelse(dir_block);
1834 ext3_journal_stop(handle); 1845 ext3_journal_stop(handle);
1835 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1846 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1836 goto retry; 1847 goto retry;
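Initializing dir_block to NULL is what makes the single brelse() at out_stop safe on every path: brelse() is a no-op when handed NULL, so failure paths taken before block 0 is read fall through cleanly. The resulting exit shape, sketched:

struct buffer_head *dir_block = NULL;	/* stays NULL until block 0 is read */

out_stop:
	brelse(dir_block);		/* brelse(NULL) does nothing */
	ext3_journal_stop(handle);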
@@ -2260,7 +2271,7 @@ retry:
2260 2271
2261 inode->i_ctime = CURRENT_TIME_SEC; 2272 inode->i_ctime = CURRENT_TIME_SEC;
2262 inc_nlink(inode); 2273 inc_nlink(inode);
2263 atomic_inc(&inode->i_count); 2274 ihold(inode);
2264 2275
2265 err = ext3_add_entry(handle, dentry, inode); 2276 err = ext3_add_entry(handle, dentry, inode);
2266 if (!err) { 2277 if (!err) {
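atomic_inc(&inode->i_count) becomes ihold(), the helper introduced alongside the inode-lock scalability work; callers must already hold a reference when they take another. Its definition in fs/inode.c is roughly:

void ihold(struct inode *inode)
{
	/* taking a new reference is only legal if one is already held */
	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}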
@@ -2353,7 +2364,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2353 goto end_rename; 2364 goto end_rename;
2354 } else { 2365 } else {
2355 BUFFER_TRACE(new_bh, "get write access"); 2366 BUFFER_TRACE(new_bh, "get write access");
2356 ext3_journal_get_write_access(handle, new_bh); 2367 retval = ext3_journal_get_write_access(handle, new_bh);
2368 if (retval)
2369 goto journal_error;
2357 new_de->inode = cpu_to_le32(old_inode->i_ino); 2370 new_de->inode = cpu_to_le32(old_inode->i_ino);
2358 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2371 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2359 EXT3_FEATURE_INCOMPAT_FILETYPE)) 2372 EXT3_FEATURE_INCOMPAT_FILETYPE))
@@ -2362,7 +2375,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2362 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; 2375 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2363 ext3_mark_inode_dirty(handle, new_dir); 2376 ext3_mark_inode_dirty(handle, new_dir);
2364 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); 2377 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2365 ext3_journal_dirty_metadata(handle, new_bh); 2378 retval = ext3_journal_dirty_metadata(handle, new_bh);
2379 if (retval)
2380 goto journal_error;
2366 brelse(new_bh); 2381 brelse(new_bh);
2367 new_bh = NULL; 2382 new_bh = NULL;
2368 } 2383 }
@@ -2411,10 +2426,17 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2411 ext3_update_dx_flag(old_dir); 2426 ext3_update_dx_flag(old_dir);
2412 if (dir_bh) { 2427 if (dir_bh) {
2413 BUFFER_TRACE(dir_bh, "get_write_access"); 2428 BUFFER_TRACE(dir_bh, "get_write_access");
2414 ext3_journal_get_write_access(handle, dir_bh); 2429 retval = ext3_journal_get_write_access(handle, dir_bh);
2430 if (retval)
2431 goto journal_error;
2415 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2432 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2416 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); 2433 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2417 ext3_journal_dirty_metadata(handle, dir_bh); 2434 retval = ext3_journal_dirty_metadata(handle, dir_bh);
2435 if (retval) {
2436journal_error:
2437 ext3_std_error(new_dir->i_sb, retval);
2438 goto end_rename;
2439 }
2418 drop_nlink(old_dir); 2440 drop_nlink(old_dir);
2419 if (new_inode) { 2441 if (new_inode) {
2420 drop_nlink(new_inode); 2442 drop_nlink(new_inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0ccd7b12b73c..108b142e11ed 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -249,7 +249,11 @@ static int setup_new_group_blocks(struct super_block *sb,
249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 249 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
250 set_buffer_uptodate(gdb); 250 set_buffer_uptodate(gdb);
251 unlock_buffer(gdb); 251 unlock_buffer(gdb);
252 ext3_journal_dirty_metadata(handle, gdb); 252 err = ext3_journal_dirty_metadata(handle, gdb);
253 if (err) {
254 brelse(gdb);
255 goto exit_bh;
256 }
253 ext3_set_bit(bit, bh->b_data); 257 ext3_set_bit(bit, bh->b_data);
254 brelse(gdb); 258 brelse(gdb);
255 } 259 }
@@ -269,7 +273,11 @@ static int setup_new_group_blocks(struct super_block *sb,
269 err = PTR_ERR(gdb); 273 err = PTR_ERR(gdb);
270 goto exit_bh; 274 goto exit_bh;
271 } 275 }
272 ext3_journal_dirty_metadata(handle, gdb); 276 err = ext3_journal_dirty_metadata(handle, gdb);
277 if (err) {
278 brelse(gdb);
279 goto exit_bh;
280 }
273 ext3_set_bit(bit, bh->b_data); 281 ext3_set_bit(bit, bh->b_data);
274 brelse(gdb); 282 brelse(gdb);
275 } 283 }
@@ -295,7 +303,11 @@ static int setup_new_group_blocks(struct super_block *sb,
295 err = PTR_ERR(it); 303 err = PTR_ERR(it);
296 goto exit_bh; 304 goto exit_bh;
297 } 305 }
298 ext3_journal_dirty_metadata(handle, it); 306 err = ext3_journal_dirty_metadata(handle, it);
307 if (err) {
308 brelse(it);
309 goto exit_bh;
310 }
299 brelse(it); 311 brelse(it);
300 ext3_set_bit(bit, bh->b_data); 312 ext3_set_bit(bit, bh->b_data);
301 } 313 }
@@ -306,7 +318,9 @@ static int setup_new_group_blocks(struct super_block *sb,
306 318
307 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), 319 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
308 bh->b_data); 320 bh->b_data);
309 ext3_journal_dirty_metadata(handle, bh); 321 err = ext3_journal_dirty_metadata(handle, bh);
322 if (err)
323 goto exit_bh;
310 brelse(bh); 324 brelse(bh);
311 325
312 /* Mark unused entries in inode bitmap used */ 326 /* Mark unused entries in inode bitmap used */
@@ -319,7 +333,7 @@ static int setup_new_group_blocks(struct super_block *sb,
319 333
320 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), 334 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
321 bh->b_data); 335 bh->b_data);
322 ext3_journal_dirty_metadata(handle, bh); 336 err = ext3_journal_dirty_metadata(handle, bh);
323exit_bh: 337exit_bh:
324 brelse(bh); 338 brelse(bh);
325 339
@@ -503,12 +517,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
503 * reserved inode, and will become GDT blocks (primary and backup). 517 * reserved inode, and will become GDT blocks (primary and backup).
504 */ 518 */
505 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; 519 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
506 ext3_journal_dirty_metadata(handle, dind); 520 err = ext3_journal_dirty_metadata(handle, dind);
521 if (err)
522 goto exit_group_desc;
507 brelse(dind); 523 brelse(dind);
524 dind = NULL;
508 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 525 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
509 ext3_mark_iloc_dirty(handle, inode, &iloc); 526 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
527 if (err)
528 goto exit_group_desc;
510 memset((*primary)->b_data, 0, sb->s_blocksize); 529 memset((*primary)->b_data, 0, sb->s_blocksize);
511 ext3_journal_dirty_metadata(handle, *primary); 530 err = ext3_journal_dirty_metadata(handle, *primary);
531 if (err)
532 goto exit_group_desc;
512 533
513 o_group_desc = EXT3_SB(sb)->s_group_desc; 534 o_group_desc = EXT3_SB(sb)->s_group_desc;
514 memcpy(n_group_desc, o_group_desc, 535 memcpy(n_group_desc, o_group_desc,
@@ -519,10 +540,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
519 kfree(o_group_desc); 540 kfree(o_group_desc);
520 541
521 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 542 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
522 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 543 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
544 if (err)
545 goto exit_inode;
523 546
524 return 0; 547 return 0;
525 548
549exit_group_desc:
550 kfree(n_group_desc);
526exit_inode: 551exit_inode:
527 //ext3_journal_release_buffer(handle, iloc.bh); 552 //ext3_journal_release_buffer(handle, iloc.bh);
528 brelse(iloc.bh); 553 brelse(iloc.bh);
@@ -706,16 +731,20 @@ static void update_backups(struct super_block *sb,
706 } 731 }
707 ext3_debug("update metadata backup %#04lx\n", 732 ext3_debug("update metadata backup %#04lx\n",
708 (unsigned long)bh->b_blocknr); 733 (unsigned long)bh->b_blocknr);
709 if ((err = ext3_journal_get_write_access(handle, bh))) 734 if ((err = ext3_journal_get_write_access(handle, bh))) {
735 brelse(bh);
710 break; 736 break;
737 }
711 lock_buffer(bh); 738 lock_buffer(bh);
712 memcpy(bh->b_data, data, size); 739 memcpy(bh->b_data, data, size);
713 if (rest) 740 if (rest)
714 memset(bh->b_data + size, 0, rest); 741 memset(bh->b_data + size, 0, rest);
715 set_buffer_uptodate(bh); 742 set_buffer_uptodate(bh);
716 unlock_buffer(bh); 743 unlock_buffer(bh);
717 ext3_journal_dirty_metadata(handle, bh); 744 err = ext3_journal_dirty_metadata(handle, bh);
718 brelse(bh); 745 brelse(bh);
746 if (err)
747 break;
719 } 748 }
720 if ((err2 = ext3_journal_stop(handle)) && !err) 749 if ((err2 = ext3_journal_stop(handle)) && !err)
721 err = err2; 750 err = err2;
@@ -922,7 +951,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
922 /* Update the global fs size fields */ 951 /* Update the global fs size fields */
923 sbi->s_groups_count++; 952 sbi->s_groups_count++;
924 953
925 ext3_journal_dirty_metadata(handle, primary); 954 err = ext3_journal_dirty_metadata(handle, primary);
955 if (err)
956 goto exit_journal;
926 957
927 /* Update the reserved block counts only once the new group is 958 /* Update the reserved block counts only once the new group is
928 * active. */ 959 * active. */
@@ -934,7 +965,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
934 percpu_counter_add(&sbi->s_freeinodes_counter, 965 percpu_counter_add(&sbi->s_freeinodes_counter,
935 EXT3_INODES_PER_GROUP(sb)); 966 EXT3_INODES_PER_GROUP(sb));
936 967
937 ext3_journal_dirty_metadata(handle, sbi->s_sbh); 968 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
938 969
939exit_journal: 970exit_journal:
940 mutex_unlock(&sbi->s_resize_lock); 971 mutex_unlock(&sbi->s_resize_lock);
@@ -977,7 +1008,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
977 o_blocks_count = le32_to_cpu(es->s_blocks_count); 1008 o_blocks_count = le32_to_cpu(es->s_blocks_count);
978 1009
979 if (test_opt(sb, DEBUG)) 1010 if (test_opt(sb, DEBUG))
980 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", 1011 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
1012 " upto "E3FSBLK" blocks\n",
981 o_blocks_count, n_blocks_count); 1013 o_blocks_count, n_blocks_count);
982 1014
983 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 1015 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -985,7 +1017,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
985 1017
986 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1018 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
987 printk(KERN_ERR "EXT3-fs: filesystem on %s:" 1019 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
988 " too large to resize to %lu blocks safely\n", 1020 " too large to resize to "E3FSBLK" blocks safely\n",
989 sb->s_id, n_blocks_count); 1021 sb->s_id, n_blocks_count);
990 if (sizeof(sector_t) < 8) 1022 if (sizeof(sector_t) < 8)
991 ext3_warning(sb, __func__, 1023 ext3_warning(sb, __func__,
@@ -1063,13 +1095,19 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1063 goto exit_put; 1095 goto exit_put;
1064 } 1096 }
1065 es->s_blocks_count = cpu_to_le32(o_blocks_count + add); 1097 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1066 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 1098 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1067 mutex_unlock(&EXT3_SB(sb)->s_resize_lock); 1099 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1068 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, 1100 if (err) {
1069 o_blocks_count + add); 1101 ext3_warning(sb, __func__,
1102 "error %d on journal dirty metadata", err);
1103 ext3_journal_stop(handle);
1104 goto exit_put;
1105 }
1106 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
1107 o_blocks_count, o_blocks_count + add);
1070 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1108 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1071 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, 1109 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
1072 o_blocks_count + add); 1110 o_blocks_count, o_blocks_count + add);
1073 if ((err = ext3_journal_stop(handle))) 1111 if ((err = ext3_journal_stop(handle)))
1074 goto exit_put; 1112 goto exit_put;
1075 if (test_opt(sb, DEBUG)) 1113 if (test_opt(sb, DEBUG))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dbf4dba03c4..85c8cc8f2473 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,7 +27,6 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/exportfs.h> 31#include <linux/exportfs.h>
33#include <linux/vfs.h> 32#include <linux/vfs.h>
@@ -144,12 +143,16 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn,
144void ext3_msg(struct super_block *sb, const char *prefix, 143void ext3_msg(struct super_block *sb, const char *prefix,
145 const char *fmt, ...) 144 const char *fmt, ...)
146{ 145{
146 struct va_format vaf;
147 va_list args; 147 va_list args;
148 148
149 va_start(args, fmt); 149 va_start(args, fmt);
150 printk("%sEXT3-fs (%s): ", prefix, sb->s_id); 150
151 vprintk(fmt, args); 151 vaf.fmt = fmt;
152 printk("\n"); 152 vaf.va = &args;
153
154 printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
155
153 va_end(args); 156 va_end(args);
154} 157}
155 158
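The three separate printk() calls (prefix, vprintk() of the caller's format, trailing newline) could interleave with messages from other CPUs; %pV prints a caller-supplied format plus va_list as one atomic line. struct va_format comes from linux/kernel.h, and the same conversion is applied to ext3_error(), ext3_abort() and ext3_warning() below. The idiom, with id standing in for sb->s_id:

struct va_format vaf;
va_list args;

va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;			/* %pV expands fmt against *vaf.va */
printk(KERN_INFO "prefix (%s): %pV\n", id, &vaf);
va_end(args);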
@@ -196,15 +199,20 @@ static void ext3_handle_error(struct super_block *sb)
196 sb->s_id); 199 sb->s_id);
197} 200}
198 201
199void ext3_error (struct super_block * sb, const char * function, 202void ext3_error(struct super_block *sb, const char *function,
200 const char * fmt, ...) 203 const char *fmt, ...)
201{ 204{
205 struct va_format vaf;
202 va_list args; 206 va_list args;
203 207
204 va_start(args, fmt); 208 va_start(args, fmt);
205 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); 209
206 vprintk(fmt, args); 210 vaf.fmt = fmt;
207 printk("\n"); 211 vaf.va = &args;
212
213 printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
214 sb->s_id, function, &vaf);
215
208 va_end(args); 216 va_end(args);
209 217
210 ext3_handle_error(sb); 218 ext3_handle_error(sb);
@@ -275,15 +283,20 @@ void __ext3_std_error (struct super_block * sb, const char * function,
275 * case we take the easy way out and panic immediately. 283 * case we take the easy way out and panic immediately.
276 */ 284 */
277 285
278void ext3_abort (struct super_block * sb, const char * function, 286void ext3_abort(struct super_block *sb, const char *function,
279 const char * fmt, ...) 287 const char *fmt, ...)
280{ 288{
289 struct va_format vaf;
281 va_list args; 290 va_list args;
282 291
283 va_start(args, fmt); 292 va_start(args, fmt);
284 printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); 293
285 vprintk(fmt, args); 294 vaf.fmt = fmt;
286 printk("\n"); 295 vaf.va = &args;
296
297 printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
298 sb->s_id, function, &vaf);
299
287 va_end(args); 300 va_end(args);
288 301
289 if (test_opt(sb, ERRORS_PANIC)) 302 if (test_opt(sb, ERRORS_PANIC))
@@ -301,16 +314,20 @@ void ext3_abort (struct super_block * sb, const char * function,
301 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 314 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
302} 315}
303 316
304void ext3_warning (struct super_block * sb, const char * function, 317void ext3_warning(struct super_block *sb, const char *function,
305 const char * fmt, ...) 318 const char *fmt, ...)
306{ 319{
320 struct va_format vaf;
307 va_list args; 321 va_list args;
308 322
309 va_start(args, fmt); 323 va_start(args, fmt);
310 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", 324
311 sb->s_id, function); 325 vaf.fmt = fmt;
312 vprintk(fmt, args); 326 vaf.va = &args;
313 printk("\n"); 327
328 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
329 sb->s_id, function, &vaf);
330
314 va_end(args); 331 va_end(args);
315} 332}
316 333
@@ -347,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
347 struct block_device *bdev; 364 struct block_device *bdev;
348 char b[BDEVNAME_SIZE]; 365 char b[BDEVNAME_SIZE];
349 366
350 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 367 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
351 if (IS_ERR(bdev)) 368 if (IS_ERR(bdev))
352 goto fail; 369 goto fail;
353 return bdev; 370 return bdev;
@@ -364,8 +381,7 @@ fail:
364 */ 381 */
365static int ext3_blkdev_put(struct block_device *bdev) 382static int ext3_blkdev_put(struct block_device *bdev)
366{ 383{
367 bd_release(bdev); 384 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
368 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
369} 385}
370 386
371static int ext3_blkdev_remove(struct ext3_sb_info *sbi) 387static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
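open_by_devnum() plus a separate bd_claim() is replaced by blkdev_get_by_dev() with FMODE_EXCL, which opens and claims the device in one step; the last argument is the holder cookie (here the super_block) that identifies the claim. blkdev_put() with FMODE_EXCL drops the claim as well, which is why bd_release() disappears, both here and in the ext3_get_dev_journal() hunk further down. Assumed signature:

struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
				       void *holder);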
@@ -411,9 +427,6 @@ static void ext3_put_super (struct super_block * sb)
411 int i, err; 427 int i, err;
412 428
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 429 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
415 lock_kernel();
416
417 ext3_xattr_put_super(sb); 430 ext3_xattr_put_super(sb);
418 err = journal_destroy(sbi->s_journal); 431 err = journal_destroy(sbi->s_journal);
419 sbi->s_journal = NULL; 432 sbi->s_journal = NULL;
@@ -462,8 +475,6 @@ static void ext3_put_super (struct super_block * sb)
462 sb->s_fs_info = NULL; 475 sb->s_fs_info = NULL;
463 kfree(sbi->s_blockgroup_lock); 476 kfree(sbi->s_blockgroup_lock);
464 kfree(sbi); 477 kfree(sbi);
465
466 unlock_kernel();
467} 478}
468 479
469static struct kmem_cache *ext3_inode_cachep; 480static struct kmem_cache *ext3_inode_cachep;
@@ -485,6 +496,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
485 return &ei->vfs_inode; 496 return &ei->vfs_inode;
486} 497}
487 498
499static void ext3_i_callback(struct rcu_head *head)
500{
501 struct inode *inode = container_of(head, struct inode, i_rcu);
502 INIT_LIST_HEAD(&inode->i_dentry);
503 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
504}
505
488static void ext3_destroy_inode(struct inode *inode) 506static void ext3_destroy_inode(struct inode *inode)
489{ 507{
490 if (!list_empty(&(EXT3_I(inode)->i_orphan))) { 508 if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
@@ -495,7 +513,7 @@ static void ext3_destroy_inode(struct inode *inode)
495 false); 513 false);
496 dump_stack(); 514 dump_stack();
497 } 515 }
498 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); 516 call_rcu(&inode->i_rcu, ext3_i_callback);
499} 517}
500 518
501static void init_once(void *foo) 519static void init_once(void *foo)
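Freeing the inode through call_rcu() is required by the RCU path-walk series elsewhere in this tree: lockless lookups may still dereference an inode after its last reference drops, so the memory must survive a grace period. Every filesystem in this diff grows the same callback; a sketch with placeholder foo_* names:

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* lockless walkers may
						   still traverse the list */
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static void foo_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, foo_i_callback);
}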
@@ -736,7 +754,7 @@ static int ext3_release_dquot(struct dquot *dquot);
736static int ext3_mark_dquot_dirty(struct dquot *dquot); 754static int ext3_mark_dquot_dirty(struct dquot *dquot);
737static int ext3_write_info(struct super_block *sb, int type); 755static int ext3_write_info(struct super_block *sb, int type);
738static int ext3_quota_on(struct super_block *sb, int type, int format_id, 756static int ext3_quota_on(struct super_block *sb, int type, int format_id,
739 char *path); 757 struct path *path);
740static int ext3_quota_on_mount(struct super_block *sb, int type); 758static int ext3_quota_on_mount(struct super_block *sb, int type);
741static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 759static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
742 size_t len, loff_t off); 760 size_t len, loff_t off);
@@ -1306,9 +1324,9 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1306 ext3_msg(sb, KERN_WARNING, 1324 ext3_msg(sb, KERN_WARNING,
1307 "warning: mounting fs with errors, " 1325 "warning: mounting fs with errors, "
1308 "running e2fsck is recommended"); 1326 "running e2fsck is recommended");
1309 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1327 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1310 le16_to_cpu(es->s_mnt_count) >= 1328 le16_to_cpu(es->s_mnt_count) >=
1311 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1329 le16_to_cpu(es->s_max_mnt_count))
1312 ext3_msg(sb, KERN_WARNING, 1330 ext3_msg(sb, KERN_WARNING,
1313 "warning: maximal mount count reached, " 1331 "warning: maximal mount count reached, "
1314 "running e2fsck is recommended"); 1332 "running e2fsck is recommended");
@@ -1325,7 +1343,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1325 valid forever! :) */ 1343 valid forever! :) */
1326 es->s_state &= cpu_to_le16(~EXT3_VALID_FS); 1344 es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
1327#endif 1345#endif
1328 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1346 if (!le16_to_cpu(es->s_max_mnt_count))
1329 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); 1347 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
1330 le16_add_cpu(&es->s_mnt_count, 1); 1348 le16_add_cpu(&es->s_mnt_count, 1);
1331 es->s_mtime = cpu_to_le32(get_seconds()); 1349 es->s_mtime = cpu_to_le32(get_seconds());
@@ -1627,8 +1645,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1627 sbi->s_resgid = EXT3_DEF_RESGID; 1645 sbi->s_resgid = EXT3_DEF_RESGID;
1628 sbi->s_sb_block = sb_block; 1646 sbi->s_sb_block = sb_block;
1629 1647
1630 unlock_kernel();
1631
1632 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); 1648 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1633 if (!blocksize) { 1649 if (!blocksize) {
1634 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); 1650 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
@@ -1654,7 +1670,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1654 * Note: s_es must be initialized as soon as possible because 1670 * Note: s_es must be initialized as soon as possible because
1655 * some ext3 macro-instructions depend on its value 1671 * some ext3 macro-instructions depend on its value
1656 */ 1672 */
1657 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); 1673 es = (struct ext3_super_block *) (bh->b_data + offset);
1658 sbi->s_es = es; 1674 sbi->s_es = es;
1659 sb->s_magic = le16_to_cpu(es->s_magic); 1675 sb->s_magic = le16_to_cpu(es->s_magic);
1660 if (sb->s_magic != EXT3_SUPER_MAGIC) 1676 if (sb->s_magic != EXT3_SUPER_MAGIC)
@@ -1765,7 +1781,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1765 "error: can't read superblock on 2nd try"); 1781 "error: can't read superblock on 2nd try");
1766 goto failed_mount; 1782 goto failed_mount;
1767 } 1783 }
1768 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); 1784 es = (struct ext3_super_block *)(bh->b_data + offset);
1769 sbi->s_es = es; 1785 sbi->s_es = es;
1770 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { 1786 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1771 ext3_msg(sb, KERN_ERR, 1787 ext3_msg(sb, KERN_ERR,
@@ -1849,13 +1865,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1849 goto failed_mount; 1865 goto failed_mount;
1850 } 1866 }
1851 1867
1852 if (le32_to_cpu(es->s_blocks_count) > 1868 err = generic_check_addressable(sb->s_blocksize_bits,
1853 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1869 le32_to_cpu(es->s_blocks_count));
1870 if (err) {
1854 ext3_msg(sb, KERN_ERR, 1871 ext3_msg(sb, KERN_ERR,
1855 "error: filesystem is too large to mount safely"); 1872 "error: filesystem is too large to mount safely");
1856 if (sizeof(sector_t) < 8) 1873 if (sizeof(sector_t) < 8)
1857 ext3_msg(sb, KERN_ERR, 1874 ext3_msg(sb, KERN_ERR,
1858 "error: CONFIG_LBDAF not enabled"); 1875 "error: CONFIG_LBDAF not enabled");
1876 ret = err;
1859 goto failed_mount; 1877 goto failed_mount;
1860 } 1878 }
1861 1879
@@ -1864,13 +1882,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1864 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - 1882 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
1865 le32_to_cpu(es->s_first_data_block) - 1) 1883 le32_to_cpu(es->s_first_data_block) - 1)
1866 / EXT3_BLOCKS_PER_GROUP(sb)) + 1; 1884 / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
1867 db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / 1885 db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
1868 EXT3_DESC_PER_BLOCK(sb);
1869 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), 1886 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1870 GFP_KERNEL); 1887 GFP_KERNEL);
1871 if (sbi->s_group_desc == NULL) { 1888 if (sbi->s_group_desc == NULL) {
1872 ext3_msg(sb, KERN_ERR, 1889 ext3_msg(sb, KERN_ERR,
1873 "error: not enough memory"); 1890 "error: not enough memory");
1891 ret = -ENOMEM;
1874 goto failed_mount; 1892 goto failed_mount;
1875 } 1893 }
1876 1894
@@ -1958,6 +1976,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1958 } 1976 }
1959 if (err) { 1977 if (err) {
1960 ext3_msg(sb, KERN_ERR, "error: insufficient memory"); 1978 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1979 ret = err;
1961 goto failed_mount3; 1980 goto failed_mount3;
1962 } 1981 }
1963 1982
@@ -2025,7 +2044,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2025 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2044 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
2026 "writeback"); 2045 "writeback");
2027 2046
2028 lock_kernel();
2029 return 0; 2047 return 0;
2030 2048
2031cantfind_ext3: 2049cantfind_ext3:
@@ -2055,7 +2073,6 @@ out_fail:
2055 sb->s_fs_info = NULL; 2073 sb->s_fs_info = NULL;
2056 kfree(sbi->s_blockgroup_lock); 2074 kfree(sbi->s_blockgroup_lock);
2057 kfree(sbi); 2075 kfree(sbi);
2058 lock_kernel();
2059 return ret; 2076 return ret;
2060} 2077}
2061 2078
@@ -2144,13 +2161,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2144 if (bdev == NULL) 2161 if (bdev == NULL)
2145 return NULL; 2162 return NULL;
2146 2163
2147 if (bd_claim(bdev, sb)) {
2148 ext3_msg(sb, KERN_ERR,
2149 "error: failed to claim external journal device");
2150 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2151 return NULL;
2152 }
2153
2154 blocksize = sb->s_blocksize; 2164 blocksize = sb->s_blocksize;
2155 hblock = bdev_logical_block_size(bdev); 2165 hblock = bdev_logical_block_size(bdev);
2156 if (blocksize < hblock) { 2166 if (blocksize < hblock) {
@@ -2168,7 +2178,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2168 goto out_bdev; 2178 goto out_bdev;
2169 } 2179 }
2170 2180
2171 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); 2181 es = (struct ext3_super_block *) (bh->b_data + offset);
2172 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || 2182 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
2173 !(le32_to_cpu(es->s_feature_incompat) & 2183 !(le32_to_cpu(es->s_feature_incompat) &
2174 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { 2184 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -2299,7 +2309,7 @@ static int ext3_load_journal(struct super_block *sb,
2299 EXT3_SB(sb)->s_journal = journal; 2309 EXT3_SB(sb)->s_journal = journal;
2300 ext3_clear_journal_err(sb, es); 2310 ext3_clear_journal_err(sb, es);
2301 2311
2302 if (journal_devnum && 2312 if (!really_read_only && journal_devnum &&
2303 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2313 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2304 es->s_journal_dev = cpu_to_le32(journal_devnum); 2314 es->s_journal_dev = cpu_to_le32(journal_devnum);
2305 2315
@@ -2361,6 +2371,21 @@ static int ext3_commit_super(struct super_block *sb,
2361 2371
2362 if (!sbh) 2372 if (!sbh)
2363 return error; 2373 return error;
2374
2375 if (buffer_write_io_error(sbh)) {
2376 /*
2377 * Oh, dear. A previous attempt to write the
2378 * superblock failed. This could happen because the
2379 * USB device was yanked out. Or it could happen to
2380 * be a transient write error and maybe the block will
2381 * be remapped. Nothing we can do but to retry the
2382 * write and hope for the best.
2383 */
2384 ext3_msg(sb, KERN_ERR, "previous I/O error to "
2385 "superblock detected");
2386 clear_buffer_write_io_error(sbh);
2387 set_buffer_uptodate(sbh);
2388 }
2364 /* 2389 /*
2365 * If the file system is mounted read-only, don't update the 2390 * If the file system is mounted read-only, don't update the
2366 * superblock write time. This avoids updating the superblock 2391 * superblock write time. This avoids updating the superblock
@@ -2377,8 +2402,15 @@ static int ext3_commit_super(struct super_block *sb,
2377 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); 2402 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2378 BUFFER_TRACE(sbh, "marking dirty"); 2403 BUFFER_TRACE(sbh, "marking dirty");
2379 mark_buffer_dirty(sbh); 2404 mark_buffer_dirty(sbh);
2380 if (sync) 2405 if (sync) {
2381 error = sync_dirty_buffer(sbh); 2406 error = sync_dirty_buffer(sbh);
2407 if (buffer_write_io_error(sbh)) {
2408 ext3_msg(sb, KERN_ERR, "I/O error while writing "
2409 "superblock");
2410 clear_buffer_write_io_error(sbh);
2411 set_buffer_uptodate(sbh);
2412 }
2413 }
2382 return error; 2414 return error;
2383} 2415}
2384 2416
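A failed write leaves the buffer with the write_io_error bit set and the uptodate bit cleared, and both are sticky until something clears them; without this dance a later write of the superblock buffer could be refused or could re-read stale data. The recovery step, distilled:

if (buffer_write_io_error(sbh)) {
	/* a previous write failed; reset the buffer so we can retry */
	clear_buffer_write_io_error(sbh);
	set_buffer_uptodate(sbh);
}
mark_buffer_dirty(sbh);
if (sync)
	error = sync_dirty_buffer(sbh);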
@@ -2538,8 +2570,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2538 int i; 2570 int i;
2539#endif 2571#endif
2540 2572
2541 lock_kernel();
2542
2543 /* Store the original options */ 2573 /* Store the original options */
2544 lock_super(sb); 2574 lock_super(sb);
2545 old_sb_flags = sb->s_flags; 2575 old_sb_flags = sb->s_flags;
@@ -2648,7 +2678,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2648 kfree(old_opts.s_qf_names[i]); 2678 kfree(old_opts.s_qf_names[i]);
2649#endif 2679#endif
2650 unlock_super(sb); 2680 unlock_super(sb);
2651 unlock_kernel();
2652 2681
2653 if (enable_quota) 2682 if (enable_quota)
2654 dquot_resume(sb, -1); 2683 dquot_resume(sb, -1);
@@ -2669,7 +2698,6 @@ restore_opts:
2669 } 2698 }
2670#endif 2699#endif
2671 unlock_super(sb); 2700 unlock_super(sb);
2672 unlock_kernel();
2673 return err; 2701 return err;
2674} 2702}
2675 2703
@@ -2849,27 +2877,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
2849 * Standard function to be called on quota_on 2877 * Standard function to be called on quota_on
2850 */ 2878 */
2851static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2879static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2852 char *name) 2880 struct path *path)
2853{ 2881{
2854 int err; 2882 int err;
2855 struct path path;
2856 2883
2857 if (!test_opt(sb, QUOTA)) 2884 if (!test_opt(sb, QUOTA))
2858 return -EINVAL; 2885 return -EINVAL;
2859 2886
2860 err = kern_path(name, LOOKUP_FOLLOW, &path);
2861 if (err)
2862 return err;
2863
2864 /* Quotafile not on the same filesystem? */ 2887 /* Quotafile not on the same filesystem? */
2865 if (path.mnt->mnt_sb != sb) { 2888 if (path->mnt->mnt_sb != sb)
2866 path_put(&path);
2867 return -EXDEV; 2889 return -EXDEV;
2868 }
2869 /* Journaling quota? */ 2890 /* Journaling quota? */
2870 if (EXT3_SB(sb)->s_qf_names[type]) { 2891 if (EXT3_SB(sb)->s_qf_names[type]) {
2871 /* Quotafile not of fs root? */ 2892 /* Quotafile not of fs root? */
2872 if (path.dentry->d_parent != sb->s_root) 2893 if (path->dentry->d_parent != sb->s_root)
2873 ext3_msg(sb, KERN_WARNING, 2894 ext3_msg(sb, KERN_WARNING,
2874 "warning: Quota file not on filesystem root. " 2895 "warning: Quota file not on filesystem root. "
2875 "Journaled quota will not work."); 2896 "Journaled quota will not work.");
@@ -2879,7 +2900,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2879 * When we journal data on quota file, we have to flush journal to see 2900 * When we journal data on quota file, we have to flush journal to see
2880 * all updates to the file when we bypass pagecache... 2901 * all updates to the file when we bypass pagecache...
2881 */ 2902 */
2882 if (ext3_should_journal_data(path.dentry->d_inode)) { 2903 if (ext3_should_journal_data(path->dentry->d_inode)) {
2883 /* 2904 /*
2884 * We don't need to lock updates but journal_flush() could 2905 * We don't need to lock updates but journal_flush() could
2885 * otherwise be livelocked... 2906 * otherwise be livelocked...
@@ -2887,15 +2908,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2887 journal_lock_updates(EXT3_SB(sb)->s_journal); 2908 journal_lock_updates(EXT3_SB(sb)->s_journal);
2888 err = journal_flush(EXT3_SB(sb)->s_journal); 2909 err = journal_flush(EXT3_SB(sb)->s_journal);
2889 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2910 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2890 if (err) { 2911 if (err)
2891 path_put(&path);
2892 return err; 2912 return err;
2893 }
2894 } 2913 }
2895 2914
2896 err = dquot_quota_on_path(sb, type, format_id, &path); 2915 return dquot_quota_on(sb, type, format_id, path);
2897 path_put(&path);
2898 return err;
2899} 2916}
2900 2917
2901/* Read data from quotafile - avoid pagecache and such because we cannot afford 2918/* Read data from quotafile - avoid pagecache and such because we cannot afford
@@ -3010,16 +3027,16 @@ out:
3010 3027
3011#endif 3028#endif
3012 3029
3013static int ext3_get_sb(struct file_system_type *fs_type, 3030static struct dentry *ext3_mount(struct file_system_type *fs_type,
3014 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3031 int flags, const char *dev_name, void *data)
3015{ 3032{
3016 return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt); 3033 return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
3017} 3034}
3018 3035
3019static struct file_system_type ext3_fs_type = { 3036static struct file_system_type ext3_fs_type = {
3020 .owner = THIS_MODULE, 3037 .owner = THIS_MODULE,
3021 .name = "ext3", 3038 .name = "ext3",
3022 .get_sb = ext3_get_sb, 3039 .mount = ext3_mount,
3023 .kill_sb = kill_block_super, 3040 .kill_sb = kill_block_super,
3024 .fs_flags = FS_REQUIRES_DEV, 3041 .fs_flags = FS_REQUIRES_DEV,
3025}; 3042};
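.get_sb/get_sb_bdev() is converted to the new .mount/mount_bdev() API, which returns the root dentry directly instead of filling in a caller-supplied vfsmount. The helper signature assumed by this conversion (fs/super.c):

struct dentry *mount_bdev(struct file_system_type *fs_type, int flags,
			  const char *dev_name, void *data,
			  int (*fill_super)(struct super_block *,
					    void *, int));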
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e69dc6dfaa89..32e6cc23bd9a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -925,7 +925,7 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
925/* 925/*
926 * ext3_xattr_set_handle() 926 * ext3_xattr_set_handle()
927 * 927 *
928 * Create, replace or remove an extended attribute for this inode. Buffer 928 * Create, replace or remove an extended attribute for this inode. Value
929 * is NULL to remove an existing extended attribute, and non-NULL to 929 * is NULL to remove an existing extended attribute, and non-NULL to
930 * either replace an existing extended attribute, or create a new extended 930 * either replace an existing extended attribute, or create a new extended
931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 931 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5fe..c947e36eda6c 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_EXT4_FS) += ext4.o 5obj-$(CONFIG_EXT4_FS) += ext4.o
6 6
7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
10 10
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ead..e0270d1f8d82 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
238} 238}
239 239
240int 240int
241ext4_check_acl(struct inode *inode, int mask) 241ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
242{ 242{
243 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); 243 struct posix_acl *acl;
244
245 if (flags & IPERM_FLAG_RCU) {
246 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
247 return -ECHILD;
248 return -EAGAIN;
249 }
244 250
251 acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
245 if (IS_ERR(acl)) 252 if (IS_ERR(acl))
246 return PTR_ERR(acl); 253 return PTR_ERR(acl);
247 if (acl) { 254 if (acl) {
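With IPERM_FLAG_RCU set, the permission check runs under rcu_read_lock() and may not block, so ext4_get_acl(), which can read from disk, is off limits. Reading the two early returns: if the ACL cache cannot prove the inode has no ACL, -ECHILD tells the VFS to retry in reference-walk mode; if it can, -EAGAIN tells generic_permission() to fall back to the ordinary mode bits. As inferred from the call sites:

if (flags & IPERM_FLAG_RCU) {
	if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
		return -ECHILD;	/* may need I/O: drop out of RCU walk */
	return -EAGAIN;		/* provably no ACL: use i_mode bits */
}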
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac4..dec821168fd4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
54#ifdef CONFIG_EXT4_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext4_check_acl(struct inode *, int); 57extern int ext4_check_acl(struct inode *, int, unsigned int);
58extern int ext4_acl_chmod(struct inode *); 58extern int ext4_acl_chmod(struct inode *);
59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); 59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
60 60
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd30799a43ed..adf96b822781 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
171 * less than the blocksize * 8 ( which is the size 171 * less than the blocksize * 8 ( which is the size
172 * of bitmap ), set rest of the block bitmap to 1 172 * of bitmap ), set rest of the block bitmap to 1
173 */ 173 */
174 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); 174 ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
175 bh->b_data);
175 } 176 }
176 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); 177 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
177} 178}
@@ -489,7 +490,7 @@ error_return:
489 * Check if filesystem has nblocks free & available for allocation. 490 * Check if filesystem has nblocks free & available for allocation.
490 * On success return 1, return 0 on failure. 491 * On success return 1, return 0 on failure.
491 */ 492 */
492int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) 493static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
493{ 494{
494 s64 free_blocks, dirty_blocks, root_blocks; 495 s64 free_blocks, dirty_blocks, root_blocks;
495 struct percpu_counter *fbc = &sbi->s_freeblocks_counter; 496 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -591,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 * Account for the allocated meta blocks. We will never 592 * Account for the allocated meta blocks. We will never
592 * fail EDQUOT for metdata, but we do account for it. 593 * fail EDQUOT for metdata, but we do account for it.
593 */ 594 */
594 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 595 if (!(*errp) &&
596 ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 597 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
596 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 598 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3db5084db9bd..fac90f3fba80 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,16 +29,15 @@ struct ext4_system_zone {
29 29
30static struct kmem_cache *ext4_system_zone_cachep; 30static struct kmem_cache *ext4_system_zone_cachep;
31 31
32int __init init_ext4_system_zone(void) 32int __init ext4_init_system_zone(void)
33{ 33{
34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 34 ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
35 SLAB_RECLAIM_ACCOUNT);
36 if (ext4_system_zone_cachep == NULL) 35 if (ext4_system_zone_cachep == NULL)
37 return -ENOMEM; 36 return -ENOMEM;
38 return 0; 37 return 0;
39} 38}
40 39
41void exit_ext4_system_zone(void) 40void ext4_exit_system_zone(void)
42{ 41{
43 kmem_cache_destroy(ext4_system_zone_cachep); 42 kmem_cache_destroy(ext4_system_zone_cachep);
44} 43}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 374510f72baa..164c56092e58 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
39 struct file *filp); 39 struct file *filp);
40 40
41const struct file_operations ext4_dir_operations = { 41const struct file_operations ext4_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = ext4_llseek,
43 .read = generic_read_dir, 43 .read = generic_read_dir,
44 .readdir = ext4_readdir, /* we take BKL. needed?*/ 44 .readdir = ext4_readdir, /* we take BKL. needed?*/
45 .unlocked_ioctl = ext4_ioctl, 45 .unlocked_ioctl = ext4_ioctl,
@@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
60 return (ext4_filetype_table[filetype]); 60 return (ext4_filetype_table[filetype]);
61} 61}
62 62
63 63/*
64 * Return 0 if the directory entry is OK, and 1 if there is a problem
65 *
66 * Note: this is the opposite of what ext2 and ext3 historically returned...
67 */
64int __ext4_check_dir_entry(const char *function, unsigned int line, 68int __ext4_check_dir_entry(const char *function, unsigned int line,
65 struct inode *dir, 69 struct inode *dir, struct file *filp,
66 struct ext4_dir_entry_2 *de, 70 struct ext4_dir_entry_2 *de,
67 struct buffer_head *bh, 71 struct buffer_head *bh,
68 unsigned int offset) 72 unsigned int offset)
@@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
71 const int rlen = ext4_rec_len_from_disk(de->rec_len, 75 const int rlen = ext4_rec_len_from_disk(de->rec_len,
72 dir->i_sb->s_blocksize); 76 dir->i_sb->s_blocksize);
73 77
74 if (rlen < EXT4_DIR_REC_LEN(1)) 78 if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
75 error_msg = "rec_len is smaller than minimal"; 79 error_msg = "rec_len is smaller than minimal";
76 else if (rlen % 4 != 0) 80 else if (unlikely(rlen % 4 != 0))
77 error_msg = "rec_len % 4 != 0"; 81 error_msg = "rec_len % 4 != 0";
78 else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) 82 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
79 error_msg = "rec_len is too small for name_len"; 83 error_msg = "rec_len is too small for name_len";
80 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 84 else if (unlikely(((char *) de - bh->b_data) + rlen >
85 dir->i_sb->s_blocksize))
81 error_msg = "directory entry across blocks"; 86 error_msg = "directory entry across blocks";
82 else if (le32_to_cpu(de->inode) > 87 else if (unlikely(le32_to_cpu(de->inode) >
83 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) 88 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
84 error_msg = "inode out of bounds"; 89 error_msg = "inode out of bounds";
90 else
91 return 0;
85 92
86 if (error_msg != NULL) 93 if (filp)
87 ext4_error_inode(dir, function, line, bh->b_blocknr, 94 ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
88 "bad entry in directory: %s - " 95 "bad entry in directory: %s - offset=%u(%u), "
89 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 96 "inode=%u, rec_len=%d, name_len=%d",
90 error_msg, (unsigned) (offset%bh->b_size), offset, 97 error_msg, (unsigned) (offset%bh->b_size),
91 le32_to_cpu(de->inode), 98 offset, le32_to_cpu(de->inode),
92 rlen, de->name_len); 99 rlen, de->name_len);
93 return error_msg == NULL ? 1 : 0; 100 else
101 ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
102 "bad entry in directory: %s - offset=%u(%u), "
103 "inode=%u, rec_len=%d, name_len=%d",
104 error_msg, (unsigned) (offset%bh->b_size),
105 offset, le32_to_cpu(de->inode),
106 rlen, de->name_len);
107
108 return 1;
94} 109}
95 110
96static int ext4_readdir(struct file *filp, 111static int ext4_readdir(struct file *filp,
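__ext4_check_dir_entry() now returns 0 for a good entry and 1 for a bad one, the opposite of the historical ext2/ext3 convention (the comment added above says as much), so every caller flips its test; the ext4_readdir() hunk below is the first example. The new caller shape, with the recovery label left hypothetical:

if (ext4_check_dir_entry(inode, filp, de, bh, offset)) {
	/* bad entry: already logged via ext4_error_file() or
	 * ext4_error_inode(); do not trust its rec_len */
	goto skip_to_next_block;	/* hypothetical recovery label */
}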
@@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp,
152 */ 167 */
153 if (!bh) { 168 if (!bh) {
154 if (!dir_has_error) { 169 if (!dir_has_error) {
155 EXT4_ERROR_INODE(inode, "directory " 170 EXT4_ERROR_FILE(filp, 0,
156 "contains a hole at offset %Lu", 171 "directory contains a "
172 "hole at offset %llu",
157 (unsigned long long) filp->f_pos); 173 (unsigned long long) filp->f_pos);
158 dir_has_error = 1; 174 dir_has_error = 1;
159 } 175 }
@@ -194,8 +210,8 @@ revalidate:
194 while (!error && filp->f_pos < inode->i_size 210 while (!error && filp->f_pos < inode->i_size
195 && offset < sb->s_blocksize) { 211 && offset < sb->s_blocksize) {
196 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 212 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
197 if (!ext4_check_dir_entry(inode, de, 213 if (ext4_check_dir_entry(inode, filp, de,
198 bh, offset)) { 214 bh, offset)) {
199 /* 215 /*
200 * On error, skip the f_pos to the next block 216 * On error, skip the f_pos to the next block
201 */ 217 */
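
The hunk above inverts the return convention of __ext4_check_dir_entry(): zero now means the entry is sane, nonzero means corruption, and the wrapper marks the failure path unlikely(). A minimal sketch of the resulting caller pattern (condensed from the readdir hunk above; a fragment, not a complete function):

	/* Walk the entries of one directory block; bail out at the
	 * first corrupt entry.  Nonzero now means "bad entry" -- the
	 * opposite of what ext2/ext3 returned. */
	while (offset < sb->s_blocksize) {
		struct ext4_dir_entry_2 *de =
			(struct ext4_dir_entry_2 *)(bh->b_data + offset);

		if (ext4_check_dir_entry(inode, filp, de, bh, offset))
			break;	/* readdir then skips f_pos to the next block */
		offset += ext4_rec_len_from_disk(de->rec_len,
						 sb->s_blocksize);
	}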
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 889ec9d5e6ad..0c8d97b56f34 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -62,8 +62,8 @@
 #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
 	ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
 
-#define EXT4_ERROR_FILE(file, fmt, a...) \
-	ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
+#define EXT4_ERROR_FILE(file, block, fmt, a...) \
+	ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
 
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
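
With the extra block argument, EXT4_ERROR_FILE() lines up with EXT4_ERROR_INODE_BLOCK(), and ext4_error_file() can name a specific block (the printf format attribute shifts from 4,5 to 5,6 to match). Callers pass 0 when no particular block is implicated, as the readdir hunk above does:

	EXT4_ERROR_FILE(filp, 0,
			"directory contains a hole at offset %llu",
			(unsigned long long) filp->f_pos);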
@@ -168,7 +168,20 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define EXT4_IO_UNWRITTEN 0x1
+
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define EXT4_IO_END_UNWRITTEN	0x0001
+#define EXT4_IO_END_ERROR	0x0002
+
+struct ext4_io_page {
+	struct page	*p_page;
+	atomic_t	p_count;
+};
+
+#define MAX_IO_PAGES	128
+
 typedef struct ext4_io_end {
 	struct list_head	list;	/* per-file finished IO list */
 	struct inode		*inode;	/* file being written to */
@@ -179,8 +192,18 @@ typedef struct ext4_io_end {
 	struct work_struct	work;	/* data work queue */
 	struct kiocb		*iocb;	/* iocb struct for AIO */
 	int			result;	/* error value for AIO */
+	int			num_io_pages;
+	struct ext4_io_page	*pages[MAX_IO_PAGES];
 } ext4_io_end_t;
 
+struct ext4_io_submit {
+	int			io_op;
+	struct bio		*io_bio;
+	ext4_io_end_t		*io_end;
+	struct ext4_io_page	*io_page;
+	sector_t		io_next_block;
+};
+
 /*
  * Special inodes numbers
 */
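
ext4_io_page pins a page while any bio still references part of it, and ext4_io_submit carries the bio being assembled across pages so writeback can build multi-block bios. A hedged sketch of the pairing these fields suggest (io_page_cachep and the exact refcount protocol are assumptions inferred from the struct, not shown in this diff):

	/* One wrapper per page under IO; each bio segment covering the
	 * page would take a reference, and the last put ends page
	 * writeback (illustrative only). */
	struct ext4_io_page *io_page;

	io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);	/* assumed cache */
	io_page->p_page = page;
	atomic_set(&io_page->p_count, 1);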
@@ -205,6 +228,7 @@ typedef struct ext4_io_end {
 #define EXT4_MIN_BLOCK_SIZE		1024
 #define EXT4_MAX_BLOCK_SIZE		65536
 #define EXT4_MIN_BLOCK_LOG_SIZE		10
+#define EXT4_MAX_BLOCK_LOG_SIZE		16
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)		((s)->s_blocksize)
 #else
@@ -537,23 +561,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 #endif
 
-
-/*
- * Mount options
- */
-struct ext4_mount_options {
-	unsigned long s_mount_opt;
-	uid_t s_resuid;
-	gid_t s_resgid;
-	unsigned long s_commit_interval;
-	u32 s_min_batch_time, s_max_batch_time;
-#ifdef CONFIG_QUOTA
-	int s_jquota_fmt;
-	char *s_qf_names[MAXQUOTAS];
-#endif
-};
-
-/* Max physical block we can addres w/o extents */
+/* Max physical block we can address w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
 
 /*
@@ -685,6 +693,8 @@ do { \
 	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
 		ext4_decode_extra_time(&(inode)->xtime, \
 				       raw_inode->xtime ## _extra); \
+	else \
+		(inode)->xtime.tv_nsec = 0; \
 } while (0)
 
 #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
@@ -695,6 +705,8 @@ do { \
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
 		ext4_decode_extra_time(&(einode)->xtime, \
 				       raw_inode->xtime ## _extra); \
+	else \
+		(einode)->xtime.tv_nsec = 0; \
 } while (0)
 
 #define i_disk_version osd1.linux1.l_i_version
@@ -726,12 +738,13 @@ do { \
 
 /*
  * storage for cached extent
+ * If ec_len == 0, then the cache is invalid.
+ * If ec_start == 0, then the cache represents a gap (null mapping)
  */
 struct ext4_ext_cache {
 	ext4_fsblk_t	ec_start;
 	ext4_lblk_t	ec_block;
 	__u32		ec_len; /* must be 32bit to return holes */
-	__u32		ec_type;
 };
 
 /*
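
With ec_type gone, the cache state is encoded in the fields that remain, exactly as the new comment says: ec_len == 0 means the cache is invalid, and ec_start == 0 means it caches a gap. Hypothetical helpers spelling out that encoding (the patch open-codes these tests in extents.c further down; these names are illustrative, not from the patch):

	static inline int ext4_ext_cache_valid(const struct ext4_ext_cache *cex)
	{
		return cex->ec_len != 0;	/* zero length => no cached data */
	}

	static inline int ext4_ext_cache_is_gap(const struct ext4_ext_cache *cex)
	{
		return cex->ec_start == 0;	/* null mapping => cached gap */
	}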
@@ -750,10 +763,12 @@ struct ext4_inode_info {
 	 * near to their parent directory's inode.
 	 */
 	ext4_group_t	i_block_group;
+	ext4_lblk_t	i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
 	unsigned long	i_state_flags;		/* Dynamic state flags */
+#endif
 	unsigned long	i_flags;
 
-	ext4_lblk_t	i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
@@ -796,7 +811,7 @@ struct ext4_inode_info {
 	 */
 	struct rw_semaphore i_data_sem;
 	struct inode vfs_inode;
-	struct jbd2_inode jinode;
+	struct jbd2_inode *jinode;
 
 	struct ext4_ext_cache i_cached_extent;
 	/*
@@ -816,14 +831,12 @@ struct ext4_inode_info {
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
 	unsigned int i_allocated_meta_blocks;
-	unsigned short i_delalloc_reserved_flag;
-	sector_t i_da_metadata_calc_last_lblock;
+	ext4_lblk_t i_da_metadata_calc_last_lblock;
 	int i_da_metadata_calc_len;
 
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
-	spinlock_t i_block_reservation_lock;
 #ifdef CONFIG_QUOTA
 	/* quota space reservation, managed internally by quota code */
 	qsize_t i_reserved_quota;
@@ -832,9 +845,12 @@ struct ext4_inode_info {
 	/* completed IOs that might need unwritten extents handling */
 	struct list_head i_completed_io_list;
 	spinlock_t i_completed_io_lock;
+	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
 
+	spinlock_t i_block_reservation_lock;
+
 	/*
 	 * Transactions that contain inode's metadata needed to complete
 	 * fsync and fdatasync, respectively.
@@ -885,16 +901,27 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
-#define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
-#define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
+#define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
+						~EXT4_MOUNT_##opt
+#define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
+						EXT4_MOUNT_##opt
 #define test_opt(sb, opt)		(EXT4_SB(sb)->s_mount_opt & \
 					 EXT4_MOUNT_##opt)
 
+#define clear_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 &= \
+						~EXT4_MOUNT2_##opt
+#define set_opt2(sb, opt)		EXT4_SB(sb)->s_mount_opt2 |= \
+						EXT4_MOUNT2_##opt
+#define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
+					 EXT4_MOUNT2_##opt)
+
 #define ext4_set_bit			ext2_set_bit
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
 #define ext4_clear_bit			ext2_clear_bit
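
set_opt()/clear_opt() now take the superblock rather than a bare flags word, so call sites no longer spell out the s_mount_opt member themselves, and the new *_opt2 variants address the second flags word added to ext4_sb_info below. An illustrative call site (DELALLOC is from the flag table above; the call site itself is hypothetical):

	/* before: the caller named the flags word */
	set_opt(sbi->s_mount_opt, DELALLOC);

	/* after: the macro reaches through EXT4_SB(sb) itself */
	set_opt(sb, DELALLOC);
	if (test_opt(sb, DELALLOC))
		clear_opt(sb, DELALLOC);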
@@ -1060,6 +1087,7 @@ struct ext4_sb_info {
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head **s_group_desc;
 	unsigned int s_mount_opt;
+	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
@@ -1087,7 +1115,6 @@ struct ext4_sb_info {
 	struct completion s_kobj_unregister;
 
 	/* Journaling */
-	struct inode *s_journal_inode;
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
@@ -1120,10 +1147,7 @@ struct ext4_sb_info {
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
 	struct inode *s_buddy_cache;
-	long s_blocks_reserved;
-	spinlock_t s_reserve_lock;
 	spinlock_t s_md_lock;
-	tid_t s_last_transaction;
 	unsigned short *s_mb_offsets;
 	unsigned int *s_mb_maxs;
 
@@ -1141,7 +1165,6 @@ struct ext4_sb_info {
 	unsigned long s_mb_last_start;
 
 	/* stats for buddy allocator */
-	spinlock_t s_mb_pa_lock;
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
 	atomic_t s_bal_success;	/* we found long enough chunks */
 	atomic_t s_bal_allocated;	/* in blocks */
@@ -1172,6 +1195,11 @@ struct ext4_sb_info {
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
+
+	/* Lazy inode table initialization info */
+	struct ext4_li_request *s_li_request;
+	/* Wait multiplier for lazy initialization thread */
+	unsigned int s_li_wait_mult;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1210,24 +1238,39 @@ enum {
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
+	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
 };
 
-#define EXT4_INODE_BIT_FNS(name, field) \
+#define EXT4_INODE_BIT_FNS(name, field, offset) \
 static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
 { \
-	return test_bit(bit, &EXT4_I(inode)->i_##field); \
+	return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
 } \
 static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
 { \
-	set_bit(bit, &EXT4_I(inode)->i_##field); \
+	set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
 } \
 static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
 { \
-	clear_bit(bit, &EXT4_I(inode)->i_##field); \
+	clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
 }
 
-EXT4_INODE_BIT_FNS(flag, flags)
-EXT4_INODE_BIT_FNS(state, state_flags)
+EXT4_INODE_BIT_FNS(flag, flags, 0)
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	(ei)->i_state_flags = 0;
+}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32)
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+	/* We depend on the fact that callers will set i_flags */
+}
+#endif
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
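
On 64-bit kernels the dynamic state bits now live in the upper half of i_flags, saving one unsigned long per inode; the new offset parameter is how the same accessors address either layout. What the 64-bit expansion of EXT4_INODE_BIT_FNS(state, flags, 32) boils down to, written out by hand for illustration:

	static inline void ext4_set_inode_state(struct inode *inode, int bit)
	{
		/* state bits occupy bits 32..63 of i_flags; the
		 * persistent EXT4_INODE_* flags keep bits 0..31 */
		set_bit(bit + 32, &EXT4_I(inode)->i_flags);
	}

This is also why the 64-bit ext4_clear_state_flags() can be a no-op: callers are expected to (re)initialize the whole i_flags word anyway.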
@@ -1533,7 +1576,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
-extern struct proc_dir_entry *ext4_proc_root;
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT		10
+#define EXT4_DEF_LI_MAX_START_DELAY	5
+#define EXT4_LAZYINIT_QUIT		0x0001
+#define EXT4_LAZYINIT_RUNNING		0x0002
+
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+	unsigned long		li_state;
+
+	wait_queue_head_t	li_wait_daemon;
+	wait_queue_head_t	li_wait_task;
+	struct timer_list	li_timer;
+	struct task_struct	*li_task;
+
+	struct list_head	li_request_list;
+	struct mutex		li_list_mtx;
+};
+
+struct ext4_li_request {
+	struct super_block	*lr_super;
+	struct ext4_sb_info	*lr_sbi;
+	ext4_group_t		lr_next_group;
+	struct list_head	lr_request;
+	unsigned long		lr_next_sched;
+	unsigned long		lr_timeout;
+};
+
+struct ext4_features {
+	struct kobject f_kobj;
+	struct completion f_kobj_unregister;
+};
 
 /*
  * Function prototypes
@@ -1561,7 +1639,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 			ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1581,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 
 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+				  struct file *,
 				  struct ext4_dir_entry_2 *,
 				  struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, de, bh, offset) \
-	__ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
+#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
+	unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+					(de), (bh), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 				    __u32 minor_hash,
 				    struct ext4_dir_entry_2 *dirent);
@@ -1592,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, int);
+extern int ext4_flush_completed_IO(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1605,11 +1685,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+				 ext4_group_t group, int barrier);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1620,16 +1698,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
-extern int __init init_ext4_mballoc(void);
-extern void exit_ext4_mballoc(void);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh, ext4_fsblk_t block,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
-						ext4_group_t, int);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
@@ -1657,13 +1734,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 /* ioctl.c */
@@ -1696,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int,
 			     ext4_fsblk_t, const char *, ...)
 	__attribute__ ((format (printf, 5, 6)));
 extern void ext4_error_file(struct file *, const char *, unsigned int,
-			    const char *, ...)
-	__attribute__ ((format (printf, 4, 5)));
+			    ext4_fsblk_t, const char *, ...)
+	__attribute__ ((format (printf, 5, 6)));
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern void __ext4_abort(struct super_block *, const char *, unsigned int,
@@ -1960,6 +2035,7 @@ extern const struct file_operations ext4_dir_operations;
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
@@ -1973,8 +2049,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 /* block_validity */
 extern void ext4_release_system_zone(struct super_block *sb);
 extern int ext4_setup_system_zone(struct super_block *sb);
-extern int __init init_ext4_system_zone(void);
-extern void exit_ext4_system_zone(void);
+extern int __init ext4_init_system_zone(void);
+extern void ext4_exit_system_zone(void);
 extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
 				 ext4_fsblk_t start_blk,
 				 unsigned int count);
@@ -1989,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
-extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 			  ssize_t len);
@@ -2002,6 +2078,18 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
 
+/* page-io.c */
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
+extern void ext4_free_io_end(ext4_io_end_t *io);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern int ext4_end_io_nolock(ext4_io_end_t *io);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+			       struct page *page,
+			       int len,
+			       struct writeback_control *wbc);
 
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
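
The page-io.c prototypes complete the picture: writeback queues pages into an ext4_io_submit context and flushes once at the end, so a single bio can span many blocks (gated by the EXT4_MOUNT_MBLK_IO_SUBMIT option above). A hedged sketch of the call sequence these signatures imply (the surrounding writepages loop and error handling are elided; the io_op value is illustrative):

	struct ext4_io_submit io;
	int ret;

	memset(&io, 0, sizeof(io));	/* io_bio == NULL: nothing queued yet */
	io.io_op = WRITE;		/* or WRITE_SYNC for synchronous writeback */

	/* queue each dirty page; contiguous blocks share io.io_bio */
	ret = ext4_bio_write_page(&io, page, len, wbc);

	/* flush whatever bio is still being assembled */
	ext4_io_submit(&io);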
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb4..2e29abb30f76 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -119,10 +119,6 @@ struct ext4_ext_path {
  * structure for external API
  */
 
-#define EXT4_EXT_CACHE_NO	0
-#define EXT4_EXT_CACHE_GAP	1
-#define EXT4_EXT_CACHE_EXTENT	2
-
 /*
  * to be called by ext4_ext_walk_space()
  * negative retcode - error
@@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode)
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
-	EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
+	EXT4_I(inode)->i_cached_extent.ec_len = 0;
 }
 
 static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
@@ -225,11 +221,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
 	ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
 }
 
+/*
+ * ext4_ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ex->ee_start_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ix->ei_leaf_lo);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+					 ext4_fsblk_t pb)
+{
+	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				      0xffff);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+					 ext4_fsblk_t pb)
+{
+	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+				     0xffff);
+}
+
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-					 sector_t lblocks);
-extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
-extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
-extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
+					 ext4_lblk_t lblocks);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 						   int num,
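
Moving these helpers inline also makes the 48-bit split easy to see: the low 32 bits of the physical block live in ee_start_lo, the high 16 bits in ee_start_hi, and the double shift << 31 << 1 is simply << 32 performed in two steps on the widened 64-bit value. A worked example with an arbitrary block number (values illustrative):

	struct ext4_extent ex;

	ext4_ext_store_pblock(&ex, 0x123456789abULL);
	/* ex.ee_start_lo == cpu_to_le32(0x456789ab)   (pb & 0xffffffff)  */
	/* ex.ee_start_hi == cpu_to_le16(0x0123)       ((pb >> 31) >> 1)  */

	/* ext4_ext_pblock(&ex) reassembles the pieces:
	 *   0x456789ab | (((ext4_fsblk_t) 0x0123 << 31) << 1)
	 *   == 0x123456789ab */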
@@ -237,19 +282,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 extern int ext4_can_extents_be_merged(struct inode *inode,
 				      struct ext4_extent *ex1,
 				      struct ext4_extent *ex2);
-extern int ext4_ext_try_to_merge(struct inode *inode,
-				 struct ext4_ext_path *path,
-				 struct ext4_extent *);
-extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
-							ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 							struct ext4_ext_path *);
-extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
-						ext4_lblk_t *, ext4_fsblk_t *);
-extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
-						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b0bd792c58c5..d8b992e658c1 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
 	if (ext4_handle_valid(handle))
-		return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+		return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
 	return 0;
 }
 
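
Since EXT4_I(inode)->jinode is now a pointer rather than an embedded struct, the jbd2_inode only needs to exist for inodes that actually order or journal data; this one-character change just passes the pointer through. A defensive variant of the same helper, with a NULL check the patch itself does not add (the allocation site lives elsewhere in this series and is not shown here):

	static inline int ext4_jbd2_file_inode(handle_t *handle,
					       struct inode *inode)
	{
		/* assumption: jinode is allocated before the first
		 * journaled data write; tolerate NULL anyway */
		if (ext4_handle_valid(handle) && EXT4_I(inode)->jinode)
			return jbd2_journal_file_inode(handle,
						       EXT4_I(inode)->jinode);
		return 0;
	}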
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3e5717..63a75810b7c3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,55 +44,6 @@
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
-
-/*
- * ext_pblock:
- * combine low and high parts of physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
-{
-	ext4_fsblk_t block;
-
-	block = le32_to_cpu(ex->ee_start_lo);
-	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
-	return block;
-}
-
-/*
- * idx_pblock:
- * combine low and high parts of a leaf physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
-{
-	ext4_fsblk_t block;
-
-	block = le32_to_cpu(ix->ei_leaf_lo);
-	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
-	return block;
-}
-
-/*
- * ext4_ext_store_pblock:
- * stores a large physical block number into an extent struct,
- * breaking it into parts
- */
-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
-{
-	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
-/*
- * ext4_idx_store_pblock:
- * stores a large physical block number into an index struct,
- * breaking it into parts
- */
-static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
-{
-	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -166,10 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 		struct ext4_extent *ex;
 		depth = path->p_depth;
 
-		/* try to predict block placement */
+		/*
+		 * Try to predict block placement assuming that we are
+		 * filling in a file which will eventually be
+		 * non-sparse --- i.e., in the case of libbfd writing
+		 * an ELF object sections out-of-order but in a way
+		 * that eventually results in a contiguous object or
+		 * executable file, or some database extending a table
+		 * space file.  However, this is actually somewhat
+		 * non-ideal if we are writing a sparse file such as
+		 * qemu or KVM writing a raw image file that is going
+		 * to stay fairly sparse, since it will end up
+		 * fragmenting the file system's free space.  Maybe we
+		 * should have some heuristics or some way to allow
+		 * userspace to pass a hint to file system,
+		 * especially if the latter case turns out to be
+		 * common.
+		 */
 		ex = path[depth].p_ext;
-		if (ex)
-			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+		if (ex) {
+			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
+			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+
+			if (block > ext_block)
+				return ext_pblk + (block - ext_block);
+			else
+				return ext_pblk - (ext_block - block);
+		}
 
 		/* it looks like index is empty;
 		 * try to find starting block from index itself */
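
Besides the longer comment, this hunk fixes a real arithmetic bug: the old one-liner computed ext_pblock(ex) + (block - ext_block) even when block sat to the left of the nearest extent, and with an unsigned ext4_lblk_t that difference wraps. Worked numbers (illustrative values):

	/* nearest extent: logical 100 -> physical 5000; goal wanted
	 * for logical block 40, which lies left of the extent */
	ext4_lblk_t block = 40, ext_block = 100;
	ext4_fsblk_t ext_pblk = 5000;

	/* old: 5000 + (ext4_lblk_t)(40 - 100) == 5000 + 4294967236 */
	/* new: block <= ext_block, so 5000 - (100 - 40) == 4940    */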
@@ -292,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
  * to allocate @blocks
  * Worse case is one block per extent
  */
-int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	int idxs, num = 0;
@@ -354,7 +328,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-	ext4_fsblk_t block = ext_pblock(ext);
+	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
 
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +337,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 static int ext4_valid_extent_idx(struct inode *inode,
 				struct ext4_extent_idx *ext_idx)
 {
-	ext4_fsblk_t block = idx_pblock(ext_idx);
+	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
@@ -463,13 +437,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 	for (k = 0; k <= l; k++, path++) {
 		if (path->p_idx) {
 			ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
-				  idx_pblock(path->p_idx));
+				  ext4_idx_pblock(path->p_idx));
 		} else if (path->p_ext) {
 			ext_debug("  %d:[%d]%d:%llu ",
 				  le32_to_cpu(path->p_ext->ee_block),
 				  ext4_ext_is_uninitialized(path->p_ext),
 				  ext4_ext_get_actual_len(path->p_ext),
-				  ext_pblock(path->p_ext));
+				  ext4_ext_pblock(path->p_ext));
 		} else
 			ext_debug("  []");
 	}
@@ -494,7 +468,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
 			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
+			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
 	}
 	ext_debug("\n");
 }
@@ -545,7 +519,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
 
 	path->p_idx = l - 1;
 	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
-		  idx_pblock(path->p_idx));
+		  ext4_idx_pblock(path->p_idx));
 
 #ifdef CHECK_BINSEARCH
 	{
@@ -614,7 +588,7 @@ ext4_ext_binsearch(struct inode *inode,
 	path->p_ext = l - 1;
 	ext_debug("  -> %d:%llu:[%d]%d ",
 		  le32_to_cpu(path->p_ext->ee_block),
-		  ext_pblock(path->p_ext),
+		  ext4_ext_pblock(path->p_ext),
 		  ext4_ext_is_uninitialized(path->p_ext),
 		  ext4_ext_get_actual_len(path->p_ext));
 
@@ -682,7 +656,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
 		ext4_ext_binsearch_idx(inode, path + ppos, block);
-		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 		path[ppos].p_depth = i;
 		path[ppos].p_ext = NULL;
 
@@ -721,7 +695,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	ext4_ext_binsearch(inode, path + ppos, block);
 	/* if not an empty leaf */
 	if (path[ppos].p_ext)
-		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 
 	ext4_ext_show_path(inode, path);
 
@@ -739,9 +713,9 @@ err:
  * insert new index [@logical;@ptr] into the block at @curp;
  * check where to insert: before @curp or after @curp
  */
-int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *curp,
 				int logical, ext4_fsblk_t ptr)
 {
 	struct ext4_extent_idx *ix;
 	int len, err;
@@ -917,7 +891,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			EXT_MAX_EXTENT(path[depth].p_hdr)) {
 		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext_pblock(path[depth].p_ext),
+				ext4_ext_pblock(path[depth].p_ext),
 				ext4_ext_is_uninitialized(path[depth].p_ext),
 				ext4_ext_get_actual_len(path[depth].p_ext),
 				newblock);
@@ -1007,7 +981,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
 		ext_debug("%d: move %d:%llu in new index %llu\n", i,
 				le32_to_cpu(path[i].p_idx->ei_block),
-				idx_pblock(path[i].p_idx),
+				ext4_idx_pblock(path[i].p_idx),
 				newblock);
 		/*memmove(++fidx, path[i].p_idx++,
 			sizeof(struct ext4_extent_idx));
@@ -1146,7 +1120,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
 		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
 		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
-		  idx_pblock(EXT_FIRST_INDEX(neh)));
+		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
 	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
 	err = ext4_ext_dirty(handle, inode, curp);
@@ -1232,9 +1206,9 @@ out:
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
-int
-ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_search_left(struct inode *inode,
+				struct ext4_ext_path *path,
 			ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
@@ -1286,7 +1260,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	}
 
 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
-	*phys = ext_pblock(ex) + ee_len - 1;
+	*phys = ext4_ext_pblock(ex) + ee_len - 1;
 	return 0;
 }
 
@@ -1297,9 +1271,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
  * returns 0 at @phys
  * return value contains 0 (success) or error code
 */
-int
-ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_search_right(struct inode *inode,
+				 struct ext4_ext_path *path,
 			ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
 	struct buffer_head *bh = NULL;
 	struct ext4_extent_header *eh;
@@ -1342,7 +1316,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 		}
 	}
 	*logical = le32_to_cpu(ex->ee_block);
-	*phys = ext_pblock(ex);
+	*phys = ext4_ext_pblock(ex);
 	return 0;
 	}
 
@@ -1357,7 +1331,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 		/* next allocated block in this leaf */
 		ex++;
 		*logical = le32_to_cpu(ex->ee_block);
-		*phys = ext_pblock(ex);
+		*phys = ext4_ext_pblock(ex);
 		return 0;
 	}
 
@@ -1376,7 +1350,7 @@ got_index:
 	 * follow it and find the closest allocated
 	 * block to the right */
 	ix++;
-	block = idx_pblock(ix);
+	block = ext4_idx_pblock(ix);
 	while (++depth < path->p_depth) {
 		bh = sb_bread(inode->i_sb, block);
 		if (bh == NULL)
@@ -1388,7 +1362,7 @@ got_index:
 			return -EIO;
 		}
 		ix = EXT_FIRST_INDEX(eh);
-		block = idx_pblock(ix);
+		block = ext4_idx_pblock(ix);
 		put_bh(bh);
 	}
 
@@ -1402,7 +1376,7 @@ got_index:
 	}
 	ex = EXT_FIRST_EXTENT(eh);
 	*logical = le32_to_cpu(ex->ee_block);
-	*phys = ext_pblock(ex);
+	*phys = ext4_ext_pblock(ex);
 	put_bh(bh);
 	return 0;
 }
@@ -1573,7 +1547,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 		return 0;
 #endif
 
-	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
 		return 1;
 	return 0;
 }
@@ -1585,9 +1559,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
-int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge(struct inode *inode,
 			  struct ext4_ext_path *path,
 			  struct ext4_extent *ex)
 {
 	struct ext4_extent_header *eh;
 	unsigned int depth, len;
@@ -1632,9 +1606,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
-unsigned int ext4_ext_check_overlap(struct inode *inode,
+static unsigned int ext4_ext_check_overlap(struct inode *inode,
 				    struct ext4_extent *newext,
 				    struct ext4_ext_path *path)
 {
 	ext4_lblk_t b1, b2;
 	unsigned int depth, len1;
@@ -1706,11 +1680,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
 			  ext4_ext_is_uninitialized(newext),
 			  ext4_ext_get_actual_len(newext),
 			  le32_to_cpu(ex->ee_block),
 			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
+			  ext4_ext_get_actual_len(ex),
+			  ext4_ext_pblock(ex));
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			return err;
@@ -1780,7 +1755,7 @@ has_space:
 		/* there is no extent in this leaf, create first one */
 		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1794,7 +1769,7 @@ has_space:
 		ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
@@ -1808,7 +1783,7 @@ has_space:
 		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
-				ext_pblock(newext),
+				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
@@ -1819,7 +1794,7 @@ has_space:
 	le16_add_cpu(&eh->eh_entries, 1);
 	nearex = path[depth].p_ext;
 	nearex->ee_block = newext->ee_block;
-	ext4_ext_store_pblock(nearex, ext_pblock(newext));
+	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
 	nearex->ee_len = newext->ee_len;
 
 merge:
@@ -1845,9 +1820,9 @@ cleanup:
 	return err;
 }
 
-int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			ext4_lblk_t num, ext_prepare_callback func,
 			void *cbdata)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_ext_cache cbex;
@@ -1919,12 +1894,10 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			cbex.ec_block = start;
 			cbex.ec_len = end - start;
 			cbex.ec_start = 0;
-			cbex.ec_type = EXT4_EXT_CACHE_GAP;
 		} else {
 			cbex.ec_block = le32_to_cpu(ex->ee_block);
 			cbex.ec_len = ext4_ext_get_actual_len(ex);
-			cbex.ec_start = ext_pblock(ex);
-			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
+			cbex.ec_start = ext4_ext_pblock(ex);
 		}
 
 		if (unlikely(cbex.ec_len == 0)) {
@@ -1964,13 +1937,12 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
-			__u32 len, ext4_fsblk_t start, int type)
+			__u32 len, ext4_fsblk_t start)
 {
 	struct ext4_ext_cache *cex;
 	BUG_ON(len == 0);
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
-	cex->ec_type = type;
 	cex->ec_block = block;
 	cex->ec_len = len;
 	cex->ec_start = start;
@@ -2023,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 	}
 
 	ext_debug(" -> %u:%lu\n", lblock, len);
-	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
+	ext4_ext_put_in_cache(inode, lblock, len, 0);
 }
 
+/*
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
 static int
 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 			struct ext4_extent *ex)
 {
 	struct ext4_ext_cache *cex;
-	int ret = EXT4_EXT_CACHE_NO;
+	int ret = 0;
 
 	/*
 	 * We borrow i_block_reservation_lock to protect i_cached_extent
@@ -2040,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2040 cex = &EXT4_I(inode)->i_cached_extent; 2015 cex = &EXT4_I(inode)->i_cached_extent;
2041 2016
2042 /* has cache valid data? */ 2017 /* has cache valid data? */
2043 if (cex->ec_type == EXT4_EXT_CACHE_NO) 2018 if (cex->ec_len == 0)
2044 goto errout; 2019 goto errout;
2045 2020
2046 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
2047 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
2048 if (in_range(block, cex->ec_block, cex->ec_len)) { 2021 if (in_range(block, cex->ec_block, cex->ec_len)) {
2049 ex->ee_block = cpu_to_le32(cex->ec_block); 2022 ex->ee_block = cpu_to_le32(cex->ec_block);
2050 ext4_ext_store_pblock(ex, cex->ec_start); 2023 ext4_ext_store_pblock(ex, cex->ec_start);
@@ -2052,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2052 ext_debug("%u cached by %u:%u:%llu\n", 2025 ext_debug("%u cached by %u:%u:%llu\n",
2053 block, 2026 block,
2054 cex->ec_block, cex->ec_len, cex->ec_start); 2027 cex->ec_block, cex->ec_len, cex->ec_start);
2055 ret = cex->ec_type; 2028 ret = 1;
2056 } 2029 }
2057errout: 2030errout:
2058 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2031 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
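
With ec_type gone, callers decode the single-entry cache purely from the remaining fields: ec_len == 0 means the cache holds nothing, and ec_start == 0 marks a cached gap (hole) rather than a mapped extent. A minimal sketch of the decode logic, assuming a hypothetical cache_lookup() helper and a struct that mirrors ext4_ext_cache without the removed field:

	/* Sketch only: how a caller distinguishes hole/extent/miss after
	 * the ec_type removal. cache_lookup() is hypothetical. */
	struct ext_cache {
		unsigned int		ec_block;	/* first logical block */
		unsigned int		ec_len;		/* 0 => nothing cached */
		unsigned long long	ec_start;	/* 0 => cached gap */
	};

	static int cache_lookup(struct ext_cache *cex, unsigned int lblk)
	{
		if (cex->ec_len == 0)
			return -1;	/* cache invalid: fall back to a tree walk */
		if (lblk < cex->ec_block || lblk - cex->ec_block >= cex->ec_len)
			return -1;	/* block not covered by the cached range */
		if (cex->ec_start == 0)
			return 0;	/* cached gap: the block is a hole */
		return 1;		/* cached extent: the block is mapped */
	}
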
@@ -2073,7 +2046,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2073 2046
2074 /* free index block */ 2047 /* free index block */
2075 path--; 2048 path--;
2076 leaf = idx_pblock(path->p_idx); 2049 leaf = ext4_idx_pblock(path->p_idx);
2077 if (unlikely(path->p_hdr->eh_entries == 0)) { 2050 if (unlikely(path->p_hdr->eh_entries == 0)) {
2078 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2051 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2079 return -EIO; 2052 return -EIO;
@@ -2181,7 +2154,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2181 ext4_fsblk_t start; 2154 ext4_fsblk_t start;
2182 2155
2183 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2156 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2184 start = ext_pblock(ex) + ee_len - num; 2157 start = ext4_ext_pblock(ex) + ee_len - num;
2185 ext_debug("free last %u blocks starting %llu\n", num, start); 2158 ext_debug("free last %u blocks starting %llu\n", num, start);
2186 ext4_free_blocks(handle, inode, 0, start, num, flags); 2159 ext4_free_blocks(handle, inode, 0, start, num, flags);
2187 } else if (from == le32_to_cpu(ex->ee_block) 2160 } else if (from == le32_to_cpu(ex->ee_block)
@@ -2310,7 +2283,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2310 goto out; 2283 goto out;
2311 2284
2312 ext_debug("new extent: %u:%u:%llu\n", block, num, 2285 ext_debug("new extent: %u:%u:%llu\n", block, num,
2313 ext_pblock(ex)); 2286 ext4_ext_pblock(ex));
2314 ex--; 2287 ex--;
2315 ex_ee_block = le32_to_cpu(ex->ee_block); 2288 ex_ee_block = le32_to_cpu(ex->ee_block);
2316 ex_ee_len = ext4_ext_get_actual_len(ex); 2289 ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2421,9 +2394,9 @@ again:
2421 struct buffer_head *bh; 2394 struct buffer_head *bh;
2422 /* go to the next level */ 2395 /* go to the next level */
2423 ext_debug("move to level %d (block %llu)\n", 2396 ext_debug("move to level %d (block %llu)\n",
2424 i + 1, idx_pblock(path[i].p_idx)); 2397 i + 1, ext4_idx_pblock(path[i].p_idx));
2425 memset(path + i + 1, 0, sizeof(*path)); 2398 memset(path + i + 1, 0, sizeof(*path));
2426 bh = sb_bread(sb, idx_pblock(path[i].p_idx)); 2399 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2427 if (!bh) { 2400 if (!bh) {
2428 /* should we reset i_size? */ 2401 /* should we reset i_size? */
2429 err = -EIO; 2402 err = -EIO;
@@ -2535,77 +2508,21 @@ void ext4_ext_release(struct super_block *sb)
2535#endif 2508#endif
2536} 2509}
2537 2510
2538static void bi_complete(struct bio *bio, int error)
2539{
2540 complete((struct completion *)bio->bi_private);
2541}
2542
2543/* FIXME!! we need to try to merge to left or right after zero-out */ 2511/* FIXME!! we need to try to merge to left or right after zero-out */
2544static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2512static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2545{ 2513{
2514 ext4_fsblk_t ee_pblock;
2515 unsigned int ee_len;
2546 int ret; 2516 int ret;
2547 struct bio *bio;
2548 int blkbits, blocksize;
2549 sector_t ee_pblock;
2550 struct completion event;
2551 unsigned int ee_len, len, done, offset;
2552
2553 2517
2554 blkbits = inode->i_blkbits;
2555 blocksize = inode->i_sb->s_blocksize;
2556 ee_len = ext4_ext_get_actual_len(ex); 2518 ee_len = ext4_ext_get_actual_len(ex);
2557 ee_pblock = ext_pblock(ex); 2519 ee_pblock = ext4_ext_pblock(ex);
2558
2559 /* convert ee_pblock to 512 byte sectors */
2560 ee_pblock = ee_pblock << (blkbits - 9);
2561
2562 while (ee_len > 0) {
2563
2564 if (ee_len > BIO_MAX_PAGES)
2565 len = BIO_MAX_PAGES;
2566 else
2567 len = ee_len;
2568
2569 bio = bio_alloc(GFP_NOIO, len);
2570 if (!bio)
2571 return -ENOMEM;
2572
2573 bio->bi_sector = ee_pblock;
2574 bio->bi_bdev = inode->i_sb->s_bdev;
2575
2576 done = 0;
2577 offset = 0;
2578 while (done < len) {
2579 ret = bio_add_page(bio, ZERO_PAGE(0),
2580 blocksize, offset);
2581 if (ret != blocksize) {
2582 /*
2583 * We can't add any more pages because of
2584 * hardware limitations. Start a new bio.
2585 */
2586 break;
2587 }
2588 done++;
2589 offset += blocksize;
2590 if (offset >= PAGE_CACHE_SIZE)
2591 offset = 0;
2592 }
2593 2520
2594 init_completion(&event); 2521 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2595 bio->bi_private = &event; 2522 if (ret > 0)
2596 bio->bi_end_io = bi_complete; 2523 ret = 0;
2597 submit_bio(WRITE, bio);
2598 wait_for_completion(&event);
2599 2524
2600 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2525 return ret;
2601 bio_put(bio);
2602 return -EIO;
2603 }
2604 bio_put(bio);
2605 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9);
2607 }
2608 return 0;
2609} 2526}
2610 2527
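
The open-coded bio loop above is replaced by a single call to the block layer's sb_issue_zeroout() helper, which zeroes a run of filesystem blocks and waits for completion. A hedged sketch of the resulting pattern (kernel context assumed; the positive-return normalization mirrors what the patch does):

	/* Sketch: zero an extent's blocks via sb_issue_zeroout() instead of
	 * building bios by hand. pblk/len would come from ext4_ext_pblock()
	 * and ext4_ext_get_actual_len() as in the patched function. */
	static int zeroout_range(struct super_block *sb,
				 ext4_fsblk_t pblk, unsigned int len)
	{
		int ret = sb_issue_zeroout(sb, pblk, len, GFP_NOFS);

		if (ret > 0)	/* normalize any positive return to success */
			ret = 0;
		return ret;
	}

Besides being shorter, this drops the manual sector conversion and BIO_MAX_PAGES chunking, since the helper handles request sizing itself.
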
2611#define EXT4_EXT_ZERO_LEN 7 2528#define EXT4_EXT_ZERO_LEN 7
@@ -2651,12 +2568,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2651 ee_block = le32_to_cpu(ex->ee_block); 2568 ee_block = le32_to_cpu(ex->ee_block);
2652 ee_len = ext4_ext_get_actual_len(ex); 2569 ee_len = ext4_ext_get_actual_len(ex);
2653 allocated = ee_len - (map->m_lblk - ee_block); 2570 allocated = ee_len - (map->m_lblk - ee_block);
2654 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2571 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2655 2572
2656 ex2 = ex; 2573 ex2 = ex;
2657 orig_ex.ee_block = ex->ee_block; 2574 orig_ex.ee_block = ex->ee_block;
2658 orig_ex.ee_len = cpu_to_le16(ee_len); 2575 orig_ex.ee_len = cpu_to_le16(ee_len);
2659 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2576 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2660 2577
2661 /* 2578 /*
2662 * It is safe to convert extent to initialized via explicit 2579 * It is safe to convert extent to initialized via explicit
@@ -2675,7 +2592,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2675 /* update the extent length and mark as initialized */ 2592 /* update the extent length and mark as initialized */
2676 ex->ee_block = orig_ex.ee_block; 2593 ex->ee_block = orig_ex.ee_block;
2677 ex->ee_len = orig_ex.ee_len; 2594 ex->ee_len = orig_ex.ee_len;
2678 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2595 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2679 ext4_ext_dirty(handle, inode, path + depth); 2596 ext4_ext_dirty(handle, inode, path + depth);
2680 /* zeroed the full extent */ 2597 /* zeroed the full extent */
2681 return allocated; 2598 return allocated;
@@ -2710,7 +2627,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2710 ex->ee_block = orig_ex.ee_block; 2627 ex->ee_block = orig_ex.ee_block;
2711 ex->ee_len = cpu_to_le16(ee_len - allocated); 2628 ex->ee_len = cpu_to_le16(ee_len - allocated);
2712 ext4_ext_mark_uninitialized(ex); 2629 ext4_ext_mark_uninitialized(ex);
2713 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2630 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2714 ext4_ext_dirty(handle, inode, path + depth); 2631 ext4_ext_dirty(handle, inode, path + depth);
2715 2632
2716 ex3 = &newex; 2633 ex3 = &newex;
@@ -2725,7 +2642,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2725 goto fix_extent_len; 2642 goto fix_extent_len;
2726 ex->ee_block = orig_ex.ee_block; 2643 ex->ee_block = orig_ex.ee_block;
2727 ex->ee_len = orig_ex.ee_len; 2644 ex->ee_len = orig_ex.ee_len;
2728 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2645 ext4_ext_store_pblock(ex,
2646 ext4_ext_pblock(&orig_ex));
2729 ext4_ext_dirty(handle, inode, path + depth); 2647 ext4_ext_dirty(handle, inode, path + depth);
2730 /* blocks available from map->m_lblk */ 2648 /* blocks available from map->m_lblk */
2731 return allocated; 2649 return allocated;
@@ -2782,7 +2700,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2782 /* update the extent length and mark as initialized */ 2700 /* update the extent length and mark as initialized */
2783 ex->ee_block = orig_ex.ee_block; 2701 ex->ee_block = orig_ex.ee_block;
2784 ex->ee_len = orig_ex.ee_len; 2702 ex->ee_len = orig_ex.ee_len;
2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2703 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2786 ext4_ext_dirty(handle, inode, path + depth); 2704 ext4_ext_dirty(handle, inode, path + depth);
2787 /* zeroed the full extent */ 2705 /* zeroed the full extent */
2788 /* blocks available from map->m_lblk */ 2706 /* blocks available from map->m_lblk */
@@ -2833,7 +2751,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2833 /* update the extent length and mark as initialized */ 2751 /* update the extent length and mark as initialized */
2834 ex->ee_block = orig_ex.ee_block; 2752 ex->ee_block = orig_ex.ee_block;
2835 ex->ee_len = orig_ex.ee_len; 2753 ex->ee_len = orig_ex.ee_len;
2836 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2754 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2837 ext4_ext_dirty(handle, inode, path + depth); 2755 ext4_ext_dirty(handle, inode, path + depth);
2838 /* zero out the first half */ 2756 /* zero out the first half */
2839 /* blocks available from map->m_lblk */ 2757 /* blocks available from map->m_lblk */
@@ -2902,7 +2820,7 @@ insert:
2902 /* update the extent length and mark as initialized */ 2820 /* update the extent length and mark as initialized */
2903 ex->ee_block = orig_ex.ee_block; 2821 ex->ee_block = orig_ex.ee_block;
2904 ex->ee_len = orig_ex.ee_len; 2822 ex->ee_len = orig_ex.ee_len;
2905 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2823 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2906 ext4_ext_dirty(handle, inode, path + depth); 2824 ext4_ext_dirty(handle, inode, path + depth);
2907 /* zero out the first half */ 2825 /* zero out the first half */
2908 return allocated; 2826 return allocated;
@@ -2915,7 +2833,7 @@ out:
2915fix_extent_len: 2833fix_extent_len:
2916 ex->ee_block = orig_ex.ee_block; 2834 ex->ee_block = orig_ex.ee_block;
2917 ex->ee_len = orig_ex.ee_len; 2835 ex->ee_len = orig_ex.ee_len;
2918 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2836 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2919 ext4_ext_mark_uninitialized(ex); 2837 ext4_ext_mark_uninitialized(ex);
2920 ext4_ext_dirty(handle, inode, path + depth); 2838 ext4_ext_dirty(handle, inode, path + depth);
2921 return err; 2839 return err;
@@ -2927,14 +2845,14 @@ fix_extent_len:
2927 * to an uninitialized extent. 2845 * to an uninitialized extent.
2928 * 2846 *
 2929 * Writing to an uninitialized extent may result in splitting the uninitialized 2847
 2930 * extent into multiple /intialized unintialized extents (up to three) 2848 * extent into multiple initialized/uninitialized extents (up to three)
2931 * There are three possibilities: 2849 * There are three possibilities:
2932 * a> There is no split required: Entire extent should be uninitialized 2850 * a> There is no split required: Entire extent should be uninitialized
2933 * b> Splits in two extents: Write is happening at either end of the extent 2851 * b> Splits in two extents: Write is happening at either end of the extent
 2934 * c> Splits in three extents: Someone is writing in the middle of the extent 2852
2935 * 2853 *
 2936 * One or more index blocks may be needed if the extent tree grows after 2854
 2937 * the unintialized extent split. To prevent ENOSPC from occurring at IO 2855 * the uninitialized extent split. To prevent ENOSPC from occurring at IO
 2938 * completion, we need to split the uninitialized extent before DIO submits 2856
 2939 * the IO. The uninitialized extent handled at this time will be split 2857
 2940 * into three uninitialized extents (at most). After IO completes, the part 2858
@@ -2973,12 +2891,12 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2973 ee_block = le32_to_cpu(ex->ee_block); 2891 ee_block = le32_to_cpu(ex->ee_block);
2974 ee_len = ext4_ext_get_actual_len(ex); 2892 ee_len = ext4_ext_get_actual_len(ex);
2975 allocated = ee_len - (map->m_lblk - ee_block); 2893 allocated = ee_len - (map->m_lblk - ee_block);
2976 newblock = map->m_lblk - ee_block + ext_pblock(ex); 2894 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2977 2895
2978 ex2 = ex; 2896 ex2 = ex;
2979 orig_ex.ee_block = ex->ee_block; 2897 orig_ex.ee_block = ex->ee_block;
2980 orig_ex.ee_len = cpu_to_le16(ee_len); 2898 orig_ex.ee_len = cpu_to_le16(ee_len);
2981 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2899 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2982 2900
2983 /* 2901 /*
2984 * It is safe to convert extent to initialized via explicit 2902 * It is safe to convert extent to initialized via explicit
@@ -3027,7 +2945,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3027 /* update the extent length and mark as initialized */ 2945 /* update the extent length and mark as initialized */
3028 ex->ee_block = orig_ex.ee_block; 2946 ex->ee_block = orig_ex.ee_block;
3029 ex->ee_len = orig_ex.ee_len; 2947 ex->ee_len = orig_ex.ee_len;
3030 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2948 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3031 ext4_ext_dirty(handle, inode, path + depth); 2949 ext4_ext_dirty(handle, inode, path + depth);
3032 /* zeroed the full extent */ 2950 /* zeroed the full extent */
3033 /* blocks available from map->m_lblk */ 2951 /* blocks available from map->m_lblk */
@@ -3099,7 +3017,7 @@ insert:
3099 /* update the extent length and mark as initialized */ 3017 /* update the extent length and mark as initialized */
3100 ex->ee_block = orig_ex.ee_block; 3018 ex->ee_block = orig_ex.ee_block;
3101 ex->ee_len = orig_ex.ee_len; 3019 ex->ee_len = orig_ex.ee_len;
3102 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3020 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3103 ext4_ext_dirty(handle, inode, path + depth); 3021 ext4_ext_dirty(handle, inode, path + depth);
3104 /* zero out the first half */ 3022 /* zero out the first half */
3105 return allocated; 3023 return allocated;
@@ -3112,7 +3030,7 @@ out:
3112fix_extent_len: 3030fix_extent_len:
3113 ex->ee_block = orig_ex.ee_block; 3031 ex->ee_block = orig_ex.ee_block;
3114 ex->ee_len = orig_ex.ee_len; 3032 ex->ee_len = orig_ex.ee_len;
3115 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3033 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3116 ext4_ext_mark_uninitialized(ex); 3034 ext4_ext_mark_uninitialized(ex);
3117 ext4_ext_dirty(handle, inode, path + depth); 3035 ext4_ext_dirty(handle, inode, path + depth);
3118 return err; 3036 return err;
@@ -3180,6 +3098,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3180 unmap_underlying_metadata(bdev, block + i); 3098 unmap_underlying_metadata(bdev, block + i);
3181} 3099}
3182 3100
3101/*
3102 * Handle EOFBLOCKS_FL flag, clearing it if necessary
3103 */
3104static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3105 ext4_lblk_t lblk,
3106 struct ext4_ext_path *path,
3107 unsigned int len)
3108{
3109 int i, depth;
3110 struct ext4_extent_header *eh;
3111 struct ext4_extent *ex, *last_ex;
3112
3113 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3114 return 0;
3115
3116 depth = ext_depth(inode);
3117 eh = path[depth].p_hdr;
3118 ex = path[depth].p_ext;
3119
3120 if (unlikely(!eh->eh_entries)) {
3121 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3122 "EOFBLOCKS_FL set");
3123 return -EIO;
3124 }
3125 last_ex = EXT_LAST_EXTENT(eh);
3126 /*
3127 * We should clear the EOFBLOCKS_FL flag if we are writing the
3128 * last block in the last extent in the file. We test this by
 3129 * first checking to see if the caller of
3130 * ext4_ext_get_blocks() was interested in the last block (or
3131 * a block beyond the last block) in the current extent. If
3132 * this turns out to be false, we can bail out from this
3133 * function immediately.
3134 */
3135 if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3136 ext4_ext_get_actual_len(last_ex))
3137 return 0;
3138 /*
3139 * If the caller does appear to be planning to write at or
3140 * beyond the end of the current extent, we then test to see
3141 * if the current extent is the last extent in the file, by
3142 * checking to make sure it was reached via the rightmost node
3143 * at each level of the tree.
3144 */
3145 for (i = depth-1; i >= 0; i--)
3146 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3147 return 0;
3148 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3149 return ext4_mark_inode_dirty(handle, inode);
3150}
3151
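
To make the early bail-out concrete: if the file's last extent has ee_block = 100 and an actual length of 8 (covering blocks 100..107), a map request with lblk = 90, len = 4 satisfies lblk + len < 108 and returns 0 immediately, while lblk = 104, len = 4 reaches block 108 and falls through to the rightmost-path check. A tiny standalone sketch of that arithmetic, with made-up numbers:

	#include <stdio.h>

	/* Sketch: the check_eofblocks_fl() bail-out test, hypothetical values. */
	int main(void)
	{
		unsigned int last_block = 100, last_len = 8;	/* extent 100..107 */
		unsigned int lblk = 104, len = 4;		/* write reaching 108 */

		if (lblk + len < last_block + last_len)
			printf("write ends inside the last extent: flag stays\n");
		else
			printf("write reaches the extent end: walk the path\n");
		return 0;
	}
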
3183static int 3152static int
3184ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3153ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3185 struct ext4_map_blocks *map, 3154 struct ext4_map_blocks *map,
@@ -3206,7 +3175,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3206 * completed 3175 * completed
3207 */ 3176 */
3208 if (io) 3177 if (io)
3209 io->flag = EXT4_IO_UNWRITTEN; 3178 io->flag = EXT4_IO_END_UNWRITTEN;
3210 else 3179 else
3211 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3180 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3212 if (ext4_should_dioread_nolock(inode)) 3181 if (ext4_should_dioread_nolock(inode))
@@ -3217,8 +3186,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3217 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3186 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3218 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3187 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3219 path); 3188 path);
3220 if (ret >= 0) 3189 if (ret >= 0) {
3221 ext4_update_inode_fsync_trans(handle, inode, 1); 3190 ext4_update_inode_fsync_trans(handle, inode, 1);
3191 err = check_eofblocks_fl(handle, inode, map->m_lblk,
3192 path, map->m_len);
3193 } else
3194 err = ret;
3222 goto out2; 3195 goto out2;
3223 } 3196 }
3224 /* buffered IO case */ 3197 /* buffered IO case */
@@ -3244,8 +3217,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3244 3217
3245 /* buffered write, writepage time, convert*/ 3218 /* buffered write, writepage time, convert*/
3246 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3219 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3247 if (ret >= 0) 3220 if (ret >= 0) {
3248 ext4_update_inode_fsync_trans(handle, inode, 1); 3221 ext4_update_inode_fsync_trans(handle, inode, 1);
3222 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3223 map->m_len);
3224 if (err < 0)
3225 goto out2;
3226 }
3227
3249out: 3228out:
3250 if (ret <= 0) { 3229 if (ret <= 0) {
3251 err = ret; 3230 err = ret;
@@ -3292,6 +3271,7 @@ out2:
3292 } 3271 }
3293 return err ? err : allocated; 3272 return err ? err : allocated;
3294} 3273}
3274
3295/* 3275/*
3296 * Block allocation/map/preallocation routine for extents based files 3276 * Block allocation/map/preallocation routine for extents based files
3297 * 3277 *
@@ -3315,9 +3295,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3315{ 3295{
3316 struct ext4_ext_path *path = NULL; 3296 struct ext4_ext_path *path = NULL;
3317 struct ext4_extent_header *eh; 3297 struct ext4_extent_header *eh;
3318 struct ext4_extent newex, *ex, *last_ex; 3298 struct ext4_extent newex, *ex;
3319 ext4_fsblk_t newblock; 3299 ext4_fsblk_t newblock;
3320 int i, err = 0, depth, ret, cache_type; 3300 int err = 0, depth, ret;
3321 unsigned int allocated = 0; 3301 unsigned int allocated = 0;
3322 struct ext4_allocation_request ar; 3302 struct ext4_allocation_request ar;
3323 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3303 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3326,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3326 map->m_lblk, map->m_len, inode->i_ino); 3306 map->m_lblk, map->m_len, inode->i_ino);
3327 3307
3328 /* check in cache */ 3308 /* check in cache */
3329 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); 3309 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3330 if (cache_type) { 3310 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3331 if (cache_type == EXT4_EXT_CACHE_GAP) {
3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3311 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3333 /* 3312 /*
3334 * block isn't allocated yet and 3313 * block isn't allocated yet and
@@ -3337,17 +3316,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3337 goto out2; 3316 goto out2;
3338 } 3317 }
3339 /* we should allocate requested block */ 3318 /* we should allocate requested block */
3340 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3319 } else {
3341 /* block is already allocated */ 3320 /* block is already allocated */
3342 newblock = map->m_lblk 3321 newblock = map->m_lblk
3343 - le32_to_cpu(newex.ee_block) 3322 - le32_to_cpu(newex.ee_block)
3344 + ext_pblock(&newex); 3323 + ext4_ext_pblock(&newex);
3345 /* number of remaining blocks in the extent */ 3324 /* number of remaining blocks in the extent */
3346 allocated = ext4_ext_get_actual_len(&newex) - 3325 allocated = ext4_ext_get_actual_len(&newex) -
3347 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3326 (map->m_lblk - le32_to_cpu(newex.ee_block));
3348 goto out; 3327 goto out;
3349 } else {
3350 BUG();
3351 } 3328 }
3352 } 3329 }
3353 3330
@@ -3379,7 +3356,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3379 ex = path[depth].p_ext; 3356 ex = path[depth].p_ext;
3380 if (ex) { 3357 if (ex) {
3381 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3358 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3382 ext4_fsblk_t ee_start = ext_pblock(ex); 3359 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3383 unsigned short ee_len; 3360 unsigned short ee_len;
3384 3361
3385 /* 3362 /*
@@ -3398,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3398 /* Do not put uninitialized extent in the cache */ 3375 /* Do not put uninitialized extent in the cache */
3399 if (!ext4_ext_is_uninitialized(ex)) { 3376 if (!ext4_ext_is_uninitialized(ex)) {
3400 ext4_ext_put_in_cache(inode, ee_block, 3377 ext4_ext_put_in_cache(inode, ee_block,
3401 ee_len, ee_start, 3378 ee_len, ee_start);
3402 EXT4_EXT_CACHE_EXTENT);
3403 goto out; 3379 goto out;
3404 } 3380 }
3405 ret = ext4_ext_handle_uninitialized_extents(handle, 3381 ret = ext4_ext_handle_uninitialized_extents(handle,
@@ -3488,7 +3464,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3488 */ 3464 */
3489 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3490 if (io) 3466 if (io)
3491 io->flag = EXT4_IO_UNWRITTEN; 3467 io->flag = EXT4_IO_END_UNWRITTEN;
3492 else 3468 else
3493 ext4_set_inode_state(inode, 3469 ext4_set_inode_state(inode,
3494 EXT4_STATE_DIO_UNWRITTEN); 3470 EXT4_STATE_DIO_UNWRITTEN);
@@ -3497,44 +3473,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3497 map->m_flags |= EXT4_MAP_UNINIT; 3473 map->m_flags |= EXT4_MAP_UNINIT;
3498 } 3474 }
3499 3475
3500 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { 3476 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3501 if (unlikely(!eh->eh_entries)) { 3477 if (err)
3502 EXT4_ERROR_INODE(inode, 3478 goto out2;
3503 "eh->eh_entries == 0 and " 3479
3504 "EOFBLOCKS_FL set");
3505 err = -EIO;
3506 goto out2;
3507 }
3508 last_ex = EXT_LAST_EXTENT(eh);
3509 /*
3510 * If the current leaf block was reached by looking at
3511 * the last index block all the way down the tree, and
3512 * we are extending the inode beyond the last extent
3513 * in the current leaf block, then clear the
3514 * EOFBLOCKS_FL flag.
3515 */
3516 for (i = depth-1; i >= 0; i--) {
3517 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3518 break;
3519 }
3520 if ((i < 0) &&
3521 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3522 ext4_ext_get_actual_len(last_ex)))
3523 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3524 }
3525 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3480 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3526 if (err) { 3481 if (err) {
3527 /* free data blocks we just allocated */ 3482 /* free data blocks we just allocated */
3528 /* not a good idea to call discard here directly, 3483 /* not a good idea to call discard here directly,
3529 * but otherwise we'd need to call it every free() */ 3484 * but otherwise we'd need to call it every free() */
3530 ext4_discard_preallocations(inode); 3485 ext4_discard_preallocations(inode);
3531 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3486 ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
3532 ext4_ext_get_actual_len(&newex), 0); 3487 ext4_ext_get_actual_len(&newex), 0);
3533 goto out2; 3488 goto out2;
3534 } 3489 }
3535 3490
3536 /* previous routine could use block we allocated */ 3491 /* previous routine could use block we allocated */
3537 newblock = ext_pblock(&newex); 3492 newblock = ext4_ext_pblock(&newex);
3538 allocated = ext4_ext_get_actual_len(&newex); 3493 allocated = ext4_ext_get_actual_len(&newex);
3539 if (allocated > map->m_len) 3494 if (allocated > map->m_len)
3540 allocated = map->m_len; 3495 allocated = map->m_len;
@@ -3552,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3552 * when it is _not_ an uninitialized extent. 3507 * when it is _not_ an uninitialized extent.
3553 */ 3508 */
3554 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3509 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3555 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, 3510 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
3556 EXT4_EXT_CACHE_EXTENT);
3557 ext4_update_inode_fsync_trans(handle, inode, 1); 3511 ext4_update_inode_fsync_trans(handle, inode, 1);
3558 } else 3512 } else
3559 ext4_update_inode_fsync_trans(handle, inode, 0); 3513 ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -3581,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode)
3581 int err = 0; 3535 int err = 0;
3582 3536
3583 /* 3537 /*
3538 * finish any pending end_io work so we won't run the risk of
3539 * converting any truncated blocks to initialized later
3540 */
3541 ext4_flush_completed_IO(inode);
3542
3543 /*
 3584 * probably the first extent we're going to free will be the last in the block 3544
3585 */ 3545 */
3586 err = ext4_writepage_trans_blocks(inode); 3546 err = ext4_writepage_trans_blocks(inode);
@@ -3667,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
3667} 3627}
3668 3628
3669/* 3629/*
3670 * preallocate space for a file. This implements ext4's fallocate inode 3630 * preallocate space for a file. This implements ext4's fallocate file
 3671 * operation, which gets called from the sys_fallocate system call. 3631
3672 * For block-mapped files, posix_fallocate should fall back to the method 3632 * For block-mapped files, posix_fallocate should fall back to the method
3673 * of writing zeroes to the required new blocks (the same behavior which is 3633 * of writing zeroes to the required new blocks (the same behavior which is
3674 * expected for file systems which do not support fallocate() system call). 3634 * expected for file systems which do not support fallocate() system call).
3675 */ 3635 */
3676long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3636long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3677{ 3637{
3638 struct inode *inode = file->f_path.dentry->d_inode;
3678 handle_t *handle; 3639 handle_t *handle;
3679 loff_t new_size; 3640 loff_t new_size;
3680 unsigned int max_blocks; 3641 unsigned int max_blocks;
@@ -3684,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3684 struct ext4_map_blocks map; 3645 struct ext4_map_blocks map;
3685 unsigned int credits, blkbits = inode->i_blkbits; 3646 unsigned int credits, blkbits = inode->i_blkbits;
3686 3647
3648 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3649 if (mode & ~FALLOC_FL_KEEP_SIZE)
3650 return -EOPNOTSUPP;
3651
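
Moving the mode check to the top means any mode bits other than FALLOC_FL_KEEP_SIZE fail fast with -EOPNOTSUPP before a journal handle is started. From userspace, the accepted form is plain preallocation with or without the keep-size bit; a hedged sketch of a caller:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <errno.h>

	/* Sketch: preallocate len bytes at offset without growing i_size.
	 * Assumes fd refers to an extent-mapped ext4 file opened for write. */
	static int prealloc_keep_size(int fd, off_t offset, off_t len)
	{
		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, offset, len) < 0) {
			fprintf(stderr, "fallocate: %s\n", strerror(errno));
			return -1;
		}
		return 0;	/* blocks reserved; file size unchanged */
	}
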
3687 /* 3652 /*
3688 * currently supporting (pre)allocate mode for extent-based 3653 * currently supporting (pre)allocate mode for extent-based
3689 * files _only_ 3654 * files _only_
@@ -3691,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3691 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3656 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3692 return -EOPNOTSUPP; 3657 return -EOPNOTSUPP;
3693 3658
3694 /* preallocation to directories is currently not supported */
3695 if (S_ISDIR(inode->i_mode))
3696 return -ENODEV;
3697
3698 map.m_lblk = offset >> blkbits; 3659 map.m_lblk = offset >> blkbits;
3699 /* 3660 /*
3700 * We can't just convert len to max_blocks because 3661 * We can't just convert len to max_blocks because
@@ -3729,7 +3690,7 @@ retry:
3729 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3690 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3730 "returned error inode#%lu, block=%u, " 3691 "returned error inode#%lu, block=%u, "
3731 "max_blocks=%u", __func__, 3692 "max_blocks=%u", __func__,
3732 inode->i_ino, block, max_blocks); 3693 inode->i_ino, map.m_lblk, max_blocks);
3733#endif 3694#endif
3734 ext4_mark_inode_dirty(handle, inode); 3695 ext4_mark_inode_dirty(handle, inode);
3735 ret2 = ext4_journal_stop(handle); 3696 ret2 = ext4_journal_stop(handle);
@@ -3829,7 +3790,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3829 3790
3830 logical = (__u64)newex->ec_block << blksize_bits; 3791 logical = (__u64)newex->ec_block << blksize_bits;
3831 3792
3832 if (newex->ec_type == EXT4_EXT_CACHE_GAP) { 3793 if (newex->ec_start == 0) {
3833 pgoff_t offset; 3794 pgoff_t offset;
3834 struct page *page; 3795 struct page *page;
3835 struct buffer_head *bh = NULL; 3796 struct buffer_head *bh = NULL;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ee92b66d4558..2e8322c8aa88 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
104{ 104{
105 struct super_block *sb = inode->i_sb; 105 struct super_block *sb = inode->i_sb;
106 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 106 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
107 struct ext4_inode_info *ei = EXT4_I(inode);
107 struct vfsmount *mnt = filp->f_path.mnt; 108 struct vfsmount *mnt = filp->f_path.mnt;
108 struct path path; 109 struct path path;
109 char buf[64], *cp; 110 char buf[64], *cp;
@@ -127,11 +128,74 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
127 ext4_mark_super_dirty(sb); 128 ext4_mark_super_dirty(sb);
128 } 129 }
129 } 130 }
131 /*
132 * Set up the jbd2_inode if we are opening the inode for
133 * writing and the journal is present
134 */
135 if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
136 struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
137
138 spin_lock(&inode->i_lock);
139 if (!ei->jinode) {
140 if (!jinode) {
141 spin_unlock(&inode->i_lock);
142 return -ENOMEM;
143 }
144 ei->jinode = jinode;
145 jbd2_journal_init_jbd_inode(ei->jinode, inode);
146 jinode = NULL;
147 }
148 spin_unlock(&inode->i_lock);
149 if (unlikely(jinode != NULL))
150 jbd2_free_inode(jinode);
151 }
130 return dquot_file_open(inode, filp); 152 return dquot_file_open(inode, filp);
131} 153}
132 154
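
The jinode setup above is an optimistic allocate-then-install pattern: the possibly-sleeping allocation happens before i_lock is taken, the lock covers only the pointer check and install, and the loser of the race frees its unused object after unlocking. A generic sketch of the shape, with hypothetical alloc_obj()/free_obj() standing in for jbd2_alloc_inode()/jbd2_free_inode():

	/* Sketch: race-safe lazy installation of a per-inode object. */
	static int ensure_obj(struct obj **slot, spinlock_t *lock)
	{
		struct obj *new = alloc_obj(GFP_KERNEL);	/* may sleep */

		spin_lock(lock);
		if (!*slot) {
			if (!new) {
				spin_unlock(lock);
				return -ENOMEM;
			}
			*slot = new;	/* we won: publish under the lock */
			new = NULL;
		}
		spin_unlock(lock);
		if (new)
			free_obj(new);	/* another opener beat us to it */
		return 0;
	}
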
155/*
156 * ext4_llseek() copied from generic_file_llseek() to handle both
157 * block-mapped and extent-mapped maxbytes values. This should
 158 * otherwise be identical to generic_file_llseek().
159 */
160loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
161{
162 struct inode *inode = file->f_mapping->host;
163 loff_t maxbytes;
164
165 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
166 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
167 else
168 maxbytes = inode->i_sb->s_maxbytes;
169 mutex_lock(&inode->i_mutex);
170 switch (origin) {
171 case SEEK_END:
172 offset += inode->i_size;
173 break;
174 case SEEK_CUR:
175 if (offset == 0) {
176 mutex_unlock(&inode->i_mutex);
177 return file->f_pos;
178 }
179 offset += file->f_pos;
180 break;
181 }
182
183 if (offset < 0 || offset > maxbytes) {
184 mutex_unlock(&inode->i_mutex);
185 return -EINVAL;
186 }
187
188 if (offset != file->f_pos) {
189 file->f_pos = offset;
190 file->f_version = 0;
191 }
192 mutex_unlock(&inode->i_mutex);
193
194 return offset;
195}
196
133const struct file_operations ext4_file_operations = { 197const struct file_operations ext4_file_operations = {
134 .llseek = generic_file_llseek, 198 .llseek = ext4_llseek,
135 .read = do_sync_read, 199 .read = do_sync_read,
136 .write = do_sync_write, 200 .write = do_sync_write,
137 .aio_read = generic_file_aio_read, 201 .aio_read = generic_file_aio_read,
@@ -146,6 +210,7 @@ const struct file_operations ext4_file_operations = {
146 .fsync = ext4_sync_file, 210 .fsync = ext4_sync_file,
147 .splice_read = generic_file_splice_read, 211 .splice_read = generic_file_splice_read,
148 .splice_write = generic_file_splice_write, 212 .splice_write = generic_file_splice_write,
213 .fallocate = ext4_fallocate,
149}; 214};
150 215
151const struct inode_operations ext4_file_inode_operations = { 216const struct inode_operations ext4_file_inode_operations = {
@@ -159,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = {
159 .removexattr = generic_removexattr, 224 .removexattr = generic_removexattr,
160#endif 225#endif
161 .check_acl = ext4_check_acl, 226 .check_acl = ext4_check_acl,
162 .fallocate = ext4_fallocate,
163 .fiemap = ext4_fiemap, 227 .fiemap = ext4_fiemap,
164}; 228};
165 229
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..7829b287822a 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
34 34
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37static void dump_completed_IO(struct inode * inode)
38{
39#ifdef EXT4_DEBUG
40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags;
43
44 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
45 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
46 return;
47 }
48
49 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
50 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
51 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
52 cur = &io->list;
53 before = cur->prev;
54 io0 = container_of(before, ext4_io_end_t, list);
55 after = cur->next;
56 io1 = container_of(after, ext4_io_end_t, list);
57
58 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
59 io, inode->i_ino, io0, io1);
60 }
61 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
62#endif
63}
64
65/*
66 * This function is called from ext4_sync_file().
67 *
68 * When IO is completed, the work to convert unwritten extents to
 69 * written is queued on a workqueue but may not get immediately
 70 * scheduled. When fsync is called, we need to ensure the
 71 * conversion is complete before fsync returns.
 72 * The inode keeps track of a list of pending/completed IO that
 73 * might need the conversion. This function walks through
 74 * the list and converts the related unwritten extents of completed IO
 75 * to written.
 76 * The function returns 0 on success, or a negative error code.
77 */
78extern int ext4_flush_completed_IO(struct inode *inode)
79{
80 ext4_io_end_t *io;
81 struct ext4_inode_info *ei = EXT4_I(inode);
82 unsigned long flags;
83 int ret = 0;
84 int ret2 = 0;
85
86 if (list_empty(&ei->i_completed_io_list))
87 return ret;
88
89 dump_completed_IO(inode);
90 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
91 while (!list_empty(&ei->i_completed_io_list)){
92 io = list_entry(ei->i_completed_io_list.next,
93 ext4_io_end_t, list);
94 /*
95 * Calling ext4_end_io_nolock() to convert completed
96 * IO to written.
97 *
 98 * When ext4_sync_file() is called, run_queue() may already be
 99 * about to flush the work corresponding to this io structure.
 100 * It will be upset if it finds that the io structure related
 101 * to the work to be scheduled has been freed.
 102 *
 103 * Thus we need to keep the io structure valid here even after
 104 * the conversion has finished. The io structure has a flag to
 105 * avoid double conversion, from both fsync and the background
 106 * workqueue.
107 */
108 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
109 ret = ext4_end_io_nolock(io);
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
111 if (ret < 0)
112 ret2 = ret;
113 else
114 list_del_init(&io->list);
115 }
116 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
117 return (ret2 < 0) ? ret2 : 0;
118}
119
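
The loop above drops i_completed_io_lock around each conversion because ext4_end_io_nolock() can block, and it unlinks an entry only after the conversion succeeds. A compact sketch of that drop-relock idiom with a hypothetical convert() callback (bailing out on the first error rather than recording it, for brevity):

	/* Sketch: drain a spinlock-protected list when per-entry work sleeps. */
	spin_lock_irqsave(&lock, flags);
	while (!list_empty(&head)) {
		struct io_entry *e = list_entry(head.next, struct io_entry, list);

		spin_unlock_irqrestore(&lock, flags);	/* convert() may sleep */
		err = convert(e);
		spin_lock_irqsave(&lock, flags);
		if (err < 0)
			break;			/* leave the rest for the next flush */
		list_del_init(&e->list);	/* converted: unlink it */
	}
	spin_unlock_irqrestore(&lock, flags);
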
37/* 120/*
38 * If we're not journaling and this is a just-created file, we have to 121 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since 122 * sync our parent directory (if it was freshly created) since
@@ -86,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync)
86 if (inode->i_sb->s_flags & MS_RDONLY) 169 if (inode->i_sb->s_flags & MS_RDONLY)
87 return 0; 170 return 0;
88 171
89 ret = flush_completed_IO(inode); 172 ret = ext4_flush_completed_IO(inode);
90 if (ret < 0) 173 if (ret < 0)
91 return ret; 174 return ret;
92 175
@@ -128,10 +211,9 @@ int ext4_sync_file(struct file *file, int datasync)
128 (journal->j_fs_dev != journal->j_dev) && 211 (journal->j_fs_dev != journal->j_dev) &&
129 (journal->j_flags & JBD2_BARRIER)) 212 (journal->j_flags & JBD2_BARRIER))
130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 213 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
131 NULL, BLKDEV_IFL_WAIT); 214 NULL);
132 ret = jbd2_log_wait_commit(journal, commit_tid); 215 ret = jbd2_log_wait_commit(journal, commit_tid);
133 } else if (journal->j_flags & JBD2_BARRIER) 216 } else if (journal->j_flags & JBD2_BARRIER)
134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 217 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
135 BLKDEV_IFL_WAIT);
136 return ret; 218 return ret;
137} 219}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..eb9097aec6f0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
50 * need to use it within a single byte (to ensure we get endianness right). 50 * need to use it within a single byte (to ensure we get endianness right).
51 * We can use memset for the rest of the bitmap as there are no other users. 51 * We can use memset for the rest of the bitmap as there are no other users.
52 */ 52 */
53void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) 53void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
54{ 54{
55 int i; 55 int i;
56 56
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
65} 65}
66 66
67/* Initializes an uninitialized inode bitmap */ 67/* Initializes an uninitialized inode bitmap */
68unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, 68static unsigned ext4_init_inode_bitmap(struct super_block *sb,
69 ext4_group_t block_group, 69 struct buffer_head *bh,
70 struct ext4_group_desc *gdp) 70 ext4_group_t block_group,
71 struct ext4_group_desc *gdp)
71{ 72{
72 struct ext4_sb_info *sbi = EXT4_SB(sb); 73 struct ext4_sb_info *sbi = EXT4_SB(sb);
73 74
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
85 } 86 }
86 87
87 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 88 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
88 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 89 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
89 bh->b_data); 90 bh->b_data);
90 91
91 return EXT4_INODES_PER_GROUP(sb); 92 return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
107 desc = ext4_get_group_desc(sb, block_group, NULL); 108 desc = ext4_get_group_desc(sb, block_group, NULL);
108 if (!desc) 109 if (!desc)
109 return NULL; 110 return NULL;
111
110 bitmap_blk = ext4_inode_bitmap(sb, desc); 112 bitmap_blk = ext4_inode_bitmap(sb, desc);
111 bh = sb_getblk(sb, bitmap_blk); 113 bh = sb_getblk(sb, bitmap_blk);
112 if (unlikely(!bh)) { 114 if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 125 unlock_buffer(bh);
124 return bh; 126 return bh;
125 } 127 }
128
126 ext4_lock_group(sb, block_group); 129 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 130 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 131 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
133 return bh; 136 return bh;
134 } 137 }
135 ext4_unlock_group(sb, block_group); 138 ext4_unlock_group(sb, block_group);
139
136 if (buffer_uptodate(bh)) { 140 if (buffer_uptodate(bh)) {
137 /* 141 /*
138 * if not uninit if bh is uptodate, 142 * if not uninit if bh is uptodate,
@@ -411,8 +415,8 @@ struct orlov_stats {
411 * for a particular block group or flex_bg. If flex_size is 1, then g 415 * for a particular block group or flex_bg. If flex_size is 1, then g
412 * is a block group number; otherwise it is flex_bg number. 416 * is a block group number; otherwise it is flex_bg number.
413 */ 417 */
414void get_orlov_stats(struct super_block *sb, ext4_group_t g, 418static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
415 int flex_size, struct orlov_stats *stats) 419 int flex_size, struct orlov_stats *stats)
416{ 420{
417 struct ext4_group_desc *desc; 421 struct ext4_group_desc *desc;
418 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; 422 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb,
712{ 716{
713 int free = 0, retval = 0, count; 717 int free = 0, retval = 0, count;
714 struct ext4_sb_info *sbi = EXT4_SB(sb); 718 struct ext4_sb_info *sbi = EXT4_SB(sb);
719 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
715 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 720 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
716 721
722 /*
723 * We have to be sure that new inode allocation does not race with
724 * inode table initialization, because otherwise we may end up
 725 * allocating and writing a new inode right before sb_issue_zeroout
 726 * takes place, overwriting our new inode with zeroes. So we
727 * take alloc_sem to prevent it.
728 */
729 down_read(&grp->alloc_sem);
717 ext4_lock_group(sb, group); 730 ext4_lock_group(sb, group);
718 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 731 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
719 /* not a free inode */ 732 /* not a free inode */
@@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb,
724 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 737 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
725 ino > EXT4_INODES_PER_GROUP(sb)) { 738 ino > EXT4_INODES_PER_GROUP(sb)) {
726 ext4_unlock_group(sb, group); 739 ext4_unlock_group(sb, group);
740 up_read(&grp->alloc_sem);
727 ext4_error(sb, "reserved inode or inode > inodes count - " 741 ext4_error(sb, "reserved inode or inode > inodes count - "
728 "block_group = %u, inode=%lu", group, 742 "block_group = %u, inode=%lu", group,
729 ino + group * EXT4_INODES_PER_GROUP(sb)); 743 ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb,
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 786 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773err_ret: 787err_ret:
774 ext4_unlock_group(sb, group); 788 ext4_unlock_group(sb, group);
789 up_read(&grp->alloc_sem);
775 return retval; 790 return retval;
776} 791}
777 792
@@ -1012,7 +1027,7 @@ got:
1012 inode->i_generation = sbi->s_next_generation++; 1027 inode->i_generation = sbi->s_next_generation++;
1013 spin_unlock(&sbi->s_next_gen_lock); 1028 spin_unlock(&sbi->s_next_gen_lock);
1014 1029
1015 ei->i_state_flags = 0; 1030 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
1016 ext4_set_inode_state(inode, EXT4_STATE_NEW); 1031 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1017 1032
1018 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1033 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
@@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1205 } 1220 }
1206 return count; 1221 return count;
1207} 1222}
1223
1224/*
 1225 * Zeroes the not-yet-zeroed inode table by writing zeroes through the whole
 1226 * inode table. Must be called without any spinlock held. The only place
 1227 * it is called from on an active filesystem is the ext4lazyinit
 1228 * thread, so we do not need any special locks; however, we have to prevent
 1229 * inode allocation from the current group, so we take the alloc_sem lock
 1230 * to block ext4_claim_inode until we are finished.
1231 */
1232extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1233 int barrier)
1234{
1235 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1236 struct ext4_sb_info *sbi = EXT4_SB(sb);
1237 struct ext4_group_desc *gdp = NULL;
1238 struct buffer_head *group_desc_bh;
1239 handle_t *handle;
1240 ext4_fsblk_t blk;
1241 int num, ret = 0, used_blks = 0;
1242
 1243 /* This should not happen, but check it just to be sure */
1244 if (sb->s_flags & MS_RDONLY) {
1245 ret = 1;
1246 goto out;
1247 }
1248
1249 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
1250 if (!gdp)
1251 goto out;
1252
1253 /*
1254 * We do not need to lock this, because we are the only one
1255 * handling this flag.
1256 */
1257 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
1258 goto out;
1259
1260 handle = ext4_journal_start_sb(sb, 1);
1261 if (IS_ERR(handle)) {
1262 ret = PTR_ERR(handle);
1263 goto out;
1264 }
1265
1266 down_write(&grp->alloc_sem);
1267 /*
 1268 * If the inode bitmap was already initialized, there may be some
 1269 * used inodes, so we need to skip the blocks with used inodes in the
1270 * inode table.
1271 */
1272 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
1273 used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
1274 ext4_itable_unused_count(sb, gdp)),
1275 sbi->s_inodes_per_block);
1276
1277 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1278 ext4_error(sb, "Something is wrong with group %u\n"
1279 "Used itable blocks: %d"
1280 "itable unused count: %u\n",
1281 group, used_blks,
1282 ext4_itable_unused_count(sb, gdp));
1283 ret = 1;
1284 goto out;
1285 }
1286
1287 blk = ext4_inode_table(sb, gdp) + used_blks;
1288 num = sbi->s_itb_per_group - used_blks;
1289
1290 BUFFER_TRACE(group_desc_bh, "get_write_access");
1291 ret = ext4_journal_get_write_access(handle,
1292 group_desc_bh);
1293 if (ret)
1294 goto err_out;
1295
1296 /*
1297 * Skip zeroout if the inode table is full. But we set the ZEROED
1298 * flag anyway, because obviously, when it is full it does not need
1299 * further zeroing.
1300 */
1301 if (unlikely(num == 0))
1302 goto skip_zeroout;
1303
1304 ext4_debug("going to zero out inode table in group %d\n",
1305 group);
1306 ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
1307 if (ret < 0)
1308 goto err_out;
1309 if (barrier)
1310 blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
1311
1312skip_zeroout:
1313 ext4_lock_group(sb, group);
1314 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
1315 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
1316 ext4_unlock_group(sb, group);
1317
1318 BUFFER_TRACE(group_desc_bh,
1319 "call ext4_handle_dirty_metadata");
1320 ret = ext4_handle_dirty_metadata(handle, NULL,
1321 group_desc_bh);
1322
1323err_out:
1324 up_write(&grp->alloc_sem);
1325 ext4_journal_stop(handle);
1326out:
1327 return ret;
1328}
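
This function is the writer half of the alloc_sem contract introduced in ext4_claim_inode() above: zeroing takes the group's alloc_sem exclusively around sb_issue_zeroout(), while inode allocation takes it shared, so a fresh inode can never be written into a table block that is about to be overwritten with zeroes. A minimal sketch of the pairing (function bodies are hypothetical stand-ins for the real ext4 paths):

	/* Sketch: rw_semaphore pairing between lazy zeroing and allocation. */
	static DECLARE_RWSEM(alloc_sem);

	static void lazy_zero_group(void)	/* ext4lazyinit thread side */
	{
		down_write(&alloc_sem);		/* exclude all allocators */
		/* ... sb_issue_zeroout() over the unused table blocks ... */
		up_write(&alloc_sem);
	}

	static void claim_inode(void)		/* allocation side */
	{
		down_read(&alloc_sem);		/* allocators may run concurrently */
		/* ... set a bitmap bit, initialize the new inode ... */
		up_read(&alloc_sem);
	}
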
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..9f7f9e49914f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/printk.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/ratelimit.h>
43 45
44#include "ext4_jbd2.h" 46#include "ext4_jbd2.h"
45#include "xattr.h" 47#include "xattr.h"
@@ -53,13 +55,27 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 55static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 56 loff_t new_size)
55{ 57{
56 return jbd2_journal_begin_ordered_truncate( 58 trace_ext4_begin_ordered_truncate(inode, new_size);
57 EXT4_SB(inode->i_sb)->s_journal, 59 /*
58 &EXT4_I(inode)->jinode, 60 * If jinode is zero, then we never opened the file for
59 new_size); 61 * writing, so there's no need to call
62 * jbd2_journal_begin_ordered_truncate() since there's no
63 * outstanding writes we need to flush.
64 */
65 if (!EXT4_I(inode)->jinode)
66 return 0;
67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
68 EXT4_I(inode)->jinode,
69 new_size);
60} 70}
61 71
62static void ext4_invalidatepage(struct page *page, unsigned long offset); 72static void ext4_invalidatepage(struct page *page, unsigned long offset);
73static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
74 struct buffer_head *bh_result, int create);
75static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
76static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
77static int __ext4_journalled_writepage(struct page *page, unsigned int len);
78static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
63 79
64/* 80/*
65 * Test whether an inode is a fast symlink. 81 * Test whether an inode is a fast symlink.
@@ -172,6 +188,7 @@ void ext4_evict_inode(struct inode *inode)
172 handle_t *handle; 188 handle_t *handle;
173 int err; 189 int err;
174 190
191 trace_ext4_evict_inode(inode);
175 if (inode->i_nlink) { 192 if (inode->i_nlink) {
176 truncate_inode_pages(&inode->i_data, 0); 193 truncate_inode_pages(&inode->i_data, 0);
177 goto no_delete; 194 goto no_delete;
@@ -544,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
544} 561}
545 562
546/** 563/**
547 * ext4_blks_to_allocate: Look up the block map and count the number 564 * ext4_blks_to_allocate - Look up the block map and count the number
548 * of direct blocks need to be allocated for the given branch. 565 * of direct blocks need to be allocated for the given branch.
549 * 566 *
550 * @branch: chain of indirect blocks 567 * @branch: chain of indirect blocks
@@ -583,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
583 600
584/** 601/**
585 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
 605 * @iblock: the logical block to start allocating at
606 * @goal: preferred physical block of allocation
 586 * @indirect_blks: the number of blocks needed to allocate for indirect 607
587 * blocks 608 * blocks
588 * 609 * @blks: number of desired blocks
589 * @new_blocks: on return it will store the new block numbers for 610 * @new_blocks: on return it will store the new block numbers for
590 * the indirect blocks(if needed) and the first direct block, 611 * the indirect blocks(if needed) and the first direct block,
591 * @blks: on return it will store the total number of allocated 612 * @err: on return it will store the error code
592 * direct blocks 613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
593 */ 616 */
594static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
595 ext4_lblk_t iblock, ext4_fsblk_t goal, 618 ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -703,9 +726,11 @@ failed_out:
703 726
704/** 727/**
705 * ext4_alloc_branch - allocate and set up a chain of blocks. 728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
706 * @inode: owner 730 * @inode: owner
707 * @indirect_blks: number of allocated indirect blocks 731 * @indirect_blks: number of allocated indirect blocks
708 * @blks: number of allocated direct blocks 732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
709 * @offsets: offsets (in the blocks) to store the pointers to next. 734 * @offsets: offsets (in the blocks) to store the pointers to next.
710 * @branch: place to store the chain in. 735 * @branch: place to store the chain in.
711 * 736 *
@@ -755,6 +780,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 * parent to disk. 780 * parent to disk.
756 */ 781 */
757 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) {
784 err = -EIO;
785 goto failed;
786 }
787
758 branch[n].bh = bh; 788 branch[n].bh = bh;
759 lock_buffer(bh); 789 lock_buffer(bh);
760 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
@@ -813,6 +843,7 @@ failed:
813 843
814/** 844/**
815 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
816 * @inode: owner 847 * @inode: owner
817 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
818 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
@@ -1068,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1068 * Calculate the number of metadata blocks need to reserve 1099 * Calculate the number of metadata blocks need to reserve
1069 * to allocate a block located at @lblock 1100 * to allocate a block located at @lblock
1070 */ 1101 */
1071static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1102static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1072{ 1103{
1073 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1104 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1074 return ext4_ext_calc_metadata_amount(inode, lblock); 1105 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1207,8 +1238,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1207 break; 1238 break;
1208 idx++; 1239 idx++;
1209 num++; 1240 num++;
1210 if (num >= max_pages) 1241 if (num >= max_pages) {
1242 done = 1;
1211 break; 1243 break;
1244 }
1212 } 1245 }
1213 pagevec_release(&pvec); 1246 pagevec_release(&pvec);
1214 } 1247 }
@@ -1305,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1305 * avoid double accounting 1338 * avoid double accounting
1306 */ 1339 */
1307 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1340 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1308 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1341 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1309 /* 1342 /*
1310 * We need to check for EXT4 here because migrate 1343 * We need to check for EXT4 here because migrate
1311 * could have changed the inode type in between 1344 * could have changed the inode type in between
@@ -1335,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1335 ext4_da_update_reserve_space(inode, retval, 1); 1368 ext4_da_update_reserve_space(inode, retval, 1);
1336 } 1369 }
1337 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1370 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1338 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1371 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1339 1372
1340 up_write((&EXT4_I(inode)->i_data_sem)); 1373 up_write((&EXT4_I(inode)->i_data_sem));
1341 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1374 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
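The two hunks above replace the i_delalloc_reserved_flag field with the EXT4_STATE_DELALLOC_RESERVED bit manipulated through the inode-state helpers. A condensed sketch of the discipline, assuming only what the hunks themselves show: the bit is set and cleared strictly inside the i_data_sem write section, so allocators lower in the call path can test it safely.

down_write(&EXT4_I(inode)->i_data_sem);
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
	ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
/* ... the extent or indirect block allocation runs here ... */
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
	ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
up_write(&EXT4_I(inode)->i_data_sem);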
@@ -1538,10 +1571,10 @@ static int do_journal_get_write_access(handle_t *handle,
1538 if (!buffer_mapped(bh) || buffer_freed(bh)) 1571 if (!buffer_mapped(bh) || buffer_freed(bh))
1539 return 0; 1572 return 0;
1540 /* 1573 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean 1574 * __block_write_begin() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain 1575 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit 1576 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear 1577 * by __block_write_begin() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot 1578 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer. 1579 * ever write the buffer.
1547 */ 1580 */
@@ -1863,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file,
1863/* 1896/*
1864 * Reserve a single block located at lblock 1897 * Reserve a single block located at lblock
1865 */ 1898 */
1866static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) 1899static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1867{ 1900{
1868 int retries = 0; 1901 int retries = 0;
1869 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1902 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1995,16 +2028,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1995 * 2028 *
1996 * As pages are already locked by write_cache_pages(), we can't use it 2029 * As pages are already locked by write_cache_pages(), we can't use it
1997 */ 2030 */
1998static int mpage_da_submit_io(struct mpage_da_data *mpd) 2031static int mpage_da_submit_io(struct mpage_da_data *mpd,
2032 struct ext4_map_blocks *map)
1999{ 2033{
2000 long pages_skipped;
2001 struct pagevec pvec; 2034 struct pagevec pvec;
2002 unsigned long index, end; 2035 unsigned long index, end;
2003 int ret = 0, err, nr_pages, i; 2036 int ret = 0, err, nr_pages, i;
2004 struct inode *inode = mpd->inode; 2037 struct inode *inode = mpd->inode;
2005 struct address_space *mapping = inode->i_mapping; 2038 struct address_space *mapping = inode->i_mapping;
2039 loff_t size = i_size_read(inode);
2040 unsigned int len, block_start;
2041 struct buffer_head *bh, *page_bufs = NULL;
2042 int journal_data = ext4_should_journal_data(inode);
2043 sector_t pblock = 0, cur_logical = 0;
2044 struct ext4_io_submit io_submit;
2006 2045
2007 BUG_ON(mpd->next_page <= mpd->first_page); 2046 BUG_ON(mpd->next_page <= mpd->first_page);
2047 memset(&io_submit, 0, sizeof(io_submit));
2008 /* 2048 /*
2009 * We need to start from the first_page to the next_page - 1 2049 * We need to start from the first_page to the next_page - 1
2010 * to make sure we also write the mapped dirty buffer_heads. 2050 * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,122 +2060,111 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2020 if (nr_pages == 0) 2060 if (nr_pages == 0)
2021 break; 2061 break;
2022 for (i = 0; i < nr_pages; i++) { 2062 for (i = 0; i < nr_pages; i++) {
2063 int commit_write = 0, redirty_page = 0;
2023 struct page *page = pvec.pages[i]; 2064 struct page *page = pvec.pages[i];
2024 2065
2025 index = page->index; 2066 index = page->index;
2026 if (index > end) 2067 if (index > end)
2027 break; 2068 break;
2069
2070 if (index == size >> PAGE_CACHE_SHIFT)
2071 len = size & ~PAGE_CACHE_MASK;
2072 else
2073 len = PAGE_CACHE_SIZE;
2074 if (map) {
2075 cur_logical = index << (PAGE_CACHE_SHIFT -
2076 inode->i_blkbits);
2077 pblock = map->m_pblk + (cur_logical -
2078 map->m_lblk);
2079 }
2028 index++; 2080 index++;
2029 2081
2030 BUG_ON(!PageLocked(page)); 2082 BUG_ON(!PageLocked(page));
2031 BUG_ON(PageWriteback(page)); 2083 BUG_ON(PageWriteback(page));
2032 2084
2033 pages_skipped = mpd->wbc->pages_skipped;
2034 err = mapping->a_ops->writepage(page, mpd->wbc);
2035 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
2036 /*
2037 * have successfully written the page
2038 * without skipping the same
2039 */
2040 mpd->pages_written++;
2041 /* 2085 /*
2042 * In error case, we have to continue because 2086 * If the page does not have buffers (for
2043 * remaining pages are still locked 2087 * whatever reason), try to create them using
2044 * XXX: unlock and re-dirty them? 2088 * __block_write_begin. If this fails,
2089 * redirty the page and move on.
2045 */ 2090 */
2046 if (ret == 0) 2091 if (!page_has_buffers(page)) {
2047 ret = err; 2092 if (__block_write_begin(page, 0, len,
2048 } 2093 noalloc_get_block_write)) {
2049 pagevec_release(&pvec); 2094 redirty_page:
2050 } 2095 redirty_page_for_writepage(mpd->wbc,
2051 return ret; 2096 page);
2052} 2097 unlock_page(page);
2053 2098 continue;
2054/* 2099 }
2055 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2100 commit_write = 1;
2056 * 2101 }
2057 * the function goes through all passed space and put actual disk
2058 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2059 */
2060static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2061 struct ext4_map_blocks *map)
2062{
2063 struct inode *inode = mpd->inode;
2064 struct address_space *mapping = inode->i_mapping;
2065 int blocks = map->m_len;
2066 sector_t pblock = map->m_pblk, cur_logical;
2067 struct buffer_head *head, *bh;
2068 pgoff_t index, end;
2069 struct pagevec pvec;
2070 int nr_pages, i;
2071
2072 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2073 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2074 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2075
2076 pagevec_init(&pvec, 0);
2077
2078 while (index <= end) {
2079 /* XXX: optimize tail */
2080 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2081 if (nr_pages == 0)
2082 break;
2083 for (i = 0; i < nr_pages; i++) {
2084 struct page *page = pvec.pages[i];
2085
2086 index = page->index;
2087 if (index > end)
2088 break;
2089 index++;
2090
2091 BUG_ON(!PageLocked(page));
2092 BUG_ON(PageWriteback(page));
2093 BUG_ON(!page_has_buffers(page));
2094
2095 bh = page_buffers(page);
2096 head = bh;
2097
2098 /* skip blocks out of the range */
2099 do {
2100 if (cur_logical >= map->m_lblk)
2101 break;
2102 cur_logical++;
2103 } while ((bh = bh->b_this_page) != head);
2104 2102
2103 bh = page_bufs = page_buffers(page);
2104 block_start = 0;
2105 do { 2105 do {
2106 if (cur_logical >= map->m_lblk + blocks) 2106 if (!bh)
2107 break; 2107 goto redirty_page;
2108 2108 if (map && (cur_logical >= map->m_lblk) &&
2109 if (buffer_delay(bh) || buffer_unwritten(bh)) { 2109 (cur_logical <= (map->m_lblk +
2110 2110 (map->m_len - 1)))) {
2111 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2112
2113 if (buffer_delay(bh)) { 2111 if (buffer_delay(bh)) {
2114 clear_buffer_delay(bh); 2112 clear_buffer_delay(bh);
2115 bh->b_blocknr = pblock; 2113 bh->b_blocknr = pblock;
2116 } else {
2117 /*
2118 * unwritten already should have
2119 * blocknr assigned. Verify that
2120 */
2121 clear_buffer_unwritten(bh);
2122 BUG_ON(bh->b_blocknr != pblock);
2123 } 2114 }
2115 if (buffer_unwritten(bh) ||
2116 buffer_mapped(bh))
2117 BUG_ON(bh->b_blocknr != pblock);
2118 if (map->m_flags & EXT4_MAP_UNINIT)
2119 set_buffer_uninit(bh);
2120 clear_buffer_unwritten(bh);
2121 }
2124 2122
2125 } else if (buffer_mapped(bh)) 2123 /* redirty page if block allocation undone */
2126 BUG_ON(bh->b_blocknr != pblock); 2124 if (buffer_delay(bh) || buffer_unwritten(bh))
2127 2125 redirty_page = 1;
2128 if (map->m_flags & EXT4_MAP_UNINIT) 2126 bh = bh->b_this_page;
2129 set_buffer_uninit(bh); 2127 block_start += bh->b_size;
2130 cur_logical++; 2128 cur_logical++;
2131 pblock++; 2129 pblock++;
2132 } while ((bh = bh->b_this_page) != head); 2130 } while (bh != page_bufs);
2131
2132 if (redirty_page)
2133 goto redirty_page;
2134
2135 if (commit_write)
2136 /* mark the buffer_heads as dirty & uptodate */
2137 block_commit_write(page, 0, len);
2138
2139 /*
2140 * Delalloc doesn't support data journalling,
2141 * but eventually maybe we'll lift this
2142 * restriction.
2143 */
2144 if (unlikely(journal_data && PageChecked(page)))
2145 err = __ext4_journalled_writepage(page, len);
2146 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2147 err = ext4_bio_write_page(&io_submit, page,
2148 len, mpd->wbc);
2149 else
2150 err = block_write_full_page(page,
2151 noalloc_get_block_write, mpd->wbc);
2152
2153 if (!err)
2154 mpd->pages_written++;
2155 /*
2156 * In error case, we have to continue because
2157 * remaining pages are still locked
2158 */
2159 if (ret == 0)
2160 ret = err;
2133 } 2161 }
2134 pagevec_release(&pvec); 2162 pagevec_release(&pvec);
2135 } 2163 }
2164 ext4_io_submit(&io_submit);
2165 return ret;
2136} 2166}
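The rewritten mpage_da_submit_io() above folds the old mpage_put_bnr_to_bhs() walk into the submission loop: while a page's buffers are traversed, any buffer that falls inside the mapped extent gets its physical block installed and its delayed/unwritten state dropped before the page is written. A reduced sketch of just that buffer walk (variable names as in the hunk; not a complete function):

struct buffer_head *head = page_buffers(page), *bh = head;

do {
	if (map && cur_logical >= map->m_lblk &&
	    cur_logical <= map->m_lblk + map->m_len - 1) {
		if (buffer_delay(bh)) {
			clear_buffer_delay(bh);
			bh->b_blocknr = pblock;	/* install the real block */
		}
		clear_buffer_unwritten(bh);
	}
	cur_logical++;
	pblock++;
} while ((bh = bh->b_this_page) != head);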
2137 2167
2138
2139static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2168static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2140 sector_t logical, long blk_cnt) 2169 sector_t logical, long blk_cnt)
2141{ 2170{
@@ -2187,35 +2216,32 @@ static void ext4_print_free_blocks(struct inode *inode)
2187} 2216}
2188 2217
2189/* 2218/*
2190 * mpage_da_map_blocks - go through given space 2219 * mpage_da_map_and_submit - go through given space, map them
2220 * if necessary, and then submit them for I/O
2191 * 2221 *
2192 * @mpd - bh describing space 2222 * @mpd - bh describing space
2193 * 2223 *
2194 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2195 * 2225 *
2196 */ 2226 */
2197static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2227static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2198{ 2228{
2199 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2200 struct ext4_map_blocks map; 2230 struct ext4_map_blocks map, *mapp = NULL;
2201 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2202 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2203 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2204 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2205 2235
2206 /* 2236 /*
2207 * We consider only non-mapped and non-allocated blocks 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage.
2208 */ 2239 */
2209 if ((mpd->b_state & (1 << BH_Mapped)) && 2240 if ((mpd->b_size == 0) ||
2210 !(mpd->b_state & (1 << BH_Delay)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2211 !(mpd->b_state & (1 << BH_Unwritten))) 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2212 return 0; 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2213 2244 goto submit_io;
2214 /*
2215 * If we didn't accumulate anything to write simply return
2216 */
2217 if (!mpd->b_size)
2218 return 0;
2219 2245
2220 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2221 BUG_ON(!handle); 2247 BUG_ON(!handle);
@@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2231 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2232 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2233 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2234 * will set the magic i_delalloc_reserved_flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2235 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2236 * 2262 *
2237 * If the blocks in question were delalloc blocks, set 2263 * If the blocks in question were delalloc blocks, set
@@ -2252,17 +2278,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2252 2278
2253 err = blks; 2279 err = blks;
2254 /* 2280 /*
2255 * If get block returns with error we simply 2281 * If get block returns EAGAIN or ENOSPC and there
2256 * return. Later writepage will redirty the page and 2282 * appears to be free blocks we will call
2257 * writepages will find the dirty page again 2283 * ext4_writepage() for all of the pages which will
2284 * just redirty the pages.
2258 */ 2285 */
2259 if (err == -EAGAIN) 2286 if (err == -EAGAIN)
2260 return 0; 2287 goto submit_io;
2261 2288
2262 if (err == -ENOSPC && 2289 if (err == -ENOSPC &&
2263 ext4_count_free_blocks(sb)) { 2290 ext4_count_free_blocks(sb)) {
2264 mpd->retval = err; 2291 mpd->retval = err;
2265 return 0; 2292 goto submit_io;
2266 } 2293 }
2267 2294
2268 /* 2295 /*
@@ -2287,10 +2314,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2287 /* invalidate all the pages */ 2314 /* invalidate all the pages */
2288 ext4_da_block_invalidatepages(mpd, next, 2315 ext4_da_block_invalidatepages(mpd, next,
2289 mpd->b_size >> mpd->inode->i_blkbits); 2316 mpd->b_size >> mpd->inode->i_blkbits);
2290 return err; 2317 return;
2291 } 2318 }
2292 BUG_ON(blks == 0); 2319 BUG_ON(blks == 0);
2293 2320
2321 mapp = &map;
2294 if (map.m_flags & EXT4_MAP_NEW) { 2322 if (map.m_flags & EXT4_MAP_NEW) {
2295 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2323 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2296 int i; 2324 int i;
@@ -2299,18 +2327,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2299 unmap_underlying_metadata(bdev, map.m_pblk + i); 2327 unmap_underlying_metadata(bdev, map.m_pblk + i);
2300 } 2328 }
2301 2329
2302 /*
2303 * If blocks are delayed marked, we need to
2304 * put actual blocknr and drop delayed bit
2305 */
2306 if ((mpd->b_state & (1 << BH_Delay)) ||
2307 (mpd->b_state & (1 << BH_Unwritten)))
2308 mpage_put_bnr_to_bhs(mpd, &map);
2309
2310 if (ext4_should_order_data(mpd->inode)) { 2330 if (ext4_should_order_data(mpd->inode)) {
2311 err = ext4_jbd2_file_inode(handle, mpd->inode); 2331 err = ext4_jbd2_file_inode(handle, mpd->inode);
2312 if (err) 2332 if (err)
2313 return err; 2333 /* This only happens if the journal is aborted */
2334 return;
2314 } 2335 }
2315 2336
2316 /* 2337 /*
@@ -2321,10 +2342,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2321 disksize = i_size_read(mpd->inode); 2342 disksize = i_size_read(mpd->inode);
2322 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2343 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2323 ext4_update_i_disksize(mpd->inode, disksize); 2344 ext4_update_i_disksize(mpd->inode, disksize);
2324 return ext4_mark_inode_dirty(handle, mpd->inode); 2345 err = ext4_mark_inode_dirty(handle, mpd->inode);
2346 if (err)
2347 ext4_error(mpd->inode->i_sb,
2348 "Failed to mark inode %lu dirty",
2349 mpd->inode->i_ino);
2325 } 2350 }
2326 2351
2327 return 0; 2352submit_io:
2353 mpage_da_submit_io(mpd, mapp);
2354 mpd->io_done = 1;
2328} 2355}
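Structurally, the change above turns every early "return 0" of the old mpage_da_map_blocks() into a jump to a single submission point, which is why the function can become void: whatever happens during block allocation, the accumulated pages are handed to mpage_da_submit_io() (which redirties anything still unmapped) and io_done is set. A control-flow skeleton, with hypothetical predicates standing in for the real checks:

static void map_and_submit_sketch(struct mpage_da_data *mpd)
{
	struct ext4_map_blocks map, *mapp = NULL;

	if (already_mapped_or_empty(mpd))	/* hypothetical check */
		goto submit_io;
	if (allocate_blocks(mpd, &map) <= 0)	/* hypothetical; EAGAIN/ENOSPC land here */
		goto submit_io;
	mapp = &map;				/* pass the mapping down */
submit_io:
	mpage_da_submit_io(mpd, mapp);		/* writes or redirties the pages */
	mpd->io_done = 1;
}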
2329 2356
2330#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2357#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2428,7 @@ flush_it:
2401 * We couldn't merge the block to our extent, so we 2428 * We couldn't merge the block to our extent, so we
2402 * need to flush current extent and start new one 2429 * need to flush current extent and start new one
2403 */ 2430 */
2404 if (mpage_da_map_blocks(mpd) == 0) 2431 mpage_da_map_and_submit(mpd);
2405 mpage_da_submit_io(mpd);
2406 mpd->io_done = 1;
2407 return; 2432 return;
2408} 2433}
2409 2434
@@ -2422,9 +2447,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2422 * The function finds extents of pages and scans them for all blocks. 2447 * The function finds extents of pages and scans them for all blocks.
2423 */ 2448 */
2424static int __mpage_da_writepage(struct page *page, 2449static int __mpage_da_writepage(struct page *page,
2425 struct writeback_control *wbc, void *data) 2450 struct writeback_control *wbc,
2451 struct mpage_da_data *mpd)
2426{ 2452{
2427 struct mpage_da_data *mpd = data;
2428 struct inode *inode = mpd->inode; 2453 struct inode *inode = mpd->inode;
2429 struct buffer_head *bh, *head; 2454 struct buffer_head *bh, *head;
2430 sector_t logical; 2455 sector_t logical;
@@ -2435,15 +2460,13 @@ static int __mpage_da_writepage(struct page *page,
2435 if (mpd->next_page != page->index) { 2460 if (mpd->next_page != page->index) {
2436 /* 2461 /*
2437 * Nope, we can't. So, we map non-allocated blocks 2462 * Nope, we can't. So, we map non-allocated blocks
2438 * and start IO on them using writepage() 2463 * and start IO on them
2439 */ 2464 */
2440 if (mpd->next_page != mpd->first_page) { 2465 if (mpd->next_page != mpd->first_page) {
2441 if (mpage_da_map_blocks(mpd) == 0) 2466 mpage_da_map_and_submit(mpd);
2442 mpage_da_submit_io(mpd);
2443 /* 2467 /*
2444 * skip rest of the page in the page_vec 2468 * skip rest of the page in the page_vec
2445 */ 2469 */
2446 mpd->io_done = 1;
2447 redirty_page_for_writepage(wbc, page); 2470 redirty_page_for_writepage(wbc, page);
2448 unlock_page(page); 2471 unlock_page(page);
2449 return MPAGE_DA_EXTENT_TAIL; 2472 return MPAGE_DA_EXTENT_TAIL;
@@ -2550,8 +2573,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2550 if (buffer_delay(bh)) 2573 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */ 2574 return 0; /* Not sure this could or should happen */
2552 /* 2575 /*
2553 * XXX: __block_prepare_write() unmaps passed block, 2576 * XXX: __block_write_begin() unmaps passed block, is it OK?
2554 * is it OK?
2555 */ 2577 */
2556 ret = ext4_da_reserve_space(inode, iblock); 2578 ret = ext4_da_reserve_space(inode, iblock);
2557 if (ret) 2579 if (ret)
@@ -2583,7 +2605,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2583/* 2605/*
2584 * This function is used as a standard get_block_t callback function 2606 * This function is used as a standard get_block_t callback function
2585 * when there is no desire to allocate any blocks. It is used as a 2607 * when there is no desire to allocate any blocks. It is used as a
2586 * callback function for block_prepare_write() and block_write_full_page(). 2608 * callback function for block_write_begin() and block_write_full_page().
2587 * These functions should only try to map a single block at a time. 2609 * These functions should only try to map a single block at a time.
2588 * 2610 *
2589 * Since this function doesn't do block allocations even if the caller 2611 * Since this function doesn't do block allocations even if the caller
@@ -2623,6 +2645,7 @@ static int __ext4_journalled_writepage(struct page *page,
2623 int ret = 0; 2645 int ret = 0;
2624 int err; 2646 int err;
2625 2647
2648 ClearPageChecked(page);
2626 page_bufs = page_buffers(page); 2649 page_bufs = page_buffers(page);
2627 BUG_ON(!page_bufs); 2650 BUG_ON(!page_bufs);
2628 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2651 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2700,7 +2723,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2700static int ext4_writepage(struct page *page, 2723static int ext4_writepage(struct page *page,
2701 struct writeback_control *wbc) 2724 struct writeback_control *wbc)
2702{ 2725{
2703 int ret = 0; 2726 int ret = 0, commit_write = 0;
2704 loff_t size; 2727 loff_t size;
2705 unsigned int len; 2728 unsigned int len;
2706 struct buffer_head *page_bufs = NULL; 2729 struct buffer_head *page_bufs = NULL;
@@ -2713,71 +2736,44 @@ static int ext4_writepage(struct page *page,
2713 else 2736 else
2714 len = PAGE_CACHE_SIZE; 2737 len = PAGE_CACHE_SIZE;
2715 2738
2716 if (page_has_buffers(page)) { 2739 /*
2717 page_bufs = page_buffers(page); 2740 * If the page does not have buffers (for whatever reason),
2718 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2741 * try to create them using __block_write_begin. If this
2719 ext4_bh_delay_or_unwritten)) { 2742 * fails, redirty the page and move on.
2720 /* 2743 */
2721 * We don't want to do block allocation 2744 if (!page_has_buffers(page)) {
2722 * So redirty the page and return 2745 if (__block_write_begin(page, 0, len,
2723 * We may reach here when we do a journal commit 2746 noalloc_get_block_write)) {
2724 * via journal_submit_inode_data_buffers. 2747 redirty_page:
2725 * If we don't have mapping block we just ignore
2726 * them. We can also reach here via shrink_page_list
2727 */
2728 redirty_page_for_writepage(wbc, page); 2748 redirty_page_for_writepage(wbc, page);
2729 unlock_page(page); 2749 unlock_page(page);
2730 return 0; 2750 return 0;
2731 } 2751 }
2732 } else { 2752 commit_write = 1;
2753 }
2754 page_bufs = page_buffers(page);
2755 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2756 ext4_bh_delay_or_unwritten)) {
2733 /* 2757 /*
2734 * The test for page_has_buffers() is subtle: 2758 * We don't want to do block allocation, so redirty
2735 * We know the page is dirty but it lost buffers. That means 2759 * the page and return. We may reach here when we do
2736 * that at some moment in time after write_begin()/write_end() 2760 * a journal commit via journal_submit_inode_data_buffers.
2737 * has been called all buffers have been clean and thus they 2761 * We can also reach here via shrink_page_list
2738 * must have been written at least once. So they are all
2739 * mapped and we can happily proceed with mapping them
2740 * and writing the page.
2741 *
2742 * Try to initialize the buffer_heads and check whether
2743 * all are mapped and non delay. We don't want to
2744 * do block allocation here.
2745 */ 2762 */
2746 ret = block_prepare_write(page, 0, len, 2763 goto redirty_page;
2747 noalloc_get_block_write); 2764 }
2748 if (!ret) { 2765 if (commit_write)
2749 page_bufs = page_buffers(page);
2750 /* check whether all are mapped and non delay */
2751 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2752 ext4_bh_delay_or_unwritten)) {
2753 redirty_page_for_writepage(wbc, page);
2754 unlock_page(page);
2755 return 0;
2756 }
2757 } else {
2758 /*
2759 * We can't do block allocation here
2760 * so just redity the page and unlock
2761 * and return
2762 */
2763 redirty_page_for_writepage(wbc, page);
2764 unlock_page(page);
2765 return 0;
2766 }
2767 /* now mark the buffer_heads as dirty and uptodate */ 2766 /* now mark the buffer_heads as dirty and uptodate */
2768 block_commit_write(page, 0, len); 2767 block_commit_write(page, 0, len);
2769 }
2770 2768
2771 if (PageChecked(page) && ext4_should_journal_data(inode)) { 2769 if (PageChecked(page) && ext4_should_journal_data(inode))
2772 /* 2770 /*
2773 * It's mmapped pagecache. Add buffers and journal it. There 2771 * It's mmapped pagecache. Add buffers and journal it. There
2774 * doesn't seem much point in redirtying the page here. 2772 * doesn't seem much point in redirtying the page here.
2775 */ 2773 */
2776 ClearPageChecked(page);
2777 return __ext4_journalled_writepage(page, len); 2774 return __ext4_journalled_writepage(page, len);
2778 }
2779 2775
2780 if (page_bufs && buffer_uninit(page_bufs)) { 2776 if (buffer_uninit(page_bufs)) {
2781 ext4_set_bh_endio(page_bufs, inode); 2777 ext4_set_bh_endio(page_bufs, inode);
2782 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2778 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2783 wbc, ext4_end_io_buffer_write); 2779 wbc, ext4_end_io_buffer_write);
@@ -2824,25 +2820,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2824 */ 2820 */
2825static int write_cache_pages_da(struct address_space *mapping, 2821static int write_cache_pages_da(struct address_space *mapping,
2826 struct writeback_control *wbc, 2822 struct writeback_control *wbc,
2827 struct mpage_da_data *mpd) 2823 struct mpage_da_data *mpd,
2824 pgoff_t *done_index)
2828{ 2825{
2829 int ret = 0; 2826 int ret = 0;
2830 int done = 0; 2827 int done = 0;
2831 struct pagevec pvec; 2828 struct pagevec pvec;
2832 int nr_pages; 2829 unsigned nr_pages;
2833 pgoff_t index; 2830 pgoff_t index;
2834 pgoff_t end; /* Inclusive */ 2831 pgoff_t end; /* Inclusive */
2835 long nr_to_write = wbc->nr_to_write; 2832 long nr_to_write = wbc->nr_to_write;
2833 int tag;
2836 2834
2837 pagevec_init(&pvec, 0); 2835 pagevec_init(&pvec, 0);
2838 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2836 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2839 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2837 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2840 2838
2839 if (wbc->sync_mode == WB_SYNC_ALL)
2840 tag = PAGECACHE_TAG_TOWRITE;
2841 else
2842 tag = PAGECACHE_TAG_DIRTY;
2843
2844 *done_index = index;
2841 while (!done && (index <= end)) { 2845 while (!done && (index <= end)) {
2842 int i; 2846 int i;
2843 2847
2844 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2848 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2845 PAGECACHE_TAG_DIRTY,
2846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2849 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2847 if (nr_pages == 0) 2850 if (nr_pages == 0)
2848 break; 2851 break;
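The tag selection above, together with the tag_pages_for_writeback() call added at the retry: label further down, is the standard livelock-avoidance pattern for data-integrity writeback: WB_SYNC_ALL first snapshots the currently dirty pages as TOWRITE, then walks only that tag, so pages dirtied while the sync runs are left for the next pass. In outline (calls exactly as in the hunks):

int tag = (wbc->sync_mode == WB_SYNC_ALL) ? PAGECACHE_TAG_TOWRITE
					  : PAGECACHE_TAG_DIRTY;

if (wbc->sync_mode == WB_SYNC_ALL)
	tag_pages_for_writeback(mapping, index, end);	/* snapshot the dirty set */

nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);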
@@ -2862,6 +2865,8 @@ static int write_cache_pages_da(struct address_space *mapping,
2862 break; 2865 break;
2863 } 2866 }
2864 2867
2868 *done_index = page->index + 1;
2869
2865 lock_page(page); 2870 lock_page(page);
2866 2871
2867 /* 2872 /*
@@ -2947,6 +2952,8 @@ static int ext4_da_writepages(struct address_space *mapping,
2947 long desired_nr_to_write, nr_to_writebump = 0; 2952 long desired_nr_to_write, nr_to_writebump = 0;
2948 loff_t range_start = wbc->range_start; 2953 loff_t range_start = wbc->range_start;
2949 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2954 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2955 pgoff_t done_index = 0;
2956 pgoff_t end;
2950 2957
2951 trace_ext4_da_writepages(inode, wbc); 2958 trace_ext4_da_writepages(inode, wbc);
2952 2959
@@ -2982,8 +2989,11 @@ static int ext4_da_writepages(struct address_space *mapping,
2982 wbc->range_start = index << PAGE_CACHE_SHIFT; 2989 wbc->range_start = index << PAGE_CACHE_SHIFT;
2983 wbc->range_end = LLONG_MAX; 2990 wbc->range_end = LLONG_MAX;
2984 wbc->range_cyclic = 0; 2991 wbc->range_cyclic = 0;
2985 } else 2992 end = -1;
2993 } else {
2986 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2994 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2995 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2996 }
2987 2997
2988 /* 2998 /*
2989 * This works around two forms of stupidity. The first is in 2999 * This works around two forms of stupidity. The first is in
@@ -3002,9 +3012,12 @@ static int ext4_da_writepages(struct address_space *mapping,
3002 * sbi->max_writeback_mb_bump whichever is smaller. 3012 * sbi->max_writeback_mb_bump whichever is smaller.
3003 */ 3013 */
3004 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 3014 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
3005 if (!range_cyclic && range_whole) 3015 if (!range_cyclic && range_whole) {
3006 desired_nr_to_write = wbc->nr_to_write * 8; 3016 if (wbc->nr_to_write == LONG_MAX)
3007 else 3017 desired_nr_to_write = wbc->nr_to_write;
3018 else
3019 desired_nr_to_write = wbc->nr_to_write * 8;
3020 } else
3008 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 3021 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
3009 max_pages); 3022 max_pages);
3010 if (desired_nr_to_write > max_pages) 3023 if (desired_nr_to_write > max_pages)
@@ -3021,6 +3034,9 @@ static int ext4_da_writepages(struct address_space *mapping,
3021 pages_skipped = wbc->pages_skipped; 3034 pages_skipped = wbc->pages_skipped;
3022 3035
3023retry: 3036retry:
3037 if (wbc->sync_mode == WB_SYNC_ALL)
3038 tag_pages_for_writeback(mapping, index, end);
3039
3024 while (!ret && wbc->nr_to_write > 0) { 3040 while (!ret && wbc->nr_to_write > 0) {
3025 3041
3026 /* 3042 /*
@@ -3059,16 +3075,14 @@ retry:
3059 mpd.io_done = 0; 3075 mpd.io_done = 0;
3060 mpd.pages_written = 0; 3076 mpd.pages_written = 0;
3061 mpd.retval = 0; 3077 mpd.retval = 0;
3062 ret = write_cache_pages_da(mapping, wbc, &mpd); 3078 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3063 /* 3079 /*
3064 * If we have a contiguous extent of pages and we 3080 * If we have a contiguous extent of pages and we
3065 * haven't done the I/O yet, map the blocks and submit 3081 * haven't done the I/O yet, map the blocks and submit
3066 * them for I/O. 3082 * them for I/O.
3067 */ 3083 */
3068 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3084 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3069 if (mpage_da_map_blocks(&mpd) == 0) 3085 mpage_da_map_and_submit(&mpd);
3070 mpage_da_submit_io(&mpd);
3071 mpd.io_done = 1;
3072 ret = MPAGE_DA_EXTENT_TAIL; 3086 ret = MPAGE_DA_EXTENT_TAIL;
3073 } 3087 }
3074 trace_ext4_da_write_pages(inode, &mpd); 3088 trace_ext4_da_write_pages(inode, &mpd);
@@ -3115,14 +3129,13 @@ retry:
3115 __func__, wbc->nr_to_write, ret); 3129 __func__, wbc->nr_to_write, ret);
3116 3130
3117 /* Update index */ 3131 /* Update index */
3118 index += pages_written;
3119 wbc->range_cyclic = range_cyclic; 3132 wbc->range_cyclic = range_cyclic;
3120 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3133 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3121 /* 3134 /*
3122 * set the writeback_index so that range_cyclic 3135 * set the writeback_index so that range_cyclic
3123 * mode will write it back later 3136 * mode will write it back later
3124 */ 3137 */
3125 mapping->writeback_index = index; 3138 mapping->writeback_index = done_index;
3126 3139
3127out_writepages: 3140out_writepages:
3128 wbc->nr_to_write -= nr_to_writebump; 3141 wbc->nr_to_write -= nr_to_writebump;
@@ -3367,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
3367 * doing I/O at all. 3380 * doing I/O at all.
3368 * 3381 *
3369 * We could call write_cache_pages(), and then redirty all of 3382 * We could call write_cache_pages(), and then redirty all of
3370 * the pages by calling redirty_page_for_writeback() but that 3383 * the pages by calling redirty_page_for_writepage() but that
3371 * would be ugly in the extreme. So instead we would need to 3384 * would be ugly in the extreme. So instead we would need to
3372 * replicate parts of the code in the above functions, 3385 * replicate parts of the code in the above functions,
3373 * simplifying them because we wouldn't actually intend to 3386 * simplifying them because we wouldn't actually intend to
@@ -3457,15 +3470,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3457 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3470 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3458} 3471}
3459 3472
3460static void ext4_free_io_end(ext4_io_end_t *io)
3461{
3462 BUG_ON(!io);
3463 if (io->page)
3464 put_page(io->page);
3465 iput(io->inode);
3466 kfree(io);
3467}
3468
3469static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3473static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3470{ 3474{
3471 struct buffer_head *head, *bh; 3475 struct buffer_head *head, *bh;
@@ -3642,173 +3646,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3642 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3646 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3643} 3647}
3644 3648
3645static void dump_completed_IO(struct inode * inode)
3646{
3647#ifdef EXT4_DEBUG
3648 struct list_head *cur, *before, *after;
3649 ext4_io_end_t *io, *io0, *io1;
3650 unsigned long flags;
3651
3652 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3653 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3654 return;
3655 }
3656
3657 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3658 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3659 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3660 cur = &io->list;
3661 before = cur->prev;
3662 io0 = container_of(before, ext4_io_end_t, list);
3663 after = cur->next;
3664 io1 = container_of(after, ext4_io_end_t, list);
3665
3666 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3667 io, inode->i_ino, io0, io1);
3668 }
3669 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3670#endif
3671}
3672
3673/*
3674 * check a range of space and convert unwritten extents to written.
3675 */
3676static int ext4_end_io_nolock(ext4_io_end_t *io)
3677{
3678 struct inode *inode = io->inode;
3679 loff_t offset = io->offset;
3680 ssize_t size = io->size;
3681 int ret = 0;
3682
3683 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3684 "list->prev 0x%p\n",
3685 io, inode->i_ino, io->list.next, io->list.prev);
3686
3687 if (list_empty(&io->list))
3688 return ret;
3689
3690 if (io->flag != EXT4_IO_UNWRITTEN)
3691 return ret;
3692
3693 ret = ext4_convert_unwritten_extents(inode, offset, size);
3694 if (ret < 0) {
3695 printk(KERN_EMERG "%s: failed to convert unwritten"
3696 "extents to written extents, error is %d"
3697 " io is still on inode %lu aio dio list\n",
3698 __func__, ret, inode->i_ino);
3699 return ret;
3700 }
3701
3702 if (io->iocb)
3703 aio_complete(io->iocb, io->result, 0);
3704 /* clear the DIO AIO unwritten flag */
3705 io->flag = 0;
3706 return ret;
3707}
3708
3709/*
3710 * work on completed aio dio IO, to convert unwritten extents to extents
3711 */
3712static void ext4_end_io_work(struct work_struct *work)
3713{
3714 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3715 struct inode *inode = io->inode;
3716 struct ext4_inode_info *ei = EXT4_I(inode);
3717 unsigned long flags;
3718 int ret;
3719
3720 mutex_lock(&inode->i_mutex);
3721 ret = ext4_end_io_nolock(io);
3722 if (ret < 0) {
3723 mutex_unlock(&inode->i_mutex);
3724 return;
3725 }
3726
3727 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3728 if (!list_empty(&io->list))
3729 list_del_init(&io->list);
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3731 mutex_unlock(&inode->i_mutex);
3732 ext4_free_io_end(io);
3733}
3734
3735/*
3736 * This function is called from ext4_sync_file().
3737 *
3738 * When IO is completed, the work to convert unwritten extents to
3739 * written is queued on workqueue but may not get immediately
3740 * scheduled. When fsync is called, we need to ensure the
3741 * conversion is complete before fsync returns.
3742 * The inode keeps track of a list of pending/completed IO that
3743 * might need to do the conversion. This function walks through
3744 * the list and converts the related unwritten extents for completed IO
3745 * to written.
3746 * The function returns the number of pending IOs on success.
3747 */
3748int flush_completed_IO(struct inode *inode)
3749{
3750 ext4_io_end_t *io;
3751 struct ext4_inode_info *ei = EXT4_I(inode);
3752 unsigned long flags;
3753 int ret = 0;
3754 int ret2 = 0;
3755
3756 if (list_empty(&ei->i_completed_io_list))
3757 return ret;
3758
3759 dump_completed_IO(inode);
3760 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3761 while (!list_empty(&ei->i_completed_io_list)){
3762 io = list_entry(ei->i_completed_io_list.next,
3763 ext4_io_end_t, list);
3764 /*
3765 * Calling ext4_end_io_nolock() to convert completed
3766 * IO to written.
3767 *
3768 * When ext4_sync_file() is called, run_queue() may already be
3769 * about to flush the work corresponding to this io structure.
3770 * It will be upset if it finds that the io structure related
3771 * to the work to be scheduled has been freed.
3772 *
3773 * Thus we need to keep the io structure still valid here after
3774 * the conversion has finished. The io structure has a flag to
3775 * avoid double converting from both fsync and background work
3776 * queue work.
3777 */
3778 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3779 ret = ext4_end_io_nolock(io);
3780 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3781 if (ret < 0)
3782 ret2 = ret;
3783 else
3784 list_del_init(&io->list);
3785 }
3786 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3787 return (ret2 < 0) ? ret2 : 0;
3788}
3789
3790static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3791{
3792 ext4_io_end_t *io = NULL;
3793
3794 io = kmalloc(sizeof(*io), flags);
3795
3796 if (io) {
3797 igrab(inode);
3798 io->inode = inode;
3799 io->flag = 0;
3800 io->offset = 0;
3801 io->size = 0;
3802 io->page = NULL;
3803 io->iocb = NULL;
3804 io->result = 0;
3805 INIT_WORK(&io->work, ext4_end_io_work);
3806 INIT_LIST_HEAD(&io->list);
3807 }
3808
3809 return io;
3810}
3811
3812static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3649static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3813 ssize_t size, void *private, int ret, 3650 ssize_t size, void *private, int ret,
3814 bool is_async) 3651 bool is_async)
@@ -3828,7 +3665,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3828 size); 3665 size);
3829 3666
3830 /* if not aio dio with unwritten extents, just free io and return */ 3667 /* if not aio dio with unwritten extents, just free io and return */
3831 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3668 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3832 ext4_free_io_end(io_end); 3669 ext4_free_io_end(io_end);
3833 iocb->private = NULL; 3670 iocb->private = NULL;
3834out: 3671out:
@@ -3845,14 +3682,14 @@ out:
3845 } 3682 }
3846 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3683 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3847 3684
3848 /* queue the work to convert unwritten extents to written */
3849 queue_work(wq, &io_end->work);
3850
3851 /* Add the io_end to per-inode completed aio dio list*/ 3685 /* Add the io_end to per-inode completed aio dio list*/
3852 ei = EXT4_I(io_end->inode); 3686 ei = EXT4_I(io_end->inode);
3853 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3687 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3854 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3688 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3855 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3689 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3690
3691 /* queue the work to convert unwritten extents to written */
3692 queue_work(wq, &io_end->work);
3856 iocb->private = NULL; 3693 iocb->private = NULL;
3857} 3694}
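The reordering above is a visibility fix: the io_end must already be on the per-inode completed list when the worker runs, otherwise the queued work (or a racing fsync walking the list) can miss it. The required order, reduced to its essentials:

spin_lock_irqsave(&ei->i_completed_io_lock, flags);
list_add_tail(&io_end->list, &ei->i_completed_io_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

/* Only queue the conversion work once the entry is visible. */
queue_work(wq, &io_end->work);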
3858 3695
@@ -3873,7 +3710,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3873 goto out; 3710 goto out;
3874 } 3711 }
3875 3712
3876 io_end->flag = EXT4_IO_UNWRITTEN; 3713 io_end->flag = EXT4_IO_END_UNWRITTEN;
3877 inode = io_end->inode; 3714 inode = io_end->inode;
3878 3715
3879 /* Add the io_end to per-inode completed io list*/ 3716 /* Add the io_end to per-inode completed io list*/
@@ -3901,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3901retry: 3738retry:
3902 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3739 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3903 if (!io_end) { 3740 if (!io_end) {
3904 if (printk_ratelimit()) 3741 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3905 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3906 schedule(); 3742 schedule();
3907 goto retry; 3743 goto retry;
3908 } 3744 }
@@ -3926,9 +3762,9 @@ retry:
3926 * preallocated extents, and those write extend the file, no need to 3762 * preallocated extents, and those write extend the file, no need to
3927 * fall back to buffered IO. 3763 * fall back to buffered IO.
3928 * 3764 *
3929 * For holes, we fallocate those blocks, mark them as unintialized 3765 * For holes, we fallocate those blocks, mark them as uninitialized
3930 * If those blocks were preallocated, we make sure they are split, but 3766 * If those blocks were preallocated, we make sure they are split, but
3931 * still keep the range to write as unintialized. 3767 * still keep the range to write as uninitialized.
3932 * 3768 *
3933 * The unwritten extents will be converted to written when DIO is completed. 3769 * The unwritten extents will be converted to written when DIO is completed.
3934 * For async direct IO, since the IO may still be pending when we return, we 3770 * For async direct IO, since the IO may still be pending when we return, we
@@ -4226,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle,
4226 if (ext4_should_journal_data(inode)) { 4062 if (ext4_should_journal_data(inode)) {
4227 err = ext4_handle_dirty_metadata(handle, inode, bh); 4063 err = ext4_handle_dirty_metadata(handle, inode, bh);
4228 } else { 4064 } else {
4229 if (ext4_should_order_data(inode)) 4065 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4230 err = ext4_jbd2_file_inode(handle, inode); 4066 err = ext4_jbd2_file_inode(handle, inode);
4231 mark_buffer_dirty(bh); 4067 mark_buffer_dirty(bh);
4232 } 4068 }
@@ -4350,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4350{ 4186{
4351 __le32 *p; 4187 __le32 *p;
4352 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4188 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4189 int err;
4353 4190
4354 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4191 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4355 flags |= EXT4_FREE_BLOCKS_METADATA; 4192 flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4365,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4365 if (try_to_extend_transaction(handle, inode)) { 4202 if (try_to_extend_transaction(handle, inode)) {
4366 if (bh) { 4203 if (bh) {
4367 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4204 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4368 ext4_handle_dirty_metadata(handle, inode, bh); 4205 err = ext4_handle_dirty_metadata(handle, inode, bh);
4206 if (unlikely(err)) {
4207 ext4_std_error(inode->i_sb, err);
4208 return 1;
4209 }
4210 }
4211 err = ext4_mark_inode_dirty(handle, inode);
4212 if (unlikely(err)) {
4213 ext4_std_error(inode->i_sb, err);
4214 return 1;
4215 }
4216 err = ext4_truncate_restart_trans(handle, inode,
4217 blocks_for_truncate(inode));
4218 if (unlikely(err)) {
4219 ext4_std_error(inode->i_sb, err);
4220 return 1;
4369 } 4221 }
4370 ext4_mark_inode_dirty(handle, inode);
4371 ext4_truncate_restart_trans(handle, inode,
4372 blocks_for_truncate(inode));
4373 if (bh) { 4222 if (bh) {
4374 BUFFER_TRACE(bh, "retaking write access"); 4223 BUFFER_TRACE(bh, "retaking write access");
4375 ext4_journal_get_write_access(handle, bh); 4224 ext4_journal_get_write_access(handle, bh);
@@ -4530,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4530 (__le32 *) bh->b_data, 4379 (__le32 *) bh->b_data,
4531 (__le32 *) bh->b_data + addr_per_block, 4380 (__le32 *) bh->b_data + addr_per_block,
4532 depth); 4381 depth);
4382 brelse(bh);
4533 4383
4534 /* 4384 /*
4535 * Everything below this pointer has been 4385 * Everything below this pointer has been
@@ -5040,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5040 } 4890 }
5041 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
5042 4892
5043 ei->i_state_flags = 0; 4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
5044 ei->i_dir_start_lookup = 0; 4894 ei->i_dir_start_lookup = 0;
5045 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
5046 /* We now have enough fields to check if the inode was active or not. 4896 /* We now have enough fields to check if the inode was active or not.
@@ -5299,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
5299 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5149 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5300 goto out_brelse; 5150 goto out_brelse;
5301 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5302 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5303 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5304 cpu_to_le32(EXT4_OS_HURD)) 5154 cpu_to_le32(EXT4_OS_HURD))
5305 raw_inode->i_file_acl_high = 5155 raw_inode->i_file_acl_high =
@@ -5464,6 +5314,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5464{ 5314{
5465 struct inode *inode = dentry->d_inode; 5315 struct inode *inode = dentry->d_inode;
5466 int error, rc = 0; 5316 int error, rc = 0;
5317 int orphan = 0;
5467 const unsigned int ia_valid = attr->ia_valid; 5318 const unsigned int ia_valid = attr->ia_valid;
5468 5319
5469 error = inode_change_ok(inode, attr); 5320 error = inode_change_ok(inode, attr);
@@ -5519,8 +5370,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5519 error = PTR_ERR(handle); 5370 error = PTR_ERR(handle);
5520 goto err_out; 5371 goto err_out;
5521 } 5372 }
5522 5373 if (ext4_handle_valid(handle)) {
5523 error = ext4_orphan_add(handle, inode); 5374 error = ext4_orphan_add(handle, inode);
5375 orphan = 1;
5376 }
5524 EXT4_I(inode)->i_disksize = attr->ia_size; 5377 EXT4_I(inode)->i_disksize = attr->ia_size;
5525 rc = ext4_mark_inode_dirty(handle, inode); 5378 rc = ext4_mark_inode_dirty(handle, inode);
5526 if (!error) 5379 if (!error)
@@ -5538,6 +5391,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5538 goto err_out; 5391 goto err_out;
5539 } 5392 }
5540 ext4_orphan_del(handle, inode); 5393 ext4_orphan_del(handle, inode);
5394 orphan = 0;
5541 ext4_journal_stop(handle); 5395 ext4_journal_stop(handle);
5542 goto err_out; 5396 goto err_out;
5543 } 5397 }
@@ -5560,7 +5414,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5560 * If the call to ext4_truncate failed to get a transaction handle at 5414 * If the call to ext4_truncate failed to get a transaction handle at
5561 * all, we need to clean up the in-core orphan list manually. 5415 * all, we need to clean up the in-core orphan list manually.
5562 */ 5416 */
5563 if (inode->i_nlink) 5417 if (orphan && inode->i_nlink)
5564 ext4_orphan_del(NULL, inode); 5418 ext4_orphan_del(NULL, inode);
5565 5419
5566 if (!rc && (ia_valid & ATTR_MODE)) 5420 if (!rc && (ia_valid & ATTR_MODE))
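The orphan variable threaded through ext4_setattr() above records whether the inode was actually placed on the on-disk orphan list, which can only happen under a valid (journalled) handle; the in-core cleanup at the end then fires only in that case. The bookkeeping in brief:

int orphan = 0;

if (ext4_handle_valid(handle)) {
	error = ext4_orphan_add(handle, inode);
	orphan = 1;			/* remember we are on the list */
}
/* ... the truncate below may fail part-way ... */
if (orphan && inode->i_nlink)
	ext4_orphan_del(NULL, inode);	/* in-core cleanup only if we added it */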
@@ -5592,9 +5446,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5592 * will return the blocks that include the delayed allocation 5446 * will return the blocks that include the delayed allocation
5593 * blocks for this file. 5447 * blocks for this file.
5594 */ 5448 */
5595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5596 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5449 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5598 5450
5599 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5451 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5600 return 0; 5452 return 0;
@@ -5643,7 +5495,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5643 * 5495 *
5644 * Also account for superblock, inode, quota and xattr blocks 5496 * Also account for superblock, inode, quota and xattr blocks
5645 */ 5497 */
5646int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5498static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5647{ 5499{
5648 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5500 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5649 int gdpblocks; 5501 int gdpblocks;
@@ -5831,6 +5683,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5831 int err, ret; 5683 int err, ret;
5832 5684
5833 might_sleep(); 5685 might_sleep();
5686 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5834 err = ext4_reserve_inode_write(handle, inode, &iloc); 5687 err = ext4_reserve_inode_write(handle, inode, &iloc);
5835 if (ext4_handle_valid(handle) && 5688 if (ext4_handle_valid(handle) &&
5836 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5689 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1bd..eb3bc2fe647e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,6 +331,30 @@ mext_out:
331 return err; 331 return err;
332 } 332 }
333 333
334 case FITRIM:
335 {
336 struct super_block *sb = inode->i_sb;
337 struct fstrim_range range;
338 int ret = 0;
339
340 if (!capable(CAP_SYS_ADMIN))
341 return -EPERM;
342
343 if (copy_from_user(&range, (struct fstrim_range *)arg,
344 sizeof(range)))
345 return -EFAULT;
346
347 ret = ext4_trim_fs(sb, &range);
348 if (ret < 0)
349 return ret;
350
351 if (copy_to_user((struct fstrim_range *)arg, &range,
352 sizeof(range)))
353 return -EFAULT;
354
355 return 0;
356 }
357
334 default: 358 default:
335 return -ENOTTY; 359 return -ENOTTY;
336 } 360 }
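From userspace the new ioctl is driven with a struct fstrim_range, and the kernel writes back the number of bytes it actually discarded. A minimal caller (error handling trimmed; the mount point is an example, and since the hunk above requires CAP_SYS_ADMIN, run it as root):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = (__u64)-1,	/* trim the whole filesystem */
		.minlen = 0,
	};
	int fd = open("/mnt", O_RDONLY);	/* example mount point */

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}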
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..851f49b2f9d2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -338,6 +338,14 @@
338static struct kmem_cache *ext4_pspace_cachep; 338static struct kmem_cache *ext4_pspace_cachep;
339static struct kmem_cache *ext4_ac_cachep; 339static struct kmem_cache *ext4_ac_cachep;
340static struct kmem_cache *ext4_free_ext_cachep; 340static struct kmem_cache *ext4_free_ext_cachep;
341
342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES \
346 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
347static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
348
341static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 349static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
342 ext4_group_t group); 350 ext4_group_t group);
343static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 351static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
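Each of these caches is sized like the kzalloc() it replaces in ext4_mb_add_groupinfo(): the group_info header plus a bb_counters[] array covering s_blocksize_bits + 2 buddy orders. A sketch of how one cache would be created for a given block size (the cache name here is illustrative, not the actual string used):

int len = offsetof(struct ext4_group_info,
		   bb_counters[blocksize_bits + 2]);

/* One cache per distinct s_blocksize_bits value, shared by all
 * filesystems mounted with that block size. */
ext4_groupinfo_caches[blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE] =
	kmem_cache_create("ext4_groupinfo", len, 0,
			  SLAB_RECLAIM_ACCOUNT, NULL);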
@@ -939,6 +947,85 @@ out:
939} 947}
940 948
941/* 949/*
950 * lock the group_info alloc_sem of all the groups
951 * belonging to the same buddy cache page. This
952 * makes sure other parallel operations on the buddy
953 * cache don't happen while holding the buddy cache
954 * lock
955 */
956static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
957 ext4_group_t group)
958{
959 int i;
960 int block, pnum;
961 int blocks_per_page;
962 int groups_per_page;
963 ext4_group_t ngroups = ext4_get_groups_count(sb);
964 ext4_group_t first_group;
965 struct ext4_group_info *grp;
966
967 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
968 /*
969 * the buddy cache inode stores the block bitmap
970 * and buddy information in consecutive blocks.
971 * So for each group we need two blocks.
972 */
973 block = group * 2;
974 pnum = block / blocks_per_page;
975 first_group = pnum * blocks_per_page / 2;
976
977 groups_per_page = blocks_per_page >> 1;
978 if (groups_per_page == 0)
979 groups_per_page = 1;
980 /* read all groups the page covers into the cache */
981 for (i = 0; i < groups_per_page; i++) {
982
983 if ((first_group + i) >= ngroups)
984 break;
985 grp = ext4_get_group_info(sb, first_group + i);
986 /* take all groups write allocation
987 * semaphore. This makes sure there is
988 * no block allocation going on in any
989 * of those groups
990 */
991 down_write_nested(&grp->alloc_sem, i);
992 }
993 return i;
994}
995
996static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
997 ext4_group_t group, int locked_group)
998{
999 int i;
1000 int block, pnum;
1001 int blocks_per_page;
1002 ext4_group_t first_group;
1003 struct ext4_group_info *grp;
1004
1005 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1006 /*
1007 * the buddy cache inode stores the block bitmap
1008 * and buddy information in consecutive blocks.
1009 * So for each group we need two blocks.
1010 */
1011 block = group * 2;
1012 pnum = block / blocks_per_page;
1013 first_group = pnum * blocks_per_page / 2;
1014 /* release locks on all the groups */
1015 for (i = 0; i < locked_group; i++) {
1016
1017 grp = ext4_get_group_info(sb, first_group + i);
1018 /* release each group's write allocation
1019 * semaphore. This re-enables
1020 * block allocation in any
1021 * of those groups
1022 */
1023 up_write(&grp->alloc_sem);
1024 }
1025
1026}
1027
1028/*
942 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1029 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
943 * block group lock of all groups for this page; do not hold the BG lock when 1030 * block group lock of all groups for this page; do not hold the BG lock when
944 * calling this routine! 1031 * calling this routine!
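The arithmetic inside both helpers follows from the buddy-cache layout: the buddy inode stores the block bitmap and the buddy bitmap as two consecutive blocks per group, so converting a group to the page that holds it, and back to the first group on that page, is pure integer math. Extracted for clarity:

int blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
int block = group * 2;			/* two blocks per group */
int pnum = block / blocks_per_page;	/* page holding this group */
ext4_group_t first_group = pnum * blocks_per_page / 2;
int groups_per_page = blocks_per_page >> 1;

if (groups_per_page == 0)		/* block size == page size */
	groups_per_page = 1;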
@@ -1915,84 +2002,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1915 return 0; 2002 return 0;
1916} 2003}
1917 2004
1918/*
1919 * lock the group_info alloc_sem of all the groups
1920 * belonging to the same buddy cache page. This
1921 * makes sure other parallel operations on the buddy
1922 * cache don't happen while holding the buddy cache
1923 * lock
1924 */
1925int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1926{
1927 int i;
1928 int block, pnum;
1929 int blocks_per_page;
1930 int groups_per_page;
1931 ext4_group_t ngroups = ext4_get_groups_count(sb);
1932 ext4_group_t first_group;
1933 struct ext4_group_info *grp;
1934
1935 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1936 /*
1937 * the buddy cache inode stores the block bitmap
1938 * and buddy information in consecutive blocks.
1939 * So for each group we need two blocks.
1940 */
1941 block = group * 2;
1942 pnum = block / blocks_per_page;
1943 first_group = pnum * blocks_per_page / 2;
1944
1945 groups_per_page = blocks_per_page >> 1;
1946 if (groups_per_page == 0)
1947 groups_per_page = 1;
1948 /* read all groups the page covers into the cache */
1949 for (i = 0; i < groups_per_page; i++) {
1950
1951 if ((first_group + i) >= ngroups)
1952 break;
1953 grp = ext4_get_group_info(sb, first_group + i);
1954 /* take all groups write allocation
1955 * semaphore. This makes sure there is
1956 * no block allocation going on in any
1957 * of those groups
1958 */
1959 down_write_nested(&grp->alloc_sem, i);
1960 }
1961 return i;
1962}
1963
1964void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1965 ext4_group_t group, int locked_group)
1966{
1967 int i;
1968 int block, pnum;
1969 int blocks_per_page;
1970 ext4_group_t first_group;
1971 struct ext4_group_info *grp;
1972
1973 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1974 /*
1975 * the buddy cache inode stores the block bitmap
1976 * and buddy information in consecutive blocks.
1977 * So for each group we need two blocks.
1978 */
1979 block = group * 2;
1980 pnum = block / blocks_per_page;
1981 first_group = pnum * blocks_per_page / 2;
1982 /* release locks on all the groups */
1983 for (i = 0; i < locked_group; i++) {
1984
1985 grp = ext4_get_group_info(sb, first_group + i);
1986 /* release each group's write allocation
1987 * semaphore. This re-enables
1988 * block allocation in any
1989 * of those groups
1990 */
1991 up_write(&grp->alloc_sem);
1992 }
1993
1994}
1995
1996static noinline_for_stack int 2005static noinline_for_stack int
1997ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2006ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1998{ 2007{
@@ -2233,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2233 .release = seq_release, 2242 .release = seq_release,
2234}; 2243};
2235 2244
2245static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2246{
2247 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2248 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2249
2250 BUG_ON(!cachep);
2251 return cachep;
2252}
2236 2253
2237/* Create and initialize ext4_group_info data for the given group. */ 2254/* Create and initialize ext4_group_info data for the given group. */
2238int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2255int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2239 struct ext4_group_desc *desc) 2256 struct ext4_group_desc *desc)
2240{ 2257{
2241 int i, len; 2258 int i;
2242 int metalen = 0; 2259 int metalen = 0;
2243 struct ext4_sb_info *sbi = EXT4_SB(sb); 2260 struct ext4_sb_info *sbi = EXT4_SB(sb);
2244 struct ext4_group_info **meta_group_info; 2261 struct ext4_group_info **meta_group_info;
2262 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2245 2263
2246 /* 2264 /*
2247 * First check if this group is the first of a reserved block. 2265 * First check if this group is the first of a reserved block.
@@ -2261,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2261 meta_group_info; 2279 meta_group_info;
2262 } 2280 }
2263 2281
2264 /*
2265 * calculate needed size. if change bb_counters size,
2266 * don't forget about ext4_mb_generate_buddy()
2267 */
2268 len = offsetof(typeof(**meta_group_info),
2269 bb_counters[sb->s_blocksize_bits + 2]);
2270
2271 meta_group_info = 2282 meta_group_info =
2272 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2283 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2273 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2284 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2274 2285
2275 meta_group_info[i] = kzalloc(len, GFP_KERNEL); 2286 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2276 if (meta_group_info[i] == NULL) { 2287 if (meta_group_info[i] == NULL) {
2277 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2288 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2278 goto exit_group_info; 2289 goto exit_group_info;
2279 } 2290 }
2291 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
2280 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2292 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2281 &(meta_group_info[i]->bb_state)); 2293 &(meta_group_info[i]->bb_state));
2282 2294
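
The index math in the new get_groupinfo_cache() helper can be illustrated in isolation; a hedged userspace sketch, assuming EXT4_MIN_BLOCK_LOG_SIZE is 10 (1K blocks) and seven supported block sizes up to 64K (these constants are assumptions for the demo, not quoted from the ext4 headers):

#include <stdio.h>

#define EXT4_MIN_BLOCK_LOG_SIZE 10	/* assumed: log2(1024) */
#define NR_GRPINFO_CACHES 7		/* assumed: 1K..64K block sizes */

int main(void)
{
	int bits;

	/* One kmem_cache per supported block size; the cache index is
	 * just the log2 offset from the smallest block size. */
	for (bits = EXT4_MIN_BLOCK_LOG_SIZE;
	     bits < EXT4_MIN_BLOCK_LOG_SIZE + NR_GRPINFO_CACHES; bits++)
		printf("blocksize %6d -> cache index %d (ext4_groupinfo_%d)\n",
		       1 << bits, bits - EXT4_MIN_BLOCK_LOG_SIZE, bits);
	return 0;
}
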
@@ -2331,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2331 int num_meta_group_infos_max; 2343 int num_meta_group_infos_max;
2332 int array_size; 2344 int array_size;
2333 struct ext4_group_desc *desc; 2345 struct ext4_group_desc *desc;
2346 struct kmem_cache *cachep;
2334 2347
2335 /* This is the number of blocks used by GDT */ 2348 /* This is the number of blocks used by GDT */
2336 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2349 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2373,6 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2373 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2386 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2374 goto err_freesgi; 2387 goto err_freesgi;
2375 } 2388 }
2389 sbi->s_buddy_cache->i_ino = get_next_ino();
2376 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2390 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2377 for (i = 0; i < ngroups; i++) { 2391 for (i = 0; i < ngroups; i++) {
2378 desc = ext4_get_group_desc(sb, i, NULL); 2392 desc = ext4_get_group_desc(sb, i, NULL);
@@ -2388,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
2388 return 0; 2402 return 0;
2389 2403
2390err_freebuddy: 2404err_freebuddy:
2405 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2391 while (i-- > 0) 2406 while (i-- > 0)
2392 kfree(ext4_get_group_info(sb, i)); 2407 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2393 i = num_meta_group_infos; 2408 i = num_meta_group_infos;
2394 while (i-- > 0) 2409 while (i-- > 0)
2395 kfree(sbi->s_group_info[i]); 2410 kfree(sbi->s_group_info[i]);
@@ -2406,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2406 unsigned offset; 2421 unsigned offset;
2407 unsigned max; 2422 unsigned max;
2408 int ret; 2423 int ret;
2424 int cache_index;
2425 struct kmem_cache *cachep;
2426 char *namep = NULL;
2409 2427
2410 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2428 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2411 2429
2412 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2430 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2413 if (sbi->s_mb_offsets == NULL) { 2431 if (sbi->s_mb_offsets == NULL) {
2414 return -ENOMEM; 2432 ret = -ENOMEM;
2433 goto out;
2415 } 2434 }
2416 2435
2417 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2436 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2418 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2437 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2419 if (sbi->s_mb_maxs == NULL) { 2438 if (sbi->s_mb_maxs == NULL) {
2420 kfree(sbi->s_mb_offsets); 2439 ret = -ENOMEM;
2421 return -ENOMEM; 2440 goto out;
2441 }
2442
2443 cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2444 cachep = ext4_groupinfo_caches[cache_index];
2445 if (!cachep) {
2446 char name[32];
2447 int len = offsetof(struct ext4_group_info,
2448 bb_counters[sb->s_blocksize_bits + 2]);
2449
2450 sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
2451 namep = kstrdup(name, GFP_KERNEL);
2452 if (!namep) {
2453 ret = -ENOMEM;
2454 goto out;
2455 }
2456
 2457 /* Need to free the name returned by kmem_cache_name()
 2458 * when we destroy the slab */
2459 cachep = kmem_cache_create(namep, len, 0,
2460 SLAB_RECLAIM_ACCOUNT, NULL);
2461 if (!cachep) {
2462 ret = -ENOMEM;
2463 goto out;
2464 }
2465 ext4_groupinfo_caches[cache_index] = cachep;
2422 } 2466 }
2423 2467
2424 /* order 0 is regular bitmap */ 2468 /* order 0 is regular bitmap */
@@ -2439,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2439 /* init file for buddy data */ 2483 /* init file for buddy data */
2440 ret = ext4_mb_init_backend(sb); 2484 ret = ext4_mb_init_backend(sb);
2441 if (ret != 0) { 2485 if (ret != 0) {
2442 kfree(sbi->s_mb_offsets); 2486 goto out;
2443 kfree(sbi->s_mb_maxs);
2444 return ret;
2445 } 2487 }
2446 2488
2447 spin_lock_init(&sbi->s_md_lock); 2489 spin_lock_init(&sbi->s_md_lock);
@@ -2456,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2456 2498
2457 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2499 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2458 if (sbi->s_locality_groups == NULL) { 2500 if (sbi->s_locality_groups == NULL) {
2459 kfree(sbi->s_mb_offsets); 2501 ret = -ENOMEM;
2460 kfree(sbi->s_mb_maxs); 2502 goto out;
2461 return -ENOMEM;
2462 } 2503 }
2463 for_each_possible_cpu(i) { 2504 for_each_possible_cpu(i) {
2464 struct ext4_locality_group *lg; 2505 struct ext4_locality_group *lg;
@@ -2475,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2475 2516
2476 if (sbi->s_journal) 2517 if (sbi->s_journal)
2477 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2518 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2478 return 0; 2519out:
2520 if (ret) {
2521 kfree(sbi->s_mb_offsets);
2522 kfree(sbi->s_mb_maxs);
2523 kfree(namep);
2524 }
2525 return ret;
2479} 2526}
2480 2527
 2481/* needs to be called with the ext4 group lock held */ 2528/* needs to be called with the ext4 group lock held */
@@ -2503,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb)
2503 int num_meta_group_infos; 2550 int num_meta_group_infos;
2504 struct ext4_group_info *grinfo; 2551 struct ext4_group_info *grinfo;
2505 struct ext4_sb_info *sbi = EXT4_SB(sb); 2552 struct ext4_sb_info *sbi = EXT4_SB(sb);
2553 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2506 2554
2507 if (sbi->s_group_info) { 2555 if (sbi->s_group_info) {
2508 for (i = 0; i < ngroups; i++) { 2556 for (i = 0; i < ngroups; i++) {
@@ -2513,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb)
2513 ext4_lock_group(sb, i); 2561 ext4_lock_group(sb, i);
2514 ext4_mb_cleanup_pa(grinfo); 2562 ext4_mb_cleanup_pa(grinfo);
2515 ext4_unlock_group(sb, i); 2563 ext4_unlock_group(sb, i);
2516 kfree(grinfo); 2564 kmem_cache_free(cachep, grinfo);
2517 } 2565 }
2518 num_meta_group_infos = (ngroups + 2566 num_meta_group_infos = (ngroups +
2519 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2567 EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2557,20 +2605,15 @@ int ext4_mb_release(struct super_block *sb)
2557 return 0; 2605 return 0;
2558} 2606}
2559 2607
2560static inline void ext4_issue_discard(struct super_block *sb, 2608static inline int ext4_issue_discard(struct super_block *sb,
2561 ext4_group_t block_group, ext4_grpblk_t block, int count) 2609 ext4_group_t block_group, ext4_grpblk_t block, int count)
2562{ 2610{
2563 int ret;
2564 ext4_fsblk_t discard_block; 2611 ext4_fsblk_t discard_block;
2565 2612
2566 discard_block = block + ext4_group_first_block_no(sb, block_group); 2613 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb, 2614 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count); 2615 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count); 2616 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2570 if (ret == EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2573 }
2574} 2617}
2575 2618
2576/* 2619/*
@@ -2582,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2582 struct super_block *sb = journal->j_private; 2625 struct super_block *sb = journal->j_private;
2583 struct ext4_buddy e4b; 2626 struct ext4_buddy e4b;
2584 struct ext4_group_info *db; 2627 struct ext4_group_info *db;
2585 int err, count = 0, count2 = 0; 2628 int err, ret, count = 0, count2 = 0;
2586 struct ext4_free_data *entry; 2629 struct ext4_free_data *entry;
2587 struct list_head *l, *ltmp; 2630 struct list_head *l, *ltmp;
2588 2631
@@ -2592,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2592 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2635 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2593 entry->count, entry->group, entry); 2636 entry->count, entry->group, entry);
2594 2637
2595 if (test_opt(sb, DISCARD)) 2638 if (test_opt(sb, DISCARD)) {
2596 ext4_issue_discard(sb, entry->group, 2639 ret = ext4_issue_discard(sb, entry->group,
2597 entry->start_blk, entry->count); 2640 entry->start_blk, entry->count);
2641 if (unlikely(ret == -EOPNOTSUPP)) {
2642 ext4_warning(sb, "discard not supported, "
2643 "disabling");
2644 clear_opt(sb, DISCARD);
2645 }
2646 }
2598 2647
2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2648 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2600 /* we expect to find existing buddy because it's pinned */ 2649 /* we expect to find existing buddy because it's pinned */
@@ -2658,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void)
2658 2707
2659#endif 2708#endif
2660 2709
2661int __init init_ext4_mballoc(void) 2710int __init ext4_init_mballoc(void)
2662{ 2711{
2663 ext4_pspace_cachep = 2712 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2664 kmem_cache_create("ext4_prealloc_space", 2713 SLAB_RECLAIM_ACCOUNT);
2665 sizeof(struct ext4_prealloc_space),
2666 0, SLAB_RECLAIM_ACCOUNT, NULL);
2667 if (ext4_pspace_cachep == NULL) 2714 if (ext4_pspace_cachep == NULL)
2668 return -ENOMEM; 2715 return -ENOMEM;
2669 2716
2670 ext4_ac_cachep = 2717 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2671 kmem_cache_create("ext4_alloc_context", 2718 SLAB_RECLAIM_ACCOUNT);
2672 sizeof(struct ext4_allocation_context),
2673 0, SLAB_RECLAIM_ACCOUNT, NULL);
2674 if (ext4_ac_cachep == NULL) { 2719 if (ext4_ac_cachep == NULL) {
2675 kmem_cache_destroy(ext4_pspace_cachep); 2720 kmem_cache_destroy(ext4_pspace_cachep);
2676 return -ENOMEM; 2721 return -ENOMEM;
2677 } 2722 }
2678 2723
2679 ext4_free_ext_cachep = 2724 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
2680 kmem_cache_create("ext4_free_block_extents", 2725 SLAB_RECLAIM_ACCOUNT);
2681 sizeof(struct ext4_free_data),
2682 0, SLAB_RECLAIM_ACCOUNT, NULL);
2683 if (ext4_free_ext_cachep == NULL) { 2726 if (ext4_free_ext_cachep == NULL) {
2684 kmem_cache_destroy(ext4_pspace_cachep); 2727 kmem_cache_destroy(ext4_pspace_cachep);
2685 kmem_cache_destroy(ext4_ac_cachep); 2728 kmem_cache_destroy(ext4_ac_cachep);
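
The conversion above leans on the generic KMEM_CACHE() helper, which derives the cache name, object size and alignment from the struct type itself. A userspace lookalike of that idea (a sketch with a stand-in struct, not the kernel macro):

#include <stdio.h>

/* Userspace lookalike of the KMEM_CACHE() idea: derive the cache name,
 * object size and alignment from the struct type. */
#define KMEM_CACHE_DEMO(type)						\
	printf("cache \"%s\": size %zu, align %zu\n", #type,		\
	       sizeof(struct type), (size_t) __alignof__(struct type))

/* Stand-in struct; the real ext4 structures live in mballoc.h. */
struct prealloc_space_demo {
	unsigned long pa_pstart;
	unsigned short pa_len;
	unsigned char pa_type;
};

int main(void)
{
	KMEM_CACHE_DEMO(prealloc_space_demo);
	return 0;
}
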
@@ -2689,8 +2732,9 @@ int __init init_ext4_mballoc(void)
2689 return 0; 2732 return 0;
2690} 2733}
2691 2734
2692void exit_ext4_mballoc(void) 2735void ext4_exit_mballoc(void)
2693{ 2736{
2737 int i;
2694 /* 2738 /*
2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2739 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2696 * before destroying the slab cache. 2740 * before destroying the slab cache.
@@ -2699,6 +2743,15 @@ void exit_ext4_mballoc(void)
2699 kmem_cache_destroy(ext4_pspace_cachep); 2743 kmem_cache_destroy(ext4_pspace_cachep);
2700 kmem_cache_destroy(ext4_ac_cachep); 2744 kmem_cache_destroy(ext4_ac_cachep);
2701 kmem_cache_destroy(ext4_free_ext_cachep); 2745 kmem_cache_destroy(ext4_free_ext_cachep);
2746
2747 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2748 struct kmem_cache *cachep = ext4_groupinfo_caches[i];
2749 if (cachep) {
2750 char *name = (char *)kmem_cache_name(cachep);
2751 kmem_cache_destroy(cachep);
2752 kfree(name);
2753 }
2754 }
2702 ext4_remove_debugfs_entry(); 2755 ext4_remove_debugfs_entry();
2703} 2756}
2704 2757
@@ -3535,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3535 */ 3588 */
3536static noinline_for_stack int 3589static noinline_for_stack int
3537ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3590ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3538 struct ext4_prealloc_space *pa, 3591 struct ext4_prealloc_space *pa)
3539 struct ext4_allocation_context *ac)
3540{ 3592{
3541 struct super_block *sb = e4b->bd_sb; 3593 struct super_block *sb = e4b->bd_sb;
3542 struct ext4_sb_info *sbi = EXT4_SB(sb); 3594 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3554,11 +3606,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3554 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3606 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3555 end = bit + pa->pa_len; 3607 end = bit + pa->pa_len;
3556 3608
3557 if (ac) {
3558 ac->ac_sb = sb;
3559 ac->ac_inode = pa->pa_inode;
3560 }
3561
3562 while (bit < end) { 3609 while (bit < end) {
3563 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3610 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3564 if (bit >= end) 3611 if (bit >= end)
@@ -3569,16 +3616,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3569 (unsigned) next - bit, (unsigned) group); 3616 (unsigned) next - bit, (unsigned) group);
3570 free += next - bit; 3617 free += next - bit;
3571 3618
3572 if (ac) { 3619 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3573 ac->ac_b_ex.fe_group = group; 3620 trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
3574 ac->ac_b_ex.fe_start = bit; 3621 grp_blk_start + bit, next - bit);
3575 ac->ac_b_ex.fe_len = next - bit;
3576 ac->ac_b_ex.fe_logical = 0;
3577 trace_ext4_mballoc_discard(ac);
3578 }
3579
3580 trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
3581 next - bit);
3582 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3622 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3583 bit = next + 1; 3623 bit = next + 1;
3584 } 3624 }
@@ -3601,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3601 3641
3602static noinline_for_stack int 3642static noinline_for_stack int
3603ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3643ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3604 struct ext4_prealloc_space *pa, 3644 struct ext4_prealloc_space *pa)
3605 struct ext4_allocation_context *ac)
3606{ 3645{
3607 struct super_block *sb = e4b->bd_sb; 3646 struct super_block *sb = e4b->bd_sb;
3608 ext4_group_t group; 3647 ext4_group_t group;
3609 ext4_grpblk_t bit; 3648 ext4_grpblk_t bit;
3610 3649
3611 trace_ext4_mb_release_group_pa(sb, ac, pa); 3650 trace_ext4_mb_release_group_pa(sb, pa);
3612 BUG_ON(pa->pa_deleted == 0); 3651 BUG_ON(pa->pa_deleted == 0);
3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3652 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3653 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3615 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3654 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3616 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3655 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3617 3656 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3618 if (ac) {
3619 ac->ac_sb = sb;
3620 ac->ac_inode = NULL;
3621 ac->ac_b_ex.fe_group = group;
3622 ac->ac_b_ex.fe_start = bit;
3623 ac->ac_b_ex.fe_len = pa->pa_len;
3624 ac->ac_b_ex.fe_logical = 0;
3625 trace_ext4_mballoc_discard(ac);
3626 }
3627 3657
3628 return 0; 3658 return 0;
3629} 3659}
@@ -3644,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3644 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3674 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3645 struct buffer_head *bitmap_bh = NULL; 3675 struct buffer_head *bitmap_bh = NULL;
3646 struct ext4_prealloc_space *pa, *tmp; 3676 struct ext4_prealloc_space *pa, *tmp;
3647 struct ext4_allocation_context *ac;
3648 struct list_head list; 3677 struct list_head list;
3649 struct ext4_buddy e4b; 3678 struct ext4_buddy e4b;
3650 int err; 3679 int err;
@@ -3673,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3673 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3702 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3674 3703
3675 INIT_LIST_HEAD(&list); 3704 INIT_LIST_HEAD(&list);
3676 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3677 if (ac)
3678 ac->ac_sb = sb;
3679repeat: 3705repeat:
3680 ext4_lock_group(sb, group); 3706 ext4_lock_group(sb, group);
3681 list_for_each_entry_safe(pa, tmp, 3707 list_for_each_entry_safe(pa, tmp,
@@ -3730,9 +3756,9 @@ repeat:
3730 spin_unlock(pa->pa_obj_lock); 3756 spin_unlock(pa->pa_obj_lock);
3731 3757
3732 if (pa->pa_type == MB_GROUP_PA) 3758 if (pa->pa_type == MB_GROUP_PA)
3733 ext4_mb_release_group_pa(&e4b, pa, ac); 3759 ext4_mb_release_group_pa(&e4b, pa);
3734 else 3760 else
3735 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3761 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3736 3762
3737 list_del(&pa->u.pa_tmp_list); 3763 list_del(&pa->u.pa_tmp_list);
3738 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3764 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3740,8 +3766,6 @@ repeat:
3740 3766
3741out: 3767out:
3742 ext4_unlock_group(sb, group); 3768 ext4_unlock_group(sb, group);
3743 if (ac)
3744 kmem_cache_free(ext4_ac_cachep, ac);
3745 ext4_mb_unload_buddy(&e4b); 3769 ext4_mb_unload_buddy(&e4b);
3746 put_bh(bitmap_bh); 3770 put_bh(bitmap_bh);
3747 return free; 3771 return free;
@@ -3762,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode)
3762 struct super_block *sb = inode->i_sb; 3786 struct super_block *sb = inode->i_sb;
3763 struct buffer_head *bitmap_bh = NULL; 3787 struct buffer_head *bitmap_bh = NULL;
3764 struct ext4_prealloc_space *pa, *tmp; 3788 struct ext4_prealloc_space *pa, *tmp;
3765 struct ext4_allocation_context *ac;
3766 ext4_group_t group = 0; 3789 ext4_group_t group = 0;
3767 struct list_head list; 3790 struct list_head list;
3768 struct ext4_buddy e4b; 3791 struct ext4_buddy e4b;
@@ -3778,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode)
3778 3801
3779 INIT_LIST_HEAD(&list); 3802 INIT_LIST_HEAD(&list);
3780 3803
3781 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3782 if (ac) {
3783 ac->ac_sb = sb;
3784 ac->ac_inode = inode;
3785 }
3786repeat: 3804repeat:
3787 /* first, collect all pa's in the inode */ 3805 /* first, collect all pa's in the inode */
3788 spin_lock(&ei->i_prealloc_lock); 3806 spin_lock(&ei->i_prealloc_lock);
@@ -3852,7 +3870,7 @@ repeat:
3852 3870
3853 ext4_lock_group(sb, group); 3871 ext4_lock_group(sb, group);
3854 list_del(&pa->pa_group_list); 3872 list_del(&pa->pa_group_list);
3855 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3873 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3856 ext4_unlock_group(sb, group); 3874 ext4_unlock_group(sb, group);
3857 3875
3858 ext4_mb_unload_buddy(&e4b); 3876 ext4_mb_unload_buddy(&e4b);
@@ -3861,23 +3879,8 @@ repeat:
3861 list_del(&pa->u.pa_tmp_list); 3879 list_del(&pa->u.pa_tmp_list);
3862 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3880 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3863 } 3881 }
3864 if (ac)
3865 kmem_cache_free(ext4_ac_cachep, ac);
3866} 3882}
3867 3883
3868/*
 3869 * finds all preallocated spaces and returns blocks being freed to them;
 3870 * if a preallocated space becomes full (no block is used from the space)
 3871 * then the function frees the space in the buddy
3872 * XXX: at the moment, truncate (which is the only way to free blocks)
3873 * discards all preallocations
3874 */
3875static void ext4_mb_return_to_preallocation(struct inode *inode,
3876 struct ext4_buddy *e4b,
3877 sector_t block, int count)
3878{
3879 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
3880}
3881#ifdef CONFIG_EXT4_DEBUG 3884#ifdef CONFIG_EXT4_DEBUG
3882static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3885static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3883{ 3886{
@@ -4060,14 +4063,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4060 struct ext4_buddy e4b; 4063 struct ext4_buddy e4b;
4061 struct list_head discard_list; 4064 struct list_head discard_list;
4062 struct ext4_prealloc_space *pa, *tmp; 4065 struct ext4_prealloc_space *pa, *tmp;
4063 struct ext4_allocation_context *ac;
4064 4066
4065 mb_debug(1, "discard locality group preallocation\n"); 4067 mb_debug(1, "discard locality group preallocation\n");
4066 4068
4067 INIT_LIST_HEAD(&discard_list); 4069 INIT_LIST_HEAD(&discard_list);
4068 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4069 if (ac)
4070 ac->ac_sb = sb;
4071 4070
4072 spin_lock(&lg->lg_prealloc_lock); 4071 spin_lock(&lg->lg_prealloc_lock);
4073 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4072 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4119,15 +4118,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4119 } 4118 }
4120 ext4_lock_group(sb, group); 4119 ext4_lock_group(sb, group);
4121 list_del(&pa->pa_group_list); 4120 list_del(&pa->pa_group_list);
4122 ext4_mb_release_group_pa(&e4b, pa, ac); 4121 ext4_mb_release_group_pa(&e4b, pa);
4123 ext4_unlock_group(sb, group); 4122 ext4_unlock_group(sb, group);
4124 4123
4125 ext4_mb_unload_buddy(&e4b); 4124 ext4_mb_unload_buddy(&e4b);
4126 list_del(&pa->u.pa_tmp_list); 4125 list_del(&pa->u.pa_tmp_list);
4127 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4126 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4128 } 4127 }
4129 if (ac)
4130 kmem_cache_free(ext4_ac_cachep, ac);
4131} 4128}
4132 4129
4133/* 4130/*
@@ -4273,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4273 * EDQUOT check, as blocks and quotas have been already 4270 * EDQUOT check, as blocks and quotas have been already
4274 * reserved when data being copied into pagecache. 4271 * reserved when data being copied into pagecache.
4275 */ 4272 */
4276 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4273 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4277 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4274 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4278 else { 4275 else {
4279 /* Without delayed allocation we need to verify 4276 /* Without delayed allocation we need to verify
@@ -4370,7 +4367,8 @@ out:
4370 if (inquota && ar->len < inquota) 4367 if (inquota && ar->len < inquota)
4371 dquot_free_block(ar->inode, inquota - ar->len); 4368 dquot_free_block(ar->inode, inquota - ar->len);
4372 if (!ar->len) { 4369 if (!ar->len) {
4373 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4370 if (!ext4_test_inode_state(ar->inode,
4371 EXT4_STATE_DELALLOC_RESERVED))
4374 /* release all the reserved blocks if non delalloc */ 4372 /* release all the reserved blocks if non delalloc */
4375 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4373 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4376 reserv_blks); 4374 reserv_blks);
@@ -4491,7 +4489,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4491{ 4489{
4492 struct buffer_head *bitmap_bh = NULL; 4490 struct buffer_head *bitmap_bh = NULL;
4493 struct super_block *sb = inode->i_sb; 4491 struct super_block *sb = inode->i_sb;
4494 struct ext4_allocation_context *ac = NULL;
4495 struct ext4_group_desc *gdp; 4492 struct ext4_group_desc *gdp;
4496 unsigned long freed = 0; 4493 unsigned long freed = 0;
4497 unsigned int overflow; 4494 unsigned int overflow;
@@ -4531,6 +4528,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4531 if (!bh) 4528 if (!bh)
4532 tbh = sb_find_get_block(inode->i_sb, 4529 tbh = sb_find_get_block(inode->i_sb,
4533 block + i); 4530 block + i);
4531 if (unlikely(!tbh))
4532 continue;
4534 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4533 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4535 inode, tbh, block + i); 4534 inode, tbh, block + i);
4536 } 4535 }
@@ -4546,12 +4545,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4546 if (!ext4_should_writeback_data(inode)) 4545 if (!ext4_should_writeback_data(inode))
4547 flags |= EXT4_FREE_BLOCKS_METADATA; 4546 flags |= EXT4_FREE_BLOCKS_METADATA;
4548 4547
4549 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4550 if (ac) {
4551 ac->ac_inode = inode;
4552 ac->ac_sb = sb;
4553 }
4554
4555do_more: 4548do_more:
4556 overflow = 0; 4549 overflow = 0;
4557 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4550 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4609,12 +4602,7 @@ do_more:
4609 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4602 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4610 } 4603 }
4611#endif 4604#endif
4612 if (ac) { 4605 trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
4613 ac->ac_b_ex.fe_group = block_group;
4614 ac->ac_b_ex.fe_start = bit;
4615 ac->ac_b_ex.fe_len = count;
4616 trace_ext4_mballoc_free(ac);
4617 }
4618 4606
4619 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4607 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4620 if (err) 4608 if (err)
@@ -4626,7 +4614,11 @@ do_more:
4626 * blocks being freed are metadata. these blocks shouldn't 4614 * blocks being freed are metadata. these blocks shouldn't
4627 * be used until this transaction is committed 4615 * be used until this transaction is committed
4628 */ 4616 */
4629 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4617 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4618 if (!new_entry) {
4619 err = -ENOMEM;
4620 goto error_return;
4621 }
4630 new_entry->start_blk = bit; 4622 new_entry->start_blk = bit;
4631 new_entry->group = block_group; 4623 new_entry->group = block_group;
4632 new_entry->count = count; 4624 new_entry->count = count;
@@ -4643,9 +4635,6 @@ do_more:
4643 ext4_lock_group(sb, block_group); 4635 ext4_lock_group(sb, block_group);
4644 mb_clear_bits(bitmap_bh->b_data, bit, count); 4636 mb_clear_bits(bitmap_bh->b_data, bit, count);
4645 mb_free_blocks(inode, &e4b, bit, count); 4637 mb_free_blocks(inode, &e4b, bit, count);
4646 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4647 if (test_opt(sb, DISCARD))
4648 ext4_issue_discard(sb, block_group, bit, count);
4649 } 4638 }
4650 4639
4651 ret = ext4_free_blks_count(sb, gdp) + count; 4640 ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4685,7 +4674,194 @@ error_return:
4685 dquot_free_block(inode, freed); 4674 dquot_free_block(inode, freed);
4686 brelse(bitmap_bh); 4675 brelse(bitmap_bh);
4687 ext4_std_error(sb, err); 4676 ext4_std_error(sb, err);
4688 if (ac)
4689 kmem_cache_free(ext4_ac_cachep, ac);
4690 return; 4677 return;
4691} 4678}
4679
4680/**
4681 * ext4_trim_extent -- function to TRIM one single free extent in the group
4682 * @sb: super block for the file system
4683 * @start: starting block of the free extent in the alloc. group
4684 * @count: number of blocks to TRIM
4685 * @group: alloc. group we are working with
4686 * @e4b: ext4 buddy for the group
4687 *
 4688 * Trim "count" blocks starting at "start" in the "group". To ensure that no
 4689 * one will allocate those blocks, mark them as used in the buddy bitmap. This
 4690 * must be called under the group lock.
4691 */
4692static int ext4_trim_extent(struct super_block *sb, int start, int count,
4693 ext4_group_t group, struct ext4_buddy *e4b)
4694{
4695 struct ext4_free_extent ex;
4696 int ret = 0;
4697
4698 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4699
4700 ex.fe_start = start;
4701 ex.fe_group = group;
4702 ex.fe_len = count;
4703
4704 /*
4705 * Mark blocks used, so no one can reuse them while
4706 * being trimmed.
4707 */
4708 mb_mark_used(e4b, &ex);
4709 ext4_unlock_group(sb, group);
4710
4711 ret = ext4_issue_discard(sb, group, start, count);
4712
4713 ext4_lock_group(sb, group);
4714 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4715 return ret;
4716}
4717
4718/**
4719 * ext4_trim_all_free -- function to trim all free space in alloc. group
4720 * @sb: super block for file system
4721 * @e4b: ext4 buddy
4722 * @start: first group block to examine
4723 * @max: last group block to examine
4724 * @minblocks: minimum extent block count
4725 *
 4726 * ext4_trim_all_free walks through the group's buddy bitmap searching
 4727 * for free extents of at least minblocks blocks.
 4728 *
 4729 * When such an extent is found, it is first marked as used in the group
 4730 * buddy bitmap so that it cannot be reallocated while being trimmed;
 4731 * ext4_trim_extent is then called to issue a TRIM command on the
 4732 * extent, after which the extent is freed again in the buddy bitmap.
 4733 * This is repeated until the whole group has been scanned.
 4734 *
4735 */
4736ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4737 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4738{
4739 void *bitmap;
4740 ext4_grpblk_t next, count = 0;
4741 ext4_group_t group;
4742 int ret = 0;
4743
4744 BUG_ON(e4b == NULL);
4745
4746 bitmap = e4b->bd_bitmap;
4747 group = e4b->bd_group;
4748 start = (e4b->bd_info->bb_first_free > start) ?
4749 e4b->bd_info->bb_first_free : start;
4750 ext4_lock_group(sb, group);
4751
4752 while (start < max) {
4753 start = mb_find_next_zero_bit(bitmap, max, start);
4754 if (start >= max)
4755 break;
4756 next = mb_find_next_bit(bitmap, max, start);
4757
4758 if ((next - start) >= minblocks) {
4759 ret = ext4_trim_extent(sb, start,
4760 next - start, group, e4b);
4761 if (ret < 0)
4762 break;
4763 count += next - start;
4764 }
4765 start = next + 1;
4766
4767 if (fatal_signal_pending(current)) {
4768 count = -ERESTARTSYS;
4769 break;
4770 }
4771
4772 if (need_resched()) {
4773 ext4_unlock_group(sb, group);
4774 cond_resched();
4775 ext4_lock_group(sb, group);
4776 }
4777
4778 if ((e4b->bd_info->bb_free - count) < minblocks)
4779 break;
4780 }
4781 ext4_unlock_group(sb, group);
4782
4783 ext4_debug("trimmed %d blocks in the group %d\n",
4784 count, group);
4785
4786 if (ret < 0)
4787 count = ret;
4788
4789 return count;
4790}
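
The scan loop above alternates between finding the next zero bit (start of a free run) and the next set bit (end of the run). A self-contained userspace sketch of the same pattern on a toy bitmap (the helpers below are simplified stand-ins for the mballoc bit-search primitives):

#include <stdio.h>

static int find_next_zero_bit(unsigned map, int max, int start)
{
	while (start < max && (map >> start & 1))
		start++;
	return start;
}

static int find_next_bit(unsigned map, int max, int start)
{
	while (start < max && !(map >> start & 1))
		start++;
	return start;
}

int main(void)
{
	unsigned map = 0x0f0f;	/* set bits = used blocks */
	int max = 16, minblocks = 3, start = 0;

	while (start < max) {
		start = find_next_zero_bit(map, max, start);
		if (start >= max)
			break;
		int next = find_next_bit(map, max, start);
		if (next - start >= minblocks)
			printf("trim extent [%d, %d)\n", start, next);
		start = next + 1;
	}
	return 0;
}
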
4791
4792/**
 4793 * ext4_trim_fs() -- trim ioctl handler function
4794 * @sb: superblock for filesystem
4795 * @range: fstrim_range structure
4796 *
 4797 * start: first byte to trim
 4798 * len: number of bytes to trim from start
 4799 * minlen: minimum extent length in bytes
 4800 * ext4_trim_fs goes through all allocation groups containing bytes from
 4801 * start to start+len. For each such group, ext4_trim_all_free is
 4802 * invoked to trim all free space.
4803 */
4804int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4805{
4806 struct ext4_buddy e4b;
4807 ext4_group_t first_group, last_group;
4808 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4809 ext4_grpblk_t cnt = 0, first_block, last_block;
4810 uint64_t start, len, minlen, trimmed;
4811 ext4_fsblk_t first_data_blk =
4812 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4813 int ret = 0;
4814
4815 start = range->start >> sb->s_blocksize_bits;
4816 len = range->len >> sb->s_blocksize_bits;
4817 minlen = range->minlen >> sb->s_blocksize_bits;
4818 trimmed = 0;
4819
4820 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4821 return -EINVAL;
4822 if (start < first_data_blk) {
4823 len -= first_data_blk - start;
4824 start = first_data_blk;
4825 }
4826
4827 /* Determine first and last group to examine based on start and len */
4828 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4829 &first_group, &first_block);
4830 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4831 &last_group, &last_block);
4832 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4833 last_block = EXT4_BLOCKS_PER_GROUP(sb);
4834
4835 if (first_group > last_group)
4836 return -EINVAL;
4837
4838 for (group = first_group; group <= last_group; group++) {
4839 ret = ext4_mb_load_buddy(sb, group, &e4b);
4840 if (ret) {
4841 ext4_error(sb, "Error in loading buddy "
4842 "information for %u", group);
4843 break;
4844 }
4845
4846 if (len >= EXT4_BLOCKS_PER_GROUP(sb))
4847 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
4848 else
4849 last_block = first_block + len;
4850
4851 if (e4b.bd_info->bb_free >= minlen) {
4852 cnt = ext4_trim_all_free(sb, &e4b, first_block,
4853 last_block, minlen);
4854 if (cnt < 0) {
4855 ret = cnt;
4856 ext4_mb_unload_buddy(&e4b);
4857 break;
4858 }
4859 }
4860 ext4_mb_unload_buddy(&e4b);
4861 trimmed += cnt;
4862 first_block = 0;
4863 }
4864 range->len = trimmed * sb->s_blocksize;
4865
4866 return ret;
4867}
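
From userspace this handler is reached through the FITRIM ioctl, which passes the same fstrim_range structure. A minimal caller, assuming a kernel and filesystem with FITRIM wired up (error handling trimmed for brevity):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd = open(argc > 1 ? argv[1] : "/mnt", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = UINT64_MAX;		/* trim the whole filesystem */
	range.minlen = 4096;		/* skip extents shorter than 4 KiB */
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	/* on return, range.len holds the number of bytes trimmed */
	printf("trimmed %llu bytes\n", (unsigned long long) range.len);
	return 0;
}
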
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 1765c2c50a9b..b0a126f23c20 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
412 struct buffer_head *bh; 412 struct buffer_head *bh;
413 struct ext4_extent_header *eh; 413 struct ext4_extent_header *eh;
414 414
415 block = idx_pblock(ix); 415 block = ext4_idx_pblock(ix);
416 bh = sb_bread(inode->i_sb, block); 416 bh = sb_bread(inode->i_sb, block);
417 if (!bh) 417 if (!bh)
418 return -EIO; 418 return -EIO;
@@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode)
496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * 496 goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; 497 EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 498 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
499 S_IFREG, 0, goal); 499 S_IFREG, NULL, goal);
500 if (IS_ERR(tmp_inode)) { 500 if (IS_ERR(tmp_inode)) {
501 retval = -ENOMEM; 501 retval = -ENOMEM;
502 ext4_journal_stop(handle); 502 ext4_journal_stop(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5f1ed9fc913c..b9f3e7862f13 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 85 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
86 /* leaf block */ 86 /* leaf block */
87 *extent = ++path[ppos].p_ext; 87 *extent = ++path[ppos].p_ext;
88 path[ppos].p_block = ext_pblock(path[ppos].p_ext); 88 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
89 return 0; 89 return 0;
90 } 90 }
91 91
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
96 96
97 /* index block */ 97 /* index block */
98 path[ppos].p_idx++; 98 path[ppos].p_idx++;
99 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 99 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
100 if (path[ppos+1].p_bh) 100 if (path[ppos+1].p_bh)
101 brelse(path[ppos+1].p_bh); 101 brelse(path[ppos+1].p_bh);
102 path[ppos+1].p_bh = 102 path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
111 path[cur_ppos].p_idx = 111 path[cur_ppos].p_idx =
112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr); 112 EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
113 path[cur_ppos].p_block = 113 path[cur_ppos].p_block =
114 idx_pblock(path[cur_ppos].p_idx); 114 ext4_idx_pblock(path[cur_ppos].p_idx);
115 if (path[cur_ppos+1].p_bh) 115 if (path[cur_ppos+1].p_bh)
116 brelse(path[cur_ppos+1].p_bh); 116 brelse(path[cur_ppos+1].p_bh);
117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, 117 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
133 path[leaf_ppos].p_ext = *extent = 133 path[leaf_ppos].p_ext = *extent =
134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 134 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
135 path[leaf_ppos].p_block = 135 path[leaf_ppos].p_block =
136 ext_pblock(path[leaf_ppos].p_ext); 136 ext4_ext_pblock(path[leaf_ppos].p_ext);
137 return 0; 137 return 0;
138 } 138 }
139 } 139 }
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
249 */ 249 */
250 o_end->ee_block = end_ext->ee_block; 250 o_end->ee_block = end_ext->ee_block;
251 o_end->ee_len = end_ext->ee_len; 251 o_end->ee_len = end_ext->ee_len;
252 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 252 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
253 } 253 }
254 254
255 o_start->ee_len = start_ext->ee_len; 255 o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
276 */ 276 */
277 o_end->ee_block = end_ext->ee_block; 277 o_end->ee_block = end_ext->ee_block;
278 o_end->ee_len = end_ext->ee_len; 278 o_end->ee_len = end_ext->ee_len;
279 ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); 279 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
280 280
281 /* 281 /*
282 * Set 0 to the extent block if new_ext was 282 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
361 /* Insert new entry */ 361 /* Insert new entry */
362 if (new_ext->ee_len) { 362 if (new_ext->ee_len) {
363 o_start[i] = *new_ext; 363 o_start[i] = *new_ext;
364 ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); 364 ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
365 } 365 }
366 366
367 /* Insert end entry */ 367 /* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
488 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
489 489
490 new_ext.ee_block = cpu_to_le32(*from); 490 new_ext.ee_block = cpu_to_le32(*from);
491 ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); 491 ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
492 new_ext.ee_len = dext->ee_len; 492 new_ext.ee_len = dext->ee_len;
493 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 493 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 494 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
553 copy_extent_status(oext, &end_ext); 553 copy_extent_status(oext, &end_ext);
554 end_ext_alen = ext4_ext_get_actual_len(&end_ext); 554 end_ext_alen = ext4_ext_get_actual_len(&end_ext);
555 ext4_ext_store_pblock(&end_ext, 555 ext4_ext_store_pblock(&end_ext,
556 (ext_pblock(o_end) + oext_alen - end_ext_alen)); 556 (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
557 end_ext.ee_block = 557 end_ext.ee_block =
558 cpu_to_le32(le32_to_cpu(o_end->ee_block) + 558 cpu_to_le32(le32_to_cpu(o_end->ee_block) +
559 oext_alen - end_ext_alen); 559 oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
604 /* When tmp_dext is too large, pick up the target range. */ 604 /* When tmp_dext is too large, pick up the target range. */
605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 605 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
606 606
607 ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); 607 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
608 tmp_dext->ee_block = 608 tmp_dext->ee_block =
609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); 609 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); 610 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
613 tmp_dext->ee_len = cpu_to_le16(max_count); 613 tmp_dext->ee_len = cpu_to_le16(max_count);
614 614
615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); 615 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
616 ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); 616 ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
617 617
618 /* Adjust extent length if donor extent is larger than orig */ 618 /* Adjust extent length if donor extent is larger than orig */
619 if (ext4_ext_get_actual_len(tmp_dext) > 619 if (ext4_ext_get_actual_len(tmp_dext) >
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..5485390d32c5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
581 dir->i_sb->s_blocksize - 581 dir->i_sb->s_blocksize -
582 EXT4_DIR_REC_LEN(0)); 582 EXT4_DIR_REC_LEN(0));
583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
584 if (!ext4_check_dir_entry(dir, de, bh, 584 if (ext4_check_dir_entry(dir, NULL, de, bh,
585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
586 +((char *)de - bh->b_data))) { 586 + ((char *)de - bh->b_data))) {
587 /* On error, skip the f_pos to the next block. */ 587 /* On error, skip the f_pos to the next block. */
588 dir_file->f_pos = (dir_file->f_pos | 588 dir_file->f_pos = (dir_file->f_pos |
589 (dir->i_sb->s_blocksize - 1)) + 1; 589 (dir->i_sb->s_blocksize - 1)) + 1;
@@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
820 if ((char *) de + namelen <= dlimit && 820 if ((char *) de + namelen <= dlimit &&
821 ext4_match (namelen, name, de)) { 821 ext4_match (namelen, name, de)) {
822 /* found a match - just to be sure, do a full check */ 822 /* found a match - just to be sure, do a full check */
823 if (!ext4_check_dir_entry(dir, de, bh, offset)) 823 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
824 return -1; 824 return -1;
825 *res_dir = de; 825 *res_dir = de;
826 return 1; 826 return 1;
@@ -856,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
856 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 856 struct buffer_head *bh_use[NAMEI_RA_SIZE];
857 struct buffer_head *bh, *ret = NULL; 857 struct buffer_head *bh, *ret = NULL;
858 ext4_lblk_t start, block, b; 858 ext4_lblk_t start, block, b;
859 const u8 *name = d_name->name;
859 int ra_max = 0; /* Number of bh's in the readahead 860 int ra_max = 0; /* Number of bh's in the readahead
860 buffer, bh_use[] */ 861 buffer, bh_use[] */
861 int ra_ptr = 0; /* Current index into readahead 862 int ra_ptr = 0; /* Current index into readahead
@@ -870,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
870 namelen = d_name->len; 871 namelen = d_name->len;
871 if (namelen > EXT4_NAME_LEN) 872 if (namelen > EXT4_NAME_LEN)
872 return NULL; 873 return NULL;
874 if ((namelen <= 2) && (name[0] == '.') &&
875 (name[1] == '.' || name[1] == '\0')) {
876 /*
877 * "." or ".." will only be in the first block
878 * NFS may look up ".."; "." should be handled by the VFS
879 */
880 block = start = 0;
881 nblocks = 1;
882 goto restart;
883 }
873 if (is_dx(dir)) { 884 if (is_dx(dir)) {
874 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); 885 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
875 /* 886 /*
@@ -960,55 +971,35 @@ cleanup_and_exit:
960static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, 971static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
961 struct ext4_dir_entry_2 **res_dir, int *err) 972 struct ext4_dir_entry_2 **res_dir, int *err)
962{ 973{
963 struct super_block * sb; 974 struct super_block * sb = dir->i_sb;
964 struct dx_hash_info hinfo; 975 struct dx_hash_info hinfo;
965 u32 hash;
966 struct dx_frame frames[2], *frame; 976 struct dx_frame frames[2], *frame;
967 struct ext4_dir_entry_2 *de, *top;
968 struct buffer_head *bh; 977 struct buffer_head *bh;
969 ext4_lblk_t block; 978 ext4_lblk_t block;
970 int retval; 979 int retval;
971 int namelen = d_name->len;
972 const u8 *name = d_name->name;
973 980
974 sb = dir->i_sb; 981 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
975 /* NFS may look up ".." - look at dx_root directory block */ 982 return NULL;
976 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
977 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
978 return NULL;
979 } else {
980 frame = frames;
981 frame->bh = NULL; /* for dx_release() */
982 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
983 dx_set_block(frame->at, 0); /* dx_root block is 0 */
984 }
985 hash = hinfo.hash;
986 do { 983 do {
987 block = dx_get_block(frame->at); 984 block = dx_get_block(frame->at);
988 if (!(bh = ext4_bread (NULL,dir, block, 0, err))) 985 if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
989 goto errout; 986 goto errout;
990 de = (struct ext4_dir_entry_2 *) bh->b_data;
991 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
992 EXT4_DIR_REC_LEN(0));
993 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
994 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
995 + ((char *) de - bh->b_data);
996
997 if (!ext4_check_dir_entry(dir, de, bh, off)) {
998 brelse(bh);
999 *err = ERR_BAD_DX_DIR;
1000 goto errout;
1001 }
1002 987
1003 if (ext4_match(namelen, name, de)) { 988 retval = search_dirblock(bh, dir, d_name,
1004 *res_dir = de; 989 block << EXT4_BLOCK_SIZE_BITS(sb),
1005 dx_release(frames); 990 res_dir);
1006 return bh; 991 if (retval == 1) { /* Success! */
1007 } 992 dx_release(frames);
993 return bh;
1008 } 994 }
1009 brelse(bh); 995 brelse(bh);
996 if (retval == -1) {
997 *err = ERR_BAD_DX_DIR;
998 goto errout;
999 }
1000
1010 /* Check to see if we should continue to search */ 1001 /* Check to see if we should continue to search */
1011 retval = ext4_htree_next_block(dir, hash, frame, 1002 retval = ext4_htree_next_block(dir, hinfo.hash, frame,
1012 frames, NULL); 1003 frames, NULL);
1013 if (retval < 0) { 1004 if (retval < 0) {
1014 ext4_warning(sb, 1005 ext4_warning(sb,
@@ -1045,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1045 return ERR_PTR(-EIO); 1036 return ERR_PTR(-EIO);
1046 } 1037 }
1047 inode = ext4_iget(dir->i_sb, ino); 1038 inode = ext4_iget(dir->i_sb, ino);
1048 if (unlikely(IS_ERR(inode))) { 1039 if (IS_ERR(inode)) {
1049 if (PTR_ERR(inode) == -ESTALE) { 1040 if (PTR_ERR(inode) == -ESTALE) {
1050 EXT4_ERROR_INODE(dir, 1041 EXT4_ERROR_INODE(dir,
1051 "deleted inode referenced: %u", 1042 "deleted inode referenced: %u",
@@ -1278,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1278 de = (struct ext4_dir_entry_2 *)bh->b_data; 1269 de = (struct ext4_dir_entry_2 *)bh->b_data;
1279 top = bh->b_data + blocksize - reclen; 1270 top = bh->b_data + blocksize - reclen;
1280 while ((char *) de <= top) { 1271 while ((char *) de <= top) {
1281 if (!ext4_check_dir_entry(dir, de, bh, offset)) 1272 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1282 return -EIO; 1273 return -EIO;
1283 if (ext4_match(namelen, name, de)) 1274 if (ext4_match(namelen, name, de))
1284 return -EEXIST; 1275 return -EEXIST;
@@ -1611,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1611 if (err) 1602 if (err)
1612 goto journal_error; 1603 goto journal_error;
1613 } 1604 }
1614 ext4_handle_dirty_metadata(handle, inode, frames[0].bh); 1605 err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1606 if (err) {
1607 ext4_std_error(inode->i_sb, err);
1608 goto cleanup;
1609 }
1615 } 1610 }
1616 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1611 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1617 if (!de) 1612 if (!de)
@@ -1639,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle,
1639{ 1634{
1640 struct ext4_dir_entry_2 *de, *pde; 1635 struct ext4_dir_entry_2 *de, *pde;
1641 unsigned int blocksize = dir->i_sb->s_blocksize; 1636 unsigned int blocksize = dir->i_sb->s_blocksize;
1642 int i; 1637 int i, err;
1643 1638
1644 i = 0; 1639 i = 0;
1645 pde = NULL; 1640 pde = NULL;
1646 de = (struct ext4_dir_entry_2 *) bh->b_data; 1641 de = (struct ext4_dir_entry_2 *) bh->b_data;
1647 while (i < bh->b_size) { 1642 while (i < bh->b_size) {
1648 if (!ext4_check_dir_entry(dir, de, bh, i)) 1643 if (ext4_check_dir_entry(dir, NULL, de, bh, i))
1649 return -EIO; 1644 return -EIO;
1650 if (de == de_del) { 1645 if (de == de_del) {
1651 BUFFER_TRACE(bh, "get_write_access"); 1646 BUFFER_TRACE(bh, "get_write_access");
1652 ext4_journal_get_write_access(handle, bh); 1647 err = ext4_journal_get_write_access(handle, bh);
1648 if (unlikely(err)) {
1649 ext4_std_error(dir->i_sb, err);
1650 return err;
1651 }
1653 if (pde) 1652 if (pde)
1654 pde->rec_len = ext4_rec_len_to_disk( 1653 pde->rec_len = ext4_rec_len_to_disk(
1655 ext4_rec_len_from_disk(pde->rec_len, 1654 ext4_rec_len_from_disk(pde->rec_len,
@@ -1661,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle,
1661 de->inode = 0; 1660 de->inode = 0;
1662 dir->i_version++; 1661 dir->i_version++;
1663 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1662 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1664 ext4_handle_dirty_metadata(handle, dir, bh); 1663 err = ext4_handle_dirty_metadata(handle, dir, bh);
1664 if (unlikely(err)) {
1665 ext4_std_error(dir->i_sb, err);
1666 return err;
1667 }
1665 return 0; 1668 return 0;
1666 } 1669 }
1667 i += ext4_rec_len_from_disk(de->rec_len, blocksize); 1670 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -1798,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1798{ 1801{
1799 handle_t *handle; 1802 handle_t *handle;
1800 struct inode *inode; 1803 struct inode *inode;
1801 struct buffer_head *dir_block; 1804 struct buffer_head *dir_block = NULL;
1802 struct ext4_dir_entry_2 *de; 1805 struct ext4_dir_entry_2 *de;
1803 unsigned int blocksize = dir->i_sb->s_blocksize; 1806 unsigned int blocksize = dir->i_sb->s_blocksize;
1804 int err, retries = 0; 1807 int err, retries = 0;
@@ -1831,7 +1834,9 @@ retry:
1831 if (!dir_block) 1834 if (!dir_block)
1832 goto out_clear_inode; 1835 goto out_clear_inode;
1833 BUFFER_TRACE(dir_block, "get_write_access"); 1836 BUFFER_TRACE(dir_block, "get_write_access");
1834 ext4_journal_get_write_access(handle, dir_block); 1837 err = ext4_journal_get_write_access(handle, dir_block);
1838 if (err)
1839 goto out_clear_inode;
1835 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1840 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1836 de->inode = cpu_to_le32(inode->i_ino); 1841 de->inode = cpu_to_le32(inode->i_ino);
1837 de->name_len = 1; 1842 de->name_len = 1;
@@ -1848,10 +1853,12 @@ retry:
1848 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1853 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1849 inode->i_nlink = 2; 1854 inode->i_nlink = 2;
1850 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 1855 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1851 ext4_handle_dirty_metadata(handle, dir, dir_block); 1856 err = ext4_handle_dirty_metadata(handle, dir, dir_block);
1852 brelse(dir_block); 1857 if (err)
1853 ext4_mark_inode_dirty(handle, inode); 1858 goto out_clear_inode;
1854 err = ext4_add_entry(handle, dentry, inode); 1859 err = ext4_mark_inode_dirty(handle, inode);
1860 if (!err)
1861 err = ext4_add_entry(handle, dentry, inode);
1855 if (err) { 1862 if (err) {
1856out_clear_inode: 1863out_clear_inode:
1857 clear_nlink(inode); 1864 clear_nlink(inode);
@@ -1862,10 +1869,13 @@ out_clear_inode:
1862 } 1869 }
1863 ext4_inc_count(handle, dir); 1870 ext4_inc_count(handle, dir);
1864 ext4_update_dx_flag(dir); 1871 ext4_update_dx_flag(dir);
1865 ext4_mark_inode_dirty(handle, dir); 1872 err = ext4_mark_inode_dirty(handle, dir);
1873 if (err)
1874 goto out_clear_inode;
1866 d_instantiate(dentry, inode); 1875 d_instantiate(dentry, inode);
1867 unlock_new_inode(inode); 1876 unlock_new_inode(inode);
1868out_stop: 1877out_stop:
1878 brelse(dir_block);
1869 ext4_journal_stop(handle); 1879 ext4_journal_stop(handle);
1870 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 1880 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1871 goto retry; 1881 goto retry;
@@ -1928,7 +1938,7 @@ static int empty_dir(struct inode *inode)
1928 } 1938 }
1929 de = (struct ext4_dir_entry_2 *) bh->b_data; 1939 de = (struct ext4_dir_entry_2 *) bh->b_data;
1930 } 1940 }
1931 if (!ext4_check_dir_entry(inode, de, bh, offset)) { 1941 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
1932 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1942 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1933 sb->s_blocksize); 1943 sb->s_blocksize);
1934 offset = (offset | (sb->s_blocksize - 1)) + 1; 1944 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2312,7 +2322,7 @@ retry:
2312 2322
2313 inode->i_ctime = ext4_current_time(inode); 2323 inode->i_ctime = ext4_current_time(inode);
2314 ext4_inc_count(handle, inode); 2324 ext4_inc_count(handle, inode);
2315 atomic_inc(&inode->i_count); 2325 ihold(inode);
2316 2326
2317 err = ext4_add_entry(handle, dentry, inode); 2327 err = ext4_add_entry(handle, dentry, inode);
2318 if (!err) { 2328 if (!err) {
@@ -2416,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2416 ext4_current_time(new_dir); 2426 ext4_current_time(new_dir);
2417 ext4_mark_inode_dirty(handle, new_dir); 2427 ext4_mark_inode_dirty(handle, new_dir);
2418 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 2428 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2419 ext4_handle_dirty_metadata(handle, new_dir, new_bh); 2429 retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2430 if (unlikely(retval)) {
2431 ext4_std_error(new_dir->i_sb, retval);
2432 goto end_rename;
2433 }
2420 brelse(new_bh); 2434 brelse(new_bh);
2421 new_bh = NULL; 2435 new_bh = NULL;
2422 } 2436 }
@@ -2468,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2468 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2469 cpu_to_le32(new_dir->i_ino); 2483 cpu_to_le32(new_dir->i_ino);
2470 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2471 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2485 retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2486 if (retval) {
2487 ext4_std_error(old_dir->i_sb, retval);
2488 goto end_rename;
2489 }
2472 ext4_dec_count(handle, old_dir); 2490 ext4_dec_count(handle, old_dir);
2473 if (new_inode) { 2491 if (new_inode) {
2474 /* checked empty_dir above, can't have another parent, 2492 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 000000000000..7270dcfca92a
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,428 @@
1/*
2 * linux/fs/ext4/page-io.c
3 *
4 * This contains the new page_io functions for ext4
5 *
6 * Written by Theodore Ts'o, 2010.
7 */
8
9#include <linux/module.h>
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/jbd2.h>
13#include <linux/highuid.h>
14#include <linux/pagemap.h>
15#include <linux/quotaops.h>
16#include <linux/string.h>
17#include <linux/buffer_head.h>
18#include <linux/writeback.h>
19#include <linux/pagevec.h>
20#include <linux/mpage.h>
21#include <linux/namei.h>
22#include <linux/uio.h>
23#include <linux/bio.h>
24#include <linux/workqueue.h>
25#include <linux/kernel.h>
26#include <linux/slab.h>
27
28#include "ext4_jbd2.h"
29#include "xattr.h"
30#include "acl.h"
31#include "ext4_extents.h"
32
33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
39int __init ext4_init_pageio(void)
40{
41 int i;
42
43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
44 if (io_page_cachep == NULL)
45 return -ENOMEM;
46 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
47 if (io_end_cachep == NULL) {
48 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM;
50 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
53
54 return 0;
55}
56
57void ext4_exit_pageio(void)
58{
59 kmem_cache_destroy(io_end_cachep);
60 kmem_cache_destroy(io_page_cachep);
61}
62
63void ext4_ioend_wait(struct inode *inode)
64{
65 wait_queue_head_t *wq = to_ioend_wq(inode);
66
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68}
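
The hashed wait queue above trades a per-inode wait queue for a fixed table of 37 slots: to_ioend_wq() maps the inode pointer to a slot, and a hash collision costs at most a spurious wakeup, which wait_event() absorbs by re-testing its condition. A runnable user-space sketch of the hashing idea (slot_of() is an illustrative name, not an ext4 symbol):

#include <stdio.h>

#define WQ_HASH_SZ 37	/* prime, as above, to spread pointer values */

/* Map an object's address onto one of WQ_HASH_SZ shared slots. */
static unsigned int slot_of(const void *obj)
{
	return (unsigned long)obj % WQ_HASH_SZ;
}

int main(void)
{
	int a, b;

	printf("object at %p -> wait-queue slot %u\n", (void *)&a, slot_of(&a));
	printf("object at %p -> wait-queue slot %u\n", (void *)&b, slot_of(&b));
	return 0;
}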
69
70static void put_io_page(struct ext4_io_page *io_page)
71{
72 if (atomic_dec_and_test(&io_page->p_count)) {
73 end_page_writeback(io_page->p_page);
74 put_page(io_page->p_page);
75 kmem_cache_free(io_page_cachep, io_page);
76 }
77}
78
79void ext4_free_io_end(ext4_io_end_t *io)
80{
81 int i;
82 wait_queue_head_t *wq;
83
84 BUG_ON(!io);
85 if (io->page)
86 put_page(io->page);
87 for (i = 0; i < io->num_io_pages; i++)
88 put_io_page(io->pages[i]);
89 io->num_io_pages = 0;
90 wq = to_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq))
93 wake_up_all(wq);
94 kmem_cache_free(io_end_cachep, io);
95}
96
97/*
98 * check a range of space and convert unwritten extents to written.
99 */
100int ext4_end_io_nolock(ext4_io_end_t *io)
101{
102 struct inode *inode = io->inode;
103 loff_t offset = io->offset;
104 ssize_t size = io->size;
105 int ret = 0;
106
107 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu, list->next 0x%p,"
108 "list->prev 0x%p\n",
109 io, inode->i_ino, io->list.next, io->list.prev);
110
111 if (list_empty(&io->list))
112 return ret;
113
114 if (!(io->flag & EXT4_IO_END_UNWRITTEN))
115 return ret;
116
117 ret = ext4_convert_unwritten_extents(inode, offset, size);
118 if (ret < 0) {
119 printk(KERN_EMERG "%s: failed to convert unwritten "
120 "extents to written extents, error is %d "
121 "io is still on inode %lu aio dio list\n",
122 __func__, ret, inode->i_ino);
123 return ret;
124 }
125
126 if (io->iocb)
127 aio_complete(io->iocb, io->result, 0);
128 /* clear the DIO AIO unwritten flag */
129 io->flag &= ~EXT4_IO_END_UNWRITTEN;
130 return ret;
131}
132
133/*
134 * work on completed aio dio IO, to convert unwritten extents to written extents
135 */
136static void ext4_end_io_work(struct work_struct *work)
137{
138 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
139 struct inode *inode = io->inode;
140 struct ext4_inode_info *ei = EXT4_I(inode);
141 unsigned long flags;
142 int ret;
143
144 mutex_lock(&inode->i_mutex);
145 ret = ext4_end_io_nolock(io);
146 if (ret < 0) {
147 mutex_unlock(&inode->i_mutex);
148 return;
149 }
150
151 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
152 if (!list_empty(&io->list))
153 list_del_init(&io->list);
154 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
155 mutex_unlock(&inode->i_mutex);
156 ext4_free_io_end(io);
157}
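
ext4_end_io_nolock() takes i_mutex and may start journal operations, neither of which is permissible from bio completion context, so ext4_end_bio() below only queues the io_end and this work function finishes the conversion in process context. A minimal sketch of that defer-to-workqueue pattern, assuming nothing beyond the core workqueue API (the demo_* names are illustrative):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_work;

/* Runs later in process context, like ext4_end_io_work() above. */
static void demo_work_fn(struct work_struct *work)
{
	pr_info("deferred completion work running\n");
}

static int __init demo_init(void)
{
	demo_wq = create_workqueue("demo-io-end");
	if (!demo_wq)
		return -ENOMEM;
	INIT_WORK(&demo_work, demo_work_fn);
	/* An interrupt-context completion handler would stop here ... */
	queue_work(demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	flush_workqueue(demo_wq);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");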
158
159ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
160{
161 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
162 if (io) {
163 atomic_inc(&EXT4_I(inode)->i_ioend_count);
164 io->inode = inode;
165 INIT_WORK(&io->work, ext4_end_io_work);
166 INIT_LIST_HEAD(&io->list);
167 }
168 return io;
169}
170
171/*
172 * Print a buffer I/O error compatible with the one in fs/buffer.c. This
173 * provides compatibility with dmesg scrapers that look for a specific
174 * buffer I/O error message. We really need a unified error reporting
175 * structure to userspace a la Digital Unix's uerf system, but it's
176 * probably not going to happen in my lifetime, due to LKML politics...
177 */
178static void buffer_io_error(struct buffer_head *bh)
179{
180 char b[BDEVNAME_SIZE];
181 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
182 bdevname(bh->b_bdev, b),
183 (unsigned long long)bh->b_blocknr);
184}
185
186static void ext4_end_bio(struct bio *bio, int error)
187{
188 ext4_io_end_t *io_end = bio->bi_private;
189 struct workqueue_struct *wq;
190 struct inode *inode;
191 unsigned long flags;
192 int i; sector_t bi_sector = bio->bi_sector; /* sample before bio_put() */
193
194 BUG_ON(!io_end);
195 bio->bi_private = NULL;
196 bio->bi_end_io = NULL;
197 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
198 error = 0;
199 bio_put(bio);
200
201 for (i = 0; i < io_end->num_io_pages; i++) {
202 struct page *page = io_end->pages[i]->p_page;
203 struct buffer_head *bh, *head;
204 int partial_write = 0;
205
206 head = page_buffers(page);
207 if (error)
208 SetPageError(page);
209 BUG_ON(!head);
210 if (head->b_size == PAGE_CACHE_SIZE)
211 clear_buffer_dirty(head);
212 else {
213 loff_t offset;
214 loff_t io_end_offset = io_end->offset + io_end->size;
215
216 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
217 bh = head;
218 do {
219 if ((offset >= io_end->offset) &&
220 (offset+bh->b_size <= io_end_offset)) {
221 if (error)
222 buffer_io_error(bh);
223
224 clear_buffer_dirty(bh);
225 }
226 if (buffer_delay(bh))
227 partial_write = 1;
228 else if (!buffer_mapped(bh))
229 clear_buffer_dirty(bh);
230 else if (buffer_dirty(bh))
231 partial_write = 1;
232 offset += bh->b_size;
233 bh = bh->b_this_page;
234 } while (bh != head);
235 }
236
237 /*
238 * If this is a partial write which happened to make
239 * all buffers uptodate then we can optimize away a
240 * bogus readpage() for the next read(). Here we
241 * 'discover' whether the page went uptodate as a
242 * result of this (potentially partial) write.
243 */
244 if (!partial_write)
245 SetPageUptodate(page);
246
247 put_io_page(io_end->pages[i]);
248 }
249 io_end->num_io_pages = 0;
250 inode = io_end->inode;
251
252 if (error) {
253 io_end->flag |= EXT4_IO_END_ERROR;
254 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
255 "(offset %llu size %ld starting block %llu)",
256 inode->i_ino,
257 (unsigned long long) io_end->offset,
258 (long) io_end->size,
259 (unsigned long long)
260 bi_sector >> (inode->i_blkbits - 9));
261 }
262
263 /* Add the io_end to per-inode completed io list */
264 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
265 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
266 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
267
268 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
269 /* queue the work to convert unwritten extents to written */
270 queue_work(wq, &io_end->work);
271}
272
273void ext4_io_submit(struct ext4_io_submit *io)
274{
275 struct bio *bio = io->io_bio;
276
277 if (bio) {
278 bio_get(io->io_bio);
279 submit_bio(io->io_op, io->io_bio);
280 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
281 bio_put(io->io_bio);
282 }
283 io->io_bio = NULL;
284 io->io_op = 0;
285 io->io_end = NULL;
286}
287
288static int io_submit_init(struct ext4_io_submit *io,
289 struct inode *inode,
290 struct writeback_control *wbc,
291 struct buffer_head *bh)
292{
293 ext4_io_end_t *io_end;
294 struct page *page = bh->b_page;
295 int nvecs = bio_get_nr_vecs(bh->b_bdev);
296 struct bio *bio;
297
298 io_end = ext4_init_io_end(inode, GFP_NOFS);
299 if (!io_end)
300 return -ENOMEM;
301 do {
302 bio = bio_alloc(GFP_NOIO, nvecs);
303 nvecs >>= 1;
304 } while (bio == NULL);
305
306 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
307 bio->bi_bdev = bh->b_bdev;
308 bio->bi_private = io->io_end = io_end;
309 bio->bi_end_io = ext4_end_bio;
310
311 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
312
313 io->io_bio = bio;
314 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
315 WRITE_SYNC_PLUG : WRITE);
316 io->io_next_block = bh->b_blocknr;
317 return 0;
318}
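
The allocation loop in io_submit_init() asks for the device's maximum vector count and halves the request after each failure; with a blocking GFP mask, small bio allocations come from a mempool and are expected to succeed, so the loop terminates. The idiom isolated as a helper (alloc_bio_shrinking() is an illustrative name):

#include <linux/bio.h>

static struct bio *alloc_bio_shrinking(struct block_device *bdev)
{
	int nvecs = bio_get_nr_vecs(bdev);
	struct bio *bio;

	do {
		/* Halve the vector count until the allocator cooperates. */
		bio = bio_alloc(GFP_NOIO, nvecs);
		nvecs >>= 1;
	} while (bio == NULL);
	return bio;
}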
319
320static int io_submit_add_bh(struct ext4_io_submit *io,
321 struct ext4_io_page *io_page,
322 struct inode *inode,
323 struct writeback_control *wbc,
324 struct buffer_head *bh)
325{
326 ext4_io_end_t *io_end;
327 int ret;
328
329 if (buffer_new(bh)) {
330 clear_buffer_new(bh);
331 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
332 }
333
334 if (!buffer_mapped(bh) || buffer_delay(bh)) {
335 if (!buffer_mapped(bh))
336 clear_buffer_dirty(bh);
337 if (io->io_bio)
338 ext4_io_submit(io);
339 return 0;
340 }
341
342 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
343submit_and_retry:
344 ext4_io_submit(io);
345 }
346 if (io->io_bio == NULL) {
347 ret = io_submit_init(io, inode, wbc, bh);
348 if (ret)
349 return ret;
350 }
351 io_end = io->io_end;
352 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
353 (io_end->pages[io_end->num_io_pages-1] != io_page))
354 goto submit_and_retry;
355 if (buffer_uninit(bh))
356 io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
357 io->io_end->size += bh->b_size;
358 io->io_next_block++;
359 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
360 if (ret != bh->b_size)
361 goto submit_and_retry;
362 if ((io_end->num_io_pages == 0) ||
363 (io_end->pages[io_end->num_io_pages-1] != io_page)) {
364 io_end->pages[io_end->num_io_pages++] = io_page;
365 atomic_inc(&io_page->p_count);
366 }
367 return 0;
368}
369
370int ext4_bio_write_page(struct ext4_io_submit *io,
371 struct page *page,
372 int len,
373 struct writeback_control *wbc)
374{
375 struct inode *inode = page->mapping->host;
376 unsigned block_start, block_end, blocksize;
377 struct ext4_io_page *io_page;
378 struct buffer_head *bh, *head;
379 int ret = 0;
380
381 blocksize = 1 << inode->i_blkbits;
382
383 BUG_ON(PageWriteback(page));
384 set_page_writeback(page);
385 ClearPageError(page);
386
387 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
388 if (!io_page) {
389 set_page_dirty(page);
390 unlock_page(page);
391 return -ENOMEM;
392 }
393 io_page->p_page = page;
394 atomic_set(&io_page->p_count, 1);
395 get_page(page);
396
397 for (bh = head = page_buffers(page), block_start = 0;
398 bh != head || !block_start;
399 block_start = block_end, bh = bh->b_this_page) {
400 block_end = block_start + blocksize;
401 if (block_start >= len) {
402 clear_buffer_dirty(bh);
403 set_buffer_uptodate(bh);
404 continue;
405 }
406 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
407 if (ret) {
408 /*
409 * We only get here on ENOMEM. Not much else
410 * we can do but mark the page as dirty, and hope for
411 * better luck next time.
412 */
413 set_page_dirty(page);
414 break;
415 }
416 }
417 unlock_page(page);
418 /*
419 * If the page was truncated before we could do the writeback,
420 * or we had a memory allocation error while trying to write
421 * the first buffer head, we won't have submitted any pages for
422 * I/O. In that case we need to make sure we've cleared the
423 * PageWriteback bit from the page to prevent the system from
424 * wedging later on.
425 */
426 put_io_page(io_page);
427 return ret;
428}
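
Taken together, the file gives the writeback path a three-step calling convention: zero an ext4_io_submit on the stack, feed it locked dirty pages, then flush whatever bio is still under construction. A hedged caller-side sketch; for_each_locked_dirty_page() is a hypothetical stand-in for the real write_cache_pages()-style loop, and mapping/wbc are assumed context:

struct ext4_io_submit io;
struct page *page;
int err = 0;

memset(&io, 0, sizeof(io));
for_each_locked_dirty_page(mapping, page) {	/* hypothetical iterator */
	/* Queues the page's buffers; submits the bio itself when it
	 * fills up or goes discontiguous, and unlocks the page. */
	err = ext4_bio_write_page(&io, page, PAGE_CACHE_SIZE, wbc);
	if (err)
		break;
}
ext4_io_submit(&io);	/* flush the partially built bio, if any */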
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca5c8aa00a2f..3ecc6e45d2f9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -220,29 +220,25 @@ static int setup_new_group_blocks(struct super_block *sb,
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 221 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb); 222 unlock_buffer(gdb);
223 ext4_handle_dirty_metadata(handle, NULL, gdb); 223 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) {
225 brelse(gdb);
226 goto exit_bh;
227 }
224 ext4_set_bit(bit, bh->b_data); 228 ext4_set_bit(bit, bh->b_data);
225 brelse(gdb); 229 brelse(gdb);
226 } 230 }
227 231
228 /* Zero out all of the reserved backup group descriptor table blocks */ 232 /* Zero out all of the reserved backup group descriptor table blocks */
229 for (i = 0, bit = gdblocks + 1, block = start + bit; 233 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
230 i < reserved_gdb; i++, block++, bit++) { 234 block, sbi->s_itb_per_group);
231 struct buffer_head *gdb; 235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
232 236 GFP_NOFS);
233 ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); 237 if (err)
234 238 goto exit_bh;
235 if ((err = extend_or_restart_transaction(handle, 1, bh))) 239 for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
236 goto exit_bh;
237
238 if (IS_ERR(gdb = bclean(handle, sb, block))) {
239 err = PTR_ERR(gdb);
240 goto exit_bh;
241 }
242 ext4_handle_dirty_metadata(handle, NULL, gdb);
243 ext4_set_bit(bit, bh->b_data); 240 ext4_set_bit(bit, bh->b_data);
244 brelse(gdb); 241
245 }
246 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 242 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
247 input->block_bitmap - start); 243 input->block_bitmap - start);
248 ext4_set_bit(input->block_bitmap - start, bh->b_data); 244 ext4_set_bit(input->block_bitmap - start, bh->b_data);
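
The per-block bclean() loop above (and the inode-table loop in the next hunk) collapses into a single sb_issue_zeroout() call, which hands one zeroing request to the block layer instead of journalling each buffer; the blocks belong to a group that is not yet live, so bypassing the journal is safe. The shape of the call, mirroring the hunks (start/count stand for gdblocks + start + 1/reserved_gdb here, and input->inode_table/sbi->s_itb_per_group below):

/* Zero `count' blocks starting at `start' in one request; no journal
 * credits are needed since nothing references these blocks yet. */
err = sb_issue_zeroout(sb, start, count, GFP_NOFS);
if (err)
	goto exit_bh;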
@@ -251,29 +247,26 @@ static int setup_new_group_blocks(struct super_block *sb,
251 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 247 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
252 248
253 /* Zero out all of the inode table blocks */ 249 /* Zero out all of the inode table blocks */
254 for (i = 0, block = input->inode_table, bit = block - start; 250 block = input->inode_table;
255 i < sbi->s_itb_per_group; i++, bit++, block++) { 251 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
256 struct buffer_head *it; 252 block, sbi->s_itb_per_group);
257 253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
258 ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); 254 if (err)
259 255 goto exit_bh;
260 if ((err = extend_or_restart_transaction(handle, 1, bh))) 256 for (i = 0, bit = input->inode_table - start;
261 goto exit_bh; 257 i < sbi->s_itb_per_group; i++, bit++)
262
263 if (IS_ERR(it = bclean(handle, sb, block))) {
264 err = PTR_ERR(it);
265 goto exit_bh;
266 }
267 ext4_handle_dirty_metadata(handle, NULL, it);
268 brelse(it);
269 ext4_set_bit(bit, bh->b_data); 258 ext4_set_bit(bit, bh->b_data);
270 }
271 259
272 if ((err = extend_or_restart_transaction(handle, 2, bh))) 260 if ((err = extend_or_restart_transaction(handle, 2, bh)))
273 goto exit_bh; 261 goto exit_bh;
274 262
275 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); 263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
276 ext4_handle_dirty_metadata(handle, NULL, bh); 264 bh->b_data);
265 err = ext4_handle_dirty_metadata(handle, NULL, bh);
266 if (unlikely(err)) {
267 ext4_std_error(sb, err);
268 goto exit_bh;
269 }
277 brelse(bh); 270 brelse(bh);
278 /* Mark unused entries in inode bitmap used */ 271 /* Mark unused entries in inode bitmap used */
279 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 272 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
@@ -283,9 +276,11 @@ static int setup_new_group_blocks(struct super_block *sb,
283 goto exit_journal; 276 goto exit_journal;
284 } 277 }
285 278
286 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 279 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
287 bh->b_data); 280 bh->b_data);
288 ext4_handle_dirty_metadata(handle, NULL, bh); 281 err = ext4_handle_dirty_metadata(handle, NULL, bh);
282 if (unlikely(err))
283 ext4_std_error(sb, err);
289exit_bh: 284exit_bh:
290 brelse(bh); 285 brelse(bh);
291 286
@@ -437,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
437 goto exit_dind; 432 goto exit_dind;
438 } 433 }
439 434
440 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) 435 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
436 if (unlikely(err))
441 goto exit_dind; 437 goto exit_dind;
442 438
443 if ((err = ext4_journal_get_write_access(handle, *primary))) 439 err = ext4_journal_get_write_access(handle, *primary);
440 if (unlikely(err))
444 goto exit_sbh; 441 goto exit_sbh;
445 442
446 if ((err = ext4_journal_get_write_access(handle, dind))) 443 err = ext4_journal_get_write_access(handle, dind);
447 goto exit_primary; 444 if (unlikely(err))
445 ext4_std_error(sb, err);
448 446
449 /* ext4_reserve_inode_write() gets a reference on the iloc */ 447 /* ext4_reserve_inode_write() gets a reference on the iloc */
450 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 448 err = ext4_reserve_inode_write(handle, inode, &iloc);
449 if (unlikely(err))
451 goto exit_dindj; 450 goto exit_dindj;
452 451
453 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
@@ -469,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
469 * reserved inode, and will become GDT blocks (primary and backup). 468 * reserved inode, and will become GDT blocks (primary and backup).
470 */ 469 */
471 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 470 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
472 ext4_handle_dirty_metadata(handle, NULL, dind); 471 err = ext4_handle_dirty_metadata(handle, NULL, dind);
473 brelse(dind); 472 if (unlikely(err)) {
473 ext4_std_error(sb, err);
474 goto exit_inode;
475 }
474 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
475 ext4_mark_iloc_dirty(handle, inode, &iloc); 477 ext4_mark_iloc_dirty(handle, inode, &iloc);
476 memset((*primary)->b_data, 0, sb->s_blocksize); 478 memset((*primary)->b_data, 0, sb->s_blocksize);
477 ext4_handle_dirty_metadata(handle, NULL, *primary); 479 err = ext4_handle_dirty_metadata(handle, NULL, *primary);
480 if (unlikely(err)) {
481 ext4_std_error(sb, err);
482 goto exit_inode;
483 }
484 brelse(dind);
478 485
479 o_group_desc = EXT4_SB(sb)->s_group_desc; 486 o_group_desc = EXT4_SB(sb)->s_group_desc;
480 memcpy(n_group_desc, o_group_desc, 487 memcpy(n_group_desc, o_group_desc,
@@ -485,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
485 kfree(o_group_desc); 492 kfree(o_group_desc);
486 493
487 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
488 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
496 if (err)
497 ext4_std_error(sb, err);
489 498
490 return 0; 499 return err;
491 500
492exit_inode: 501exit_inode:
493 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_journal_release_buffer(handle, iloc.bh); */
494 brelse(iloc.bh); 503 brelse(iloc.bh);
495exit_dindj: 504exit_dindj:
496 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_journal_release_buffer(handle, dind); */
497exit_primary:
498 /* ext4_journal_release_buffer(handle, *primary); */
499exit_sbh: 506exit_sbh:
500 /* ext4_journal_release_buffer(handle, *primary); */ 507 /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
501exit_dind: 508exit_dind:
502 brelse(dind); 509 brelse(dind);
503exit_bh: 510exit_bh:
@@ -680,7 +687,9 @@ static void update_backups(struct super_block *sb,
680 memset(bh->b_data + size, 0, rest); 687 memset(bh->b_data + size, 0, rest);
681 set_buffer_uptodate(bh); 688 set_buffer_uptodate(bh);
682 unlock_buffer(bh); 689 unlock_buffer(bh);
683 ext4_handle_dirty_metadata(handle, NULL, bh); 690 err = ext4_handle_dirty_metadata(handle, NULL, bh);
691 if (unlikely(err))
692 ext4_std_error(sb, err);
684 brelse(bh); 693 brelse(bh);
685 } 694 }
686 if ((err2 = ext4_journal_stop(handle)) && !err) 695 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -898,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
898 /* Update the global fs size fields */ 907 /* Update the global fs size fields */
899 sbi->s_groups_count++; 908 sbi->s_groups_count++;
900 909
901 ext4_handle_dirty_metadata(handle, NULL, primary); 910 err = ext4_handle_dirty_metadata(handle, NULL, primary);
911 if (unlikely(err)) {
912 ext4_std_error(sb, err);
913 goto exit_journal;
914 }
902 915
903 /* Update the reserved block counts only once the new group is 916 /* Update the reserved block counts only once the new group is
904 * active. */ 917 * active. */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 26147746c272..48ce561fafac 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/blkdev.h> 27#include <linux/blkdev.h>
28#include <linux/parser.h> 28#include <linux/parser.h>
29#include <linux/smp_lock.h>
30#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 30#include <linux/exportfs.h>
32#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -41,6 +40,9 @@
41#include <linux/crc16.h> 40#include <linux/crc16.h>
42#include <asm/uaccess.h> 41#include <asm/uaccess.h>
43 42
43#include <linux/kthread.h>
44#include <linux/freezer.h>
45
44#include "ext4.h" 46#include "ext4.h"
45#include "ext4_jbd2.h" 47#include "ext4_jbd2.h"
46#include "xattr.h" 48#include "xattr.h"
@@ -50,8 +52,11 @@
50#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 53#include <trace/events/ext4.h>
52 54
53struct proc_dir_entry *ext4_proc_root; 55static struct proc_dir_entry *ext4_proc_root;
54static struct kset *ext4_kset; 56static struct kset *ext4_kset;
57struct ext4_lazy_init *ext4_li_info;
58struct mutex ext4_li_mtx;
59struct ext4_features *ext4_feat;
55 60
56static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 61static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
57 unsigned long journal_devnum); 62 unsigned long journal_devnum);
@@ -68,14 +73,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 73static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 74static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt); 77 const char *dev_name, void *data);
78static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb);
73 80
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = { 82static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE, 83 .owner = THIS_MODULE,
77 .name = "ext3", 84 .name = "ext3",
78 .get_sb = ext4_get_sb, 85 .mount = ext4_mount,
79 .kill_sb = kill_block_super, 86 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV, 87 .fs_flags = FS_REQUIRES_DEV,
81}; 88};
@@ -381,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb)
381void __ext4_error(struct super_block *sb, const char *function, 388void __ext4_error(struct super_block *sb, const char *function,
382 unsigned int line, const char *fmt, ...) 389 unsigned int line, const char *fmt, ...)
383{ 390{
391 struct va_format vaf;
384 va_list args; 392 va_list args;
385 393
386 va_start(args, fmt); 394 va_start(args, fmt);
387 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", 395 vaf.fmt = fmt;
388 sb->s_id, function, line, current->comm); 396 vaf.va = &args;
389 vprintk(fmt, args); 397 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
390 printk("\n"); 398 sb->s_id, function, line, current->comm, &vaf);
391 va_end(args); 399 va_end(args);
392 400
393 ext4_handle_error(sb); 401 ext4_handle_error(sb);
@@ -398,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function,
398 const char *fmt, ...) 406 const char *fmt, ...)
399{ 407{
400 va_list args; 408 va_list args;
409 struct va_format vaf;
401 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 410 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
402 411
403 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 412 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
404 es->s_last_error_block = cpu_to_le64(block); 413 es->s_last_error_block = cpu_to_le64(block);
405 save_error_info(inode->i_sb, function, line); 414 save_error_info(inode->i_sb, function, line);
406 va_start(args, fmt); 415 va_start(args, fmt);
416 vaf.fmt = fmt;
417 vaf.va = &args;
407 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", 418 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
408 inode->i_sb->s_id, function, line, inode->i_ino); 419 inode->i_sb->s_id, function, line, inode->i_ino);
409 if (block) 420 if (block)
410 printk("block %llu: ", block); 421 printk(KERN_CONT "block %llu: ", block);
411 printk("comm %s: ", current->comm); 422 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
412 vprintk(fmt, args);
413 printk("\n");
414 va_end(args); 423 va_end(args);
415 424
416 ext4_handle_error(inode->i_sb); 425 ext4_handle_error(inode->i_sb);
417} 426}
418 427
419void ext4_error_file(struct file *file, const char *function, 428void ext4_error_file(struct file *file, const char *function,
420 unsigned int line, const char *fmt, ...) 429 unsigned int line, ext4_fsblk_t block,
430 const char *fmt, ...)
421{ 431{
422 va_list args; 432 va_list args;
433 struct va_format vaf;
423 struct ext4_super_block *es; 434 struct ext4_super_block *es;
424 struct inode *inode = file->f_dentry->d_inode; 435 struct inode *inode = file->f_dentry->d_inode;
425 char pathname[80], *path; 436 char pathname[80], *path;
@@ -427,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function,
427 es = EXT4_SB(inode->i_sb)->s_es; 438 es = EXT4_SB(inode->i_sb)->s_es;
428 es->s_last_error_ino = cpu_to_le32(inode->i_ino); 439 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
429 save_error_info(inode->i_sb, function, line); 440 save_error_info(inode->i_sb, function, line);
430 va_start(args, fmt);
431 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 441 path = d_path(&(file->f_path), pathname, sizeof(pathname));
432 if (!path) 442 if (IS_ERR(path))
433 path = "(unknown)"; 443 path = "(unknown)";
434 printk(KERN_CRIT 444 printk(KERN_CRIT
435 "EXT4-fs error (device %s): %s:%d: inode #%lu " 445 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
436 "(comm %s path %s): ", 446 inode->i_sb->s_id, function, line, inode->i_ino);
437 inode->i_sb->s_id, function, line, inode->i_ino, 447 if (block)
438 current->comm, path); 448 printk(KERN_CONT "block %llu: ", block);
439 vprintk(fmt, args); 449 va_start(args, fmt);
440 printk("\n"); 450 vaf.fmt = fmt;
451 vaf.va = &args;
452 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
441 va_end(args); 453 va_end(args);
442 454
443 ext4_handle_error(inode->i_sb); 455 ext4_handle_error(inode->i_sb);
@@ -536,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function,
536 panic("EXT4-fs panic from previous error\n"); 548 panic("EXT4-fs panic from previous error\n");
537} 549}
538 550
539void ext4_msg (struct super_block * sb, const char *prefix, 551void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
540 const char *fmt, ...)
541{ 552{
553 struct va_format vaf;
542 va_list args; 554 va_list args;
543 555
544 va_start(args, fmt); 556 va_start(args, fmt);
545 printk("%sEXT4-fs (%s): ", prefix, sb->s_id); 557 vaf.fmt = fmt;
546 vprintk(fmt, args); 558 vaf.va = &args;
547 printk("\n"); 559 printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
548 va_end(args); 560 va_end(args);
549} 561}
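
ext4_msg() and the error/warning helpers around it now route their varargs through struct va_format and printk's %pV extension, so the device prefix and the caller's message reach the log in one atomic printk(); the old vprintk()-plus-printk("\n") sequence could interleave with output from other CPUs. The reusable shape of the pattern (demo_warn() is an illustrative name):

#include <linux/kernel.h>

static void demo_warn(const char *tag, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* %pV expands vaf.fmt against vaf.va inside this single printk() */
	printk(KERN_WARNING "demo (%s): %pV\n", tag, &vaf);
	va_end(args);
}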
550 562
551void __ext4_warning(struct super_block *sb, const char *function, 563void __ext4_warning(struct super_block *sb, const char *function,
552 unsigned int line, const char *fmt, ...) 564 unsigned int line, const char *fmt, ...)
553{ 565{
566 struct va_format vaf;
554 va_list args; 567 va_list args;
555 568
556 va_start(args, fmt); 569 va_start(args, fmt);
557 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", 570 vaf.fmt = fmt;
558 sb->s_id, function, line); 571 vaf.va = &args;
559 vprintk(fmt, args); 572 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
560 printk("\n"); 573 sb->s_id, function, line, &vaf);
561 va_end(args); 574 va_end(args);
562} 575}
563 576
@@ -568,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line,
568__releases(bitlock) 581__releases(bitlock)
569__acquires(bitlock) 582__acquires(bitlock)
570{ 583{
584 struct va_format vaf;
571 va_list args; 585 va_list args;
572 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 586 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
573 587
574 es->s_last_error_ino = cpu_to_le32(ino); 588 es->s_last_error_ino = cpu_to_le32(ino);
575 es->s_last_error_block = cpu_to_le64(block); 589 es->s_last_error_block = cpu_to_le64(block);
576 __save_error_info(sb, function, line); 590 __save_error_info(sb, function, line);
591
577 va_start(args, fmt); 592 va_start(args, fmt);
593
594 vaf.fmt = fmt;
595 vaf.va = &args;
578 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 596 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
579 sb->s_id, function, line, grp); 597 sb->s_id, function, line, grp);
580 if (ino) 598 if (ino)
581 printk("inode %lu: ", ino); 599 printk(KERN_CONT "inode %lu: ", ino);
582 if (block) 600 if (block)
583 printk("block %llu:", (unsigned long long) block); 601 printk(KERN_CONT "block %llu:", (unsigned long long) block);
584 vprintk(fmt, args); 602 printk(KERN_CONT "%pV\n", &vaf);
585 printk("\n");
586 va_end(args); 603 va_end(args);
587 604
588 if (test_opt(sb, ERRORS_CONT)) { 605 if (test_opt(sb, ERRORS_CONT)) {
@@ -640,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
640 struct block_device *bdev; 657 struct block_device *bdev;
641 char b[BDEVNAME_SIZE]; 658 char b[BDEVNAME_SIZE];
642 659
643 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 660 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
644 if (IS_ERR(bdev)) 661 if (IS_ERR(bdev))
645 goto fail; 662 goto fail;
646 return bdev; 663 return bdev;
@@ -656,8 +673,7 @@ fail:
656 */ 673 */
657static int ext4_blkdev_put(struct block_device *bdev) 674static int ext4_blkdev_put(struct block_device *bdev)
658{ 675{
659 bd_release(bdev); 676 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
660 return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
661} 677}
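
The journal-device open switches from open_by_devnum() with a separate bd_release() to blkdev_get_by_dev() with FMODE_EXCL, which folds the exclusive "holder" claim into the open itself; the same mode mask must be handed back to blkdev_put(). Usage sketch (the holder may be any stable pointer, and the superblock is the natural choice here):

struct block_device *bdev;

bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, sb);
if (IS_ERR(bdev))
	return PTR_ERR(bdev);
/* ... use the device; `sb' holds the exclusive claim ... */
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);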
662 678
663static int ext4_blkdev_remove(struct ext4_sb_info *sbi) 679static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -702,13 +718,13 @@ static void ext4_put_super(struct super_block *sb)
702 struct ext4_super_block *es = sbi->s_es; 718 struct ext4_super_block *es = sbi->s_es;
703 int i, err; 719 int i, err;
704 720
721 ext4_unregister_li_request(sb);
705 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 722 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
706 723
707 flush_workqueue(sbi->dio_unwritten_wq); 724 flush_workqueue(sbi->dio_unwritten_wq);
708 destroy_workqueue(sbi->dio_unwritten_wq); 725 destroy_workqueue(sbi->dio_unwritten_wq);
709 726
710 lock_super(sb); 727 lock_super(sb);
711 lock_kernel();
712 if (sb->s_dirt) 728 if (sb->s_dirt)
713 ext4_commit_super(sb, 1); 729 ext4_commit_super(sb, 1);
714 730
@@ -719,6 +735,7 @@ static void ext4_put_super(struct super_block *sb)
719 ext4_abort(sb, "Couldn't clean up the journal"); 735 ext4_abort(sb, "Couldn't clean up the journal");
720 } 736 }
721 737
738 del_timer(&sbi->s_err_report);
722 ext4_release_system_zone(sb); 739 ext4_release_system_zone(sb);
723 ext4_mb_release(sb); 740 ext4_mb_release(sb);
724 ext4_ext_release(sb); 741 ext4_ext_release(sb);
@@ -775,7 +792,6 @@ static void ext4_put_super(struct super_block *sb)
775 * Now that we are completely done shutting down the 792 * Now that we are completely done shutting down the
776 * superblock, we need to actually destroy the kobject. 793 * superblock, we need to actually destroy the kobject.
777 */ 794 */
778 unlock_kernel();
779 unlock_super(sb); 795 unlock_super(sb);
780 kobject_put(&sbi->s_kobj); 796 kobject_put(&sbi->s_kobj);
781 wait_for_completion(&sbi->s_kobj_unregister); 797 wait_for_completion(&sbi->s_kobj_unregister);
@@ -801,32 +817,43 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
801 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 817 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
802 INIT_LIST_HEAD(&ei->i_prealloc_list); 818 INIT_LIST_HEAD(&ei->i_prealloc_list);
803 spin_lock_init(&ei->i_prealloc_lock); 819 spin_lock_init(&ei->i_prealloc_lock);
804 /*
805 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
806 * therefore it can be null here. Don't check it, just initialize
807 * jinode.
808 */
809 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
810 ei->i_reserved_data_blocks = 0; 820 ei->i_reserved_data_blocks = 0;
811 ei->i_reserved_meta_blocks = 0; 821 ei->i_reserved_meta_blocks = 0;
812 ei->i_allocated_meta_blocks = 0; 822 ei->i_allocated_meta_blocks = 0;
813 ei->i_da_metadata_calc_len = 0; 823 ei->i_da_metadata_calc_len = 0;
814 ei->i_delalloc_reserved_flag = 0;
815 spin_lock_init(&(ei->i_block_reservation_lock)); 824 spin_lock_init(&(ei->i_block_reservation_lock));
816#ifdef CONFIG_QUOTA 825#ifdef CONFIG_QUOTA
817 ei->i_reserved_quota = 0; 826 ei->i_reserved_quota = 0;
818#endif 827#endif
828 ei->jinode = NULL;
819 INIT_LIST_HEAD(&ei->i_completed_io_list); 829 INIT_LIST_HEAD(&ei->i_completed_io_list);
820 spin_lock_init(&ei->i_completed_io_lock); 830 spin_lock_init(&ei->i_completed_io_lock);
821 ei->cur_aio_dio = NULL; 831 ei->cur_aio_dio = NULL;
822 ei->i_sync_tid = 0; 832 ei->i_sync_tid = 0;
823 ei->i_datasync_tid = 0; 833 ei->i_datasync_tid = 0;
834 atomic_set(&ei->i_ioend_count, 0);
824 835
825 return &ei->vfs_inode; 836 return &ei->vfs_inode;
826} 837}
827 838
839static int ext4_drop_inode(struct inode *inode)
840{
841 int drop = generic_drop_inode(inode);
842
843 trace_ext4_drop_inode(inode, drop);
844 return drop;
845}
846
847static void ext4_i_callback(struct rcu_head *head)
848{
849 struct inode *inode = container_of(head, struct inode, i_rcu);
850 INIT_LIST_HEAD(&inode->i_dentry);
851 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
852}
853
828static void ext4_destroy_inode(struct inode *inode) 854static void ext4_destroy_inode(struct inode *inode)
829{ 855{
856 ext4_ioend_wait(inode);
830 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 857 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
831 ext4_msg(inode->i_sb, KERN_ERR, 858 ext4_msg(inode->i_sb, KERN_ERR,
832 "Inode %lu (%p): orphan list check failed!", 859 "Inode %lu (%p): orphan list check failed!",
@@ -836,7 +863,7 @@ static void ext4_destroy_inode(struct inode *inode)
836 true); 863 true);
837 dump_stack(); 864 dump_stack();
838 } 865 }
839 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 866 call_rcu(&inode->i_rcu, ext4_i_callback);
840} 867}
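
Freeing the in-memory inode via call_rcu() (through ext4_i_callback() above) pairs with the VFS's lockless path walking: a walker may still dereference the inode after the last reference is dropped, so the memory has to survive one RCU grace period. The generic shape of the pattern, with an illustrative struct:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_obj {
	struct rcu_head rcu;
	int payload;
};

static void demo_obj_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_obj, rcu));
}

static void demo_obj_release(struct demo_obj *obj)
{
	/* Readers inside rcu_read_lock() keep seeing valid memory
	 * until a grace period has elapsed. */
	call_rcu(&obj->rcu, demo_obj_free_rcu);
}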
841 868
842static void init_once(void *foo) 869static void init_once(void *foo)
@@ -874,9 +901,12 @@ void ext4_clear_inode(struct inode *inode)
874 end_writeback(inode); 901 end_writeback(inode);
875 dquot_drop(inode); 902 dquot_drop(inode);
876 ext4_discard_preallocations(inode); 903 ext4_discard_preallocations(inode);
877 if (EXT4_JOURNAL(inode)) 904 if (EXT4_I(inode)->jinode) {
878 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 905 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
879 &EXT4_I(inode)->jinode); 906 EXT4_I(inode)->jinode);
907 jbd2_free_inode(EXT4_I(inode)->jinode);
908 EXT4_I(inode)->jinode = NULL;
909 }
880} 910}
881 911
882static inline void ext4_show_quota_options(struct seq_file *seq, 912static inline void ext4_show_quota_options(struct seq_file *seq,
@@ -1009,6 +1039,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1009 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1039 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1010 seq_puts(seq, ",nodelalloc"); 1040 seq_puts(seq, ",nodelalloc");
1011 1041
1042 if (test_opt(sb, MBLK_IO_SUBMIT))
1043 seq_puts(seq, ",mblk_io_submit");
1012 if (sbi->s_stripe) 1044 if (sbi->s_stripe)
1013 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1045 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1014 /* 1046 /*
@@ -1045,6 +1077,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1045 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) 1077 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1046 seq_puts(seq, ",block_validity"); 1078 seq_puts(seq, ",block_validity");
1047 1079
1080 if (!test_opt(sb, INIT_INODE_TABLE))
1081 seq_puts(seq, ",noinit_inode_table");
1082 else if (sbi->s_li_wait_mult)
1083 seq_printf(seq, ",init_inode_table=%u",
1084 (unsigned) sbi->s_li_wait_mult);
1085
1048 ext4_show_quota_options(seq, sb); 1086 ext4_show_quota_options(seq, sb);
1049 1087
1050 return 0; 1088 return 0;
@@ -1123,7 +1161,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1123static int ext4_mark_dquot_dirty(struct dquot *dquot); 1161static int ext4_mark_dquot_dirty(struct dquot *dquot);
1124static int ext4_write_info(struct super_block *sb, int type); 1162static int ext4_write_info(struct super_block *sb, int type);
1125static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1163static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1126 char *path); 1164 struct path *path);
1127static int ext4_quota_off(struct super_block *sb, int type); 1165static int ext4_quota_off(struct super_block *sb, int type);
1128static int ext4_quota_on_mount(struct super_block *sb, int type); 1166static int ext4_quota_on_mount(struct super_block *sb, int type);
1129static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1167static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -1160,6 +1198,7 @@ static const struct super_operations ext4_sops = {
1160 .destroy_inode = ext4_destroy_inode, 1198 .destroy_inode = ext4_destroy_inode,
1161 .write_inode = ext4_write_inode, 1199 .write_inode = ext4_write_inode,
1162 .dirty_inode = ext4_dirty_inode, 1200 .dirty_inode = ext4_dirty_inode,
1201 .drop_inode = ext4_drop_inode,
1163 .evict_inode = ext4_evict_inode, 1202 .evict_inode = ext4_evict_inode,
1164 .put_super = ext4_put_super, 1203 .put_super = ext4_put_super,
1165 .sync_fs = ext4_sync_fs, 1204 .sync_fs = ext4_sync_fs,
@@ -1180,6 +1219,7 @@ static const struct super_operations ext4_nojournal_sops = {
1180 .destroy_inode = ext4_destroy_inode, 1219 .destroy_inode = ext4_destroy_inode,
1181 .write_inode = ext4_write_inode, 1220 .write_inode = ext4_write_inode,
1182 .dirty_inode = ext4_dirty_inode, 1221 .dirty_inode = ext4_dirty_inode,
1222 .drop_inode = ext4_drop_inode,
1183 .evict_inode = ext4_evict_inode, 1223 .evict_inode = ext4_evict_inode,
1184 .write_super = ext4_write_super, 1224 .write_super = ext4_write_super,
1185 .put_super = ext4_put_super, 1225 .put_super = ext4_put_super,
@@ -1214,11 +1254,12 @@ enum {
1214 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1254 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1215 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1255 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
1216 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1256 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
1217 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1257 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1218 Opt_block_validity, Opt_noblock_validity, 1258 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1219 Opt_inode_readahead_blks, Opt_journal_ioprio, 1259 Opt_inode_readahead_blks, Opt_journal_ioprio,
1220 Opt_dioread_nolock, Opt_dioread_lock, 1260 Opt_dioread_nolock, Opt_dioread_lock,
1221 Opt_discard, Opt_nodiscard, 1261 Opt_discard, Opt_nodiscard,
1262 Opt_init_inode_table, Opt_noinit_inode_table,
1222}; 1263};
1223 1264
1224static const match_table_t tokens = { 1265static const match_table_t tokens = {
@@ -1278,6 +1319,8 @@ static const match_table_t tokens = {
1278 {Opt_resize, "resize"}, 1319 {Opt_resize, "resize"},
1279 {Opt_delalloc, "delalloc"}, 1320 {Opt_delalloc, "delalloc"},
1280 {Opt_nodelalloc, "nodelalloc"}, 1321 {Opt_nodelalloc, "nodelalloc"},
1322 {Opt_mblk_io_submit, "mblk_io_submit"},
1323 {Opt_nomblk_io_submit, "nomblk_io_submit"},
1281 {Opt_block_validity, "block_validity"}, 1324 {Opt_block_validity, "block_validity"},
1282 {Opt_noblock_validity, "noblock_validity"}, 1325 {Opt_noblock_validity, "noblock_validity"},
1283 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1326 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1289,6 +1332,9 @@ static const match_table_t tokens = {
1289 {Opt_dioread_lock, "dioread_lock"}, 1332 {Opt_dioread_lock, "dioread_lock"},
1290 {Opt_discard, "discard"}, 1333 {Opt_discard, "discard"},
1291 {Opt_nodiscard, "nodiscard"}, 1334 {Opt_nodiscard, "nodiscard"},
1335 {Opt_init_inode_table, "init_itable=%u"},
1336 {Opt_init_inode_table, "init_itable"},
1337 {Opt_noinit_inode_table, "noinit_itable"},
1292 {Opt_err, NULL}, 1338 {Opt_err, NULL},
1293}; 1339};
1294 1340
@@ -1353,7 +1399,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1353 sbi->s_qf_names[qtype] = NULL; 1399 sbi->s_qf_names[qtype] = NULL;
1354 return 0; 1400 return 0;
1355 } 1401 }
1356 set_opt(sbi->s_mount_opt, QUOTA); 1402 set_opt(sb, QUOTA);
1357 return 1; 1403 return 1;
1358} 1404}
1359 1405
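
The wholesale set_opt()/clear_opt() rewrites in the hunks below stem from a signature change in ext4.h that is not part of this file's diff: the macros now take the superblock itself rather than the s_mount_opt word, clearing the way for the second options word (s_mount_opt2) that shows up in the mount-time debug printk later in this patch. Approximately, as an assumption about the header:

/* Assumed new ext4.h definitions -- the header is outside this diff. */
#define set_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_##opt)
#define clear_opt(sb, opt)	(EXT4_SB(sb)->s_mount_opt &= ~EXT4_MOUNT_##opt)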
@@ -1408,21 +1454,21 @@ static int parse_options(char *options, struct super_block *sb,
1408 switch (token) { 1454 switch (token) {
1409 case Opt_bsd_df: 1455 case Opt_bsd_df:
1410 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1456 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1411 clear_opt(sbi->s_mount_opt, MINIX_DF); 1457 clear_opt(sb, MINIX_DF);
1412 break; 1458 break;
1413 case Opt_minix_df: 1459 case Opt_minix_df:
1414 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1460 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1415 set_opt(sbi->s_mount_opt, MINIX_DF); 1461 set_opt(sb, MINIX_DF);
1416 1462
1417 break; 1463 break;
1418 case Opt_grpid: 1464 case Opt_grpid:
1419 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1465 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1420 set_opt(sbi->s_mount_opt, GRPID); 1466 set_opt(sb, GRPID);
1421 1467
1422 break; 1468 break;
1423 case Opt_nogrpid: 1469 case Opt_nogrpid:
1424 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1470 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1425 clear_opt(sbi->s_mount_opt, GRPID); 1471 clear_opt(sb, GRPID);
1426 1472
1427 break; 1473 break;
1428 case Opt_resuid: 1474 case Opt_resuid:
@@ -1440,38 +1486,38 @@ static int parse_options(char *options, struct super_block *sb,
1440 /* *sb_block = match_int(&args[0]); */ 1486 /* *sb_block = match_int(&args[0]); */
1441 break; 1487 break;
1442 case Opt_err_panic: 1488 case Opt_err_panic:
1443 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1489 clear_opt(sb, ERRORS_CONT);
1444 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1490 clear_opt(sb, ERRORS_RO);
1445 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1491 set_opt(sb, ERRORS_PANIC);
1446 break; 1492 break;
1447 case Opt_err_ro: 1493 case Opt_err_ro:
1448 clear_opt(sbi->s_mount_opt, ERRORS_CONT); 1494 clear_opt(sb, ERRORS_CONT);
1449 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1495 clear_opt(sb, ERRORS_PANIC);
1450 set_opt(sbi->s_mount_opt, ERRORS_RO); 1496 set_opt(sb, ERRORS_RO);
1451 break; 1497 break;
1452 case Opt_err_cont: 1498 case Opt_err_cont:
1453 clear_opt(sbi->s_mount_opt, ERRORS_RO); 1499 clear_opt(sb, ERRORS_RO);
1454 clear_opt(sbi->s_mount_opt, ERRORS_PANIC); 1500 clear_opt(sb, ERRORS_PANIC);
1455 set_opt(sbi->s_mount_opt, ERRORS_CONT); 1501 set_opt(sb, ERRORS_CONT);
1456 break; 1502 break;
1457 case Opt_nouid32: 1503 case Opt_nouid32:
1458 set_opt(sbi->s_mount_opt, NO_UID32); 1504 set_opt(sb, NO_UID32);
1459 break; 1505 break;
1460 case Opt_debug: 1506 case Opt_debug:
1461 set_opt(sbi->s_mount_opt, DEBUG); 1507 set_opt(sb, DEBUG);
1462 break; 1508 break;
1463 case Opt_oldalloc: 1509 case Opt_oldalloc:
1464 set_opt(sbi->s_mount_opt, OLDALLOC); 1510 set_opt(sb, OLDALLOC);
1465 break; 1511 break;
1466 case Opt_orlov: 1512 case Opt_orlov:
1467 clear_opt(sbi->s_mount_opt, OLDALLOC); 1513 clear_opt(sb, OLDALLOC);
1468 break; 1514 break;
1469#ifdef CONFIG_EXT4_FS_XATTR 1515#ifdef CONFIG_EXT4_FS_XATTR
1470 case Opt_user_xattr: 1516 case Opt_user_xattr:
1471 set_opt(sbi->s_mount_opt, XATTR_USER); 1517 set_opt(sb, XATTR_USER);
1472 break; 1518 break;
1473 case Opt_nouser_xattr: 1519 case Opt_nouser_xattr:
1474 clear_opt(sbi->s_mount_opt, XATTR_USER); 1520 clear_opt(sb, XATTR_USER);
1475 break; 1521 break;
1476#else 1522#else
1477 case Opt_user_xattr: 1523 case Opt_user_xattr:
@@ -1481,10 +1527,10 @@ static int parse_options(char *options, struct super_block *sb,
1481#endif 1527#endif
1482#ifdef CONFIG_EXT4_FS_POSIX_ACL 1528#ifdef CONFIG_EXT4_FS_POSIX_ACL
1483 case Opt_acl: 1529 case Opt_acl:
1484 set_opt(sbi->s_mount_opt, POSIX_ACL); 1530 set_opt(sb, POSIX_ACL);
1485 break; 1531 break;
1486 case Opt_noacl: 1532 case Opt_noacl:
1487 clear_opt(sbi->s_mount_opt, POSIX_ACL); 1533 clear_opt(sb, POSIX_ACL);
1488 break; 1534 break;
1489#else 1535#else
1490 case Opt_acl: 1536 case Opt_acl:
@@ -1503,7 +1549,7 @@ static int parse_options(char *options, struct super_block *sb,
1503 "Cannot specify journal on remount"); 1549 "Cannot specify journal on remount");
1504 return 0; 1550 return 0;
1505 } 1551 }
1506 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1552 set_opt(sb, UPDATE_JOURNAL);
1507 break; 1553 break;
1508 case Opt_journal_dev: 1554 case Opt_journal_dev:
1509 if (is_remount) { 1555 if (is_remount) {
@@ -1516,14 +1562,14 @@ static int parse_options(char *options, struct super_block *sb,
1516 *journal_devnum = option; 1562 *journal_devnum = option;
1517 break; 1563 break;
1518 case Opt_journal_checksum: 1564 case Opt_journal_checksum:
1519 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1565 set_opt(sb, JOURNAL_CHECKSUM);
1520 break; 1566 break;
1521 case Opt_journal_async_commit: 1567 case Opt_journal_async_commit:
1522 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1568 set_opt(sb, JOURNAL_ASYNC_COMMIT);
1523 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1569 set_opt(sb, JOURNAL_CHECKSUM);
1524 break; 1570 break;
1525 case Opt_noload: 1571 case Opt_noload:
1526 set_opt(sbi->s_mount_opt, NOLOAD); 1572 set_opt(sb, NOLOAD);
1527 break; 1573 break;
1528 case Opt_commit: 1574 case Opt_commit:
1529 if (match_int(&args[0], &option)) 1575 if (match_int(&args[0], &option))
@@ -1566,15 +1612,15 @@ static int parse_options(char *options, struct super_block *sb,
1566 return 0; 1612 return 0;
1567 } 1613 }
1568 } else { 1614 } else {
1569 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 1615 clear_opt(sb, DATA_FLAGS);
1570 sbi->s_mount_opt |= data_opt; 1616 sbi->s_mount_opt |= data_opt;
1571 } 1617 }
1572 break; 1618 break;
1573 case Opt_data_err_abort: 1619 case Opt_data_err_abort:
1574 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1620 set_opt(sb, DATA_ERR_ABORT);
1575 break; 1621 break;
1576 case Opt_data_err_ignore: 1622 case Opt_data_err_ignore:
1577 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1623 clear_opt(sb, DATA_ERR_ABORT);
1578 break; 1624 break;
1579#ifdef CONFIG_QUOTA 1625#ifdef CONFIG_QUOTA
1580 case Opt_usrjquota: 1626 case Opt_usrjquota:
@@ -1614,12 +1660,12 @@ set_qf_format:
1614 break; 1660 break;
1615 case Opt_quota: 1661 case Opt_quota:
1616 case Opt_usrquota: 1662 case Opt_usrquota:
1617 set_opt(sbi->s_mount_opt, QUOTA); 1663 set_opt(sb, QUOTA);
1618 set_opt(sbi->s_mount_opt, USRQUOTA); 1664 set_opt(sb, USRQUOTA);
1619 break; 1665 break;
1620 case Opt_grpquota: 1666 case Opt_grpquota:
1621 set_opt(sbi->s_mount_opt, QUOTA); 1667 set_opt(sb, QUOTA);
1622 set_opt(sbi->s_mount_opt, GRPQUOTA); 1668 set_opt(sb, GRPQUOTA);
1623 break; 1669 break;
1624 case Opt_noquota: 1670 case Opt_noquota:
1625 if (sb_any_quota_loaded(sb)) { 1671 if (sb_any_quota_loaded(sb)) {
@@ -1627,9 +1673,9 @@ set_qf_format:
1627 "options when quota turned on"); 1673 "options when quota turned on");
1628 return 0; 1674 return 0;
1629 } 1675 }
1630 clear_opt(sbi->s_mount_opt, QUOTA); 1676 clear_opt(sb, QUOTA);
1631 clear_opt(sbi->s_mount_opt, USRQUOTA); 1677 clear_opt(sb, USRQUOTA);
1632 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1678 clear_opt(sb, GRPQUOTA);
1633 break; 1679 break;
1634#else 1680#else
1635 case Opt_quota: 1681 case Opt_quota:
@@ -1655,7 +1701,7 @@ set_qf_format:
1655 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; 1701 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1656 break; 1702 break;
1657 case Opt_nobarrier: 1703 case Opt_nobarrier:
1658 clear_opt(sbi->s_mount_opt, BARRIER); 1704 clear_opt(sb, BARRIER);
1659 break; 1705 break;
1660 case Opt_barrier: 1706 case Opt_barrier:
1661 if (args[0].from) { 1707 if (args[0].from) {
@@ -1664,9 +1710,9 @@ set_qf_format:
1664 } else 1710 } else
1665 option = 1; /* No argument, default to 1 */ 1711 option = 1; /* No argument, default to 1 */
1666 if (option) 1712 if (option)
1667 set_opt(sbi->s_mount_opt, BARRIER); 1713 set_opt(sb, BARRIER);
1668 else 1714 else
1669 clear_opt(sbi->s_mount_opt, BARRIER); 1715 clear_opt(sb, BARRIER);
1670 break; 1716 break;
1671 case Opt_ignore: 1717 case Opt_ignore:
1672 break; 1718 break;
@@ -1690,11 +1736,17 @@ set_qf_format:
1690 "Ignoring deprecated bh option"); 1736 "Ignoring deprecated bh option");
1691 break; 1737 break;
1692 case Opt_i_version: 1738 case Opt_i_version:
1693 set_opt(sbi->s_mount_opt, I_VERSION); 1739 set_opt(sb, I_VERSION);
1694 sb->s_flags |= MS_I_VERSION; 1740 sb->s_flags |= MS_I_VERSION;
1695 break; 1741 break;
1696 case Opt_nodelalloc: 1742 case Opt_nodelalloc:
1697 clear_opt(sbi->s_mount_opt, DELALLOC); 1743 clear_opt(sb, DELALLOC);
1744 break;
1745 case Opt_mblk_io_submit:
1746 set_opt(sb, MBLK_IO_SUBMIT);
1747 break;
1748 case Opt_nomblk_io_submit:
1749 clear_opt(sb, MBLK_IO_SUBMIT);
1698 break; 1750 break;
1699 case Opt_stripe: 1751 case Opt_stripe:
1700 if (match_int(&args[0], &option)) 1752 if (match_int(&args[0], &option))
@@ -1704,13 +1756,13 @@ set_qf_format:
1704 sbi->s_stripe = option; 1756 sbi->s_stripe = option;
1705 break; 1757 break;
1706 case Opt_delalloc: 1758 case Opt_delalloc:
1707 set_opt(sbi->s_mount_opt, DELALLOC); 1759 set_opt(sb, DELALLOC);
1708 break; 1760 break;
1709 case Opt_block_validity: 1761 case Opt_block_validity:
1710 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1762 set_opt(sb, BLOCK_VALIDITY);
1711 break; 1763 break;
1712 case Opt_noblock_validity: 1764 case Opt_noblock_validity:
1713 clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 1765 clear_opt(sb, BLOCK_VALIDITY);
1714 break; 1766 break;
1715 case Opt_inode_readahead_blks: 1767 case Opt_inode_readahead_blks:
1716 if (match_int(&args[0], &option)) 1768 if (match_int(&args[0], &option))
@@ -1734,7 +1786,7 @@ set_qf_format:
1734 option); 1786 option);
1735 break; 1787 break;
1736 case Opt_noauto_da_alloc: 1788 case Opt_noauto_da_alloc:
1737 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1789 set_opt(sb, NO_AUTO_DA_ALLOC);
1738 break; 1790 break;
1739 case Opt_auto_da_alloc: 1791 case Opt_auto_da_alloc:
1740 if (args[0].from) { 1792 if (args[0].from) {
@@ -1743,21 +1795,35 @@ set_qf_format:
1743 } else 1795 } else
1744 option = 1; /* No argument, default to 1 */ 1796 option = 1; /* No argument, default to 1 */
1745 if (option) 1797 if (option)
1746 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1798 clear_opt(sb, NO_AUTO_DA_ALLOC);
1747 else 1799 else
1748 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1800 set_opt(sb, NO_AUTO_DA_ALLOC);
1749 break; 1801 break;
1750 case Opt_discard: 1802 case Opt_discard:
1751 set_opt(sbi->s_mount_opt, DISCARD); 1803 set_opt(sb, DISCARD);
1752 break; 1804 break;
1753 case Opt_nodiscard: 1805 case Opt_nodiscard:
1754 clear_opt(sbi->s_mount_opt, DISCARD); 1806 clear_opt(sb, DISCARD);
1755 break; 1807 break;
1756 case Opt_dioread_nolock: 1808 case Opt_dioread_nolock:
1757 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1809 set_opt(sb, DIOREAD_NOLOCK);
1758 break; 1810 break;
1759 case Opt_dioread_lock: 1811 case Opt_dioread_lock:
1760 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1812 clear_opt(sb, DIOREAD_NOLOCK);
1813 break;
1814 case Opt_init_inode_table:
1815 set_opt(sb, INIT_INODE_TABLE);
1816 if (args[0].from) {
1817 if (match_int(&args[0], &option))
1818 return 0;
1819 } else
1820 option = EXT4_DEF_LI_WAIT_MULT;
1821 if (option < 0)
1822 return 0;
1823 sbi->s_li_wait_mult = option;
1824 break;
1825 case Opt_noinit_inode_table:
1826 clear_opt(sb, INIT_INODE_TABLE);
1761 break; 1827 break;
1762 default: 1828 default:
1763 ext4_msg(sb, KERN_ERR, 1829 ext4_msg(sb, KERN_ERR,
@@ -1769,10 +1835,10 @@ set_qf_format:
1769#ifdef CONFIG_QUOTA 1835#ifdef CONFIG_QUOTA
1770 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1836 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1771 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) 1837 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1772 clear_opt(sbi->s_mount_opt, USRQUOTA); 1838 clear_opt(sb, USRQUOTA);
1773 1839
1774 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) 1840 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1775 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1841 clear_opt(sb, GRPQUOTA);
1776 1842
1777 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { 1843 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1778 ext4_msg(sb, KERN_ERR, "old and new quota " 1844 ext4_msg(sb, KERN_ERR, "old and new quota "
@@ -1842,12 +1908,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1842 ext4_commit_super(sb, 1); 1908 ext4_commit_super(sb, 1);
1843 if (test_opt(sb, DEBUG)) 1909 if (test_opt(sb, DEBUG))
1844 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 1910 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1845 "bpg=%lu, ipg=%lu, mo=%04x]\n", 1911 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1846 sb->s_blocksize, 1912 sb->s_blocksize,
1847 sbi->s_groups_count, 1913 sbi->s_groups_count,
1848 EXT4_BLOCKS_PER_GROUP(sb), 1914 EXT4_BLOCKS_PER_GROUP(sb),
1849 EXT4_INODES_PER_GROUP(sb), 1915 EXT4_INODES_PER_GROUP(sb),
1850 sbi->s_mount_opt); 1916 sbi->s_mount_opt, sbi->s_mount_opt2);
1851 1917
1852 return res; 1918 return res;
1853} 1919}
@@ -1877,14 +1943,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
1877 size = flex_group_count * sizeof(struct flex_groups); 1943 size = flex_group_count * sizeof(struct flex_groups);
1878 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 1944 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
1879 if (sbi->s_flex_groups == NULL) { 1945 if (sbi->s_flex_groups == NULL) {
1880 sbi->s_flex_groups = vmalloc(size); 1946 sbi->s_flex_groups = vzalloc(size);
1881 if (sbi->s_flex_groups) 1947 if (sbi->s_flex_groups == NULL) {
1882 memset(sbi->s_flex_groups, 0, size); 1948 ext4_msg(sb, KERN_ERR,
1883 } 1949 "not enough memory for %u flex groups",
1884 if (sbi->s_flex_groups == NULL) { 1950 flex_group_count);
1885 ext4_msg(sb, KERN_ERR, "not enough memory for " 1951 goto failed;
1886 "%u flex groups", flex_group_count); 1952 }
1887 goto failed;
1888 } 1953 }
1889 1954
1890 for (i = 0; i < sbi->s_groups_count; i++) { 1955 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -1942,7 +2007,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
1942} 2007}
1943 2008
1944/* Called at mount-time, super-block is locked */ 2009/* Called at mount-time, super-block is locked */
1945static int ext4_check_descriptors(struct super_block *sb) 2010static int ext4_check_descriptors(struct super_block *sb,
2011 ext4_group_t *first_not_zeroed)
1946{ 2012{
1947 struct ext4_sb_info *sbi = EXT4_SB(sb); 2013 struct ext4_sb_info *sbi = EXT4_SB(sb);
1948 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); 2014 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1951,7 +2017,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1951 ext4_fsblk_t inode_bitmap; 2017 ext4_fsblk_t inode_bitmap;
1952 ext4_fsblk_t inode_table; 2018 ext4_fsblk_t inode_table;
1953 int flexbg_flag = 0; 2019 int flexbg_flag = 0;
1954 ext4_group_t i; 2020 ext4_group_t i, grp = sbi->s_groups_count;
1955 2021
1956 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 2022 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1957 flexbg_flag = 1; 2023 flexbg_flag = 1;
@@ -1967,6 +2033,10 @@ static int ext4_check_descriptors(struct super_block *sb)
1967 last_block = first_block + 2033 last_block = first_block +
1968 (EXT4_BLOCKS_PER_GROUP(sb) - 1); 2034 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1969 2035
2036 if ((grp == sbi->s_groups_count) &&
2037 !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2038 grp = i;
2039
1970 block_bitmap = ext4_block_bitmap(sb, gdp); 2040 block_bitmap = ext4_block_bitmap(sb, gdp);
1971 if (block_bitmap < first_block || block_bitmap > last_block) { 2041 if (block_bitmap < first_block || block_bitmap > last_block) {
1972 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2042 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2004,6 +2074,8 @@ static int ext4_check_descriptors(struct super_block *sb)
2004 if (!flexbg_flag) 2074 if (!flexbg_flag)
2005 first_block += EXT4_BLOCKS_PER_GROUP(sb); 2075 first_block += EXT4_BLOCKS_PER_GROUP(sb);
2006 } 2076 }
2077 if (NULL != first_not_zeroed)
2078 *first_not_zeroed = grp;
2007 2079
2008 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 2080 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
2009 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); 2081 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2376,6 +2448,7 @@ static struct ext4_attr ext4_attr_##_name = { \
2376#define EXT4_ATTR(name, mode, show, store) \ 2448#define EXT4_ATTR(name, mode, show, store) \
2377static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2449static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2378 2450
2451#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
2379#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) 2452#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2380#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) 2453#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2381#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2454#define EXT4_RW_ATTR_SBI_UI(name, elname) \
@@ -2412,6 +2485,16 @@ static struct attribute *ext4_attrs[] = {
2412 NULL, 2485 NULL,
2413}; 2486};
2414 2487
2488/* Features this copy of ext4 supports */
2489EXT4_INFO_ATTR(lazy_itable_init);
2490EXT4_INFO_ATTR(batched_discard);
2491
2492static struct attribute *ext4_feat_attrs[] = {
2493 ATTR_LIST(lazy_itable_init),
2494 ATTR_LIST(batched_discard),
2495 NULL,
2496};
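
(With the "ext4" kset under fs_kobj and the "features" kobject registered later in this
patch, these adverts appear as empty files under /sys/fs/ext4/features. A minimal
userspace probe, as a sketch only; the helper below is illustrative and not part of
this patch:)

#include <stdio.h>
#include <unistd.h>

/* Return 1 if the running kernel's ext4 advertises the named feature as
 * a file under /sys/fs/ext4/features (layout assumed from the kset and
 * kobject registration in this patch). */
static int ext4_has_feature(const char *name)
{
	char path[128];

	snprintf(path, sizeof(path), "/sys/fs/ext4/features/%s", name);
	return access(path, F_OK) == 0;
}

int main(void)
{
	printf("lazy_itable_init: %s\n",
	       ext4_has_feature("lazy_itable_init") ? "yes" : "no");
	printf("batched_discard:  %s\n",
	       ext4_has_feature("batched_discard") ? "yes" : "no");
	return 0;
}
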
2497
2415static ssize_t ext4_attr_show(struct kobject *kobj, 2498static ssize_t ext4_attr_show(struct kobject *kobj,
2416 struct attribute *attr, char *buf) 2499 struct attribute *attr, char *buf)
2417{ 2500{
@@ -2440,7 +2523,6 @@ static void ext4_sb_release(struct kobject *kobj)
2440 complete(&sbi->s_kobj_unregister); 2523 complete(&sbi->s_kobj_unregister);
2441} 2524}
2442 2525
2443
2444static const struct sysfs_ops ext4_attr_ops = { 2526static const struct sysfs_ops ext4_attr_ops = {
2445 .show = ext4_attr_show, 2527 .show = ext4_attr_show,
2446 .store = ext4_attr_store, 2528 .store = ext4_attr_store,
@@ -2452,6 +2534,17 @@ static struct kobj_type ext4_ktype = {
2452 .release = ext4_sb_release, 2534 .release = ext4_sb_release,
2453}; 2535};
2454 2536
2537static void ext4_feat_release(struct kobject *kobj)
2538{
2539 complete(&ext4_feat->f_kobj_unregister);
2540}
2541
2542static struct kobj_type ext4_feat_ktype = {
2543 .default_attrs = ext4_feat_attrs,
2544 .sysfs_ops = &ext4_attr_ops,
2545 .release = ext4_feat_release,
2546};
2547
2455/* 2548/*
2456 * Check whether this filesystem can be mounted based on 2549 * Check whether this filesystem can be mounted based on
2457 * the features present and the RDONLY/RDWR mount requested. 2550 * the features present and the RDONLY/RDWR mount requested.
@@ -2542,6 +2635,368 @@ static void print_daily_error_info(unsigned long arg)
2542 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2635 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2543} 2636}
2544 2637
2638static void ext4_lazyinode_timeout(unsigned long data)
2639{
2640 struct task_struct *p = (struct task_struct *)data;
2641 wake_up_process(p);
2642}
2643
2644/* Find next suitable group and run ext4_init_inode_table */
2645static int ext4_run_li_request(struct ext4_li_request *elr)
2646{
2647 struct ext4_group_desc *gdp = NULL;
2648 ext4_group_t group, ngroups;
2649 struct super_block *sb;
2650 unsigned long timeout = 0;
2651 int ret = 0;
2652
2653 sb = elr->lr_super;
2654 ngroups = EXT4_SB(sb)->s_groups_count;
2655
2656 for (group = elr->lr_next_group; group < ngroups; group++) {
2657 gdp = ext4_get_group_desc(sb, group, NULL);
2658 if (!gdp) {
2659 ret = 1;
2660 break;
2661 }
2662
2663 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2664 break;
2665 }
2666
2667 if (group == ngroups)
2668 ret = 1;
2669
2670 if (!ret) {
2671 timeout = jiffies;
2672 ret = ext4_init_inode_table(sb, group,
2673 elr->lr_timeout ? 0 : 1);
2674 if (elr->lr_timeout == 0) {
2675 timeout = jiffies - timeout;
2676 if (elr->lr_sbi->s_li_wait_mult)
2677 timeout *= elr->lr_sbi->s_li_wait_mult;
2678 else
2679 timeout *= 20;
2680 elr->lr_timeout = timeout;
2681 }
2682 elr->lr_next_sched = jiffies + elr->lr_timeout;
2683 elr->lr_next_group = group + 1;
2684 }
2685
2686 return ret;
2687}
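
(The pacing here is adaptive: the first ext4_init_inode_table() pass for a request is
timed, and that elapsed cost, multiplied by s_li_wait_mult or by 20 when the multiplier
is unset, becomes the fixed interval for all later passes of the same request. A
standalone sketch of the same policy, with illustrative names that are not part of
this patch:)

/* Illustrative pacing helper (not kernel code): given how long one
 * zeroing pass took, return how long to wait before the next one, so
 * lazy init consumes roughly 1/mult of the device's time. */
static unsigned long li_next_timeout(unsigned long elapsed,
				     unsigned int wait_mult)
{
	return elapsed * (wait_mult ? wait_mult : 20);
}
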
2688
2689/*
2690 * Remove lr_request from the request list and free the
2691 * request structure. Should be called with li_list_mtx held
2692 */
2693static void ext4_remove_li_request(struct ext4_li_request *elr)
2694{
2695 struct ext4_sb_info *sbi;
2696
2697 if (!elr)
2698 return;
2699
2700 sbi = elr->lr_sbi;
2701
2702 list_del(&elr->lr_request);
2703 sbi->s_li_request = NULL;
2704 kfree(elr);
2705}
2706
2707static void ext4_unregister_li_request(struct super_block *sb)
2708{
2709 struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
2710
2711 if (!ext4_li_info)
2712 return;
2713
2714 mutex_lock(&ext4_li_info->li_list_mtx);
2715 ext4_remove_li_request(elr);
2716 mutex_unlock(&ext4_li_info->li_list_mtx);
2717}
2718
2719/*
2720 * This is the function where the ext4lazyinit thread lives. It walks
2721 * through the request list searching for the next scheduled filesystem.
2722 * When such a fs is found, run the lazy initialization request
2723 * (ext4_run_li_request) and keep track of the time spent in this
2724 * function. Based on that time we compute the next schedule time of
2725 * the request. When the walk through the list is complete, compute the
2726 * next wakeup time and put the thread to sleep.
2727 */
2728static int ext4_lazyinit_thread(void *arg)
2729{
2730 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2731 struct list_head *pos, *n;
2732 struct ext4_li_request *elr;
2733 unsigned long next_wakeup;
2734 DEFINE_WAIT(wait);
2735
2736 BUG_ON(NULL == eli);
2737
2738 eli->li_timer.data = (unsigned long)current;
2739 eli->li_timer.function = ext4_lazyinode_timeout;
2740
2741 eli->li_task = current;
2742 wake_up(&eli->li_wait_task);
2743
2744cont_thread:
2745 while (true) {
2746 next_wakeup = MAX_JIFFY_OFFSET;
2747
2748 mutex_lock(&eli->li_list_mtx);
2749 if (list_empty(&eli->li_request_list)) {
2750 mutex_unlock(&eli->li_list_mtx);
2751 goto exit_thread;
2752 }
2753
2754 list_for_each_safe(pos, n, &eli->li_request_list) {
2755 elr = list_entry(pos, struct ext4_li_request,
2756 lr_request);
2757
2758 if (time_after_eq(jiffies, elr->lr_next_sched)) {
2759 if (ext4_run_li_request(elr) != 0) {
2760 /* error, remove the lazy_init job */
2761 ext4_remove_li_request(elr);
2762 continue;
2763 }
2764 }
2765
2766 if (time_before(elr->lr_next_sched, next_wakeup))
2767 next_wakeup = elr->lr_next_sched;
2768 }
2769 mutex_unlock(&eli->li_list_mtx);
2770
2771 if (freezing(current))
2772 refrigerator();
2773
2774 if ((time_after_eq(jiffies, next_wakeup)) ||
2775 (MAX_JIFFY_OFFSET == next_wakeup)) {
2776 cond_resched();
2777 continue;
2778 }
2779
2780 eli->li_timer.expires = next_wakeup;
2781 add_timer(&eli->li_timer);
2782 prepare_to_wait(&eli->li_wait_daemon, &wait,
2783 TASK_INTERRUPTIBLE);
2784 if (time_before(jiffies, next_wakeup))
2785 schedule();
2786 finish_wait(&eli->li_wait_daemon, &wait);
2787 }
2788
2789exit_thread:
2790 /*
2791 * It looks like the request list is empty, but we need
2792 * to check it under the li_list_mtx lock, to prevent any
2793 * additions into it, and of course we should lock ext4_li_mtx
2794 * to atomically free the list and ext4_li_info, because at
2795 * this point another ext4 filesystem could be registering a
2796 * new one.
2797 */
2798 mutex_lock(&ext4_li_mtx);
2799 mutex_lock(&eli->li_list_mtx);
2800 if (!list_empty(&eli->li_request_list)) {
2801 mutex_unlock(&eli->li_list_mtx);
2802 mutex_unlock(&ext4_li_mtx);
2803 goto cont_thread;
2804 }
2805 mutex_unlock(&eli->li_list_mtx);
2806 del_timer_sync(&ext4_li_info->li_timer);
2807 eli->li_task = NULL;
2808 wake_up(&eli->li_wait_task);
2809
2810 kfree(ext4_li_info);
2811 ext4_li_info = NULL;
2812 mutex_unlock(&ext4_li_mtx);
2813
2814 return 0;
2815}
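
(Stripped of the timer, freezer, and wait-queue plumbing, the scheduling core of the
loop above reduces to running whatever is due, dropping requests that fail or finish,
and sleeping until the earliest lr_next_sched left. A simplified sketch of the wakeup
computation, with illustrative names that are not part of this patch:)

/* Illustrative reduction of the wakeup computation: report the earliest
 * next_sched among live requests; (unsigned long)-1 (the stand-in for
 * MAX_JIFFY_OFFSET here) means the list is effectively empty and the
 * caller can exit instead of sleeping. */
static unsigned long pick_next_wakeup(const unsigned long *next_sched,
				      const int *alive, int n)
{
	unsigned long next = (unsigned long)-1;
	int i;

	for (i = 0; i < n; i++) {
		if (!alive[i])
			continue;
		if (next_sched[i] < next)
			next = next_sched[i];
	}
	return next;
}
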
2816
2817static void ext4_clear_request_list(void)
2818{
2819 struct list_head *pos, *n;
2820 struct ext4_li_request *elr;
2821
2822 mutex_lock(&ext4_li_info->li_list_mtx);
2823 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2824 elr = list_entry(pos, struct ext4_li_request,
2825 lr_request);
2826 ext4_remove_li_request(elr);
2827 }
2828 mutex_unlock(&ext4_li_info->li_list_mtx);
2829}
2830
2831static int ext4_run_lazyinit_thread(void)
2832{
2833 struct task_struct *t;
2834
2835 t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
2836 if (IS_ERR(t)) {
2837 int err = PTR_ERR(t);
2838 ext4_clear_request_list();
2839 del_timer_sync(&ext4_li_info->li_timer);
2840 kfree(ext4_li_info);
2841 ext4_li_info = NULL;
2842 printk(KERN_CRIT "EXT4: error %d creating inode table "
2843 "initialization thread\n",
2844 err);
2845 return err;
2846 }
2847 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2848
2849 wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
2850 return 0;
2851}
2852
2853/*
2854 * Check whether it makes sense to run the itable init thread or not.
2855 * If there is at least one uninitialized inode table, return the
2856 * corresponding group number; otherwise the loop runs through all
2857 * groups and returns the total number of groups.
2858 */
2859static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
2860{
2861 ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
2862 struct ext4_group_desc *gdp = NULL;
2863
2864 for (group = 0; group < ngroups; group++) {
2865 gdp = ext4_get_group_desc(sb, group, NULL);
2866 if (!gdp)
2867 continue;
2868
2869 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2870 break;
2871 }
2872
2873 return group;
2874}
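
(The return convention matters to callers: a result equal to s_groups_count means every
inode table is already zeroed. The remount path later in this patch feeds the result
straight into ext4_register_li_request(), which bails out in that case; the condensed
sketch below, assuming kernel context, just makes that implicit bail-out visible:)

/* Condensed caller pattern (mirrors the remount hunk later in this
 * patch): only a group index below s_groups_count indicates work left
 * for the lazy init thread. */
static void maybe_register_li(struct super_block *sb)
{
	ext4_group_t first = ext4_has_uninit_itable(sb);

	if (first != EXT4_SB(sb)->s_groups_count)
		ext4_register_li_request(sb, first);
}
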
2875
2876static int ext4_li_info_new(void)
2877{
2878 struct ext4_lazy_init *eli = NULL;
2879
2880 eli = kzalloc(sizeof(*eli), GFP_KERNEL);
2881 if (!eli)
2882 return -ENOMEM;
2883
2884 eli->li_task = NULL;
2885 INIT_LIST_HEAD(&eli->li_request_list);
2886 mutex_init(&eli->li_list_mtx);
2887
2888 init_waitqueue_head(&eli->li_wait_daemon);
2889 init_waitqueue_head(&eli->li_wait_task);
2890 init_timer(&eli->li_timer);
2891 eli->li_state |= EXT4_LAZYINIT_QUIT;
2892
2893 ext4_li_info = eli;
2894
2895 return 0;
2896}
2897
2898static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
2899 ext4_group_t start)
2900{
2901 struct ext4_sb_info *sbi = EXT4_SB(sb);
2902 struct ext4_li_request *elr;
2903 unsigned long rnd;
2904
2905 elr = kzalloc(sizeof(*elr), GFP_KERNEL);
2906 if (!elr)
2907 return NULL;
2908
2909 elr->lr_super = sb;
2910 elr->lr_sbi = sbi;
2911 elr->lr_next_group = start;
2912
2913 /*
2914 * Randomize first schedule time of the request to
2915 * spread the inode table initialization requests
2916 * better.
2917 */
2918 get_random_bytes(&rnd, sizeof(rnd));
2919 elr->lr_next_sched = jiffies + (unsigned long)rnd %
2920 (EXT4_DEF_LI_MAX_START_DELAY * HZ);
2921
2922 return elr;
2923}
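
(Since many filesystems are typically mounted back to back at boot, each request gets a
pseudo-uniform start delay in [0, EXT4_DEF_LI_MAX_START_DELAY) seconds so their zeroing
passes do not begin in lockstep. The computation in isolation, with illustrative names:)

/* Illustrative form of the jitter above: rnd is raw entropy from
 * get_random_bytes(), hz converts seconds to jiffies. */
static unsigned long li_start_jitter(unsigned long rnd,
				     unsigned int max_delay_s,
				     unsigned int hz)
{
	return rnd % (max_delay_s * hz);
}
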
2924
2925static int ext4_register_li_request(struct super_block *sb,
2926 ext4_group_t first_not_zeroed)
2927{
2928 struct ext4_sb_info *sbi = EXT4_SB(sb);
2929 struct ext4_li_request *elr;
2930 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2931 int ret = 0;
2932
2933 if (sbi->s_li_request != NULL)
2934 return 0;
2935
2936 if (first_not_zeroed == ngroups ||
2937 (sb->s_flags & MS_RDONLY) ||
2938 !test_opt(sb, INIT_INODE_TABLE)) {
2939 sbi->s_li_request = NULL;
2940 return 0;
2941 }
2947
2948 elr = ext4_li_request_new(sb, first_not_zeroed);
2949 if (!elr)
2950 return -ENOMEM;
2951
2952 mutex_lock(&ext4_li_mtx);
2953
2954 if (NULL == ext4_li_info) {
2955 ret = ext4_li_info_new();
2956 if (ret)
2957 goto out;
2958 }
2959
2960 mutex_lock(&ext4_li_info->li_list_mtx);
2961 list_add(&elr->lr_request, &ext4_li_info->li_request_list);
2962 mutex_unlock(&ext4_li_info->li_list_mtx);
2963
2964 sbi->s_li_request = elr;
2965
2966 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
2967 ret = ext4_run_lazyinit_thread();
2968 if (ret)
2969 goto out;
2970 }
2971out:
2972 mutex_unlock(&ext4_li_mtx);
2973 if (ret)
2974 kfree(elr);
2975 return ret;
2976}
2977
2978/*
2979 * We do not need to lock anything since this is called on
2980 * module unload.
2981 */
2982static void ext4_destroy_lazyinit_thread(void)
2983{
2984 /*
2985 * If the thread exited earlier,
2986 * there's nothing to be done.
2987 */
2988 if (!ext4_li_info)
2989 return;
2990
2991 ext4_clear_request_list();
2992
2993 while (ext4_li_info->li_task) {
2994 wake_up(&ext4_li_info->li_wait_daemon);
2995 wait_event(ext4_li_info->li_wait_task,
2996 ext4_li_info->li_task == NULL);
2997 }
2998}
2999
2545static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3000static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2546 __releases(kernel_lock) 3001 __releases(kernel_lock)
2547 __acquires(kernel_lock) 3002 __acquires(kernel_lock)
@@ -2567,6 +3022,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2567 __u64 blocks_count; 3022 __u64 blocks_count;
2568 int err; 3023 int err;
2569 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3024 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3025 ext4_group_t first_not_zeroed;
2570 3026
2571 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 3027 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2572 if (!sbi) 3028 if (!sbi)
@@ -2588,8 +3044,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2588 sbi->s_sectors_written_start = 3044 sbi->s_sectors_written_start =
2589 part_stat_read(sb->s_bdev->bd_part, sectors[1]); 3045 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
2590 3046
2591 unlock_kernel();
2592
2593 /* Cleanup superblock name */ 3047 /* Cleanup superblock name */
2594 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3048 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
2595 *cp = '!'; 3049 *cp = '!';
@@ -2629,40 +3083,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2629 3083
2630 /* Set defaults before we parse the mount options */ 3084 /* Set defaults before we parse the mount options */
2631 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3085 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3086 set_opt(sb, INIT_INODE_TABLE);
2632 if (def_mount_opts & EXT4_DEFM_DEBUG) 3087 if (def_mount_opts & EXT4_DEFM_DEBUG)
2633 set_opt(sbi->s_mount_opt, DEBUG); 3088 set_opt(sb, DEBUG);
2634 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3089 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
2635 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", 3090 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
2636 "2.6.38"); 3091 "2.6.38");
2637 set_opt(sbi->s_mount_opt, GRPID); 3092 set_opt(sb, GRPID);
2638 } 3093 }
2639 if (def_mount_opts & EXT4_DEFM_UID16) 3094 if (def_mount_opts & EXT4_DEFM_UID16)
2640 set_opt(sbi->s_mount_opt, NO_UID32); 3095 set_opt(sb, NO_UID32);
2641#ifdef CONFIG_EXT4_FS_XATTR 3096#ifdef CONFIG_EXT4_FS_XATTR
2642 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3097 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
2643 set_opt(sbi->s_mount_opt, XATTR_USER); 3098 set_opt(sb, XATTR_USER);
2644#endif 3099#endif
2645#ifdef CONFIG_EXT4_FS_POSIX_ACL 3100#ifdef CONFIG_EXT4_FS_POSIX_ACL
2646 if (def_mount_opts & EXT4_DEFM_ACL) 3101 if (def_mount_opts & EXT4_DEFM_ACL)
2647 set_opt(sbi->s_mount_opt, POSIX_ACL); 3102 set_opt(sb, POSIX_ACL);
2648#endif 3103#endif
2649 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3104 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2650 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3105 set_opt(sb, JOURNAL_DATA);
2651 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3106 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2652 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3107 set_opt(sb, ORDERED_DATA);
2653 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 3108 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2654 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3109 set_opt(sb, WRITEBACK_DATA);
2655 3110
2656 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 3111 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2657 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 3112 set_opt(sb, ERRORS_PANIC);
2658 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) 3113 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
2659 set_opt(sbi->s_mount_opt, ERRORS_CONT); 3114 set_opt(sb, ERRORS_CONT);
2660 else 3115 else
2661 set_opt(sbi->s_mount_opt, ERRORS_RO); 3116 set_opt(sb, ERRORS_RO);
2662 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) 3117 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
2663 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); 3118 set_opt(sb, BLOCK_VALIDITY);
2664 if (def_mount_opts & EXT4_DEFM_DISCARD) 3119 if (def_mount_opts & EXT4_DEFM_DISCARD)
2665 set_opt(sbi->s_mount_opt, DISCARD); 3120 set_opt(sb, DISCARD);
2666 3121
2667 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 3122 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
2668 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 3123 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2671,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2671 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 3126 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2672 3127
2673 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) 3128 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
2674 set_opt(sbi->s_mount_opt, BARRIER); 3129 set_opt(sb, BARRIER);
2675 3130
2676 /* 3131 /*
2677 * enable delayed allocation by default 3132 * enable delayed allocation by default
@@ -2679,7 +3134,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 */ 3134 */
2680 if (!IS_EXT3_SB(sb) && 3135 if (!IS_EXT3_SB(sb) &&
2681 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3136 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
2682 set_opt(sbi->s_mount_opt, DELALLOC); 3137 set_opt(sb, DELALLOC);
2683 3138
2684 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3139 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
2685 &journal_devnum, &journal_ioprio, NULL, 0)) { 3140 &journal_devnum, &journal_ioprio, NULL, 0)) {
@@ -2831,15 +3286,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2831 * Test whether we have more sectors than will fit in sector_t, 3286 * Test whether we have more sectors than will fit in sector_t,
2832 * and whether the max offset is addressable by the page cache. 3287 * and whether the max offset is addressable by the page cache.
2833 */ 3288 */
2834 if ((ext4_blocks_count(es) > 3289 err = generic_check_addressable(sb->s_blocksize_bits,
2835 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || 3290 ext4_blocks_count(es));
2836 (ext4_blocks_count(es) > 3291 if (err) {
2837 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2838 ext4_msg(sb, KERN_ERR, "filesystem" 3292 ext4_msg(sb, KERN_ERR, "filesystem"
2839 " too large to mount safely on this system"); 3293 " too large to mount safely on this system");
2840 if (sizeof(sector_t) < 8) 3294 if (sizeof(sector_t) < 8)
2841 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3295 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2842 ret = -EFBIG; 3296 ret = err;
2843 goto failed_mount; 3297 goto failed_mount;
2844 } 3298 }
2845 3299
@@ -2908,7 +3362,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2908 goto failed_mount2; 3362 goto failed_mount2;
2909 } 3363 }
2910 } 3364 }
2911 if (!ext4_check_descriptors(sb)) { 3365 if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
2912 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3366 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
2913 goto failed_mount2; 3367 goto failed_mount2;
2914 } 3368 }
@@ -2924,6 +3378,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2924 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3378 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2925 spin_lock_init(&sbi->s_next_gen_lock); 3379 spin_lock_init(&sbi->s_next_gen_lock);
2926 3380
3381 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3382 ext4_count_free_blocks(sb));
3383 if (!err) {
3384 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3385 ext4_count_free_inodes(sb));
3386 }
3387 if (!err) {
3388 err = percpu_counter_init(&sbi->s_dirs_counter,
3389 ext4_count_dirs(sb));
3390 }
3391 if (!err) {
3392 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
3393 }
3394 if (err) {
3395 ext4_msg(sb, KERN_ERR, "insufficient memory");
3396 goto failed_mount3;
3397 }
3398
2927 sbi->s_stripe = ext4_get_stripe_size(sbi); 3399 sbi->s_stripe = ext4_get_stripe_size(sbi);
2928 sbi->s_max_writeback_mb_bump = 128; 3400 sbi->s_max_writeback_mb_bump = 128;
2929 3401
@@ -2965,8 +3437,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2965 "suppressed and not mounted read-only"); 3437 "suppressed and not mounted read-only");
2966 goto failed_mount_wq; 3438 goto failed_mount_wq;
2967 } else { 3439 } else {
2968 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 3440 clear_opt(sb, DATA_FLAGS);
2969 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 3441 set_opt(sb, WRITEBACK_DATA);
2970 sbi->s_journal = NULL; 3442 sbi->s_journal = NULL;
2971 needs_recovery = 0; 3443 needs_recovery = 0;
2972 goto no_journal; 3444 goto no_journal;
@@ -3004,9 +3476,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3004 */ 3476 */
3005 if (jbd2_journal_check_available_features 3477 if (jbd2_journal_check_available_features
3006 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) 3478 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
3007 set_opt(sbi->s_mount_opt, ORDERED_DATA); 3479 set_opt(sb, ORDERED_DATA);
3008 else 3480 else
3009 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 3481 set_opt(sb, JOURNAL_DATA);
3010 break; 3482 break;
3011 3483
3012 case EXT4_MOUNT_ORDERED_DATA: 3484 case EXT4_MOUNT_ORDERED_DATA:
@@ -3022,22 +3494,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3022 } 3494 }
3023 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3495 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3024 3496
3025no_journal: 3497 /*
3026 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3498 * The journal may have updated the bg summary counts, so we
3027 ext4_count_free_blocks(sb)); 3499 * need to update the global counters.
3028 if (!err) 3500 */
3029 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3501 percpu_counter_set(&sbi->s_freeblocks_counter,
3030 ext4_count_free_inodes(sb)); 3502 ext4_count_free_blocks(sb));
3031 if (!err) 3503 percpu_counter_set(&sbi->s_freeinodes_counter,
3032 err = percpu_counter_init(&sbi->s_dirs_counter, 3504 ext4_count_free_inodes(sb));
3033 ext4_count_dirs(sb)); 3505 percpu_counter_set(&sbi->s_dirs_counter,
3034 if (!err) 3506 ext4_count_dirs(sb));
3035 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3507 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3036 if (err) {
3037 ext4_msg(sb, KERN_ERR, "insufficient memory");
3038 goto failed_mount_wq;
3039 }
3040 3508
3509no_journal:
3041 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3510 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
3042 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3511 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3043 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3512 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3099,18 +3568,18 @@ no_journal:
3099 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { 3568 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3100 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 3569 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3101 "requested data journaling mode"); 3570 "requested data journaling mode");
3102 clear_opt(sbi->s_mount_opt, DELALLOC); 3571 clear_opt(sb, DELALLOC);
3103 } 3572 }
3104 if (test_opt(sb, DIOREAD_NOLOCK)) { 3573 if (test_opt(sb, DIOREAD_NOLOCK)) {
3105 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3574 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3106 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3575 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3107 "option - requested data journaling mode"); 3576 "option - requested data journaling mode");
3108 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3577 clear_opt(sb, DIOREAD_NOLOCK);
3109 } 3578 }
3110 if (sb->s_blocksize < PAGE_SIZE) { 3579 if (sb->s_blocksize < PAGE_SIZE) {
3111 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " 3580 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3112 "option - block size is too small"); 3581 "option - block size is too small");
3113 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 3582 clear_opt(sb, DIOREAD_NOLOCK);
3114 } 3583 }
3115 } 3584 }
3116 3585
@@ -3129,6 +3598,10 @@ no_journal:
3129 goto failed_mount4; 3598 goto failed_mount4;
3130 } 3599 }
3131 3600
3601 err = ext4_register_li_request(sb, first_not_zeroed);
3602 if (err)
3603 goto failed_mount4;
3604
3132 sbi->s_kobj.kset = ext4_kset; 3605 sbi->s_kobj.kset = ext4_kset;
3133 init_completion(&sbi->s_kobj_unregister); 3606 init_completion(&sbi->s_kobj_unregister);
3134 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, 3607 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3166,7 +3639,6 @@ no_journal:
3166 if (es->s_error_count) 3639 if (es->s_error_count)
3167 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3640 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3168 3641
3169 lock_kernel();
3170 kfree(orig_data); 3642 kfree(orig_data);
3171 return 0; 3643 return 0;
3172 3644
@@ -3184,10 +3656,6 @@ failed_mount_wq:
3184 jbd2_journal_destroy(sbi->s_journal); 3656 jbd2_journal_destroy(sbi->s_journal);
3185 sbi->s_journal = NULL; 3657 sbi->s_journal = NULL;
3186 } 3658 }
3187 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3188 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3189 percpu_counter_destroy(&sbi->s_dirs_counter);
3190 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3191failed_mount3: 3659failed_mount3:
3192 if (sbi->s_flex_groups) { 3660 if (sbi->s_flex_groups) {
3193 if (is_vmalloc_addr(sbi->s_flex_groups)) 3661 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3195,6 +3663,10 @@ failed_mount3:
3195 else 3663 else
3196 kfree(sbi->s_flex_groups); 3664 kfree(sbi->s_flex_groups);
3197 } 3665 }
3666 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3667 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3668 percpu_counter_destroy(&sbi->s_dirs_counter);
3669 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3198failed_mount2: 3670failed_mount2:
3199 for (i = 0; i < db_count; i++) 3671 for (i = 0; i < db_count; i++)
3200 brelse(sbi->s_group_desc[i]); 3672 brelse(sbi->s_group_desc[i]);
@@ -3213,7 +3685,6 @@ out_fail:
3213 sb->s_fs_info = NULL; 3685 sb->s_fs_info = NULL;
3214 kfree(sbi->s_blockgroup_lock); 3686 kfree(sbi->s_blockgroup_lock);
3215 kfree(sbi); 3687 kfree(sbi);
3216 lock_kernel();
3217out_free_orig: 3688out_free_orig:
3218 kfree(orig_data); 3689 kfree(orig_data);
3219 return ret; 3690 return ret;
@@ -3306,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3306 if (bdev == NULL) 3777 if (bdev == NULL)
3307 return NULL; 3778 return NULL;
3308 3779
3309 if (bd_claim(bdev, sb)) {
3310 ext4_msg(sb, KERN_ERR,
3311 "failed to claim external journal device");
3312 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
3313 return NULL;
3314 }
3315
3316 blocksize = sb->s_blocksize; 3780 blocksize = sb->s_blocksize;
3317 hblock = bdev_logical_block_size(bdev); 3781 hblock = bdev_logical_block_size(bdev);
3318 if (blocksize < hblock) { 3782 if (blocksize < hblock) {
@@ -3470,7 +3934,7 @@ static int ext4_load_journal(struct super_block *sb,
3470 EXT4_SB(sb)->s_journal = journal; 3934 EXT4_SB(sb)->s_journal = journal;
3471 ext4_clear_journal_err(sb, es); 3935 ext4_clear_journal_err(sb, es);
3472 3936
3473 if (journal_devnum && 3937 if (!really_read_only && journal_devnum &&
3474 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 3938 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3475 es->s_journal_dev = cpu_to_le32(journal_devnum); 3939 es->s_journal_dev = cpu_to_le32(journal_devnum);
3476 3940
@@ -3524,9 +3988,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3524 es->s_kbytes_written = 3988 es->s_kbytes_written =
3525 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 3989 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3526 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3990 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3527 &EXT4_SB(sb)->s_freeblocks_counter)); 3991 &EXT4_SB(sb)->s_freeblocks_counter));
3528 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3992 es->s_free_inodes_count =
3529 &EXT4_SB(sb)->s_freeinodes_counter)); 3993 cpu_to_le32(percpu_counter_sum_positive(
3994 &EXT4_SB(sb)->s_freeinodes_counter));
3530 sb->s_dirt = 0; 3995 sb->s_dirt = 0;
3531 BUFFER_TRACE(sbh, "marking dirty"); 3996 BUFFER_TRACE(sbh, "marking dirty");
3532 mark_buffer_dirty(sbh); 3997 mark_buffer_dirty(sbh);
@@ -3706,6 +4171,22 @@ static int ext4_unfreeze(struct super_block *sb)
3706 return 0; 4171 return 0;
3707} 4172}
3708 4173
4174/*
4175 * Structure to save mount options for ext4_remount's benefit
4176 */
4177struct ext4_mount_options {
4178 unsigned long s_mount_opt;
4179 unsigned long s_mount_opt2;
4180 uid_t s_resuid;
4181 gid_t s_resgid;
4182 unsigned long s_commit_interval;
4183 u32 s_min_batch_time, s_max_batch_time;
4184#ifdef CONFIG_QUOTA
4185 int s_jquota_fmt;
4186 char *s_qf_names[MAXQUOTAS];
4187#endif
4188};
4189
3709static int ext4_remount(struct super_block *sb, int *flags, char *data) 4190static int ext4_remount(struct super_block *sb, int *flags, char *data)
3710{ 4191{
3711 struct ext4_super_block *es; 4192 struct ext4_super_block *es;
@@ -3722,12 +4203,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3722#endif 4203#endif
3723 char *orig_data = kstrdup(data, GFP_KERNEL); 4204 char *orig_data = kstrdup(data, GFP_KERNEL);
3724 4205
3725 lock_kernel();
3726
3727 /* Store the original options */ 4206 /* Store the original options */
3728 lock_super(sb); 4207 lock_super(sb);
3729 old_sb_flags = sb->s_flags; 4208 old_sb_flags = sb->s_flags;
3730 old_opts.s_mount_opt = sbi->s_mount_opt; 4209 old_opts.s_mount_opt = sbi->s_mount_opt;
4210 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
3731 old_opts.s_resuid = sbi->s_resuid; 4211 old_opts.s_resuid = sbi->s_resuid;
3732 old_opts.s_resgid = sbi->s_resgid; 4212 old_opts.s_resgid = sbi->s_resgid;
3733 old_opts.s_commit_interval = sbi->s_commit_interval; 4213 old_opts.s_commit_interval = sbi->s_commit_interval;
@@ -3846,6 +4326,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3846 enable_quota = 1; 4326 enable_quota = 1;
3847 } 4327 }
3848 } 4328 }
4329
4330 /*
4331 * Reinitialize the lazy itable initialization thread based on
4332 * current settings
4333 */
4334 if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
4335 ext4_unregister_li_request(sb);
4336 else {
4337 ext4_group_t first_not_zeroed;
4338 first_not_zeroed = ext4_has_uninit_itable(sb);
4339 ext4_register_li_request(sb, first_not_zeroed);
4340 }
4341
3849 ext4_setup_system_zone(sb); 4342 ext4_setup_system_zone(sb);
3850 if (sbi->s_journal == NULL) 4343 if (sbi->s_journal == NULL)
3851 ext4_commit_super(sb, 1); 4344 ext4_commit_super(sb, 1);
@@ -3858,7 +4351,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3858 kfree(old_opts.s_qf_names[i]); 4351 kfree(old_opts.s_qf_names[i]);
3859#endif 4352#endif
3860 unlock_super(sb); 4353 unlock_super(sb);
3861 unlock_kernel();
3862 if (enable_quota) 4354 if (enable_quota)
3863 dquot_resume(sb, -1); 4355 dquot_resume(sb, -1);
3864 4356
@@ -3869,6 +4361,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3869restore_opts: 4361restore_opts:
3870 sb->s_flags = old_sb_flags; 4362 sb->s_flags = old_sb_flags;
3871 sbi->s_mount_opt = old_opts.s_mount_opt; 4363 sbi->s_mount_opt = old_opts.s_mount_opt;
4364 sbi->s_mount_opt2 = old_opts.s_mount_opt2;
3872 sbi->s_resuid = old_opts.s_resuid; 4365 sbi->s_resuid = old_opts.s_resuid;
3873 sbi->s_resgid = old_opts.s_resgid; 4366 sbi->s_resgid = old_opts.s_resgid;
3874 sbi->s_commit_interval = old_opts.s_commit_interval; 4367 sbi->s_commit_interval = old_opts.s_commit_interval;
@@ -3884,7 +4377,6 @@ restore_opts:
3884 } 4377 }
3885#endif 4378#endif
3886 unlock_super(sb); 4379 unlock_super(sb);
3887 unlock_kernel();
3888 kfree(orig_data); 4380 kfree(orig_data);
3889 return err; 4381 return err;
3890} 4382}
@@ -4066,27 +4558,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
4066 * Standard function to be called on quota_on 4558 * Standard function to be called on quota_on
4067 */ 4559 */
4068static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4560static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4069 char *name) 4561 struct path *path)
4070{ 4562{
4071 int err; 4563 int err;
4072 struct path path;
4073 4564
4074 if (!test_opt(sb, QUOTA)) 4565 if (!test_opt(sb, QUOTA))
4075 return -EINVAL; 4566 return -EINVAL;
4076 4567
4077 err = kern_path(name, LOOKUP_FOLLOW, &path);
4078 if (err)
4079 return err;
4080
4081 /* Quotafile not on the same filesystem? */ 4568 /* Quotafile not on the same filesystem? */
4082 if (path.mnt->mnt_sb != sb) { 4569 if (path->mnt->mnt_sb != sb)
4083 path_put(&path);
4084 return -EXDEV; 4570 return -EXDEV;
4085 }
4086 /* Journaling quota? */ 4571 /* Journaling quota? */
4087 if (EXT4_SB(sb)->s_qf_names[type]) { 4572 if (EXT4_SB(sb)->s_qf_names[type]) {
4088 /* Quotafile not in fs root? */ 4573 /* Quotafile not in fs root? */
4089 if (path.dentry->d_parent != sb->s_root) 4574 if (path->dentry->d_parent != sb->s_root)
4090 ext4_msg(sb, KERN_WARNING, 4575 ext4_msg(sb, KERN_WARNING,
4091 "Quota file not on filesystem root. " 4576 "Quota file not on filesystem root. "
4092 "Journaled quota will not work"); 4577 "Journaled quota will not work");
@@ -4097,7 +4582,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4097 * all updates to the file when we bypass pagecache... 4582 * all updates to the file when we bypass pagecache...
4098 */ 4583 */
4099 if (EXT4_SB(sb)->s_journal && 4584 if (EXT4_SB(sb)->s_journal &&
4100 ext4_should_journal_data(path.dentry->d_inode)) { 4585 ext4_should_journal_data(path->dentry->d_inode)) {
4101 /* 4586 /*
4102 * We don't need to lock updates but journal_flush() could 4587 * We don't need to lock updates but journal_flush() could
4103 * otherwise be livelocked... 4588 * otherwise be livelocked...
@@ -4105,25 +4590,19 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4105 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 4590 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
4106 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 4591 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
4107 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4592 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4108 if (err) { 4593 if (err)
4109 path_put(&path);
4110 return err; 4594 return err;
4111 }
4112 } 4595 }
4113 4596
4114 err = dquot_quota_on_path(sb, type, format_id, &path); 4597 return dquot_quota_on(sb, type, format_id, path);
4115 path_put(&path);
4116 return err;
4117} 4598}
4118 4599
4119static int ext4_quota_off(struct super_block *sb, int type) 4600static int ext4_quota_off(struct super_block *sb, int type)
4120{ 4601{
4121 /* Force all delayed allocation blocks to be allocated */ 4602 /* Force all delayed allocation blocks to be allocated.
4122 if (test_opt(sb, DELALLOC)) { 4603 * Caller already holds s_umount sem */
4123 down_read(&sb->s_umount); 4604 if (test_opt(sb, DELALLOC))
4124 sync_filesystem(sb); 4605 sync_filesystem(sb);
4125 up_read(&sb->s_umount);
4126 }
4127 4606
4128 return dquot_quota_off(sb, type); 4607 return dquot_quota_off(sb, type);
4129} 4608}
@@ -4229,17 +4708,17 @@ out:
4229 4708
4230#endif 4709#endif
4231 4710
4232static int ext4_get_sb(struct file_system_type *fs_type, int flags, 4711static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4233 const char *dev_name, void *data, struct vfsmount *mnt) 4712 const char *dev_name, void *data)
4234{ 4713{
4235 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4714 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
4236} 4715}
4237 4716
4238#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4717#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4239static struct file_system_type ext2_fs_type = { 4718static struct file_system_type ext2_fs_type = {
4240 .owner = THIS_MODULE, 4719 .owner = THIS_MODULE,
4241 .name = "ext2", 4720 .name = "ext2",
4242 .get_sb = ext4_get_sb, 4721 .mount = ext4_mount,
4243 .kill_sb = kill_block_super, 4722 .kill_sb = kill_block_super,
4244 .fs_flags = FS_REQUIRES_DEV, 4723 .fs_flags = FS_REQUIRES_DEV,
4245}; 4724};
@@ -4284,28 +4763,58 @@ static inline void unregister_as_ext3(void) { }
4284static struct file_system_type ext4_fs_type = { 4763static struct file_system_type ext4_fs_type = {
4285 .owner = THIS_MODULE, 4764 .owner = THIS_MODULE,
4286 .name = "ext4", 4765 .name = "ext4",
4287 .get_sb = ext4_get_sb, 4766 .mount = ext4_mount,
4288 .kill_sb = kill_block_super, 4767 .kill_sb = kill_block_super,
4289 .fs_flags = FS_REQUIRES_DEV, 4768 .fs_flags = FS_REQUIRES_DEV,
4290}; 4769};
4291 4770
4292static int __init init_ext4_fs(void) 4771int __init ext4_init_feat_adverts(void)
4772{
4773 struct ext4_features *ef;
4774 int ret = -ENOMEM;
4775
4776 ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
4777 if (!ef)
4778 goto out;
4779
4780 ef->f_kobj.kset = ext4_kset;
4781 init_completion(&ef->f_kobj_unregister);
4782 ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
4783 "features");
4784 if (ret) {
4785 kfree(ef);
4786 goto out;
4787 }
4788
4789 ext4_feat = ef;
4790 ret = 0;
4791out:
4792 return ret;
4793}
4794
4795static int __init ext4_init_fs(void)
4293{ 4796{
4294 int err; 4797 int err;
4295 4798
4296 ext4_check_flag_values(); 4799 ext4_check_flag_values();
4297 err = init_ext4_system_zone(); 4800 err = ext4_init_pageio();
4298 if (err) 4801 if (err)
4299 return err; 4802 return err;
4803 err = ext4_init_system_zone();
4804 if (err)
4805 goto out5;
4300 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4806 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4301 if (!ext4_kset) 4807 if (!ext4_kset)
4302 goto out4; 4808 goto out4;
4303 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4809 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4304 err = init_ext4_mballoc(); 4810
4811 err = ext4_init_feat_adverts();
4812
4813 err = ext4_init_mballoc();
4305 if (err) 4814 if (err)
4306 goto out3; 4815 goto out3;
4307 4816
4308 err = init_ext4_xattr(); 4817 err = ext4_init_xattr();
4309 if (err) 4818 if (err)
4310 goto out2; 4819 goto out2;
4311 err = init_inodecache(); 4820 err = init_inodecache();
@@ -4316,38 +4825,46 @@ static int __init init_ext4_fs(void)
4316 err = register_filesystem(&ext4_fs_type); 4825 err = register_filesystem(&ext4_fs_type);
4317 if (err) 4826 if (err)
4318 goto out; 4827 goto out;
4828
4829 ext4_li_info = NULL;
4830 mutex_init(&ext4_li_mtx);
4319 return 0; 4831 return 0;
4320out: 4832out:
4321 unregister_as_ext2(); 4833 unregister_as_ext2();
4322 unregister_as_ext3(); 4834 unregister_as_ext3();
4323 destroy_inodecache(); 4835 destroy_inodecache();
4324out1: 4836out1:
4325 exit_ext4_xattr(); 4837 ext4_exit_xattr();
4326out2: 4838out2:
4327 exit_ext4_mballoc(); 4839 ext4_exit_mballoc();
4328out3: 4840out3:
4841 kfree(ext4_feat);
4329 remove_proc_entry("fs/ext4", NULL); 4842 remove_proc_entry("fs/ext4", NULL);
4330 kset_unregister(ext4_kset); 4843 kset_unregister(ext4_kset);
4331out4: 4844out4:
4332 exit_ext4_system_zone(); 4845 ext4_exit_system_zone();
4846out5:
4847 ext4_exit_pageio();
4333 return err; 4848 return err;
4334} 4849}
4335 4850
4336static void __exit exit_ext4_fs(void) 4851static void __exit ext4_exit_fs(void)
4337{ 4852{
4853 ext4_destroy_lazyinit_thread();
4338 unregister_as_ext2(); 4854 unregister_as_ext2();
4339 unregister_as_ext3(); 4855 unregister_as_ext3();
4340 unregister_filesystem(&ext4_fs_type); 4856 unregister_filesystem(&ext4_fs_type);
4341 destroy_inodecache(); 4857 destroy_inodecache();
4342 exit_ext4_xattr(); 4858 ext4_exit_xattr();
4343 exit_ext4_mballoc(); 4859 ext4_exit_mballoc();
4344 remove_proc_entry("fs/ext4", NULL); 4860 remove_proc_entry("fs/ext4", NULL);
4345 kset_unregister(ext4_kset); 4861 kset_unregister(ext4_kset);
4346 exit_ext4_system_zone(); 4862 ext4_exit_system_zone();
4863 ext4_exit_pageio();
4347} 4864}
4348 4865
4349MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 4866MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
4350MODULE_DESCRIPTION("Fourth Extended Filesystem"); 4867MODULE_DESCRIPTION("Fourth Extended Filesystem");
4351MODULE_LICENSE("GPL"); 4868MODULE_LICENSE("GPL");
4352module_init(init_ext4_fs) 4869module_init(ext4_init_fs)
4353module_exit(exit_ext4_fs) 4870module_exit(ext4_exit_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a8cd8dff1ad..fc32176eee39 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -427,23 +427,23 @@ cleanup:
427static int 427static int
428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) 428ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
429{ 429{
430 int i_error, b_error; 430 int ret, ret2;
431 431
432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem); 432 down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
433 i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); 433 ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
434 if (i_error < 0) { 434 if (ret < 0)
435 b_error = 0; 435 goto errout;
436 } else { 436 if (buffer) {
437 if (buffer) { 437 buffer += ret;
438 buffer += i_error; 438 buffer_size -= ret;
439 buffer_size -= i_error;
440 }
441 b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
442 if (b_error < 0)
443 i_error = 0;
444 } 439 }
440 ret = ext4_xattr_block_list(dentry, buffer, buffer_size);
441 if (ret < 0)
442 goto errout;
443 ret += ret2;
444errout:
445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem); 445 up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
446 return i_error + b_error; 446 return ret;
447} 447}
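
(The rewritten ext4_xattr_list() above composes the two listings: in-inode attribute
names first, block-resident names appended after them, with the combined length or the
first error returned. The same flow in isolation, as a sketch with hypothetical callback
types that are not part of this patch:)

/* Illustrative composition of two listing callbacks (not kernel code):
 * a NULL buf means "just report the total length needed". */
static int list_two(int (*a)(char *, size_t), int (*b)(char *, size_t),
		    char *buf, size_t size)
{
	int ra, rb;

	ra = a(buf, size);
	if (ra < 0)
		return ra;
	if (buf) {
		buf += ra;
		size -= ra;
	}
	rb = b(buf, size);
	if (rb < 0)
		return rb;
	return ra + rb;
}
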
448 448
449/* 449/*
@@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
947/* 947/*
948 * ext4_xattr_set_handle() 948 * ext4_xattr_set_handle()
949 * 949 *
950 * Create, replace or remove an extended attribute for this inode. Buffer 950 * Create, replace or remove an extended attribute for this inode. Value
951 * is NULL to remove an existing extended attribute, and non-NULL to 951 * is NULL to remove an existing extended attribute, and non-NULL to
952 * either replace an existing extended attribute, or create a new extended 952 * either replace an existing extended attribute, or create a new extended
953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE 953 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
@@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1588#undef BLOCK_HASH_SHIFT 1588#undef BLOCK_HASH_SHIFT
1589 1589
1590int __init 1590int __init
1591init_ext4_xattr(void) 1591ext4_init_xattr(void)
1592{ 1592{
1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); 1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
1594 if (!ext4_xattr_cache) 1594 if (!ext4_xattr_cache)
@@ -1597,7 +1597,7 @@ init_ext4_xattr(void)
1597} 1597}
1598 1598
1599void 1599void
1600exit_ext4_xattr(void) 1600ext4_exit_xattr(void)
1601{ 1601{
1602 if (ext4_xattr_cache) 1602 if (ext4_xattr_cache)
1603 mb_cache_destroy(ext4_xattr_cache); 1603 mb_cache_destroy(ext4_xattr_cache);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e43905..1ef16520b950 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 83extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
84 struct ext4_inode *raw_inode, handle_t *handle); 84 struct ext4_inode *raw_inode, handle_t *handle);
85 85
86extern int init_ext4_xattr(void); 86extern int __init ext4_init_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void ext4_exit_xattr(void);
88 88
89extern const struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
121{ 121{
122} 122}
123 123
124static inline int 124static __init inline int
125init_ext4_xattr(void) 125ext4_init_xattr(void)
126{ 126{
127 return 0; 127 return 0;
128} 128}
129 129
130static inline void 130static inline void
131exit_ext4_xattr(void) 131ext4_exit_xattr(void)
132{ 132{
133} 133}
134 134
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index d75a77f85c28..f50408901f7e 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,7 +319,8 @@ extern struct inode *fat_build_inode(struct super_block *sb,
319 struct msdos_dir_entry *de, loff_t i_pos); 319 struct msdos_dir_entry *de, loff_t i_pos);
320extern int fat_sync_inode(struct inode *inode); 320extern int fat_sync_inode(struct inode *inode);
321extern int fat_fill_super(struct super_block *sb, void *data, int silent, 321extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322 const struct inode_operations *fs_dir_inode_ops, int isvfat); 322 const struct inode_operations *fs_dir_inode_ops,
323 int isvfat, void (*setup)(struct super_block *));
323 324
324extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
325 struct inode *i2); 326 struct inode *i2);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a3..b47d2c9f4fa1 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,8 @@ int fat_free_clusters(struct inode *inode, int cluster)
577 577
578 sb_issue_discard(sb, 578 sb_issue_discard(sb,
579 fat_clus_to_blknr(sbi, first_cl), 579 fat_clus_to_blknr(sbi, first_cl),
580 nr_clus * sbi->sec_per_clus); 580 nr_clus * sbi->sec_per_clus,
581 GFP_NOFS, 0);
581 582
582 first_cl = cluster; 583 first_cl = cluster;
583 } 584 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 830058057d33..86753fe10bd1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -14,7 +14,6 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/seq_file.h> 17#include <linux/seq_file.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include <linux/mpage.h> 19#include <linux/mpage.h>
@@ -489,8 +488,6 @@ static void fat_put_super(struct super_block *sb)
489{ 488{
490 struct msdos_sb_info *sbi = MSDOS_SB(sb); 489 struct msdos_sb_info *sbi = MSDOS_SB(sb);
491 490
492 lock_kernel();
493
494 if (sb->s_dirt) 491 if (sb->s_dirt)
495 fat_write_super(sb); 492 fat_write_super(sb);
496 493
@@ -504,8 +501,6 @@ static void fat_put_super(struct super_block *sb)
504 501
505 sb->s_fs_info = NULL; 502 sb->s_fs_info = NULL;
506 kfree(sbi); 503 kfree(sbi);
507
508 unlock_kernel();
509} 504}
510 505
511static struct kmem_cache *fat_inode_cachep; 506static struct kmem_cache *fat_inode_cachep;
@@ -519,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
519 return &ei->vfs_inode; 514 return &ei->vfs_inode;
520} 515}
521 516
522static void fat_destroy_inode(struct inode *inode) 517static void fat_i_callback(struct rcu_head *head)
523{ 518{
519 struct inode *inode = container_of(head, struct inode, i_rcu);
520 INIT_LIST_HEAD(&inode->i_dentry);
524 kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); 521 kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
525} 522}
526 523
524static void fat_destroy_inode(struct inode *inode)
525{
526 call_rcu(&inode->i_rcu, fat_i_callback);
527}
528
527static void init_once(void *foo) 529static void init_once(void *foo)
528{ 530{
529 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; 531 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
@@ -701,7 +703,6 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
701 struct fid *fid, int fh_len, int fh_type) 703 struct fid *fid, int fh_len, int fh_type)
702{ 704{
703 struct inode *inode = NULL; 705 struct inode *inode = NULL;
704 struct dentry *result;
705 u32 *fh = fid->raw; 706 u32 *fh = fid->raw;
706 707
707 if (fh_len < 5 || fh_type != 3) 708 if (fh_len < 5 || fh_type != 3)
@@ -746,10 +747,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
746 * the fat_iget lookup again. If that fails, then we are totally out 747 * the fat_iget lookup again. If that fails, then we are totally out
747 * of luck. But all that is for another day 748 * of luck. But all that is for another day
748 */ 749 */
749 result = d_obtain_alias(inode); 750 return d_obtain_alias(inode);
750 if (!IS_ERR(result))
751 result->d_op = sb->s_root->d_op;
752 return result;
753} 751}
754 752
755static int 753static int
@@ -797,8 +795,6 @@ static struct dentry *fat_get_parent(struct dentry *child)
797 brelse(bh); 795 brelse(bh);
798 796
799 parent = d_obtain_alias(inode); 797 parent = d_obtain_alias(inode);
800 if (!IS_ERR(parent))
801 parent->d_op = sb->s_root->d_op;
802out: 798out:
803 unlock_super(sb); 799 unlock_super(sb);
804 800
@@ -1242,7 +1238,8 @@ static int fat_read_root(struct inode *inode)
1242 * Read the super block of an MS-DOS FS. 1238 * Read the super block of an MS-DOS FS.
1243 */ 1239 */
1244int fat_fill_super(struct super_block *sb, void *data, int silent, 1240int fat_fill_super(struct super_block *sb, void *data, int silent,
1245 const struct inode_operations *fs_dir_inode_ops, int isvfat) 1241 const struct inode_operations *fs_dir_inode_ops, int isvfat,
1242 void (*setup)(struct super_block *))
1246{ 1243{
1247 struct inode *root_inode = NULL, *fat_inode = NULL; 1244 struct inode *root_inode = NULL, *fat_inode = NULL;
1248 struct buffer_head *bh; 1245 struct buffer_head *bh;
@@ -1278,6 +1275,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1278 if (error) 1275 if (error)
1279 goto out_fail; 1276 goto out_fail;
1280 1277
1278 setup(sb); /* flavour-specific stuff that needs options */
1279
1281 error = -EIO; 1280 error = -EIO;
1282 sb_min_blocksize(sb, 512); 1281 sb_min_blocksize(sb, 512);
1283 bh = sb_bread(sb, 0); 1282 bh = sb_bread(sb, 0);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1736f2356388..970e682ea754 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -255,10 +255,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
255 255
256 for (i = 0; i < nr_bhs; i++) { 256 for (i = 0; i < nr_bhs; i++) {
257 wait_on_buffer(bhs[i]); 257 wait_on_buffer(bhs[i]);
258 if (buffer_eopnotsupp(bhs[i])) { 258 if (!err && !buffer_uptodate(bhs[i]))
259 clear_buffer_eopnotsupp(bhs[i]);
260 err = -EOPNOTSUPP;
261 } else if (!err && !buffer_uptodate(bhs[i]))
262 err = -EIO; 259 err = -EIO;
263 } 260 }
264 return err; 261 return err;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbc94ae4fd77..711499040eb6 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,7 +148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
148 * that the existing dentry can be used. The msdos fs routines will 148 * that the existing dentry can be used. The msdos fs routines will
149 * return ENOENT or EINVAL as appropriate. 149 * return ENOENT or EINVAL as appropriate.
150 */ 150 */
151static int msdos_hash(struct dentry *dentry, struct qstr *qstr) 151static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
152 struct qstr *qstr)
152{ 153{
153 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 154 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
154 unsigned char msdos_name[MSDOS_NAME]; 155 unsigned char msdos_name[MSDOS_NAME];
@@ -164,16 +165,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
164 * Compare two msdos names. If either of the names are invalid, 165 * Compare two msdos names. If either of the names are invalid,
165 * we fall back to doing the standard name comparison. 166 * we fall back to doing the standard name comparison.
166 */ 167 */
167static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) 168static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
169 const struct dentry *dentry, const struct inode *inode,
170 unsigned int len, const char *str, const struct qstr *name)
168{ 171{
169 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 172 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
170 unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; 173 unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
171 int error; 174 int error;
172 175
173 error = msdos_format_name(a->name, a->len, a_msdos_name, options); 176 error = msdos_format_name(name->name, name->len, a_msdos_name, options);
174 if (error) 177 if (error)
175 goto old_compare; 178 goto old_compare;
176 error = msdos_format_name(b->name, b->len, b_msdos_name, options); 179 error = msdos_format_name(str, len, b_msdos_name, options);
177 if (error) 180 if (error)
178 goto old_compare; 181 goto old_compare;
179 error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); 182 error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
@@ -182,8 +185,8 @@ out:
182 185
183old_compare: 186old_compare:
184 error = 1; 187 error = 1;
185 if (a->len == b->len) 188 if (name->len == len)
186 error = memcmp(a->name, b->name, a->len); 189 error = memcmp(name->name, str, len);
187 goto out; 190 goto out;
188} 191}
189 192
@@ -224,11 +227,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
224 } 227 }
225out: 228out:
226 unlock_super(sb); 229 unlock_super(sb);
227 dentry->d_op = &msdos_dentry_operations; 230 return d_splice_alias(inode, dentry);
228 dentry = d_splice_alias(inode, dentry);
229 if (dentry)
230 dentry->d_op = &msdos_dentry_operations;
231 return dentry;
232 231
233error: 232error:
234 unlock_super(sb); 233 unlock_super(sb);
@@ -658,31 +657,29 @@ static const struct inode_operations msdos_dir_inode_operations = {
658 .getattr = fat_getattr, 657 .getattr = fat_getattr,
659}; 658};
660 659
661static int msdos_fill_super(struct super_block *sb, void *data, int silent) 660static void setup(struct super_block *sb)
662{ 661{
663 int res; 662 sb->s_d_op = &msdos_dentry_operations;
664
665 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
666 if (res)
667 return res;
668
669 sb->s_flags |= MS_NOATIME; 663 sb->s_flags |= MS_NOATIME;
670 sb->s_root->d_op = &msdos_dentry_operations;
671 return 0;
672} 664}
673 665
674static int msdos_get_sb(struct file_system_type *fs_type, 666static int msdos_fill_super(struct super_block *sb, void *data, int silent)
667{
668 return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
669 0, setup);
670}
671
672static struct dentry *msdos_mount(struct file_system_type *fs_type,
675 int flags, const char *dev_name, 673 int flags, const char *dev_name,
676 void *data, struct vfsmount *mnt) 674 void *data)
677{ 675{
678 return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super, 676 return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
679 mnt);
680} 677}
681 678
682static struct file_system_type msdos_fs_type = { 679static struct file_system_type msdos_fs_type = {
683 .owner = THIS_MODULE, 680 .owner = THIS_MODULE,
684 .name = "msdos", 681 .name = "msdos",
685 .get_sb = msdos_get_sb, 682 .mount = msdos_mount,
686 .kill_sb = kill_block_super, 683 .kill_sb = kill_block_super,
687 .fs_flags = FS_REQUIRES_DEV, 684 .fs_flags = FS_REQUIRES_DEV,
688}; 685};
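
Both FAT flavours now publish their dentry operations through the superblock instead of tagging each dentry by hand in ->lookup(): fat_fill_super() gains a setup() callback that assigns sb->s_d_op, the VFS attaches those ops to every dentry allocated under that superblock, and the open-coded d_op assignments around d_splice_alias() above disappear. A minimal userspace model of the s_d_op idea; all of the types and the d_alloc() helper here are stand-ins, not the kernel's:

    #include <stdio.h>
    #include <stdlib.h>

    /* Model: the superblock carries a default ops pointer, and every
     * dentry allocated against it inherits those ops, so lookup paths
     * no longer assign d_op by hand. */

    struct dentry_operations { const char *name; };

    struct super_block {
        const struct dentry_operations *s_d_op;
    };

    struct dentry {
        const struct dentry_operations *d_op;
        struct super_block *d_sb;
    };

    static struct dentry *d_alloc(struct super_block *sb)
    {
        struct dentry *d = calloc(1, sizeof(*d));

        if (!d)
            return NULL;
        d->d_sb = sb;
        d->d_op = sb->s_d_op;   /* inherited at allocation time */
        return d;
    }

    int main(void)
    {
        static const struct dentry_operations msdos_ops = { "msdos" };
        struct super_block sb = { .s_d_op = &msdos_ops };
        struct dentry *d = d_alloc(&sb);

        if (!d)
            return 1;
        printf("dentry ops: %s\n", d->d_op->name);
        free(d);
        return 0;
    }

Because the ops are inherited at allocation time, there is never a window in which a dentry exists without its operations, which matters once lookups can run under RCU.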
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6fcc7e71fbaa..f88f752babd9 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
43 43
44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) 44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
45{ 45{
46 if (nd->flags & LOOKUP_RCU)
47 return -ECHILD;
48
46 /* This is not negative dentry. Always valid. */ 49 /* This is not negative dentry. Always valid. */
47 if (dentry->d_inode) 50 if (dentry->d_inode)
48 return 1; 51 return 1;
@@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
51 54
52static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) 55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
53{ 56{
57 if (nd->flags & LOOKUP_RCU)
58 return -ECHILD;
59
54 /* 60 /*
55 * This is not negative dentry. Always valid. 61 * This is not negative dentry. Always valid.
56 * 62 *
@@ -85,22 +91,26 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
85} 91}
86 92
87/* returns the length of a struct qstr, ignoring trailing dots */ 93/* returns the length of a struct qstr, ignoring trailing dots */
88static unsigned int vfat_striptail_len(struct qstr *qstr) 94static unsigned int __vfat_striptail_len(unsigned int len, const char *name)
89{ 95{
90 unsigned int len = qstr->len; 96 while (len && name[len - 1] == '.')
91
92 while (len && qstr->name[len - 1] == '.')
93 len--; 97 len--;
94 return len; 98 return len;
95} 99}
96 100
101static unsigned int vfat_striptail_len(const struct qstr *qstr)
102{
103 return __vfat_striptail_len(qstr->len, qstr->name);
104}
105
97/* 106/*
98 * Compute the hash for the vfat name corresponding to the dentry. 107 * Compute the hash for the vfat name corresponding to the dentry.
99 * Note: if the name is invalid, we leave the hash code unchanged so 108 * Note: if the name is invalid, we leave the hash code unchanged so
100 * that the existing dentry can be used. The vfat fs routines will 109 * that the existing dentry can be used. The vfat fs routines will
101 * return ENOENT or EINVAL as appropriate. 110 * return ENOENT or EINVAL as appropriate.
102 */ 111 */
103static int vfat_hash(struct dentry *dentry, struct qstr *qstr) 112static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
113 struct qstr *qstr)
104{ 114{
105 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); 115 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
106 return 0; 116 return 0;
@@ -112,9 +122,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
112 * that the existing dentry can be used. The vfat fs routines will 122 * that the existing dentry can be used. The vfat fs routines will
113 * return ENOENT or EINVAL as appropriate. 123 * return ENOENT or EINVAL as appropriate.
114 */ 124 */
115static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) 125static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
126 struct qstr *qstr)
116{ 127{
117 struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; 128 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
118 const unsigned char *name; 129 const unsigned char *name;
119 unsigned int len; 130 unsigned int len;
120 unsigned long hash; 131 unsigned long hash;
@@ -133,16 +144,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
133/* 144/*
134 * Case insensitive compare of two vfat names. 145 * Case insensitive compare of two vfat names.
135 */ 146 */
136static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) 147static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
148 const struct dentry *dentry, const struct inode *inode,
149 unsigned int len, const char *str, const struct qstr *name)
137{ 150{
138 struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; 151 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
139 unsigned int alen, blen; 152 unsigned int alen, blen;
140 153
141 /* A filename cannot end in '.' or we treat it like it has none */ 154 /* A filename cannot end in '.' or we treat it like it has none */
142 alen = vfat_striptail_len(a); 155 alen = vfat_striptail_len(name);
143 blen = vfat_striptail_len(b); 156 blen = __vfat_striptail_len(len, str);
144 if (alen == blen) { 157 if (alen == blen) {
145 if (nls_strnicmp(t, a->name, b->name, alen) == 0) 158 if (nls_strnicmp(t, name->name, str, alen) == 0)
146 return 0; 159 return 0;
147 } 160 }
148 return 1; 161 return 1;
@@ -151,15 +164,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
151/* 164/*
152 * Case sensitive compare of two vfat names. 165 * Case sensitive compare of two vfat names.
153 */ 166 */
154static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) 167static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
168 const struct dentry *dentry, const struct inode *inode,
169 unsigned int len, const char *str, const struct qstr *name)
155{ 170{
156 unsigned int alen, blen; 171 unsigned int alen, blen;
157 172
158 /* A filename cannot end in '.' or we treat it like it has none */ 173 /* A filename cannot end in '.' or we treat it like it has none */
159 alen = vfat_striptail_len(a); 174 alen = vfat_striptail_len(name);
160 blen = vfat_striptail_len(b); 175 blen = __vfat_striptail_len(len, str);
161 if (alen == blen) { 176 if (alen == blen) {
162 if (strncmp(a->name, b->name, alen) == 0) 177 if (strncmp(name->name, str, alen) == 0)
163 return 0; 178 return 0;
164 } 179 }
165 return 1; 180 return 1;
@@ -757,13 +772,10 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
757 772
758out: 773out:
759 unlock_super(sb); 774 unlock_super(sb);
760 dentry->d_op = sb->s_root->d_op;
761 dentry->d_time = dentry->d_parent->d_inode->i_version; 775 dentry->d_time = dentry->d_parent->d_inode->i_version;
762 dentry = d_splice_alias(inode, dentry); 776 dentry = d_splice_alias(inode, dentry);
763 if (dentry) { 777 if (dentry)
764 dentry->d_op = sb->s_root->d_op;
765 dentry->d_time = dentry->d_parent->d_inode->i_version; 778 dentry->d_time = dentry->d_parent->d_inode->i_version;
766 }
767 return dentry; 779 return dentry;
768 780
769error: 781error:
@@ -1051,34 +1063,31 @@ static const struct inode_operations vfat_dir_inode_operations = {
1051 .getattr = fat_getattr, 1063 .getattr = fat_getattr,
1052}; 1064};
1053 1065
1054static int vfat_fill_super(struct super_block *sb, void *data, int silent) 1066static void setup(struct super_block *sb)
1055{ 1067{
1056 int res;
1057
1058 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
1059 if (res)
1060 return res;
1061
1062 if (MSDOS_SB(sb)->options.name_check != 's') 1068 if (MSDOS_SB(sb)->options.name_check != 's')
1063 sb->s_root->d_op = &vfat_ci_dentry_ops; 1069 sb->s_d_op = &vfat_ci_dentry_ops;
1064 else 1070 else
1065 sb->s_root->d_op = &vfat_dentry_ops; 1071 sb->s_d_op = &vfat_dentry_ops;
1072}
1066 1073
1067 return 0; 1074static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1075{
1076 return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
1077 1, setup);
1068} 1078}
1069 1079
1070static int vfat_get_sb(struct file_system_type *fs_type, 1080static struct dentry *vfat_mount(struct file_system_type *fs_type,
1071 int flags, const char *dev_name, 1081 int flags, const char *dev_name,
1072 void *data, struct vfsmount *mnt) 1082 void *data)
1073{ 1083{
1074 return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super, 1084 return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
1075 mnt);
1076} 1085}
1077 1086
1078static struct file_system_type vfat_fs_type = { 1087static struct file_system_type vfat_fs_type = {
1079 .owner = THIS_MODULE, 1088 .owner = THIS_MODULE,
1080 .name = "vfat", 1089 .name = "vfat",
1081 .get_sb = vfat_get_sb, 1090 .mount = vfat_mount,
1082 .kill_sb = kill_block_super, 1091 .kill_sb = kill_block_super,
1083 .fs_flags = FS_REQUIRES_DEV, 1092 .fs_flags = FS_REQUIRES_DEV,
1084}; 1093};
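
Two things happen in the vfat conversion above: the d_revalidate hooks now return -ECHILD when invoked in RCU-walk mode (nd->flags & LOOKUP_RCU), telling the VFS to retry the lookup in ref-walk where sleeping is allowed, and d_compare receives the candidate name as a raw (len, str) pair alongside the qstr, which is why __vfat_striptail_len() is split out. The comparison rule itself, trailing dots ignored, as a standalone sketch (plain C, not the kernel code):

    #include <stdio.h>
    #include <string.h>

    /* Mirror of __vfat_striptail_len(): a name's trailing dots do not
     * count toward its length. */
    static unsigned int striptail_len(unsigned int len, const char *name)
    {
        while (len && name[len - 1] == '.')
            len--;
        return len;
    }

    /* Case-sensitive variant, shaped like vfat_cmp(): 0 on match. */
    static int vfat_style_cmp(const char *a, const char *b)
    {
        unsigned int alen = striptail_len(strlen(a), a);
        unsigned int blen = striptail_len(strlen(b), b);

        if (alen == blen && strncmp(a, b, alen) == 0)
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("%d\n", vfat_style_cmp("FOO", "FOO..."));  /* 0: equal */
        printf("%d\n", vfat_style_cmp("FOO", "FOOD"));    /* 1: differ */
        return 0;
    }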
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f8cc34f542c3..cb1026181bdc 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head)
640 * match the state "is the filp on a fasync list". 640 * match the state "is the filp on a fasync list".
641 * 641 *
642 */ 642 */
643static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 643int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
644{ 644{
645 struct fasync_struct *fa, **fp; 645 struct fasync_struct *fa, **fp;
646 int result = 0; 646 int result = 0;
@@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
666 return result; 666 return result;
667} 667}
668 668
669struct fasync_struct *fasync_alloc(void)
670{
671 return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
672}
673
669/* 674/*
670 * Add a fasync entry. Return negative on error, positive if 675 * NOTE! This can be used only for unused fasync entries:
671 * added, and zero if did nothing but change an existing one. 676 * entries that actually got inserted on the fasync list
677 * need to be released by rcu - see fasync_remove_entry.
678 */
679void fasync_free(struct fasync_struct *new)
680{
681 kmem_cache_free(fasync_cache, new);
682}
683
684/*
685 * Insert a new entry into the fasync list. Return the pointer to the
686 * old one if we didn't use the new one.
672 * 687 *
673 * NOTE! It is very important that the FASYNC flag always 688 * NOTE! It is very important that the FASYNC flag always
674 * match the state "is the filp on a fasync list". 689 * match the state "is the filp on a fasync list".
675 */ 690 */
676static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) 691struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
677{ 692{
678 struct fasync_struct *new, *fa, **fp; 693 struct fasync_struct *fa, **fp;
679 int result = 0;
680
681 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
682 if (!new)
683 return -ENOMEM;
684 694
685 spin_lock(&filp->f_lock); 695 spin_lock(&filp->f_lock);
686 spin_lock(&fasync_lock); 696 spin_lock(&fasync_lock);
@@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
691 spin_lock_irq(&fa->fa_lock); 701 spin_lock_irq(&fa->fa_lock);
692 fa->fa_fd = fd; 702 fa->fa_fd = fd;
693 spin_unlock_irq(&fa->fa_lock); 703 spin_unlock_irq(&fa->fa_lock);
694
695 kmem_cache_free(fasync_cache, new);
696 goto out; 704 goto out;
697 } 705 }
698 706
@@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
702 new->fa_fd = fd; 710 new->fa_fd = fd;
703 new->fa_next = *fapp; 711 new->fa_next = *fapp;
704 rcu_assign_pointer(*fapp, new); 712 rcu_assign_pointer(*fapp, new);
705 result = 1;
706 filp->f_flags |= FASYNC; 713 filp->f_flags |= FASYNC;
707 714
708out: 715out:
709 spin_unlock(&fasync_lock); 716 spin_unlock(&fasync_lock);
710 spin_unlock(&filp->f_lock); 717 spin_unlock(&filp->f_lock);
711 return result; 718 return fa;
719}
720
721/*
722 * Add a fasync entry. Return negative on error, positive if
723 * added, and zero if did nothing but change an existing one.
724 */
725static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
726{
727 struct fasync_struct *new;
728
729 new = fasync_alloc();
730 if (!new)
731 return -ENOMEM;
732
733 /*
 734 * fasync_insert_entry() returns the old (updated) entry if
735 * it existed.
736 *
737 * So free the (unused) new entry and return 0 to let the
738 * caller know that we didn't add any new fasync entries.
739 */
740 if (fasync_insert_entry(fd, filp, fapp, new)) {
741 fasync_free(new);
742 return 0;
743 }
744
745 return 1;
712} 746}
713 747
714/* 748/*
@@ -781,7 +815,7 @@ static int __init fcntl_init(void)
781 __O_SYNC | O_DSYNC | FASYNC | 815 __O_SYNC | O_DSYNC | FASYNC |
782 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 816 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
783 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
784 FMODE_EXEC 818 __FMODE_EXEC
785 )); 819 ));
786 820
787 fasync_cache = kmem_cache_create("fasync_cache", 821 fasync_cache = kmem_cache_create("fasync_cache",
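
The fcntl.c change splits the old fasync_add_entry() into three building blocks, fasync_alloc(), fasync_insert_entry() and fasync_free(), so that callers can preallocate outside the locks and throw the allocation away when an existing entry was merely updated. The preallocate-then-maybe-free pattern in miniature; the list, the locking and the names here are illustrative stand-ins:

    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
        int fd;
        struct entry *next;
    };

    static struct entry *head;  /* protected by a lock in real code */

    /* Shaped like fasync_insert_entry(): returns the existing entry if
     * fd was already present (leaving new unused), NULL if new was
     * linked in. */
    static struct entry *insert_entry(int fd, struct entry *new)
    {
        struct entry *e;

        for (e = head; e; e = e->next) {
            if (e->fd == fd)
                return e;   /* updated in place, new unused */
        }
        new->fd = fd;
        new->next = head;
        head = new;
        return NULL;
    }

    /* Shaped like fasync_add_entry(): negative on error, 1 if added,
     * 0 if an existing entry was updated. */
    static int add_entry(int fd)
    {
        struct entry *new = malloc(sizeof(*new)); /* outside the lock */

        if (!new)
            return -1;
        if (insert_entry(fd, new)) {
            free(new);  /* safe: never visible to other threads */
            return 0;
        }
        return 1;
    }

    int main(void)
    {
        printf("%d\n", add_entry(3));   /* 1: added */
        printf("%d\n", add_entry(3));   /* 0: updated existing */
        return 0;
    }

The comment above fasync_free() is the key constraint: only entries that were never published on the list may be freed directly; anything that was visible to readers has to go through the RCU path in fasync_remove_entry().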
diff --git a/fs/fifo.c b/fs/fifo.c
index 5d6606ffc2d2..4e303c22d5ee 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -151,4 +151,5 @@ err_nocleanup:
151 */ 151 */
152const struct file_operations def_fifo_fops = { 152const struct file_operations def_fifo_fops = {
153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */ 153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */
154 .llseek = noop_llseek,
154}; 155};
diff --git a/fs/file_table.c b/fs/file_table.c
index a04bdd81c11c..eb36b6b17e26 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
60/* 60/*
61 * Return the total number of open files in the system 61 * Return the total number of open files in the system
62 */ 62 */
63static int get_nr_files(void) 63static long get_nr_files(void)
64{ 64{
65 return percpu_counter_read_positive(&nr_files); 65 return percpu_counter_read_positive(&nr_files);
66} 66}
@@ -68,7 +68,7 @@ static int get_nr_files(void)
68/* 68/*
69 * Return the maximum number of open files in the system 69 * Return the maximum number of open files in the system
70 */ 70 */
71int get_max_files(void) 71unsigned long get_max_files(void)
72{ 72{
73 return files_stat.max_files; 73 return files_stat.max_files;
74} 74}
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
82 void __user *buffer, size_t *lenp, loff_t *ppos) 82 void __user *buffer, size_t *lenp, loff_t *ppos)
83{ 83{
84 files_stat.nr_files = get_nr_files(); 84 files_stat.nr_files = get_nr_files();
85 return proc_dointvec(table, write, buffer, lenp, ppos); 85 return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
86} 86}
87#else 87#else
88int proc_nr_files(ctl_table *table, int write, 88int proc_nr_files(ctl_table *table, int write,
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
105struct file *get_empty_filp(void) 105struct file *get_empty_filp(void)
106{ 106{
107 const struct cred *cred = current_cred(); 107 const struct cred *cred = current_cred();
108 static int old_max; 108 static long old_max;
109 struct file * f; 109 struct file * f;
110 110
111 /* 111 /*
@@ -125,13 +125,13 @@ struct file *get_empty_filp(void)
125 goto fail; 125 goto fail;
126 126
127 percpu_counter_inc(&nr_files); 127 percpu_counter_inc(&nr_files);
128 f->f_cred = get_cred(cred);
128 if (security_file_alloc(f)) 129 if (security_file_alloc(f))
129 goto fail_sec; 130 goto fail_sec;
130 131
131 INIT_LIST_HEAD(&f->f_u.fu_list); 132 INIT_LIST_HEAD(&f->f_u.fu_list);
132 atomic_long_set(&f->f_count, 1); 133 atomic_long_set(&f->f_count, 1);
133 rwlock_init(&f->f_owner.lock); 134 rwlock_init(&f->f_owner.lock);
134 f->f_cred = get_cred(cred);
135 spin_lock_init(&f->f_lock); 135 spin_lock_init(&f->f_lock);
136 eventpoll_init_file(f); 136 eventpoll_init_file(f);
137 /* f->f_version: 0 */ 137 /* f->f_version: 0 */
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
140over: 140over:
141 /* Ran out of filps - report that */ 141 /* Ran out of filps - report that */
142 if (get_nr_files() > old_max) { 142 if (get_nr_files() > old_max) {
143 printk(KERN_INFO "VFS: file-max limit %d reached\n", 143 pr_info("VFS: file-max limit %lu reached\n", get_max_files());
144 get_max_files());
145 old_max = get_nr_files(); 144 old_max = get_nr_files();
146 } 145 }
147 goto fail; 146 goto fail;
@@ -312,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
312 struct files_struct *files = current->files; 311 struct files_struct *files = current->files;
313 312
314 *fput_needed = 0; 313 *fput_needed = 0;
315 if (likely((atomic_read(&files->count) == 1))) { 314 if (atomic_read(&files->count) == 1) {
316 file = fcheck_files(files, fd); 315 file = fcheck_files(files, fd);
317 } else { 316 } else {
318 rcu_read_lock(); 317 rcu_read_lock();
@@ -487,7 +486,7 @@ retry:
487 486
488void __init files_init(unsigned long mempages) 487void __init files_init(unsigned long mempages)
489{ 488{
490 int n; 489 unsigned long n;
491 490
492 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, 491 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
493 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 492 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
498 */ 497 */
499 498
500 n = (mempages * (PAGE_SIZE / 1024)) / 10; 499 n = (mempages * (PAGE_SIZE / 1024)) / 10;
501 files_stat.max_files = n; 500 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
502 if (files_stat.max_files < NR_FILE)
503 files_stat.max_files = NR_FILE;
504 files_defer_init(); 501 files_defer_init();
505 lg_lock_init(files_lglock); 502 lg_lock_init(files_lglock);
506 percpu_counter_init(&nr_files, 0); 503 percpu_counter_init(&nr_files, 0);
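
The type changes in file_table.c guard against overflow rather than style: files_init() sizes the default file-max at roughly one file per 10 KiB of RAM, which exceeds INT_MAX once a machine has enough memory, so the counters become long and the sysctl handler switches to proc_doulongvec_minmax() to match. A quick check of the arithmetic (page size and memory size are illustrative, and a 64-bit unsigned long is assumed):

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
        unsigned long page_size = 4096;
        unsigned long tib = 32;                     /* RAM in TiB */
        unsigned long mempages = tib << (40 - 12);  /* 4 KiB pages */

        /* Mirrors files_init(): n = mempages * (PAGE_SIZE/1024) / 10 */
        unsigned long n = (mempages * (page_size / 1024)) / 10;

        printf("default file-max: %lu\n", n);
        printf("fits in a 32-bit int: %s\n", n <= INT_MAX ? "yes" : "no");
        return 0;
    }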
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 68ba492d8eef..751d6b255a12 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs)
115 tmp = &(*tmp)->next; 115 tmp = &(*tmp)->next;
116 } 116 }
117 write_unlock(&file_systems_lock); 117 write_unlock(&file_systems_lock);
118
119 synchronize_rcu();
120
118 return -EINVAL; 121 return -EINVAL;
119} 122}
120 123
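
The synchronize_rcu() added to unregister_filesystem() makes the call wait until every lockless reader that might still be walking the file_systems list has finished, so the caller can safely free the file_system_type afterwards (typically by unloading its module). A toy model of that wait-for-preexisting-readers ordering, with a rwlock standing in for RCU; kernel RCU readers take no lock at all, this only mimics the ordering (build with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t gp = PTHREAD_RWLOCK_INITIALIZER;

    static void *reader(void *arg)
    {
        (void)arg;
        pthread_rwlock_rdlock(&gp);   /* rcu_read_lock() */
        /* ... walk the file_systems list without the write lock ... */
        pthread_rwlock_unlock(&gp);   /* rcu_read_unlock() */
        return NULL;
    }

    /* Stand-in for synchronize_rcu(): returns only once all readers
     * that were already inside the critical section have left. */
    static void synchronize(void)
    {
        pthread_rwlock_wrlock(&gp);
        pthread_rwlock_unlock(&gp);
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, reader, NULL);
        /* 1. unlink the entry from the list */
        synchronize();  /* 2. grace period */
        /* 3. no reader can still hold a pointer to the entry */
        pthread_join(t, NULL);
        puts("safe to free after the grace period");
        return 0;
    }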
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 79d1b4ea13e7..2ba6719ac612 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
260 struct inode *ip = NULL; 260 struct inode *ip = NULL;
261 261
262 if ((ip = new_inode(sbp))) { 262 if ((ip = new_inode(sbp))) {
263 ip->i_ino = get_next_ino();
263 vxfs_iinit(ip, vip); 264 vxfs_iinit(ip, vip);
264 ip->i_mapping->a_ops = &vxfs_aops; 265 ip->i_mapping->a_ops = &vxfs_aops;
265 } 266 }
@@ -336,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
336 return ip; 337 return ip;
337} 338}
338 339
340static void vxfs_i_callback(struct rcu_head *head)
341{
342 struct inode *inode = container_of(head, struct inode, i_rcu);
343 INIT_LIST_HEAD(&inode->i_dentry);
344 kmem_cache_free(vxfs_inode_cachep, inode->i_private);
345}
346
339/** 347/**
340 * vxfs_evict_inode - remove inode from main memory 348 * vxfs_evict_inode - remove inode from main memory
341 * @ip: inode to discard. 349 * @ip: inode to discard.
@@ -349,5 +357,5 @@ vxfs_evict_inode(struct inode *ip)
349{ 357{
350 truncate_inode_pages(&ip->i_data, 0); 358 truncate_inode_pages(&ip->i_data, 0);
351 end_writeback(ip); 359 end_writeback(ip);
352 kmem_cache_free(vxfs_inode_cachep, ip->i_private); 360 call_rcu(&ip->i_rcu, vxfs_i_callback);
353} 361}
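
vxfs_evict_inode() now defers the kmem_cache_free() through call_rcu(), so a lockless path walker that still holds a pointer to the inode cannot step on freed memory; the callback gets the inode back from the embedded rcu_head with container_of(). That recovery step in isolation, with the struct layouts and the macro spelled out for the sketch:

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct rcu_head { void *next; };

    struct inode {
        unsigned long i_ino;
        struct rcu_head i_rcu;
    };

    /* Shaped like vxfs_i_callback(): from the rcu_head address, walk
     * back to the enclosing inode. */
    static void i_callback(struct rcu_head *head)
    {
        struct inode *inode = container_of(head, struct inode, i_rcu);

        printf("freeing inode %lu\n", inode->i_ino);
    }

    int main(void)
    {
        struct inode ino = { .i_ino = 42 };

        /* In the kernel this runs only after an RCU grace period. */
        i_callback(&ino.i_rcu);
        return 0;
    }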
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 0ec7bb2c95c6..6c5131d592f0 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -36,7 +36,6 @@
36#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/kernel.h> 37#include <linux/kernel.h>
38#include <linux/pagemap.h> 38#include <linux/pagemap.h>
39#include <linux/smp_lock.h>
40 39
41#include "vxfs.h" 40#include "vxfs.h"
42#include "vxfs_dir.h" 41#include "vxfs_dir.h"
@@ -212,16 +211,12 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd)
212 if (dp->d_name.len > VXFS_NAMELEN) 211 if (dp->d_name.len > VXFS_NAMELEN)
213 return ERR_PTR(-ENAMETOOLONG); 212 return ERR_PTR(-ENAMETOOLONG);
214 213
215 lock_kernel();
216 ino = vxfs_inode_by_name(dip, dp); 214 ino = vxfs_inode_by_name(dip, dp);
217 if (ino) { 215 if (ino) {
218 ip = vxfs_iget(dip->i_sb, ino); 216 ip = vxfs_iget(dip->i_sb, ino);
219 if (IS_ERR(ip)) { 217 if (IS_ERR(ip))
220 unlock_kernel();
221 return ERR_CAST(ip); 218 return ERR_CAST(ip);
222 }
223 } 219 }
224 unlock_kernel();
225 d_add(dp, ip); 220 d_add(dp, ip);
226 return NULL; 221 return NULL;
227} 222}
@@ -248,8 +243,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
248 u_long page, npages, block, pblocks, nblocks, offset; 243 u_long page, npages, block, pblocks, nblocks, offset;
249 loff_t pos; 244 loff_t pos;
250 245
251 lock_kernel();
252
253 switch ((long)fp->f_pos) { 246 switch ((long)fp->f_pos) {
254 case 0: 247 case 0:
255 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) 248 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
@@ -265,10 +258,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
265 258
266 pos = fp->f_pos - 2; 259 pos = fp->f_pos - 2;
267 260
268 if (pos > VXFS_DIRROUND(ip->i_size)) { 261 if (pos > VXFS_DIRROUND(ip->i_size))
269 unlock_kernel();
270 return 0; 262 return 0;
271 }
272 263
273 npages = dir_pages(ip); 264 npages = dir_pages(ip);
274 nblocks = dir_blocks(ip); 265 nblocks = dir_blocks(ip);
@@ -327,6 +318,5 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
327done: 318done:
328 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; 319 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
329out: 320out:
330 unlock_kernel();
331 return 0; 321 return 0;
332} 322}
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index dc0c041e85cb..9d1c99558389 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,7 +38,6 @@
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/smp_lock.h>
42#include <linux/stat.h> 41#include <linux/stat.h>
43#include <linux/vfs.h> 42#include <linux/vfs.h>
44#include <linux/mount.h> 43#include <linux/mount.h>
@@ -81,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
81{ 80{
82 struct vxfs_sb_info *infp = VXFS_SBI(sbp); 81 struct vxfs_sb_info *infp = VXFS_SBI(sbp);
83 82
84 lock_kernel();
85
86 vxfs_put_fake_inode(infp->vsi_fship); 83 vxfs_put_fake_inode(infp->vsi_fship);
87 vxfs_put_fake_inode(infp->vsi_ilist); 84 vxfs_put_fake_inode(infp->vsi_ilist);
88 vxfs_put_fake_inode(infp->vsi_stilist); 85 vxfs_put_fake_inode(infp->vsi_stilist);
89 86
90 brelse(infp->vsi_bp); 87 brelse(infp->vsi_bp);
91 kfree(infp); 88 kfree(infp);
92
93 unlock_kernel();
94} 89}
95 90
96/** 91/**
@@ -148,7 +143,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
148 * The superblock on success, else %NULL. 143 * The superblock on success, else %NULL.
149 * 144 *
150 * Locking: 145 * Locking:
151 * We are under the bkl and @sbp->s_lock. 146 * We are under @sbp->s_lock.
152 */ 147 */
153static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) 148static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
154{ 149{
@@ -251,17 +246,16 @@ out:
251/* 246/*
252 * The usual module blurb. 247 * The usual module blurb.
253 */ 248 */
254static int vxfs_get_sb(struct file_system_type *fs_type, 249static struct dentry *vxfs_mount(struct file_system_type *fs_type,
255 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 250 int flags, const char *dev_name, void *data)
256{ 251{
257 return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super, 252 return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
258 mnt);
259} 253}
260 254
261static struct file_system_type vxfs_fs_type = { 255static struct file_system_type vxfs_fs_type = {
262 .owner = THIS_MODULE, 256 .owner = THIS_MODULE,
263 .name = "vxfs", 257 .name = "vxfs",
264 .get_sb = vxfs_get_sb, 258 .mount = vxfs_mount,
265 .kill_sb = kill_block_super, 259 .kill_sb = kill_block_super,
266 .fs_flags = FS_REQUIRES_DEV, 260 .fs_flags = FS_REQUIRES_DEV,
267}; 261};
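
vxfs gets the same superblock API conversion as msdos and vfat above: ->get_sb, which filled in a caller-supplied vfsmount and returned an error code, becomes ->mount, which hands back the root dentry (or an ERR_PTR) directly, and get_sb_bdev() becomes mount_bdev() with one argument fewer. The two shapes side by side, using stub types rather than the kernel headers:

    #include <stdio.h>

    struct file_system_type;
    struct vfsmount;
    struct dentry;

    /* old: int (*get_sb)(...) filling in a vfsmount */
    typedef int (*get_sb_t)(struct file_system_type *, int, const char *,
                            void *, struct vfsmount *);

    /* new: struct dentry *(*mount)(...) returning the root directly */
    typedef struct dentry *(*mount_t)(struct file_system_type *, int,
                                      const char *, void *);

    int main(void)
    {
        get_sb_t old_style = 0;
        mount_t new_style = 0;

        (void)old_style;
        (void)new_style;
        printf("->get_sb returned int, ->mount returns the root dentry\n");
        return 0;
    }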
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1e23c33ea5cf..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,13 +79,14 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
79 return sb->s_bdi; 79 return sb->s_bdi;
80} 80}
81 81
82static void bdi_queue_work(struct backing_dev_info *bdi, 82static inline struct inode *wb_inode(struct list_head *head)
83 struct wb_writeback_work *work)
84{ 83{
85 trace_writeback_queue(bdi, work); 84 return list_entry(head, struct inode, i_wb_list);
85}
86 86
87 spin_lock_bh(&bdi->wb_lock); 87/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 list_add_tail(&work->list, &bdi->work_list); 88static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89{
89 if (bdi->wb.task) { 90 if (bdi->wb.task) {
90 wake_up_process(bdi->wb.task); 91 wake_up_process(bdi->wb.task);
91 } else { 92 } else {
@@ -93,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
93 * The bdi thread isn't there, wake up the forker thread which 94 * The bdi thread isn't there, wake up the forker thread which
94 * will create and run it. 95 * will create and run it.
95 */ 96 */
96 trace_writeback_nothread(bdi, work);
97 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
98 } 98 }
99}
100
101static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work)
103{
104 trace_writeback_queue(bdi, work);
105
106 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi);
99 spin_unlock_bh(&bdi->wb_lock); 111 spin_unlock_bh(&bdi->wb_lock);
100} 112}
101 113
102static void 114static void
103__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 115__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
104 bool range_cyclic, bool for_background) 116 bool range_cyclic)
105{ 117{
106 struct wb_writeback_work *work; 118 struct wb_writeback_work *work;
107 119
@@ -121,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
121 work->sync_mode = WB_SYNC_NONE; 133 work->sync_mode = WB_SYNC_NONE;
122 work->nr_pages = nr_pages; 134 work->nr_pages = nr_pages;
123 work->range_cyclic = range_cyclic; 135 work->range_cyclic = range_cyclic;
124 work->for_background = for_background;
125 136
126 bdi_queue_work(bdi, work); 137 bdi_queue_work(bdi, work);
127} 138}
@@ -139,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
139 */ 150 */
140void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 151void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
141{ 152{
142 __bdi_start_writeback(bdi, nr_pages, true, false); 153 __bdi_start_writeback(bdi, nr_pages, true);
143} 154}
144 155
145/** 156/**
@@ -147,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
147 * @bdi: the backing device to write from 158 * @bdi: the backing device to write from
148 * 159 *
149 * Description: 160 * Description:
150 * This does WB_SYNC_NONE background writeback. The IO is only 161 * This makes sure WB_SYNC_NONE background writeback happens. When
 151 * started when this function returns, we make no guarentees on 162 * this function returns, it is only guaranteed that, for a given BDI,
 152 * completion. Caller need not hold sb s_umount semaphore. 163 * some IO is happening if we are over the background dirty threshold.
164 * Caller need not hold sb s_umount semaphore.
153 */ 165 */
154void bdi_start_background_writeback(struct backing_dev_info *bdi) 166void bdi_start_background_writeback(struct backing_dev_info *bdi)
155{ 167{
156 __bdi_start_writeback(bdi, LONG_MAX, true, true); 168 /*
169 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do.
171 */
172 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock);
157} 176}
158 177
159/* 178/*
@@ -172,11 +191,11 @@ static void redirty_tail(struct inode *inode)
172 if (!list_empty(&wb->b_dirty)) { 191 if (!list_empty(&wb->b_dirty)) {
173 struct inode *tail; 192 struct inode *tail;
174 193
175 tail = list_entry(wb->b_dirty.next, struct inode, i_list); 194 tail = wb_inode(wb->b_dirty.next);
176 if (time_before(inode->dirtied_when, tail->dirtied_when)) 195 if (time_before(inode->dirtied_when, tail->dirtied_when))
177 inode->dirtied_when = jiffies; 196 inode->dirtied_when = jiffies;
178 } 197 }
179 list_move(&inode->i_list, &wb->b_dirty); 198 list_move(&inode->i_wb_list, &wb->b_dirty);
180} 199}
181 200
182/* 201/*
@@ -186,7 +205,7 @@ static void requeue_io(struct inode *inode)
186{ 205{
187 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 206 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
188 207
189 list_move(&inode->i_list, &wb->b_more_io); 208 list_move(&inode->i_wb_list, &wb->b_more_io);
190} 209}
191 210
192static void inode_sync_complete(struct inode *inode) 211static void inode_sync_complete(struct inode *inode)
@@ -227,14 +246,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
227 int do_sb_sort = 0; 246 int do_sb_sort = 0;
228 247
229 while (!list_empty(delaying_queue)) { 248 while (!list_empty(delaying_queue)) {
230 inode = list_entry(delaying_queue->prev, struct inode, i_list); 249 inode = wb_inode(delaying_queue->prev);
231 if (older_than_this && 250 if (older_than_this &&
232 inode_dirtied_after(inode, *older_than_this)) 251 inode_dirtied_after(inode, *older_than_this))
233 break; 252 break;
234 if (sb && sb != inode->i_sb) 253 if (sb && sb != inode->i_sb)
235 do_sb_sort = 1; 254 do_sb_sort = 1;
236 sb = inode->i_sb; 255 sb = inode->i_sb;
237 list_move(&inode->i_list, &tmp); 256 list_move(&inode->i_wb_list, &tmp);
238 } 257 }
239 258
240 /* just one sb in list, splice to dispatch_queue and we're done */ 259 /* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +264,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
245 264
246 /* Move inodes from one superblock together */ 265 /* Move inodes from one superblock together */
247 while (!list_empty(&tmp)) { 266 while (!list_empty(&tmp)) {
248 inode = list_entry(tmp.prev, struct inode, i_list); 267 sb = wb_inode(tmp.prev)->i_sb;
249 sb = inode->i_sb;
250 list_for_each_prev_safe(pos, node, &tmp) { 268 list_for_each_prev_safe(pos, node, &tmp) {
251 inode = list_entry(pos, struct inode, i_list); 269 inode = wb_inode(pos);
252 if (inode->i_sb == sb) 270 if (inode->i_sb == sb)
253 list_move(&inode->i_list, dispatch_queue); 271 list_move(&inode->i_wb_list, dispatch_queue);
254 } 272 }
255 } 273 }
256} 274}
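
move_expired_inodes() above queues in two passes: expired inodes are first moved onto a temporary list, and only then spliced to the dispatch queue one superblock at a time, so inodes of the same sb stay adjacent and writeback does not ping-pong between filesystems. The grouping effect, sketched with arrays in place of the kernel's list_heads and with the ordering details simplified:

    #include <stdio.h>

    int main(void)
    {
        int sb_of[] = { 1, 2, 1, 3, 2, 1 }; /* sb of each expired inode */
        int n = sizeof(sb_of) / sizeof(sb_of[0]);
        int done[sizeof(sb_of) / sizeof(sb_of[0])] = { 0 };
        int i, j;

        printf("dispatch order:");
        for (i = 0; i < n; i++) {
            if (done[i])
                continue;
            for (j = i; j < n; j++) {   /* pull all same-sb entries */
                if (!done[j] && sb_of[j] == sb_of[i]) {
                    done[j] = 1;
                    printf(" sb%d", sb_of[j]);
                }
            }
        }
        printf("\n");   /* sb1 sb1 sb1 sb2 sb2 sb3 */
        return 0;
    }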
@@ -408,16 +426,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
408 * completion. 426 * completion.
409 */ 427 */
410 redirty_tail(inode); 428 redirty_tail(inode);
411 } else if (atomic_read(&inode->i_count)) {
412 /*
413 * The inode is clean, inuse
414 */
415 list_move(&inode->i_list, &inode_in_use);
416 } else { 429 } else {
417 /* 430 /*
418 * The inode is clean, unused 431 * The inode is clean. At this point we either have
 432 * a reference to the inode or it's on its way out.
433 * No need to add it back to the LRU.
419 */ 434 */
420 list_move(&inode->i_list, &inode_unused); 435 list_del_init(&inode->i_wb_list);
421 } 436 }
422 } 437 }
423 inode_sync_complete(inode); 438 inode_sync_complete(inode);
@@ -465,8 +480,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
465{ 480{
466 while (!list_empty(&wb->b_io)) { 481 while (!list_empty(&wb->b_io)) {
467 long pages_skipped; 482 long pages_skipped;
468 struct inode *inode = list_entry(wb->b_io.prev, 483 struct inode *inode = wb_inode(wb->b_io.prev);
469 struct inode, i_list);
470 484
471 if (inode->i_sb != sb) { 485 if (inode->i_sb != sb) {
472 if (only_this_sb) { 486 if (only_this_sb) {
@@ -487,10 +501,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
487 return 0; 501 return 0;
488 } 502 }
489 503
490 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 504 /*
 505 * Don't bother with new inodes or inodes being freed; the first
 506 * kind does not need periodic writeout yet, and for the latter
507 * kind writeout is handled by the freer.
508 */
509 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
491 requeue_io(inode); 510 requeue_io(inode);
492 continue; 511 continue;
493 } 512 }
513
494 /* 514 /*
495 * Was this inode dirtied after sync_sb_inodes was called? 515 * Was this inode dirtied after sync_sb_inodes was called?
496 * This keeps sync from extra jobs and livelock. 516 * This keeps sync from extra jobs and livelock.
@@ -498,7 +518,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
498 if (inode_dirtied_after(inode, wbc->wb_start)) 518 if (inode_dirtied_after(inode, wbc->wb_start))
499 return 1; 519 return 1;
500 520
501 BUG_ON(inode->i_state & I_FREEING);
502 __iget(inode); 521 __iget(inode);
503 pages_skipped = wbc->pages_skipped; 522 pages_skipped = wbc->pages_skipped;
504 writeback_single_inode(inode, wbc); 523 writeback_single_inode(inode, wbc);
@@ -536,8 +555,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
536 queue_io(wb, wbc->older_than_this); 555 queue_io(wb, wbc->older_than_this);
537 556
538 while (!list_empty(&wb->b_io)) { 557 while (!list_empty(&wb->b_io)) {
539 struct inode *inode = list_entry(wb->b_io.prev, 558 struct inode *inode = wb_inode(wb->b_io.prev);
540 struct inode, i_list);
541 struct super_block *sb = inode->i_sb; 559 struct super_block *sb = inode->i_sb;
542 560
543 if (!pin_sb_for_writeback(sb)) { 561 if (!pin_sb_for_writeback(sb)) {
@@ -582,7 +600,7 @@ static inline bool over_bground_thresh(void)
582 global_dirty_limits(&background_thresh, &dirty_thresh); 600 global_dirty_limits(&background_thresh, &dirty_thresh);
583 601
584 return (global_page_state(NR_FILE_DIRTY) + 602 return (global_page_state(NR_FILE_DIRTY) +
585 global_page_state(NR_UNSTABLE_NFS) >= background_thresh); 603 global_page_state(NR_UNSTABLE_NFS) > background_thresh);
586} 604}
587 605
588/* 606/*
@@ -612,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
612 }; 630 };
613 unsigned long oldest_jif; 631 unsigned long oldest_jif;
614 long wrote = 0; 632 long wrote = 0;
633 long write_chunk;
615 struct inode *inode; 634 struct inode *inode;
616 635
617 if (wbc.for_kupdate) { 636 if (wbc.for_kupdate) {
@@ -624,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
624 wbc.range_end = LLONG_MAX; 643 wbc.range_end = LLONG_MAX;
625 } 644 }
626 645
646 /*
647 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
648 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
649 * here avoids calling into writeback_inodes_wb() more than once.
650 *
651 * The intended call sequence for WB_SYNC_ALL writeback is:
652 *
653 * wb_writeback()
654 * __writeback_inodes_sb() <== called only once
655 * write_cache_pages() <== called once for each inode
656 * (quickly) tag currently dirty pages
657 * (maybe slowly) sync all tagged pages
658 */
659 if (wbc.sync_mode == WB_SYNC_NONE)
660 write_chunk = MAX_WRITEBACK_PAGES;
661 else
662 write_chunk = LONG_MAX;
663
627 wbc.wb_start = jiffies; /* livelock avoidance */ 664 wbc.wb_start = jiffies; /* livelock avoidance */
628 for (;;) { 665 for (;;) {
629 /* 666 /*
@@ -633,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
633 break; 670 break;
634 671
635 /* 672 /*
673 * Background writeout and kupdate-style writeback may
674 * run forever. Stop them if there is other work to do
675 * so that e.g. sync can proceed. They'll be restarted
 676 * after the other work is done.
677 */
678 if ((work->for_background || work->for_kupdate) &&
679 !list_empty(&wb->bdi->work_list))
680 break;
681
682 /*
636 * For background writeout, stop when we are below the 683 * For background writeout, stop when we are below the
637 * background dirty threshold 684 * background dirty threshold
638 */ 685 */
@@ -640,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
640 break; 687 break;
641 688
642 wbc.more_io = 0; 689 wbc.more_io = 0;
643 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 690 wbc.nr_to_write = write_chunk;
644 wbc.pages_skipped = 0; 691 wbc.pages_skipped = 0;
645 692
646 trace_wbc_writeback_start(&wbc, wb->bdi); 693 trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -650,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
650 writeback_inodes_wb(wb, &wbc); 697 writeback_inodes_wb(wb, &wbc);
651 trace_wbc_writeback_written(&wbc, wb->bdi); 698 trace_wbc_writeback_written(&wbc, wb->bdi);
652 699
653 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 700 work->nr_pages -= write_chunk - wbc.nr_to_write;
654 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 701 wrote += write_chunk - wbc.nr_to_write;
655 702
656 /* 703 /*
657 * If we consumed everything, see if we have more 704 * If we consumed everything, see if we have more
@@ -666,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
666 /* 713 /*
667 * Did we write something? Try for more 714 * Did we write something? Try for more
668 */ 715 */
669 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) 716 if (wbc.nr_to_write < write_chunk)
670 continue; 717 continue;
671 /* 718 /*
672 * Nothing written. Wait for some inode to 719 * Nothing written. Wait for some inode to
@@ -675,8 +722,7 @@ static long wb_writeback(struct bdi_writeback *wb,
675 */ 722 */
676 spin_lock(&inode_lock); 723 spin_lock(&inode_lock);
677 if (!list_empty(&wb->b_more_io)) { 724 if (!list_empty(&wb->b_more_io)) {
678 inode = list_entry(wb->b_more_io.prev, 725 inode = wb_inode(wb->b_more_io.prev);
679 struct inode, i_list);
680 trace_wbc_writeback_wait(&wbc, wb->bdi); 726 trace_wbc_writeback_wait(&wbc, wb->bdi);
681 inode_wait_for_writeback(inode); 727 inode_wait_for_writeback(inode);
682 } 728 }
@@ -704,6 +750,34 @@ get_next_work_item(struct backing_dev_info *bdi)
704 return work; 750 return work;
705} 751}
706 752
753/*
754 * Add in the number of potentially dirty inodes, because each inode
755 * write can dirty pagecache in the underlying blockdev.
756 */
757static unsigned long get_nr_dirty_pages(void)
758{
759 return global_page_state(NR_FILE_DIRTY) +
760 global_page_state(NR_UNSTABLE_NFS) +
761 get_nr_dirty_inodes();
762}
763
764static long wb_check_background_flush(struct bdi_writeback *wb)
765{
766 if (over_bground_thresh()) {
767
768 struct wb_writeback_work work = {
769 .nr_pages = LONG_MAX,
770 .sync_mode = WB_SYNC_NONE,
771 .for_background = 1,
772 .range_cyclic = 1,
773 };
774
775 return wb_writeback(wb, &work);
776 }
777
778 return 0;
779}
780
707static long wb_check_old_data_flush(struct bdi_writeback *wb) 781static long wb_check_old_data_flush(struct bdi_writeback *wb)
708{ 782{
709 unsigned long expired; 783 unsigned long expired;
@@ -721,9 +795,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
721 return 0; 795 return 0;
722 796
723 wb->last_old_flush = jiffies; 797 wb->last_old_flush = jiffies;
724 nr_pages = global_page_state(NR_FILE_DIRTY) + 798 nr_pages = get_nr_dirty_pages();
725 global_page_state(NR_UNSTABLE_NFS) +
726 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
727 799
728 if (nr_pages) { 800 if (nr_pages) {
729 struct wb_writeback_work work = { 801 struct wb_writeback_work work = {
@@ -775,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
775 * Check for periodic writeback, kupdated() style 847 * Check for periodic writeback, kupdated() style
776 */ 848 */
777 wrote += wb_check_old_data_flush(wb); 849 wrote += wb_check_old_data_flush(wb);
850 wrote += wb_check_background_flush(wb);
778 clear_bit(BDI_writeback_running, &wb->bdi->state); 851 clear_bit(BDI_writeback_running, &wb->bdi->state);
779 852
780 return wrote; 853 return wrote;
@@ -790,7 +863,7 @@ int bdi_writeback_thread(void *data)
790 struct backing_dev_info *bdi = wb->bdi; 863 struct backing_dev_info *bdi = wb->bdi;
791 long pages_written; 864 long pages_written;
792 865
793 current->flags |= PF_FLUSHER | PF_SWAPWRITE; 866 current->flags |= PF_SWAPWRITE;
794 set_freezable(); 867 set_freezable();
795 wb->last_active = jiffies; 868 wb->last_active = jiffies;
796 869
@@ -861,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
861 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 934 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
862 if (!bdi_has_dirty_io(bdi)) 935 if (!bdi_has_dirty_io(bdi))
863 continue; 936 continue;
864 __bdi_start_writeback(bdi, nr_pages, false, false); 937 __bdi_start_writeback(bdi, nr_pages, false);
865 } 938 }
866 rcu_read_unlock(); 939 rcu_read_unlock();
867} 940}
@@ -962,7 +1035,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
962 * dirty list. Add blockdev inodes as well. 1035 * dirty list. Add blockdev inodes as well.
963 */ 1036 */
964 if (!S_ISBLK(inode->i_mode)) { 1037 if (!S_ISBLK(inode->i_mode)) {
965 if (hlist_unhashed(&inode->i_hash)) 1038 if (inode_unhashed(inode))
966 goto out; 1039 goto out;
967 } 1040 }
968 if (inode->i_state & I_FREEING) 1041 if (inode->i_state & I_FREEING)
@@ -990,7 +1063,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
990 } 1063 }
991 1064
992 inode->dirtied_when = jiffies; 1065 inode->dirtied_when = jiffies;
993 list_move(&inode->i_list, &bdi->wb.b_dirty); 1066 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
994 } 1067 }
995 } 1068 }
996out: 1069out:
@@ -1103,9 +1176,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
1103 */ 1176 */
1104void writeback_inodes_sb(struct super_block *sb) 1177void writeback_inodes_sb(struct super_block *sb)
1105{ 1178{
1106 return writeback_inodes_sb_nr(sb, global_page_state(NR_FILE_DIRTY) + 1179 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
1107 global_page_state(NR_UNSTABLE_NFS) +
1108 (inodes_stat.nr_inodes - inodes_stat.nr_unused));
1109} 1180}
1110EXPORT_SYMBOL(writeback_inodes_sb); 1181EXPORT_SYMBOL(writeback_inodes_sb);
1111 1182
@@ -1154,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1154 * @sb: the superblock 1225 * @sb: the superblock
1155 * 1226 *
1156 * This function writes and waits on any dirty inode belonging to this 1227 * This function writes and waits on any dirty inode belonging to this
1157 * super_block. The number of pages synced is returned. 1228 * super_block.
1158 */ 1229 */
1159void sync_inodes_sb(struct super_block *sb) 1230void sync_inodes_sb(struct super_block *sb)
1160{ 1231{
@@ -1230,3 +1301,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1230 return ret; 1301 return ret;
1231} 1302}
1232EXPORT_SYMBOL(sync_inode); 1303EXPORT_SYMBOL(sync_inode);
1304
1305/**
1306 * sync_inode_metadata - write an inode to disk
1307 * @inode: the inode to sync
1308 * @wait: wait for I/O to complete.
1309 *
1310 * Write an inode to disk and adjust its dirty state after completion.
1311 *
1312 * Note: only writes the actual inode, no associated data or other metadata.
1313 */
1314int sync_inode_metadata(struct inode *inode, int wait)
1315{
1316 struct writeback_control wbc = {
1317 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
1318 .nr_to_write = 0, /* metadata-only */
1319 };
1320
1321 return sync_inode(inode, &wbc);
1322}
1323EXPORT_SYMBOL(sync_inode_metadata);
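
sync_inode_metadata() is a thin wrapper: a writeback_control with nr_to_write set to 0 asks for no data pages at all, so sync_inode() writes just the inode, and wait selects WB_SYNC_ALL versus WB_SYNC_NONE. The wrapper modeled in userspace; every type here is a stand-in:

    #include <stdio.h>

    enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

    struct writeback_control {
        enum sync_mode sync_mode;
        long nr_to_write;
    };

    struct inode { unsigned long i_ino; };

    static int sync_inode(struct inode *inode, struct writeback_control *wbc)
    {
        printf("inode %lu: %s, data pages allowed: %ld\n", inode->i_ino,
               wbc->sync_mode == WB_SYNC_ALL ? "WB_SYNC_ALL" : "WB_SYNC_NONE",
               wbc->nr_to_write);
        return 0;
    }

    static int sync_inode_metadata(struct inode *inode, int wait)
    {
        struct writeback_control wbc = {
            .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
            .nr_to_write = 0,   /* metadata only */
        };

        return sync_inode(inode, &wbc);
    }

    int main(void)
    {
        struct inode ino = { .i_ino = 7 };

        return sync_inode_metadata(&ino, 1);
    }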
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index ed45a9cf5f3d..78b519c13536 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -4,6 +4,19 @@
4#include <linux/path.h> 4#include <linux/path.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/fs_struct.h> 6#include <linux/fs_struct.h>
7#include "internal.h"
8
9static inline void path_get_longterm(struct path *path)
10{
11 path_get(path);
12 mnt_make_longterm(path->mnt);
13}
14
15static inline void path_put_longterm(struct path *path)
16{
17 mnt_make_shortterm(path->mnt);
18 path_put(path);
19}
7 20
8/* 21/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 22 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -14,12 +27,14 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
14 struct path old_root; 27 struct path old_root;
15 28
16 spin_lock(&fs->lock); 29 spin_lock(&fs->lock);
30 write_seqcount_begin(&fs->seq);
17 old_root = fs->root; 31 old_root = fs->root;
18 fs->root = *path; 32 fs->root = *path;
19 path_get(path); 33 path_get_longterm(path);
34 write_seqcount_end(&fs->seq);
20 spin_unlock(&fs->lock); 35 spin_unlock(&fs->lock);
21 if (old_root.dentry) 36 if (old_root.dentry)
22 path_put(&old_root); 37 path_put_longterm(&old_root);
23} 38}
24 39
25/* 40/*
@@ -31,13 +46,15 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
31 struct path old_pwd; 46 struct path old_pwd;
32 47
33 spin_lock(&fs->lock); 48 spin_lock(&fs->lock);
49 write_seqcount_begin(&fs->seq);
34 old_pwd = fs->pwd; 50 old_pwd = fs->pwd;
35 fs->pwd = *path; 51 fs->pwd = *path;
36 path_get(path); 52 path_get_longterm(path);
53 write_seqcount_end(&fs->seq);
37 spin_unlock(&fs->lock); 54 spin_unlock(&fs->lock);
38 55
39 if (old_pwd.dentry) 56 if (old_pwd.dentry)
40 path_put(&old_pwd); 57 path_put_longterm(&old_pwd);
41} 58}
42 59
43void chroot_fs_refs(struct path *old_root, struct path *new_root) 60void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -52,31 +69,33 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
52 fs = p->fs; 69 fs = p->fs;
53 if (fs) { 70 if (fs) {
54 spin_lock(&fs->lock); 71 spin_lock(&fs->lock);
72 write_seqcount_begin(&fs->seq);
55 if (fs->root.dentry == old_root->dentry 73 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) { 74 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root); 75 path_get_longterm(new_root);
58 fs->root = *new_root; 76 fs->root = *new_root;
59 count++; 77 count++;
60 } 78 }
61 if (fs->pwd.dentry == old_root->dentry 79 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) { 80 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root); 81 path_get_longterm(new_root);
64 fs->pwd = *new_root; 82 fs->pwd = *new_root;
65 count++; 83 count++;
66 } 84 }
85 write_seqcount_end(&fs->seq);
67 spin_unlock(&fs->lock); 86 spin_unlock(&fs->lock);
68 } 87 }
69 task_unlock(p); 88 task_unlock(p);
70 } while_each_thread(g, p); 89 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock); 90 read_unlock(&tasklist_lock);
72 while (count--) 91 while (count--)
73 path_put(old_root); 92 path_put_longterm(old_root);
74} 93}
75 94
76void free_fs_struct(struct fs_struct *fs) 95void free_fs_struct(struct fs_struct *fs)
77{ 96{
78 path_put(&fs->root); 97 path_put_longterm(&fs->root);
79 path_put(&fs->pwd); 98 path_put_longterm(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs); 99 kmem_cache_free(fs_cachep, fs);
81} 100}
82 101
@@ -88,8 +107,10 @@ void exit_fs(struct task_struct *tsk)
88 int kill; 107 int kill;
89 task_lock(tsk); 108 task_lock(tsk);
90 spin_lock(&fs->lock); 109 spin_lock(&fs->lock);
110 write_seqcount_begin(&fs->seq);
91 tsk->fs = NULL; 111 tsk->fs = NULL;
92 kill = !--fs->users; 112 kill = !--fs->users;
113 write_seqcount_end(&fs->seq);
93 spin_unlock(&fs->lock); 114 spin_unlock(&fs->lock);
94 task_unlock(tsk); 115 task_unlock(tsk);
95 if (kill) 116 if (kill)
@@ -105,8 +126,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
105 fs->users = 1; 126 fs->users = 1;
106 fs->in_exec = 0; 127 fs->in_exec = 0;
107 spin_lock_init(&fs->lock); 128 spin_lock_init(&fs->lock);
129 seqcount_init(&fs->seq);
108 fs->umask = old->umask; 130 fs->umask = old->umask;
109 get_fs_root_and_pwd(old, &fs->root, &fs->pwd); 131
132 spin_lock(&old->lock);
133 fs->root = old->root;
134 path_get_longterm(&fs->root);
135 fs->pwd = old->pwd;
136 path_get_longterm(&fs->pwd);
137 spin_unlock(&old->lock);
110 } 138 }
111 return fs; 139 return fs;
112} 140}
@@ -144,6 +172,7 @@ EXPORT_SYMBOL(current_umask);
144struct fs_struct init_fs = { 172struct fs_struct init_fs = {
145 .users = 1, 173 .users = 1,
146 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), 174 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
175 .seq = SEQCNT_ZERO,
147 .umask = 0022, 176 .umask = 0022,
148}; 177};
149 178
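
The seqcount that fs_struct grows here complements, rather than replaces, the spinlock: writers still serialize on fs->lock, but because every update of fs->root and fs->pwd is bracketed by write_seqcount_begin/end, an RCU path walk can read both paths without the lock and simply retry if the counter moved underneath it. The reader/writer protocol in miniature, single-threaded and without the memory barriers the real primitives add:

    #include <stdio.h>

    static unsigned seq;
    static int root_id;

    static void set_root(int id)
    {
        seq++;          /* write_seqcount_begin: counter goes odd */
        root_id = id;
        seq++;          /* write_seqcount_end: counter even again */
    }

    static int read_root(void)
    {
        unsigned start;
        int val;

        do {
            start = seq;        /* read_seqcount_begin */
            val = root_id;
        } while ((start & 1) || seq != start);  /* writer active, retry */

        return val;
    }

    int main(void)
    {
        set_root(1);
        printf("root=%d\n", read_root());
        return 0;
    }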
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index b9f34eaede09..48a18f184d50 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
101 object->n_ops++; 101 object->n_ops++;
102 object->n_exclusive++; /* reads and writes must wait */ 102 object->n_exclusive++; /* reads and writes must wait */
103 103
104 if (object->n_ops > 0) { 104 if (object->n_ops > 1) {
105 atomic_inc(&op->usage); 105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops); 106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend); 107 fscache_stat(&fscache_n_op_pend);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 3773fd63d2f9..85542a7daf40 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -179,23 +179,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
179static const struct file_operations fuse_ctl_abort_ops = { 179static const struct file_operations fuse_ctl_abort_ops = {
180 .open = nonseekable_open, 180 .open = nonseekable_open,
181 .write = fuse_conn_abort_write, 181 .write = fuse_conn_abort_write,
182 .llseek = no_llseek,
182}; 183};
183 184
184static const struct file_operations fuse_ctl_waiting_ops = { 185static const struct file_operations fuse_ctl_waiting_ops = {
185 .open = nonseekable_open, 186 .open = nonseekable_open,
186 .read = fuse_conn_waiting_read, 187 .read = fuse_conn_waiting_read,
188 .llseek = no_llseek,
187}; 189};
188 190
189static const struct file_operations fuse_conn_max_background_ops = { 191static const struct file_operations fuse_conn_max_background_ops = {
190 .open = nonseekable_open, 192 .open = nonseekable_open,
191 .read = fuse_conn_max_background_read, 193 .read = fuse_conn_max_background_read,
192 .write = fuse_conn_max_background_write, 194 .write = fuse_conn_max_background_write,
195 .llseek = no_llseek,
193}; 196};
194 197
195static const struct file_operations fuse_conn_congestion_threshold_ops = { 198static const struct file_operations fuse_conn_congestion_threshold_ops = {
196 .open = nonseekable_open, 199 .open = nonseekable_open,
197 .read = fuse_conn_congestion_threshold_read, 200 .read = fuse_conn_congestion_threshold_read,
198 .write = fuse_conn_congestion_threshold_write, 201 .write = fuse_conn_congestion_threshold_write,
202 .llseek = no_llseek,
199}; 203};
200 204
201static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, 205static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
@@ -218,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
218 if (!inode) 222 if (!inode)
219 return NULL; 223 return NULL;
220 224
225 inode->i_ino = get_next_ino();
221 inode->i_mode = mode; 226 inode->i_mode = mode;
222 inode->i_uid = fc->user_id; 227 inode->i_uid = fc->user_id;
223 inode->i_gid = fc->group_id; 228 inode->i_gid = fc->group_id;
@@ -317,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
317 return 0; 322 return 0;
318} 323}
319 324
320static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags, 325static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
321 const char *dev_name, void *raw_data, 326 int flags, const char *dev_name, void *raw_data)
322 struct vfsmount *mnt)
323{ 327{
324 return get_sb_single(fs_type, flags, raw_data, 328 return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
325 fuse_ctl_fill_super, mnt);
326} 329}
327 330
328static void fuse_ctl_kill_sb(struct super_block *sb) 331static void fuse_ctl_kill_sb(struct super_block *sb)
@@ -341,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
341static struct file_system_type fuse_ctl_fs_type = { 344static struct file_system_type fuse_ctl_fs_type = {
342 .owner = THIS_MODULE, 345 .owner = THIS_MODULE,
343 .name = "fusectl", 346 .name = "fusectl",
344 .get_sb = fuse_ctl_get_sb, 347 .mount = fuse_ctl_mount,
345 .kill_sb = fuse_ctl_kill_sb, 348 .kill_sb = fuse_ctl_kill_sb,
346}; 349};
347 350
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e1f8171278bd..3e87cce5837d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -182,6 +182,7 @@ static const struct file_operations cuse_frontend_fops = {
182 .unlocked_ioctl = cuse_file_ioctl, 182 .unlocked_ioctl = cuse_file_ioctl,
183 .compat_ioctl = cuse_file_compat_ioctl, 183 .compat_ioctl = cuse_file_compat_ioctl,
184 .poll = fuse_file_poll, 184 .poll = fuse_file_poll,
185 .llseek = noop_llseek,
185}; 186};
186 187
187 188
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cde755cca564..cf8d28d1fbad 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -251,6 +251,20 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
251 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 251 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
252} 252}
253 253
254void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
255 u64 nodeid, u64 nlookup)
256{
257 forget->forget_one.nodeid = nodeid;
258 forget->forget_one.nlookup = nlookup;
259
260 spin_lock(&fc->lock);
261 fc->forget_list_tail->next = forget;
262 fc->forget_list_tail = forget;
263 wake_up(&fc->waitq);
264 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
265 spin_unlock(&fc->lock);
266}
267
254static void flush_bg_queue(struct fuse_conn *fc) 268static void flush_bg_queue(struct fuse_conn *fc)
255{ 269{
256 while (fc->active_background < fc->max_background && 270 while (fc->active_background < fc->max_background &&
@@ -438,12 +452,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
438 } 452 }
439} 453}
440 454
441void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
442{
443 req->isreply = 0;
444 fuse_request_send_nowait(fc, req);
445}
446
447void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) 455void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
448{ 456{
449 req->isreply = 1; 457 req->isreply = 1;
@@ -809,11 +817,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
809 int err; 817 int err;
810 struct page *page = *pagep; 818 struct page *page = *pagep;
811 819
812 if (page && zeroing && count < PAGE_SIZE) { 820 if (page && zeroing && count < PAGE_SIZE)
813 void *mapaddr = kmap_atomic(page, KM_USER1); 821 clear_highpage(page);
814 memset(mapaddr, 0, PAGE_SIZE); 822
815 kunmap_atomic(mapaddr, KM_USER1);
816 }
817 while (count) { 823 while (count) {
818 if (cs->write && cs->pipebufs && page) { 824 if (cs->write && cs->pipebufs && page) {
819 return fuse_ref_page(cs, page, offset, count); 825 return fuse_ref_page(cs, page, offset, count);
@@ -830,10 +836,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
830 } 836 }
831 } 837 }
832 if (page) { 838 if (page) {
833 void *mapaddr = kmap_atomic(page, KM_USER1); 839 void *mapaddr = kmap_atomic(page, KM_USER0);
834 void *buf = mapaddr + offset; 840 void *buf = mapaddr + offset;
835 offset += fuse_copy_do(cs, &buf, &count); 841 offset += fuse_copy_do(cs, &buf, &count);
836 kunmap_atomic(mapaddr, KM_USER1); 842 kunmap_atomic(mapaddr, KM_USER0);
837 } else 843 } else
838 offset += fuse_copy_do(cs, NULL, &count); 844 offset += fuse_copy_do(cs, NULL, &count);
839 } 845 }
@@ -898,9 +904,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
898 return err; 904 return err;
899} 905}
900 906
907static int forget_pending(struct fuse_conn *fc)
908{
909 return fc->forget_list_head.next != NULL;
910}
911
901static int request_pending(struct fuse_conn *fc) 912static int request_pending(struct fuse_conn *fc)
902{ 913{
903 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); 914 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
915 forget_pending(fc);
904} 916}
905 917
906/* Wait until a request is available on the pending list */ 918/* Wait until a request is available on the pending list */
@@ -962,6 +974,120 @@ __releases(fc->lock)
962 return err ? err : reqsize; 974 return err ? err : reqsize;
963} 975}
964 976
977static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
978 unsigned max,
979 unsigned *countp)
980{
981 struct fuse_forget_link *head = fc->forget_list_head.next;
982 struct fuse_forget_link **newhead = &head;
983 unsigned count;
984
985 for (count = 0; *newhead != NULL && count < max; count++)
986 newhead = &(*newhead)->next;
987
988 fc->forget_list_head.next = *newhead;
989 *newhead = NULL;
990 if (fc->forget_list_head.next == NULL)
991 fc->forget_list_tail = &fc->forget_list_head;
992
993 if (countp != NULL)
994 *countp = count;
995
996 return head;
997}
998
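
dequeue_forget() is the consumer side of that queue: it detaches up to max entries in one pass by walking a pointer-to-pointer down the list, leaves the remainder queued, and resets the tail to the dummy head once the queue drains. The same walk in a self-contained sketch (invented names, no locking):

#include <stdio.h>
#include <stdlib.h>

struct link { int id; struct link *next; };
struct fifo { struct link head; struct link *tail; };

static struct link *dequeue_batch(struct fifo *f, unsigned max,
				  unsigned *countp)
{
	struct link *batch = f->head.next;
	struct link **newhead = &batch;
	unsigned count;

	/* advance past at most max nodes */
	for (count = 0; *newhead != NULL && count < max; count++)
		newhead = &(*newhead)->next;

	f->head.next = *newhead;	/* remainder stays queued */
	*newhead = NULL;		/* terminate the detached batch */
	if (f->head.next == NULL)
		f->tail = &f->head;	/* drained: reset tail to dummy head */

	if (countp)
		*countp = count;
	return batch;
}

int main(void)
{
	struct fifo f = { .tail = &f.head };
	struct link *l, *next;
	unsigned n;
	int i;

	for (i = 1; i <= 5; i++) {
		l = calloc(1, sizeof(*l));
		if (!l)
			return 1;
		l->id = i;
		f.tail->next = l;
		f.tail = l;
	}

	while ((l = dequeue_batch(&f, 3, &n)) != NULL) {
		printf("batch of %u:", n);
		for (; l; l = next) {
			next = l->next;
			printf(" %d", l->id);
			free(l);
		}
		printf("\n");
	}
	return 0;
}
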
999static int fuse_read_single_forget(struct fuse_conn *fc,
1000 struct fuse_copy_state *cs,
1001 size_t nbytes)
1002__releases(fc->lock)
1003{
1004 int err;
1005 struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
1006 struct fuse_forget_in arg = {
1007 .nlookup = forget->forget_one.nlookup,
1008 };
1009 struct fuse_in_header ih = {
1010 .opcode = FUSE_FORGET,
1011 .nodeid = forget->forget_one.nodeid,
1012 .unique = fuse_get_unique(fc),
1013 .len = sizeof(ih) + sizeof(arg),
1014 };
1015
1016 spin_unlock(&fc->lock);
1017 kfree(forget);
1018 if (nbytes < ih.len)
1019 return -EINVAL;
1020
1021 err = fuse_copy_one(cs, &ih, sizeof(ih));
1022 if (!err)
1023 err = fuse_copy_one(cs, &arg, sizeof(arg));
1024 fuse_copy_finish(cs);
1025
1026 if (err)
1027 return err;
1028
1029 return ih.len;
1030}
1031
1032static int fuse_read_batch_forget(struct fuse_conn *fc,
1033 struct fuse_copy_state *cs, size_t nbytes)
1034__releases(fc->lock)
1035{
1036 int err;
1037 unsigned max_forgets;
1038 unsigned count;
1039 struct fuse_forget_link *head;
1040 struct fuse_batch_forget_in arg = { .count = 0 };
1041 struct fuse_in_header ih = {
1042 .opcode = FUSE_BATCH_FORGET,
1043 .unique = fuse_get_unique(fc),
1044 .len = sizeof(ih) + sizeof(arg),
1045 };
1046
1047 if (nbytes < ih.len) {
1048 spin_unlock(&fc->lock);
1049 return -EINVAL;
1050 }
1051
1052 max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
1053 head = dequeue_forget(fc, max_forgets, &count);
1054 spin_unlock(&fc->lock);
1055
1056 arg.count = count;
1057 ih.len += count * sizeof(struct fuse_forget_one);
1058 err = fuse_copy_one(cs, &ih, sizeof(ih));
1059 if (!err)
1060 err = fuse_copy_one(cs, &arg, sizeof(arg));
1061
1062 while (head) {
1063 struct fuse_forget_link *forget = head;
1064
1065 if (!err) {
1066 err = fuse_copy_one(cs, &forget->forget_one,
1067 sizeof(forget->forget_one));
1068 }
1069 head = forget->next;
1070 kfree(forget);
1071 }
1072
1073 fuse_copy_finish(cs);
1074
1075 if (err)
1076 return err;
1077
1078 return ih.len;
1079}
1080
1081static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
1082 size_t nbytes)
1083__releases(fc->lock)
1084{
1085 if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
1086 return fuse_read_single_forget(fc, cs, nbytes);
1087 else
1088 return fuse_read_batch_forget(fc, cs, nbytes);
1089}
1090
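
fuse_read_forget() picks the wire format: servers that negotiated a protocol minor below 16 only understand the classic one-per-message FUSE_FORGET, and a queue holding a single entry uses it too, since for one forget the batch framing is actually larger. The arithmetic, using struct layouts that mirror the FUSE ABI (the exact sizes below are illustrative, not normative):

#include <stdio.h>
#include <stdint.h>

struct fuse_in_header {
	uint32_t len; uint32_t opcode; uint64_t unique; uint64_t nodeid;
	uint32_t uid; uint32_t gid; uint32_t pid; uint32_t padding;
};
struct fuse_forget_in { uint64_t nlookup; };
struct fuse_batch_forget_in { uint32_t count; uint32_t dummy; };
struct fuse_forget_one { uint64_t nodeid; uint64_t nlookup; };

int main(void)
{
	unsigned n;

	for (n = 1; n <= 64; n *= 4) {
		size_t single = n * (sizeof(struct fuse_in_header) +
				     sizeof(struct fuse_forget_in));
		size_t batch = sizeof(struct fuse_in_header) +
			       sizeof(struct fuse_batch_forget_in) +
			       n * sizeof(struct fuse_forget_one);
		printf("%2u forgets: %4zu bytes single, %4zu batched\n",
		       n, single, batch);
	}
	return 0;
}

For one forget this prints 48 bytes single versus 64 batched, which is exactly why the single form keeps the single-entry case.
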
965/* 1091/*
966 * Read a single request into the userspace filesystem's buffer. This 1092 * Read a single request into the userspace filesystem's buffer. This
967 * function waits until a request is available, then removes it from 1093 * function waits until a request is available, then removes it from
@@ -1000,6 +1126,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
1000 return fuse_read_interrupt(fc, cs, nbytes, req); 1126 return fuse_read_interrupt(fc, cs, nbytes, req);
1001 } 1127 }
1002 1128
1129 if (forget_pending(fc)) {
1130 if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
1131 return fuse_read_forget(fc, cs, nbytes);
1132
1133 if (fc->forget_batch <= -8)
1134 fc->forget_batch = 16;
1135 }
1136
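
The forget_batch counter is a small fairness heuristic: while both forgets and ordinary requests are pending, forgets win as long as the counter is positive; it then goes negative and ordinary requests are served until it reaches -8, at which point it is rearmed to 16. In steady state that interleaves bursts of sixteen forgets with roughly eight or nine regular requests. A simulation of the same logic (the harness is invented; the constants are the ones in the hunk, and the counter is assumed to start at zero since the connection is zero-allocated):

#include <stdio.h>

int main(void)
{
	int forget_batch = 0;
	int i;

	for (i = 0; i < 60; i++) {	/* both queues stay busy */
		if (forget_batch-- > 0) {
			putchar('F');	/* serve a forget */
			continue;
		}
		if (forget_batch <= -8)
			forget_batch = 16;
		putchar('R');		/* serve a regular request */
	}
	putchar('\n');
	return 0;
}
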
1003 req = list_entry(fc->pending.next, struct fuse_req, list); 1137 req = list_entry(fc->pending.next, struct fuse_req, list);
1004 req->state = FUSE_REQ_READING; 1138 req->state = FUSE_REQ_READING;
1005 list_move(&req->list, &fc->io); 1139 list_move(&req->list, &fc->io);
@@ -1092,7 +1226,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1092 if (!fc) 1226 if (!fc)
1093 return -EPERM; 1227 return -EPERM;
1094 1228
1095 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1229 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1096 if (!bufs) 1230 if (!bufs)
1097 return -ENOMEM; 1231 return -ENOMEM;
1098 1232
@@ -1336,12 +1470,7 @@ out_finish:
1336 1470
1337static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) 1471static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
1338{ 1472{
1339 int i; 1473 release_pages(req->pages, req->num_pages, 0);
1340
1341 for (i = 0; i < req->num_pages; i++) {
1342 struct page *page = req->pages[i];
1343 page_cache_release(page);
1344 }
1345} 1474}
1346 1475
1347static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, 1476static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
@@ -1633,7 +1762,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1633 if (!fc) 1762 if (!fc)
1634 return -EPERM; 1763 return -EPERM;
1635 1764
1636 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1765 bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1637 if (!bufs) 1766 if (!bufs)
1638 return -ENOMEM; 1767 return -ENOMEM;
1639 1768
@@ -1777,6 +1906,8 @@ __acquires(fc->lock)
1777 flush_bg_queue(fc); 1906 flush_bg_queue(fc);
1778 end_requests(fc, &fc->pending); 1907 end_requests(fc, &fc->pending);
1779 end_requests(fc, &fc->processing); 1908 end_requests(fc, &fc->processing);
1909 while (forget_pending(fc))
1910 kfree(dequeue_forget(fc, 1, NULL));
1780} 1911}
1781 1912
1782/* 1913/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c9627c95482d..bfed8447ed80 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -10,9 +10,9 @@
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/gfp.h>
14#include <linux/sched.h> 13#include <linux/sched.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/slab.h>
16 16
17#if BITS_PER_LONG >= 64 17#if BITS_PER_LONG >= 64
18static inline void fuse_dentry_settime(struct dentry *entry, u64 time) 18static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
@@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
156 */ 156 */
157static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) 157static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
158{ 158{
159 struct inode *inode = entry->d_inode; 159 struct inode *inode;
160 160
161 if (nd->flags & LOOKUP_RCU)
162 return -ECHILD;
163
164 inode = entry->d_inode;
161 if (inode && is_bad_inode(inode)) 165 if (inode && is_bad_inode(inode))
162 return 0; 166 return 0;
163 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 167 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -165,7 +169,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
165 struct fuse_entry_out outarg; 169 struct fuse_entry_out outarg;
166 struct fuse_conn *fc; 170 struct fuse_conn *fc;
167 struct fuse_req *req; 171 struct fuse_req *req;
168 struct fuse_req *forget_req; 172 struct fuse_forget_link *forget;
169 struct dentry *parent; 173 struct dentry *parent;
170 u64 attr_version; 174 u64 attr_version;
171 175
@@ -178,8 +182,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
178 if (IS_ERR(req)) 182 if (IS_ERR(req))
179 return 0; 183 return 0;
180 184
181 forget_req = fuse_get_req(fc); 185 forget = fuse_alloc_forget();
182 if (IS_ERR(forget_req)) { 186 if (!forget) {
183 fuse_put_request(fc, req); 187 fuse_put_request(fc, req);
184 return 0; 188 return 0;
185 } 189 }
@@ -199,15 +203,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
199 if (!err) { 203 if (!err) {
200 struct fuse_inode *fi = get_fuse_inode(inode); 204 struct fuse_inode *fi = get_fuse_inode(inode);
201 if (outarg.nodeid != get_node_id(inode)) { 205 if (outarg.nodeid != get_node_id(inode)) {
202 fuse_send_forget(fc, forget_req, 206 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
203 outarg.nodeid, 1);
204 return 0; 207 return 0;
205 } 208 }
206 spin_lock(&fc->lock); 209 spin_lock(&fc->lock);
207 fi->nlookup++; 210 fi->nlookup++;
208 spin_unlock(&fc->lock); 211 spin_unlock(&fc->lock);
209 } 212 }
210 fuse_put_request(fc, forget_req); 213 kfree(forget);
211 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 214 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
212 return 0; 215 return 0;
213 216
@@ -259,7 +262,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
259{ 262{
260 struct fuse_conn *fc = get_fuse_conn_super(sb); 263 struct fuse_conn *fc = get_fuse_conn_super(sb);
261 struct fuse_req *req; 264 struct fuse_req *req;
262 struct fuse_req *forget_req; 265 struct fuse_forget_link *forget;
263 u64 attr_version; 266 u64 attr_version;
264 int err; 267 int err;
265 268
@@ -273,9 +276,9 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
273 if (IS_ERR(req)) 276 if (IS_ERR(req))
274 goto out; 277 goto out;
275 278
276 forget_req = fuse_get_req(fc); 279 forget = fuse_alloc_forget();
277 err = PTR_ERR(forget_req); 280 err = -ENOMEM;
278 if (IS_ERR(forget_req)) { 281 if (!forget) {
279 fuse_put_request(fc, req); 282 fuse_put_request(fc, req);
280 goto out; 283 goto out;
281 } 284 }
@@ -301,13 +304,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
301 attr_version); 304 attr_version);
302 err = -ENOMEM; 305 err = -ENOMEM;
303 if (!*inode) { 306 if (!*inode) {
304 fuse_send_forget(fc, forget_req, outarg->nodeid, 1); 307 fuse_queue_forget(fc, forget, outarg->nodeid, 1);
305 goto out; 308 goto out;
306 } 309 }
307 err = 0; 310 err = 0;
308 311
309 out_put_forget: 312 out_put_forget:
310 fuse_put_request(fc, forget_req); 313 kfree(forget);
311 out: 314 out:
312 return err; 315 return err;
313} 316}
@@ -347,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
347 } 350 }
348 351
349 entry = newent ? newent : entry; 352 entry = newent ? newent : entry;
350 entry->d_op = &fuse_dentry_operations;
351 if (outarg_valid) 353 if (outarg_valid)
352 fuse_change_entry_timeout(entry, &outarg); 354 fuse_change_entry_timeout(entry, &outarg);
353 else 355 else
@@ -374,7 +376,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
374 struct inode *inode; 376 struct inode *inode;
375 struct fuse_conn *fc = get_fuse_conn(dir); 377 struct fuse_conn *fc = get_fuse_conn(dir);
376 struct fuse_req *req; 378 struct fuse_req *req;
377 struct fuse_req *forget_req; 379 struct fuse_forget_link *forget;
378 struct fuse_create_in inarg; 380 struct fuse_create_in inarg;
379 struct fuse_open_out outopen; 381 struct fuse_open_out outopen;
380 struct fuse_entry_out outentry; 382 struct fuse_entry_out outentry;
@@ -388,9 +390,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
388 if (flags & O_DIRECT) 390 if (flags & O_DIRECT)
389 return -EINVAL; 391 return -EINVAL;
390 392
391 forget_req = fuse_get_req(fc); 393 forget = fuse_alloc_forget();
392 if (IS_ERR(forget_req)) 394 if (!forget)
393 return PTR_ERR(forget_req); 395 return -ENOMEM;
394 396
395 req = fuse_get_req(fc); 397 req = fuse_get_req(fc);
396 err = PTR_ERR(req); 398 err = PTR_ERR(req);
@@ -448,10 +450,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
448 if (!inode) { 450 if (!inode) {
449 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 451 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
450 fuse_sync_release(ff, flags); 452 fuse_sync_release(ff, flags);
451 fuse_send_forget(fc, forget_req, outentry.nodeid, 1); 453 fuse_queue_forget(fc, forget, outentry.nodeid, 1);
452 return -ENOMEM; 454 return -ENOMEM;
453 } 455 }
454 fuse_put_request(fc, forget_req); 456 kfree(forget);
455 d_instantiate(entry, inode); 457 d_instantiate(entry, inode);
456 fuse_change_entry_timeout(entry, &outentry); 458 fuse_change_entry_timeout(entry, &outentry);
457 fuse_invalidate_attr(dir); 459 fuse_invalidate_attr(dir);
@@ -469,7 +471,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
469 out_put_request: 471 out_put_request:
470 fuse_put_request(fc, req); 472 fuse_put_request(fc, req);
471 out_put_forget_req: 473 out_put_forget_req:
472 fuse_put_request(fc, forget_req); 474 kfree(forget);
473 return err; 475 return err;
474} 476}
475 477
@@ -483,12 +485,12 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
483 struct fuse_entry_out outarg; 485 struct fuse_entry_out outarg;
484 struct inode *inode; 486 struct inode *inode;
485 int err; 487 int err;
486 struct fuse_req *forget_req; 488 struct fuse_forget_link *forget;
487 489
488 forget_req = fuse_get_req(fc); 490 forget = fuse_alloc_forget();
489 if (IS_ERR(forget_req)) { 491 if (!forget) {
490 fuse_put_request(fc, req); 492 fuse_put_request(fc, req);
491 return PTR_ERR(forget_req); 493 return -ENOMEM;
492 } 494 }
493 495
494 memset(&outarg, 0, sizeof(outarg)); 496 memset(&outarg, 0, sizeof(outarg));
@@ -515,10 +517,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
515 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 517 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
516 &outarg.attr, entry_attr_timeout(&outarg), 0); 518 &outarg.attr, entry_attr_timeout(&outarg), 0);
517 if (!inode) { 519 if (!inode) {
518 fuse_send_forget(fc, forget_req, outarg.nodeid, 1); 520 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
519 return -ENOMEM; 521 return -ENOMEM;
520 } 522 }
521 fuse_put_request(fc, forget_req); 523 kfree(forget);
522 524
523 if (S_ISDIR(inode->i_mode)) { 525 if (S_ISDIR(inode->i_mode)) {
524 struct dentry *alias; 526 struct dentry *alias;
@@ -541,7 +543,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
541 return 0; 543 return 0;
542 544
543 out_put_forget_req: 545 out_put_forget_req:
544 fuse_put_request(fc, forget_req); 546 kfree(forget);
545 return err; 547 return err;
546} 548}
547 549
@@ -981,12 +983,15 @@ static int fuse_access(struct inode *inode, int mask)
981 * access request is sent. Execute permission is still checked 983 * access request is sent. Execute permission is still checked
982 * locally based on file mode. 984 * locally based on file mode.
983 */ 985 */
984static int fuse_permission(struct inode *inode, int mask) 986static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
985{ 987{
986 struct fuse_conn *fc = get_fuse_conn(inode); 988 struct fuse_conn *fc = get_fuse_conn(inode);
987 bool refreshed = false; 989 bool refreshed = false;
988 int err = 0; 990 int err = 0;
989 991
992 if (flags & IPERM_FLAG_RCU)
993 return -ECHILD;
994
990 if (!fuse_allow_task(fc, current)) 995 if (!fuse_allow_task(fc, current))
991 return -EACCES; 996 return -EACCES;
992 997
@@ -1001,7 +1006,7 @@ static int fuse_permission(struct inode *inode, int mask)
1001 } 1006 }
1002 1007
1003 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { 1008 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
1004 err = generic_permission(inode, mask, NULL); 1009 err = generic_permission(inode, mask, flags, NULL);
1005 1010
1006 /* If permission is denied, try to refresh file 1011 /* If permission is denied, try to refresh file
1007 attributes. This is also needed, because the root 1012 attributes. This is also needed, because the root
@@ -1009,7 +1014,8 @@ static int fuse_permission(struct inode *inode, int mask)
1009 if (err == -EACCES && !refreshed) { 1014 if (err == -EACCES && !refreshed) {
1010 err = fuse_do_getattr(inode, NULL, NULL); 1015 err = fuse_do_getattr(inode, NULL, NULL);
1011 if (!err) 1016 if (!err)
1012 err = generic_permission(inode, mask, NULL); 1017 err = generic_permission(inode, mask,
1018 flags, NULL);
1013 } 1019 }
1014 1020
1015 /* Note: the opposite of the above test does not 1021 /* Note: the opposite of the above test does not
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c8224587123f..95da1bc1c826 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -13,6 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/compat.h>
16 17
17static const struct file_operations fuse_direct_io_file_operations; 18static const struct file_operations fuse_direct_io_file_operations;
18 19
@@ -134,6 +135,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
134void fuse_finish_open(struct inode *inode, struct file *file) 135void fuse_finish_open(struct inode *inode, struct file *file)
135{ 136{
136 struct fuse_file *ff = file->private_data; 137 struct fuse_file *ff = file->private_data;
138 struct fuse_conn *fc = get_fuse_conn(inode);
137 139
138 if (ff->open_flags & FOPEN_DIRECT_IO) 140 if (ff->open_flags & FOPEN_DIRECT_IO)
139 file->f_op = &fuse_direct_io_file_operations; 141 file->f_op = &fuse_direct_io_file_operations;
@@ -141,6 +143,15 @@ void fuse_finish_open(struct inode *inode, struct file *file)
141 invalidate_inode_pages2(inode->i_mapping); 143 invalidate_inode_pages2(inode->i_mapping);
142 if (ff->open_flags & FOPEN_NONSEEKABLE) 144 if (ff->open_flags & FOPEN_NONSEEKABLE)
143 nonseekable_open(inode, file); 145 nonseekable_open(inode, file);
146 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
147 struct fuse_inode *fi = get_fuse_inode(inode);
148
149 spin_lock(&fc->lock);
150 fi->attr_version = ++fc->attr_version;
151 i_size_write(inode, 0);
152 spin_unlock(&fc->lock);
153 fuse_invalidate_attr(inode);
154 }
144} 155}
145 156
146int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 157int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
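
The new block in fuse_finish_open() handles atomic O_TRUNC: when the server truncates as part of the open, the kernel zeroes its cached i_size under fc->lock and bumps attr_version, so attribute replies computed before the truncate can no longer overwrite the new size. The version guard in isolation (a sketch with invented names, not the kernel code path):

#include <stdio.h>
#include <stdint.h>

struct cached_attr {
	uint64_t version;	/* bumped on every local change */
	uint64_t size;
};

/* Apply a reply that was computed when the cache was at `seen`. */
static void apply_reply(struct cached_attr *a, uint64_t seen,
			uint64_t new_size)
{
	if (a->version != seen) {
		printf("stale reply (v%llu vs v%llu), dropped\n",
		       (unsigned long long)seen,
		       (unsigned long long)a->version);
		return;
	}
	a->size = new_size;
}

int main(void)
{
	struct cached_attr a = { .version = 1, .size = 4096 };
	uint64_t seen = a.version;	/* getattr request sent here */

	a.version++;			/* local O_TRUNC: size = 0 */
	a.size = 0;

	apply_reply(&a, seen, 4096);	/* reply from before the truncate */
	printf("size = %llu\n", (unsigned long long)a.size);
	return 0;
}
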
@@ -1618,6 +1629,94 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1618} 1629}
1619 1630
1620/* 1631/*
1632 * CUSE servers compiled on 32bit broke on 64bit kernels because the
1633 * ABI was defined to be 'struct iovec' which is different on 32bit
1634 * and 64bit. Fortunately we can determine which structure the server
1635 * used from the size of the reply.
1636 */
1637static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
1638 size_t transferred, unsigned count,
1639 bool is_compat)
1640{
1641#ifdef CONFIG_COMPAT
1642 if (count * sizeof(struct compat_iovec) == transferred) {
1643 struct compat_iovec *ciov = src;
1644 unsigned i;
1645
1646 /*
1647 * With this interface a 32bit server cannot support
1648 * non-compat (i.e. ones coming from 64bit apps) ioctl
1649 * requests
1650 */
1651 if (!is_compat)
1652 return -EINVAL;
1653
1654 for (i = 0; i < count; i++) {
1655 dst[i].iov_base = compat_ptr(ciov[i].iov_base);
1656 dst[i].iov_len = ciov[i].iov_len;
1657 }
1658 return 0;
1659 }
1660#endif
1661
1662 if (count * sizeof(struct iovec) != transferred)
1663 return -EIO;
1664
1665 memcpy(dst, src, transferred);
1666 return 0;
1667}
1668
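
fuse_copy_ioctl_iovec_old() disambiguates purely by size: an iovec is 8 bytes in 32-bit userspace and 16 bytes in 64-bit userspace, so count * sizeof() identifies which layout the server wrote. The same test in miniature (a hypothetical standalone demo, not kernel code):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct compat_iovec32 { uint32_t iov_base; uint32_t iov_len; };
struct iovec64 { uint64_t iov_base; uint64_t iov_len; };

static const char *classify(size_t transferred, unsigned count)
{
	if (count * sizeof(struct compat_iovec32) == transferred)
		return "32-bit server reply";
	if (count * sizeof(struct iovec64) == transferred)
		return "native 64-bit reply";
	return "corrupt reply";
}

int main(void)
{
	printf("%s\n", classify(24, 3));	/* 3 * 8  -> 32-bit  */
	printf("%s\n", classify(48, 3));	/* 3 * 16 -> 64-bit  */
	printf("%s\n", classify(40, 3));	/* neither -> -EIO   */
	return 0;
}
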
1669/* Make sure iov_length() won't overflow */
1670static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1671{
1672 size_t n;
1673 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
1674
1675 for (n = 0; n < count; n++, iov++) {
1676 if (iov->iov_len > (size_t) max)
1677 return -ENOMEM;
1678 max -= iov->iov_len;
1679 }
1680 return 0;
1681}
1682
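
fuse_verify_ioctl_iov() guards iov_length() against overflow by checking each server-supplied length against a decrementing budget instead of summing, because a sum of attacker-controlled size_t values can wrap while the per-entry comparison cannot. The difference in a standalone demo (the 16 MiB cap is an invented stand-in for the real per-request limit):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define MAX_TOTAL (16u << 20)	/* assumed cap for this demo */

static int verify(const size_t *lens, size_t count)
{
	uint32_t budget = MAX_TOTAL;
	size_t n;

	for (n = 0; n < count; n++) {
		if (lens[n] > (size_t)budget)
			return -1;	/* would exceed the cap */
		budget -= lens[n];	/* never underflows: checked above */
	}
	return 0;
}

int main(void)
{
	/* Two huge lengths whose naive sum wraps to a small number. */
	size_t evil[] = { SIZE_MAX / 2 + 1, SIZE_MAX / 2 + 1 };
	size_t sum = evil[0] + evil[1];	/* wraps to 0 */

	printf("naive sum: %zu (wrapped!)\n", sum);
	printf("budget check: %s\n", verify(evil, 2) ? "rejected" : "accepted");
	return 0;
}
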
1683static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
1684 void *src, size_t transferred, unsigned count,
1685 bool is_compat)
1686{
1687 unsigned i;
1688 struct fuse_ioctl_iovec *fiov = src;
1689
1690 if (fc->minor < 16) {
1691 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
1692 count, is_compat);
1693 }
1694
1695 if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
1696 return -EIO;
1697
1698 for (i = 0; i < count; i++) {
1699 /* Did the server supply an inappropriate value? */
1700 if (fiov[i].base != (unsigned long) fiov[i].base ||
1701 fiov[i].len != (unsigned long) fiov[i].len)
1702 return -EIO;
1703
1704 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
1705 dst[i].iov_len = (size_t) fiov[i].len;
1706
1707#ifdef CONFIG_COMPAT
1708 if (is_compat &&
1709 (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
1710 (compat_size_t) dst[i].iov_len != fiov[i].len))
1711 return -EIO;
1712#endif
1713 }
1714
1715 return 0;
1716}
1717
1718
1719/*
1621 * For ioctls, there is no generic way to determine how much memory 1720 * For ioctls, there is no generic way to determine how much memory
1622 * needs to be read and/or written. Furthermore, ioctls are allowed 1721 * needs to be read and/or written. Furthermore, ioctls are allowed
1623 * to dereference the passed pointer, so the parameter requires deep 1722 * to dereference the passed pointer, so the parameter requires deep
@@ -1677,18 +1776,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1677 struct fuse_ioctl_out outarg; 1776 struct fuse_ioctl_out outarg;
1678 struct fuse_req *req = NULL; 1777 struct fuse_req *req = NULL;
1679 struct page **pages = NULL; 1778 struct page **pages = NULL;
1680 struct page *iov_page = NULL; 1779 struct iovec *iov_page = NULL;
1681 struct iovec *in_iov = NULL, *out_iov = NULL; 1780 struct iovec *in_iov = NULL, *out_iov = NULL;
1682 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; 1781 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1683 size_t in_size, out_size, transferred; 1782 size_t in_size, out_size, transferred;
1684 int err; 1783 int err;
1685 1784
1785#if BITS_PER_LONG == 32
1786 inarg.flags |= FUSE_IOCTL_32BIT;
1787#else
1788 if (flags & FUSE_IOCTL_COMPAT)
1789 inarg.flags |= FUSE_IOCTL_32BIT;
1790#endif
1791
1686 /* assume all the iovs returned by client always fits in a page */ 1792 /* assume all the iovs returned by client always fits in a page */
1687 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1793 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1688 1794
1689 err = -ENOMEM; 1795 err = -ENOMEM;
1690 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1796 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1691 iov_page = alloc_page(GFP_KERNEL); 1797 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1692 if (!pages || !iov_page) 1798 if (!pages || !iov_page)
1693 goto out; 1799 goto out;
1694 1800
@@ -1697,7 +1803,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1697 * RETRY from server is not allowed. 1803 * RETRY from server is not allowed.
1698 */ 1804 */
1699 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { 1805 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1700 struct iovec *iov = page_address(iov_page); 1806 struct iovec *iov = iov_page;
1701 1807
1702 iov->iov_base = (void __user *)arg; 1808 iov->iov_base = (void __user *)arg;
1703 iov->iov_len = _IOC_SIZE(cmd); 1809 iov->iov_len = _IOC_SIZE(cmd);
@@ -1778,7 +1884,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1778 1884
1779 /* did it ask for retry? */ 1885 /* did it ask for retry? */
1780 if (outarg.flags & FUSE_IOCTL_RETRY) { 1886 if (outarg.flags & FUSE_IOCTL_RETRY) {
1781 char *vaddr; 1887 void *vaddr;
1782 1888
1783 /* no retry if in restricted mode */ 1889 /* no retry if in restricted mode */
1784 err = -EIO; 1890 err = -EIO;
@@ -1798,18 +1904,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1798 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) 1904 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1799 goto out; 1905 goto out;
1800 1906
1801 err = -EIO;
1802 if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
1803 goto out;
1804
1805 /* okay, copy in iovs and retry */
1806 vaddr = kmap_atomic(pages[0], KM_USER0); 1907 vaddr = kmap_atomic(pages[0], KM_USER0);
1807 memcpy(page_address(iov_page), vaddr, transferred); 1908 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
1909 transferred, in_iovs + out_iovs,
1910 (flags & FUSE_IOCTL_COMPAT) != 0);
1808 kunmap_atomic(vaddr, KM_USER0); 1911 kunmap_atomic(vaddr, KM_USER0);
1912 if (err)
1913 goto out;
1809 1914
1810 in_iov = page_address(iov_page); 1915 in_iov = iov_page;
1811 out_iov = in_iov + in_iovs; 1916 out_iov = in_iov + in_iovs;
1812 1917
1918 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
1919 if (err)
1920 goto out;
1921
1922 err = fuse_verify_ioctl_iov(out_iov, out_iovs);
1923 if (err)
1924 goto out;
1925
1813 goto retry; 1926 goto retry;
1814 } 1927 }
1815 1928
@@ -1821,8 +1934,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1821 out: 1934 out:
1822 if (req) 1935 if (req)
1823 fuse_put_request(fc, req); 1936 fuse_put_request(fc, req);
1824 if (iov_page) 1937 free_page((unsigned long) iov_page);
1825 __free_page(iov_page);
1826 while (num_pages) 1938 while (num_pages)
1827 __free_page(pages[--num_pages]); 1939 __free_page(pages[--num_pages]);
1828 kfree(pages); 1940 kfree(pages);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 57d4a3a0f102..ae5744a2f9e9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -53,6 +53,12 @@ extern struct mutex fuse_mutex;
53extern unsigned max_user_bgreq; 53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh; 54extern unsigned max_user_congthresh;
55 55
56/* One forget request */
57struct fuse_forget_link {
58 struct fuse_forget_one forget_one;
59 struct fuse_forget_link *next;
60};
61
56/** FUSE inode */ 62/** FUSE inode */
57struct fuse_inode { 63struct fuse_inode {
58 /** Inode data */ 64 /** Inode data */
@@ -66,7 +72,7 @@ struct fuse_inode {
66 u64 nlookup; 72 u64 nlookup;
67 73
68 /** The request used for sending the FORGET message */ 74 /** The request used for sending the FORGET message */
69 struct fuse_req *forget_req; 75 struct fuse_forget_link *forget;
70 76
71 /** Time in jiffies until the file attributes are valid */ 77 /** Time in jiffies until the file attributes are valid */
72 u64 i_time; 78 u64 i_time;
@@ -255,7 +261,6 @@ struct fuse_req {
255 261
256 /** Data for asynchronous requests */ 262 /** Data for asynchronous requests */
257 union { 263 union {
258 struct fuse_forget_in forget_in;
259 struct { 264 struct {
260 struct fuse_release_in in; 265 struct fuse_release_in in;
261 struct path path; 266 struct path path;
@@ -369,6 +374,13 @@ struct fuse_conn {
369 /** Pending interrupts */ 374 /** Pending interrupts */
370 struct list_head interrupts; 375 struct list_head interrupts;
371 376
377 /** Queue of pending forgets */
378 struct fuse_forget_link forget_list_head;
379 struct fuse_forget_link *forget_list_tail;
380
381 /** Batching of FORGET requests (positive indicates FORGET batch) */
382 int forget_batch;
383
372 /** Flag indicating if connection is blocked. This will be 384 /** Flag indicating if connection is blocked. This will be
373 the case before the INIT reply is received, and if there 385 the case before the INIT reply is received, and if there
374 are too many outstanding background requests */ 386 are too many outstanding background requests */
@@ -543,8 +555,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
543/** 555/**
544 * Send FORGET command 556 * Send FORGET command
545 */ 557 */
546void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 558void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
547 u64 nodeid, u64 nlookup); 559 u64 nodeid, u64 nlookup);
560
561struct fuse_forget_link *fuse_alloc_forget(void);
548 562
549/** 563/**
550 * Initialize READ or READDIR request 564 * Initialize READ or READDIR request
@@ -656,11 +670,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
656void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); 670void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
657 671
658/** 672/**
659 * Send a request with no reply
660 */
661void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
662
663/**
664 * Send a request in the background 673 * Send a request in the background
665 */ 674 */
666void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); 675void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index da9e6e11374c..9e3f68cc1bd1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,6 +71,11 @@ struct fuse_mount_data {
71 unsigned blksize; 71 unsigned blksize;
72}; 72};
73 73
74struct fuse_forget_link *fuse_alloc_forget(void)
75{
76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
77}
78
74static struct inode *fuse_alloc_inode(struct super_block *sb) 79static struct inode *fuse_alloc_inode(struct super_block *sb)
75{ 80{
76 struct inode *inode; 81 struct inode *inode;
@@ -90,8 +95,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
90 INIT_LIST_HEAD(&fi->queued_writes); 95 INIT_LIST_HEAD(&fi->queued_writes);
91 INIT_LIST_HEAD(&fi->writepages); 96 INIT_LIST_HEAD(&fi->writepages);
92 init_waitqueue_head(&fi->page_waitq); 97 init_waitqueue_head(&fi->page_waitq);
93 fi->forget_req = fuse_request_alloc(); 98 fi->forget = fuse_alloc_forget();
94 if (!fi->forget_req) { 99 if (!fi->forget) {
95 kmem_cache_free(fuse_inode_cachep, inode); 100 kmem_cache_free(fuse_inode_cachep, inode);
96 return NULL; 101 return NULL;
97 } 102 }
@@ -99,27 +104,20 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
99 return inode; 104 return inode;
100} 105}
101 106
102static void fuse_destroy_inode(struct inode *inode) 107static void fuse_i_callback(struct rcu_head *head)
103{ 108{
104 struct fuse_inode *fi = get_fuse_inode(inode); 109 struct inode *inode = container_of(head, struct inode, i_rcu);
105 BUG_ON(!list_empty(&fi->write_files)); 110 INIT_LIST_HEAD(&inode->i_dentry);
106 BUG_ON(!list_empty(&fi->queued_writes));
107 if (fi->forget_req)
108 fuse_request_free(fi->forget_req);
109 kmem_cache_free(fuse_inode_cachep, inode); 111 kmem_cache_free(fuse_inode_cachep, inode);
110} 112}
111 113
112void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 114static void fuse_destroy_inode(struct inode *inode)
113 u64 nodeid, u64 nlookup)
114{ 115{
115 struct fuse_forget_in *inarg = &req->misc.forget_in; 116 struct fuse_inode *fi = get_fuse_inode(inode);
116 inarg->nlookup = nlookup; 117 BUG_ON(!list_empty(&fi->write_files));
117 req->in.h.opcode = FUSE_FORGET; 118 BUG_ON(!list_empty(&fi->queued_writes));
118 req->in.h.nodeid = nodeid; 119 kfree(fi->forget);
119 req->in.numargs = 1; 120 call_rcu(&inode->i_rcu, fuse_i_callback);
120 req->in.args[0].size = sizeof(struct fuse_forget_in);
121 req->in.args[0].value = inarg;
122 fuse_request_send_noreply(fc, req);
123} 121}
124 122
125static void fuse_evict_inode(struct inode *inode) 123static void fuse_evict_inode(struct inode *inode)
@@ -129,8 +127,8 @@ static void fuse_evict_inode(struct inode *inode)
129 if (inode->i_sb->s_flags & MS_ACTIVE) { 127 if (inode->i_sb->s_flags & MS_ACTIVE) {
130 struct fuse_conn *fc = get_fuse_conn(inode); 128 struct fuse_conn *fc = get_fuse_conn(inode);
131 struct fuse_inode *fi = get_fuse_inode(inode); 129 struct fuse_inode *fi = get_fuse_inode(inode);
132 fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); 130 fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
133 fi->forget_req = NULL; 131 fi->forget = NULL;
134 } 132 }
135} 133}
136 134
@@ -534,6 +532,7 @@ void fuse_conn_init(struct fuse_conn *fc)
534 INIT_LIST_HEAD(&fc->interrupts); 532 INIT_LIST_HEAD(&fc->interrupts);
535 INIT_LIST_HEAD(&fc->bg_queue); 533 INIT_LIST_HEAD(&fc->bg_queue);
536 INIT_LIST_HEAD(&fc->entry); 534 INIT_LIST_HEAD(&fc->entry);
535 fc->forget_list_tail = &fc->forget_list_head;
537 atomic_set(&fc->num_waiting, 0); 536 atomic_set(&fc->num_waiting, 0);
538 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; 537 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
539 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; 538 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
@@ -618,10 +617,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
618 goto out_iput; 617 goto out_iput;
619 618
620 entry = d_obtain_alias(inode); 619 entry = d_obtain_alias(inode);
621 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { 620 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
622 entry->d_op = &fuse_dentry_operations;
623 fuse_invalidate_entry_cache(entry); 621 fuse_invalidate_entry_cache(entry);
624 }
625 622
626 return entry; 623 return entry;
627 624
@@ -720,10 +717,8 @@ static struct dentry *fuse_get_parent(struct dentry *child)
720 } 717 }
721 718
722 parent = d_obtain_alias(inode); 719 parent = d_obtain_alias(inode);
723 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { 720 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
724 parent->d_op = &fuse_dentry_operations;
725 fuse_invalidate_entry_cache(parent); 721 fuse_invalidate_entry_cache(parent);
726 }
727 722
728 return parent; 723 return parent;
729} 724}
@@ -990,6 +985,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
990 iput(root); 985 iput(root);
991 goto err_put_conn; 986 goto err_put_conn;
992 } 987 }
988 /* only now - we want root dentry with NULL ->d_op */
989 sb->s_d_op = &fuse_dentry_operations;
993 990
994 init_req = fuse_request_alloc(); 991 init_req = fuse_request_alloc();
995 if (!init_req) 992 if (!init_req)
@@ -1041,11 +1038,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1041 return err; 1038 return err;
1042} 1039}
1043 1040
1044static int fuse_get_sb(struct file_system_type *fs_type, 1041static struct dentry *fuse_mount(struct file_system_type *fs_type,
1045 int flags, const char *dev_name, 1042 int flags, const char *dev_name,
1046 void *raw_data, struct vfsmount *mnt) 1043 void *raw_data)
1047{ 1044{
1048 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); 1045 return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
1049} 1046}
1050 1047
1051static void fuse_kill_sb_anon(struct super_block *sb) 1048static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1065,17 +1062,16 @@ static struct file_system_type fuse_fs_type = {
1065 .owner = THIS_MODULE, 1062 .owner = THIS_MODULE,
1066 .name = "fuse", 1063 .name = "fuse",
1067 .fs_flags = FS_HAS_SUBTYPE, 1064 .fs_flags = FS_HAS_SUBTYPE,
1068 .get_sb = fuse_get_sb, 1065 .mount = fuse_mount,
1069 .kill_sb = fuse_kill_sb_anon, 1066 .kill_sb = fuse_kill_sb_anon,
1070}; 1067};
1071 1068
1072#ifdef CONFIG_BLOCK 1069#ifdef CONFIG_BLOCK
1073static int fuse_get_sb_blk(struct file_system_type *fs_type, 1070static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
1074 int flags, const char *dev_name, 1071 int flags, const char *dev_name,
1075 void *raw_data, struct vfsmount *mnt) 1072 void *raw_data)
1076{ 1073{
1077 return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super, 1074 return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
1078 mnt);
1079} 1075}
1080 1076
1081static void fuse_kill_sb_blk(struct super_block *sb) 1077static void fuse_kill_sb_blk(struct super_block *sb)
@@ -1094,7 +1090,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
1094static struct file_system_type fuseblk_fs_type = { 1090static struct file_system_type fuseblk_fs_type = {
1095 .owner = THIS_MODULE, 1091 .owner = THIS_MODULE,
1096 .name = "fuseblk", 1092 .name = "fuseblk",
1097 .get_sb = fuse_get_sb_blk, 1093 .mount = fuse_mount_blk,
1098 .kill_sb = fuse_kill_sb_blk, 1094 .kill_sb = fuse_kill_sb_blk,
1099 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, 1095 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
1100}; 1096};
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 6bc9e3a5a693..06c48a891832 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -190,14 +190,20 @@ generic_acl_chmod(struct inode *inode)
190} 190}
191 191
192int 192int
193generic_check_acl(struct inode *inode, int mask) 193generic_check_acl(struct inode *inode, int mask, unsigned int flags)
194{ 194{
195 struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); 195 if (flags & IPERM_FLAG_RCU) {
196 196 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
197 if (acl) { 197 return -ECHILD;
198 int error = posix_acl_permission(inode, acl, mask); 198 } else {
199 posix_acl_release(acl); 199 struct posix_acl *acl;
200 return error; 200
201 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
202 if (acl) {
203 int error = posix_acl_permission(inode, acl, mask);
204 posix_acl_release(acl);
205 return error;
206 }
201 } 207 }
202 return -EAGAIN; 208 return -EAGAIN;
203} 209}
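
This is the pattern repeated across the permission and revalidation hooks in this series (fuse_dentry_revalidate's LOOKUP_RCU check, fuse_permission's and gfs2_check_acl's IPERM_FLAG_RCU checks, and the RCU branch above): a callee that would need to block under RCU path walk returns -ECHILD, and the VFS retries in the slower reference-walk mode where sleeping is allowed. The retry contract in a self-contained sketch (the "cache" and its state are invented for the demo):

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

static bool cache_is_hot = false;	/* pretend shared state */

static int lookup_rcu(const char *name, int *result)
{
	(void)name;
	if (!cache_is_hot)
		return -ECHILD;	/* would need to sleep: bail out */
	*result = 42;
	return 0;
}

static int lookup_ref(const char *name, int *result)
{
	(void)name;
	cache_is_hot = true;	/* may block, take locks, do I/O */
	*result = 42;
	return 0;
}

static int lookup(const char *name, int *result)
{
	int err = lookup_rcu(name, result);

	if (err == -ECHILD)	/* retry in the slow, sleepable mode */
		err = lookup_ref(name, result);
	return err;
}

int main(void)
{
	int v, err = lookup("file", &v);

	printf("err=%d value=%d\n", err, err ? 0 : v);
	return 0;
}
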
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cc9665522148..c465ae066c62 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBDAF) 3 depends on (64BIT || LBDAF)
4 select DLM if GFS2_FS_LOCKING_DLM 4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM 5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM 6 select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 48171f4c943d..7118f1a780a9 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
75 * Returns: errno 75 * Returns: errno
76 */ 76 */
77 77
78int gfs2_check_acl(struct inode *inode, int mask) 78int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
79{ 79{
80 struct posix_acl *acl; 80 struct posix_acl *acl;
81 int error; 81 int error;
82 82
83 if (flags & IPERM_FLAG_RCU)
84 return -ECHILD;
85
83 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); 86 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
84 if (IS_ERR(acl)) 87 if (IS_ERR(acl))
85 return PTR_ERR(acl); 88 return PTR_ERR(acl);
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index b522b0cb39ea..a93907c8159b 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -16,7 +16,7 @@
16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" 16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
17#define GFS2_ACL_MAX_ENTRIES 25 17#define GFS2_ACL_MAX_ENTRIES 25
18 18
19extern int gfs2_check_acl(struct inode *inode, int mask); 19extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern const struct xattr_handler gfs2_xattr_system_handler; 22extern const struct xattr_handler gfs2_xattr_system_handler;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 194fe16d8418..4f36f8832b9b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
36#include "glops.h" 36#include "glops.h"
37 37
38 38
39static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, 39void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
40 unsigned int from, unsigned int to) 40 unsigned int from, unsigned int to)
41{ 41{
42 struct buffer_head *head = page_buffers(page); 42 struct buffer_head *head = page_buffers(page);
43 unsigned int bsize = head->b_size; 43 unsigned int bsize = head->b_size;
@@ -615,10 +615,9 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
616 int alloc_required; 616 int alloc_required;
617 int error = 0; 617 int error = 0;
618 struct gfs2_alloc *al; 618 struct gfs2_alloc *al = NULL;
619 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 619 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
620 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 620 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
621 unsigned to = from + len;
622 struct page *page; 621 struct page *page;
623 622
624 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 623 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -663,6 +662,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
663 rblocks += RES_STATFS + RES_QUOTA; 662 rblocks += RES_STATFS + RES_QUOTA;
664 if (&ip->i_inode == sdp->sd_rindex) 663 if (&ip->i_inode == sdp->sd_rindex)
665 rblocks += 2 * RES_STATFS; 664 rblocks += 2 * RES_STATFS;
665 if (alloc_required)
666 rblocks += gfs2_rg_blocks(al);
666 667
667 error = gfs2_trans_begin(sdp, rblocks, 668 error = gfs2_trans_begin(sdp, rblocks,
668 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 669 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -689,20 +690,18 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
689 } 690 }
690 691
691prepare_write: 692prepare_write:
692 error = block_prepare_write(page, from, to, gfs2_block_map); 693 error = __block_write_begin(page, from, len, gfs2_block_map);
693out: 694out:
694 if (error == 0) 695 if (error == 0)
695 return 0; 696 return 0;
696 697
697 page_cache_release(page); 698 page_cache_release(page);
698 699
699 /* 700 gfs2_trans_end(sdp);
700 * XXX(truncate): the call below should probably be replaced with
701 * a call to the gfs2-specific truncate blocks helper to actually
702 * release disk blocks..
703 */
704 if (pos + len > ip->i_inode.i_size) 701 if (pos + len > ip->i_inode.i_size)
705 truncate_setsize(&ip->i_inode, ip->i_inode.i_size); 702 gfs2_trim_blocks(&ip->i_inode);
703 goto out_trans_fail;
704
706out_endtrans: 705out_endtrans:
707 gfs2_trans_end(sdp); 706 gfs2_trans_end(sdp);
708out_trans_fail: 707out_trans_fail:
@@ -802,10 +801,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
802 page_cache_release(page); 801 page_cache_release(page);
803 802
804 if (copied) { 803 if (copied) {
805 if (inode->i_size < to) { 804 if (inode->i_size < to)
806 i_size_write(inode, to); 805 i_size_write(inode, to);
807 ip->i_disksize = inode->i_size;
808 }
809 gfs2_dinode_out(ip, di); 806 gfs2_dinode_out(ip, di);
810 mark_inode_dirty(inode); 807 mark_inode_dirty(inode);
811 } 808 }
@@ -876,8 +873,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
876 873
877 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 874 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
878 if (ret > 0) { 875 if (ret > 0) {
879 if (inode->i_size > ip->i_disksize)
880 ip->i_disksize = inode->i_size;
881 gfs2_dinode_out(ip, dibh->b_data); 876 gfs2_dinode_out(ip, dibh->b_data);
882 mark_inode_dirty(inode); 877 mark_inode_dirty(inode);
883 } 878 }
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6f482809d1a3..3c4039d5eef1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
50 * @ip: the inode 50 * @ip: the inode
51 * @dibh: the dinode buffer 51 * @dibh: the dinode buffer
52 * @block: the block number that was allocated 52 * @block: the block number that was allocated
53 * @private: any locked page held by the caller process 53 * @page: The (optional) page. This is looked up if @page is NULL
54 * 54 *
55 * Returns: errno 55 * Returns: errno
56 */ 56 */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
109/** 109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big 110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff 111 * @ip: The GFS2 inode to unstuff
112 * @unstuffer: the routine that handles unstuffing a non-zero length file 112 * @page: The (optional) page. This is looked up if the @page is NULL
113 * @private: private data for the unstuffer
114 * 113 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such 114 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way. 115 * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 131 if (error)
133 goto out; 132 goto out;
134 133
135 if (ip->i_disksize) { 134 if (i_size_read(&ip->i_inode)) {
136 /* Get a free block, fill it with the stuffed data, 135 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 136 and write it out to disk */
138 137
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
161 di = (struct gfs2_dinode *)dibh->b_data; 160 di = (struct gfs2_dinode *)dibh->b_data;
162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
163 162
164 if (ip->i_disksize) { 163 if (i_size_read(&ip->i_inode)) {
165 *(__be64 *)(di + 1) = cpu_to_be64(block); 164 *(__be64 *)(di + 1) = cpu_to_be64(block);
166 gfs2_add_inode_blocks(&ip->i_inode, 1); 165 gfs2_add_inode_blocks(&ip->i_inode, 1);
167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 166 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -764,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
764 int metadata; 763 int metadata;
765 unsigned int revokes = 0; 764 unsigned int revokes = 0;
766 int x; 765 int x;
767 int error; 766 int error = 0;
768 767
769 if (!*top) 768 if (!*top)
770 sm->sm_first = 0; 769 sm->sm_first = 0;
@@ -781,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
781 if (metadata) 780 if (metadata)
782 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
783 782
784 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); 783 if (ip != GFS2_I(sdp->sd_rindex))
784 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
785 else if (!sdp->sd_rgrps)
786 error = gfs2_ri_update(ip);
787
785 if (error) 788 if (error)
786 return error; 789 return error;
787 790
@@ -880,88 +883,20 @@ out_rg_gunlock:
880out_rlist: 883out_rlist:
881 gfs2_rlist_free(&rlist); 884 gfs2_rlist_free(&rlist);
882out: 885out:
883 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); 886 if (ip != GFS2_I(sdp->sd_rindex))
887 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
884 return error; 888 return error;
885} 889}
886 890
887/** 891/**
888 * do_grow - Make a file look bigger than it is
889 * @ip: the inode
890 * @size: the size to set the file to
891 *
892 * Called with an exclusive lock on @ip.
893 *
894 * Returns: errno
895 */
896
897static int do_grow(struct gfs2_inode *ip, u64 size)
898{
899 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 struct gfs2_alloc *al;
901 struct buffer_head *dibh;
902 int error;
903
904 al = gfs2_alloc_get(ip);
905 if (!al)
906 return -ENOMEM;
907
908 error = gfs2_quota_lock_check(ip);
909 if (error)
910 goto out;
911
912 al->al_requested = sdp->sd_max_height + RES_DATA;
913
914 error = gfs2_inplace_reserve(ip);
915 if (error)
916 goto out_gunlock_q;
917
918 error = gfs2_trans_begin(sdp,
919 sdp->sd_max_height + al->al_rgd->rd_length +
920 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
921 if (error)
922 goto out_ipres;
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out_end_trans;
927
928 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
929 if (gfs2_is_stuffed(ip)) {
930 error = gfs2_unstuff_dinode(ip, NULL);
931 if (error)
932 goto out_brelse;
933 }
934 }
935
936 ip->i_disksize = size;
937 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
938 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
939 gfs2_dinode_out(ip, dibh->b_data);
940
941out_brelse:
942 brelse(dibh);
943out_end_trans:
944 gfs2_trans_end(sdp);
945out_ipres:
946 gfs2_inplace_release(ip);
947out_gunlock_q:
948 gfs2_quota_unlock(ip);
949out:
950 gfs2_alloc_put(ip);
951 return error;
952}
953
954
955/**
956 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 892 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
957 * 893 *
958 * This is partly borrowed from ext3. 894 * This is partly borrowed from ext3.
959 */ 895 */
960static int gfs2_block_truncate_page(struct address_space *mapping) 896static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
961{ 897{
962 struct inode *inode = mapping->host; 898 struct inode *inode = mapping->host;
963 struct gfs2_inode *ip = GFS2_I(inode); 899 struct gfs2_inode *ip = GFS2_I(inode);
964 loff_t from = inode->i_size;
965 unsigned long index = from >> PAGE_CACHE_SHIFT; 900 unsigned long index = from >> PAGE_CACHE_SHIFT;
966 unsigned offset = from & (PAGE_CACHE_SIZE-1); 901 unsigned offset = from & (PAGE_CACHE_SIZE-1);
967 unsigned blocksize, iblock, length, pos; 902 unsigned blocksize, iblock, length, pos;
@@ -1023,9 +958,11 @@ unlock:
1023 return err; 958 return err;
1024} 959}
1025 960
1026static int trunc_start(struct gfs2_inode *ip, u64 size) 961static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1027{ 962{
1028 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 963 struct gfs2_inode *ip = GFS2_I(inode);
964 struct gfs2_sbd *sdp = GFS2_SB(inode);
965 struct address_space *mapping = inode->i_mapping;
1029 struct buffer_head *dibh; 966 struct buffer_head *dibh;
1030 int journaled = gfs2_is_jdata(ip); 967 int journaled = gfs2_is_jdata(ip);
1031 int error; 968 int error;
@@ -1039,31 +976,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 if (error) 976 if (error)
1040 goto out; 977 goto out;
1041 978
979 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
980
1042 if (gfs2_is_stuffed(ip)) { 981 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_dinode); 982 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1044 ip->i_disksize = size;
1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1047 gfs2_dinode_out(ip, dibh->b_data);
1048 if (dsize > dibh->b_size)
1049 dsize = dibh->b_size;
1050 gfs2_buffer_clear_tail(dibh, dsize);
1051 error = 1;
1052 } else { 983 } else {
1053 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 984 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1054 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 985 error = gfs2_block_truncate_page(mapping, newsize);
1055 986 if (error)
1056 if (!error) { 987 goto out_brelse;
1057 ip->i_disksize = size;
1058 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1059 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1060 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1061 gfs2_dinode_out(ip, dibh->b_data);
1062 } 988 }
989 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1063 } 990 }
1064 991
1065 brelse(dibh); 992 i_size_write(inode, newsize);
993 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
994 gfs2_dinode_out(ip, dibh->b_data);
1066 995
996 truncate_pagecache(inode, oldsize, newsize);
997out_brelse:
998 brelse(dibh);
1067out: 999out:
1068 gfs2_trans_end(sdp); 1000 gfs2_trans_end(sdp);
1069 return error; 1001 return error;
@@ -1123,7 +1055,7 @@ static int trunc_end(struct gfs2_inode *ip)
1123 if (error) 1055 if (error)
1124 goto out; 1056 goto out;
1125 1057
1126 if (!ip->i_disksize) { 1058 if (!i_size_read(&ip->i_inode)) {
1127 ip->i_height = 0; 1059 ip->i_height = 0;
1128 ip->i_goal = ip->i_no_addr; 1060 ip->i_goal = ip->i_no_addr;
1129 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1061 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1075,154 @@ out:
1143 1075
1144/** 1076/**
1145 * do_shrink - make a file smaller 1077 * do_shrink - make a file smaller
1146 * @ip: the inode 1078 * @inode: the inode
1147 * @size: the size to make the file 1079 * @oldsize: the current inode size
1148 * @truncator: function to truncate the last partial block 1080 * @newsize: the size to make the file
1149 * 1081 *
1150 * Called with an exclusive lock on @ip. 1082 * Called with an exclusive lock on @inode. The @newsize must
1083 * be equal to or smaller than the current inode size.
1151 * 1084 *
1152 * Returns: errno 1085 * Returns: errno
1153 */ 1086 */
1154 1087
1155static int do_shrink(struct gfs2_inode *ip, u64 size) 1088static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1156{ 1089{
1090 struct gfs2_inode *ip = GFS2_I(inode);
1157 int error; 1091 int error;
1158 1092
1159 error = trunc_start(ip, size); 1093 error = trunc_start(inode, oldsize, newsize);
1160 if (error < 0) 1094 if (error < 0)
1161 return error; 1095 return error;
1162 if (error > 0) 1096 if (gfs2_is_stuffed(ip))
1163 return 0; 1097 return 0;
1164 1098
1165 error = trunc_dealloc(ip, size); 1099 error = trunc_dealloc(ip, newsize);
1166 if (!error) 1100 if (error == 0)
1167 error = trunc_end(ip); 1101 error = trunc_end(ip);
1168 1102
1169 return error; 1103 return error;
1170} 1104}
1171 1105
1172static int do_touch(struct gfs2_inode *ip, u64 size) 1106void gfs2_trim_blocks(struct inode *inode)
1173{ 1107{
1174 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1108 u64 size = inode->i_size;
1109 int ret;
1110
1111 ret = do_shrink(inode, size, size);
1112 WARN_ON(ret != 0);
1113}
1114
1115/**
1116 * do_grow - Touch and update inode size
1117 * @inode: The inode
1118 * @size: The new size
1119 *
1120 * This function updates the timestamps on the inode and
1121 * may also increase the size of the inode. This function
1122 * must not be called with @size any smaller than the current
1123 * inode size.
1124 *
1125 * Although it is not strictly required to unstuff files here,
1126 * earlier versions of GFS2 have a bug in the stuffed file reading
1127 * code which will result in a buffer overrun if the size is larger
1128 * than the max stuffed file size. In order to prevent this from
1129 * occurring, such files are unstuffed, but in other cases we can
1130 * just update the inode size directly.
1131 *
1132 * Returns: 0 on success, or -ve on error
1133 */
1134
1135static int do_grow(struct inode *inode, u64 size)
1136{
1137 struct gfs2_inode *ip = GFS2_I(inode);
1138 struct gfs2_sbd *sdp = GFS2_SB(inode);
1175 struct buffer_head *dibh; 1139 struct buffer_head *dibh;
1140 struct gfs2_alloc *al = NULL;
1176 int error; 1141 int error;
1177 1142
1178 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1143 if (gfs2_is_stuffed(ip) &&
1144 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1145 al = gfs2_alloc_get(ip);
1146 if (al == NULL)
1147 return -ENOMEM;
1148
1149 error = gfs2_quota_lock_check(ip);
1150 if (error)
1151 goto do_grow_alloc_put;
1152
1153 al->al_requested = 1;
1154 error = gfs2_inplace_reserve(ip);
1155 if (error)
1156 goto do_grow_qunlock;
1157 }
1158
1159 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
1179 if (error) 1160 if (error)
1180 return error; 1161 goto do_grow_release;
1181 1162
1182 down_write(&ip->i_rw_mutex); 1163 if (al) {
1164 error = gfs2_unstuff_dinode(ip, NULL);
1165 if (error)
1166 goto do_end_trans;
1167 }
1183 1168
1184 error = gfs2_meta_inode_buffer(ip, &dibh); 1169 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (error) 1170 if (error)
1186 goto do_touch_out; 1171 goto do_end_trans;
1187 1172
1173 i_size_write(inode, size);
1188 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1174 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1189 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1175 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1190 gfs2_dinode_out(ip, dibh->b_data); 1176 gfs2_dinode_out(ip, dibh->b_data);
1191 brelse(dibh); 1177 brelse(dibh);
1192 1178
1193do_touch_out: 1179do_end_trans:
1194 up_write(&ip->i_rw_mutex);
1195 gfs2_trans_end(sdp); 1180 gfs2_trans_end(sdp);
1181do_grow_release:
1182 if (al) {
1183 gfs2_inplace_release(ip);
1184do_grow_qunlock:
1185 gfs2_quota_unlock(ip);
1186do_grow_alloc_put:
1187 gfs2_alloc_put(ip);
1188 }
1196 return error; 1189 return error;
1197} 1190}
1198 1191
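
Note the unwind at the end of the rewritten do_grow(): the error labels live inside the if (al) block, so the quota lock and block reservation taken only on the allocation path are released only on that path, while the common path falls straight through both frees. Jumping to a label inside an if is legal C; a generic sketch of the shape (an invented example, with malloc standing in for the quota and reservation steps):

#include <stdio.h>
#include <stdlib.h>

static int do_work(int need_alloc)
{
	void *quota = NULL, *reservation = NULL;
	int err = 0;

	if (need_alloc) {
		quota = malloc(16);		/* stands in for the quota lock */
		if (!quota)
			return -1;
		reservation = malloc(16);	/* stands in for the reservation */
		if (!reservation) {
			err = -1;
			goto out_unlock_quota;
		}
	}

	printf("common-path work (need_alloc=%d)\n", need_alloc);

	if (need_alloc) {
		free(reservation);
out_unlock_quota:
		free(quota);
	}
	return err;
}

int main(void)
{
	do_work(0);
	do_work(1);
	return 0;
}
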
1199/** 1192/**
1200 * gfs2_truncatei - make a file a given size 1193 * gfs2_setattr_size - make a file a given size
1201 * @ip: the inode 1194 * @inode: the inode
1202 * @size: the size to make the file 1195 * @newsize: the size to make the file
1203 * @truncator: function to truncate the last partial block
1204 * 1196 *
1205 * The file size can grow, shrink, or stay the same size. 1197 * The file size can grow, shrink, or stay the same size. This
1198 * is called holding i_mutex and an exclusive glock on the inode
1199 * in question.
1206 * 1200 *
1207 * Returns: errno 1201 * Returns: errno
1208 */ 1202 */
1209 1203
1210int gfs2_truncatei(struct gfs2_inode *ip, u64 size) 1204int gfs2_setattr_size(struct inode *inode, u64 newsize)
1211{ 1205{
1212 int error; 1206 int ret;
1207 u64 oldsize;
1213 1208
1214 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1209 BUG_ON(!S_ISREG(inode->i_mode));
1215 return -EINVAL;
1216 1210
1217 if (size > ip->i_disksize) 1211 ret = inode_newsize_ok(inode, newsize);
1218 error = do_grow(ip, size); 1212 if (ret)
1219 else if (size < ip->i_disksize) 1213 return ret;
1220 error = do_shrink(ip, size);
1221 else
1222 /* update time stamps */
1223 error = do_touch(ip, size);
1224 1214
1225 return error; 1215 oldsize = inode->i_size;
1216 if (newsize >= oldsize)
1217 return do_grow(inode, newsize);
1218
1219 return do_shrink(inode, oldsize, newsize);
1226} 1220}
1227 1221
1228int gfs2_truncatei_resume(struct gfs2_inode *ip) 1222int gfs2_truncatei_resume(struct gfs2_inode *ip)
1229{ 1223{
1230 int error; 1224 int error;
1231 error = trunc_dealloc(ip, ip->i_disksize); 1225 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1232 if (!error) 1226 if (!error)
1233 error = trunc_end(ip); 1227 error = trunc_end(ip);
1234 return error; 1228 return error;
@@ -1269,7 +1263,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1269 1263
1270 shift = sdp->sd_sb.sb_bsize_shift; 1264 shift = sdp->sd_sb.sb_bsize_shift;
1271 BUG_ON(gfs2_is_dir(ip)); 1265 BUG_ON(gfs2_is_dir(ip));
1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; 1266 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1273 lblock = offset >> shift; 1267 lblock = offset >> shift;
1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1268 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1275 if (lblock_stop > end_of_file) 1269 if (lblock_stop > end_of_file)
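
gfs2_write_alloc_required() now reads the size via i_size_read() and rounds it up to whole filesystem blocks with the usual (size + bsize - 1) >> shift idiom before comparing block indices rather than byte offsets. A quick standalone check of the rounding (4KiB blocks assumed):

#include <stdio.h>

int main(void)
{
	unsigned shift = 12;			/* 4096-byte blocks */
	unsigned long long bsize = 1ULL << shift;
	unsigned long long size = 5000;		/* file size in bytes */

	/* Round up: 5000 bytes occupy 2 blocks, not 5000 >> 12 == 1. */
	unsigned long long end_of_file = (size + bsize - 1) >> shift;
	printf("end_of_file = %llu blocks\n", end_of_file);
	return 0;
}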
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index a20a5213135a..42fea03e2bd9 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
44 } 44 }
45} 45}
46 46
47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48extern int gfs2_block_map(struct inode *inode, sector_t lblock,
49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49 struct buffer_head *bh, int create);
50 50extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
51int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51 u64 *dblock, unsigned *extlen);
52int gfs2_truncatei_resume(struct gfs2_inode *ip); 52extern int gfs2_setattr_size(struct inode *inode, u64 size);
53int gfs2_file_dealloc(struct gfs2_inode *ip); 53extern void gfs2_trim_blocks(struct inode *inode);
54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
55 unsigned int len); 55extern int gfs2_file_dealloc(struct gfs2_inode *ip);
56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
57 unsigned int len);
56 58
57#endif /* __BMAP_DOT_H__ */ 59#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d8..4a456338b873 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -11,6 +11,7 @@
11#include <linux/completion.h> 11#include <linux/completion.h>
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/gfs2_ondisk.h> 13#include <linux/gfs2_ondisk.h>
14#include <linux/namei.h>
14#include <linux/crc32.h> 15#include <linux/crc32.h>
15 16
16#include "gfs2.h" 17#include "gfs2.h"
@@ -34,22 +35,30 @@
34 35
35static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) 36static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
36{ 37{
37 struct dentry *parent = dget_parent(dentry); 38 struct dentry *parent;
38 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); 39 struct gfs2_sbd *sdp;
39 struct gfs2_inode *dip = GFS2_I(parent->d_inode); 40 struct gfs2_inode *dip;
40 struct inode *inode = dentry->d_inode; 41 struct inode *inode;
41 struct gfs2_holder d_gh; 42 struct gfs2_holder d_gh;
42 struct gfs2_inode *ip = NULL; 43 struct gfs2_inode *ip = NULL;
43 int error; 44 int error;
44 int had_lock = 0; 45 int had_lock = 0;
45 46
47 if (nd->flags & LOOKUP_RCU)
48 return -ECHILD;
49
50 parent = dget_parent(dentry);
51 sdp = GFS2_SB(parent->d_inode);
52 dip = GFS2_I(parent->d_inode);
53 inode = dentry->d_inode;
54
46 if (inode) { 55 if (inode) {
47 if (is_bad_inode(inode)) 56 if (is_bad_inode(inode))
48 goto invalid; 57 goto invalid;
49 ip = GFS2_I(inode); 58 ip = GFS2_I(inode);
50 } 59 }
51 60
52 if (sdp->sd_args.ar_localcaching) 61 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
53 goto valid; 62 goto valid;
54 63
55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); 64 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
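
The LOOKUP_RCU bail-out added at the top of gfs2_drevalidate() follows the standard contract for d_revalidate under rcu-walk: a filesystem that may sleep while revalidating (gfs2 takes glocks below) returns -ECHILD so the VFS retries in ref-walk mode. The dget_parent() call is likewise deferred until after the check, since taking a reference is itself not allowed in rcu-walk. The general shape, as a kernel-context sketch rather than a standalone program:

/* Sketch only: the generic rcu-walk bail-out for the 2.6.38-era
 * d_revalidate prototype (dentry + nameidata). */
static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;		/* we may sleep below: force ref-walk */

	/* ... safe to take references and sleeping locks from here ... */
	return 1;			/* dentry still valid */
}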
@@ -100,13 +109,14 @@ fail:
100 return 0; 109 return 0;
101} 110}
102 111
103static int gfs2_dhash(struct dentry *dentry, struct qstr *str) 112static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
113 struct qstr *str)
104{ 114{
105 str->hash = gfs2_disk_hash(str->name, str->len); 115 str->hash = gfs2_disk_hash(str->name, str->len);
106 return 0; 116 return 0;
107} 117}
108 118
109static int gfs2_dentry_delete(struct dentry *dentry) 119static int gfs2_dentry_delete(const struct dentry *dentry)
110{ 120{
111 struct gfs2_inode *ginode; 121 struct gfs2_inode *ginode;
112 122
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b9dd88a78dd4..5c356d09c321 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) 79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) 80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
81 81
82struct qstr gfs2_qdot __read_mostly;
83struct qstr gfs2_qdotdot __read_mostly;
84
82typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, 85typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
83 u64 leaf_no, void *data); 86 u64 leaf_no, void *data);
84typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, 87typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
127 130
128 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 131 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
129 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 132 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
130 if (ip->i_disksize < offset + size) 133 if (ip->i_inode.i_size < offset + size)
131 ip->i_disksize = offset + size; 134 i_size_write(&ip->i_inode, offset + size);
132 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 135 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
133 gfs2_dinode_out(ip, dibh->b_data); 136 gfs2_dinode_out(ip, dibh->b_data);
134 137
@@ -225,8 +228,8 @@ out:
225 if (error) 228 if (error)
226 return error; 229 return error;
227 230
228 if (ip->i_disksize < offset + copied) 231 if (ip->i_inode.i_size < offset + copied)
229 ip->i_disksize = offset + copied; 232 i_size_write(&ip->i_inode, offset + copied);
230 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 233 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
231 234
232 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 235 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
275 unsigned int o; 278 unsigned int o;
276 int copied = 0; 279 int copied = 0;
277 int error = 0; 280 int error = 0;
281 u64 disksize = i_size_read(&ip->i_inode);
278 282
279 if (offset >= ip->i_disksize) 283 if (offset >= disksize)
280 return 0; 284 return 0;
281 285
282 if (offset + size > ip->i_disksize) 286 if (offset + size > disksize)
283 size = ip->i_disksize - offset; 287 size = disksize - offset;
284 288
285 if (!size) 289 if (!size)
286 return 0; 290 return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
727 unsigned hsize = 1 << ip->i_depth; 731 unsigned hsize = 1 << ip->i_depth;
728 unsigned index; 732 unsigned index;
729 u64 ln; 733 u64 ln;
730 if (hsize * sizeof(u64) != ip->i_disksize) { 734 if (hsize * sizeof(u64) != i_size_read(inode)) {
731 gfs2_consist_inode(ip); 735 gfs2_consist_inode(ip);
732 return ERR_PTR(-EIO); 736 return ERR_PTR(-EIO);
733 } 737 }
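
Several hunks in this file repeat the same consistency check: an exhash directory of depth i_depth stores 1 << i_depth little-endian u64 leaf pointers as its data, so the inode size must equal hsize * sizeof(u64) exactly; any other value means the dinode is corrupt. The arithmetic, as a standalone check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned depth = 9;			/* example i_depth */
	uint64_t hsize = 1ULL << depth;		/* hash-table slots */
	uint64_t expect = hsize * sizeof(uint64_t);

	/* A depth-9 table is 512 slots * 8 bytes = 4096 bytes. */
	printf("depth %u -> %llu slots, i_size must be %llu bytes\n",
	       depth, (unsigned long long)hsize, (unsigned long long)expect);
	return 0;
}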
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
879 for (x = sdp->sd_hash_ptrs; x--; lp++) 883 for (x = sdp->sd_hash_ptrs; x--; lp++)
880 *lp = cpu_to_be64(bn); 884 *lp = cpu_to_be64(bn);
881 885
882 dip->i_disksize = sdp->sd_sb.sb_bsize / 2; 886 i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
883 gfs2_add_inode_blocks(&dip->i_inode, 1); 887 gfs2_add_inode_blocks(&dip->i_inode, 1);
884 dip->i_diskflags |= GFS2_DIF_EXHASH; 888 dip->i_diskflags |= GFS2_DIF_EXHASH;
885 889
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1057 u64 *buf; 1061 u64 *buf;
1058 u64 *from, *to; 1062 u64 *from, *to;
1059 u64 block; 1063 u64 block;
1064 u64 disksize = i_size_read(&dip->i_inode);
1060 int x; 1065 int x;
1061 int error = 0; 1066 int error = 0;
1062 1067
1063 hsize = 1 << dip->i_depth; 1068 hsize = 1 << dip->i_depth;
1064 if (hsize * sizeof(u64) != dip->i_disksize) { 1069 if (hsize * sizeof(u64) != disksize) {
1065 gfs2_consist_inode(dip); 1070 gfs2_consist_inode(dip);
1066 return -EIO; 1071 return -EIO;
1067 } 1072 }
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1072 if (!buf) 1077 if (!buf)
1073 return -ENOMEM; 1078 return -ENOMEM;
1074 1079
1075 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { 1080 for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
1076 error = gfs2_dir_read_data(dip, (char *)buf, 1081 error = gfs2_dir_read_data(dip, (char *)buf,
1077 block * sdp->sd_hash_bsize, 1082 block * sdp->sd_hash_bsize,
1078 sdp->sd_hash_bsize, 1); 1083 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1375 unsigned depth = 0;
1371 1376
1372 hsize = 1 << dip->i_depth; 1377 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_disksize) { 1378 if (hsize * sizeof(u64) != i_size_read(inode)) {
1374 gfs2_consist_inode(dip); 1379 gfs2_consist_inode(dip);
1375 return -EIO; 1380 return -EIO;
1376 } 1381 }
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1789 int error = 0;
1785 1790
1786 hsize = 1 << dip->i_depth; 1791 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_disksize) { 1792 if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
1788 gfs2_consist_inode(dip); 1793 gfs2_consist_inode(dip);
1789 return -EIO; 1794 return -EIO;
1790 } 1795 }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3be..a98f644bd3df 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); 20extern struct inode *gfs2_dir_search(struct inode *dir,
21int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 21 const struct qstr *filename);
22 const struct gfs2_inode *ip); 22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 23 const struct gfs2_inode *ip);
24 const struct gfs2_inode *ip, unsigned int type); 24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 25 const struct gfs2_inode *ip, unsigned int type);
26int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
27 filldir_t filldir); 27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
28int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 28 filldir_t filldir);
29 const struct gfs2_inode *nip, unsigned int new_type); 29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type);
30 31
31int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 32extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
32 33
33int gfs2_diradd_alloc_required(struct inode *dir, 34extern int gfs2_diradd_alloc_required(struct inode *dir,
34 const struct qstr *filename); 35 const struct qstr *filename);
35int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 36extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
36 struct buffer_head **bhp); 37 struct buffer_head **bhp);
37 38
38static inline u32 gfs2_disk_hash(const char *data, int len) 39static inline u32 gfs2_disk_hash(const char *data, int len)
39{ 40{
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
61 memcpy(dent + 1, name->name, name->len); 62 memcpy(dent + 1, name->name, name->len);
62} 63}
63 64
65extern struct qstr gfs2_qdot;
66extern struct qstr gfs2_qdotdot;
67
64#endif /* __DIR_DOT_H__ */ 68#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8ad..9023db8184f9 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,29 +126,14 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct qstr dotdot; 129 return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
130 struct dentry *dentry;
131
132 /*
133 * XXX(hch): it would be a good idea to keep this around as a
134 * static variable.
135 */
136 gfs2_str2qstr(&dotdot, "..");
137
138 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
139 if (!IS_ERR(dentry))
140 dentry->d_op = &gfs2_dops;
141 return dentry;
142} 130}
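
gfs2_get_parent() used to build a qstr for ".." on every call (the removed XXX comment even suggested caching it); the patch replaces that with the shared gfs2_qdot/gfs2_qdotdot objects declared in dir.c. Their initialization site is not part of these hunks; presumably they are filled in once on the module init path with something like the following (hypothetical placement, using the real gfs2_str2qstr helper seen in the removed lines):

/* Sketch: one-time setup of the shared qstrs, assumed to run from
 * module init; the exact location is not shown in this diff. */
struct qstr gfs2_qdot __read_mostly;
struct qstr gfs2_qdotdot __read_mostly;

static void gfs2_init_names(void)
{
	gfs2_str2qstr(&gfs2_qdot, ".");
	gfs2_str2qstr(&gfs2_qdotdot, "..");
}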
143 131
144static struct dentry *gfs2_get_dentry(struct super_block *sb, 132static struct dentry *gfs2_get_dentry(struct super_block *sb,
145 struct gfs2_inum_host *inum) 133 struct gfs2_inum_host *inum)
146{ 134{
147 struct gfs2_sbd *sdp = sb->s_fs_info; 135 struct gfs2_sbd *sdp = sb->s_fs_info;
148 struct gfs2_holder i_gh;
149 struct inode *inode; 136 struct inode *inode;
150 struct dentry *dentry;
151 int error;
152 137
153 inode = gfs2_ilookup(sb, inum->no_addr); 138 inode = gfs2_ilookup(sb, inum->no_addr);
154 if (inode) { 139 if (inode) {
@@ -159,52 +144,13 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
159 goto out_inode; 144 goto out_inode;
160 } 145 }
161 146
162 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, 147 inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
163 LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 148 GFS2_BLKST_DINODE);
164 if (error) 149 if (IS_ERR(inode))
165 return ERR_PTR(error); 150 return ERR_CAST(inode);
166
167 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
168 if (error)
169 goto fail;
170
171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
172 if (IS_ERR(inode)) {
173 error = PTR_ERR(inode);
174 goto fail;
175 }
176
177 error = gfs2_inode_refresh(GFS2_I(inode));
178 if (error) {
179 iput(inode);
180 goto fail;
181 }
182
183 /* Pick up the works we bypass in gfs2_inode_lookup */
184 if (inode->i_state & I_NEW)
185 gfs2_set_iop(inode);
186
187 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
188 iput(inode);
189 goto fail;
190 }
191
192 error = -EIO;
193 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
194 iput(inode);
195 goto fail;
196 }
197
198 gfs2_glock_dq_uninit(&i_gh);
199 151
200out_inode: 152out_inode:
201 dentry = d_obtain_alias(inode); 153 return d_obtain_alias(inode);
202 if (!IS_ERR(dentry))
203 dentry->d_op = &gfs2_dops;
204 return dentry;
205fail:
206 gfs2_glock_dq_uninit(&i_gh);
207 return ERR_PTR(error);
208} 154}
209 155
210static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, 156static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4edd662c8232..7cfdcb913363 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
22#include <linux/falloc.h>
23#include <linux/swap.h>
22#include <linux/crc32.h> 24#include <linux/crc32.h>
23#include <linux/writeback.h> 25#include <linux/writeback.h>
24#include <asm/uaccess.h> 26#include <asm/uaccess.h>
@@ -241,7 +243,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
241 !capable(CAP_LINUX_IMMUTABLE)) 243 !capable(CAP_LINUX_IMMUTABLE))
242 goto out; 244 goto out;
243 if (!IS_IMMUTABLE(inode)) { 245 if (!IS_IMMUTABLE(inode)) {
244 error = gfs2_permission(inode, MAY_WRITE); 246 error = gfs2_permission(inode, MAY_WRITE, 0);
245 if (error) 247 if (error)
246 goto out; 248 goto out;
247 } 249 }
@@ -382,8 +384,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
382 rblocks = RES_DINODE + ind_blocks; 384 rblocks = RES_DINODE + ind_blocks;
383 if (gfs2_is_jdata(ip)) 385 if (gfs2_is_jdata(ip))
384 rblocks += data_blocks ? data_blocks : 1; 386 rblocks += data_blocks ? data_blocks : 1;
385 if (ind_blocks || data_blocks) 387 if (ind_blocks || data_blocks) {
386 rblocks += RES_STATFS + RES_QUOTA; 388 rblocks += RES_STATFS + RES_QUOTA;
389 rblocks += gfs2_rg_blocks(al);
390 }
387 ret = gfs2_trans_begin(sdp, rblocks, 0); 391 ret = gfs2_trans_begin(sdp, rblocks, 0);
388 if (ret) 392 if (ret)
389 goto out_trans_fail; 393 goto out_trans_fail;
@@ -491,7 +495,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
491 goto fail; 495 goto fail;
492 496
493 if (!(file->f_flags & O_LARGEFILE) && 497 if (!(file->f_flags & O_LARGEFILE) &&
494 ip->i_disksize > MAX_NON_LFS) { 498 i_size_read(inode) > MAX_NON_LFS) {
495 error = -EOVERFLOW; 499 error = -EOVERFLOW;
496 goto fail_gunlock; 500 goto fail_gunlock;
497 } 501 }
@@ -608,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
608 return generic_file_aio_write(iocb, iov, nr_segs, pos); 612 return generic_file_aio_write(iocb, iov, nr_segs, pos);
609} 613}
610 614
615static void empty_write_end(struct page *page, unsigned from,
616 unsigned to)
617{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619
620 page_zero_new_buffers(page, from, to);
621 flush_dcache_page(page);
622 mark_page_accessed(page);
623
624 if (!gfs2_is_writeback(ip))
625 gfs2_page_add_databufs(ip, page, from, to);
626
627 block_commit_write(page, from, to);
628}
629
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
631{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error;
635
636 if (!page_has_buffers(page)) {
637 error = __block_write_begin(page, from, to - from, gfs2_block_map);
638 if (unlikely(error))
639 return error;
640
641 empty_write_end(page, from, to);
642 return 0;
643 }
644
645 bh = head = page_buffers(page);
646 next = end = 0;
647 while (next < from) {
648 next += bh->b_size;
649 bh = bh->b_this_page;
650 }
651 start = next;
652 do {
653 next += bh->b_size;
654 if (buffer_mapped(bh)) {
655 if (end) {
656 error = __block_write_begin(page, start, end - start,
657 gfs2_block_map);
658 if (unlikely(error))
659 return error;
660 empty_write_end(page, start, end);
661 end = 0;
662 }
663 start = next;
664 }
665 else
666 end = next;
667 bh = bh->b_this_page;
668 } while (next < to);
669
670 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error))
673 return error;
674 empty_write_end(page, start, end);
675 }
676
677 return 0;
678}
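
write_empty_blocks() batches consecutive unmapped buffers into one __block_write_begin() call per run, flushing a pending run whenever it meets a mapped buffer and once more after the loop. The run-merging is the fiddly part; here is a small userspace model of it, with an int array of mapped flags standing in for the page's buffer ring (uniform buffer size assumed, and the caller keeps [from,to) within the array):

#include <stdio.h>

#define BSZ 512		/* model buffer size */

static void write_runs(const int *mapped, unsigned from, unsigned to)
{
	unsigned start, end = 0, next = 0;
	int i = 0;

	while (next < from) {		/* skip buffers wholly before 'from' */
		next += BSZ;
		i++;
	}
	start = next;
	do {
		next += BSZ;
		if (mapped[i]) {
			if (end) {	/* flush the pending unmapped run */
				printf("write [%u,%u)\n", start, end);
				end = 0;
			}
			start = next;
		} else {
			end = next;	/* extend the current run */
		}
		i++;
	} while (next < to);

	if (end)			/* final run reaches 'to' */
		printf("write [%u,%u)\n", start, end);
}

int main(void)
{
	int mapped[8] = { 1, 0, 0, 1, 0, 1, 1, 0 };

	/* Prints [512,1536), [2048,2560) and [3584,4096). */
	write_runs(mapped, 512, 4096);
	return 0;
}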
679
680static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
681 int mode)
682{
683 struct gfs2_inode *ip = GFS2_I(inode);
684 struct buffer_head *dibh;
685 int error;
686 u64 start = offset >> PAGE_CACHE_SHIFT;
687 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
688 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
689 pgoff_t curr;
690 struct page *page;
691 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
692 unsigned int from, to;
693
694 if (!end_offset)
695 end_offset = PAGE_CACHE_SIZE;
696
697 error = gfs2_meta_inode_buffer(ip, &dibh);
698 if (unlikely(error))
699 goto out;
700
701 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
702
703 if (gfs2_is_stuffed(ip)) {
704 error = gfs2_unstuff_dinode(ip, NULL);
705 if (unlikely(error))
706 goto out;
707 }
708
709 curr = start;
710 offset = start << PAGE_CACHE_SHIFT;
711 from = start_offset;
712 to = PAGE_CACHE_SIZE;
713 while (curr <= end) {
714 page = grab_cache_page_write_begin(inode->i_mapping, curr,
715 AOP_FLAG_NOFS);
716 if (unlikely(!page)) {
717 error = -ENOMEM;
718 goto out;
719 }
720
721 if (curr == end)
722 to = end_offset;
723 error = write_empty_blocks(page, from, to);
724 if (!error && offset + to > inode->i_size &&
725 !(mode & FALLOC_FL_KEEP_SIZE)) {
726 i_size_write(inode, offset + to);
727 }
728 unlock_page(page);
729 page_cache_release(page);
730 if (error)
731 goto out;
732 curr++;
733 offset += PAGE_CACHE_SIZE;
734 from = 0;
735 }
736
737 gfs2_dinode_out(ip, dibh->b_data);
738 mark_inode_dirty(inode);
739
740 brelse(dibh);
741
742out:
743 return error;
744}
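
fallocate_chunk() converts the byte range into page-cache coordinates: start and end page indices plus intra-page offsets, with the special case that a range ending exactly on a page boundary gets end_offset fixed up from 0 to a full page. A worked userspace version of that arithmetic (4KiB pages assumed, mirroring the PAGE_CACHE_* macros):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	uint64_t offset = 5000, len = 10000;

	uint64_t start = offset >> PAGE_SHIFT;			/* first page index */
	unsigned from = offset & ~PAGE_MASK;			/* offset within it */
	uint64_t end = (offset + len - 1) >> PAGE_SHIFT;	/* last page index */
	unsigned to = (offset + len) & ~PAGE_MASK;		/* end within last page */

	if (!to)		/* range ends exactly on a page boundary */
		to = PAGE_SIZE;

	/* Bytes 5000..14999 span pages 1..3: write [904,4096) on page 1,
	 * the whole of page 2, and [0,2712) on page 3. */
	printf("pages %llu..%llu, first from=%u, last to=%u\n",
	       (unsigned long long)start, (unsigned long long)end, from, to);
	return 0;
}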
745
746static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
747 unsigned int *data_blocks, unsigned int *ind_blocks)
748{
749 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
750 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
751 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
752
753 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
754 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
755 max_data -= tmp;
756 }
757 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
758 so it might end up with fewer data blocks */
759 if (max_data <= *data_blocks)
760 return;
761 *data_blocks = max_data;
762 *ind_blocks = max_blocks - max_data;
763 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
764 if (*len > max) {
765 *len = max;
766 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
767 }
768}
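
calc_max_reserv() estimates how much of a reservation can be data: starting from the rgrp's free blocks minus a small fixed allowance per tree height, it repeatedly peels off the indirect blocks needed to address what remains, approximating gfs2_write_calc_reserv() in reverse, as the comment says. The loop is easier to see with concrete numbers; a standalone model using roughly the 4KiB-block geometry (483 direct pointers in a dinode, 509 pointers per indirect block; both are assumptions of this sketch):

#include <stdio.h>

int main(void)
{
	unsigned max_blocks = 65536;	/* free blocks in the resource group */
	unsigned max_height = 10;	/* stand-in for sdp->sd_max_height */
	unsigned diptrs = 483;		/* direct pointers in the dinode */
	unsigned inptrs = 509;		/* pointers per indirect block */

	unsigned max_data = max_blocks - 3 * (max_height - 1);
	unsigned tmp;

	/* Peel off indirect blocks until the rest fits behind the dinode. */
	for (tmp = max_data; tmp > diptrs;) {
		tmp = (tmp + inptrs - 1) / inptrs;	/* DIV_ROUND_UP */
		max_data -= tmp;
	}

	printf("%u of %u blocks can be data (%u set aside for metadata)\n",
	       max_data, max_blocks, max_blocks - max_data);
	return 0;
}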
769
770static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
771 loff_t len)
772{
773 struct inode *inode = file->f_path.dentry->d_inode;
774 struct gfs2_sbd *sdp = GFS2_SB(inode);
775 struct gfs2_inode *ip = GFS2_I(inode);
776 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
777 loff_t bytes, max_bytes;
778 struct gfs2_alloc *al;
779 int error;
780 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
781 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
782
783 /* We only support the FALLOC_FL_KEEP_SIZE mode */
784 if (mode & ~FALLOC_FL_KEEP_SIZE)
785 return -EOPNOTSUPP;
786
787 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
788 sdp->sd_sb.sb_bsize_shift;
789
790 len = next - offset;
791 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
792 if (!bytes)
793 bytes = UINT_MAX;
794
795 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
796 error = gfs2_glock_nq(&ip->i_gh);
797 if (unlikely(error))
798 goto out_uninit;
799
800 if (!gfs2_write_alloc_required(ip, offset, len))
801 goto out_unlock;
802
803 while (len > 0) {
804 if (len < bytes)
805 bytes = len;
806 al = gfs2_alloc_get(ip);
807 if (!al) {
808 error = -ENOMEM;
809 goto out_unlock;
810 }
811
812 error = gfs2_quota_lock_check(ip);
813 if (error)
814 goto out_alloc_put;
815
816retry:
817 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
818
819 al->al_requested = data_blocks + ind_blocks;
820 error = gfs2_inplace_reserve(ip);
821 if (error) {
822 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
823 bytes >>= 1;
824 goto retry;
825 }
826 goto out_qunlock;
827 }
828 max_bytes = bytes;
829 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
830 al->al_requested = data_blocks + ind_blocks;
831
832 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
833 RES_RG_HDR + gfs2_rg_blocks(al);
834 if (gfs2_is_jdata(ip))
835 rblocks += data_blocks ? data_blocks : 1;
836
837 error = gfs2_trans_begin(sdp, rblocks,
838 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
839 if (error)
840 goto out_trans_fail;
841
842 error = fallocate_chunk(inode, offset, max_bytes, mode);
843 gfs2_trans_end(sdp);
844
845 if (error)
846 goto out_trans_fail;
847
848 len -= max_bytes;
849 offset += max_bytes;
850 gfs2_inplace_release(ip);
851 gfs2_quota_unlock(ip);
852 gfs2_alloc_put(ip);
853 }
854 goto out_unlock;
855
856out_trans_fail:
857 gfs2_inplace_release(ip);
858out_qunlock:
859 gfs2_quota_unlock(ip);
860out_alloc_put:
861 gfs2_alloc_put(ip);
862out_unlock:
863 gfs2_glock_dq(&ip->i_gh);
864out_uninit:
865 gfs2_holder_uninit(&ip->i_gh);
866 return error;
867}
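
The allocation loop in gfs2_fallocate() reserves space one chunk at a time and, on -ENOSPC, halves the chunk and retries until it is down to a single filesystem block. A compact standalone model of that retry policy, with a fake reserve() standing in for gfs2_inplace_reserve():

#include <errno.h>
#include <stdio.h>

#define BSIZE 4096UL

/* Fake reservation: pretend only requests of 64KiB or less succeed. */
static int reserve(unsigned long bytes)
{
	return bytes <= 16 * BSIZE ? 0 : -ENOSPC;
}

int main(void)
{
	unsigned long len = 1UL << 20;		/* 1MiB left to allocate */
	unsigned long bytes = 1UL << 19;	/* initial chunk size */

	while (len > 0) {
		if (len < bytes)
			bytes = len;
retry:
		if (reserve(bytes)) {
			if (bytes > BSIZE) {	/* halve and retry */
				bytes >>= 1;
				goto retry;
			}
			fprintf(stderr, "out of space\n");
			return 1;
		}
		printf("reserved %lu bytes\n", bytes);
		len -= bytes;		/* bytes stays halved, as in gfs2 */
	}
	return 0;
}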
868
611#ifdef CONFIG_GFS2_FS_LOCKING_DLM 869#ifdef CONFIG_GFS2_FS_LOCKING_DLM
612 870
613/** 871/**
@@ -620,6 +878,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
620 * cluster; until we do, disable leases (by just returning -EINVAL), 878 * cluster; until we do, disable leases (by just returning -EINVAL),
621 * unless the administrator has requested purely local locking. 879 * unless the administrator has requested purely local locking.
622 * 880 *
881 * Locking: called under lock_flocks
882 *
623 * Returns: errno 883 * Returns: errno
624 */ 884 */
625 885
@@ -761,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
761 .splice_read = generic_file_splice_read, 1021 .splice_read = generic_file_splice_read,
762 .splice_write = generic_file_splice_write, 1022 .splice_write = generic_file_splice_write,
763 .setlease = gfs2_setlease, 1023 .setlease = gfs2_setlease,
1024 .fallocate = gfs2_fallocate,
764}; 1025};
765 1026
766const struct file_operations gfs2_dir_fops = { 1027const struct file_operations gfs2_dir_fops = {
@@ -771,6 +1032,7 @@ const struct file_operations gfs2_dir_fops = {
771 .fsync = gfs2_fsync, 1032 .fsync = gfs2_fsync,
772 .lock = gfs2_lock, 1033 .lock = gfs2_lock,
773 .flock = gfs2_flock, 1034 .flock = gfs2_flock,
1035 .llseek = default_llseek,
774}; 1036};
775 1037
776#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ 1038#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
@@ -789,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
789 .splice_read = generic_file_splice_read, 1051 .splice_read = generic_file_splice_read,
790 .splice_write = generic_file_splice_write, 1052 .splice_write = generic_file_splice_write,
791 .setlease = generic_setlease, 1053 .setlease = generic_setlease,
1054 .fallocate = gfs2_fallocate,
792}; 1055};
793 1056
794const struct file_operations gfs2_dir_fops_nolock = { 1057const struct file_operations gfs2_dir_fops_nolock = {
@@ -797,5 +1060,6 @@ const struct file_operations gfs2_dir_fops_nolock = {
797 .open = gfs2_open, 1060 .open = gfs2_open,
798 .release = gfs2_close, 1061 .release = gfs2_close,
799 .fsync = gfs2_fsync, 1062 .fsync = gfs2_fsync,
1063 .llseek = default_llseek,
800}; 1064};
801 1065
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9adf8f924e08..08a8beb152e6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
441 else 441 else
442 gfs2_glock_put_nolock(gl); 442 gfs2_glock_put_nolock(gl);
443 } 443 }
444 if (held1 && held2 && list_empty(&gl->gl_holders))
445 clear_bit(GLF_QUEUED, &gl->gl_flags);
444 446
445 gl->gl_state = new_state; 447 gl->gl_state = new_state;
446 gl->gl_tchange = jiffies; 448 gl->gl_tchange = jiffies;
@@ -539,21 +541,6 @@ out_locked:
539 spin_unlock(&gl->gl_spin); 541 spin_unlock(&gl->gl_spin);
540} 542}
541 543
542static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
543 unsigned int req_state,
544 unsigned int flags)
545{
546 int ret = LM_OUT_ERROR;
547
548 if (!sdp->sd_lockstruct.ls_ops->lm_lock)
549 return req_state == LM_ST_UNLOCKED ? 0 : req_state;
550
551 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
552 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
553 req_state, flags);
554 return ret;
555}
556
557/** 544/**
558 * do_xmote - Calls the DLM to change the state of a lock 545 * do_xmote - Calls the DLM to change the state of a lock
559 * @gl: The lock state 546 * @gl: The lock state
@@ -573,13 +560,14 @@ __acquires(&gl->gl_spin)
573 560
574 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | 561 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
575 LM_FLAG_PRIORITY); 562 LM_FLAG_PRIORITY);
576 BUG_ON(gl->gl_state == target); 563 GLOCK_BUG_ON(gl, gl->gl_state == target);
577 BUG_ON(gl->gl_state == gl->gl_target); 564 GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
578 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && 565 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
579 glops->go_inval) { 566 glops->go_inval) {
580 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); 567 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
581 do_error(gl, 0); /* Fail queued try locks */ 568 do_error(gl, 0); /* Fail queued try locks */
582 } 569 }
570 gl->gl_req = target;
583 spin_unlock(&gl->gl_spin); 571 spin_unlock(&gl->gl_spin);
584 if (glops->go_xmote_th) 572 if (glops->go_xmote_th)
585 glops->go_xmote_th(gl); 573 glops->go_xmote_th(gl);
@@ -592,15 +580,17 @@ __acquires(&gl->gl_spin)
592 gl->gl_state == LM_ST_DEFERRED) && 580 gl->gl_state == LM_ST_DEFERRED) &&
593 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 581 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
594 lck_flags |= LM_FLAG_TRY_1CB; 582 lck_flags |= LM_FLAG_TRY_1CB;
595 ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
596 583
597 if (!(ret & LM_OUT_ASYNC)) { 584 if (sdp->sd_lockstruct.ls_ops->lm_lock) {
598 finish_xmote(gl, ret); 585 /* lock_dlm */
586 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
587 GLOCK_BUG_ON(gl, ret);
588 } else { /* lock_nolock */
589 finish_xmote(gl, target);
599 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 590 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
600 gfs2_glock_put(gl); 591 gfs2_glock_put(gl);
601 } else {
602 GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
603 } 592 }
593
604 spin_lock(&gl->gl_spin); 594 spin_lock(&gl->gl_spin);
605} 595}
606 596
@@ -684,21 +674,20 @@ static void delete_work_func(struct work_struct *work)
684{ 674{
685 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); 675 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
686 struct gfs2_sbd *sdp = gl->gl_sbd; 676 struct gfs2_sbd *sdp = gl->gl_sbd;
687 struct gfs2_inode *ip = NULL; 677 struct gfs2_inode *ip;
688 struct inode *inode; 678 struct inode *inode;
689 u64 no_addr = 0; 679 u64 no_addr = gl->gl_name.ln_number;
680
681 ip = gl->gl_object;
682 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
690 683
691 spin_lock(&gl->gl_spin);
692 ip = (struct gfs2_inode *)gl->gl_object;
693 if (ip) 684 if (ip)
694 no_addr = ip->i_no_addr;
695 spin_unlock(&gl->gl_spin);
696 if (ip) {
697 inode = gfs2_ilookup(sdp->sd_vfs, no_addr); 685 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
698 if (inode) { 686 else
699 d_prune_aliases(inode); 687 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
700 iput(inode); 688 if (inode && !IS_ERR(inode)) {
701 } 689 d_prune_aliases(inode);
690 iput(inode);
702 } 691 }
703 gfs2_glock_put(gl); 692 gfs2_glock_put(gl);
704} 693}
@@ -950,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
950 939
951void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 940void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
952{ 941{
942 struct va_format vaf;
953 va_list args; 943 va_list args;
954 944
955 va_start(args, fmt); 945 va_start(args, fmt);
946
956 if (seq) { 947 if (seq) {
957 struct gfs2_glock_iter *gi = seq->private; 948 struct gfs2_glock_iter *gi = seq->private;
958 vsprintf(gi->string, fmt, args); 949 vsprintf(gi->string, fmt, args);
959 seq_printf(seq, gi->string); 950 seq_printf(seq, gi->string);
960 } else { 951 } else {
961 printk(KERN_ERR " "); 952 vaf.fmt = fmt;
962 vprintk(fmt, args); 953 vaf.va = &args;
954
955 printk(KERN_ERR " %pV", &vaf);
963 } 956 }
957
964 va_end(args); 958 va_end(args);
965} 959}
966 960
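
The else branch now forwards the varargs through a struct va_format and the kernel's %pV format specifier instead of issuing two printk calls, so the prefix and the caller's format come out as a single message. The idiom in isolation (kernel-context sketch; %pV has no userspace equivalent):

/* Sketch: forwarding printf-style varargs via %pV (kernel only). */
static void log_with_prefix(const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_ERR "prefix: %pV", &vaf);
	va_end(args);
}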
@@ -1012,6 +1006,7 @@ fail:
1012 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) 1006 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
1013 insert_pt = &gh2->gh_list; 1007 insert_pt = &gh2->gh_list;
1014 } 1008 }
1009 set_bit(GLF_QUEUED, &gl->gl_flags);
1015 if (likely(insert_pt == NULL)) { 1010 if (likely(insert_pt == NULL)) {
1016 list_add_tail(&gh->gh_list, &gl->gl_holders); 1011 list_add_tail(&gh->gh_list, &gl->gl_holders);
1017 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 1012 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1305,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1310 1305
1311 gfs2_glock_hold(gl); 1306 gfs2_glock_hold(gl);
1312 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1307 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1313 if (time_before(now, holdtime)) 1308 if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
1314 delay = holdtime - now; 1309 if (time_before(now, holdtime))
1315 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 1310 delay = holdtime - now;
1316 delay = gl->gl_ops->go_min_hold_time; 1311 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1312 delay = gl->gl_ops->go_min_hold_time;
1313 }
1317 1314
1318 spin_lock(&gl->gl_spin); 1315 spin_lock(&gl->gl_spin);
1319 handle_callback(gl, state, delay); 1316 handle_callback(gl, state, delay);
@@ -1357,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
1357 * @gl: Pointer to the glock 1354 * @gl: Pointer to the glock
1358 * @ret: The return value from the dlm 1355 * @ret: The return value from the dlm
1359 * 1356 *
1357 * The gl_reply field is under the gl_spin lock so that it is ok
1358 * to use a bitfield shared with other glock state fields.
1360 */ 1359 */
1361 1360
1362void gfs2_glock_complete(struct gfs2_glock *gl, int ret) 1361void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1363{ 1362{
1364 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 1363 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1365 1364
1365 spin_lock(&gl->gl_spin);
1366 gl->gl_reply = ret; 1366 gl->gl_reply = ret;
1367 1367
1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1369 spin_lock(&gl->gl_spin);
1370 if (gfs2_should_freeze(gl)) { 1369 if (gfs2_should_freeze(gl)) {
1371 set_bit(GLF_FROZEN, &gl->gl_flags); 1370 set_bit(GLF_FROZEN, &gl->gl_flags);
1372 spin_unlock(&gl->gl_spin); 1371 spin_unlock(&gl->gl_spin);
1373 return; 1372 return;
1374 } 1373 }
1375 spin_unlock(&gl->gl_spin);
1376 } 1374 }
1375
1376 spin_unlock(&gl->gl_spin);
1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1378 smp_wmb();
1378 gfs2_glock_hold(gl); 1379 gfs2_glock_hold(gl);
1379 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1380 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1380 gfs2_glock_put(gl); 1381 gfs2_glock_put(gl);
@@ -1512,7 +1513,7 @@ static void clear_glock(struct gfs2_glock *gl)
1512 spin_unlock(&lru_lock); 1513 spin_unlock(&lru_lock);
1513 1514
1514 spin_lock(&gl->gl_spin); 1515 spin_lock(&gl->gl_spin);
1515 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1516 if (gl->gl_state != LM_ST_UNLOCKED)
1516 handle_callback(gl, LM_ST_UNLOCKED, 0); 1517 handle_callback(gl, LM_ST_UNLOCKED, 0);
1517 spin_unlock(&gl->gl_spin); 1518 spin_unlock(&gl->gl_spin);
1518 gfs2_glock_hold(gl); 1519 gfs2_glock_hold(gl);
@@ -1622,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1622static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1623static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1623{ 1624{
1624 struct task_struct *gh_owner = NULL; 1625 struct task_struct *gh_owner = NULL;
1625 char buffer[KSYM_SYMBOL_LEN];
1626 char flags_buf[32]; 1626 char flags_buf[32];
1627 1627
1628 sprint_symbol(buffer, gh->gh_ip);
1629 if (gh->gh_owner_pid) 1628 if (gh->gh_owner_pid)
1630 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1629 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1631 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", 1630 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
1632 state2str(gh->gh_state), 1631 state2str(gh->gh_state),
1633 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), 1632 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
1634 gh->gh_error, 1633 gh->gh_error,
1635 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, 1634 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1636 gh_owner ? gh_owner->comm : "(ended)", buffer); 1635 gh_owner ? gh_owner->comm : "(ended)",
1636 (void *)gh->gh_ip);
1637 return 0; 1637 return 0;
1638} 1638}
1639 1639
@@ -1660,6 +1660,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1660 *p++ = 'I'; 1660 *p++ = 'I';
1661 if (test_bit(GLF_FROZEN, gflags)) 1661 if (test_bit(GLF_FROZEN, gflags))
1662 *p++ = 'F'; 1662 *p++ = 'F';
1663 if (test_bit(GLF_QUEUED, gflags))
1664 *p++ = 'q';
1663 *p = 0; 1665 *p = 0;
1664 return buf; 1666 return buf;
1665} 1667}
@@ -1776,10 +1778,13 @@ int __init gfs2_glock_init(void)
1776 } 1778 }
1777#endif 1779#endif
1778 1780
1779 glock_workqueue = create_workqueue("glock_workqueue"); 1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1780 if (IS_ERR(glock_workqueue)) 1783 if (IS_ERR(glock_workqueue))
1781 return PTR_ERR(glock_workqueue); 1784 return PTR_ERR(glock_workqueue);
1782 gfs2_delete_workqueue = create_workqueue("delete_workqueue"); 1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE,
1787 0);
1783 if (IS_ERR(gfs2_delete_workqueue)) { 1788 if (IS_ERR(gfs2_delete_workqueue)) {
1784 destroy_workqueue(glock_workqueue); 1789 destroy_workqueue(glock_workqueue);
1785 return PTR_ERR(gfs2_delete_workqueue); 1790 return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b156..691851ceb615 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
87#define GL_ASYNC 0x00000040 87#define GL_ASYNC 0x00000040
88#define GL_EXACT 0x00000080 88#define GL_EXACT 0x00000080
89#define GL_SKIP 0x00000100 89#define GL_SKIP 0x00000100
90#define GL_ATIME 0x00000200
91#define GL_NOCACHE 0x00000400 90#define GL_NOCACHE 0x00000400
92 91
93/* 92/*
94 * lm_lock() and lm_async_cb return flags 93 * lm_async_cb return flags
95 * 94 *
96 * LM_OUT_ST_MASK 95 * LM_OUT_ST_MASK
97 * Masks the lower two bits of lock state in the returned value. 96 * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
99 * LM_OUT_CANCELED 98 * LM_OUT_CANCELED
100 * The lock request was canceled. 99 * The lock request was canceled.
101 * 100 *
102 * LM_OUT_ASYNC
103 * The result of the request will be returned in an LM_CB_ASYNC callback.
104 *
105 */ 101 */
106 102
107#define LM_OUT_ST_MASK 0x00000003 103#define LM_OUT_ST_MASK 0x00000003
108#define LM_OUT_CANCELED 0x00000008 104#define LM_OUT_CANCELED 0x00000008
109#define LM_OUT_ASYNC 0x00000080 105#define LM_OUT_ERROR 0x00000004
110#define LM_OUT_ERROR 0x00000100
111 106
112/* 107/*
113 * lm_recovery_done() messages 108 * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
124 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
128 unsigned int req_state, unsigned int flags); 123 unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
130 const match_table_t *lm_tokens; 125 const match_table_t *lm_tokens;
131}; 126};
132 127
133#define LM_FLAG_TRY 0x00000001
134#define LM_FLAG_TRY_1CB 0x00000002
135#define LM_FLAG_NOEXP 0x00000004
136#define LM_FLAG_ANY 0x00000008
137#define LM_FLAG_PRIORITY 0x00000010
138
139#define GL_ASYNC 0x00000040
140#define GL_EXACT 0x00000080
141#define GL_SKIP 0x00000100
142#define GL_NOCACHE 0x00000400
143
144#define GLR_TRYFAILED 13
145
146extern struct workqueue_struct *gfs2_delete_workqueue; 128extern struct workqueue_struct *gfs2_delete_workqueue;
147static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 129static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
148{ 130{
@@ -212,10 +194,12 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
212int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 194int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
213void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 195void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
214void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 196void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
197
198__attribute__ ((format(printf, 2, 3)))
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 199void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 200
217/** 201/**
218 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 202 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
219 * @gl: the glock 203 * @gl: the glock
220 * @state: the state we're requesting 204 * @state: the state we're requesting
221 * @flags: the modifier flags 205 * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb690..263561bf1a50 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
262 const struct gfs2_inode *ip = gl->gl_object; 262 const struct gfs2_inode *ip = gl->gl_object;
263 if (ip == NULL) 263 if (ip == NULL)
264 return 0; 264 return 0;
265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", 265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
266 (unsigned long long)ip->i_no_formal_ino, 266 (unsigned long long)ip->i_no_formal_ino,
267 (unsigned long long)ip->i_no_addr, 267 (unsigned long long)ip->i_no_addr,
268 IF2DT(ip->i_inode.i_mode), ip->i_flags, 268 IF2DT(ip->i_inode.i_mode), ip->i_flags,
269 (unsigned int)ip->i_diskflags, 269 (unsigned int)ip->i_diskflags,
270 (unsigned long long)ip->i_inode.i_size, 270 (unsigned long long)i_size_read(&ip->i_inode));
271 (unsigned long long)ip->i_disksize);
272 return 0; 271 return 0;
273} 272}
274 273
@@ -326,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
326 325
327 if (gl->gl_state != LM_ST_UNLOCKED && 326 if (gl->gl_state != LM_ST_UNLOCKED &&
328 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
329 flush_workqueue(gfs2_delete_workqueue);
330 gfs2_meta_syncfs(sdp); 328 gfs2_meta_syncfs(sdp);
331 gfs2_log_shutdown(sdp); 329 gfs2_log_shutdown(sdp);
332 } 330 }
@@ -453,7 +451,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
453 [LM_TYPE_META] = &gfs2_meta_glops, 451 [LM_TYPE_META] = &gfs2_meta_glops,
454 [LM_TYPE_INODE] = &gfs2_inode_glops, 452 [LM_TYPE_INODE] = &gfs2_inode_glops,
455 [LM_TYPE_RGRP] = &gfs2_rgrp_glops, 453 [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
456 [LM_TYPE_NONDISK] = &gfs2_trans_glops,
457 [LM_TYPE_IOPEN] = &gfs2_iopen_glops, 454 [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
458 [LM_TYPE_FLOCK] = &gfs2_flock_glops, 455 [LM_TYPE_FLOCK] = &gfs2_flock_glops,
459 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, 456 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdbf4b366fa5..a79790c06275 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
11#define __INCORE_DOT_H__ 11#define __INCORE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kobject.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
15#include <linux/dlm.h> 16#include <linux/dlm.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
@@ -196,6 +197,7 @@ enum {
196 GLF_REPLY_PENDING = 9, 197 GLF_REPLY_PENDING = 9,
197 GLF_INITIAL = 10, 198 GLF_INITIAL = 10,
198 GLF_FROZEN = 11, 199 GLF_FROZEN = 11,
200 GLF_QUEUED = 12,
199}; 201};
200 202
201struct gfs2_glock { 203struct gfs2_glock {
@@ -206,12 +208,14 @@ struct gfs2_glock {
206 208
207 spinlock_t gl_spin; 209 spinlock_t gl_spin;
208 210
209 unsigned int gl_state; 211 /* State fields protected by gl_spin */
210 unsigned int gl_target; 212 unsigned int gl_state:2, /* Current state */
211 unsigned int gl_reply; 213 gl_target:2, /* Target state */
214 gl_demote_state:2, /* State requested by remote node */
215 gl_req:2, /* State in last dlm request */
216 gl_reply:8; /* Last reply from the dlm */
217
212 unsigned int gl_hash; 218 unsigned int gl_hash;
213 unsigned int gl_req;
214 unsigned int gl_demote_state; /* state requested by remote node */
215 unsigned long gl_demote_time; /* time of first demote request */ 219 unsigned long gl_demote_time; /* time of first demote request */
216 struct list_head gl_holders; 220 struct list_head gl_holders;
217 221
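
Packing gl_state, gl_target, gl_demote_state, gl_req and gl_reply into adjacent bitfields shrinks struct gfs2_glock, but every store to one of them is a read-modify-write of storage shared with the others, which is only safe because all writers hold gl_spin (the glock.c hunk above makes the same point for gl_reply). A userspace illustration of the packing, with the field widths from the patch (the exact layout is compiler-dependent):

#include <stdio.h>

struct glock_state {
	unsigned int state:2;		/* current state */
	unsigned int target:2;		/* target state */
	unsigned int demote_state:2;	/* state requested by remote node */
	unsigned int req:2;		/* state in last dlm request */
	unsigned int reply:8;		/* last reply from the dlm */
};

int main(void)
{
	struct glock_state gs = { .state = 3, .reply = 0xab };

	/* All five fields share one 32-bit unit here, so writing gs.req
	 * rewrites the word that also holds gs.reply: hence the lock. */
	printf("sizeof = %zu, state=%u reply=0x%x\n",
	       sizeof(gs), gs.state, gs.reply);
	return 0;
}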
@@ -267,7 +271,6 @@ struct gfs2_inode {
267 u64 i_no_formal_ino; 271 u64 i_no_formal_ino;
268 u64 i_generation; 272 u64 i_generation;
269 u64 i_eattr; 273 u64 i_eattr;
270 loff_t i_disksize;
271 unsigned long i_flags; /* GIF_... */ 274 unsigned long i_flags; /* GIF_... */
272 struct gfs2_glock *i_gl; /* Move into i_gh? */ 275 struct gfs2_glock *i_gl; /* Move into i_gh? */
273 struct gfs2_holder i_iopen_gh; 276 struct gfs2_holder i_iopen_gh;
@@ -416,11 +419,8 @@ struct gfs2_args {
416 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 419 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
417 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 420 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
418 unsigned int ar_spectator:1; /* Don't get a journal */ 421 unsigned int ar_spectator:1; /* Don't get a journal */
419 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
420 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ 422 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
421 unsigned int ar_localcaching:1; /* Local caching */
422 unsigned int ar_debug:1; /* Oops on errors */ 423 unsigned int ar_debug:1; /* Oops on errors */
423 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
424 unsigned int ar_posix_acl:1; /* Enable posix acls */ 424 unsigned int ar_posix_acl:1; /* Enable posix acls */
425 unsigned int ar_quota:2; /* off/account/on */ 425 unsigned int ar_quota:2; /* off/account/on */
426 unsigned int ar_suiddir:1; /* suiddir support */ 426 unsigned int ar_suiddir:1; /* suiddir support */
@@ -497,7 +497,7 @@ struct gfs2_sb_host {
497 */ 497 */
498 498
499struct lm_lockstruct { 499struct lm_lockstruct {
500 unsigned int ls_jid; 500 int ls_jid;
501 unsigned int ls_first; 501 unsigned int ls_first;
502 unsigned int ls_first_done; 502 unsigned int ls_first_done;
503 unsigned int ls_nodir; 503 unsigned int ls_nodir;
@@ -572,6 +572,7 @@ struct gfs2_sbd {
572 struct list_head sd_rindex_mru_list; 572 struct list_head sd_rindex_mru_list;
573 struct gfs2_rgrpd *sd_rindex_forward; 573 struct gfs2_rgrpd *sd_rindex_forward;
574 unsigned int sd_rgrps; 574 unsigned int sd_rgrps;
575 unsigned int sd_max_rg_data;
575 576
576 /* Journal index stuff */ 577 /* Journal index stuff */
577 578
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 08140f185a37..7aa7d4f8984a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,60 +73,15 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); 73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
74} 74}
75 75
76struct gfs2_skip_data {
77 u64 no_addr;
78 int skipped;
79};
80
81static int iget_skip_test(struct inode *inode, void *opaque)
82{
83 struct gfs2_inode *ip = GFS2_I(inode);
84 struct gfs2_skip_data *data = opaque;
85
86 if (ip->i_no_addr == data->no_addr) {
87 if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88 data->skipped = 1;
89 return 0;
90 }
91 return 1;
92 }
93 return 0;
94}
95
96static int iget_skip_set(struct inode *inode, void *opaque)
97{
98 struct gfs2_inode *ip = GFS2_I(inode);
99 struct gfs2_skip_data *data = opaque;
100
101 if (data->skipped)
102 return 1;
103 inode->i_ino = (unsigned long)(data->no_addr);
104 ip->i_no_addr = data->no_addr;
105 return 0;
106}
107
108static struct inode *gfs2_iget_skip(struct super_block *sb,
109 u64 no_addr)
110{
111 struct gfs2_skip_data data;
112 unsigned long hash = (unsigned long)no_addr;
113
114 data.no_addr = no_addr;
115 data.skipped = 0;
116 return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
117}
118
119/** 76/**
120 * GFS2 lookup code fills in vfs inode contents based on info obtained 77 * gfs2_set_iop - Sets inode operations
121 * from directory entry inside gfs2_inode_lookup(). This has caused issues 78 * @inode: The inode with correct i_mode filled in
122 * with NFS code path since its get_dentry routine doesn't have the relevant
123 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
124 * segment inside gfs2_inode_lookup code needs to get moved around.
125 * 79 *
126 * Clears I_NEW as well. 80 * GFS2 lookup code fills in vfs inode contents based on info obtained
127 **/ 81 * from directory entry inside gfs2_inode_lookup().
82 */
128 83
129void gfs2_set_iop(struct inode *inode) 84static void gfs2_set_iop(struct inode *inode)
130{ 85{
131 struct gfs2_sbd *sdp = GFS2_SB(inode); 86 struct gfs2_sbd *sdp = GFS2_SB(inode);
132 umode_t mode = inode->i_mode; 87 umode_t mode = inode->i_mode;
@@ -149,8 +104,6 @@ void gfs2_set_iop(struct inode *inode)
149 inode->i_op = &gfs2_file_iops; 104 inode->i_op = &gfs2_file_iops;
150 init_special_inode(inode, inode->i_mode, inode->i_rdev); 105 init_special_inode(inode, inode->i_mode, inode->i_rdev);
151 } 106 }
152
153 unlock_new_inode(inode);
154} 107}
155 108
156/** 109/**
@@ -162,10 +115,8 @@ void gfs2_set_iop(struct inode *inode)
162 * Returns: A VFS inode, or an error 115 * Returns: A VFS inode, or an error
163 */ 116 */
164 117
165struct inode *gfs2_inode_lookup(struct super_block *sb, 118struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
166 unsigned int type, 119 u64 no_addr, u64 no_formal_ino)
167 u64 no_addr,
168 u64 no_formal_ino)
169{ 120{
170 struct inode *inode; 121 struct inode *inode;
171 struct gfs2_inode *ip; 122 struct gfs2_inode *ip;
@@ -195,141 +146,80 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
195 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); 146 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
196 if (unlikely(error)) 147 if (unlikely(error))
197 goto fail_iopen; 148 goto fail_iopen;
198 ip->i_iopen_gh.gh_gl->gl_object = ip;
199 149
150 ip->i_iopen_gh.gh_gl->gl_object = ip;
200 gfs2_glock_put(io_gl); 151 gfs2_glock_put(io_gl);
201 io_gl = NULL; 152 io_gl = NULL;
202 153
203 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
204 goto gfs2_nfsbypass;
205
206 inode->i_mode = DT2IF(type);
207
208 /*
209 * We must read the inode in order to work out its type in
210 * this case. Note that this doesn't happen often as we normally
211 * know the type beforehand. This code path only occurs during
212 * unlinked inode recovery (where it is safe to do this glock,
213 * which is not true in the general case).
214 */
215 if (type == DT_UNKNOWN) { 154 if (type == DT_UNKNOWN) {
216 struct gfs2_holder gh; 155 /* Inode glock must be locked already */
217 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 156 error = gfs2_inode_refresh(GFS2_I(inode));
218 if (unlikely(error)) 157 if (error)
219 goto fail_glock; 158 goto fail_refresh;
220 /* Inode is now uptodate */ 159 } else {
221 gfs2_glock_dq_uninit(&gh); 160 inode->i_mode = DT2IF(type);
222 } 161 }
223 162
224 gfs2_set_iop(inode); 163 gfs2_set_iop(inode);
164 unlock_new_inode(inode);
225 } 165 }
226 166
227gfs2_nfsbypass:
228 return inode; 167 return inode;
229fail_glock: 168
230 gfs2_glock_dq(&ip->i_iopen_gh); 169fail_refresh:
170 ip->i_iopen_gh.gh_gl->gl_object = NULL;
171 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
231fail_iopen: 172fail_iopen:
232 if (io_gl) 173 if (io_gl)
233 gfs2_glock_put(io_gl); 174 gfs2_glock_put(io_gl);
234fail_put: 175fail_put:
235 if (inode->i_state & I_NEW)
236 ip->i_gl->gl_object = NULL;
176 ip->i_gl->gl_object = NULL;
237 gfs2_glock_put(ip->i_gl); 177 gfs2_glock_put(ip->i_gl);
238fail: 178fail:
239 if (inode->i_state & I_NEW)
240 iget_failed(inode);
241 else
242 iput(inode);
179 iget_failed(inode);
243 return ERR_PTR(error); 180 return ERR_PTR(error);
244} 181}
245 182
246/**
247 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
248 * and try to reclaim it by doing iput.
249 *
250 * This function assumes no rgrp locks are currently held.
251 *
252 * @sb: The super block
253 * no_addr: The inode number
254 *
255 */
256
257void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
183struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
184 u64 *no_formal_ino, unsigned int blktype)
258{ 185{
259 struct gfs2_sbd *sdp;
260 struct gfs2_inode *ip;
261 struct gfs2_glock *io_gl = NULL;
262 int error;
263 struct gfs2_holder gh;
264 struct inode *inode;
186 struct super_block *sb = sdp->sd_vfs;
187 struct gfs2_holder i_gh;
188 struct inode *inode;
189 int error;
265 190
266 inode = gfs2_iget_skip(sb, no_addr);
267
268 if (!inode)
269 return;
270
271 /* If it's not a new inode, someone's using it, so leave it alone. */
272 if (!(inode->i_state & I_NEW)) {
273 iput(inode);
274 return;
275 }
276
277 ip = GFS2_I(inode);
278 sdp = GFS2_SB(inode);
279 ip->i_no_formal_ino = -1;
191 error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
192 LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
193 if (error)
194 return ERR_PTR(error);
280 195
281 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
282 if (unlikely(error))
283 goto fail;
284 ip->i_gl->gl_object = ip;
285
286 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
287 if (unlikely(error))
288 goto fail_put;
289
290 set_bit(GIF_INVALID, &ip->i_flags);
291 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
292 &ip->i_iopen_gh);
293 if (unlikely(error))
294 goto fail_iopen;
295
296 ip->i_iopen_gh.gh_gl->gl_object = ip;
297 gfs2_glock_put(io_gl);
298 io_gl = NULL;
196 error = gfs2_check_blk_type(sdp, no_addr, blktype);
197 if (error)
198 goto fail;
299 199
300 inode->i_mode = DT2IF(DT_UNKNOWN);
200 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
201 if (IS_ERR(inode))
202 goto fail;
301 203
302 /*
303 * We must read the inode in order to work out its type in
304 * this case. Note that this doesn't happen often as we normally
305 * know the type beforehand. This code path only occurs during
306 * unlinked inode recovery (where it is safe to do this glock,
307 * which is not true in the general case).
308 */
309 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
310 &gh);
311 if (unlikely(error))
312 goto fail_glock;
204 /* Two extra checks for NFS only */
205 if (no_formal_ino) {
206 error = -ESTALE;
207 if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
208 goto fail_iput;
313 209
314 /* Inode is now uptodate */
315 gfs2_glock_dq_uninit(&gh);
316 gfs2_set_iop(inode);
210 error = -EIO;
211 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
212 goto fail_iput;
317 213
318 /* The iput will cause it to be deleted. */
319 iput(inode);
320 return;
214 error = 0;
215 }
321 216
322fail_glock:
323 gfs2_glock_dq(&ip->i_iopen_gh);
324fail_iopen:
325 if (io_gl)
326 gfs2_glock_put(io_gl);
327fail_put:
328 ip->i_gl->gl_object = NULL;
329 gfs2_glock_put(ip->i_gl);
330fail: 217fail:
331 iget_failed(inode);
332 return;
218 gfs2_glock_dq_uninit(&i_gh);
219 return error ? ERR_PTR(error) : inode;
220fail_iput:
221 iput(inode);
222 goto fail;
333} 223}
334 224
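
Note: gfs2_lookup_by_inum() above replaces gfs2_process_unlinked_inode() and is what NFS file-handle resolution now goes through: take the inode glock shared, verify the block type, look the inode up, then apply two NFS-only checks so a stale handle gets -ESTALE rather than the wrong inode. A minimal userspace sketch of that verification order follows; the struct and names are stand-ins, not kernel definitions.

#include <errno.h>
#include <stdint.h>

struct ino_stub {
        uint64_t formal_ino;    /* generation-like number from the dinode */
        int      is_system;     /* analogue of GFS2_DIF_SYSTEM */
};

/* Returns 0 if an NFS handle may be resolved to this inode. */
static int verify_handle(const struct ino_stub *ino,
                         const uint64_t *wanted_formal_ino)
{
        if (!wanted_formal_ino)
                return 0;               /* not an NFS lookup */
        if (ino->formal_ino != *wanted_formal_ino)
                return -ESTALE;         /* the handle outlived the inode */
        if (ino->is_system)
                return -EIO;            /* never export internal system files */
        return 0;
}
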
335static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 225static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -359,8 +249,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
359 * to do that. 249 * to do that.
360 */ 250 */
361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 251 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
362 ip->i_disksize = be64_to_cpu(str->di_size);
363 i_size_write(&ip->i_inode, ip->i_disksize);
252 i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
364 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 253 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
365 atime.tv_sec = be64_to_cpu(str->di_atime); 254 atime.tv_sec = be64_to_cpu(str->di_atime);
366 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 255 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
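
Note: this hunk is part of removing the GFS2-private i_disksize copy of the file size; the VFS i_size is now the only copy, written with i_size_write() and read with i_size_read(). Those helpers exist because a 64-bit size cannot be loaded atomically on 32-bit SMP. A simplified userspace model of the seqcount idea behind them, for illustration only (not the kernel implementation):

#include <stdint.h>

struct isize { volatile uint32_t seq; uint64_t size; };

static uint64_t isize_read(struct isize *s)
{
        uint32_t seq;
        uint64_t v;
        do {
                seq = s->seq;           /* odd = writer in progress */
                v = s->size;
        } while ((seq & 1) || seq != s->seq);   /* retry on torn read */
        return v;
}

static void isize_write(struct isize *s, uint64_t v)
{
        s->seq++;                       /* enter write section (seq odd) */
        s->size = v;
        s->seq++;                       /* leave it (seq even again) */
}
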
@@ -592,7 +481,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
592 } 481 }
593 482
594 if (!is_root) { 483 if (!is_root) {
595 error = gfs2_permission(dir, MAY_EXEC); 484 error = gfs2_permission(dir, MAY_EXEC, 0);
596 if (error) 485 if (error)
597 goto out; 486 goto out;
598 } 487 }
@@ -622,7 +511,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
622{ 511{
623 int error; 512 int error;
624 513
625 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); 514 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
626 if (error) 515 if (error)
627 return error; 516 return error;
628 517
@@ -999,17 +888,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
999 if (error) 888 if (error)
1000 return error; 889 return error;
1001 890
1002 if ((attr->ia_valid & ATTR_SIZE) &&
1003 attr->ia_size != i_size_read(inode)) {
1004 error = vmtruncate(inode, attr->ia_size);
1005 if (error)
1006 return error;
1007 }
1008
1009 setattr_copy(inode, attr); 891 setattr_copy(inode, attr);
1010 mark_inode_dirty(inode); 892 mark_inode_dirty(inode);
1011
1012 gfs2_assert_warn(GFS2_SB(inode), !error);
1013 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 893 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1014 gfs2_dinode_out(ip, dibh->b_data); 894 gfs2_dinode_out(ip, dibh->b_data);
1015 brelse(dibh); 895 brelse(dibh);
@@ -1055,7 +935,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1055 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 935 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1056 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 936 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1057 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 937 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1058 str->di_size = cpu_to_be64(ip->i_disksize); 938 str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
1059 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 939 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1060 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 940 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1061 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 941 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +965,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
1085 (unsigned long long)ip->i_no_formal_ino); 965 (unsigned long long)ip->i_no_formal_ino);
1086 printk(KERN_INFO " no_addr = %llu\n", 966 printk(KERN_INFO " no_addr = %llu\n",
1087 (unsigned long long)ip->i_no_addr); 967 (unsigned long long)ip->i_no_addr);
1088 printk(KERN_INFO " i_disksize = %llu\n", 968 printk(KERN_INFO " i_size = %llu\n",
1089 (unsigned long long)ip->i_disksize); 969 (unsigned long long)i_size_read(&ip->i_inode));
1090 printk(KERN_INFO " blocks = %llu\n", 970 printk(KERN_INFO " blocks = %llu\n",
1091 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 971 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1092 printk(KERN_INFO " i_goal = %llu\n", 972 printk(KERN_INFO " i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21de..3e00a66e7cbd 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
19extern int gfs2_internal_read(struct gfs2_inode *ip, 19extern int gfs2_internal_read(struct gfs2_inode *ip,
20 struct file_ra_state *ra_state, 20 struct file_ra_state *ra_state,
21 char *buf, loff_t *pos, unsigned size); 21 char *buf, loff_t *pos, unsigned size);
22extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
23 unsigned int from, unsigned int to);
22extern void gfs2_set_aops(struct inode *inode); 24extern void gfs2_set_aops(struct inode *inode);
23 25
24static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 26static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,11 +82,25 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
80 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); 82 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
81} 83}
82 84
85static inline int gfs2_check_internal_file_size(struct inode *inode,
86 u64 minsize, u64 maxsize)
87{
88 u64 size = i_size_read(inode);
89 if (size < minsize || size > maxsize)
90 goto err;
91 if (size & ((1 << inode->i_blkbits) - 1))
92 goto err;
93 return 0;
94err:
95 gfs2_consist_inode(GFS2_I(inode));
96 return -EIO;
97}
83 98
84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 99extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino); 100 u64 no_addr, u64 no_formal_ino);
87extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); 101extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
102 u64 *no_formal_ino,
103 unsigned int blktype);
88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 104extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
89 105
90extern int gfs2_inode_refresh(struct gfs2_inode *ip); 106extern int gfs2_inode_refresh(struct gfs2_inode *ip);
@@ -96,7 +112,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
96extern struct inode *gfs2_createi(struct gfs2_holder *ghs, 112extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
97 const struct qstr *name, 113 const struct qstr *name,
98 unsigned int mode, dev_t dev); 114 unsigned int mode, dev_t dev);
99extern int gfs2_permission(struct inode *inode, int mask); 115extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
100extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 116extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
101extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 117extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
102extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 118extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
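
Note: the new gfs2_check_internal_file_size() inline folds the repeated "sane size for an internal file" checks into one place: the size must lie in [minsize, maxsize] and be a multiple of the block size (the (1 << i_blkbits) - 1 mask). Typical use, mirroring the gfs2_quota_init() hunk later in this diff:

/* Reject a quota-change file that is empty, larger than 64MB,
 * or not a multiple of the filesystem block size. */
if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
        return -EIO;
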
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c2..6e493aee28f8 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
42 ret |= LM_OUT_CANCELED; 42 ret |= LM_OUT_CANCELED;
43 goto out; 43 goto out;
44 case -EAGAIN: /* Try lock fails */ 44 case -EAGAIN: /* Try lock fails */
45 case -EDEADLK: /* Deadlock detected */
45 goto out; 46 goto out;
46 case -EINVAL: /* Invalid */ 47 case -ETIMEDOUT: /* Canceled due to timeout */
47 case -ENOMEM: /* Out of memory */
48 ret |= LM_OUT_ERROR; 48 ret |= LM_OUT_ERROR;
49 goto out; 49 goto out;
50 case 0: /* Success */ 50 case 0: /* Success */
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
146 return lkf; 146 return lkf;
147} 147}
148 148
149static unsigned int gdlm_lock(struct gfs2_glock *gl, 149static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
150 unsigned int req_state, unsigned int flags) 150 unsigned int flags)
151{ 151{
152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
153 int error;
154 int req; 153 int req;
155 u32 lkf; 154 u32 lkf;
156 155
157 gl->gl_req = req_state;
158 req = make_mode(req_state); 156 req = make_mode(req_state);
159 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); 157 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
160 158
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
162 * Submit the actual lock request. 160 * Submit the actual lock request.
163 */ 161 */
164 162
165 error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, 163 return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
166 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
167 if (error == -EAGAIN)
168 return 0;
169 if (error)
170 return LM_OUT_ERROR;
171 return LM_OUT_ASYNC;
172} 165}
173 166
174static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
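
Note: gdlm_lock() now propagates dlm_lock()'s return value directly. A nonzero return means the request was never submitted; on success the grant, failure, or cancel is always delivered through gdlm_ast(), so the old LM_OUT_ASYNC/LM_OUT_ERROR translation layer becomes dead weight. Sketch of a caller under the new convention (the error handler is a hypothetical placeholder, not a GFS2 function):

int error = gdlm_lock(gl, LM_ST_EXCLUSIVE, 0);
if (error)
        handle_submit_failure(gl, error);   /* hypothetical helper */
/* else: the result arrives asynchronously via gdlm_ast() */
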
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ac750bd31a6f..eb01f3575e10 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
592 lh->lh_hash = cpu_to_be32(hash); 592 lh->lh_hash = cpu_to_be32(hash);
593 593
594 bh->b_end_io = end_buffer_write_sync; 594 bh->b_end_io = end_buffer_write_sync;
595 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
596 goto skip_barrier;
597 get_bh(bh); 595 get_bh(bh);
598 submit_bh(WRITE_BARRIER | REQ_META, bh);
599 wait_on_buffer(bh);
600 if (buffer_eopnotsupp(bh)) {
601 clear_buffer_eopnotsupp(bh);
602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
605 lock_buffer(bh);
606skip_barrier:
607 get_bh(bh);
608 submit_bh(WRITE_SYNC | REQ_META, bh);
609 wait_on_buffer(bh);
610 }
596 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
597 submit_bh(WRITE_SYNC | REQ_META, bh);
598 else
599 submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
600 wait_on_buffer(bh);
601
611 if (!buffer_uptodate(bh)) 602 if (!buffer_uptodate(bh))
612 gfs2_io_error_bh(sdp, bh); 603 gfs2_io_error_bh(sdp, bh);
613 brelse(bh); 604 brelse(bh);
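
Note: the removed block was the old "try a barrier, fall back on -EOPNOTSUPP" dance. With the 2.6.37 block layer, flush/FUA requests degrade transparently on devices that cannot honour them, so the log header is now written exactly once. For reference, assuming the 2.6.37 definitions:

/* WRITE_FLUSH_FUA == WRITE_SYNC | REQ_FLUSH | REQ_FUA
 * REQ_FLUSH: flush the device write cache before this request.
 * REQ_FUA:   the write itself reaches stable media before completion.
 * Devices lacking either capability complete the request anyway, so
 * the buffer_eopnotsupp() retry above is no longer needed. */
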
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46a..ebef7ab6e17e 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
24#include "glock.h" 24#include "glock.h"
25#include "quota.h" 25#include "quota.h"
26#include "recovery.h" 26#include "recovery.h"
27#include "dir.h"
27 28
28static struct shrinker qd_shrinker = { 29static struct shrinker qd_shrinker = {
29 .shrink = gfs2_shrink_qd_memory, 30 .shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
78{ 79{
79 int error; 80 int error;
80 81
82 gfs2_str2qstr(&gfs2_qdot, ".");
83 gfs2_str2qstr(&gfs2_qdotdot, "..");
84
81 error = gfs2_sys_init(); 85 error = gfs2_sys_init();
82 if (error) 86 if (error)
83 return error; 87 return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
140 144
141 error = -ENOMEM; 145 error = -ENOMEM;
142 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 146 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
143 WQ_NON_REENTRANT | WQ_RESCUER, 0); 147 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
144 if (!gfs_recovery_wq) 148 if (!gfs_recovery_wq)
145 goto fail_wq; 149 goto fail_wq;
146 150
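
Note: WQ_RESCUER was folded into WQ_MEM_RECLAIM, which guarantees the recovery workqueue can make forward progress under memory pressure via a dedicated rescuer thread; WQ_FREEZEABLE parks it across suspend. The equivalent call, with max_active 0 meaning the default concurrency limit:

struct workqueue_struct *wq;

wq = alloc_workqueue("gfs_recovery", WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
if (!wq)
        return -ENOMEM;     /* caller unwinds, as in the hunk above */
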
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index f3b071f921aa..939739c7b3f9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
55 * activity, but those code paths have their own higher-level 55 * activity, but those code paths have their own higher-level
56 * throttling. 56 * throttling.
57 */ 57 */
58 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 58 if (wbc->sync_mode != WB_SYNC_NONE) {
59 lock_buffer(bh); 59 lock_buffer(bh);
60 } else if (!trylock_buffer(bh)) { 60 } else if (!trylock_buffer(bh)) {
61 redirty_page_for_writepage(wbc, page); 61 redirty_page_for_writepage(wbc, page);
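
Note: wbc->nonblocking was removed from the writeback path in this kernel series, so WB_SYNC_NONE alone now marks best-effort writeback. The surviving idiom: block on the buffer lock only when the writeback must complete, otherwise skip and let a later pass retry.

if (wbc->sync_mode != WB_SYNC_NONE) {
        lock_buffer(bh);                /* integrity writeback: must wait */
} else if (!trylock_buffer(bh)) {
        redirty_page_for_writepage(wbc, page);
        /* page stays dirty; a later WB_SYNC_ALL pass will catch it */
}
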
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4d4b1e8ac64c..777927ce6f79 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
38#define DO 0 38#define DO 0
39#define UNDO 1 39#define UNDO 1
40 40
41static const u32 gfs2_old_fs_formats[] = {
42 0
43};
44
45static const u32 gfs2_old_multihost_formats[] = {
46 0
47};
48
49/** 41/**
50 * gfs2_tune_init - Fill a gfs2_tune structure with default values 42 * gfs2_tune_init - Fill a gfs2_tune structure with default values
51 * @gt: tune 43 * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
135 127
136static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) 128static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
137{ 129{
138 unsigned int x;
139
140 if (sb->sb_magic != GFS2_MAGIC || 130 if (sb->sb_magic != GFS2_MAGIC ||
141 sb->sb_type != GFS2_METATYPE_SB) { 131 sb->sb_type != GFS2_METATYPE_SB) {
142 if (!silent) 132 if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
150 sb->sb_multihost_format == GFS2_FORMAT_MULTI) 140 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
151 return 0; 141 return 0;
152 142
153 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
143 fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
154 for (x = 0; gfs2_old_fs_formats[x]; x++)
155 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
156 break;
157
158 if (!gfs2_old_fs_formats[x]) {
159 printk(KERN_WARNING
160 "GFS2: code version (%u, %u) is incompatible "
161 "with ondisk format (%u, %u)\n",
162 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
163 sb->sb_fs_format, sb->sb_multihost_format);
164 printk(KERN_WARNING
165 "GFS2: I don't know how to upgrade this FS\n");
166 return -EINVAL;
167 }
168 }
169 144
170 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
145 return -EINVAL;
171 for (x = 0; gfs2_old_multihost_formats[x]; x++)
172 if (gfs2_old_multihost_formats[x] ==
173 sb->sb_multihost_format)
174 break;
175
176 if (!gfs2_old_multihost_formats[x]) {
177 printk(KERN_WARNING
178 "GFS2: code version (%u, %u) is incompatible "
179 "with ondisk format (%u, %u)\n",
180 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
181 sb->sb_fs_format, sb->sb_multihost_format);
182 printk(KERN_WARNING
183 "GFS2: I don't know how to upgrade this FS\n");
184 return -EINVAL;
185 }
186 }
187
188 if (!sdp->sd_args.ar_upgrade) {
189 printk(KERN_WARNING
190 "GFS2: code version (%u, %u) is incompatible "
191 "with ondisk format (%u, %u)\n",
192 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
193 sb->sb_fs_format, sb->sb_multihost_format);
194 printk(KERN_INFO
195 "GFS2: Use the \"upgrade\" mount option to upgrade "
196 "the FS\n");
197 printk(KERN_INFO "GFS2: See the manual for more details\n");
198 return -EINVAL;
199 }
200
201 return 0;
202} 146}
203 147
204static void end_bio_io_page(struct bio *bio, int error) 148static void end_bio_io_page(struct bio *bio, int error)
@@ -496,7 +440,6 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
496 iput(inode); 440 iput(inode);
497 return -ENOMEM; 441 return -ENOMEM;
498 } 442 }
499 dentry->d_op = &gfs2_dops;
500 *dptr = dentry; 443 *dptr = dentry;
501 return 0; 444 return 0;
502} 445}
@@ -586,7 +529,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
586 529
587 prev_db = 0; 530 prev_db = 0;
588 531
589 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { 532 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
590 bh.b_state = 0; 533 bh.b_state = 0;
591 bh.b_blocknr = 0; 534 bh.b_blocknr = 0;
592 bh.b_size = 1 << ip->i_inode.i_blkbits; 535 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +965,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1022 if (!strcmp("lock_nolock", proto)) { 965 if (!strcmp("lock_nolock", proto)) {
1023 lm = &nolock_ops; 966 lm = &nolock_ops;
1024 sdp->sd_args.ar_localflocks = 1; 967 sdp->sd_args.ar_localflocks = 1;
1025 sdp->sd_args.ar_localcaching = 1;
1026#ifdef CONFIG_GFS2_FS_LOCKING_DLM 968#ifdef CONFIG_GFS2_FS_LOCKING_DLM
1027 } else if (!strcmp("lock_dlm", proto)) { 969 } else if (!strcmp("lock_dlm", proto)) {
1028 lm = &gfs2_dlm_ops; 970 lm = &gfs2_dlm_ops;
@@ -1113,8 +1055,6 @@ static int gfs2_journalid_wait(void *word)
1113 1055
1114static int wait_on_journal(struct gfs2_sbd *sdp) 1056static int wait_on_journal(struct gfs2_sbd *sdp)
1115{ 1057{
1116 if (sdp->sd_args.ar_spectator)
1117 return 0;
1118 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 1058 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
1119 return 0; 1059 return 0;
1120 1060
@@ -1165,6 +1105,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1165 1105
1166 sb->s_magic = GFS2_MAGIC; 1106 sb->s_magic = GFS2_MAGIC;
1167 sb->s_op = &gfs2_super_ops; 1107 sb->s_op = &gfs2_super_ops;
1108 sb->s_d_op = &gfs2_dops;
1168 sb->s_export_op = &gfs2_export_ops; 1109 sb->s_export_op = &gfs2_export_ops;
1169 sb->s_xattr = gfs2_xattr_handlers; 1110 sb->s_xattr = gfs2_xattr_handlers;
1170 sb->s_qcop = &gfs2_quotactl_ops; 1111 sb->s_qcop = &gfs2_quotactl_ops;
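
Note: sb->s_d_op is new VFS infrastructure: set once at mount time, it is applied to every dentry the VFS allocates for this superblock. That is why the per-lookup assignments in gfs2_lookup() and gfs2_lookup_root() are deleted elsewhere in this diff.

/* One line at mount time... */
sb->s_d_op = &gfs2_dops;
/* ...replaces every per-dentry assignment of the form
 *      dentry->d_op = &gfs2_dops;
 * which is now dead code. */
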
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1217 if (error) 1158 if (error)
1218 goto fail_sb; 1159 goto fail_sb;
1219 1160
1161 /*
1162 * If user space has failed to join the cluster or some similar
1163 * failure has occurred, then the journal id will contain a
1164 * negative (error) number. This will then be returned to the
1165 * caller (of the mount syscall). We do this even for spectator
1166 * mounts (which just write a jid of 0 to indicate "ok" even though
1167 * the jid is unused in the spectator case)
1168 */
1169 if (sdp->sd_lockstruct.ls_jid < 0) {
1170 error = sdp->sd_lockstruct.ls_jid;
1171 sdp->sd_lockstruct.ls_jid = 0;
1172 goto fail_sb;
1173 }
1174
1220 error = init_inodes(sdp, DO); 1175 error = init_inodes(sdp, DO);
1221 if (error) 1176 if (error)
1222 goto fail_sb; 1177 goto fail_sb;
@@ -1264,7 +1219,6 @@ fail_sb:
1264fail_locking: 1219fail_locking:
1265 init_locking(sdp, &mount_gh, UNDO); 1220 init_locking(sdp, &mount_gh, UNDO);
1266fail_lm: 1221fail_lm:
1267 invalidate_inodes(sb);
1268 gfs2_gl_hash_clear(sdp); 1222 gfs2_gl_hash_clear(sdp);
1269 gfs2_lm_unmount(sdp); 1223 gfs2_lm_unmount(sdp);
1270fail_sys: 1224fail_sys:
@@ -1296,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
1296} 1250}
1297 1251
1298/** 1252/**
1299 * gfs2_get_sb - Get the GFS2 superblock 1253 * gfs2_mount - Get the GFS2 superblock
1300 * @fs_type: The GFS2 filesystem type 1254 * @fs_type: The GFS2 filesystem type
1301 * @flags: Mount flags 1255 * @flags: Mount flags
1302 * @dev_name: The name of the device 1256 * @dev_name: The name of the device
1303 * @data: The mount arguments 1257 * @data: The mount arguments
1304 * @mnt: The vfsmnt for this mount
1305 * 1258 *
1306 * Q. Why not use get_sb_bdev() ? 1259 * Q. Why not use get_sb_bdev() ?
1307 * A. We need to select one of two root directories to mount, independent 1260 * A. We need to select one of two root directories to mount, independent
@@ -1310,12 +1263,12 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
1310 * Returns: 0 or -ve on error 1263 * Returns: 0 or -ve on error
1311 */ 1264 */
1312 1265
1313static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1266static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1314 const char *dev_name, void *data, struct vfsmount *mnt) 1267 const char *dev_name, void *data)
1315{ 1268{
1316 struct block_device *bdev; 1269 struct block_device *bdev;
1317 struct super_block *s; 1270 struct super_block *s;
1318 fmode_t mode = FMODE_READ; 1271 fmode_t mode = FMODE_READ | FMODE_EXCL;
1319 int error; 1272 int error;
1320 struct gfs2_args args; 1273 struct gfs2_args args;
1321 struct gfs2_sbd *sdp; 1274 struct gfs2_sbd *sdp;
@@ -1323,9 +1276,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1323 if (!(flags & MS_RDONLY)) 1276 if (!(flags & MS_RDONLY))
1324 mode |= FMODE_WRITE; 1277 mode |= FMODE_WRITE;
1325 1278
1326 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1279 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1327 if (IS_ERR(bdev)) 1280 if (IS_ERR(bdev))
1328 return PTR_ERR(bdev); 1281 return ERR_CAST(bdev);
1329 1282
1330 /* 1283 /*
1331 * once the super is inserted into the list by sget, s_umount 1284 * once the super is inserted into the list by sget, s_umount
@@ -1344,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1344 if (IS_ERR(s)) 1297 if (IS_ERR(s))
1345 goto error_bdev; 1298 goto error_bdev;
1346 1299
1300 if (s->s_root)
1301 blkdev_put(bdev, mode);
1302
1347 memset(&args, 0, sizeof(args)); 1303 memset(&args, 0, sizeof(args));
1348 args.ar_quota = GFS2_QUOTA_DEFAULT; 1304 args.ar_quota = GFS2_QUOTA_DEFAULT;
1349 args.ar_data = GFS2_DATA_DEFAULT; 1305 args.ar_data = GFS2_DATA_DEFAULT;
@@ -1355,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1355 error = gfs2_mount_args(&args, data); 1311 error = gfs2_mount_args(&args, data);
1356 if (error) { 1312 if (error) {
1357 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1313 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1358 if (s->s_root)
1314 goto error_super;
1359 goto error_super;
1360 deactivate_locked_super(s);
1361 return error;
1362 } 1315 }
1363 1316
1364 if (s->s_root) { 1317 if (s->s_root) {
1365 error = -EBUSY; 1318 error = -EBUSY;
1366 if ((flags ^ s->s_flags) & MS_RDONLY) 1319 if ((flags ^ s->s_flags) & MS_RDONLY)
1367 goto error_super; 1320 goto error_super;
1368 close_bdev_exclusive(bdev, mode);
1369 } else { 1321 } else {
1370 char b[BDEVNAME_SIZE]; 1322 char b[BDEVNAME_SIZE];
1371 1323
@@ -1374,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1374 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 1326 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
1375 sb_set_blocksize(s, block_size(bdev)); 1327 sb_set_blocksize(s, block_size(bdev));
1376 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); 1328 error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
1377 if (error) {
1378 deactivate_locked_super(s);
1379 return error;
1380 }
1329 if (error)
1330 goto error_super;
1381 s->s_flags |= MS_ACTIVE; 1331 s->s_flags |= MS_ACTIVE;
1382 bdev->bd_super = s; 1332 bdev->bd_super = s;
1383 } 1333 }
1384 1334
1385 sdp = s->s_fs_info; 1335 sdp = s->s_fs_info;
1386 mnt->mnt_sb = s;
1387 if (args.ar_meta) 1336 if (args.ar_meta)
1388 mnt->mnt_root = dget(sdp->sd_master_dir); 1337 return dget(sdp->sd_master_dir);
1389 else 1338 else
1390 mnt->mnt_root = dget(sdp->sd_root_dir); 1339 return dget(sdp->sd_root_dir);
1391 return 0;
1392 1340
1393error_super: 1341error_super:
1394 deactivate_locked_super(s); 1342 deactivate_locked_super(s);
1343 return ERR_PTR(error);
1395error_bdev: 1344error_bdev:
1396 close_bdev_exclusive(bdev, mode); 1345 blkdev_put(bdev, mode);
1397 return error; 1346 return ERR_PTR(error);
1398} 1347}
1399 1348
1400static int set_meta_super(struct super_block *s, void *ptr) 1349static int set_meta_super(struct super_block *s, void *ptr)
@@ -1402,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr)
1402 return -EINVAL; 1351 return -EINVAL;
1403} 1352}
1404 1353
1405static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, 1354static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
1406 const char *dev_name, void *data, struct vfsmount *mnt) 1355 int flags, const char *dev_name, void *data)
1407{ 1356{
1408 struct super_block *s; 1357 struct super_block *s;
1409 struct gfs2_sbd *sdp; 1358 struct gfs2_sbd *sdp;
@@ -1414,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1414 if (error) { 1363 if (error) {
1415 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1364 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
1416 dev_name, error); 1365 dev_name, error);
1417 return error; 1366 return ERR_PTR(error);
1418 } 1367 }
1419 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, 1368 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
1420 path.dentry->d_inode->i_sb->s_bdev); 1369 path.dentry->d_inode->i_sb->s_bdev);
1421 path_put(&path); 1370 path_put(&path);
1422 if (IS_ERR(s)) { 1371 if (IS_ERR(s)) {
1423 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1372 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
1424 return PTR_ERR(s); 1373 return ERR_CAST(s);
1425 } 1374 }
1426 if ((flags ^ s->s_flags) & MS_RDONLY) { 1375 if ((flags ^ s->s_flags) & MS_RDONLY) {
1427 deactivate_locked_super(s); 1376 deactivate_locked_super(s);
1428 return -EBUSY; 1377 return ERR_PTR(-EBUSY);
1429 } 1378 }
1430 sdp = s->s_fs_info; 1379 sdp = s->s_fs_info;
1431 mnt->mnt_sb = s;
1432 mnt->mnt_root = dget(sdp->sd_master_dir);
1433 return 0;
1380 return dget(sdp->sd_master_dir);
1434} 1381}
1435 1382
1436static void gfs2_kill_sb(struct super_block *sb) 1383static void gfs2_kill_sb(struct super_block *sb)
@@ -1456,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb)
1456struct file_system_type gfs2_fs_type = { 1403struct file_system_type gfs2_fs_type = {
1457 .name = "gfs2", 1404 .name = "gfs2",
1458 .fs_flags = FS_REQUIRES_DEV, 1405 .fs_flags = FS_REQUIRES_DEV,
1459 .get_sb = gfs2_get_sb, 1406 .mount = gfs2_mount,
1460 .kill_sb = gfs2_kill_sb, 1407 .kill_sb = gfs2_kill_sb,
1461 .owner = THIS_MODULE, 1408 .owner = THIS_MODULE,
1462}; 1409};
@@ -1464,7 +1411,7 @@ struct file_system_type gfs2_fs_type = {
1464struct file_system_type gfs2meta_fs_type = { 1411struct file_system_type gfs2meta_fs_type = {
1465 .name = "gfs2meta", 1412 .name = "gfs2meta",
1466 .fs_flags = FS_REQUIRES_DEV, 1413 .fs_flags = FS_REQUIRES_DEV,
1467 .get_sb = gfs2_get_sb_meta, 1414 .mount = gfs2_mount_meta,
1468 .owner = THIS_MODULE, 1415 .owner = THIS_MODULE,
1469}; 1416};
1470 1417
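
Note: the .get_sb to .mount conversion changes the contract: instead of filling in a vfsmount, the filesystem returns the root dentry (or an ERR_PTR). GFS2 open-codes the block-device handling because it must choose between two possible roots (sd_root_dir vs sd_master_dir); a filesystem without that need would just wrap mount_bdev(). Sketch, where my_fill_super is a placeholder:

static struct dentry *example_mount(struct file_system_type *fs_type,
                                    int flags, const char *dev_name,
                                    void *data)
{
        /* mount_bdev() handles blkdev_get_by_path/sget/fill_super */
        return mount_bdev(fs_type, flags, dev_name, data, my_fill_super);
}
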
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1009be2c9737..d8b26ac2e20b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -104,8 +104,6 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
104{ 104{
105 struct inode *inode = NULL; 105 struct inode *inode = NULL;
106 106
107 dentry->d_op = &gfs2_dops;
108
109 inode = gfs2_lookupi(dir, &dentry->d_name, 0); 107 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
110 if (inode && IS_ERR(inode)) 108 if (inode && IS_ERR(inode))
111 return ERR_CAST(inode); 109 return ERR_CAST(inode);
@@ -164,7 +162,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
164 if (error) 162 if (error)
165 goto out_child; 163 goto out_child;
166 164
167 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); 165 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
168 if (error) 166 if (error)
169 goto out_gunlock; 167 goto out_gunlock;
170 168
@@ -217,7 +215,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
217 goto out_gunlock_q; 215 goto out_gunlock_q;
218 216
219 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 217 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
220 al->al_rgd->rd_length + 218 gfs2_rg_blocks(al) +
221 2 * RES_DINODE + RES_STATFS + 219 2 * RES_DINODE + RES_STATFS +
222 RES_QUOTA, 0); 220 RES_QUOTA, 0);
223 if (error) 221 if (error)
@@ -253,7 +251,7 @@ out_parent:
253 gfs2_holder_uninit(ghs); 251 gfs2_holder_uninit(ghs);
254 gfs2_holder_uninit(ghs + 1); 252 gfs2_holder_uninit(ghs + 1);
255 if (!error) { 253 if (!error) {
256 atomic_inc(&inode->i_count); 254 ihold(inode);
257 d_instantiate(dentry, inode); 255 d_instantiate(dentry, inode);
258 mark_inode_dirty(inode); 256 mark_inode_dirty(inode);
259 } 257 }
@@ -287,7 +285,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
287 if (IS_APPEND(&dip->i_inode)) 285 if (IS_APPEND(&dip->i_inode))
288 return -EPERM; 286 return -EPERM;
289 287
290 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); 288 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
291 if (error) 289 if (error)
292 return error; 290 return error;
293 291
@@ -406,7 +404,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
406 404
407 ip = ghs[1].gh_gl->gl_object; 405 ip = ghs[1].gh_gl->gl_object;
408 406
409 ip->i_disksize = size;
410 i_size_write(inode, size); 407 i_size_write(inode, size);
411 408
412 error = gfs2_meta_inode_buffer(ip, &dibh); 409 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +458,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
461 ip = ghs[1].gh_gl->gl_object; 458 ip = ghs[1].gh_gl->gl_object;
462 459
463 ip->i_inode.i_nlink = 2; 460 ip->i_inode.i_nlink = 2;
464 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 461 i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
465 ip->i_diskflags |= GFS2_DIF_JDATA; 462 ip->i_diskflags |= GFS2_DIF_JDATA;
466 ip->i_entries = 2; 463 ip->i_entries = 2;
467 464
@@ -470,18 +467,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
470 if (!gfs2_assert_withdraw(sdp, !error)) { 467 if (!gfs2_assert_withdraw(sdp, !error)) {
471 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; 468 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
472 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); 469 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
473 struct qstr str;
474 470
475 gfs2_str2qstr(&str, ".");
476 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 471 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
477 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); 472 gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
478 dent->de_inum = di->di_num; /* already GFS2 endian */ 473 dent->de_inum = di->di_num; /* already GFS2 endian */
479 dent->de_type = cpu_to_be16(DT_DIR); 474 dent->de_type = cpu_to_be16(DT_DIR);
480 di->di_entries = cpu_to_be32(1); 475 di->di_entries = cpu_to_be32(1);
481 476
482 gfs2_str2qstr(&str, "..");
483 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); 477 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
484 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); 478 gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
485 479
486 gfs2_inum_out(dip, dent); 480 gfs2_inum_out(dip, dent);
487 dent->de_type = cpu_to_be16(DT_DIR); 481 dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +516,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
522static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 516static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
523 struct gfs2_inode *ip) 517 struct gfs2_inode *ip)
524{ 518{
525 struct qstr dotname;
526 int error; 519 int error;
527 520
528 if (ip->i_entries != 2) { 521 if (ip->i_entries != 2) {
@@ -539,13 +532,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
539 if (error) 532 if (error)
540 return error; 533 return error;
541 534
542 gfs2_str2qstr(&dotname, ".");
543 error = gfs2_dir_del(ip, &dotname);
535 error = gfs2_dir_del(ip, &gfs2_qdot);
544 if (error) 536 if (error)
545 return error; 537 return error;
546 538
547 gfs2_str2qstr(&dotname, "..");
548 error = gfs2_dir_del(ip, &dotname);
539 error = gfs2_dir_del(ip, &gfs2_qdotdot);
549 if (error) 540 if (error)
550 return error; 541 return error;
551 542
@@ -694,11 +685,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
694 struct inode *dir = &to->i_inode; 685 struct inode *dir = &to->i_inode;
695 struct super_block *sb = dir->i_sb; 686 struct super_block *sb = dir->i_sb;
696 struct inode *tmp; 687 struct inode *tmp;
697 struct qstr dotdot;
698 int error = 0; 688 int error = 0;
699 689
700 gfs2_str2qstr(&dotdot, "..");
701
702 igrab(dir); 690 igrab(dir);
703 691
704 for (;;) { 692 for (;;) {
@@ -711,7 +699,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
711 break; 699 break;
712 } 700 }
713 701
714 tmp = gfs2_lookupi(dir, &dotdot, 1); 702 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
715 if (IS_ERR(tmp)) { 703 if (IS_ERR(tmp)) {
716 error = PTR_ERR(tmp); 704 error = PTR_ERR(tmp);
717 break; 705 break;
@@ -744,7 +732,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
744 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 732 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
745 struct gfs2_inode *nip = NULL; 733 struct gfs2_inode *nip = NULL;
746 struct gfs2_sbd *sdp = GFS2_SB(odir); 734 struct gfs2_sbd *sdp = GFS2_SB(odir);
747 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; 735 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
748 struct gfs2_rgrpd *nrgd; 736 struct gfs2_rgrpd *nrgd;
749 unsigned int num_gh; 737 unsigned int num_gh;
750 int dir_rename = 0; 738 int dir_rename = 0;
@@ -758,6 +746,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 return 0; 746 return 0;
759 } 747 }
760 748
749 error = gfs2_rindex_hold(sdp, &ri_gh);
750 if (error)
751 return error;
761 752
762 if (odip != ndip) { 753 if (odip != ndip) {
763 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 754 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -827,7 +818,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
827 } 818 }
828 } 819 }
829 } else { 820 } else {
830 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); 821 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
831 if (error) 822 if (error)
832 goto out_gunlock; 823 goto out_gunlock;
833 824
@@ -862,7 +853,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
862 /* Check out the dir to be renamed */ 853 /* Check out the dir to be renamed */
863 854
864 if (dir_rename) { 855 if (dir_rename) {
865 error = gfs2_permission(odentry->d_inode, MAY_WRITE); 856 error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
866 if (error) 857 if (error)
867 goto out_gunlock; 858 goto out_gunlock;
868 } 859 }
@@ -887,12 +878,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
887 878
888 al->al_requested = sdp->sd_max_dirres; 879 al->al_requested = sdp->sd_max_dirres;
889 880
890 error = gfs2_inplace_reserve(ndip); 881 error = gfs2_inplace_reserve_ri(ndip);
891 if (error) 882 if (error)
892 goto out_gunlock_q; 883 goto out_gunlock_q;
893 884
894 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 885 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
895 al->al_rgd->rd_length + 886 gfs2_rg_blocks(al) +
896 4 * RES_DINODE + 4 * RES_LEAF + 887 4 * RES_DINODE + 4 * RES_LEAF +
897 RES_STATFS + RES_QUOTA + 4, 0); 888 RES_STATFS + RES_QUOTA + 4, 0);
898 if (error) 889 if (error)
@@ -920,9 +911,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
920 } 911 }
921 912
922 if (dir_rename) { 913 if (dir_rename) {
923 struct qstr name;
924 gfs2_str2qstr(&name, "..");
925
926 error = gfs2_change_nlink(ndip, +1); 914 error = gfs2_change_nlink(ndip, +1);
927 if (error) 915 if (error)
928 goto out_end_trans; 916 goto out_end_trans;
@@ -930,7 +918,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
930 if (error) 918 if (error)
931 goto out_end_trans; 919 goto out_end_trans;
932 920
933 error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); 921 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
934 if (error) 922 if (error)
935 goto out_end_trans; 923 goto out_end_trans;
936 } else { 924 } else {
@@ -972,6 +960,7 @@ out_gunlock_r:
972 if (r_gh.gh_gl) 960 if (r_gh.gh_gl)
973 gfs2_glock_dq_uninit(&r_gh); 961 gfs2_glock_dq_uninit(&r_gh);
974out: 962out:
963 gfs2_glock_dq_uninit(&ri_gh);
975 return error; 964 return error;
976} 965}
977 966
@@ -990,7 +979,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 979 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
991 struct gfs2_holder i_gh; 980 struct gfs2_holder i_gh;
992 struct buffer_head *dibh; 981 struct buffer_head *dibh;
993 unsigned int x; 982 unsigned int x, size;
994 char *buf; 983 char *buf;
995 int error; 984 int error;
996 985
@@ -1002,7 +991,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1002 return NULL; 991 return NULL;
1003 } 992 }
1004 993
1005 if (!ip->i_disksize) {
994 size = (unsigned int)i_size_read(&ip->i_inode);
995 if (size == 0) {
1006 gfs2_consist_inode(ip); 996 gfs2_consist_inode(ip);
1007 buf = ERR_PTR(-EIO); 997 buf = ERR_PTR(-EIO);
1008 goto out; 998 goto out;
@@ -1014,7 +1004,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1014 goto out; 1004 goto out;
1015 } 1005 }
1016 1006
1017 x = ip->i_disksize + 1; 1007 x = size + 1;
1018 buf = kmalloc(x, GFP_NOFS); 1008 buf = kmalloc(x, GFP_NOFS);
1019 if (!buf) 1009 if (!buf)
1020 buf = ERR_PTR(-ENOMEM); 1010 buf = ERR_PTR(-ENOMEM);
@@ -1047,13 +1037,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1047 * Returns: errno 1037 * Returns: errno
1048 */ 1038 */
1049 1039
1050int gfs2_permission(struct inode *inode, int mask) 1040int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1051{ 1041{
1052 struct gfs2_inode *ip = GFS2_I(inode); 1042 struct gfs2_inode *ip;
1053 struct gfs2_holder i_gh; 1043 struct gfs2_holder i_gh;
1054 int error; 1044 int error;
1055 int unlock = 0; 1045 int unlock = 0;
1056 1046
1047 if (flags & IPERM_FLAG_RCU)
1048 return -ECHILD;
1049
1050 ip = GFS2_I(inode);
1057 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { 1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1058 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1059 if (error) 1053 if (error)
@@ -1064,42 +1058,17 @@ int gfs2_permission(struct inode *inode, int mask)
1064 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) 1058 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
1065 error = -EACCES; 1059 error = -EACCES;
1066 else 1060 else
1067 error = generic_permission(inode, mask, gfs2_check_acl); 1061 error = generic_permission(inode, mask, flags, gfs2_check_acl);
1068 if (unlock) 1062 if (unlock)
1069 gfs2_glock_dq_uninit(&i_gh); 1063 gfs2_glock_dq_uninit(&i_gh);
1070 1064
1071 return error; 1065 return error;
1072} 1066}
1073 1067
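
Note: with RCU-walk path lookup, ->permission() can be called with IPERM_FLAG_RCU set, in a context where sleeping is forbidden. gfs2_permission() may need to take a glock, so it follows the standard pattern: return -ECHILD and let the VFS retry the lookup in ref-walk mode, where sleeping is allowed.

/* Standard bail-out for a ->permission() that may sleep: */
if (flags & IPERM_FLAG_RCU)
        return -ECHILD;         /* VFS falls back to ref-walk */
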
1074/*
1075 * XXX(truncate): the truncate_setsize calls should be moved to the end.
1076 */
1077static int setattr_size(struct inode *inode, struct iattr *attr)
1078{
1079 struct gfs2_inode *ip = GFS2_I(inode);
1080 struct gfs2_sbd *sdp = GFS2_SB(inode);
1081 int error;
1082
1083 if (attr->ia_size != ip->i_disksize) {
1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1085 if (error)
1086 return error;
1087 truncate_setsize(inode, attr->ia_size);
1088 gfs2_trans_end(sdp);
1089 }
1090
1091 error = gfs2_truncatei(ip, attr->ia_size);
1092 if (error && (inode->i_size != ip->i_disksize))
1093 i_size_write(inode, ip->i_disksize);
1094
1095 return error;
1096}
1097
1098static int setattr_chown(struct inode *inode, struct iattr *attr) 1068static int setattr_chown(struct inode *inode, struct iattr *attr)
1099{ 1069{
1100 struct gfs2_inode *ip = GFS2_I(inode); 1070 struct gfs2_inode *ip = GFS2_I(inode);
1101 struct gfs2_sbd *sdp = GFS2_SB(inode); 1071 struct gfs2_sbd *sdp = GFS2_SB(inode);
1102 struct buffer_head *dibh;
1103 u32 ouid, ogid, nuid, ngid; 1072 u32 ouid, ogid, nuid, ngid;
1104 int error; 1073 int error;
1105 1074
@@ -1130,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1130 if (error) 1099 if (error)
1131 goto out_gunlock_q; 1100 goto out_gunlock_q;
1132 1101
1133 error = gfs2_meta_inode_buffer(ip, &dibh); 1102 error = gfs2_setattr_simple(ip, attr);
1134 if (error) 1103 if (error)
1135 goto out_end_trans; 1104 goto out_end_trans;
1136 1105
1137 if ((attr->ia_valid & ATTR_SIZE) &&
1138 attr->ia_size != i_size_read(inode)) {
1139 int error;
1140
1141 error = vmtruncate(inode, attr->ia_size);
1142 gfs2_assert_warn(sdp, !error);
1143 }
1144
1145 setattr_copy(inode, attr);
1146 mark_inode_dirty(inode);
1147
1148 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1149 gfs2_dinode_out(ip, dibh->b_data);
1150 brelse(dibh);
1151
1152 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { 1106 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
1153 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); 1107 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
1154 gfs2_quota_change(ip, -blocks, ouid, ogid); 1108 gfs2_quota_change(ip, -blocks, ouid, ogid);
@@ -1195,7 +1149,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1195 goto out; 1149 goto out;
1196 1150
1197 if (attr->ia_valid & ATTR_SIZE) 1151 if (attr->ia_valid & ATTR_SIZE)
1198 error = setattr_size(inode, attr); 1152 error = gfs2_setattr_size(inode, attr->ia_size);
1199 else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) 1153 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1200 error = setattr_chown(inode, attr); 1154 error = setattr_chown(inode, attr);
1201 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) 1155 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 1bc6b5695e6d..a689901963de 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
631 struct fs_disk_quota *fdq) 631 struct fs_disk_quota *fdq)
632{ 632{
633 struct inode *inode = &ip->i_inode; 633 struct inode *inode = &ip->i_inode;
634 struct gfs2_sbd *sdp = GFS2_SB(inode);
634 struct address_space *mapping = inode->i_mapping; 635 struct address_space *mapping = inode->i_mapping;
635 unsigned long index = loc >> PAGE_CACHE_SHIFT; 636 unsigned long index = loc >> PAGE_CACHE_SHIFT;
636 unsigned offset = loc & (PAGE_CACHE_SIZE - 1); 637 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
@@ -658,13 +659,17 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
658 qd->qd_qb.qb_value = qp->qu_value; 659 qd->qd_qb.qb_value = qp->qu_value;
659 if (fdq) { 660 if (fdq) {
660 if (fdq->d_fieldmask & FS_DQ_BSOFT) { 661 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
661 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); 662 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
662 qd->qd_qb.qb_warn = qp->qu_warn; 663 qd->qd_qb.qb_warn = qp->qu_warn;
663 } 664 }
664 if (fdq->d_fieldmask & FS_DQ_BHARD) { 665 if (fdq->d_fieldmask & FS_DQ_BHARD) {
665 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); 666 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
666 qd->qd_qb.qb_limit = qp->qu_limit; 667 qd->qd_qb.qb_limit = qp->qu_limit;
667 } 668 }
669 if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
670 qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
671 qd->qd_qb.qb_value = qp->qu_value;
672 }
668 } 673 }
669 674
670 /* Write the quota into the quota file on disk */ 675 /* Write the quota into the quota file on disk */
@@ -735,10 +740,8 @@ get_a_page:
735 goto out; 740 goto out;
736 741
737 size = loc + sizeof(struct gfs2_quota); 742 size = loc + sizeof(struct gfs2_quota);
738 if (size > inode->i_size) { 743 if (size > inode->i_size)
739 ip->i_disksize = size;
740 i_size_write(inode, size); 744 i_size_write(inode, size);
741 }
742 inode->i_mtime = inode->i_atime = CURRENT_TIME; 745 inode->i_mtime = inode->i_atime = CURRENT_TIME;
743 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 746 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
744 gfs2_dinode_out(ip, dibh->b_data); 747 gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +820,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
817 goto out_alloc; 820 goto out_alloc;
818 821
819 if (nalloc) 822 if (nalloc)
820 blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; 823 blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
821 824
822 error = gfs2_trans_begin(sdp, blocks, 0); 825 error = gfs2_trans_begin(sdp, blocks, 0);
823 if (error) 826 if (error)
@@ -1190,18 +1193,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1190int gfs2_quota_init(struct gfs2_sbd *sdp) 1193int gfs2_quota_init(struct gfs2_sbd *sdp)
1191{ 1194{
1192 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1195 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1193 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
1196 u64 size = i_size_read(sdp->sd_qc_inode);
1197 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1194 unsigned int x, slot = 0; 1198 unsigned int x, slot = 0;
1195 unsigned int found = 0; 1199 unsigned int found = 0;
1196 u64 dblock; 1200 u64 dblock;
1197 u32 extlen = 0; 1201 u32 extlen = 0;
1198 int error; 1202 int error;
1199 1203
1200 if (!ip->i_disksize || ip->i_disksize > (64 << 20) || 1204 if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
1201 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1202 gfs2_consist_inode(ip);
1203 return -EIO; 1205 return -EIO;
1204 } 1206
1205 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1207 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1206 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1208 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1207 1209
@@ -1500,9 +1502,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1500 fdq->d_version = FS_DQUOT_VERSION; 1502 fdq->d_version = FS_DQUOT_VERSION;
1501 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1503 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1502 fdq->d_id = id; 1504 fdq->d_id = id;
1503 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); 1505 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
1504 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); 1506 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
1505 fdq->d_bcount = be64_to_cpu(qlvb->qb_value); 1507 fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
1506 1508
1507 gfs2_glock_dq_uninit(&q_gh); 1509 gfs2_glock_dq_uninit(&q_gh);
1508out: 1510out:
@@ -1511,7 +1513,7 @@ out:
1511} 1513}
1512 1514
1513/* GFS2 only supports a subset of the XFS fields */ 1515/* GFS2 only supports a subset of the XFS fields */
1514#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1516#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
1515 1517
1516static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, 1518static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1517 struct fs_disk_quota *fdq) 1519 struct fs_disk_quota *fdq)
@@ -1569,11 +1571,17 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1569 1571
1570 /* If nothing has changed, this is a no-op */ 1572 /* If nothing has changed, this is a no-op */
1571 if ((fdq->d_fieldmask & FS_DQ_BSOFT) && 1573 if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
1572 (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) 1574 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
1573 fdq->d_fieldmask ^= FS_DQ_BSOFT; 1575 fdq->d_fieldmask ^= FS_DQ_BSOFT;
1576
1574 if ((fdq->d_fieldmask & FS_DQ_BHARD) && 1577 if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
1575 (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) 1578 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
1576 fdq->d_fieldmask ^= FS_DQ_BHARD; 1579 fdq->d_fieldmask ^= FS_DQ_BHARD;
1580
1581 if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
1582 ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
1583 fdq->d_fieldmask ^= FS_DQ_BCOUNT;
1584
1577 if (fdq->d_fieldmask == 0) 1585 if (fdq->d_fieldmask == 0)
1578 goto out_i; 1586 goto out_i;
1579 1587
@@ -1589,6 +1597,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1589 error = gfs2_inplace_reserve(ip); 1597 error = gfs2_inplace_reserve(ip);
1590 if (error) 1598 if (error)
1591 goto out_alloc; 1599 goto out_alloc;
1600 blocks += gfs2_rg_blocks(al);
1592 } 1601 }
1593 1602
1594 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1603 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
@@ -1621,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1621 .get_dqblk = gfs2_get_dqblk, 1630 .get_dqblk = gfs2_get_dqblk,
1622 .set_dqblk = gfs2_set_dqblk, 1631 .set_dqblk = gfs2_set_dqblk,
1623}; 1632};
1624
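
Note: the shifts added throughout quota.c fix a units mismatch: the XFS-style fs_disk_quota interface speaks 512-byte basic blocks, while GFS2 stores limits in filesystem blocks; sd_fsb2bb_shift is (assuming its usual definition) the difference of the two block-size shifts. Worked example for a 4096-byte filesystem block:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        unsigned int fsb2bb = 12 - 9;   /* 4096 = 512 << 3 */
        uint64_t limit_fsb = 1000;      /* stored on disk: fs blocks */
        uint64_t limit_bb  = limit_fsb << fsb2bb;

        assert(limit_bb == 8000);                   /* reported to userspace */
        assert((limit_bb >> fsb2bb) == limit_fsb);  /* and back on set */
        return 0;
}
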
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f7f89a94a5a4..f2a02edcac8f 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
455 int ro = 0; 455 int ro = 0;
456 unsigned int pass; 456 unsigned int pass;
457 int error; 457 int error;
458 int jlocked = 0;
458 459
459 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 460 if (sdp->sd_args.ar_spectator ||
461 (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
460 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", 462 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
461 jd->jd_jid); 463 jd->jd_jid);
462
464 jlocked = 1;
463 /* Acquire the journal lock so we can do recovery */ 465 /* Acquire the journal lock so we can do recovery */
464 466
465 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, 467 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
554 jd->jd_jid, t); 556 jd->jd_jid, t);
555 } 557 }
556 558
557 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
558 gfs2_glock_dq_uninit(&ji_gh);
559
560 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); 559 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
561 560
562 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 561 if (jlocked) {
562 gfs2_glock_dq_uninit(&ji_gh);
563 gfs2_glock_dq_uninit(&j_gh); 563 gfs2_glock_dq_uninit(&j_gh);
564 }
564 565
565 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 566 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
566 goto done; 567 goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
568fail_gunlock_tr: 569fail_gunlock_tr:
569 gfs2_glock_dq_uninit(&t_gh); 570 gfs2_glock_dq_uninit(&t_gh);
570fail_gunlock_ji: 571fail_gunlock_ji:
571 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 572 if (jlocked) {
572 gfs2_glock_dq_uninit(&ji_gh); 573 gfs2_glock_dq_uninit(&ji_gh);
573fail_gunlock_j: 574fail_gunlock_j:
574 gfs2_glock_dq_uninit(&j_gh); 575 gfs2_glock_dq_uninit(&j_gh);
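
Note: recovery now records in jlocked whether it actually took the journal glocks, instead of re-deriving that from the jid comparison at each unlock site; the new spectator case would make the re-derived test wrong. The shape of the idiom:

int jlocked = 0;

if (sdp->sd_args.ar_spectator ||
    jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
        /* ... acquire j_gh and ji_gh ... */
        jlocked = 1;
}
/* ... later, on both the success and error paths ... */
if (jlocked) {
        gfs2_glock_dq_uninit(&ji_gh);
        gfs2_glock_dq_uninit(&j_gh);
}
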
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..7293ea27020c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) 503 if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -583,12 +583,14 @@ static int read_rindex_entry(struct gfs2_inode *ip,
583 * Returns: 0 on successful update, error code otherwise 583 * Returns: 0 on successful update, error code otherwise
584 */ 584 */
585 585
586static int gfs2_ri_update(struct gfs2_inode *ip) 586int gfs2_ri_update(struct gfs2_inode *ip)
587{ 587{
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
590 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
591 u64 rgrp_count = ip->i_disksize; 591 u64 rgrp_count = i_size_read(inode);
592 struct gfs2_rgrpd *rgd;
593 unsigned int max_data = 0;
592 int error; 594 int error;
593 595
594 do_div(rgrp_count, sizeof(struct gfs2_rindex)); 596 do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,40 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
603 } 605 }
604 } 606 }
605 607
606 sdp->sd_rindex_uptodate = 1;
607 return 0;
608}
609
608 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
609 if (rgd->rd_data > max_data)
610 max_data = rgd->rd_data;
611 sdp->sd_max_rg_data = max_data;
610/**
611 * gfs2_ri_update_special - Pull in a new resource index from the disk
612 *
613 * This is a special version that's safe to call from gfs2_inplace_reserve_i.
614 * In this case we know that we don't have any resource groups in memory yet.
615 *
616 * @ip: pointer to the rindex inode
617 *
618 * Returns: 0 on successful update, error code otherwise
619 */
620static int gfs2_ri_update_special(struct gfs2_inode *ip)
621{
622 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
623 struct inode *inode = &ip->i_inode;
624 struct file_ra_state ra_state;
625 int error;
626
627 file_ra_state_init(&ra_state, inode->i_mapping);
628 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
629 /* Ignore partials */
630 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
631 ip->i_disksize)
632 break;
633 error = read_rindex_entry(ip, &ra_state);
634 if (error) {
635 clear_rgrpdi(sdp);
636 return error;
637 }
638 }
639
640 sdp->sd_rindex_uptodate = 1; 612 sdp->sd_rindex_uptodate = 1;
641 return 0; 613 return 0;
642} 614}
@@ -854,8 +826,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 826 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 827 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 828 nr_sects, GFP_NOFS,
857 BLKDEV_IFL_WAIT | 829 0);
858 BLKDEV_IFL_BARRIER);
859 if (rv) 830 if (rv)
860 goto fail; 831 goto fail;
861 nr_sects = 0; 832 nr_sects = 0;
@@ -869,8 +840,7 @@ start_new_extent:
869 } 840 }
870 } 841 }
871 if (nr_sects) { 842 if (nr_sects) {
872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 843 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
874 if (rv) 844 if (rv)
875 goto fail; 845 goto fail;
876 } 846 }
@@ -953,17 +923,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
953 * The inode, if one has been found, in inode. 923 * The inode, if one has been found, in inode.
954 */ 924 */
955 925
956static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 926static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
957 u64 skip)
958{ 927{
959 u32 goal = 0, block; 928 u32 goal = 0, block;
960 u64 no_addr; 929 u64 no_addr;
961 struct gfs2_sbd *sdp = rgd->rd_sbd; 930 struct gfs2_sbd *sdp = rgd->rd_sbd;
962 unsigned int n; 931 unsigned int n;
932 struct gfs2_glock *gl;
933 struct gfs2_inode *ip;
934 int error;
935 int found = 0;
963 936
964 for(;;) { 937 while (goal < rgd->rd_data) {
965 if (goal >= rgd->rd_data)
966 break;
967 down_write(&sdp->sd_log_flush_lock); 938 down_write(&sdp->sd_log_flush_lock);
968 n = 1; 939 n = 1;
969 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, 940 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -980,11 +951,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
980 if (no_addr == skip) 951 if (no_addr == skip)
981 continue; 952 continue;
982 *last_unlinked = no_addr; 953 *last_unlinked = no_addr;
983 return no_addr;
954
955 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
956 if (error)
957 continue;
958
959 /* If the inode is already in cache, we can ignore it here
960 * because the existing inode disposal code will deal with
961 * it when all refs have gone away. Accessing gl_object like
962 * this is not safe in general. Here it is ok because we do
963 * not dereference the pointer, and we only need an approx
964 * answer to whether it is NULL or not.
965 */
966 ip = gl->gl_object;
967
968 if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
969 gfs2_glock_put(gl);
970 else
971 found++;
972
973 /* Limit reclaim to sensible number of tasks */
974 if (found > 2*NR_CPUS)
975 return;
984 } 976 }
985 977
986 rgd->rd_flags &= ~GFS2_RDF_CHECK; 978 rgd->rd_flags &= ~GFS2_RDF_CHECK;
987 return 0; 979 return;
988} 980}
989 981
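
Note: try_rgrp_unlink() no longer hands a block back for the caller to iput() while allocation state is held; it queues the glock's gl_delete work instead, so disposal runs from gfs2_delete_workqueue without the rgrp locks. The reference handoff from the hunk above:

/* Take a glock ref; the queued work owns it on success. */
ip = gl->gl_object;
if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
        gfs2_glock_put(gl);     /* in cache or already queued: drop ref */
else
        found++;                /* queued; reclaim capped at 2*NR_CPUS per scan */
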
990/** 982/**
@@ -1065,11 +1057,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
  * Try to acquire rgrp in way which avoids contending with others.
  *
  * Returns: errno
- * unlinked: the block address of an unlinked block to be reclaimed
  */
 
-static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
-			  u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1079,7 +1069,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
 	int loops = 0;
 	int error, rg_locked;
 
-	*unlinked = 0;
 	rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
 
 	while (rgd) {
@@ -1096,17 +1085,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			/* If the rg came in already locked, there's no
-			   way we can recover from a failed try_rgrp_unlink
-			   because that would require an iput which can only
-			   happen after the rgrp is unlocked. */
-			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
-				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
-							    ip->i_no_addr);
+			if (rgd->rd_flags & GFS2_RDF_CHECK)
+				try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (*unlinked)
-				return -EAGAIN;
 			/* fall through */
 		case GLR_TRYFAILED:
 			rgd = recent_rgrp_next(rgd);
@@ -1135,13 +1117,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
-				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
-							    ip->i_no_addr);
+			if (rgd->rd_flags & GFS2_RDF_CHECK)
+				try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (*unlinked)
-				return -EAGAIN;
 			break;
 
 		case GLR_TRYFAILED:
@@ -1188,47 +1167,52 @@ out:
  * Returns: errno
  */
 
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+			   char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
 	int error = 0;
-	u64 last_unlinked = NO_BLOCK, unlinked;
+	u64 last_unlinked = NO_BLOCK;
+	int tries = 0;
 
 	if (gfs2_assert_warn(sdp, al->al_requested))
 		return -EINVAL;
 
-try_again:
-	/* We need to hold the rindex unless the inode we're using is
-	   the rindex itself, in which case it's already held. */
-	if (ip != GFS2_I(sdp->sd_rindex))
-		error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-	else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
-		error = gfs2_ri_update_special(ip);
-
-	if (error)
-		return error;
-
-	/* Find an rgrp suitable for allocation. If it encounters any unlinked
-	   dinodes along the way, error will equal -EAGAIN and unlinked will
-	   contains it block address. We then need to look up that inode and
-	   try to free it, and try the allocation again. */
-	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
-	if (error) {
+	if (hold_rindex) {
+		/* We need to hold the rindex unless the inode we're using is
+		   the rindex itself, in which case it's already held. */
 		if (ip != GFS2_I(sdp->sd_rindex))
-			gfs2_glock_dq_uninit(&al->al_ri_gh);
-		if (error != -EAGAIN)
+			error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+		else if (!sdp->sd_rgrps) /* We may not have the rindex read
+					    in, so: */
+			error = gfs2_ri_update(ip);
+		if (error)
 			return error;
+	}
 
-		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
-		/* regardless of whether or not gfs2_process_unlinked_inode
-		   was successful, we don't want to repeat it again. */
-		last_unlinked = unlinked;
-		gfs2_log_flush(sdp, NULL);
-		error = 0;
+try_again:
+	do {
+		error = get_local_rgrp(ip, &last_unlinked);
+		/* If there is no space, flushing the log may release some */
+		if (error) {
+			if (ip == GFS2_I(sdp->sd_rindex) &&
+			    !sdp->sd_rindex_uptodate) {
+				error = gfs2_ri_update(ip);
+				if (error)
+					return error;
+				goto try_again;
+			}
+			gfs2_log_flush(sdp, NULL);
+		}
+	} while (error && tries++ < 3);
 
-		goto try_again;
+	if (error) {
+		if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
+			gfs2_glock_dq_uninit(&al->al_ri_gh);
+		return error;
 	}
+
 	/* no error, so we have the rgrp set in the inode's allocation. */
 	al->al_file = file;
 	al->al_line = line;
@@ -1257,7 +1241,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 	al->al_rgd = NULL;
 	if (al->al_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_rgd_gh);
-	if (ip != GFS2_I(sdp->sd_rindex))
+	if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_ri_gh);
 }
 
@@ -1496,11 +1480,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_rgrpd *rgd = al->al_rgd;
+	struct gfs2_rgrpd *rgd;
 	u32 goal, blk;
 	u64 block;
 	int error;
 
+	/* Only happens if there is a bug in gfs2, return something distinctive
+	 * to ensure that it is noticed.
+	 */
+	if (al == NULL)
+		return -ECANCELED;
+
+	rgd = al->al_rgd;
+
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
 	else
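For reference, the reworked reservation path above is a bounded retry loop: look for a resource group with space and, if none is found, flush the log (which may release blocks) and try again up to three times. A minimal standalone sketch of that control flow; find_space() and flush_log() are illustrative stand-ins for get_local_rgrp() and gfs2_log_flush(), not real GFS2 symbols:

#include <stdio.h>

/* Illustrative stand-ins for get_local_rgrp() and gfs2_log_flush(). */
static int find_space(int attempt)
{
	/* Pretend space only appears after one log flush. */
	return attempt < 1 ? -1 : 0;
}

static void flush_log(void)
{
	printf("flushing log to release space\n");
}

int main(void)
{
	int tries = 0;
	int error;

	do {
		error = find_space(tries);
		/* If there is no space, flushing the log may release some */
		if (error)
			flush_log();
	} while (error && tries++ < 3);

	printf(error ? "allocation failed\n" : "allocation ok\n");
	return error ? 1 : 0;
}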
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d89557..50c2bb04369c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,13 +39,16 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 	ip->i_alloc = NULL;
 }
 
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
-				  unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+				  char *file, unsigned int line);
 #define gfs2_inplace_reserve(ip) \
-gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+	gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
+#define gfs2_inplace_reserve_ri(ip) \
+	gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
+extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
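The two macros wrap the same function so every caller's file and line are recorded in the allocation for debugging; gfs2_inplace_reserve_ri() merely passes hold_rindex = 0 for the rindex inode itself. A hedged userspace sketch of the wrapper technique (reserve_i() is a placeholder, not the real function):

#include <stdio.h>

/* Hypothetical analogue of gfs2_inplace_reserve_i(): the macros below
 * forward the call site's file and line so failures can be attributed. */
static int reserve_i(int hold_rindex, const char *file, unsigned int line)
{
	printf("reserve from %s:%u (hold_rindex=%d)\n", file, line, hold_rindex);
	return 0;
}

#define reserve()    reserve_i(1, __FILE__, __LINE__)
#define reserve_ri() reserve_i(0, __FILE__, __LINE__)

int main(void)
{
	reserve();	/* normal callers take the rindex glock */
	reserve_ri();	/* the rindex inode already holds it */
	return 0;
}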
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 77cb9f830ee4..ec73ed70bae1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
 	{Opt_locktable, "locktable=%s"},
 	{Opt_hostdata, "hostdata=%s"},
 	{Opt_spectator, "spectator"},
+	{Opt_spectator, "norecovery"},
 	{Opt_ignore_local_fs, "ignore_local_fs"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_spectator = 1;
 			break;
 		case Opt_ignore_local_fs:
-			args->ar_ignore_local_fs = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_localflocks:
 			args->ar_localflocks = 1;
 			break;
 		case Opt_localcaching:
-			args->ar_localcaching = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_debug:
 			if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_debug = 0;
 			break;
 		case Opt_upgrade:
-			args->ar_upgrade = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	u64 size = i_size_read(jd->jd_inode);
 
-	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
-	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
 		return -EIO;
-	}
-	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
+	jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+	if (gfs2_write_alloc_required(ip, 0, size)) {
 		gfs2_consist_inode(ip);
 		return -EIO;
 	}
@@ -857,7 +857,6 @@ restart:
 	gfs2_clear_rgrpd(sdp);
 	gfs2_jindex_free(sdp);
 	/* Take apart glock structures and buffer lists */
-	invalidate_inodes(sdp->sd_vfs);
 	gfs2_gl_hash_clear(sdp);
 	/* Unmount the locking protocol */
 	gfs2_lm_unmount(sdp);
@@ -1129,9 +1128,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	/* Some flags must not be changed */
 	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
 	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, localcaching) ||
 	    args_neq(&args, &sdp->sd_args, meta))
 		return -EINVAL;
 
@@ -1234,16 +1231,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
 	if (args->ar_spectator)
 		seq_printf(s, ",spectator");
-	if (args->ar_ignore_local_fs)
-		seq_printf(s, ",ignore_local_fs");
 	if (args->ar_localflocks)
 		seq_printf(s, ",localflocks");
-	if (args->ar_localcaching)
-		seq_printf(s, ",localcaching");
 	if (args->ar_debug)
 		seq_printf(s, ",debug");
-	if (args->ar_upgrade)
-		seq_printf(s, ",upgrade");
 	if (args->ar_posix_acl)
 		seq_printf(s, ",acl");
 	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -1345,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	if (error)
 		goto out_truncate;
 
+	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 	gfs2_glock_dq_wait(&ip->i_iopen_gh);
 	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
 	error = gfs2_glock_nq(&ip->i_iopen_gh);
@@ -1414,11 +1406,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
 	return &ip->i_inode;
 }
 
-static void gfs2_destroy_inode(struct inode *inode)
+static void gfs2_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(gfs2_inode_cachep, inode);
 }
 
+static void gfs2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, gfs2_i_callback);
+}
+
 const struct super_operations gfs2_super_ops = {
 	.alloc_inode		= gfs2_alloc_inode,
 	.destroy_inode		= gfs2_destroy_inode,
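The gfs2_destroy_inode() change above is the generic RCU-delayed inode freeing pattern: instead of returning the object to the slab immediately, the free is deferred with call_rcu() so lockless dentry and inode lookups can never see a recycled inode. Sketched generically below with placeholder foo_* names (foo_inode_cachep is assumed to be the filesystem's inode kmem_cache):

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	/* Runs after a grace period: no RCU reader can still hold the inode. */
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(foo_inode_cachep, inode);
}

static void foo_destroy_inode(struct inode *inode)
{
	/* Defer the actual free until outstanding RCU readers are done. */
	call_rcu(&inode->i_rcu, foo_i_callback);
}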
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ccacffd2faaa..748ccb557c18 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 
 	if (gltype > LM_TYPE_JOURNAL)
 		return -EINVAL;
-	glops = gfs2_glops_list[gltype];
+	if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
+		glops = &gfs2_trans_glops;
+	else
+		glops = gfs2_glops_list[gltype];
 	if (glops == NULL)
 		return -EINVAL;
 	if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 
 static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 {
-	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+	return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
 }
 
 static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
-	unsigned jid;
+	int jid;
 	int rv;
 
-	rv = sscanf(buf, "%u", &jid);
+	rv = sscanf(buf, "%d", &jid);
 	if (rv != 1)
 		return -EINVAL;
 
 	spin_lock(&sdp->sd_jindex_spin);
 	rv = -EINVAL;
-	if (sdp->sd_args.ar_spectator)
-		goto out;
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		goto out;
 	rv = -EBUSY;
-	if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
 		goto out;
+	rv = 0;
+	if (sdp->sd_args.ar_spectator && jid > 0)
+		rv = jid = -EINVAL;
 	sdp->sd_lockstruct.ls_jid = jid;
+	clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
-	rv = 0;
 out:
 	spin_unlock(&sdp->sd_jindex_spin);
 	return rv ? rv : len;
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
 	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
-		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
+		add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid))
 		add_uevent_var(env, "UUID=%pUB", uuid);
 	return 0;
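jid_store() above now reads the journal id as a signed integer and, on a spectator mount, accepts only a non-positive id, since a spectator owns no journal. A small standalone sketch of the same parse-and-validate shape (store_jid() is illustrative, not the kernel function):

#include <stdio.h>

#define EINVAL 22

/* Hypothetical analogue of jid_store(): parse a signed jid and
 * reject a real journal id when the mount is a spectator. */
static int store_jid(const char *buf, int spectator)
{
	int jid;

	if (sscanf(buf, "%d", &jid) != 1)
		return -EINVAL;
	if (spectator && jid > 0)
		return -EINVAL;
	printf("journal id set to %d\n", jid);
	return 0;
}

int main(void)
{
	store_jid("0", 1);		/* ok: spectator with jid 0 */
	return store_jid("3", 1) < 0;	/* rejected: spectator with jid > 0 */
}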
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c14171..cedb0bb96d96 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
 	{(1UL << GLF_INVALIDATE_IN_PROGRESS),	"i" },		\
 	{(1UL << GLF_REPLY_PENDING),		"r" },		\
 	{(1UL << GLF_INITIAL),			"I" },		\
-	{(1UL << GLF_FROZEN),			"F" })
+	{(1UL << GLF_FROZEN),			"F" },		\
+	{(1UL << GLF_QUEUED),			"q" })
 
 #ifndef NUMPTY
 #define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908e..fb56b783e028 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
 #define RES_JDATA	1
 #define RES_DATA	1
 #define RES_LEAF	1
+#define RES_RG_HDR	1
 #define RES_RG_BIT	2
 #define RES_EATTR	1
 #define RES_STATFS	1
 #define RES_QUOTA	2
 
+/* reserve either the number of blocks to be allocated plus the rg header
+ * block, or all of the blocks in the rg, whichever is smaller */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+{
+	return (al->al_requested < al->al_rgd->rd_length)?
+	       al->al_requested + 1 : al->al_rgd->rd_length;
+}
+
 int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 		     unsigned int revokes);
 
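gfs2_rg_blocks() bounds a transaction's resource-group reservation: the requested blocks plus one header block, but never more than the whole resource group, since one allocation cannot dirty more of the rgrp than exists (the xattr.c hunk below uses it in exactly that way). A standalone sketch of the computation, with illustrative field names mirroring al_requested and rd_length:

#include <stdio.h>

struct alloc {
	unsigned int requested;	/* blocks the caller asked for */
	unsigned int rg_length;	/* total blocks in the resource group */
};

/* Mirror of gfs2_rg_blocks(): requested blocks plus the rg header,
 * bounded by the whole resource group. */
static unsigned int rg_blocks(const struct alloc *al)
{
	return (al->requested < al->rg_length) ?
	       al->requested + 1 : al->rg_length;
}

int main(void)
{
	struct alloc small = { .requested = 8,    .rg_length = 1024 };
	struct alloc huge  = { .requested = 4096, .rg_length = 1024 };

	printf("%u %u\n", rg_blocks(&small), rg_blocks(&huge)); /* 9 1024 */
	return 0;
}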
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 776af6eb4bcb..439b61c03262 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-				 blks + al->al_rgd->rd_length +
+				 blks + gfs2_rg_blocks(al) +
 				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
 	if (error)
 		goto out_ipres;
@@ -1296,10 +1296,8 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
-	struct inode *inode = &ip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_ea_location el;
-	struct buffer_head *dibh;
 	int error;
 
 	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 	if (error)
 		return error;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out_trans_end;
-
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error;
-
-		error = vmtruncate(inode, attr->ia_size);
-		gfs2_assert_warn(GFS2_SB(inode), !error);
-	}
-
-	setattr_copy(inode, attr);
-	mark_inode_dirty(inode);
-
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
-
-out_trans_end:
+	error = gfs2_setattr_simple(ip, attr);
 	gfs2_trans_end(sdp);
 	return error;
 }
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d8..571abe97b42a 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
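The bfind.c change is part of the mechanical semaphore-to-mutex conversion running through these hfs patches: the tree lock is used strictly for mutual exclusion, so a struct mutex documents the intent and gains lockdep coverage. The one-to-one mapping, sketched with the kernel APIs as they appear in the diff:

#include <linux/mutex.h>

struct example {
	struct mutex tree_lock;		/* was: struct semaphore tree_lock; */
};

static void example_use(struct example *e)
{
	mutex_init(&e->tree_lock);	/* was: init_MUTEX(&e->tree_lock); */
	mutex_lock(&e->tree_lock);	/* was: down(&e->tree_lock); */
	/* ... critical section ... */
	mutex_unlock(&e->tree_lock);	/* was: up(&e->tree_lock); */
}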
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7f..3ebc437736fe 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	/* Set the correct compare function */
 	tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21d..2a1d712f85dc 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
 	unsigned int depth;
 
 	//unsigned int map1_size, map_size;
-	struct semaphore tree_lock;
+	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
 	spinlock_t hash_lock;
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 2b3b8611b41b..afa66aaa2237 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,8 +25,6 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	int res;
 
-	dentry->d_op = &hfs_dentry_operations;
-
 	hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
 	hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
 	res = hfs_brec_read(&fd, &rec, sizeof(rec));
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4f55651aaa51..ad97c2d58287 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
 	u16 blockoffset;
 
 	int fs_div;
-
-	struct hlist_head rsrc_inodes;
 };
 
 #define HFS_FLG_BITMAP_DIRTY	0
@@ -215,10 +213,14 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 /* string.c */
 extern const struct dentry_operations hfs_dentry_operations;
 
-extern int hfs_hash_dentry(struct dentry *, struct qstr *);
+extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
+		struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
 		      const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+extern int hfs_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 
 /* trans.c */
 extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
@@ -254,17 +256,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
 	sb->s_dirt = 1;
 }
 
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
-	while (buffer_locked(bh)) {
-		wait_on_buffer(bh);
-	}
-	if (buffer_dirty(bh)) {
-		ll_rw_block(WRITE, 1, &bh);
-		wait_on_buffer(bh);
-	}
-}
-
 #define sb_bread512(sb, sec, data) ({		\
 	struct buffer_head *__bh;		\
 	sector_t __block;			\
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 397b7adc7ce6..dffb4e996643 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
 	HFS_I(inode)->rsrc_inode = dir;
 	HFS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+	hlist_add_fake(&inode->i_hash);
 	mark_inode_dirty(inode);
 out:
 	d_add(dentry, inode);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac991..1563d5ce5764 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
 		mdb->drLsMod = hfs_mtime();
 
 		mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
 	}
 
 	return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
 		HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
 		HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
 		mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
 	}
 
 	if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 927a5af79428..495a976a3cc9 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,7 +51,8 @@ static unsigned char caseorder[256] = {
 /*
  * Hash a string to an integer in a case-independent way
  */
-int hfs_hash_dentry(struct dentry *dentry, struct qstr *this)
+int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *this)
 {
 	const unsigned char *name = this->name;
 	unsigned int hash, len = this->len;
@@ -92,21 +93,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
  * Test for equality of two strings in the HFS filename character ordering.
  * return 1 on failure and 0 on success
  */
-int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
+int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
 	const unsigned char *n1, *n2;
-	int len;
 
-	len = s1->len;
 	if (len >= HFS_NAMELEN) {
-		if (s2->len < HFS_NAMELEN)
+		if (name->len < HFS_NAMELEN)
 			return 1;
 		len = HFS_NAMELEN;
-	} else if (len != s2->len)
+	} else if (len != name->len)
 		return 1;
 
-	n1 = s1->name;
-	n2 = s2->name;
+	n1 = str;
+	n2 = name->name;
 	while (len--) {
 		if (caseorder[*n1++] != caseorder[*n2++])
 			return 1;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 34235d4bf08b..1b55f704fb22 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -20,7 +20,6 @@
 #include <linux/parser.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
 #include "hfs_fs.h"
@@ -79,15 +78,11 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
  */
 static void hfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
 	if (sb->s_dirt)
 		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
-
-	unlock_kernel();
 }
 
 /*
@@ -172,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
 	return i ? &i->vfs_inode : NULL;
 }
 
-static void hfs_destroy_inode(struct inode *inode)
+static void hfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
 }
 
+static void hfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hfs_i_callback);
+}
+
 static const struct super_operations hfs_super_operations = {
 	.alloc_inode	= hfs_alloc_inode,
 	.destroy_inode	= hfs_destroy_inode,
@@ -385,8 +387,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
 	sb->s_fs_info = sbi;
-	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 
 	res = -EINVAL;
 	if (!parse_options((char *)data, sbi)) {
@@ -427,13 +429,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!root_inode)
 		goto bail_no_root;
 
+	sb->s_d_op = &hfs_dentry_operations;
 	res = -ENOMEM;
 	sb->s_root = d_alloc_root(root_inode);
 	if (!sb->s_root)
 		goto bail_iput;
 
-	sb->s_root->d_op = &hfs_dentry_operations;
-
 	/* everything's okay */
 	return 0;
 
@@ -446,17 +447,16 @@ bail:
 	return res;
 }
 
-static int hfs_get_sb(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data,
-		      struct vfsmount *mnt)
+static struct dentry *hfs_mount(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
 }
 
 static struct file_system_type hfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "hfs",
-	.get_sb		= hfs_get_sb,
+	.mount		= hfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 7478f5c219aa..19cf291eb91f 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -8,15 +8,20 @@
  * This file contains the code to do various system dependent things.
  */
 
+#include <linux/namei.h>
 #include "hfs_fs.h"
 
 /* dentry case-handling: just lowercase everything */
 
 static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
 	int diff;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
 	if(!inode)
 		return 1;
 
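The sysdep.c hunk shows the standard RCU-walk opt-out for d_revalidate(): under LOOKUP_RCU the dentry is being walked locklessly, so an implementation that may block or take references returns -ECHILD and the VFS retries in ref-walk mode. The bare pattern (foo_d_revalidate() is a placeholder name):

static int foo_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	/* Cannot block or grab references during lockless path walk;
	 * ask the VFS to fall back to ref-walk mode. */
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

	/* ... normal, possibly-blocking revalidation ... */
	return 1;
}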
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be9..5d799c13205f 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -22,8 +22,9 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 		return -ENOMEM;
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
-	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",
+		tree->cnid, __builtin_return_address(0));
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -31,8 +32,9 @@ void hfs_find_exit(struct hfs_find_data *fd)
 {
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
-	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n",
+		fd->tree->cnid, __builtin_return_address(0));
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
@@ -52,6 +54,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 		rec = (e + b) / 2;
 		len = hfs_brec_lenoff(bnode, rec, &off);
 		keylen = hfs_brec_keylen(bnode, rec);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 		cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
 		if (!cmpval) {
@@ -67,6 +73,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 	if (rec != e && e >= 0) {
 		len = hfs_brec_lenoff(bnode, e, &off);
 		keylen = hfs_brec_keylen(bnode, e);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 	}
 done:
@@ -75,6 +85,7 @@ done:
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
 	fd->entrylength = len - keylen;
+fail:
 	return res;
 }
 
@@ -198,6 +209,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
 
 	len = hfs_brec_lenoff(bnode, fd->record, &off);
 	keylen = hfs_brec_keylen(bnode, fd->record);
+	if (keylen == 0) {
+		res = -EINVAL;
+		goto out;
+	}
 	fd->keyoffset = off;
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
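The keylen == 0 checks added throughout bfind.c turn a corrupt on-disk key length (hfs_brec_keylen() now returns 0 for oversized keys, per the brec.c hunk further down) into -EINVAL rather than reading with a bogus length. The defensive shape, as a standalone sketch with an assumed bound:

#include <stdio.h>

#define EINVAL 22
#define MAX_KEYLEN 516	/* assumed bound, analogous to max_key_len + 2 */

/* Hypothetical reader: validate a length field before trusting it. */
static int read_record(unsigned int keylen)
{
	if (keylen == 0 || keylen > MAX_KEYLEN)
		return -EINVAL;	/* corrupt metadata, refuse to read */
	printf("reading %u key bytes\n", keylen);
	return 0;
}

int main(void)
{
	read_record(32);		/* ok */
	return read_record(0) < 0;	/* rejected */
}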
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03c..1cad80c789cb 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -15,8 +15,10 @@
 
 #define PAGE_CACHE_BITS	(PAGE_CACHE_SIZE * 8)
 
-int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
+int hfsplus_block_allocate(struct super_block *sb, u32 size,
+		u32 offset, u32 *max)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -29,8 +31,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		return size;
 
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	if (IS_ERR(page)) {
 		start = size;
@@ -150,16 +152,17 @@ done:
 	set_page_dirty(page);
 	kunmap(page);
 	*max = offset + (curr - pptr) * 32 + i - start;
-	HFSPLUS_SB(sb).free_blocks -= *max;
+	sbi->free_blocks -= *max;
 	sb->s_dirt = 1;
 	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
 out:
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 	return start;
 }
 
 int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -172,11 +175,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 
 	dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
 	/* are all of the bits in range? */
-	if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+	if ((offset + count) > sbi->total_blocks)
 		return -2;
 
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
 	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
@@ -224,9 +227,9 @@ done:
 out:
 	set_page_dirty(page);
 	kunmap(page);
-	HFSPLUS_SB(sb).free_blocks += len;
+	sbi->free_blocks += len;
 	sb->s_dirt = 1;
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 
 	return 0;
 }
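The bitmap routines stop borrowing the allocation file's i_mutex and take a dedicated alloc_mutex from the superblock info, so the lock now means exactly "bitmap update in progress". A pthread-based sketch of giving the allocator its own lock (names are illustrative):

#include <pthread.h>
#include <stdio.h>

struct sb_info {
	pthread_mutex_t alloc_mutex;	/* guards the allocation bitmap only */
	unsigned int free_blocks;
};

static void block_free(struct sb_info *sbi, unsigned int count)
{
	pthread_mutex_lock(&sbi->alloc_mutex);
	sbi->free_blocks += count;	/* bitmap/bookkeeping update */
	pthread_mutex_unlock(&sbi->alloc_mutex);
}

int main(void)
{
	struct sb_info sbi = { PTHREAD_MUTEX_INITIALIZER, 100 };

	block_free(&sbi, 8);
	printf("free blocks: %u\n", sbi.free_blocks);
	return 0;
}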
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 29da6574ba77..1c42cc5b899f 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -42,7 +42,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 {
 	__be16 data;
-	// optimize later...
+	/* TODO: optimize later... */
 	hfs_bnode_read(node, &data, off, 2);
 	return be16_to_cpu(data);
 }
@@ -50,7 +50,7 @@ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
 {
 	u8 data;
-	// optimize later...
+	/* TODO: optimize later... */
 	hfs_bnode_read(node, &data, off, 1);
 	return data;
 }
@@ -96,7 +96,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
 void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
 {
 	__be16 v = cpu_to_be16(data);
-	// optimize later...
+	/* TODO: optimize later... */
 	hfs_bnode_write(node, &v, off, 2);
 }
 
@@ -212,7 +212,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 			dst_page--;
 		}
 		src -= len;
-		memmove(kmap(*dst_page) + src, kmap(*src_page) + src, len);
+		memmove(kmap(*dst_page) + src,
+			kmap(*src_page) + src, len);
 		kunmap(*src_page);
 		set_page_dirty(*dst_page);
 		kunmap(*dst_page);
@@ -250,14 +251,16 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 
 		if (src == dst) {
 			l = min(len, (int)PAGE_CACHE_SIZE - src);
-			memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l);
+			memmove(kmap(*dst_page) + src,
+				kmap(*src_page) + src, l);
 			kunmap(*src_page);
 			set_page_dirty(*dst_page);
 			kunmap(*dst_page);
 
 			while ((len -= l) != 0) {
 				l = min(len, (int)PAGE_CACHE_SIZE);
-				memmove(kmap(*++dst_page), kmap(*++src_page), l);
+				memmove(kmap(*++dst_page),
+					kmap(*++src_page), l);
 				kunmap(*src_page);
 				set_page_dirty(*dst_page);
 				kunmap(*dst_page);
@@ -268,7 +271,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 			do {
 				src_ptr = kmap(*src_page) + src;
 				dst_ptr = kmap(*dst_page) + dst;
-				if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) {
+				if (PAGE_CACHE_SIZE - src <
+						PAGE_CACHE_SIZE - dst) {
 					l = PAGE_CACHE_SIZE - src;
 					src = 0;
 					dst += l;
@@ -340,7 +344,8 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 			return;
 		tmp->next = node->next;
 		cnid = cpu_to_be32(tmp->next);
-		hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4);
+		hfs_bnode_write(tmp, &cnid,
+			offsetof(struct hfs_bnode_desc, next), 4);
 		hfs_bnode_put(tmp);
 	} else if (node->type == HFS_NODE_LEAF)
 		tree->leaf_head = node->next;
@@ -351,15 +356,15 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 			return;
 		tmp->prev = node->prev;
 		cnid = cpu_to_be32(tmp->prev);
-		hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4);
+		hfs_bnode_write(tmp, &cnid,
+			offsetof(struct hfs_bnode_desc, prev), 4);
 		hfs_bnode_put(tmp);
 	} else if (node->type == HFS_NODE_LEAF)
 		tree->leaf_tail = node->prev;
 
-	// move down?
-	if (!node->prev && !node->next) {
-		printk(KERN_DEBUG "hfs_btree_del_level\n");
-	}
+	/* move down? */
+	if (!node->prev && !node->next)
+		dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
 	if (!node->parent) {
 		tree->root = 0;
 		tree->depth = 0;
@@ -379,16 +384,16 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
 	struct hfs_bnode *node;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node "
+				"%d in B*Tree\n",
+			cnid);
 		return NULL;
 	}
 
 	for (node = tree->node_hash[hfs_bnode_hash(cnid)];
-	     node; node = node->next_hash) {
-		if (node->this == cnid) {
+	     node; node = node->next_hash)
+		if (node->this == cnid)
 			return node;
-		}
-	}
 	return NULL;
 }
 
@@ -402,7 +407,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	loff_t off;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node "
+				"%d in B*Tree\n",
+			cnid);
 		return NULL;
 	}
 
@@ -429,7 +436,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	} else {
 		spin_unlock(&tree->hash_lock);
 		kfree(node);
-		wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags));
+		wait_event(node2->lock_wq,
+			!test_bit(HFS_BNODE_NEW, &node2->flags));
 		return node2;
 	}
 	spin_unlock(&tree->hash_lock);
@@ -483,7 +491,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
 	if (node) {
 		hfs_bnode_get(node);
 		spin_unlock(&tree->hash_lock);
-		wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags));
+		wait_event(node->lock_wq,
+			!test_bit(HFS_BNODE_NEW, &node->flags));
 		if (test_bit(HFS_BNODE_ERROR, &node->flags))
 			goto node_error;
 		return node;
@@ -497,7 +506,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
 	if (!test_bit(HFS_BNODE_NEW, &node->flags))
 		return node;
 
-	desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset);
+	desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) +
+			node->page_offset);
 	node->prev = be32_to_cpu(desc->prev);
 	node->next = be32_to_cpu(desc->next);
 	node->num_recs = be16_to_cpu(desc->num_recs);
@@ -556,11 +566,13 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-	//int i;
+#if 0
+	int i;
 
-	//for (i = 0; i < node->tree->pages_per_bnode; i++)
-	//	if (node->page[i])
-	//		page_cache_release(node->page[i]);
+	for (i = 0; i < node->tree->pages_per_bnode; i++)
+		if (node->page[i])
+			page_cache_release(node->page[i]);
+#endif
 	kfree(node);
 }
 
@@ -607,7 +619,8 @@ void hfs_bnode_get(struct hfs_bnode *node)
 	if (node) {
 		atomic_inc(&node->refcnt);
 		dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
-		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
 	}
 }
 
@@ -619,7 +632,8 @@ void hfs_bnode_put(struct hfs_bnode *node)
 		int i;
 
 		dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
-		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
 		BUG_ON(!atomic_read(&node->refcnt));
 		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
 			return;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a402..2312de34bd42 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -39,13 +39,17 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 	    !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) {
 		retval = node->tree->max_key_len + 2;
 	} else {
-		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
+		recoff = hfs_bnode_read_u16(node,
+			node->tree->node_size - (rec + 1) * 2);
 		if (!recoff)
 			return 0;
-		if (node->tree->attributes & HFS_TREE_BIGKEYS)
-			retval = hfs_bnode_read_u16(node, recoff) + 2;
-		else
-			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+
+		retval = hfs_bnode_read_u16(node, recoff) + 2;
+		if (retval > node->tree->max_key_len + 2) {
+			printk(KERN_ERR "hfs: keylen %d too large\n",
+				retval);
+			retval = 0;
+		}
 	}
 	return retval;
 }
@@ -81,7 +85,8 @@ again:
 	end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
 	end_off = hfs_bnode_read_u16(node, end_rec_off);
 	end_rec_off -= 2;
-	dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off);
+	dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+		rec, size, end_off, end_rec_off);
 	if (size > end_rec_off - end_off) {
 		if (new_node)
 			panic("not enough room!\n");
@@ -96,7 +101,9 @@ again:
 	}
 	node->num_recs++;
 	/* write new last offset */
-	hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+	hfs_bnode_write_u16(node,
+		offsetof(struct hfs_bnode_desc, num_recs),
+		node->num_recs);
 	hfs_bnode_write_u16(node, end_rec_off, end_off + size);
 	data_off = end_off;
 	data_rec_off = end_rec_off + 2;
@@ -148,7 +155,8 @@ skip:
 	if (tree->attributes & HFS_TREE_VARIDXKEYS)
 		key_len = be16_to_cpu(fd->search_key->key_len) + 2;
 	else {
-		fd->search_key->key_len = cpu_to_be16(tree->max_key_len);
+		fd->search_key->key_len =
+			cpu_to_be16(tree->max_key_len);
 		key_len = tree->max_key_len + 2;
 	}
 	goto again;
@@ -177,7 +185,8 @@ again:
 		mark_inode_dirty(tree->inode);
 	}
 	hfs_bnode_dump(node);
-	dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength);
+	dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n",
+		fd->record, fd->keylength + fd->entrylength);
 	if (!--node->num_recs) {
 		hfs_bnode_unlink(node);
 		if (!node->parent)
@@ -191,7 +200,9 @@ again:
 		__hfs_brec_find(node, fd);
 		goto again;
 	}
-	hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+	hfs_bnode_write_u16(node,
+		offsetof(struct hfs_bnode_desc, num_recs),
+		node->num_recs);
 
 	if (rec_off == end_off)
 		goto skip;
@@ -216,7 +227,7 @@ skip:
 static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 {
 	struct hfs_btree *tree;
-	struct hfs_bnode *node, *new_node;
+	struct hfs_bnode *node, *new_node, *next_node;
 	struct hfs_bnode_desc node_desc;
 	int num_recs, new_rec_off, new_off, old_rec_off;
 	int data_start, data_end, size;
@@ -235,6 +246,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	new_node->type = node->type;
 	new_node->height = node->height;
 
+	if (node->next)
+		next_node = hfs_bnode_find(tree, node->next);
+	else
+		next_node = NULL;
+
+	if (IS_ERR(next_node)) {
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		return next_node;
+	}
+
 	size = tree->node_size / 2 - node->num_recs * 2 - 14;
 	old_rec_off = tree->node_size - 4;
 	num_recs = 1;
@@ -248,6 +270,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 			/* panic? */
 			hfs_bnode_put(node);
 			hfs_bnode_put(new_node);
+			if (next_node)
+				hfs_bnode_put(next_node);
 			return ERR_PTR(-ENOSPC);
 		}
 
@@ -302,8 +326,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
 
 	/* update next bnode header */
-	if (new_node->next) {
-		struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+	if (next_node) {
 		next_node->prev = new_node->this;
 		hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
 		node_desc.prev = cpu_to_be32(next_node->prev);
@@ -349,7 +372,8 @@ again:
349 newkeylen = hfs_bnode_read_u16(node, 14) + 2; 372 newkeylen = hfs_bnode_read_u16(node, 14) + 2;
350 else 373 else
351 fd->keylength = newkeylen = tree->max_key_len + 2; 374 fd->keylength = newkeylen = tree->max_key_len + 2;
352 dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); 375 dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n",
376 rec, fd->keylength, newkeylen);
353 377
354 rec_off = tree->node_size - (rec + 2) * 2; 378 rec_off = tree->node_size - (rec + 2) * 2;
355 end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; 379 end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
@@ -360,7 +384,7 @@ again:
360 end_off = hfs_bnode_read_u16(parent, end_rec_off); 384 end_off = hfs_bnode_read_u16(parent, end_rec_off);
361 if (end_rec_off - end_off < diff) { 385 if (end_rec_off - end_off < diff) {
362 386
363 printk(KERN_DEBUG "hfs: splitting index node...\n"); 387 dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");
364 fd->bnode = parent; 388 fd->bnode = parent;
365 new_node = hfs_bnode_split(fd); 389 new_node = hfs_bnode_split(fd);
366 if (IS_ERR(new_node)) 390 if (IS_ERR(new_node))
@@ -368,7 +392,8 @@ again:
368 parent = fd->bnode; 392 parent = fd->bnode;
369 rec = fd->record; 393 rec = fd->record;
370 rec_off = tree->node_size - (rec + 2) * 2; 394 rec_off = tree->node_size - (rec + 2) * 2;
371 end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; 395 end_rec_off = tree->node_size -
396 (parent->num_recs + 1) * 2;
372 } 397 }
373 } 398 }
374 399
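
The hunks above also demote the unconditional printk(KERN_DEBUG ...) in the index-node split path to dprint(DBG_BNODE_MOD, ...), which is compiled against a debug mask. A self-contained sketch of such a mask-gated macro; the mask value and the fprintf backend are assumptions for the demo, the kernel's dprint() is configured elsewhere:

#include <stdio.h>

#define DBG_BNODE_MOD	0x00000001
#define DBG_EXTENT	0x00000002

/* Assumed to be set from a module parameter or build-time option. */
static unsigned int debug_mask = DBG_BNODE_MOD;

#define dprint(flg, fmt, ...)						\
	do {								\
		if (debug_mask & (flg))					\
			fprintf(stderr, fmt, ##__VA_ARGS__);		\
	} while (0)

int main(void)
{
	dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");	/* shown */
	dprint(DBG_EXTENT, "extend %lu: %u,%u\n", 16UL, 100u, 8u); /* gated off */
	return 0;
}
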
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e293..21023d9f8ff3 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
30 if (!tree) 30 if (!tree)
31 return NULL; 31 return NULL;
32 32
33 init_MUTEX(&tree->tree_lock); 33 mutex_init(&tree->tree_lock);
34 spin_lock_init(&tree->hash_lock); 34 spin_lock_init(&tree->hash_lock);
35 tree->sb = sb; 35 tree->sb = sb;
36 tree->cnid = id; 36 tree->cnid = id;
@@ -39,13 +39,20 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
39 goto free_tree; 39 goto free_tree;
40 tree->inode = inode; 40 tree->inode = inode;
41 41
42 if (!HFSPLUS_I(tree->inode)->first_blocks) {
43 printk(KERN_ERR
44 "hfs: invalid btree extent records (0 size).\n");
45 goto free_inode;
46 }
47
42 mapping = tree->inode->i_mapping; 48 mapping = tree->inode->i_mapping;
43 page = read_mapping_page(mapping, 0, NULL); 49 page = read_mapping_page(mapping, 0, NULL);
44 if (IS_ERR(page)) 50 if (IS_ERR(page))
45 goto free_tree; 51 goto free_inode;
46 52
47 /* Load the header */ 53 /* Load the header */
48 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 54 head = (struct hfs_btree_header_rec *)(kmap(page) +
55 sizeof(struct hfs_bnode_desc));
49 tree->root = be32_to_cpu(head->root); 56 tree->root = be32_to_cpu(head->root);
50 tree->leaf_count = be32_to_cpu(head->leaf_count); 57 tree->leaf_count = be32_to_cpu(head->leaf_count);
51 tree->leaf_head = be32_to_cpu(head->leaf_head); 58 tree->leaf_head = be32_to_cpu(head->leaf_head);
@@ -57,40 +64,72 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
57 tree->max_key_len = be16_to_cpu(head->max_key_len); 64 tree->max_key_len = be16_to_cpu(head->max_key_len);
58 tree->depth = be16_to_cpu(head->depth); 65 tree->depth = be16_to_cpu(head->depth);
59 66
60 /* Set the correct compare function */ 67 /* Verify the tree and set the correct compare function */
61 if (id == HFSPLUS_EXT_CNID) { 68 switch (id) {
69 case HFSPLUS_EXT_CNID:
70 if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
71 printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
72 tree->max_key_len);
73 goto fail_page;
74 }
75 if (tree->attributes & HFS_TREE_VARIDXKEYS) {
76 printk(KERN_ERR "hfs: invalid extent btree flag\n");
77 goto fail_page;
78 }
79
62 tree->keycmp = hfsplus_ext_cmp_key; 80 tree->keycmp = hfsplus_ext_cmp_key;
63 } else if (id == HFSPLUS_CAT_CNID) { 81 break;
64 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && 82 case HFSPLUS_CAT_CNID:
83 if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
84 printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
85 tree->max_key_len);
86 goto fail_page;
87 }
88 if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
89 printk(KERN_ERR "hfs: invalid catalog btree flag\n");
90 goto fail_page;
91 }
92
93 if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
65 (head->key_type == HFSPLUS_KEY_BINARY)) 94 (head->key_type == HFSPLUS_KEY_BINARY))
66 tree->keycmp = hfsplus_cat_bin_cmp_key; 95 tree->keycmp = hfsplus_cat_bin_cmp_key;
67 else { 96 else {
68 tree->keycmp = hfsplus_cat_case_cmp_key; 97 tree->keycmp = hfsplus_cat_case_cmp_key;
69 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD; 98 set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
70 } 99 }
71 } else { 100 break;
101 default:
72 printk(KERN_ERR "hfs: unknown B*Tree requested\n"); 102 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
73 goto fail_page; 103 goto fail_page;
74 } 104 }
75 105
106 if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
107 printk(KERN_ERR "hfs: invalid btree flag\n");
108 goto fail_page;
109 }
110
76 size = tree->node_size; 111 size = tree->node_size;
77 if (!is_power_of_2(size)) 112 if (!is_power_of_2(size))
78 goto fail_page; 113 goto fail_page;
79 if (!tree->node_count) 114 if (!tree->node_count)
80 goto fail_page; 115 goto fail_page;
116
81 tree->node_size_shift = ffs(size) - 1; 117 tree->node_size_shift = ffs(size) - 1;
82 118
83 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 119 tree->pages_per_bnode =
120 (tree->node_size + PAGE_CACHE_SIZE - 1) >>
121 PAGE_CACHE_SHIFT;
84 122
85 kunmap(page); 123 kunmap(page);
86 page_cache_release(page); 124 page_cache_release(page);
87 return tree; 125 return tree;
88 126
89 fail_page: 127 fail_page:
90 tree->inode->i_mapping->a_ops = &hfsplus_aops;
91 page_cache_release(page); 128 page_cache_release(page);
92 free_tree: 129 free_inode:
130 tree->inode->i_mapping->a_ops = &hfsplus_aops;
93 iput(tree->inode); 131 iput(tree->inode);
132 free_tree:
94 kfree(tree); 133 kfree(tree);
95 return NULL; 134 return NULL;
96} 135}
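
The rewritten hfs_btree_open() above refuses to mount a tree whose header contradicts its CNID: the extents tree must have a fixed key length and no VARIDXKEYS flag, the catalog tree must have variable keys, and both must carry BIGKEYS. A compilable sketch of that validation; the constants here are illustrative stand-ins, the real values live in the hfsplus headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { EXT_CNID = 3, CAT_CNID = 4 };
#define EXT_KEYLEN		12	/* demo value */
#define CAT_KEYLEN		516	/* demo value */
#define TREE_BIGKEYS		2
#define TREE_VARIDXKEYS		4

struct btree_hdr { uint32_t cnid; uint16_t max_key_len; uint32_t attributes; };

static bool btree_header_valid(const struct btree_hdr *h)
{
	if (!(h->attributes & TREE_BIGKEYS))
		return false;
	switch (h->cnid) {
	case EXT_CNID:	/* extent keys: fixed length, never variable */
		return h->max_key_len == EXT_KEYLEN - sizeof(uint16_t) &&
		       !(h->attributes & TREE_VARIDXKEYS);
	case CAT_CNID:	/* catalog keys embed a name, must be variable */
		return h->max_key_len == CAT_KEYLEN - sizeof(uint16_t) &&
		       (h->attributes & TREE_VARIDXKEYS);
	default:
		return false;
	}
}

int main(void)
{
	struct btree_hdr bad = { EXT_CNID, 9, TREE_BIGKEYS | TREE_VARIDXKEYS };

	printf("valid: %d\n", btree_header_valid(&bad));	/* 0: rejected */
	return 0;
}
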
@@ -108,8 +147,10 @@ void hfs_btree_close(struct hfs_btree *tree)
108 while ((node = tree->node_hash[i])) { 147 while ((node = tree->node_hash[i])) {
109 tree->node_hash[i] = node->next_hash; 148 tree->node_hash[i] = node->next_hash;
110 if (atomic_read(&node->refcnt)) 149 if (atomic_read(&node->refcnt))
111 printk(KERN_CRIT "hfs: node %d:%d still has %d user(s)!\n", 150 printk(KERN_CRIT "hfs: node %d:%d "
112 node->tree->cnid, node->this, atomic_read(&node->refcnt)); 151 "still has %d user(s)!\n",
152 node->tree->cnid, node->this,
153 atomic_read(&node->refcnt));
113 hfs_bnode_free(node); 154 hfs_bnode_free(node);
114 tree->node_hash_cnt--; 155 tree->node_hash_cnt--;
115 } 156 }
@@ -130,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree)
130 return; 171 return;
131 /* Load the header */ 172 /* Load the header */
132 page = node->page[0]; 173 page = node->page[0];
133 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 174 head = (struct hfs_btree_header_rec *)(kmap(page) +
175 sizeof(struct hfs_bnode_desc));
134 176
135 head->root = cpu_to_be32(tree->root); 177 head->root = cpu_to_be32(tree->root);
136 head->leaf_count = cpu_to_be32(tree->leaf_count); 178 head->leaf_count = cpu_to_be32(tree->leaf_count);
@@ -192,17 +234,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
192 234
193 while (!tree->free_nodes) { 235 while (!tree->free_nodes) {
194 struct inode *inode = tree->inode; 236 struct inode *inode = tree->inode;
237 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
195 u32 count; 238 u32 count;
196 int res; 239 int res;
197 240
198 res = hfsplus_file_extend(inode); 241 res = hfsplus_file_extend(inode);
199 if (res) 242 if (res)
200 return ERR_PTR(res); 243 return ERR_PTR(res);
201 HFSPLUS_I(inode).phys_size = inode->i_size = 244 hip->phys_size = inode->i_size =
202 (loff_t)HFSPLUS_I(inode).alloc_blocks << 245 (loff_t)hip->alloc_blocks <<
203 HFSPLUS_SB(tree->sb).alloc_blksz_shift; 246 HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
204 HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks << 247 hip->fs_blocks =
205 HFSPLUS_SB(tree->sb).fs_shift; 248 hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
206 inode_set_bytes(inode, inode->i_size); 249 inode_set_bytes(inode, inode->i_size);
207 count = inode->i_size >> tree->node_size_shift; 250 count = inode->i_size >> tree->node_size_shift;
208 tree->free_nodes = count - tree->node_count; 251 tree->free_nodes = count - tree->node_count;
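
The loop above grows the btree file one clump at a time and derives every size with shifts rather than divisions. The same arithmetic in isolation, with assumed demo geometry (4 KiB allocation blocks, 512-byte fs blocks, 4 KiB nodes; none of these values come from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int alloc_blocks = 10;
	unsigned int alloc_blksz_shift = 12;	/* 4096-byte alloc block */
	unsigned int fs_shift = 3;		/* 4096 / 512 = 1 << 3 */
	unsigned int node_size_shift = 12;	/* 4 KiB btree nodes */

	uint64_t phys_size = (uint64_t)alloc_blocks << alloc_blksz_shift;
	unsigned int fs_blocks = alloc_blocks << fs_shift;
	unsigned int node_count = (unsigned int)(phys_size >> node_size_shift);

	printf("%llu bytes, %u fs blocks, %u nodes\n",
	       (unsigned long long)phys_size, fs_blocks, node_count);
	return 0;
}
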
@@ -235,7 +278,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
235 tree->free_nodes--; 278 tree->free_nodes--;
236 mark_inode_dirty(tree->inode); 279 mark_inode_dirty(tree->inode);
237 hfs_bnode_put(node); 280 hfs_bnode_put(node);
238 return hfs_bnode_create(tree, idx); 281 return hfs_bnode_create(tree,
282 idx);
239 } 283 }
240 } 284 }
241 } 285 }
@@ -250,7 +294,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
250 kunmap(*pagep); 294 kunmap(*pagep);
251 nidx = node->next; 295 nidx = node->next;
252 if (!nidx) { 296 if (!nidx) {
253 printk(KERN_DEBUG "hfs: create new bmap node...\n"); 297 dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n");
254 next_node = hfs_bmap_new_bmap(node, idx); 298 next_node = hfs_bmap_new_bmap(node, idx);
255 } else 299 } else
256 next_node = hfs_bnode_find(tree, nidx); 300 next_node = hfs_bnode_find(tree, nidx);
@@ -292,7 +336,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
292 hfs_bnode_put(node); 336 hfs_bnode_put(node);
293 if (!i) { 337 if (!i) {
294 /* panic */; 338 /* panic */;
295 printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this); 339 printk(KERN_CRIT "hfs: unable to free bnode %u. "
340 "bmap not found!\n",
341 node->this);
296 return; 342 return;
297 } 343 }
298 node = hfs_bnode_find(tree, i); 344 node = hfs_bnode_find(tree, i);
@@ -300,7 +346,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
300 return; 346 return;
301 if (node->type != HFS_NODE_MAP) { 347 if (node->type != HFS_NODE_MAP) {
302 /* panic */; 348 /* panic */;
303 printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type); 349 printk(KERN_CRIT "hfs: invalid bmap found! "
350 "(%u,%d)\n",
351 node->this, node->type);
304 hfs_bnode_put(node); 352 hfs_bnode_put(node);
305 return; 353 return;
306 } 354 }
@@ -313,7 +361,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
313 m = 1 << (~nidx & 7); 361 m = 1 << (~nidx & 7);
314 byte = data[off]; 362 byte = data[off];
315 if (!(byte & m)) { 363 if (!(byte & m)) {
316 printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type); 364 printk(KERN_CRIT "hfs: trying to free free bnode "
365 "%u(%d)\n",
366 node->this, node->type);
317 kunmap(page); 367 kunmap(page);
318 hfs_bnode_put(node); 368 hfs_bnode_put(node);
319 return; 369 return;
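
A pattern running through this whole file (and the rest of the series) is the switch from HFSPLUS_SB(sb).field and HFSPLUS_I(inode).field to the -> form, which implies the accessor macros now return a pointer to the private info instead of dereferencing it in place. A user-space sketch of the container_of idiom such a helper typically uses; struct layout and names here are assumptions for the demo:

#include <stddef.h>
#include <stdio.h>

struct inode { unsigned long i_ino; };

struct hfsplus_inode_info {
	unsigned long linkid;
	struct inode vfs_inode;		/* VFS inode embedded in fs-private one */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
{
	return container_of(inode, struct hfsplus_inode_info, vfs_inode);
}

int main(void)
{
	struct hfsplus_inode_info info = { .linkid = 7 };

	printf("%lu\n", HFSPLUS_I(&info.vfs_inode)->linkid);	/* prints 7 */
	return 0;
}
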
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf2..b4ba1b319333 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
67 key->key_len = cpu_to_be16(6 + ustrlen); 67 key->key_len = cpu_to_be16(6 + ustrlen);
68} 68}
69 69
70static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) 70void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
71{ 71{
72 if (inode->i_flags & S_IMMUTABLE) 72 if (inode->i_flags & S_IMMUTABLE)
73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; 73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,25 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
77 perms->rootflags |= HFSPLUS_FLG_APPEND; 77 perms->rootflags |= HFSPLUS_FLG_APPEND;
78 else 78 else
79 perms->rootflags &= ~HFSPLUS_FLG_APPEND; 79 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
80 HFSPLUS_I(inode).rootflags = perms->rootflags; 80
81 HFSPLUS_I(inode).userflags = perms->userflags; 81 perms->userflags = HFSPLUS_I(inode)->userflags;
82 perms->mode = cpu_to_be16(inode->i_mode); 82 perms->mode = cpu_to_be16(inode->i_mode);
83 perms->owner = cpu_to_be32(inode->i_uid); 83 perms->owner = cpu_to_be32(inode->i_uid);
84 perms->group = cpu_to_be32(inode->i_gid); 84 perms->group = cpu_to_be32(inode->i_gid);
85
86 if (S_ISREG(inode->i_mode))
87 perms->dev = cpu_to_be32(inode->i_nlink);
88 else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
89 perms->dev = cpu_to_be32(inode->i_rdev);
90 else
91 perms->dev = 0;
85} 92}
86 93
87static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) 94static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
95 u32 cnid, struct inode *inode)
88{ 96{
97 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
98
89 if (S_ISDIR(inode->i_mode)) { 99 if (S_ISDIR(inode->i_mode)) {
90 struct hfsplus_cat_folder *folder; 100 struct hfsplus_cat_folder *folder;
91 101
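
The hfsplus_cat_set_perms() hunk above makes the overloading of the on-disk dev field explicit: it carries the link count for regular files, the device number for block and character specials, and zero otherwise. The same dispatch expressed in user-space terms (cat_dev_field() is a hypothetical helper, not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>

static uint32_t cat_dev_field(const struct stat *st)
{
	if (S_ISREG(st->st_mode))
		return (uint32_t)st->st_nlink;		/* link count */
	if (S_ISBLK(st->st_mode) || S_ISCHR(st->st_mode))
		return (uint32_t)st->st_rdev;		/* device number */
	return 0;
}

int main(void)
{
	struct stat st;

	if (stat("/dev/null", &st) == 0)
		printf("dev field: %u\n", cat_dev_field(&st));
	return 0;
}
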
@@ -93,13 +103,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
93 memset(folder, 0, sizeof(*folder)); 103 memset(folder, 0, sizeof(*folder));
94 folder->type = cpu_to_be16(HFSPLUS_FOLDER); 104 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
95 folder->id = cpu_to_be32(inode->i_ino); 105 folder->id = cpu_to_be32(inode->i_ino);
96 HFSPLUS_I(inode).create_date = 106 HFSPLUS_I(inode)->create_date =
97 folder->create_date = 107 folder->create_date =
98 folder->content_mod_date = 108 folder->content_mod_date =
99 folder->attribute_mod_date = 109 folder->attribute_mod_date =
100 folder->access_date = hfsp_now2mt(); 110 folder->access_date = hfsp_now2mt();
101 hfsplus_set_perms(inode, &folder->permissions); 111 hfsplus_cat_set_perms(inode, &folder->permissions);
102 if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir) 112 if (inode == sbi->hidden_dir)
103 /* invisible and namelocked */ 113 /* invisible and namelocked */
104 folder->user_info.frFlags = cpu_to_be16(0x5000); 114 folder->user_info.frFlags = cpu_to_be16(0x5000);
105 return sizeof(*folder); 115 return sizeof(*folder);
@@ -111,28 +121,40 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
111 file->type = cpu_to_be16(HFSPLUS_FILE); 121 file->type = cpu_to_be16(HFSPLUS_FILE);
112 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); 122 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
113 file->id = cpu_to_be32(cnid); 123 file->id = cpu_to_be32(cnid);
114 HFSPLUS_I(inode).create_date = 124 HFSPLUS_I(inode)->create_date =
115 file->create_date = 125 file->create_date =
116 file->content_mod_date = 126 file->content_mod_date =
117 file->attribute_mod_date = 127 file->attribute_mod_date =
118 file->access_date = hfsp_now2mt(); 128 file->access_date = hfsp_now2mt();
119 if (cnid == inode->i_ino) { 129 if (cnid == inode->i_ino) {
120 hfsplus_set_perms(inode, &file->permissions); 130 hfsplus_cat_set_perms(inode, &file->permissions);
121 if (S_ISLNK(inode->i_mode)) { 131 if (S_ISLNK(inode->i_mode)) {
122 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); 132 file->user_info.fdType =
123 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); 133 cpu_to_be32(HFSP_SYMLINK_TYPE);
134 file->user_info.fdCreator =
135 cpu_to_be32(HFSP_SYMLINK_CREATOR);
124 } else { 136 } else {
125 file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type); 137 file->user_info.fdType =
126 file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator); 138 cpu_to_be32(sbi->type);
139 file->user_info.fdCreator =
140 cpu_to_be32(sbi->creator);
127 } 141 }
128 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 142 if (HFSPLUS_FLG_IMMUTABLE &
129 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 143 (file->permissions.rootflags |
144 file->permissions.userflags))
145 file->flags |=
146 cpu_to_be16(HFSPLUS_FILE_LOCKED);
130 } else { 147 } else {
131 file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); 148 file->user_info.fdType =
132 file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); 149 cpu_to_be32(HFSP_HARDLINK_TYPE);
133 file->user_info.fdFlags = cpu_to_be16(0x100); 150 file->user_info.fdCreator =
134 file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date; 151 cpu_to_be32(HFSP_HFSPLUS_CREATOR);
135 file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev); 152 file->user_info.fdFlags =
153 cpu_to_be16(0x100);
154 file->create_date =
155 HFSPLUS_I(sbi->hidden_dir)->create_date;
156 file->permissions.dev =
157 cpu_to_be32(HFSPLUS_I(inode)->linkid);
136 } 158 }
137 return sizeof(*file); 159 return sizeof(*file);
138 } 160 }
@@ -173,27 +195,30 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
173 return -EIO; 195 return -EIO;
174 } 196 }
175 197
176 hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), 198 hfsplus_cat_build_key_uni(fd->search_key,
177 &tmp.thread.nodeName); 199 be32_to_cpu(tmp.thread.parentID),
200 &tmp.thread.nodeName);
178 return hfs_brec_find(fd); 201 return hfs_brec_find(fd);
179} 202}
180 203
181int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) 204int hfsplus_create_cat(u32 cnid, struct inode *dir,
205 struct qstr *str, struct inode *inode)
182{ 206{
207 struct super_block *sb = dir->i_sb;
183 struct hfs_find_data fd; 208 struct hfs_find_data fd;
184 struct super_block *sb;
185 hfsplus_cat_entry entry; 209 hfsplus_cat_entry entry;
186 int entry_size; 210 int entry_size;
187 int err; 211 int err;
188 212
189 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); 213 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n",
190 sb = dir->i_sb; 214 str->name, cnid, inode->i_nlink);
191 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 215 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
192 216
193 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); 217 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
194 entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? 218 entry_size = hfsplus_fill_cat_thread(sb, &entry,
219 S_ISDIR(inode->i_mode) ?
195 HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, 220 HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
196 dir->i_ino, str); 221 dir->i_ino, str);
197 err = hfs_brec_find(&fd); 222 err = hfs_brec_find(&fd);
198 if (err != -ENOENT) { 223 if (err != -ENOENT) {
199 if (!err) 224 if (!err)
@@ -219,7 +244,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
219 244
220 dir->i_size++; 245 dir->i_size++;
221 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 246 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
222 mark_inode_dirty(dir); 247 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
248
223 hfs_find_exit(&fd); 249 hfs_find_exit(&fd);
224 return 0; 250 return 0;
225 251
@@ -234,16 +260,16 @@ err2:
234 260
235int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) 261int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
236{ 262{
237 struct super_block *sb; 263 struct super_block *sb = dir->i_sb;
238 struct hfs_find_data fd; 264 struct hfs_find_data fd;
239 struct hfsplus_fork_raw fork; 265 struct hfsplus_fork_raw fork;
240 struct list_head *pos; 266 struct list_head *pos;
241 int err, off; 267 int err, off;
242 u16 type; 268 u16 type;
243 269
244 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); 270 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n",
245 sb = dir->i_sb; 271 str ? str->name : NULL, cnid);
246 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 272 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
247 273
248 if (!str) { 274 if (!str) {
249 int len; 275 int len;
@@ -253,11 +279,15 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
253 if (err) 279 if (err)
254 goto out; 280 goto out;
255 281
256 off = fd.entryoffset + offsetof(struct hfsplus_cat_thread, nodeName); 282 off = fd.entryoffset +
283 offsetof(struct hfsplus_cat_thread, nodeName);
257 fd.search_key->cat.parent = cpu_to_be32(dir->i_ino); 284 fd.search_key->cat.parent = cpu_to_be32(dir->i_ino);
258 hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.length, off, 2); 285 hfs_bnode_read(fd.bnode,
286 &fd.search_key->cat.name.length, off, 2);
259 len = be16_to_cpu(fd.search_key->cat.name.length) * 2; 287 len = be16_to_cpu(fd.search_key->cat.name.length) * 2;
260 hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.unicode, off + 2, len); 288 hfs_bnode_read(fd.bnode,
289 &fd.search_key->cat.name.unicode,
290 off + 2, len);
261 fd.search_key->key_len = cpu_to_be16(6 + len); 291 fd.search_key->key_len = cpu_to_be16(6 + len);
262 } else 292 } else
263 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); 293 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
@@ -274,12 +304,13 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
274 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA); 304 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA);
275#endif 305#endif
276 306
277 off = fd.entryoffset + offsetof(struct hfsplus_cat_file, rsrc_fork); 307 off = fd.entryoffset +
308 offsetof(struct hfsplus_cat_file, rsrc_fork);
278 hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork)); 309 hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork));
279 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); 310 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
280 } 311 }
281 312
282 list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) { 313 list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
283 struct hfsplus_readdir_data *rd = 314 struct hfsplus_readdir_data *rd =
284 list_entry(pos, struct hfsplus_readdir_data, list); 315 list_entry(pos, struct hfsplus_readdir_data, list);
285 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) 316 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -301,7 +332,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
301 332
302 dir->i_size--; 333 dir->i_size--;
303 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 334 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
304 mark_inode_dirty(dir); 335 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
305out: 336out:
306 hfs_find_exit(&fd); 337 hfs_find_exit(&fd);
307 338
@@ -312,16 +343,16 @@ int hfsplus_rename_cat(u32 cnid,
312 struct inode *src_dir, struct qstr *src_name, 343 struct inode *src_dir, struct qstr *src_name,
313 struct inode *dst_dir, struct qstr *dst_name) 344 struct inode *dst_dir, struct qstr *dst_name)
314{ 345{
315 struct super_block *sb; 346 struct super_block *sb = src_dir->i_sb;
316 struct hfs_find_data src_fd, dst_fd; 347 struct hfs_find_data src_fd, dst_fd;
317 hfsplus_cat_entry entry; 348 hfsplus_cat_entry entry;
318 int entry_size, type; 349 int entry_size, type;
319 int err = 0; 350 int err = 0;
320 351
321 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, 352 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
353 cnid, src_dir->i_ino, src_name->name,
322 dst_dir->i_ino, dst_name->name); 354 dst_dir->i_ino, dst_name->name);
323 sb = src_dir->i_sb; 355 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
324 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
325 dst_fd = src_fd; 356 dst_fd = src_fd;
326 357
327 /* find the old dir entry and read the data */ 358 /* find the old dir entry and read the data */
@@ -347,7 +378,6 @@ int hfsplus_rename_cat(u32 cnid,
347 goto out; 378 goto out;
348 dst_dir->i_size++; 379 dst_dir->i_size++;
349 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; 380 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
350 mark_inode_dirty(dst_dir);
351 381
352 /* finally remove the old entry */ 382 /* finally remove the old entry */
353 hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); 383 hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
@@ -359,7 +389,6 @@ int hfsplus_rename_cat(u32 cnid,
359 goto out; 389 goto out;
360 src_dir->i_size--; 390 src_dir->i_size--;
361 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; 391 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
362 mark_inode_dirty(src_dir);
363 392
364 /* remove old thread entry */ 393 /* remove old thread entry */
365 hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); 394 hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
@@ -373,7 +402,8 @@ int hfsplus_rename_cat(u32 cnid,
373 402
374 /* create new thread entry */ 403 /* create new thread entry */
375 hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); 404 hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
376 entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name); 405 entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
406 dst_dir->i_ino, dst_name);
377 err = hfs_brec_find(&dst_fd); 407 err = hfs_brec_find(&dst_fd);
378 if (err != -ENOENT) { 408 if (err != -ENOENT) {
379 if (!err) 409 if (!err)
@@ -381,6 +411,9 @@ int hfsplus_rename_cat(u32 cnid,
381 goto out; 411 goto out;
382 } 412 }
383 err = hfs_brec_insert(&dst_fd, &entry, entry_size); 413 err = hfs_brec_insert(&dst_fd, &entry, entry_size);
414
415 hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY);
416 hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY);
384out: 417out:
385 hfs_bnode_put(dst_fd.bnode); 418 hfs_bnode_put(dst_fd.bnode);
386 hfs_find_exit(&src_fd); 419 hfs_find_exit(&src_fd);
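
Throughout catalog.c the plain mark_inode_dirty(dir) calls become hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY), i.e. the caller now records which on-disk structure changed before the generic dirtying runs. A toy model of that flag bookkeeping; the flag names mirror the patch, everything else is a stand-in:

#include <stdio.h>

enum {
	I_CAT_DIRTY	= 1 << 0,	/* catalog record needs rewrite */
	I_EXT_DIRTY	= 1 << 1,	/* cached extent record needs rewrite */
	I_ALLOC_DIRTY	= 1 << 2,	/* allocation info needs rewrite */
};

struct demo_inode { unsigned long dirty; };

/* Stand-in for hfsplus_mark_inode_dirty(): remember *what* is dirty,
 * then hand off to the generic VFS dirtying (stubbed as a printf). */
static void mark_inode_dirty_flag(struct demo_inode *inode, unsigned long flag)
{
	inode->dirty |= flag;
	printf("inode dirtied, flags now %#lx\n", inode->dirty);
}

int main(void)
{
	struct demo_inode dir = { 0 };

	mark_inode_dirty_flag(&dir, I_CAT_DIRTY);
	return 0;
}
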
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca88..4df5059c25da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,9 +37,8 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
37 37
38 sb = dir->i_sb; 38 sb = dir->i_sb;
39 39
40 dentry->d_op = &hfsplus_dentry_operations;
41 dentry->d_fsdata = NULL; 40 dentry->d_fsdata = NULL;
42 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 41 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 42 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
44again: 43again:
45 err = hfs_brec_read(&fd, &entry, sizeof(entry)); 44 err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -66,11 +65,17 @@ again:
66 goto fail; 65 goto fail;
67 } 66 }
68 cnid = be32_to_cpu(entry.file.id); 67 cnid = be32_to_cpu(entry.file.id);
69 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && 68 if (entry.file.user_info.fdType ==
70 entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && 69 cpu_to_be32(HFSP_HARDLINK_TYPE) &&
71 (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date || 70 entry.file.user_info.fdCreator ==
72 entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) && 71 cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
73 HFSPLUS_SB(sb).hidden_dir) { 72 (entry.file.create_date ==
73 HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->
74 create_date ||
75 entry.file.create_date ==
76 HFSPLUS_I(sb->s_root->d_inode)->
77 create_date) &&
78 HFSPLUS_SB(sb)->hidden_dir) {
74 struct qstr str; 79 struct qstr str;
75 char name[32]; 80 char name[32];
76 81
@@ -83,10 +88,13 @@ again:
83 linkid = 0; 88 linkid = 0;
84 } else { 89 } else {
85 dentry->d_fsdata = (void *)(unsigned long)cnid; 90 dentry->d_fsdata = (void *)(unsigned long)cnid;
86 linkid = be32_to_cpu(entry.file.permissions.dev); 91 linkid =
92 be32_to_cpu(entry.file.permissions.dev);
87 str.len = sprintf(name, "iNode%d", linkid); 93 str.len = sprintf(name, "iNode%d", linkid);
88 str.name = name; 94 str.name = name;
89 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str); 95 hfsplus_cat_build_key(sb, fd.search_key,
96 HFSPLUS_SB(sb)->hidden_dir->i_ino,
97 &str);
90 goto again; 98 goto again;
91 } 99 }
92 } else if (!dentry->d_fsdata) 100 } else if (!dentry->d_fsdata)
@@ -101,7 +109,7 @@ again:
101 if (IS_ERR(inode)) 109 if (IS_ERR(inode))
102 return ERR_CAST(inode); 110 return ERR_CAST(inode);
103 if (S_ISREG(inode->i_mode)) 111 if (S_ISREG(inode->i_mode))
104 HFSPLUS_I(inode).dev = linkid; 112 HFSPLUS_I(inode)->linkid = linkid;
105out: 113out:
106 d_add(dentry, inode); 114 d_add(dentry, inode);
107 return NULL; 115 return NULL;
@@ -124,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
124 if (filp->f_pos >= inode->i_size) 132 if (filp->f_pos >= inode->i_size)
125 return 0; 133 return 0;
126 134
127 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 135 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
128 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); 136 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
129 err = hfs_brec_find(&fd); 137 err = hfs_brec_find(&fd);
130 if (err) 138 if (err)
@@ -138,7 +146,8 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
138 filp->f_pos++; 146 filp->f_pos++;
139 /* fall through */ 147 /* fall through */
140 case 1: 148 case 1:
141 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 149 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
150 fd.entrylength);
142 if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { 151 if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
143 printk(KERN_ERR "hfs: bad catalog folder thread\n"); 152 printk(KERN_ERR "hfs: bad catalog folder thread\n");
144 err = -EIO; 153 err = -EIO;
@@ -168,20 +177,23 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
168 err = -EIO; 177 err = -EIO;
169 goto out; 178 goto out;
170 } 179 }
171 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); 180 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
181 fd.entrylength);
172 type = be16_to_cpu(entry.type); 182 type = be16_to_cpu(entry.type);
173 len = HFSPLUS_MAX_STRLEN; 183 len = HFSPLUS_MAX_STRLEN;
174 err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); 184 err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
175 if (err) 185 if (err)
176 goto out; 186 goto out;
177 if (type == HFSPLUS_FOLDER) { 187 if (type == HFSPLUS_FOLDER) {
178 if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { 188 if (fd.entrylength <
189 sizeof(struct hfsplus_cat_folder)) {
179 printk(KERN_ERR "hfs: small dir entry\n"); 190 printk(KERN_ERR "hfs: small dir entry\n");
180 err = -EIO; 191 err = -EIO;
181 goto out; 192 goto out;
182 } 193 }
183 if (HFSPLUS_SB(sb).hidden_dir && 194 if (HFSPLUS_SB(sb)->hidden_dir &&
184 HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) 195 HFSPLUS_SB(sb)->hidden_dir->i_ino ==
196 be32_to_cpu(entry.folder.id))
185 goto next; 197 goto next;
186 if (filldir(dirent, strbuf, len, filp->f_pos, 198 if (filldir(dirent, strbuf, len, filp->f_pos,
187 be32_to_cpu(entry.folder.id), DT_DIR)) 199 be32_to_cpu(entry.folder.id), DT_DIR))
@@ -200,7 +212,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
200 err = -EIO; 212 err = -EIO;
201 goto out; 213 goto out;
202 } 214 }
203 next: 215next:
204 filp->f_pos++; 216 filp->f_pos++;
205 if (filp->f_pos >= inode->i_size) 217 if (filp->f_pos >= inode->i_size)
206 goto out; 218 goto out;
@@ -217,7 +229,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
217 } 229 }
218 filp->private_data = rd; 230 filp->private_data = rd;
219 rd->file = filp; 231 rd->file = filp;
220 list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list); 232 list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
221 } 233 }
222 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); 234 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
223out: 235out:
@@ -229,38 +241,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
229{ 241{
230 struct hfsplus_readdir_data *rd = file->private_data; 242 struct hfsplus_readdir_data *rd = file->private_data;
231 if (rd) { 243 if (rd) {
244 mutex_lock(&inode->i_mutex);
232 list_del(&rd->list); 245 list_del(&rd->list);
246 mutex_unlock(&inode->i_mutex);
233 kfree(rd); 247 kfree(rd);
234 } 248 }
235 return 0; 249 return 0;
236} 250}
237 251
238static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
239 struct nameidata *nd)
240{
241 struct inode *inode;
242 int res;
243
244 inode = hfsplus_new_inode(dir->i_sb, mode);
245 if (!inode)
246 return -ENOSPC;
247
248 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
249 if (res) {
250 inode->i_nlink = 0;
251 hfsplus_delete_inode(inode);
252 iput(inode);
253 return res;
254 }
255 hfsplus_instantiate(dentry, inode, inode->i_ino);
256 mark_inode_dirty(inode);
257 return 0;
258}
259
260static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, 252static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
261 struct dentry *dst_dentry) 253 struct dentry *dst_dentry)
262{ 254{
263 struct super_block *sb = dst_dir->i_sb; 255 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
264 struct inode *inode = src_dentry->d_inode; 256 struct inode *inode = src_dentry->d_inode;
265 struct inode *src_dir = src_dentry->d_parent->d_inode; 257 struct inode *src_dir = src_dentry->d_parent->d_inode;
266 struct qstr str; 258 struct qstr str;
@@ -270,7 +262,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
270 262
271 if (HFSPLUS_IS_RSRC(inode)) 263 if (HFSPLUS_IS_RSRC(inode))
272 return -EPERM; 264 return -EPERM;
265 if (!S_ISREG(inode->i_mode))
266 return -EPERM;
273 267
268 mutex_lock(&sbi->vh_mutex);
274 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { 269 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
275 for (;;) { 270 for (;;) {
276 get_random_bytes(&id, sizeof(cnid)); 271 get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +274,42 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
279 str.len = sprintf(name, "iNode%d", id); 274 str.len = sprintf(name, "iNode%d", id);
280 res = hfsplus_rename_cat(inode->i_ino, 275 res = hfsplus_rename_cat(inode->i_ino,
281 src_dir, &src_dentry->d_name, 276 src_dir, &src_dentry->d_name,
282 HFSPLUS_SB(sb).hidden_dir, &str); 277 sbi->hidden_dir, &str);
283 if (!res) 278 if (!res)
284 break; 279 break;
285 if (res != -EEXIST) 280 if (res != -EEXIST)
286 return res; 281 goto out;
287 } 282 }
288 HFSPLUS_I(inode).dev = id; 283 HFSPLUS_I(inode)->linkid = id;
289 cnid = HFSPLUS_SB(sb).next_cnid++; 284 cnid = sbi->next_cnid++;
290 src_dentry->d_fsdata = (void *)(unsigned long)cnid; 285 src_dentry->d_fsdata = (void *)(unsigned long)cnid;
291 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); 286 res = hfsplus_create_cat(cnid, src_dir,
287 &src_dentry->d_name, inode);
292 if (res) 288 if (res)
293 /* panic? */ 289 /* panic? */
294 return res; 290 goto out;
295 HFSPLUS_SB(sb).file_count++; 291 sbi->file_count++;
296 } 292 }
297 cnid = HFSPLUS_SB(sb).next_cnid++; 293 cnid = sbi->next_cnid++;
298 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); 294 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
299 if (res) 295 if (res)
300 return res; 296 goto out;
301 297
302 inc_nlink(inode); 298 inc_nlink(inode);
303 hfsplus_instantiate(dst_dentry, inode, cnid); 299 hfsplus_instantiate(dst_dentry, inode, cnid);
304 atomic_inc(&inode->i_count); 300 ihold(inode);
305 inode->i_ctime = CURRENT_TIME_SEC; 301 inode->i_ctime = CURRENT_TIME_SEC;
306 mark_inode_dirty(inode); 302 mark_inode_dirty(inode);
307 HFSPLUS_SB(sb).file_count++; 303 sbi->file_count++;
308 sb->s_dirt = 1; 304 dst_dir->i_sb->s_dirt = 1;
309 305out:
310 return 0; 306 mutex_unlock(&sbi->vh_mutex);
307 return res;
311} 308}
312 309
313static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) 310static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
314{ 311{
315 struct super_block *sb = dir->i_sb; 312 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
316 struct inode *inode = dentry->d_inode; 313 struct inode *inode = dentry->d_inode;
317 struct qstr str; 314 struct qstr str;
318 char name[32]; 315 char name[32];
@@ -322,21 +319,24 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
322 if (HFSPLUS_IS_RSRC(inode)) 319 if (HFSPLUS_IS_RSRC(inode))
323 return -EPERM; 320 return -EPERM;
324 321
322 mutex_lock(&sbi->vh_mutex);
325 cnid = (u32)(unsigned long)dentry->d_fsdata; 323 cnid = (u32)(unsigned long)dentry->d_fsdata;
326 if (inode->i_ino == cnid && 324 if (inode->i_ino == cnid &&
327 atomic_read(&HFSPLUS_I(inode).opencnt)) { 325 atomic_read(&HFSPLUS_I(inode)->opencnt)) {
328 str.name = name; 326 str.name = name;
329 str.len = sprintf(name, "temp%lu", inode->i_ino); 327 str.len = sprintf(name, "temp%lu", inode->i_ino);
330 res = hfsplus_rename_cat(inode->i_ino, 328 res = hfsplus_rename_cat(inode->i_ino,
331 dir, &dentry->d_name, 329 dir, &dentry->d_name,
332 HFSPLUS_SB(sb).hidden_dir, &str); 330 sbi->hidden_dir, &str);
333 if (!res) 331 if (!res) {
334 inode->i_flags |= S_DEAD; 332 inode->i_flags |= S_DEAD;
335 return res; 333 drop_nlink(inode);
334 }
335 goto out;
336 } 336 }
337 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); 337 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
338 if (res) 338 if (res)
339 return res; 339 goto out;
340 340
341 if (inode->i_nlink > 0) 341 if (inode->i_nlink > 0)
342 drop_nlink(inode); 342 drop_nlink(inode);
@@ -344,10 +344,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
344 clear_nlink(inode); 344 clear_nlink(inode);
345 if (!inode->i_nlink) { 345 if (!inode->i_nlink) {
346 if (inode->i_ino != cnid) { 346 if (inode->i_ino != cnid) {
347 HFSPLUS_SB(sb).file_count--; 347 sbi->file_count--;
348 if (!atomic_read(&HFSPLUS_I(inode).opencnt)) { 348 if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
349 res = hfsplus_delete_cat(inode->i_ino, 349 res = hfsplus_delete_cat(inode->i_ino,
350 HFSPLUS_SB(sb).hidden_dir, 350 sbi->hidden_dir,
351 NULL); 351 NULL);
352 if (!res) 352 if (!res)
353 hfsplus_delete_inode(inode); 353 hfsplus_delete_inode(inode);
@@ -356,107 +356,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
356 } else 356 } else
357 hfsplus_delete_inode(inode); 357 hfsplus_delete_inode(inode);
358 } else 358 } else
359 HFSPLUS_SB(sb).file_count--; 359 sbi->file_count--;
360 inode->i_ctime = CURRENT_TIME_SEC; 360 inode->i_ctime = CURRENT_TIME_SEC;
361 mark_inode_dirty(inode); 361 mark_inode_dirty(inode);
362 362out:
363 mutex_unlock(&sbi->vh_mutex);
363 return res; 364 return res;
364} 365}
365 366
366static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
367{
368 struct inode *inode;
369 int res;
370
371 inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
372 if (!inode)
373 return -ENOSPC;
374
375 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
376 if (res) {
377 inode->i_nlink = 0;
378 hfsplus_delete_inode(inode);
379 iput(inode);
380 return res;
381 }
382 hfsplus_instantiate(dentry, inode, inode->i_ino);
383 mark_inode_dirty(inode);
384 return 0;
385}
386
387static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) 367static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
388{ 368{
389 struct inode *inode; 369 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
370 struct inode *inode = dentry->d_inode;
390 int res; 371 int res;
391 372
392 inode = dentry->d_inode;
393 if (inode->i_size != 2) 373 if (inode->i_size != 2)
394 return -ENOTEMPTY; 374 return -ENOTEMPTY;
375
376 mutex_lock(&sbi->vh_mutex);
395 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); 377 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
396 if (res) 378 if (res)
397 return res; 379 goto out;
398 clear_nlink(inode); 380 clear_nlink(inode);
399 inode->i_ctime = CURRENT_TIME_SEC; 381 inode->i_ctime = CURRENT_TIME_SEC;
400 hfsplus_delete_inode(inode); 382 hfsplus_delete_inode(inode);
401 mark_inode_dirty(inode); 383 mark_inode_dirty(inode);
402 return 0; 384out:
385 mutex_unlock(&sbi->vh_mutex);
386 return res;
403} 387}
404 388
405static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, 389static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
406 const char *symname) 390 const char *symname)
407{ 391{
408 struct super_block *sb; 392 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
409 struct inode *inode; 393 struct inode *inode;
410 int res; 394 int res = -ENOSPC;
411 395
412 sb = dir->i_sb; 396 mutex_lock(&sbi->vh_mutex);
413 inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO); 397 inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
414 if (!inode) 398 if (!inode)
415 return -ENOSPC; 399 goto out;
416 400
417 res = page_symlink(inode, symname, strlen(symname) + 1); 401 res = page_symlink(inode, symname, strlen(symname) + 1);
418 if (res) { 402 if (res)
419 inode->i_nlink = 0; 403 goto out_err;
420 hfsplus_delete_inode(inode);
421 iput(inode);
422 return res;
423 }
424 404
425 mark_inode_dirty(inode);
426 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 405 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
406 if (res)
407 goto out_err;
427 408
428 if (!res) { 409 hfsplus_instantiate(dentry, inode, inode->i_ino);
429 hfsplus_instantiate(dentry, inode, inode->i_ino); 410 mark_inode_dirty(inode);
430 mark_inode_dirty(inode); 411 goto out;
431 }
432 412
413out_err:
414 inode->i_nlink = 0;
415 hfsplus_delete_inode(inode);
416 iput(inode);
417out:
418 mutex_unlock(&sbi->vh_mutex);
433 return res; 419 return res;
434} 420}
435 421
436static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, 422static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
437 int mode, dev_t rdev) 423 int mode, dev_t rdev)
438{ 424{
439 struct super_block *sb; 425 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
440 struct inode *inode; 426 struct inode *inode;
441 int res; 427 int res = -ENOSPC;
442 428
443 sb = dir->i_sb; 429 mutex_lock(&sbi->vh_mutex);
444 inode = hfsplus_new_inode(sb, mode); 430 inode = hfsplus_new_inode(dir->i_sb, mode);
445 if (!inode) 431 if (!inode)
446 return -ENOSPC; 432 goto out;
433
434 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
435 init_special_inode(inode, mode, rdev);
447 436
448 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 437 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
449 if (res) { 438 if (res) {
450 inode->i_nlink = 0; 439 inode->i_nlink = 0;
451 hfsplus_delete_inode(inode); 440 hfsplus_delete_inode(inode);
452 iput(inode); 441 iput(inode);
453 return res; 442 goto out;
454 } 443 }
455 init_special_inode(inode, mode, rdev); 444
456 hfsplus_instantiate(dentry, inode, inode->i_ino); 445 hfsplus_instantiate(dentry, inode, inode->i_ino);
457 mark_inode_dirty(inode); 446 mark_inode_dirty(inode);
447out:
448 mutex_unlock(&sbi->vh_mutex);
449 return res;
450}
458 451
459 return 0; 452static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
453 struct nameidata *nd)
454{
455 return hfsplus_mknod(dir, dentry, mode, 0);
456}
457
458static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
459{
460 return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
460} 461}
461 462
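
The create paths above (link, unlink, rmdir, symlink, mknod, and the create/mkdir wrappers) all converge on one shape: take sbi->vh_mutex, do every fallible step under it, and leave through a single out: label so the unlock cannot be skipped. A compilable sketch of that shape under pthreads; new_inode_step()/catalog_step() are placeholders for the real allocation and catalog calls:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t vh_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *new_inode_step(void)	/* stand-in for hfsplus_new_inode() */
{
	static int dummy;
	return &dummy;			/* NULL would mean out of space */
}

static int catalog_step(void)		/* stand-in for hfsplus_create_cat() */
{
	return 0;
}

static int create_like(void)
{
	int res = -ENOSPC;

	pthread_mutex_lock(&vh_mutex);
	if (!new_inode_step())
		goto out;		/* res is still -ENOSPC */
	res = catalog_step();
out:
	pthread_mutex_unlock(&vh_mutex);
	return res;
}

int main(void)
{
	printf("create: %d\n", create_like());
	return 0;
}
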
462static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, 463static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
466 467
467 /* Unlink destination if it already exists */ 468 /* Unlink destination if it already exists */
468 if (new_dentry->d_inode) { 469 if (new_dentry->d_inode) {
469 res = hfsplus_unlink(new_dir, new_dentry); 470 if (S_ISDIR(new_dentry->d_inode->i_mode))
471 res = hfsplus_rmdir(new_dir, new_dentry);
472 else
473 res = hfsplus_unlink(new_dir, new_dentry);
470 if (res) 474 if (res)
471 return res; 475 return res;
472 } 476 }
@@ -492,6 +496,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
492}; 496};
493 497
494const struct file_operations hfsplus_dir_operations = { 498const struct file_operations hfsplus_dir_operations = {
499 .fsync = hfsplus_file_fsync,
495 .read = generic_read_dir, 500 .read = generic_read_dir,
496 .readdir = hfsplus_readdir, 501 .readdir = hfsplus_readdir,
497 .unlocked_ioctl = hfsplus_ioctl, 502 .unlocked_ioctl = hfsplus_ioctl,
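
The hfsplus_rename() fix above matters because unlinking a directory target must go through the rmdir path, which enforces -ENOTEMPTY and the directory-specific link accounting. The dispatch, reduced to its essence (callbacks here are hypothetical stand-ins):

#include <stdio.h>
#include <sys/stat.h>

static int remove_target(mode_t mode,
			 int (*do_rmdir)(void), int (*do_unlink)(void))
{
	return S_ISDIR(mode) ? do_rmdir() : do_unlink();
}

static int fake_rmdir(void)  { puts("rmdir path");  return 0; }
static int fake_unlink(void) { puts("unlink path"); return 0; }

int main(void)
{
	remove_target(S_IFDIR, fake_rmdir, fake_unlink);	/* rmdir */
	remove_target(S_IFREG, fake_rmdir, fake_unlink);	/* unlink */
	return 0;
}
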
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cda..b1991a2a08e0 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -83,37 +83,60 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
83 return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); 83 return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count);
84} 84}
85 85
86static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) 86static void __hfsplus_ext_write_extent(struct inode *inode,
87 struct hfs_find_data *fd)
87{ 88{
89 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
88 int res; 90 int res;
89 91
90 hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start, 92 WARN_ON(!mutex_is_locked(&hip->extents_lock));
91 HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 93
94 hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
95 HFSPLUS_IS_RSRC(inode) ?
96 HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
97
92 res = hfs_brec_find(fd); 98 res = hfs_brec_find(fd);
93 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) { 99 if (hip->extent_state & HFSPLUS_EXT_NEW) {
94 if (res != -ENOENT) 100 if (res != -ENOENT)
95 return; 101 return;
96 hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec)); 102 hfs_brec_insert(fd, hip->cached_extents,
97 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 103 sizeof(hfsplus_extent_rec));
104 hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
98 } else { 105 } else {
99 if (res) 106 if (res)
100 return; 107 return;
101 hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength); 108 hfs_bnode_write(fd->bnode, hip->cached_extents,
102 HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY; 109 fd->entryoffset, fd->entrylength);
110 hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
103 } 111 }
112
113 /*
114 * We can't just use hfsplus_mark_inode_dirty here, because we
115 * also get called from hfsplus_write_inode, which should not
 116 * redirty the inode. Instead the callers have to be careful
 117 * to explicitly mark the inode dirty, too.
118 */
119 set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
104} 120}
105 121
106void hfsplus_ext_write_extent(struct inode *inode) 122static void hfsplus_ext_write_extent_locked(struct inode *inode)
107{ 123{
108 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) { 124 if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
109 struct hfs_find_data fd; 125 struct hfs_find_data fd;
110 126
111 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 127 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
112 __hfsplus_ext_write_extent(inode, &fd); 128 __hfsplus_ext_write_extent(inode, &fd);
113 hfs_find_exit(&fd); 129 hfs_find_exit(&fd);
114 } 130 }
115} 131}
116 132
133void hfsplus_ext_write_extent(struct inode *inode)
134{
135 mutex_lock(&HFSPLUS_I(inode)->extents_lock);
136 hfsplus_ext_write_extent_locked(inode);
137 mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
138}
139
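
The split into hfsplus_ext_write_extent() and a _locked variant above follows the usual kernel convention: the _locked function assumes the caller holds extents_lock (asserted via the new WARN_ON(!mutex_is_locked(...))), while the plain-named wrapper takes and releases the lock itself. A minimal pthread rendition of the same convention:

#include <pthread.h>

static pthread_mutex_t extents_lock = PTHREAD_MUTEX_INITIALIZER;

/* "_locked" suffix: caller already holds extents_lock. */
static void write_extent_locked(void)
{
	/* ... flush the cached extent record to the btree ... */
}

static void write_extent(void)
{
	pthread_mutex_lock(&extents_lock);
	write_extent_locked();
	pthread_mutex_unlock(&extents_lock);
}

int main(void)
{
	write_extent();
	return 0;
}
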
117static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, 140static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
118 struct hfsplus_extent *extent, 141 struct hfsplus_extent *extent,
119 u32 cnid, u32 block, u8 type) 142 u32 cnid, u32 block, u8 type)
@@ -130,39 +153,48 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
130 return -ENOENT; 153 return -ENOENT;
131 if (fd->entrylength != sizeof(hfsplus_extent_rec)) 154 if (fd->entrylength != sizeof(hfsplus_extent_rec))
132 return -EIO; 155 return -EIO;
133 hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfsplus_extent_rec)); 156 hfs_bnode_read(fd->bnode, extent, fd->entryoffset,
157 sizeof(hfsplus_extent_rec));
134 return 0; 158 return 0;
135} 159}
136 160
137static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) 161static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd,
162 struct inode *inode, u32 block)
138{ 163{
164 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
139 int res; 165 int res;
140 166
141 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) 167 WARN_ON(!mutex_is_locked(&hip->extents_lock));
168
169 if (hip->extent_state & HFSPLUS_EXT_DIRTY)
142 __hfsplus_ext_write_extent(inode, fd); 170 __hfsplus_ext_write_extent(inode, fd);
143 171
144 res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino, 172 res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
145 block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 173 block, HFSPLUS_IS_RSRC(inode) ?
174 HFSPLUS_TYPE_RSRC :
175 HFSPLUS_TYPE_DATA);
146 if (!res) { 176 if (!res) {
147 HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block); 177 hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
148 HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents); 178 hip->cached_blocks =
179 hfsplus_ext_block_count(hip->cached_extents);
149 } else { 180 } else {
150 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 181 hip->cached_start = hip->cached_blocks = 0;
151 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 182 hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
152 } 183 }
153 return res; 184 return res;
154} 185}
155 186
156static int hfsplus_ext_read_extent(struct inode *inode, u32 block) 187static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
157{ 188{
189 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
158 struct hfs_find_data fd; 190 struct hfs_find_data fd;
159 int res; 191 int res;
160 192
161 if (block >= HFSPLUS_I(inode).cached_start && 193 if (block >= hip->cached_start &&
162 block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks) 194 block < hip->cached_start + hip->cached_blocks)
163 return 0; 195 return 0;
164 196
165 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 197 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
166 res = __hfsplus_ext_cache_extent(&fd, inode, block); 198 res = __hfsplus_ext_cache_extent(&fd, inode, block);
167 hfs_find_exit(&fd); 199 hfs_find_exit(&fd);
168 return res; 200 return res;
@@ -172,21 +204,22 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
172int hfsplus_get_block(struct inode *inode, sector_t iblock, 204int hfsplus_get_block(struct inode *inode, sector_t iblock,
173 struct buffer_head *bh_result, int create) 205 struct buffer_head *bh_result, int create)
174{ 206{
175 struct super_block *sb; 207 struct super_block *sb = inode->i_sb;
208 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
209 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
176 int res = -EIO; 210 int res = -EIO;
177 u32 ablock, dblock, mask; 211 u32 ablock, dblock, mask;
212 int was_dirty = 0;
178 int shift; 213 int shift;
179 214
180 sb = inode->i_sb;
181
182 /* Convert inode block to disk allocation block */ 215 /* Convert inode block to disk allocation block */
183 shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; 216 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
184 ablock = iblock >> HFSPLUS_SB(sb).fs_shift; 217 ablock = iblock >> sbi->fs_shift;
185 218
186 if (iblock >= HFSPLUS_I(inode).fs_blocks) { 219 if (iblock >= hip->fs_blocks) {
187 if (iblock > HFSPLUS_I(inode).fs_blocks || !create) 220 if (iblock > hip->fs_blocks || !create)
188 return -EIO; 221 return -EIO;
189 if (ablock >= HFSPLUS_I(inode).alloc_blocks) { 222 if (ablock >= hip->alloc_blocks) {
190 res = hfsplus_file_extend(inode); 223 res = hfsplus_file_extend(inode);
191 if (res) 224 if (res)
192 return res; 225 return res;
@@ -194,36 +227,46 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
194 } else 227 } else
195 create = 0; 228 create = 0;
196 229
197 if (ablock < HFSPLUS_I(inode).first_blocks) { 230 if (ablock < hip->first_blocks) {
198 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); 231 dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
199 goto done; 232 goto done;
200 } 233 }
201 234
202 if (inode->i_ino == HFSPLUS_EXT_CNID) 235 if (inode->i_ino == HFSPLUS_EXT_CNID)
203 return -EIO; 236 return -EIO;
204 237
205 mutex_lock(&HFSPLUS_I(inode).extents_lock); 238 mutex_lock(&hip->extents_lock);
239
240 /*
241 * hfsplus_ext_read_extent will write out a cached extent into
242 * the extents btree. In that case we may have to mark the inode
243 * dirty even for a pure read of an extent here.
244 */
245 was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY);
206 res = hfsplus_ext_read_extent(inode, ablock); 246 res = hfsplus_ext_read_extent(inode, ablock);
207 if (!res) { 247 if (res) {
208 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - 248 mutex_unlock(&hip->extents_lock);
209 HFSPLUS_I(inode).cached_start);
210 } else {
211 mutex_unlock(&HFSPLUS_I(inode).extents_lock);
212 return -EIO; 249 return -EIO;
213 } 250 }
214 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 251 dblock = hfsplus_ext_find_block(hip->cached_extents,
252 ablock - hip->cached_start);
253 mutex_unlock(&hip->extents_lock);
215 254
216done: 255done:
217 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); 256 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n",
218 mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; 257 inode->i_ino, (long long)iblock, dblock);
219 map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); 258 mask = (1 << sbi->fs_shift) - 1;
259 map_bh(bh_result, sb,
260 (dblock << sbi->fs_shift) + sbi->blockoffset +
261 (iblock & mask));
220 if (create) { 262 if (create) {
221 set_buffer_new(bh_result); 263 set_buffer_new(bh_result);
222 HFSPLUS_I(inode).phys_size += sb->s_blocksize; 264 hip->phys_size += sb->s_blocksize;
223 HFSPLUS_I(inode).fs_blocks++; 265 hip->fs_blocks++;
224 inode_add_bytes(inode, sb->s_blocksize); 266 inode_add_bytes(inode, sb->s_blocksize);
225 mark_inode_dirty(inode);
226 } 267 }
268 if (create || was_dirty)
269 mark_inode_dirty(inode);
227 return 0; 270 return 0;
228} 271}
229 272
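
The was_dirty bookkeeping in hfsplus_get_block() above exists because hfsplus_ext_read_extent() may flush the cached extent as a side effect, clearing the in-core EXT_DIRTY state that write-out still depends on, so even a pure read must re-mark the inode dirty afterwards. A toy model of that consumed-state hazard (all names are stand-ins):

#include <stdbool.h>
#include <stdio.h>

static bool ext_dirty = true;	/* a cached extent is pending write-out */

static void read_extent(void)
{
	if (ext_dirty)
		ext_dirty = false;	/* reading flushes the dirty cache */
}

int main(void)
{
	bool was_dirty = ext_dirty;

	read_extent();
	if (was_dirty)
		printf("re-mark inode dirty: flush consumed the flag\n");
	return 0;
}
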
@@ -306,7 +349,8 @@ found:
306 } 349 }
307} 350}
308 351
309int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw *fork, int type) 352int hfsplus_free_fork(struct super_block *sb, u32 cnid,
353 struct hfsplus_fork_raw *fork, int type)
310{ 354{
311 struct hfs_find_data fd; 355 struct hfs_find_data fd;
312 hfsplus_extent_rec ext_entry; 356 hfsplus_extent_rec ext_entry;
@@ -327,7 +371,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
327 if (total_blocks == blocks) 371 if (total_blocks == blocks)
328 return 0; 372 return 0;
329 373
330 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 374 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
331 do { 375 do {
332 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, 376 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
333 total_blocks, type); 377 total_blocks, type);
@@ -348,29 +392,34 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
348int hfsplus_file_extend(struct inode *inode) 392int hfsplus_file_extend(struct inode *inode)
349{ 393{
350 struct super_block *sb = inode->i_sb; 394 struct super_block *sb = inode->i_sb;
395 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
396 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
351 u32 start, len, goal; 397 u32 start, len, goal;
352 int res; 398 int res;
353 399
354 if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { 400 if (sbi->alloc_file->i_size * 8 <
355 // extend alloc file 401 sbi->total_blocks - sbi->free_blocks + 8) {
356 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, 402 /* extend alloc file */
357 HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); 403 printk(KERN_ERR "hfs: extend alloc file! "
404 "(%llu,%u,%u)\n",
405 sbi->alloc_file->i_size * 8,
406 sbi->total_blocks, sbi->free_blocks);
358 return -ENOSPC; 407 return -ENOSPC;
359 } 408 }
360 409
361 mutex_lock(&HFSPLUS_I(inode).extents_lock); 410 mutex_lock(&hip->extents_lock);
362 if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) 411 if (hip->alloc_blocks == hip->first_blocks)
363 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); 412 goal = hfsplus_ext_lastblock(hip->first_extents);
364 else { 413 else {
365 res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); 414 res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
366 if (res) 415 if (res)
367 goto out; 416 goto out;
368 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); 417 goal = hfsplus_ext_lastblock(hip->cached_extents);
369 } 418 }
370 419
371 len = HFSPLUS_I(inode).clump_blocks; 420 len = hip->clump_blocks;
372 start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); 421 start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
373 if (start >= HFSPLUS_SB(sb).total_blocks) { 422 if (start >= sbi->total_blocks) {
374 start = hfsplus_block_allocate(sb, goal, 0, &len); 423 start = hfsplus_block_allocate(sb, goal, 0, &len);
375 if (start >= goal) { 424 if (start >= goal) {
376 res = -ENOSPC; 425 res = -ENOSPC;
@@ -379,56 +428,56 @@ int hfsplus_file_extend(struct inode *inode)
379 } 428 }
380 429
381 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); 430 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
382 if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { 431
383 if (!HFSPLUS_I(inode).first_blocks) { 432 if (hip->alloc_blocks <= hip->first_blocks) {
433 if (!hip->first_blocks) {
384 dprint(DBG_EXTENT, "first extents\n"); 434 dprint(DBG_EXTENT, "first extents\n");
385 /* no extents yet */ 435 /* no extents yet */
386 HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); 436 hip->first_extents[0].start_block = cpu_to_be32(start);
387 HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); 437 hip->first_extents[0].block_count = cpu_to_be32(len);
388 res = 0; 438 res = 0;
389 } else { 439 } else {
390 /* try to append to extents in inode */ 440 /* try to append to extents in inode */
391 res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, 441 res = hfsplus_add_extent(hip->first_extents,
392 HFSPLUS_I(inode).alloc_blocks, 442 hip->alloc_blocks,
393 start, len); 443 start, len);
394 if (res == -ENOSPC) 444 if (res == -ENOSPC)
395 goto insert_extent; 445 goto insert_extent;
396 } 446 }
397 if (!res) { 447 if (!res) {
398 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 448 hfsplus_dump_extent(hip->first_extents);
399 HFSPLUS_I(inode).first_blocks += len; 449 hip->first_blocks += len;
400 } 450 }
401 } else { 451 } else {
402 res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, 452 res = hfsplus_add_extent(hip->cached_extents,
403 HFSPLUS_I(inode).alloc_blocks - 453 hip->alloc_blocks - hip->cached_start,
404 HFSPLUS_I(inode).cached_start,
405 start, len); 454 start, len);
406 if (!res) { 455 if (!res) {
407 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 456 hfsplus_dump_extent(hip->cached_extents);
408 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 457 hip->extent_state |= HFSPLUS_EXT_DIRTY;
409 HFSPLUS_I(inode).cached_blocks += len; 458 hip->cached_blocks += len;
410 } else if (res == -ENOSPC) 459 } else if (res == -ENOSPC)
411 goto insert_extent; 460 goto insert_extent;
412 } 461 }
413out: 462out:
414 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 463 mutex_unlock(&hip->extents_lock);
415 if (!res) { 464 if (!res) {
416 HFSPLUS_I(inode).alloc_blocks += len; 465 hip->alloc_blocks += len;
417 mark_inode_dirty(inode); 466 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
418 } 467 }
419 return res; 468 return res;
420 469
421insert_extent: 470insert_extent:
422 dprint(DBG_EXTENT, "insert new extent\n"); 471 dprint(DBG_EXTENT, "insert new extent\n");
423 hfsplus_ext_write_extent(inode); 472 hfsplus_ext_write_extent_locked(inode);
424 473
425 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 474 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
426 HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); 475 hip->cached_extents[0].start_block = cpu_to_be32(start);
427 HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); 476 hip->cached_extents[0].block_count = cpu_to_be32(len);
428 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 477 hfsplus_dump_extent(hip->cached_extents);
429 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; 478 hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW;
430 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; 479 hip->cached_start = hip->alloc_blocks;
431 HFSPLUS_I(inode).cached_blocks = len; 480 hip->cached_blocks = len;
432 481
433 res = 0; 482 res = 0;
434 goto out; 483 goto out;
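The -ENOSPC handled by the insert_extent label comes from hfsplus_add_extent(), defined earlier in extents.c and not shown in this hunk. A host-endian userspace paraphrase of its behaviour (the on-disk record stores eight big-endian runs; this is a sketch, not the kernel function):

	#include <errno.h>
	#include <stdint.h>

	struct run { uint32_t start_block, block_count; };

	/*
	 * Paraphrase of hfsplus_add_extent(): 'offset' is the block offset of
	 * the new allocation relative to the start of this 8-run record.  Grow
	 * the run the allocation is contiguous with, else take the next free
	 * slot; a full record forces the caller to spill into the extents
	 * btree (the insert_extent: path above).
	 */
	static int add_extent(struct run rec[8], uint32_t offset,
			      uint32_t start, uint32_t len)
	{
		int i;

		for (i = 0; i < 8; i++) {
			uint32_t count = rec[i].block_count;

			if (offset == count) {
				if (rec[i].start_block + count == start) {
					rec[i].block_count += len; /* contiguous */
					return 0;
				}
				if (i + 1 >= 8)
					return -ENOSPC;	/* record full */
				rec[i + 1].start_block = start;
				rec[i + 1].block_count = len;
				return 0;
			}
			if (offset < count)
				break;
			offset -= count;
		}
		return -EIO;	/* offset does not end on a run boundary */
	}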
@@ -437,13 +486,16 @@ insert_extent:
437void hfsplus_file_truncate(struct inode *inode) 486void hfsplus_file_truncate(struct inode *inode)
438{ 487{
439 struct super_block *sb = inode->i_sb; 488 struct super_block *sb = inode->i_sb;
489 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
440 struct hfs_find_data fd; 490 struct hfs_find_data fd;
441 u32 alloc_cnt, blk_cnt, start; 491 u32 alloc_cnt, blk_cnt, start;
442 int res; 492 int res;
443 493
444 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, 494 dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n",
445 (long long)HFSPLUS_I(inode).phys_size, inode->i_size); 495 inode->i_ino, (long long)hip->phys_size,
446 if (inode->i_size > HFSPLUS_I(inode).phys_size) { 496 inode->i_size);
497
498 if (inode->i_size > hip->phys_size) {
447 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
448 struct page *page; 500 struct page *page;
449 void *fsdata; 501 void *fsdata;
@@ -455,52 +507,55 @@ void hfsplus_file_truncate(struct inode *inode)
455 &page, &fsdata); 507 &page, &fsdata);
456 if (res) 508 if (res)
457 return; 509 return;
458 res = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); 510 res = pagecache_write_end(NULL, mapping, size,
511 0, 0, page, fsdata);
459 if (res < 0) 512 if (res < 0)
460 return; 513 return;
461 mark_inode_dirty(inode); 514 mark_inode_dirty(inode);
462 return; 515 return;
463 } else if (inode->i_size == HFSPLUS_I(inode).phys_size) 516 } else if (inode->i_size == hip->phys_size)
464 return; 517 return;
465 518
466 blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; 519 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
467 alloc_cnt = HFSPLUS_I(inode).alloc_blocks; 520 HFSPLUS_SB(sb)->alloc_blksz_shift;
521 alloc_cnt = hip->alloc_blocks;
468 if (blk_cnt == alloc_cnt) 522 if (blk_cnt == alloc_cnt)
469 goto out; 523 goto out;
470 524
471 mutex_lock(&HFSPLUS_I(inode).extents_lock); 525 mutex_lock(&hip->extents_lock);
472 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 526 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
473 while (1) { 527 while (1) {
474 if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { 528 if (alloc_cnt == hip->first_blocks) {
475 hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, 529 hfsplus_free_extents(sb, hip->first_extents,
476 alloc_cnt, alloc_cnt - blk_cnt); 530 alloc_cnt, alloc_cnt - blk_cnt);
477 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 531 hfsplus_dump_extent(hip->first_extents);
478 HFSPLUS_I(inode).first_blocks = blk_cnt; 532 hip->first_blocks = blk_cnt;
479 break; 533 break;
480 } 534 }
481 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); 535 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
482 if (res) 536 if (res)
483 break; 537 break;
484 start = HFSPLUS_I(inode).cached_start; 538 start = hip->cached_start;
485 hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, 539 hfsplus_free_extents(sb, hip->cached_extents,
486 alloc_cnt - start, alloc_cnt - blk_cnt); 540 alloc_cnt - start, alloc_cnt - blk_cnt);
487 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 541 hfsplus_dump_extent(hip->cached_extents);
488 if (blk_cnt > start) { 542 if (blk_cnt > start) {
489 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 543 hip->extent_state |= HFSPLUS_EXT_DIRTY;
490 break; 544 break;
491 } 545 }
492 alloc_cnt = start; 546 alloc_cnt = start;
493 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 547 hip->cached_start = hip->cached_blocks = 0;
494 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 548 hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
495 hfs_brec_remove(&fd); 549 hfs_brec_remove(&fd);
496 } 550 }
497 hfs_find_exit(&fd); 551 hfs_find_exit(&fd);
498 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 552 mutex_unlock(&hip->extents_lock);
499 553
500 HFSPLUS_I(inode).alloc_blocks = blk_cnt; 554 hip->alloc_blocks = blk_cnt;
501out: 555out:
502 HFSPLUS_I(inode).phys_size = inode->i_size; 556 hip->phys_size = inode->i_size;
503 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 557 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
504 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 558 sb->s_blocksize_bits;
505 mark_inode_dirty(inode); 559 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
560 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
506} 561}
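The blk_cnt computation in the truncate path rounds i_size up to whole allocation blocks before the freeing loop walks the extent records backwards. A quick worked example, assuming 4096-byte allocation blocks (alloc_blksz_shift = 12):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long i_size = 10000;	/* bytes */
		unsigned int alloc_blksz = 4096;
		unsigned int alloc_blksz_shift = 12;

		/* same rounding as hfsplus_file_truncate() */
		unsigned long long blk_cnt =
			(i_size + alloc_blksz - 1) >> alloc_blksz_shift;

		printf("%llu bytes -> %llu allocation blocks\n", i_size, blk_cnt);
		/* 10000 -> 3: every block past the third is freed */
		return 0;
	}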
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dc856be3c2b0..d6857523336d 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -23,13 +23,16 @@
23#define DBG_EXTENT 0x00000020 23#define DBG_EXTENT 0x00000020
24#define DBG_BITMAP 0x00000040 24#define DBG_BITMAP 0x00000040
25 25
26//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) 26#if 0
27//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) 27#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
28//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) 28#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
29#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
30#endif
29#define DBG_MASK (0) 31#define DBG_MASK (0)
30 32
31#define dprint(flg, fmt, args...) \ 33#define dprint(flg, fmt, args...) \
32 if (flg & DBG_MASK) printk(fmt , ## args) 34 if (flg & DBG_MASK) \
35 printk(fmt , ## args)
33 36
34/* Runtime config options */ 37/* Runtime config options */
35#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */ 38#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */
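One caveat the reformatting above does not address: dprint() still expands to a bare if, so an else placed after a dprint() call would bind to the macro's if rather than the surrounding one. The conventional do-while(0) hardening, shown only as an editorial sketch and not as part of this patch:

	#define dprint(flg, fmt, args...)		\
	do {						\
		if ((flg) & DBG_MASK)			\
			printk(fmt , ## args);		\
	} while (0)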
@@ -37,7 +40,8 @@
37#define HFSPLUS_TYPE_DATA 0x00 40#define HFSPLUS_TYPE_DATA 0x00
38#define HFSPLUS_TYPE_RSRC 0xFF 41#define HFSPLUS_TYPE_RSRC 0xFF
39 42
40typedef int (*btree_keycmp)(const hfsplus_btree_key *, const hfsplus_btree_key *); 43typedef int (*btree_keycmp)(const hfsplus_btree_key *,
44 const hfsplus_btree_key *);
41 45
42#define NODE_HASH_SIZE 256 46#define NODE_HASH_SIZE 256
43 47
@@ -61,8 +65,7 @@ struct hfs_btree {
61 unsigned int max_key_len; 65 unsigned int max_key_len;
62 unsigned int depth; 66 unsigned int depth;
63 67
64 //unsigned int map1_size, map_size; 68 struct mutex tree_lock;
65 struct semaphore tree_lock;
66 69
67 unsigned int pages_per_bnode; 70 unsigned int pages_per_bnode;
68 spinlock_t hash_lock; 71 spinlock_t hash_lock;
@@ -107,8 +110,8 @@ struct hfsplus_vh;
107struct hfs_btree; 110struct hfs_btree;
108 111
109struct hfsplus_sb_info { 112struct hfsplus_sb_info {
110 struct buffer_head *s_vhbh;
111 struct hfsplus_vh *s_vhdr; 113 struct hfsplus_vh *s_vhdr;
114 struct hfsplus_vh *s_backup_vhdr;
112 struct hfs_btree *ext_tree; 115 struct hfs_btree *ext_tree;
113 struct hfs_btree *cat_tree; 116 struct hfs_btree *cat_tree;
114 struct hfs_btree *attr_tree; 117 struct hfs_btree *attr_tree;
@@ -118,19 +121,25 @@ struct hfsplus_sb_info {
118 121
119 /* Runtime variables */ 122 /* Runtime variables */
120 u32 blockoffset; 123 u32 blockoffset;
121 u32 sect_count; 124 sector_t part_start;
125 sector_t sect_count;
122 int fs_shift; 126 int fs_shift;
123 127
124 /* Stuff in host order from Vol Header */ 128 /* immutable data from the volume header */
125 u32 alloc_blksz; 129 u32 alloc_blksz;
126 int alloc_blksz_shift; 130 int alloc_blksz_shift;
127 u32 total_blocks; 131 u32 total_blocks;
132 u32 data_clump_blocks, rsrc_clump_blocks;
133
134 /* mutable data from the volume header, protected by alloc_mutex */
128 u32 free_blocks; 135 u32 free_blocks;
129 u32 next_alloc; 136 struct mutex alloc_mutex;
137
138 /* mutable data from the volume header, protected by vh_mutex */
130 u32 next_cnid; 139 u32 next_cnid;
131 u32 file_count; 140 u32 file_count;
132 u32 folder_count; 141 u32 folder_count;
133 u32 data_clump_blocks, rsrc_clump_blocks; 142 struct mutex vh_mutex;
134 143
135 /* Config options */ 144 /* Config options */
136 u32 creator; 145 u32 creator;
@@ -143,49 +152,92 @@ struct hfsplus_sb_info {
143 int part, session; 152 int part, session;
144 153
145 unsigned long flags; 154 unsigned long flags;
146
147 struct hlist_head rsrc_inodes;
148}; 155};
149 156
150#define HFSPLUS_SB_WRITEBACKUP 0x0001 157#define HFSPLUS_SB_WRITEBACKUP 0
151#define HFSPLUS_SB_NODECOMPOSE 0x0002 158#define HFSPLUS_SB_NODECOMPOSE 1
152#define HFSPLUS_SB_FORCE 0x0004 159#define HFSPLUS_SB_FORCE 2
153#define HFSPLUS_SB_HFSX 0x0008 160#define HFSPLUS_SB_HFSX 3
154#define HFSPLUS_SB_CASEFOLD 0x0010 161#define HFSPLUS_SB_CASEFOLD 4
162#define HFSPLUS_SB_NOBARRIER 5
163
164static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
165{
166 return sb->s_fs_info;
167}
155 168
156 169
157struct hfsplus_inode_info { 170struct hfsplus_inode_info {
158 struct mutex extents_lock; 171 atomic_t opencnt;
159 u32 clump_blocks, alloc_blocks; 172
160 sector_t fs_blocks; 173 /*
161 /* Allocation extents from catalog record or volume header */ 174 * Extent allocation information, protected by extents_lock.
162 hfsplus_extent_rec first_extents; 175 */
163 u32 first_blocks; 176 u32 first_blocks;
177 u32 clump_blocks;
178 u32 alloc_blocks;
179 u32 cached_start;
180 u32 cached_blocks;
181 hfsplus_extent_rec first_extents;
164 hfsplus_extent_rec cached_extents; 182 hfsplus_extent_rec cached_extents;
165 u32 cached_start, cached_blocks; 183 unsigned int extent_state;
166 atomic_t opencnt; 184 struct mutex extents_lock;
167 185
186 /*
187 * Immutable data.
188 */
168 struct inode *rsrc_inode; 189 struct inode *rsrc_inode;
169 unsigned long flags;
170
171 __be32 create_date; 190 __be32 create_date;
172 /* Device number in hfsplus_permissions in catalog */
173 u32 dev;
174 /* BSD system and user file flags */
175 u8 rootflags;
176 u8 userflags;
177 191
192 /*
193 * Protected by sbi->vh_mutex.
194 */
195 u32 linkid;
196
197 /*
198 * Accessed using atomic bitops.
199 */
200 unsigned long flags;
201
202 /*
203 * Protected by i_mutex.
204 */
205 sector_t fs_blocks;
206 u8 userflags; /* BSD user file flags */
178 struct list_head open_dir_list; 207 struct list_head open_dir_list;
179 loff_t phys_size; 208 loff_t phys_size;
209
180 struct inode vfs_inode; 210 struct inode vfs_inode;
181}; 211};
182 212
183#define HFSPLUS_FLG_RSRC 0x0001 213#define HFSPLUS_EXT_DIRTY 0x0001
184#define HFSPLUS_FLG_EXT_DIRTY 0x0002 214#define HFSPLUS_EXT_NEW 0x0002
185#define HFSPLUS_FLG_EXT_NEW 0x0004
186 215
187#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) 216#define HFSPLUS_I_RSRC 0 /* represents a resource fork */
188#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) 217#define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */
218#define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */
219#define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */
220
221#define HFSPLUS_IS_RSRC(inode) \
222 test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
223
224static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
225{
226 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
227}
228
229/*
230 * Mark an inode dirty, and also mark the btree in which the
231 * specific type of metadata is stored.
232 * For data or metadata that gets written back into the catalog btree
233 * by hfsplus_write_inode a plain mark_inode_dirty call is enough.
234 */
235static inline void hfsplus_mark_inode_dirty(struct inode *inode,
236 unsigned int flag)
237{
238 set_bit(flag, &HFSPLUS_I(inode)->flags);
239 mark_inode_dirty(inode);
240}
189 241
190struct hfs_find_data { 242struct hfs_find_data {
191 /* filled by caller */ 243 /* filled by caller */
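Note that the HFSPLUS_SB_* constants above change meaning, not just value: the old constants were masks OR-ed into sbi->flags, the new ones are bit numbers for the atomic bitops. A sketch of how call sites change under that convention (the function name is hypothetical; the types come from the header above):

	#include <linux/bitops.h>

	static void flags_example(struct hfsplus_sb_info *sbi)
	{
		/* old: mask arithmetic, non-atomic
		 *	sbi->flags |= HFSPLUS_SB_NODECOMPOSE;	(0x0002, a mask)
		 *	if (sbi->flags & HFSPLUS_SB_HFSX) ...
		 */

		/* new: bit indices with atomic bitops */
		set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);	/* bit 1 */
		if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags))	/* bit 3 */
			clear_bit(HFSPLUS_SB_CASEFOLD, &sbi->flags);
	}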
@@ -303,14 +355,18 @@ int hfs_brec_read(struct hfs_find_data *, void *, int);
303int hfs_brec_goto(struct hfs_find_data *, int); 355int hfs_brec_goto(struct hfs_find_data *, int);
304 356
305/* catalog.c */ 357/* catalog.c */
306int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); 358int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *,
307int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); 359 const hfsplus_btree_key *);
308void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *, u32, struct qstr *); 360int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *,
361 const hfsplus_btree_key *);
362void hfsplus_cat_build_key(struct super_block *sb,
363 hfsplus_btree_key *, u32, struct qstr *);
309int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *); 364int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *);
310int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); 365int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
311int hfsplus_delete_cat(u32, struct inode *, struct qstr *); 366int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
312int hfsplus_rename_cat(u32, struct inode *, struct qstr *, 367int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
313 struct inode *, struct qstr *); 368 struct inode *, struct qstr *);
369void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
314 370
315/* dir.c */ 371/* dir.c */
316extern const struct inode_operations hfsplus_dir_inode_operations; 372extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -320,7 +376,8 @@ extern const struct file_operations hfsplus_dir_operations;
320int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); 376int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
321void hfsplus_ext_write_extent(struct inode *); 377void hfsplus_ext_write_extent(struct inode *);
322int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); 378int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int);
323int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int); 379int hfsplus_free_fork(struct super_block *, u32,
380 struct hfsplus_fork_raw *, int);
324int hfsplus_file_extend(struct inode *); 381int hfsplus_file_extend(struct inode *);
325void hfsplus_file_truncate(struct inode *); 382void hfsplus_file_truncate(struct inode *);
326 383
@@ -335,6 +392,7 @@ int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
335int hfsplus_cat_write_inode(struct inode *); 392int hfsplus_cat_write_inode(struct inode *);
336struct inode *hfsplus_new_inode(struct super_block *, int); 393struct inode *hfsplus_new_inode(struct super_block *, int);
337void hfsplus_delete_inode(struct inode *); 394void hfsplus_delete_inode(struct inode *);
395int hfsplus_file_fsync(struct file *file, int datasync);
338 396
339/* ioctl.c */ 397/* ioctl.c */
340long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 398long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
@@ -346,6 +404,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
346 404
347/* options.c */ 405/* options.c */
348int hfsplus_parse_options(char *, struct hfsplus_sb_info *); 406int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
407int hfsplus_parse_options_remount(char *input, int *force);
349void hfsplus_fill_defaults(struct hfsplus_sb_info *); 408void hfsplus_fill_defaults(struct hfsplus_sb_info *);
350int hfsplus_show_options(struct seq_file *, struct vfsmount *); 409int hfsplus_show_options(struct seq_file *, struct vfsmount *);
351 410
@@ -359,56 +418,26 @@ extern u16 hfsplus_decompose_table[];
359extern u16 hfsplus_compose_table[]; 418extern u16 hfsplus_compose_table[];
360 419
361/* unicode.c */ 420/* unicode.c */
362int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); 421int hfsplus_strcasecmp(const struct hfsplus_unistr *,
363int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); 422 const struct hfsplus_unistr *);
364int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); 423int hfsplus_strcmp(const struct hfsplus_unistr *,
365int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); 424 const struct hfsplus_unistr *);
366int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); 425int hfsplus_uni2asc(struct super_block *,
367int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2); 426 const struct hfsplus_unistr *, char *, int *);
427int hfsplus_asc2uni(struct super_block *,
428 struct hfsplus_unistr *, const char *, int);
429int hfsplus_hash_dentry(const struct dentry *dentry,
430 const struct inode *inode, struct qstr *str);
431int hfsplus_compare_dentry(const struct dentry *parent,
432 const struct inode *pinode,
433 const struct dentry *dentry, const struct inode *inode,
434 unsigned int len, const char *str, const struct qstr *name);
368 435
369/* wrapper.c */ 436/* wrapper.c */
370int hfsplus_read_wrapper(struct super_block *); 437int hfsplus_read_wrapper(struct super_block *);
371
372int hfs_part_find(struct super_block *, sector_t *, sector_t *); 438int hfs_part_find(struct super_block *, sector_t *, sector_t *);
373 439int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
374/* access macros */ 440 void *data, int rw);
375/*
376static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
377{
378 return sb->s_fs_info;
379}
380static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
381{
382 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
383}
384*/
385#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info)
386#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
387
388#if 1
389#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); })
390#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; })
391#else
392#define hfsplus_kmap(p) kmap(p)
393#define hfsplus_kunmap(p) kunmap(p)
394#endif
395
396#define sb_bread512(sb, sec, data) ({ \
397 struct buffer_head *__bh; \
398 sector_t __block; \
399 loff_t __start; \
400 int __offset; \
401 \
402 __start = (loff_t)(sec) << HFSPLUS_SECTOR_SHIFT;\
403 __block = __start >> (sb)->s_blocksize_bits; \
404 __offset = __start & ((sb)->s_blocksize - 1); \
405 __bh = sb_bread((sb), __block); \
406 if (likely(__bh != NULL)) \
407 data = (void *)(__bh->b_data + __offset);\
408 else \
409 data = NULL; \
410 __bh; \
411})
412 441
413/* time macros */ 442/* time macros */
414#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) 443#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U)
@@ -419,6 +448,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
419#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) 448#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
420#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) 449#define hfsp_now2mt() __hfsp_ut2mt(get_seconds())
421 450
422#define kdev_t_to_nr(x) (x)
423
424#endif 451#endif
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61a..927cdd6d5bf5 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -36,7 +36,8 @@
36#define HFSP_WRAPOFF_EMBEDSIG 0x7C 36#define HFSP_WRAPOFF_EMBEDSIG 0x7C
37#define HFSP_WRAPOFF_EMBEDEXT 0x7E 37#define HFSP_WRAPOFF_EMBEDEXT 0x7E
38 38
39#define HFSP_HIDDENDIR_NAME "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" 39#define HFSP_HIDDENDIR_NAME \
40 "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
40 41
41#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ 42#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */
42#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ 43#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */
@@ -200,6 +201,7 @@ struct hfsplus_cat_key {
200 struct hfsplus_unistr name; 201 struct hfsplus_unistr name;
201} __packed; 202} __packed;
202 203
204#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key))
203 205
204/* Structs from hfs.h */ 206/* Structs from hfs.h */
205struct hfsp_point { 207struct hfsp_point {
@@ -323,7 +325,7 @@ struct hfsplus_ext_key {
323 __be32 start_block; 325 __be32 start_block;
324} __packed; 326} __packed;
325 327
326#define HFSPLUS_EXT_KEYLEN 12 328#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key)
327 329
328/* HFS+ generic BTree key */ 330/* HFS+ generic BTree key */
329typedef union { 331typedef union {
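Replacing the literal 12 with sizeof() is safe because hfsplus_ext_key is __packed, so its in-memory size equals the on-disk key length (2 + 1 + 1 + 4 + 4 bytes, assuming the usual key_len/fork_type/pad/cnid/start_block layout). A compile-time sanity check in the kernel idiom, placed in any function body; a sketch, not patch code:

	static void __init ext_key_size_check(void)
	{
		/* 2 (key_len) + 1 (fork_type) + 1 (pad) + 4 (cnid) + 4 (start_block) */
		BUILD_BUG_ON(sizeof(struct hfsplus_ext_key) != 12);
		BUILD_BUG_ON(HFSPLUS_EXT_KEYLEN != 12);
	}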
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c5a979d62c65..a8df651747f0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -8,6 +8,7 @@
8 * Inode handling routines 8 * Inode handling routines
9 */ 9 */
10 10
11#include <linux/blkdev.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
@@ -36,7 +37,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
36 *pagep = NULL; 37 *pagep = NULL;
37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 38 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
38 hfsplus_get_block, 39 hfsplus_get_block,
39 &HFSPLUS_I(mapping->host).phys_size); 40 &HFSPLUS_I(mapping->host)->phys_size);
40 if (unlikely(ret)) { 41 if (unlikely(ret)) {
41 loff_t isize = mapping->host->i_size; 42 loff_t isize = mapping->host->i_size;
42 if (pos + len > isize) 43 if (pos + len > isize)
@@ -62,13 +63,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
62 63
63 switch (inode->i_ino) { 64 switch (inode->i_ino) {
64 case HFSPLUS_EXT_CNID: 65 case HFSPLUS_EXT_CNID:
65 tree = HFSPLUS_SB(sb).ext_tree; 66 tree = HFSPLUS_SB(sb)->ext_tree;
66 break; 67 break;
67 case HFSPLUS_CAT_CNID: 68 case HFSPLUS_CAT_CNID:
68 tree = HFSPLUS_SB(sb).cat_tree; 69 tree = HFSPLUS_SB(sb)->cat_tree;
69 break; 70 break;
70 case HFSPLUS_ATTR_CNID: 71 case HFSPLUS_ATTR_CNID:
71 tree = HFSPLUS_SB(sb).attr_tree; 72 tree = HFSPLUS_SB(sb)->attr_tree;
72 break; 73 break;
73 default: 74 default:
74 BUG(); 75 BUG();
@@ -77,7 +78,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
77 if (!tree) 78 if (!tree)
78 return 0; 79 return 0;
79 if (tree->node_size >= PAGE_CACHE_SIZE) { 80 if (tree->node_size >= PAGE_CACHE_SIZE) {
80 nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); 81 nidx = page->index >>
82 (tree->node_size_shift - PAGE_CACHE_SHIFT);
81 spin_lock(&tree->hash_lock); 83 spin_lock(&tree->hash_lock);
82 node = hfs_bnode_findhash(tree, nidx); 84 node = hfs_bnode_findhash(tree, nidx);
83 if (!node) 85 if (!node)
@@ -90,7 +92,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
90 } 92 }
91 spin_unlock(&tree->hash_lock); 93 spin_unlock(&tree->hash_lock);
92 } else { 94 } else {
93 nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); 95 nidx = page->index <<
96 (PAGE_CACHE_SHIFT - tree->node_size_shift);
94 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); 97 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
95 spin_lock(&tree->hash_lock); 98 spin_lock(&tree->hash_lock);
96 do { 99 do {
@@ -166,18 +169,19 @@ const struct dentry_operations hfsplus_dentry_operations = {
166 .d_compare = hfsplus_compare_dentry, 169 .d_compare = hfsplus_compare_dentry,
167}; 170};
168 171
169static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry, 172static struct dentry *hfsplus_file_lookup(struct inode *dir,
170 struct nameidata *nd) 173 struct dentry *dentry, struct nameidata *nd)
171{ 174{
172 struct hfs_find_data fd; 175 struct hfs_find_data fd;
173 struct super_block *sb = dir->i_sb; 176 struct super_block *sb = dir->i_sb;
174 struct inode *inode = NULL; 177 struct inode *inode = NULL;
178 struct hfsplus_inode_info *hip;
175 int err; 179 int err;
176 180
177 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 181 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
178 goto out; 182 goto out;
179 183
180 inode = HFSPLUS_I(dir).rsrc_inode; 184 inode = HFSPLUS_I(dir)->rsrc_inode;
181 if (inode) 185 if (inode)
182 goto out; 186 goto out;
183 187
@@ -185,12 +189,15 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
185 if (!inode) 189 if (!inode)
186 return ERR_PTR(-ENOMEM); 190 return ERR_PTR(-ENOMEM);
187 191
192 hip = HFSPLUS_I(inode);
188 inode->i_ino = dir->i_ino; 193 inode->i_ino = dir->i_ino;
189 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 194 INIT_LIST_HEAD(&hip->open_dir_list);
190 mutex_init(&HFSPLUS_I(inode).extents_lock); 195 mutex_init(&hip->extents_lock);
191 HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; 196 hip->extent_state = 0;
197 hip->flags = 0;
198 set_bit(HFSPLUS_I_RSRC, &hip->flags);
192 199
193 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 200 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
194 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 201 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
195 if (!err) 202 if (!err)
196 err = hfsplus_cat_read_inode(inode, &fd); 203 err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,42 +206,48 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
199 iput(inode); 206 iput(inode);
200 return ERR_PTR(err); 207 return ERR_PTR(err);
201 } 208 }
202 HFSPLUS_I(inode).rsrc_inode = dir; 209 hip->rsrc_inode = dir;
203 HFSPLUS_I(dir).rsrc_inode = inode; 210 HFSPLUS_I(dir)->rsrc_inode = inode;
204 igrab(dir); 211 igrab(dir);
205 hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); 212
213 /*
214 * __mark_inode_dirty expects inodes to be hashed. Since we don't
215 * want resource fork inodes in the regular inode space, we make them
216 * appear hashed, but do not put them on any lists. hlist_del()
217 * will work fine and require no locking.
218 */
219 hlist_add_fake(&inode->i_hash);
220
206 mark_inode_dirty(inode); 221 mark_inode_dirty(inode);
207out: 222out:
208 d_add(dentry, inode); 223 d_add(dentry, inode);
209 return NULL; 224 return NULL;
210} 225}
211 226
212static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) 227static void hfsplus_get_perms(struct inode *inode,
228 struct hfsplus_perm *perms, int dir)
213{ 229{
214 struct super_block *sb = inode->i_sb; 230 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
215 u16 mode; 231 u16 mode;
216 232
217 mode = be16_to_cpu(perms->mode); 233 mode = be16_to_cpu(perms->mode);
218 234
219 inode->i_uid = be32_to_cpu(perms->owner); 235 inode->i_uid = be32_to_cpu(perms->owner);
220 if (!inode->i_uid && !mode) 236 if (!inode->i_uid && !mode)
221 inode->i_uid = HFSPLUS_SB(sb).uid; 237 inode->i_uid = sbi->uid;
222 238
223 inode->i_gid = be32_to_cpu(perms->group); 239 inode->i_gid = be32_to_cpu(perms->group);
224 if (!inode->i_gid && !mode) 240 if (!inode->i_gid && !mode)
225 inode->i_gid = HFSPLUS_SB(sb).gid; 241 inode->i_gid = sbi->gid;
226 242
227 if (dir) { 243 if (dir) {
228 mode = mode ? (mode & S_IALLUGO) : 244 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
229 (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
230 mode |= S_IFDIR; 245 mode |= S_IFDIR;
231 } else if (!mode) 246 } else if (!mode)
232 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & 247 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
233 ~(HFSPLUS_SB(sb).umask));
234 inode->i_mode = mode; 248 inode->i_mode = mode;
235 249
236 HFSPLUS_I(inode).rootflags = perms->rootflags; 250 HFSPLUS_I(inode)->userflags = perms->userflags;
237 HFSPLUS_I(inode).userflags = perms->userflags;
238 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) 251 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
239 inode->i_flags |= S_IMMUTABLE; 252 inode->i_flags |= S_IMMUTABLE;
240 else 253 else
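The hlist_add_fake() trick in the lookup hunk above works because __mark_inode_dirty() only asks whether the inode is hashed, i.e. whether i_hash.pprev is non-NULL. The helper, from <linux/list.h>, reproduced here for reference:

	/* node claims to be hashed but sits on no shared list */
	static inline void hlist_add_fake(struct hlist_node *n)
	{
		n->pprev = &n->next;
	}

A later hlist_del() then unlinks through the self-referential pprev without touching any shared list head, which is why no locking is needed.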
@@ -245,30 +258,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
245 inode->i_flags &= ~S_APPEND; 258 inode->i_flags &= ~S_APPEND;
246} 259}
247 260
248static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
249{
250 if (inode->i_flags & S_IMMUTABLE)
251 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
252 else
253 perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
254 if (inode->i_flags & S_APPEND)
255 perms->rootflags |= HFSPLUS_FLG_APPEND;
256 else
257 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
258 perms->userflags = HFSPLUS_I(inode).userflags;
259 perms->mode = cpu_to_be16(inode->i_mode);
260 perms->owner = cpu_to_be32(inode->i_uid);
261 perms->group = cpu_to_be32(inode->i_gid);
262 perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
263}
264
265static int hfsplus_file_open(struct inode *inode, struct file *file) 261static int hfsplus_file_open(struct inode *inode, struct file *file)
266{ 262{
267 if (HFSPLUS_IS_RSRC(inode)) 263 if (HFSPLUS_IS_RSRC(inode))
268 inode = HFSPLUS_I(inode).rsrc_inode; 264 inode = HFSPLUS_I(inode)->rsrc_inode;
269 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 265 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
270 return -EOVERFLOW; 266 return -EOVERFLOW;
271 atomic_inc(&HFSPLUS_I(inode).opencnt); 267 atomic_inc(&HFSPLUS_I(inode)->opencnt);
272 return 0; 268 return 0;
273} 269}
274 270
@@ -277,12 +273,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
277 struct super_block *sb = inode->i_sb; 273 struct super_block *sb = inode->i_sb;
278 274
279 if (HFSPLUS_IS_RSRC(inode)) 275 if (HFSPLUS_IS_RSRC(inode))
280 inode = HFSPLUS_I(inode).rsrc_inode; 276 inode = HFSPLUS_I(inode)->rsrc_inode;
281 if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { 277 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
282 mutex_lock(&inode->i_mutex); 278 mutex_lock(&inode->i_mutex);
283 hfsplus_file_truncate(inode); 279 hfsplus_file_truncate(inode);
284 if (inode->i_flags & S_DEAD) { 280 if (inode->i_flags & S_DEAD) {
285 hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); 281 hfsplus_delete_cat(inode->i_ino,
282 HFSPLUS_SB(sb)->hidden_dir, NULL);
286 hfsplus_delete_inode(inode); 283 hfsplus_delete_inode(inode);
287 } 284 }
288 mutex_unlock(&inode->i_mutex); 285 mutex_unlock(&inode->i_mutex);
@@ -311,29 +308,41 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
311 return 0; 308 return 0;
312} 309}
313 310
314static int hfsplus_file_fsync(struct file *filp, int datasync) 311int hfsplus_file_fsync(struct file *file, int datasync)
315{ 312{
316 struct inode *inode = filp->f_mapping->host; 313 struct inode *inode = file->f_mapping->host;
317 struct super_block * sb; 314 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
318 int ret, err; 315 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
319 316 int error = 0, error2;
320 /* sync the inode to buffers */ 317
321 ret = write_inode_now(inode, 0); 318 /*
322 319 * Sync inode metadata into the catalog and extent trees.
323 /* sync the superblock to buffers */ 320 */
324 sb = inode->i_sb; 321 sync_inode_metadata(inode, 1);
325 if (sb->s_dirt) { 322
326 if (!(sb->s_flags & MS_RDONLY)) 323 /*
327 hfsplus_sync_fs(sb, 1); 324 * And explicitly write out the btrees.
328 else 325 */
329 sb->s_dirt = 0; 326 if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
327 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
328
329 if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
330 error2 =
331 filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
332 if (!error)
333 error = error2;
330 } 334 }
331 335
332 /* .. finally sync the buffers to disk */ 336 if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
333 err = sync_blockdev(sb->s_bdev); 337 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
334 if (!ret) 338 if (!error)
335 ret = err; 339 error = error2;
336 return ret; 340 }
341
342 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
343 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
344
345 return error;
337} 346}
338 347
339static const struct inode_operations hfsplus_file_inode_operations = { 348static const struct inode_operations hfsplus_file_inode_operations = {
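Two idioms in the new fsync are worth calling out: the error/error2 dance preserves the first failure while still flushing every dirty tree, and the final cache flush is skipped under the new nobarrier mount option (see the options.c hunk further down). A condensed sketch of the pattern, with the btree mappings passed in as parameters for illustration:

	static int fsync_sketch(struct hfsplus_inode_info *hip,
				struct hfsplus_sb_info *sbi,
				struct super_block *sb,
				struct address_space *cat_map,
				struct address_space *ext_map)
	{
		int error = 0, error2;

		if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
			error = filemap_write_and_wait(cat_map);
		if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
			error2 = filemap_write_and_wait(ext_map);
			if (!error)
				error = error2;	/* keep the first failure */
		}
		/* "nobarrier" trades power-fail safety for fewer cache flushes */
		if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
			blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
		return error;
	}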
@@ -346,7 +355,7 @@ static const struct inode_operations hfsplus_file_inode_operations = {
346}; 355};
347 356
348static const struct file_operations hfsplus_file_operations = { 357static const struct file_operations hfsplus_file_operations = {
349 .llseek = generic_file_llseek, 358 .llseek = generic_file_llseek,
350 .read = do_sync_read, 359 .read = do_sync_read,
351 .aio_read = generic_file_aio_read, 360 .aio_read = generic_file_aio_read,
352 .write = do_sync_write, 361 .write = do_sync_write,
@@ -361,47 +370,53 @@ static const struct file_operations hfsplus_file_operations = {
361 370
362struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 371struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
363{ 372{
373 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
364 struct inode *inode = new_inode(sb); 374 struct inode *inode = new_inode(sb);
375 struct hfsplus_inode_info *hip;
376
365 if (!inode) 377 if (!inode)
366 return NULL; 378 return NULL;
367 379
368 inode->i_ino = HFSPLUS_SB(sb).next_cnid++; 380 inode->i_ino = sbi->next_cnid++;
369 inode->i_mode = mode; 381 inode->i_mode = mode;
370 inode->i_uid = current_fsuid(); 382 inode->i_uid = current_fsuid();
371 inode->i_gid = current_fsgid(); 383 inode->i_gid = current_fsgid();
372 inode->i_nlink = 1; 384 inode->i_nlink = 1;
373 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 385 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
374 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 386
375 mutex_init(&HFSPLUS_I(inode).extents_lock); 387 hip = HFSPLUS_I(inode);
376 atomic_set(&HFSPLUS_I(inode).opencnt, 0); 388 INIT_LIST_HEAD(&hip->open_dir_list);
377 HFSPLUS_I(inode).flags = 0; 389 mutex_init(&hip->extents_lock);
378 memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); 390 atomic_set(&hip->opencnt, 0);
379 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 391 hip->extent_state = 0;
380 HFSPLUS_I(inode).alloc_blocks = 0; 392 hip->flags = 0;
381 HFSPLUS_I(inode).first_blocks = 0; 393 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
382 HFSPLUS_I(inode).cached_start = 0; 394 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
383 HFSPLUS_I(inode).cached_blocks = 0; 395 hip->alloc_blocks = 0;
384 HFSPLUS_I(inode).phys_size = 0; 396 hip->first_blocks = 0;
385 HFSPLUS_I(inode).fs_blocks = 0; 397 hip->cached_start = 0;
386 HFSPLUS_I(inode).rsrc_inode = NULL; 398 hip->cached_blocks = 0;
399 hip->phys_size = 0;
400 hip->fs_blocks = 0;
401 hip->rsrc_inode = NULL;
387 if (S_ISDIR(inode->i_mode)) { 402 if (S_ISDIR(inode->i_mode)) {
388 inode->i_size = 2; 403 inode->i_size = 2;
389 HFSPLUS_SB(sb).folder_count++; 404 sbi->folder_count++;
390 inode->i_op = &hfsplus_dir_inode_operations; 405 inode->i_op = &hfsplus_dir_inode_operations;
391 inode->i_fop = &hfsplus_dir_operations; 406 inode->i_fop = &hfsplus_dir_operations;
392 } else if (S_ISREG(inode->i_mode)) { 407 } else if (S_ISREG(inode->i_mode)) {
393 HFSPLUS_SB(sb).file_count++; 408 sbi->file_count++;
394 inode->i_op = &hfsplus_file_inode_operations; 409 inode->i_op = &hfsplus_file_inode_operations;
395 inode->i_fop = &hfsplus_file_operations; 410 inode->i_fop = &hfsplus_file_operations;
396 inode->i_mapping->a_ops = &hfsplus_aops; 411 inode->i_mapping->a_ops = &hfsplus_aops;
397 HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; 412 hip->clump_blocks = sbi->data_clump_blocks;
398 } else if (S_ISLNK(inode->i_mode)) { 413 } else if (S_ISLNK(inode->i_mode)) {
399 HFSPLUS_SB(sb).file_count++; 414 sbi->file_count++;
400 inode->i_op = &page_symlink_inode_operations; 415 inode->i_op = &page_symlink_inode_operations;
401 inode->i_mapping->a_ops = &hfsplus_aops; 416 inode->i_mapping->a_ops = &hfsplus_aops;
402 HFSPLUS_I(inode).clump_blocks = 1; 417 hip->clump_blocks = 1;
403 } else 418 } else
404 HFSPLUS_SB(sb).file_count++; 419 sbi->file_count++;
405 insert_inode_hash(inode); 420 insert_inode_hash(inode);
406 mark_inode_dirty(inode); 421 mark_inode_dirty(inode);
407 sb->s_dirt = 1; 422 sb->s_dirt = 1;
@@ -414,11 +429,11 @@ void hfsplus_delete_inode(struct inode *inode)
414 struct super_block *sb = inode->i_sb; 429 struct super_block *sb = inode->i_sb;
415 430
416 if (S_ISDIR(inode->i_mode)) { 431 if (S_ISDIR(inode->i_mode)) {
417 HFSPLUS_SB(sb).folder_count--; 432 HFSPLUS_SB(sb)->folder_count--;
418 sb->s_dirt = 1; 433 sb->s_dirt = 1;
419 return; 434 return;
420 } 435 }
421 HFSPLUS_SB(sb).file_count--; 436 HFSPLUS_SB(sb)->file_count--;
422 if (S_ISREG(inode->i_mode)) { 437 if (S_ISREG(inode->i_mode)) {
423 if (!inode->i_nlink) { 438 if (!inode->i_nlink) {
424 inode->i_size = 0; 439 inode->i_size = 0;
@@ -434,34 +449,40 @@ void hfsplus_delete_inode(struct inode *inode)
434void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 449void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
435{ 450{
436 struct super_block *sb = inode->i_sb; 451 struct super_block *sb = inode->i_sb;
452 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
453 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
437 u32 count; 454 u32 count;
438 int i; 455 int i;
439 456
440 memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, 457 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
441 sizeof(hfsplus_extent_rec));
442 for (count = 0, i = 0; i < 8; i++) 458 for (count = 0, i = 0; i < 8; i++)
443 count += be32_to_cpu(fork->extents[i].block_count); 459 count += be32_to_cpu(fork->extents[i].block_count);
444 HFSPLUS_I(inode).first_blocks = count; 460 hip->first_blocks = count;
445 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 461 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
446 HFSPLUS_I(inode).cached_start = 0; 462 hip->cached_start = 0;
447 HFSPLUS_I(inode).cached_blocks = 0; 463 hip->cached_blocks = 0;
448 464
449 HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); 465 hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
450 inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); 466 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
451 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 467 hip->fs_blocks =
452 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 468 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
453 HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; 469 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
454 if (!HFSPLUS_I(inode).clump_blocks) 470 hip->clump_blocks =
455 HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : 471 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
456 HFSPLUS_SB(sb).data_clump_blocks; 472 if (!hip->clump_blocks) {
473 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
474 sbi->rsrc_clump_blocks :
475 sbi->data_clump_blocks;
476 }
457} 477}
458 478
459void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 479void hfsplus_inode_write_fork(struct inode *inode,
480 struct hfsplus_fork_raw *fork)
460{ 481{
461 memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, 482 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
462 sizeof(hfsplus_extent_rec)); 483 sizeof(hfsplus_extent_rec));
463 fork->total_size = cpu_to_be64(inode->i_size); 484 fork->total_size = cpu_to_be64(inode->i_size);
464 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); 485 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
465} 486}
466 487
467int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) 488int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
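The clump conversion in hfsplus_inode_read_fork() turns the fork's per-file clump size (bytes) into allocation blocks, falling back to the volume-wide data or resource default when the catalog records zero. A worked example with a 65536-byte clump on a volume with 4096-byte allocation blocks:

	#include <stdio.h>

	int main(void)
	{
		unsigned int clump_size = 65536;	/* from the fork, in bytes */
		unsigned int alloc_blksz_shift = 12;	/* 4096-byte blocks */
		unsigned int clump_blocks = clump_size >> alloc_blksz_shift;

		if (!clump_blocks)
			clump_blocks = 4;	/* stand-in for the sbi defaults */
		printf("extend in chunks of %u blocks\n", clump_blocks);  /* 16 */
		return 0;
	}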
@@ -472,7 +493,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
472 493
473 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 494 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
474 495
475 HFSPLUS_I(inode).dev = 0; 496 HFSPLUS_I(inode)->linkid = 0;
476 if (type == HFSPLUS_FOLDER) { 497 if (type == HFSPLUS_FOLDER) {
477 struct hfsplus_cat_folder *folder = &entry.folder; 498 struct hfsplus_cat_folder *folder = &entry.folder;
478 499
@@ -486,8 +507,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
486 inode->i_atime = hfsp_mt2ut(folder->access_date); 507 inode->i_atime = hfsp_mt2ut(folder->access_date);
487 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 508 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
488 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 509 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
489 HFSPLUS_I(inode).create_date = folder->create_date; 510 HFSPLUS_I(inode)->create_date = folder->create_date;
490 HFSPLUS_I(inode).fs_blocks = 0; 511 HFSPLUS_I(inode)->fs_blocks = 0;
491 inode->i_op = &hfsplus_dir_inode_operations; 512 inode->i_op = &hfsplus_dir_inode_operations;
492 inode->i_fop = &hfsplus_dir_operations; 513 inode->i_fop = &hfsplus_dir_operations;
493 } else if (type == HFSPLUS_FILE) { 514 } else if (type == HFSPLUS_FILE) {
@@ -498,13 +519,14 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
498 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, 519 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
499 sizeof(struct hfsplus_cat_file)); 520 sizeof(struct hfsplus_cat_file));
500 521
501 hfsplus_inode_read_fork(inode, HFSPLUS_IS_DATA(inode) ? 522 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
502 &file->data_fork : &file->rsrc_fork); 523 &file->rsrc_fork : &file->data_fork);
503 hfsplus_get_perms(inode, &file->permissions, 0); 524 hfsplus_get_perms(inode, &file->permissions, 0);
504 inode->i_nlink = 1; 525 inode->i_nlink = 1;
505 if (S_ISREG(inode->i_mode)) { 526 if (S_ISREG(inode->i_mode)) {
506 if (file->permissions.dev) 527 if (file->permissions.dev)
507 inode->i_nlink = be32_to_cpu(file->permissions.dev); 528 inode->i_nlink =
529 be32_to_cpu(file->permissions.dev);
508 inode->i_op = &hfsplus_file_inode_operations; 530 inode->i_op = &hfsplus_file_inode_operations;
509 inode->i_fop = &hfsplus_file_operations; 531 inode->i_fop = &hfsplus_file_operations;
510 inode->i_mapping->a_ops = &hfsplus_aops; 532 inode->i_mapping->a_ops = &hfsplus_aops;
@@ -518,7 +540,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
518 inode->i_atime = hfsp_mt2ut(file->access_date); 540 inode->i_atime = hfsp_mt2ut(file->access_date);
519 inode->i_mtime = hfsp_mt2ut(file->content_mod_date); 541 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
520 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); 542 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
521 HFSPLUS_I(inode).create_date = file->create_date; 543 HFSPLUS_I(inode)->create_date = file->create_date;
522 } else { 544 } else {
523 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); 545 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
524 res = -EIO; 546 res = -EIO;
@@ -533,12 +555,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
533 hfsplus_cat_entry entry; 555 hfsplus_cat_entry entry;
534 556
535 if (HFSPLUS_IS_RSRC(inode)) 557 if (HFSPLUS_IS_RSRC(inode))
536 main_inode = HFSPLUS_I(inode).rsrc_inode; 558 main_inode = HFSPLUS_I(inode)->rsrc_inode;
537 559
538 if (!main_inode->i_nlink) 560 if (!main_inode->i_nlink)
539 return 0; 561 return 0;
540 562
541 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) 563 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
542 /* panic? */ 564 /* panic? */
543 return -EIO; 565 return -EIO;
544 566
@@ -554,7 +576,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
554 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
555 sizeof(struct hfsplus_cat_folder)); 577 sizeof(struct hfsplus_cat_folder));
556 /* simple node checks? */ 578 /* simple node checks? */
557 hfsplus_set_perms(inode, &folder->permissions); 579 hfsplus_cat_set_perms(inode, &folder->permissions);
558 folder->access_date = hfsp_ut2mt(inode->i_atime); 580 folder->access_date = hfsp_ut2mt(inode->i_atime);
559 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 581 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
560 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 582 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,12 +598,10 @@ int hfsplus_cat_write_inode(struct inode *inode)
576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 598 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
577 sizeof(struct hfsplus_cat_file)); 599 sizeof(struct hfsplus_cat_file));
578 hfsplus_inode_write_fork(inode, &file->data_fork); 600 hfsplus_inode_write_fork(inode, &file->data_fork);
579 if (S_ISREG(inode->i_mode)) 601 hfsplus_cat_set_perms(inode, &file->permissions);
580 HFSPLUS_I(inode).dev = inode->i_nlink; 602 if (HFSPLUS_FLG_IMMUTABLE &
581 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 603 (file->permissions.rootflags |
582 HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev); 604 file->permissions.userflags))
583 hfsplus_set_perms(inode, &file->permissions);
584 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
585 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 605 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
586 else 606 else
587 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); 607 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
@@ -591,6 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
591 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 611 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
592 sizeof(struct hfsplus_cat_file)); 612 sizeof(struct hfsplus_cat_file));
593 } 613 }
614
615 set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
594out: 616out:
595 hfs_find_exit(&fd); 617 hfs_find_exit(&fd);
596 return 0; 618 return 0;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f099026..508ce662ce12 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 23static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
25{ 24{
26 struct inode *inode = filp->f_path.dentry->d_inode; 25 struct inode *inode = file->f_path.dentry->d_inode;
26 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags = 0;
28
29 if (inode->i_flags & S_IMMUTABLE)
30 flags |= FS_IMMUTABLE_FL;
31 if (inode->i_flags & S_APPEND)
32 flags |= FS_APPEND_FL;
33 if (hip->userflags & HFSPLUS_FLG_NODUMP)
34 flags |= FS_NODUMP_FL;
35
36 return put_user(flags, user_flags);
37}
38
39static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
40{
41 struct inode *inode = file->f_path.dentry->d_inode;
42 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags; 43 unsigned int flags;
44 int err = 0;
28 45
29 lock_kernel(); 46 err = mnt_want_write(file->f_path.mnt);
30 switch (cmd) { 47 if (err)
31 case HFSPLUS_IOC_EXT2_GETFLAGS: 48 goto out;
32 flags = 0;
33 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
34 flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
35 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
36 flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
37 if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
38 flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
39 return put_user(flags, (int __user *)arg);
40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
41 int err = 0;
42 err = mnt_want_write(filp->f_path.mnt);
43 if (err) {
44 unlock_kernel();
45 return err;
46 }
47 49
48 if (!is_owner_or_cap(inode)) { 50 if (!is_owner_or_cap(inode)) {
49 err = -EACCES; 51 err = -EACCES;
50 goto setflags_out; 52 goto out_drop_write;
51 } 53 }
52 if (get_user(flags, (int __user *)arg)) {
53 err = -EFAULT;
54 goto setflags_out;
55 }
56 if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
57 HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
58 if (!capable(CAP_LINUX_IMMUTABLE)) {
59 err = -EPERM;
60 goto setflags_out;
61 }
62 }
63 54
64 /* don't silently ignore unsupported ext2 flags */ 55 if (get_user(flags, user_flags)) {
65 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { 56 err = -EFAULT;
66 err = -EOPNOTSUPP; 57 goto out_drop_write;
67 goto setflags_out; 58 }
68 } 59
69 if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ 60 mutex_lock(&inode->i_mutex);
70 inode->i_flags |= S_IMMUTABLE; 61
71 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; 62 if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
72 } else { 63 inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
73 inode->i_flags &= ~S_IMMUTABLE; 64 if (!capable(CAP_LINUX_IMMUTABLE)) {
74 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; 65 err = -EPERM;
75 } 66 goto out_unlock_inode;
76 if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
77 inode->i_flags |= S_APPEND;
78 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
79 } else {
80 inode->i_flags &= ~S_APPEND;
81 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
82 } 67 }
83 if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
84 HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
85 else
86 HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
87
88 inode->i_ctime = CURRENT_TIME_SEC;
89 mark_inode_dirty(inode);
90setflags_out:
91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
93 return err;
94 } 68 }
69
70 /* don't silently ignore unsupported ext2 flags */
71 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
72 err = -EOPNOTSUPP;
73 goto out_unlock_inode;
74 }
75
76 if (flags & FS_IMMUTABLE_FL)
77 inode->i_flags |= S_IMMUTABLE;
78 else
79 inode->i_flags &= ~S_IMMUTABLE;
80
81 if (flags & FS_APPEND_FL)
82 inode->i_flags |= S_APPEND;
83 else
84 inode->i_flags &= ~S_APPEND;
85
86 if (flags & FS_NODUMP_FL)
87 hip->userflags |= HFSPLUS_FLG_NODUMP;
88 else
89 hip->userflags &= ~HFSPLUS_FLG_NODUMP;
90
91 inode->i_ctime = CURRENT_TIME_SEC;
92 mark_inode_dirty(inode);
93
94out_unlock_inode:
95 mutex_unlock(&inode->i_mutex);
96out_drop_write:
97 mnt_drop_write(file->f_path.mnt);
98out:
99 return err;
100}
101
102long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
103{
104 void __user *argp = (void __user *)arg;
105
106 switch (cmd) {
107 case HFSPLUS_IOC_EXT2_GETFLAGS:
108 return hfsplus_ioctl_getflags(file, argp);
109 case HFSPLUS_IOC_EXT2_SETFLAGS:
110 return hfsplus_ioctl_setflags(file, argp);
95 default: 111 default:
96 unlock_kernel();
97 return -ENOTTY; 112 return -ENOTTY;
98 } 113 }
99} 114}
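The two helpers split out above implement the ext2-compatible flags ABI, so a plain userspace caller can exercise them; only the IMMUTABLE, APPEND and NODUMP bits are honoured per the code above. A sketch, assuming the usual aliasing of HFSPLUS_IOC_EXT2_GETFLAGS/SETFLAGS to the generic FS_IOC_GETFLAGS/SETFLAGS numbers:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		int fd, flags;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
			perror("getflags");
			return 1;
		}
		flags |= FS_NODUMP_FL;		/* supported per the code above */
		if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
			perror("setflags");	/* EOPNOTSUPP for foreign bits */
		return 0;
	}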
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
110 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) 125 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
111 return -EOPNOTSUPP; 126 return -EOPNOTSUPP;
112 127
113 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 128 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
114 if (res) 129 if (res)
115 return res; 130 return res;
116 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 131 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -132,9 +147,11 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
132 res = -ERANGE; 147 res = -ERANGE;
133 } else 148 } else
134 res = -EOPNOTSUPP; 149 res = -EOPNOTSUPP;
135 if (!res) 150 if (!res) {
136 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 151 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
137 sizeof(struct hfsplus_cat_file)); 152 sizeof(struct hfsplus_cat_file));
153 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
154 }
138out: 155out:
139 hfs_find_exit(&fd); 156 hfs_find_exit(&fd);
140 return res; 157 return res;
@@ -153,7 +170,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
153 return -EOPNOTSUPP; 170 return -EOPNOTSUPP;
154 171
155 if (size) { 172 if (size) {
156 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 173 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
157 if (res) 174 if (res)
158 return res; 175 return res;
159 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 176 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +194,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
177 } else 194 } else
178 res = size ? -ERANGE : 4; 195 res = size ? -ERANGE : 4;
179 } else 196 } else
180 res = -ENODATA; 197 res = -EOPNOTSUPP;
181out: 198out:
182 if (size) 199 if (size)
183 hfs_find_exit(&fd); 200 hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07d..bb62a5882147 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -23,6 +23,7 @@ enum {
23 opt_umask, opt_uid, opt_gid, 23 opt_umask, opt_uid, opt_gid,
24 opt_part, opt_session, opt_nls, 24 opt_part, opt_session, opt_nls,
25 opt_nodecompose, opt_decompose, 25 opt_nodecompose, opt_decompose,
26 opt_barrier, opt_nobarrier,
26 opt_force, opt_err 27 opt_force, opt_err
27}; 28};
28 29
@@ -37,6 +38,8 @@ static const match_table_t tokens = {
37 { opt_nls, "nls=%s" }, 38 { opt_nls, "nls=%s" },
38 { opt_decompose, "decompose" }, 39 { opt_decompose, "decompose" },
39 { opt_nodecompose, "nodecompose" }, 40 { opt_nodecompose, "nodecompose" },
41 { opt_barrier, "barrier" },
42 { opt_nobarrier, "nobarrier" },
40 { opt_force, "force" }, 43 { opt_force, "force" },
41 { opt_err, NULL } 44 { opt_err, NULL }
42}; 45};
@@ -65,6 +68,32 @@ static inline int match_fourchar(substring_t *arg, u32 *result)
65 return 0; 68 return 0;
66} 69}
67 70
71int hfsplus_parse_options_remount(char *input, int *force)
72{
73 char *p;
74 substring_t args[MAX_OPT_ARGS];
75 int token;
76
77 if (!input)
 78 return 1;
79
80 while ((p = strsep(&input, ",")) != NULL) {
81 if (!*p)
82 continue;
83
84 token = match_token(p, tokens, args);
85 switch (token) {
86 case opt_force:
87 *force = 1;
88 break;
89 default:
90 break;
91 }
92 }
93
94 return 1;
95}
96
68/* Parse options from mount. Returns 0 on failure */ 97/* Parse options from mount. Returns 0 on failure */
69/* input is the options passed to mount() as a string */ 98/* input is the options passed to mount() as a string */
70int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) 99int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
@@ -136,20 +165,28 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
136 if (p) 165 if (p)
137 sbi->nls = load_nls(p); 166 sbi->nls = load_nls(p);
138 if (!sbi->nls) { 167 if (!sbi->nls) {
139 printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p); 168 printk(KERN_ERR "hfs: unable to load "
169 "nls mapping \"%s\"\n",
170 p);
140 kfree(p); 171 kfree(p);
141 return 0; 172 return 0;
142 } 173 }
143 kfree(p); 174 kfree(p);
144 break; 175 break;
145 case opt_decompose: 176 case opt_decompose:
146 sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; 177 clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
147 break; 178 break;
148 case opt_nodecompose: 179 case opt_nodecompose:
149 sbi->flags |= HFSPLUS_SB_NODECOMPOSE; 180 set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
181 break;
182 case opt_barrier:
183 clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
184 break;
185 case opt_nobarrier:
186 set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
150 break; 187 break;
151 case opt_force: 188 case opt_force:
152 sbi->flags |= HFSPLUS_SB_FORCE; 189 set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
153 break; 190 break;
154 default: 191 default:
155 return 0; 192 return 0;
@@ -171,20 +208,23 @@ done:
171 208
172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) 209int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
173{ 210{
174 struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); 211 struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
175 212
176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE) 213 if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 214 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
178 if (sbi->type != HFSPLUS_DEF_CR_TYPE) 215 if (sbi->type != HFSPLUS_DEF_CR_TYPE)
179 seq_printf(seq, ",type=%.4s", (char *)&sbi->type); 216 seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
180 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid); 217 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
218 sbi->uid, sbi->gid);
181 if (sbi->part >= 0) 219 if (sbi->part >= 0)
182 seq_printf(seq, ",part=%u", sbi->part); 220 seq_printf(seq, ",part=%u", sbi->part);
183 if (sbi->session >= 0) 221 if (sbi->session >= 0)
184 seq_printf(seq, ",session=%u", sbi->session); 222 seq_printf(seq, ",session=%u", sbi->session);
185 if (sbi->nls) 223 if (sbi->nls)
186 seq_printf(seq, ",nls=%s", sbi->nls->charset); 224 seq_printf(seq, ",nls=%s", sbi->nls->charset);
187 if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) 225 if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
188 seq_printf(seq, ",nodecompose"); 226 seq_printf(seq, ",nodecompose");
227 if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
228 seq_printf(seq, ",nobarrier");
189 return 0; 229 return 0;
190} 230}
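
Aside: hfsplus_parse_options_remount() deliberately recognizes only "force" and skips everything else, since a remount must not reinterpret mount-time options such as nls=. A userspace sketch of the same strsep() scan (match_token() is kernel-only, so strcmp() stands in for it here):

/* remount_opts.c - illustrative sketch of the strsep() option scan */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

static int parse_remount(char *input, int *force)
{
	char *p;

	if (!input)
		return 1;	/* no options at all is fine */

	while ((p = strsep(&input, ",")) != NULL) {
		if (!*p)
			continue;	/* skip empty ",," segments */
		if (!strcmp(p, "force"))
			*force = 1;
		/* every other option is ignored on remount */
	}
	return 1;
}

int main(void)
{
	char opts[] = "nodecompose,force,nls=utf8";
	int force = 0;

	parse_remount(opts, &force);
	printf("force=%d\n", force);	/* force=1 */
	return 0;
}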
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd0299..40ad88c12c64 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -2,7 +2,8 @@
2 * linux/fs/hfsplus/part_tbl.c 2 * linux/fs/hfsplus/part_tbl.c
3 * 3 *
4 * Copyright (C) 1996-1997 Paul H. Hargrove 4 * Copyright (C) 1996-1997 Paul H. Hargrove
5 * This file may be distributed under the terms of the GNU General Public License. 5 * This file may be distributed under the terms of
6 * the GNU General Public License.
6 * 7 *
7 * Original code to handle the new style Mac partition table based on 8 * Original code to handle the new style Mac partition table based on
8 * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). 9 * a patch contributed by Holger Schemel (aeglos@valinor.owl.de).
@@ -13,6 +14,7 @@
13 * 14 *
14 */ 15 */
15 16
17#include <linux/slab.h>
16#include "hfsplus_fs.h" 18#include "hfsplus_fs.h"
17 19
18/* offsets to various blocks */ 20/* offsets to various blocks */
@@ -58,76 +60,94 @@ struct new_pmap {
58 */ 60 */
59struct old_pmap { 61struct old_pmap {
60 __be16 pdSig; /* Signature bytes */ 62 __be16 pdSig; /* Signature bytes */
61 struct old_pmap_entry { 63 struct old_pmap_entry {
62 __be32 pdStart; 64 __be32 pdStart;
63 __be32 pdSize; 65 __be32 pdSize;
64 __be32 pdFSID; 66 __be32 pdFSID;
65 } pdEntry[42]; 67 } pdEntry[42];
66} __packed; 68} __packed;
67 69
70static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm,
71 sector_t *part_start, sector_t *part_size)
72{
73 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
74 int i;
75
76 for (i = 0; i < 42; i++) {
77 struct old_pmap_entry *p = &pm->pdEntry[i];
78
79 if (p->pdStart && p->pdSize &&
80 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
81 (sbi->part < 0 || sbi->part == i)) {
82 *part_start += be32_to_cpu(p->pdStart);
83 *part_size = be32_to_cpu(p->pdSize);
84 return 0;
85 }
86 }
87
88 return -ENOENT;
89}
90
91static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm,
92 sector_t *part_start, sector_t *part_size)
93{
94 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
95 int size = be32_to_cpu(pm->pmMapBlkCnt);
96 int res;
97 int i = 0;
98
99 do {
100 if (!memcmp(pm->pmPartType, "Apple_HFS", 9) &&
101 (sbi->part < 0 || sbi->part == i)) {
102 *part_start += be32_to_cpu(pm->pmPyPartStart);
103 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
104 return 0;
105 }
106
107 if (++i >= size)
108 return -ENOENT;
109
110 res = hfsplus_submit_bio(sb->s_bdev,
111 *part_start + HFS_PMAP_BLK + i,
112 pm, READ);
113 if (res)
114 return res;
115 } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC));
116
117 return -ENOENT;
118}
119
68/* 120/*
 69 * hfs_part_find() 121 * Parse the partition map looking for the start and length of an
70 * 122 * HFS/HFS+ partition.
71 * Parse the partition map looking for the
72 * start and length of the 'part'th HFS partition.
73 */ 123 */
74int hfs_part_find(struct super_block *sb, 124int hfs_part_find(struct super_block *sb,
75 sector_t *part_start, sector_t *part_size) 125 sector_t *part_start, sector_t *part_size)
76{ 126{
77 struct buffer_head *bh; 127 void *data;
78 __be16 *data; 128 int res;
79 int i, size, res;
80 129
81 res = -ENOENT; 130 data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
82 bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data); 131 if (!data)
83 if (!bh) 132 return -ENOMEM;
84 return -EIO;
85 133
86 switch (be16_to_cpu(*data)) { 134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
135 data, READ);
136 if (res)
137 goto out;
138
139 switch (be16_to_cpu(*((__be16 *)data))) {
87 case HFS_OLD_PMAP_MAGIC: 140 case HFS_OLD_PMAP_MAGIC:
88 { 141 res = hfs_parse_old_pmap(sb, data, part_start, part_size);
89 struct old_pmap *pm;
90 struct old_pmap_entry *p;
91
92 pm = (struct old_pmap *)bh->b_data;
93 p = pm->pdEntry;
94 size = 42;
95 for (i = 0; i < size; p++, i++) {
96 if (p->pdStart && p->pdSize &&
97 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
98 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
99 *part_start += be32_to_cpu(p->pdStart);
100 *part_size = be32_to_cpu(p->pdSize);
101 res = 0;
102 }
103 }
104 break; 142 break;
105 }
106 case HFS_NEW_PMAP_MAGIC: 143 case HFS_NEW_PMAP_MAGIC:
107 { 144 res = hfs_parse_new_pmap(sb, data, part_start, part_size);
108 struct new_pmap *pm; 145 break;
109 146 default:
110 pm = (struct new_pmap *)bh->b_data; 147 res = -ENOENT;
111 size = be32_to_cpu(pm->pmMapBlkCnt);
112 for (i = 0; i < size;) {
113 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
114 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
115 *part_start += be32_to_cpu(pm->pmPyPartStart);
116 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
117 res = 0;
118 break;
119 }
120 brelse(bh);
121 bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm);
122 if (!bh)
123 return -EIO;
124 if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC))
125 break;
126 }
127 break; 148 break;
128 }
129 } 149 }
130 brelse(bh); 150out:
131 151 kfree(data);
132 return res; 152 return res;
133} 153}
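
Aside: both helpers scan for the same two things, a non-empty entry whose type marks it as HFS, honouring an explicit part= index if one was given. A self-contained userspace sketch of the old-style scan (the struct layout and the "TFS1" FSID are taken from the code above; be32toh() replaces the kernel's be32_to_cpu()):

/* pmap_scan.c - illustrative sketch of the old-style partition map scan */
#define _DEFAULT_SOURCE
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct old_pmap_entry {
	uint32_t pdStart;	/* all three fields are big-endian on disk */
	uint32_t pdSize;
	uint32_t pdFSID;
};

/* returns the entry index, or -1 if no matching HFS partition exists */
static int find_hfs_part(const struct old_pmap_entry *e, int want,
			 uint64_t *start, uint64_t *size)
{
	int i;

	for (i = 0; i < 42; i++) {
		if (e[i].pdStart && e[i].pdSize &&
		    be32toh(e[i].pdFSID) == 0x54465331 /* "TFS1" */ &&
		    (want < 0 || want == i)) {
			*start = be32toh(e[i].pdStart);
			*size = be32toh(e[i].pdSize);
			return i;
		}
	}
	return -1;
}

int main(void)
{
	struct old_pmap_entry map[42] = { { 0, 0, 0 } };
	uint64_t start, size;

	map[3].pdStart = htobe32(64);
	map[3].pdSize = htobe32(2048);
	map[3].pdFSID = htobe32(0x54465331);

	if (find_hfs_part(map, -1, &start, &size) >= 0)
		printf("HFS partition: start %llu, %llu blocks\n",
		       (unsigned long long)start, (unsigned long long)size);
	return 0;
}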
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3b55c050c742..b49b55584c84 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -10,9 +10,9 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/blkdev.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/vfs.h> 16#include <linux/vfs.h>
17#include <linux/nls.h> 17#include <linux/nls.h>
18 18
@@ -21,40 +21,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
21 21
22#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
23 23
24struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) 24static int hfsplus_system_read_inode(struct inode *inode)
25{ 25{
26 struct hfs_find_data fd; 26 struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
27 struct hfsplus_vh *vhdr;
28 struct inode *inode;
29 long err = -EIO;
30 27
31 inode = iget_locked(sb, ino); 28 switch (inode->i_ino) {
32 if (!inode)
33 return ERR_PTR(-ENOMEM);
34 if (!(inode->i_state & I_NEW))
35 return inode;
36
37 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
38 mutex_init(&HFSPLUS_I(inode).extents_lock);
39 HFSPLUS_I(inode).flags = 0;
40 HFSPLUS_I(inode).rsrc_inode = NULL;
41 atomic_set(&HFSPLUS_I(inode).opencnt, 0);
42
43 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
44 read_inode:
45 hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
46 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
47 if (!err)
48 err = hfsplus_cat_read_inode(inode, &fd);
49 hfs_find_exit(&fd);
50 if (err)
51 goto bad_inode;
52 goto done;
53 }
54 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
55 switch(inode->i_ino) {
56 case HFSPLUS_ROOT_CNID:
57 goto read_inode;
58 case HFSPLUS_EXT_CNID: 29 case HFSPLUS_EXT_CNID:
59 hfsplus_inode_read_fork(inode, &vhdr->ext_file); 30 hfsplus_inode_read_fork(inode, &vhdr->ext_file);
60 inode->i_mapping->a_ops = &hfsplus_btree_aops; 31 inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +46,102 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
75 inode->i_mapping->a_ops = &hfsplus_btree_aops; 46 inode->i_mapping->a_ops = &hfsplus_btree_aops;
76 break; 47 break;
77 default: 48 default:
78 goto bad_inode; 49 return -EIO;
50 }
51
52 return 0;
53}
54
55struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
56{
57 struct hfs_find_data fd;
58 struct inode *inode;
59 int err;
60
61 inode = iget_locked(sb, ino);
62 if (!inode)
63 return ERR_PTR(-ENOMEM);
64 if (!(inode->i_state & I_NEW))
65 return inode;
66
67 INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
68 mutex_init(&HFSPLUS_I(inode)->extents_lock);
69 HFSPLUS_I(inode)->flags = 0;
70 HFSPLUS_I(inode)->extent_state = 0;
71 HFSPLUS_I(inode)->rsrc_inode = NULL;
72 atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
73
74 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
75 inode->i_ino == HFSPLUS_ROOT_CNID) {
76 hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
77 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
78 if (!err)
79 err = hfsplus_cat_read_inode(inode, &fd);
80 hfs_find_exit(&fd);
81 } else {
82 err = hfsplus_system_read_inode(inode);
83 }
84
85 if (err) {
86 iget_failed(inode);
87 return ERR_PTR(err);
79 } 88 }
80 89
81done:
82 unlock_new_inode(inode); 90 unlock_new_inode(inode);
83 return inode; 91 return inode;
84
85bad_inode:
86 iget_failed(inode);
87 return ERR_PTR(err);
88} 92}
89 93
90static int hfsplus_write_inode(struct inode *inode, 94static int hfsplus_system_write_inode(struct inode *inode)
91 struct writeback_control *wbc)
92{ 95{
93 struct hfsplus_vh *vhdr; 96 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
94 int ret = 0; 97 struct hfsplus_vh *vhdr = sbi->s_vhdr;
98 struct hfsplus_fork_raw *fork;
99 struct hfs_btree *tree = NULL;
95 100
96 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
97 hfsplus_ext_write_extent(inode);
98 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
99 return hfsplus_cat_write_inode(inode);
100 }
101 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
102 switch (inode->i_ino) { 101 switch (inode->i_ino) {
103 case HFSPLUS_ROOT_CNID:
104 ret = hfsplus_cat_write_inode(inode);
105 break;
106 case HFSPLUS_EXT_CNID: 102 case HFSPLUS_EXT_CNID:
107 if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) { 103 fork = &vhdr->ext_file;
108 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 104 tree = sbi->ext_tree;
109 inode->i_sb->s_dirt = 1;
110 }
111 hfsplus_inode_write_fork(inode, &vhdr->ext_file);
112 hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
113 break; 105 break;
114 case HFSPLUS_CAT_CNID: 106 case HFSPLUS_CAT_CNID:
115 if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) { 107 fork = &vhdr->cat_file;
116 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 108 tree = sbi->cat_tree;
117 inode->i_sb->s_dirt = 1;
118 }
119 hfsplus_inode_write_fork(inode, &vhdr->cat_file);
120 hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
121 break; 109 break;
122 case HFSPLUS_ALLOC_CNID: 110 case HFSPLUS_ALLOC_CNID:
123 if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) { 111 fork = &vhdr->alloc_file;
124 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
125 inode->i_sb->s_dirt = 1;
126 }
127 hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
128 break; 112 break;
129 case HFSPLUS_START_CNID: 113 case HFSPLUS_START_CNID:
130 if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) { 114 fork = &vhdr->start_file;
131 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
132 inode->i_sb->s_dirt = 1;
133 }
134 hfsplus_inode_write_fork(inode, &vhdr->start_file);
135 break; 115 break;
136 case HFSPLUS_ATTR_CNID: 116 case HFSPLUS_ATTR_CNID:
137 if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) { 117 fork = &vhdr->attr_file;
 138 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 118 tree = sbi->attr_tree;
 break;
139 inode->i_sb->s_dirt = 1; 119 default:
140 } 120 return -EIO;
141 hfsplus_inode_write_fork(inode, &vhdr->attr_file); 121 }
142 hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree); 122
143 break; 123 if (fork->total_size != cpu_to_be64(inode->i_size)) {
124 set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
125 inode->i_sb->s_dirt = 1;
144 } 126 }
145 return ret; 127 hfsplus_inode_write_fork(inode, fork);
128 if (tree)
129 hfs_btree_write(tree);
130 return 0;
131}
132
133static int hfsplus_write_inode(struct inode *inode,
134 struct writeback_control *wbc)
135{
136 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
137
138 hfsplus_ext_write_extent(inode);
139
140 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
141 inode->i_ino == HFSPLUS_ROOT_CNID)
142 return hfsplus_cat_write_inode(inode);
143 else
144 return hfsplus_system_write_inode(inode);
146} 145}
147 146
148static void hfsplus_evict_inode(struct inode *inode) 147static void hfsplus_evict_inode(struct inode *inode)
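
Aside: the restructured hfsplus_iget() is the canonical iget_locked() protocol: hit the inode cache first, and only touch the disk when I_NEW shows the inode is freshly allocated. The skeleton, reduced to the protocol itself (foo_read_inode() is a hypothetical per-filesystem reader, not a real API):

/* kernel-style sketch; foo_read_inode() is hypothetical */
struct inode *foo_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;
	int err;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* cache hit, already initialized */

	err = foo_read_inode(inode);	/* fill the inode from disk */
	if (err) {
		iget_failed(inode);	/* unhashes and releases the inode */
		return ERR_PTR(err);
	}
	unlock_new_inode(inode);	/* clears I_NEW, wakes any waiters */
	return inode;
}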
@@ -151,52 +150,74 @@ static void hfsplus_evict_inode(struct inode *inode)
151 truncate_inode_pages(&inode->i_data, 0); 150 truncate_inode_pages(&inode->i_data, 0);
152 end_writeback(inode); 151 end_writeback(inode);
153 if (HFSPLUS_IS_RSRC(inode)) { 152 if (HFSPLUS_IS_RSRC(inode)) {
154 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; 153 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
155 iput(HFSPLUS_I(inode).rsrc_inode); 154 iput(HFSPLUS_I(inode)->rsrc_inode);
156 } 155 }
157} 156}
158 157
159int hfsplus_sync_fs(struct super_block *sb, int wait) 158int hfsplus_sync_fs(struct super_block *sb, int wait)
160{ 159{
161 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 160 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
161 struct hfsplus_vh *vhdr = sbi->s_vhdr;
162 int write_backup = 0;
163 int error, error2;
164
165 if (!wait)
166 return 0;
162 167
163 dprint(DBG_SUPER, "hfsplus_write_super\n"); 168 dprint(DBG_SUPER, "hfsplus_write_super\n");
164 169
165 lock_super(sb);
166 sb->s_dirt = 0; 170 sb->s_dirt = 0;
167 171
168 vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); 172 /*
169 vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); 173 * Explicitly write out the special metadata inodes.
170 vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid); 174 *
171 vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count); 175 * While these special inodes are marked as hashed and written
 172 vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count); 176 * out periodically by the flusher threads we redirty them
 173 177 * during writeout of normal inodes, and thus the livelock
174 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 178 * prevents us from getting the latest state to disk.
175 if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) { 179 */
176 if (HFSPLUS_SB(sb).sect_count) { 180 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
177 struct buffer_head *bh; 181 error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
178 u32 block, offset; 182 if (!error)
179 183 error = error2;
180 block = HFSPLUS_SB(sb).blockoffset; 184 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
181 block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9); 185 if (!error)
182 offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1); 186 error = error2;
183 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset, 187
184 HFSPLUS_SB(sb).sect_count, block, offset); 188 mutex_lock(&sbi->vh_mutex);
185 bh = sb_bread(sb, block); 189 mutex_lock(&sbi->alloc_mutex);
186 if (bh) { 190 vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
187 vhdr = (struct hfsplus_vh *)(bh->b_data + offset); 191 vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
188 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { 192 vhdr->folder_count = cpu_to_be32(sbi->folder_count);
189 memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr)); 193 vhdr->file_count = cpu_to_be32(sbi->file_count);
190 mark_buffer_dirty(bh); 194
191 brelse(bh); 195 if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
192 } else 196 memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
193 printk(KERN_WARNING "hfs: backup not found!\n"); 197 write_backup = 1;
194 }
195 }
196 HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
197 } 198 }
198 unlock_super(sb); 199
199 return 0; 200 error2 = hfsplus_submit_bio(sb->s_bdev,
201 sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
202 sbi->s_vhdr, WRITE_SYNC);
203 if (!error)
204 error = error2;
205 if (!write_backup)
206 goto out;
207
208 error2 = hfsplus_submit_bio(sb->s_bdev,
209 sbi->part_start + sbi->sect_count - 2,
210 sbi->s_backup_vhdr, WRITE_SYNC);
211 if (!error)
 212 error = error2;
213out:
214 mutex_unlock(&sbi->alloc_mutex);
215 mutex_unlock(&sbi->vh_mutex);
216
217 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
218 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
219
220 return error;
200} 221}
201 222
202static void hfsplus_write_super(struct super_block *sb) 223static void hfsplus_write_super(struct super_block *sb)
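
Aside: the rewritten sync path leans on a small idiom worth spelling out: every step is executed unconditionally, so a failed B-tree flush does not keep the volume header from reaching disk, yet the first error encountered is the one returned. (This is also why the backup-header write above must assign error = error2 and not the reverse.) The idiom in isolation, with hypothetical step functions:

/* firsterr.c - illustrative only: run all steps, report the first failure */
#include <stdio.h>

static int step_one(void)   { return 0; }
static int step_two(void)   { return -5; }	/* simulated -EIO */
static int step_three(void) { return -22; }	/* simulated -EINVAL */

static int sync_all(void)
{
	int error, error2;

	/* every step runs; only the first failure is reported */
	error = step_one();
	error2 = step_two();
	if (!error)
		error = error2;
	error2 = step_three();
	if (!error)
		error = error2;
	return error;
}

int main(void)
{
	printf("sync_all() = %d\n", sync_all());	/* prints -5 */
	return 0;
}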
@@ -209,48 +230,47 @@ static void hfsplus_write_super(struct super_block *sb)
209 230
210static void hfsplus_put_super(struct super_block *sb) 231static void hfsplus_put_super(struct super_block *sb)
211{ 232{
233 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
234
212 dprint(DBG_SUPER, "hfsplus_put_super\n"); 235 dprint(DBG_SUPER, "hfsplus_put_super\n");
236
213 if (!sb->s_fs_info) 237 if (!sb->s_fs_info)
214 return; 238 return;
215 239
216 lock_kernel(); 240 if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
217 241 struct hfsplus_vh *vhdr = sbi->s_vhdr;
218 if (sb->s_dirt)
219 hfsplus_write_super(sb);
220 if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
221 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
222 242
223 vhdr->modify_date = hfsp_now2mt(); 243 vhdr->modify_date = hfsp_now2mt();
224 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); 244 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
225 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); 245 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
226 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 246
227 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 247 hfsplus_sync_fs(sb, 1);
228 } 248 }
229 249
230 hfs_btree_close(HFSPLUS_SB(sb).cat_tree); 250 hfs_btree_close(sbi->cat_tree);
231 hfs_btree_close(HFSPLUS_SB(sb).ext_tree); 251 hfs_btree_close(sbi->ext_tree);
232 iput(HFSPLUS_SB(sb).alloc_file); 252 iput(sbi->alloc_file);
233 iput(HFSPLUS_SB(sb).hidden_dir); 253 iput(sbi->hidden_dir);
234 brelse(HFSPLUS_SB(sb).s_vhbh); 254 kfree(sbi->s_vhdr);
235 unload_nls(HFSPLUS_SB(sb).nls); 255 kfree(sbi->s_backup_vhdr);
256 unload_nls(sbi->nls);
236 kfree(sb->s_fs_info); 257 kfree(sb->s_fs_info);
237 sb->s_fs_info = NULL; 258 sb->s_fs_info = NULL;
238
239 unlock_kernel();
240} 259}
241 260
242static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) 261static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
243{ 262{
244 struct super_block *sb = dentry->d_sb; 263 struct super_block *sb = dentry->d_sb;
264 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
245 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 265 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
246 266
247 buf->f_type = HFSPLUS_SUPER_MAGIC; 267 buf->f_type = HFSPLUS_SUPER_MAGIC;
248 buf->f_bsize = sb->s_blocksize; 268 buf->f_bsize = sb->s_blocksize;
249 buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; 269 buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
250 buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift; 270 buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
251 buf->f_bavail = buf->f_bfree; 271 buf->f_bavail = buf->f_bfree;
252 buf->f_files = 0xFFFFFFFF; 272 buf->f_files = 0xFFFFFFFF;
253 buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; 273 buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
254 buf->f_fsid.val[0] = (u32)id; 274 buf->f_fsid.val[0] = (u32)id;
255 buf->f_fsid.val[1] = (u32)(id >> 32); 275 buf->f_fsid.val[1] = (u32)(id >> 32);
256 buf->f_namelen = HFSPLUS_MAX_STRLEN; 276 buf->f_namelen = HFSPLUS_MAX_STRLEN;
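
Aside: the statfs conversion only swaps the accessor style, but the units are worth noting: total_blocks and free_blocks count allocation blocks, and fs_shift (= alloc_blksz_shift - s_blocksize_bits) rescales them to the f_bsize-sized blocks statfs reports. A quick userspace check of what arrives on the other side of the syscall:

/* fsinfo.c - print capacity the way hfsplus_statfs() reports it */
#include <stdio.h>
#include <sys/statvfs.h>

int main(int argc, char **argv)
{
	struct statvfs st;

	if (argc != 2 || statvfs(argv[1], &st) < 0) {
		perror("statvfs");
		return 1;
	}
	printf("block size %lu, %llu blocks, %llu free (%.1f GiB total)\n",
	       st.f_bsize,
	       (unsigned long long)st.f_blocks,
	       (unsigned long long)st.f_bfree,
	       (double)st.f_blocks * st.f_frsize / (1 << 30));
	return 0;
}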
@@ -263,27 +283,32 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
263 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 283 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
264 return 0; 284 return 0;
265 if (!(*flags & MS_RDONLY)) { 285 if (!(*flags & MS_RDONLY)) {
266 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 286 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
267 struct hfsplus_sb_info sbi; 287 int force = 0;
268 288
269 memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); 289 if (!hfsplus_parse_options_remount(data, &force))
270 sbi.nls = HFSPLUS_SB(sb).nls;
271 if (!hfsplus_parse_options(data, &sbi))
272 return -EINVAL; 290 return -EINVAL;
273 291
274 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { 292 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
275 printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " 293 printk(KERN_WARNING "hfs: filesystem was "
276 "running fsck.hfsplus is recommended. leaving read-only.\n"); 294 "not cleanly unmounted, "
295 "running fsck.hfsplus is recommended. "
296 "leaving read-only.\n");
277 sb->s_flags |= MS_RDONLY; 297 sb->s_flags |= MS_RDONLY;
278 *flags |= MS_RDONLY; 298 *flags |= MS_RDONLY;
279 } else if (sbi.flags & HFSPLUS_SB_FORCE) { 299 } else if (force) {
280 /* nothing */ 300 /* nothing */
281 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 301 } else if (vhdr->attributes &
282 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); 302 cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
303 printk(KERN_WARNING "hfs: filesystem is marked locked, "
304 "leaving read-only.\n");
283 sb->s_flags |= MS_RDONLY; 305 sb->s_flags |= MS_RDONLY;
284 *flags |= MS_RDONLY; 306 *flags |= MS_RDONLY;
285 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { 307 } else if (vhdr->attributes &
286 printk(KERN_WARNING "hfs: filesystem is marked journaled, leaving read-only.\n"); 308 cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
309 printk(KERN_WARNING "hfs: filesystem is "
310 "marked journaled, "
311 "leaving read-only.\n");
287 sb->s_flags |= MS_RDONLY; 312 sb->s_flags |= MS_RDONLY;
288 *flags |= MS_RDONLY; 313 *flags |= MS_RDONLY;
289 } 314 }
@@ -313,19 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
313 struct inode *root, *inode; 338 struct inode *root, *inode;
314 struct qstr str; 339 struct qstr str;
315 struct nls_table *nls = NULL; 340 struct nls_table *nls = NULL;
316 int err = -EINVAL; 341 int err;
317 342
343 err = -EINVAL;
318 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 344 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
319 if (!sbi) 345 if (!sbi)
320 return -ENOMEM; 346 goto out;
321 347
322 sb->s_fs_info = sbi; 348 sb->s_fs_info = sbi;
323 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 349 mutex_init(&sbi->alloc_mutex);
350 mutex_init(&sbi->vh_mutex);
324 hfsplus_fill_defaults(sbi); 351 hfsplus_fill_defaults(sbi);
352
353 err = -EINVAL;
325 if (!hfsplus_parse_options(data, sbi)) { 354 if (!hfsplus_parse_options(data, sbi)) {
326 printk(KERN_ERR "hfs: unable to parse mount options\n"); 355 printk(KERN_ERR "hfs: unable to parse mount options\n");
327 err = -EINVAL; 356 goto out_unload_nls;
328 goto cleanup;
329 } 357 }
330 358
331 /* temporarily use utf8 to correctly find the hidden dir below */ 359 /* temporarily use utf8 to correctly find the hidden dir below */
@@ -333,140 +361,160 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
333 sbi->nls = load_nls("utf8"); 361 sbi->nls = load_nls("utf8");
334 if (!sbi->nls) { 362 if (!sbi->nls) {
335 printk(KERN_ERR "hfs: unable to load nls for utf8\n"); 363 printk(KERN_ERR "hfs: unable to load nls for utf8\n");
336 err = -EINVAL; 364 goto out_unload_nls;
337 goto cleanup;
338 } 365 }
339 366
340 /* Grab the volume header */ 367 /* Grab the volume header */
341 if (hfsplus_read_wrapper(sb)) { 368 if (hfsplus_read_wrapper(sb)) {
342 if (!silent) 369 if (!silent)
343 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); 370 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
344 err = -EINVAL; 371 goto out_unload_nls;
345 goto cleanup;
346 } 372 }
347 vhdr = HFSPLUS_SB(sb).s_vhdr; 373 vhdr = sbi->s_vhdr;
348 374
349 /* Copy parts of the volume header into the superblock */ 375 /* Copy parts of the volume header into the superblock */
350 sb->s_magic = HFSPLUS_VOLHEAD_SIG; 376 sb->s_magic = HFSPLUS_VOLHEAD_SIG;
351 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || 377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
352 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { 378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
353 printk(KERN_ERR "hfs: wrong filesystem version\n"); 379 printk(KERN_ERR "hfs: wrong filesystem version\n");
354 goto cleanup; 380 goto out_free_vhdr;
355 } 381 }
356 HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks); 382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
357 HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks); 383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
358 HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc); 384 sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
359 HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid); 385 sbi->file_count = be32_to_cpu(vhdr->file_count);
360 HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count); 386 sbi->folder_count = be32_to_cpu(vhdr->folder_count);
361 HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count); 387 sbi->data_clump_blocks =
362 HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 388 be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
363 if (!HFSPLUS_SB(sb).data_clump_blocks) 389 if (!sbi->data_clump_blocks)
364 HFSPLUS_SB(sb).data_clump_blocks = 1; 390 sbi->data_clump_blocks = 1;
365 HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 391 sbi->rsrc_clump_blocks =
366 if (!HFSPLUS_SB(sb).rsrc_clump_blocks) 392 be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
367 HFSPLUS_SB(sb).rsrc_clump_blocks = 1; 393 if (!sbi->rsrc_clump_blocks)
394 sbi->rsrc_clump_blocks = 1;
368 395
369 /* Set up operations so we can load metadata */ 396 /* Set up operations so we can load metadata */
370 sb->s_op = &hfsplus_sops; 397 sb->s_op = &hfsplus_sops;
371 sb->s_maxbytes = MAX_LFS_FILESIZE; 398 sb->s_maxbytes = MAX_LFS_FILESIZE;
372 399
373 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { 400 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
374 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " 401 printk(KERN_WARNING "hfs: Filesystem was "
375 "running fsck.hfsplus is recommended. mounting read-only.\n"); 402 "not cleanly unmounted, "
403 "running fsck.hfsplus is recommended. "
404 "mounting read-only.\n");
376 sb->s_flags |= MS_RDONLY; 405 sb->s_flags |= MS_RDONLY;
377 } else if (sbi->flags & HFSPLUS_SB_FORCE) { 406 } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
378 /* nothing */ 407 /* nothing */
379 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 408 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
380 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); 409 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
381 sb->s_flags |= MS_RDONLY; 410 sb->s_flags |= MS_RDONLY;
382 } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { 411 } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
383 printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " 412 !(sb->s_flags & MS_RDONLY)) {
384 "use the force option at your own risk, mounting read-only.\n"); 413 printk(KERN_WARNING "hfs: write access to "
414 "a journaled filesystem is not supported, "
415 "use the force option at your own risk, "
416 "mounting read-only.\n");
385 sb->s_flags |= MS_RDONLY; 417 sb->s_flags |= MS_RDONLY;
386 } 418 }
387 sbi->flags &= ~HFSPLUS_SB_FORCE;
388 419
389 /* Load metadata objects (B*Trees) */ 420 /* Load metadata objects (B*Trees) */
390 HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
391 if (!HFSPLUS_SB(sb).ext_tree) { 422 if (!sbi->ext_tree) {
392 printk(KERN_ERR "hfs: failed to load extents file\n"); 423 printk(KERN_ERR "hfs: failed to load extents file\n");
393 goto cleanup; 424 goto out_free_vhdr;
394 } 425 }
395 HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
396 if (!HFSPLUS_SB(sb).cat_tree) { 427 if (!sbi->cat_tree) {
397 printk(KERN_ERR "hfs: failed to load catalog file\n"); 428 printk(KERN_ERR "hfs: failed to load catalog file\n");
398 goto cleanup; 429 goto out_close_ext_tree;
399 } 430 }
400 431
401 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); 432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
402 if (IS_ERR(inode)) { 433 if (IS_ERR(inode)) {
403 printk(KERN_ERR "hfs: failed to load allocation file\n"); 434 printk(KERN_ERR "hfs: failed to load allocation file\n");
404 err = PTR_ERR(inode); 435 err = PTR_ERR(inode);
405 goto cleanup; 436 goto out_close_cat_tree;
406 } 437 }
407 HFSPLUS_SB(sb).alloc_file = inode; 438 sbi->alloc_file = inode;
408 439
409 /* Load the root directory */ 440 /* Load the root directory */
410 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); 441 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
411 if (IS_ERR(root)) { 442 if (IS_ERR(root)) {
412 printk(KERN_ERR "hfs: failed to load root directory\n"); 443 printk(KERN_ERR "hfs: failed to load root directory\n");
413 err = PTR_ERR(root); 444 err = PTR_ERR(root);
414 goto cleanup; 445 goto out_put_alloc_file;
415 } 446 }
416 sb->s_root = d_alloc_root(root);
417 if (!sb->s_root) {
418 iput(root);
419 err = -ENOMEM;
420 goto cleanup;
421 }
422 sb->s_root->d_op = &hfsplus_dentry_operations;
423 447
424 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 448 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
425 str.name = HFSP_HIDDENDIR_NAME; 449 str.name = HFSP_HIDDENDIR_NAME;
426 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 450 hfs_find_init(sbi->cat_tree, &fd);
427 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); 451 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
428 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 452 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
429 hfs_find_exit(&fd); 453 hfs_find_exit(&fd);
430 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) 454 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
431 goto cleanup; 455 goto out_put_root;
432 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); 456 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
433 if (IS_ERR(inode)) { 457 if (IS_ERR(inode)) {
434 err = PTR_ERR(inode); 458 err = PTR_ERR(inode);
435 goto cleanup; 459 goto out_put_root;
436 } 460 }
437 HFSPLUS_SB(sb).hidden_dir = inode; 461 sbi->hidden_dir = inode;
438 } else 462 } else
439 hfs_find_exit(&fd); 463 hfs_find_exit(&fd);
440 464
441 if (sb->s_flags & MS_RDONLY) 465 if (!(sb->s_flags & MS_RDONLY)) {
442 goto out; 466 /*
467 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
468 * all three are registered with Apple for our use
469 */
470 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
471 vhdr->modify_date = hfsp_now2mt();
472 be32_add_cpu(&vhdr->write_count, 1);
473 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
474 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
475 hfsplus_sync_fs(sb, 1);
443 476
444 /* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused 477 if (!sbi->hidden_dir) {
445 * all three are registered with Apple for our use 478 mutex_lock(&sbi->vh_mutex);
446 */ 479 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
447 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); 480 hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
448 vhdr->modify_date = hfsp_now2mt(); 481 sbi->hidden_dir);
449 be32_add_cpu(&vhdr->write_count, 1); 482 mutex_unlock(&sbi->vh_mutex);
450 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 483
451 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 484 hfsplus_mark_inode_dirty(sbi->hidden_dir,
452 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 485 HFSPLUS_I_CAT_DIRTY);
453 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 486 }
454
455 if (!HFSPLUS_SB(sb).hidden_dir) {
456 printk(KERN_DEBUG "hfs: create hidden dir...\n");
457 HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
458 hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
459 &str, HFSPLUS_SB(sb).hidden_dir);
460 mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
461 } 487 }
462out: 488
489 sb->s_d_op = &hfsplus_dentry_operations;
490 sb->s_root = d_alloc_root(root);
491 if (!sb->s_root) {
492 err = -ENOMEM;
493 goto out_put_hidden_dir;
494 }
495
463 unload_nls(sbi->nls); 496 unload_nls(sbi->nls);
464 sbi->nls = nls; 497 sbi->nls = nls;
465 return 0; 498 return 0;
466 499
467cleanup: 500out_put_hidden_dir:
468 hfsplus_put_super(sb); 501 iput(sbi->hidden_dir);
502out_put_root:
 503 iput(root);
504out_put_alloc_file:
505 iput(sbi->alloc_file);
506out_close_cat_tree:
507 hfs_btree_close(sbi->cat_tree);
508out_close_ext_tree:
509 hfs_btree_close(sbi->ext_tree);
510out_free_vhdr:
511 kfree(sbi->s_vhdr);
512 kfree(sbi->s_backup_vhdr);
513out_unload_nls:
514 unload_nls(sbi->nls);
469 unload_nls(nls); 515 unload_nls(nls);
516 kfree(sbi);
517out:
470 return err; 518 return err;
471} 519}
472 520
@@ -484,25 +532,31 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
484 return i ? &i->vfs_inode : NULL; 532 return i ? &i->vfs_inode : NULL;
485} 533}
486 534
535static void hfsplus_i_callback(struct rcu_head *head)
536{
537 struct inode *inode = container_of(head, struct inode, i_rcu);
538
539 INIT_LIST_HEAD(&inode->i_dentry);
540 kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
541}
542
487static void hfsplus_destroy_inode(struct inode *inode) 543static void hfsplus_destroy_inode(struct inode *inode)
488{ 544{
489 kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode)); 545 call_rcu(&inode->i_rcu, hfsplus_i_callback);
490} 546}
491 547
492#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) 548#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
493 549
494static int hfsplus_get_sb(struct file_system_type *fs_type, 550static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
495 int flags, const char *dev_name, void *data, 551 int flags, const char *dev_name, void *data)
496 struct vfsmount *mnt)
497{ 552{
498 return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super, 553 return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
499 mnt);
500} 554}
501 555
502static struct file_system_type hfsplus_fs_type = { 556static struct file_system_type hfsplus_fs_type = {
503 .owner = THIS_MODULE, 557 .owner = THIS_MODULE,
504 .name = "hfsplus", 558 .name = "hfsplus",
505 .get_sb = hfsplus_get_sb, 559 .mount = hfsplus_mount,
506 .kill_sb = kill_block_super, 560 .kill_sb = kill_block_super,
507 .fs_flags = FS_REQUIRES_DEV, 561 .fs_flags = FS_REQUIRES_DEV,
508}; 562};
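
Aside: the destroy_inode change is the adaptation every filesystem with a private inode cache needed for RCU-walk path lookup: lockless lookups may still dereference the inode, so the slab free is deferred past the grace period with call_rcu() (the INIT_LIST_HEAD() is needed because i_rcu shares a union with i_dentry in this kernel). The generic shape of the conversion, with hypothetical foo_* names:

/* kernel-style sketch; the foo_* names are placeholders */
static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* i_rcu overlays i_dentry */
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static void foo_destroy_inode(struct inode *inode)
{
	/* freeing is delayed until all RCU readers are done */
	call_rcu(&inode->i_rcu, foo_i_callback);
}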
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa402..a3f0bfcc881e 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -17,14 +17,14 @@
17/* Returns folded char, or 0 if ignorable */ 17/* Returns folded char, or 0 if ignorable */
18static inline u16 case_fold(u16 c) 18static inline u16 case_fold(u16 c)
19{ 19{
20 u16 tmp; 20 u16 tmp;
21 21
22 tmp = hfsplus_case_fold_table[c >> 8]; 22 tmp = hfsplus_case_fold_table[c >> 8];
23 if (tmp) 23 if (tmp)
24 tmp = hfsplus_case_fold_table[tmp + (c & 0xff)]; 24 tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
25 else 25 else
26 tmp = c; 26 tmp = c;
27 return tmp; 27 return tmp;
28} 28}
29 29
30/* Compare unicode strings, return values like normal strcmp */ 30/* Compare unicode strings, return values like normal strcmp */
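
Aside: case_fold() is a two-level table lookup: the high byte of the character selects a second-level block, and a zero block pointer means the whole 256-character range folds to itself; a zero result from the second level marks an ignorable character (the toy table below has none). The same structure on a toy table, runnable in userspace:

/* fold.c - two-level case-folding lookup on a toy table */
#include <stdint.h>
#include <stdio.h>

/* index row at 0, one second-level block at 256: 'A'..'Z' fold to
 * 'a'..'z', everything else in that block maps to itself */
static uint16_t fold_table[512];

static void build_toy_table(void)
{
	int i;

	fold_table[0x00] = 256;		/* only high byte 0x00 has a block */
	for (i = 0; i < 256; i++)
		fold_table[256 + i] = (i >= 'A' && i <= 'Z') ? i + 0x20 : i;
}

static uint16_t case_fold(uint16_t c)
{
	uint16_t tmp = fold_table[c >> 8];

	return tmp ? fold_table[tmp + (c & 0xff)] : c;
}

int main(void)
{
	build_toy_table();
	printf("%c %c\n", case_fold('G'), case_fold('g'));	/* g g */
	return 0;
}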
@@ -118,10 +118,12 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
118 return NULL; 118 return NULL;
119} 119}
120 120
121int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) 121int hfsplus_uni2asc(struct super_block *sb,
122 const struct hfsplus_unistr *ustr,
123 char *astr, int *len_p)
122{ 124{
123 const hfsplus_unichr *ip; 125 const hfsplus_unichr *ip;
124 struct nls_table *nls = HFSPLUS_SB(sb).nls; 126 struct nls_table *nls = HFSPLUS_SB(sb)->nls;
125 u8 *op; 127 u8 *op;
126 u16 cc, c0, c1; 128 u16 cc, c0, c1;
127 u16 *ce1, *ce2; 129 u16 *ce1, *ce2;
@@ -132,7 +134,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
132 ustrlen = be16_to_cpu(ustr->length); 134 ustrlen = be16_to_cpu(ustr->length);
133 len = *len_p; 135 len = *len_p;
134 ce1 = NULL; 136 ce1 = NULL;
135 compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 137 compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
136 138
137 while (ustrlen > 0) { 139 while (ustrlen > 0) {
138 c0 = be16_to_cpu(*ip++); 140 c0 = be16_to_cpu(*ip++);
@@ -171,7 +173,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
171 goto same; 173 goto same;
172 c1 = be16_to_cpu(*ip); 174 c1 = be16_to_cpu(*ip);
173 if (likely(compose)) 175 if (likely(compose))
174 ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c1); 176 ce1 = hfsplus_compose_lookup(
177 hfsplus_compose_table, c1);
175 if (ce1) 178 if (ce1)
176 break; 179 break;
177 switch (c0) { 180 switch (c0) {
@@ -199,7 +202,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
199 if (ce2) { 202 if (ce2) {
200 i = 1; 203 i = 1;
201 while (i < ustrlen) { 204 while (i < ustrlen) {
202 ce1 = hfsplus_compose_lookup(ce2, be16_to_cpu(ip[i])); 205 ce1 = hfsplus_compose_lookup(ce2,
206 be16_to_cpu(ip[i]));
203 if (!ce1) 207 if (!ce1)
204 break; 208 break;
205 i++; 209 i++;
@@ -211,7 +215,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
211 goto done; 215 goto done;
212 } 216 }
213 } 217 }
214 same: 218same:
215 switch (c0) { 219 switch (c0) {
216 case 0: 220 case 0:
217 cc = 0x2400; 221 cc = 0x2400;
@@ -222,7 +226,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
222 default: 226 default:
223 cc = c0; 227 cc = c0;
224 } 228 }
225 done: 229done:
226 res = nls->uni2char(cc, op, len); 230 res = nls->uni2char(cc, op, len);
227 if (res < 0) { 231 if (res < 0) {
228 if (res == -ENAMETOOLONG) 232 if (res == -ENAMETOOLONG)
@@ -246,7 +250,7 @@ out:
246static inline int asc2unichar(struct super_block *sb, const char *astr, int len, 250static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
247 wchar_t *uc) 251 wchar_t *uc)
248{ 252{
249 int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc); 253 int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
250 if (size <= 0) { 254 if (size <= 0) {
251 *uc = '?'; 255 *uc = '?';
252 size = 1; 256 size = 1;
@@ -293,7 +297,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
293 u16 *dstr, outlen = 0; 297 u16 *dstr, outlen = 0;
294 wchar_t c; 298 wchar_t c;
295 299
296 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 300 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { 301 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
298 size = asc2unichar(sb, astr, len, &c); 302 size = asc2unichar(sb, astr, len, &c);
299 303
@@ -320,7 +324,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
320 * Composed unicode characters are decomposed and case-folding is performed 324 * Composed unicode characters are decomposed and case-folding is performed
321 * if the appropriate bits are (un)set on the superblock. 325 * if the appropriate bits are (un)set on the superblock.
322 */ 326 */
323int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) 327int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
328 struct qstr *str)
324{ 329{
325 struct super_block *sb = dentry->d_sb; 330 struct super_block *sb = dentry->d_sb;
326 const char *astr; 331 const char *astr;
@@ -330,8 +335,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
330 wchar_t c; 335 wchar_t c;
331 u16 c2; 336 u16 c2;
332 337
333 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 338 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
334 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 339 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
335 hash = init_name_hash(); 340 hash = init_name_hash();
336 astr = str->name; 341 astr = str->name;
337 len = str->len; 342 len = str->len;
@@ -363,9 +368,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
363 * Composed unicode characters are decomposed and case-folding is performed 368 * Composed unicode characters are decomposed and case-folding is performed
364 * if the appropriate bits are (un)set on the superblock. 369 * if the appropriate bits are (un)set on the superblock.
365 */ 370 */
366int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) 371int hfsplus_compare_dentry(const struct dentry *parent,
372 const struct inode *pinode,
373 const struct dentry *dentry, const struct inode *inode,
374 unsigned int len, const char *str, const struct qstr *name)
367{ 375{
368 struct super_block *sb = dentry->d_sb; 376 struct super_block *sb = parent->d_sb;
369 int casefold, decompose, size; 377 int casefold, decompose, size;
370 int dsize1, dsize2, len1, len2; 378 int dsize1, dsize2, len1, len2;
371 const u16 *dstr1, *dstr2; 379 const u16 *dstr1, *dstr2;
@@ -373,12 +381,12 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
373 u16 c1, c2; 381 u16 c1, c2;
374 wchar_t c; 382 wchar_t c;
375 383
376 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 384 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
377 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 385 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
378 astr1 = s1->name; 386 astr1 = str;
379 len1 = s1->len; 387 len1 = len;
380 astr2 = s2->name; 388 astr2 = name->name;
381 len2 = s2->len; 389 len2 = name->len;
382 dsize1 = dsize2 = 0; 390 dsize1 = dsize2 = 0;
383 dstr1 = dstr2 = NULL; 391 dstr1 = dstr2 = NULL;
384 392
@@ -388,7 +396,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
388 astr1 += size; 396 astr1 += size;
389 len1 -= size; 397 len1 -= size;
390 398
391 if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) { 399 if (decompose)
400 dstr1 = decompose_unichar(c, &dsize1);
401 if (!decompose || !dstr1) {
392 c1 = c; 402 c1 = c;
393 dstr1 = &c1; 403 dstr1 = &c1;
394 dsize1 = 1; 404 dsize1 = 1;
@@ -400,7 +410,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
400 astr2 += size; 410 astr2 += size;
401 len2 -= size; 411 len2 -= size;
402 412
403 if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) { 413 if (decompose)
414 dstr2 = decompose_unichar(c, &dsize2);
415 if (!decompose || !dstr2) {
404 c2 = c; 416 c2 = c;
405 dstr2 = &c2; 417 dstr2 = &c2;
406 dsize2 = 1; 418 dsize2 = 1;
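
Aside: hfsplus_compare_dentry() never materializes the decomposed strings; each side keeps a small cursor (dstr/dsize) that is refilled either from a decomposition sequence or from the next raw character, and comparison proceeds one code unit at a time. A toy userspace version of that refill-and-compare loop, with bytes standing in for unicode characters and a fake one-entry decomposition table:

/* streamcmp.c - toy version of the decompose-as-you-go comparison */
#include <stdio.h>

/* fake decomposition: 'a' expands to "xy"; everything else is atomic */
static const char *decompose(char c, int *dsize)
{
	if (c == 'a') {
		*dsize = 2;
		return "xy";
	}
	return NULL;
}

static int stream_cmp(const char *s1, const char *s2)
{
	const char *d1 = NULL, *d2 = NULL;
	int n1 = 0, n2 = 0;
	char c1, c2;

	while (1) {
		if (!n1) {			/* refill cursor 1 */
			if (!*s1)
				break;
			d1 = decompose(*s1, &n1);
			if (!d1) {
				c1 = *s1;	/* atomic: 1-char buffer */
				d1 = &c1;
				n1 = 1;
			}
			s1++;
		}
		if (!n2) {			/* refill cursor 2 */
			if (!*s2)
				break;
			d2 = decompose(*s2, &n2);
			if (!d2) {
				c2 = *s2;
				d2 = &c2;
				n2 = 1;
			}
			s2++;
		}
		if (*d1 < *d2)
			return -1;
		if (*d1 > *d2)
			return 1;
		d1++; n1--;
		d2++; n2--;
	}
	/* whichever side still has data pending is the larger string */
	if (n1 || *s1)
		return 1;
	if (n2 || *s2)
		return -1;
	return 0;
}

int main(void)
{
	/* "a" and "xy" compare equal once 'a' is decomposed */
	printf("%d %d\n", stream_cmp("a", "xy"), stream_cmp("a", "xz"));
	return 0;
}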
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d1..3031d81f5f0f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,6 +24,40 @@ struct hfsplus_wd {
24 u16 embed_count; 24 u16 embed_count;
25}; 25};
26 26
27static void hfsplus_end_io_sync(struct bio *bio, int err)
28{
29 if (err)
30 clear_bit(BIO_UPTODATE, &bio->bi_flags);
31 complete(bio->bi_private);
32}
33
34int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
35 void *data, int rw)
36{
37 DECLARE_COMPLETION_ONSTACK(wait);
38 struct bio *bio;
39
40 bio = bio_alloc(GFP_NOIO, 1);
41 bio->bi_sector = sector;
42 bio->bi_bdev = bdev;
43 bio->bi_end_io = hfsplus_end_io_sync;
44 bio->bi_private = &wait;
45
46 /*
47 * We always submit one sector at a time, so bio_add_page must not fail.
48 */
49 if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE,
50 offset_in_page(data)) != HFSPLUS_SECTOR_SIZE)
51 BUG();
52
53 submit_bio(rw, bio);
54 wait_for_completion(&wait);
55
56 if (!bio_flagged(bio, BIO_UPTODATE))
57 return -EIO;
58 return 0;
59}
60
27static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) 61static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
28{ 62{
29 u32 extent; 63 u32 extent;
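
Aside: hfsplus_submit_bio() above gives the driver synchronous 512-byte I/O that is independent of the block size later set on the superblock, which is what makes the reread loop in hfsplus_read_wrapper() possible. Its userspace analogue when inspecting a volume image is just a positioned read:

/* readsec.c - userspace analogue of a one-sector synchronous read */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define SECTOR_SIZE 512

static int read_sector(int fd, uint64_t sector, void *buf)
{
	ssize_t n = pread(fd, buf, SECTOR_SIZE, (off_t)(sector * SECTOR_SIZE));

	return n == SECTOR_SIZE ? 0 : -1;	/* kernel version: -EIO */
}

int main(int argc, char **argv)
{
	unsigned char buf[SECTOR_SIZE];
	int fd;

	if (argc != 3 || (fd = open(argv[1], O_RDONLY)) < 0) {
		fprintf(stderr, "usage: readsec <image> <sector>\n");
		return 1;
	}
	if (read_sector(fd, strtoull(argv[2], NULL, 0), buf))
		return 1;
	/* sector 2 of an HFS+ volume starts with the "H+" signature */
	printf("%02x %02x\n", buf[0], buf[1]);
	return 0;
}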
@@ -40,12 +74,14 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
40 !(attrib & HFSP_WRAP_ATTRIB_SPARED)) 74 !(attrib & HFSP_WRAP_ATTRIB_SPARED))
41 return 0; 75 return 0;
42 76
43 wd->ablk_size = be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE)); 77 wd->ablk_size =
78 be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
44 if (wd->ablk_size < HFSPLUS_SECTOR_SIZE) 79 if (wd->ablk_size < HFSPLUS_SECTOR_SIZE)
45 return 0; 80 return 0;
46 if (wd->ablk_size % HFSPLUS_SECTOR_SIZE) 81 if (wd->ablk_size % HFSPLUS_SECTOR_SIZE)
47 return 0; 82 return 0;
48 wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); 83 wd->ablk_start =
84 be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
49 85
50 extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); 86 extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT);
51 wd->embed_start = (extent >> 16) & 0xFFFF; 87 wd->embed_start = (extent >> 16) & 0xFFFF;
@@ -65,10 +101,11 @@ static int hfsplus_get_last_session(struct super_block *sb,
65 *start = 0; 101 *start = 0;
66 *size = sb->s_bdev->bd_inode->i_size >> 9; 102 *size = sb->s_bdev->bd_inode->i_size >> 9;
67 103
68 if (HFSPLUS_SB(sb).session >= 0) { 104 if (HFSPLUS_SB(sb)->session >= 0) {
69 te.cdte_track = HFSPLUS_SB(sb).session; 105 te.cdte_track = HFSPLUS_SB(sb)->session;
70 te.cdte_format = CDROM_LBA; 106 te.cdte_format = CDROM_LBA;
71 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); 107 res = ioctl_by_bdev(sb->s_bdev,
108 CDROMREADTOCENTRY, (unsigned long)&te);
72 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { 109 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
73 *start = (sector_t)te.cdte_addr.lba << 2; 110 *start = (sector_t)te.cdte_addr.lba << 2;
74 return 0; 111 return 0;
@@ -77,7 +114,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
77 return -EINVAL; 114 return -EINVAL;
78 } 115 }
79 ms_info.addr_format = CDROM_LBA; 116 ms_info.addr_format = CDROM_LBA;
80 res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info); 117 res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION,
118 (unsigned long)&ms_info);
81 if (!res && ms_info.xa_flag) 119 if (!res && ms_info.xa_flag)
82 *start = (sector_t)ms_info.addr.lba << 2; 120 *start = (sector_t)ms_info.addr.lba << 2;
83 return 0; 121 return 0;
@@ -87,97 +125,113 @@ static int hfsplus_get_last_session(struct super_block *sb,
87/* Takes in super block, returns true if good data read */ 125/* Takes in super block, returns true if good data read */
88int hfsplus_read_wrapper(struct super_block *sb) 126int hfsplus_read_wrapper(struct super_block *sb)
89{ 127{
90 struct buffer_head *bh; 128 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
91 struct hfsplus_vh *vhdr;
92 struct hfsplus_wd wd; 129 struct hfsplus_wd wd;
93 sector_t part_start, part_size; 130 sector_t part_start, part_size;
94 u32 blocksize; 131 u32 blocksize;
132 int error = 0;
95 133
134 error = -EINVAL;
96 blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE); 135 blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE);
97 if (!blocksize) 136 if (!blocksize)
98 return -EINVAL; 137 goto out;
99 138
100 if (hfsplus_get_last_session(sb, &part_start, &part_size)) 139 if (hfsplus_get_last_session(sb, &part_start, &part_size))
101 return -EINVAL; 140 goto out;
102 if ((u64)part_start + part_size > 0x100000000ULL) { 141 if ((u64)part_start + part_size > 0x100000000ULL) {
103 pr_err("hfs: volumes larger than 2TB are not supported yet\n"); 142 pr_err("hfs: volumes larger than 2TB are not supported yet\n");
104 return -EINVAL; 143 goto out;
105 } 144 }
106 while (1) {
107 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
108 if (!bh)
109 return -EIO;
110
111 if (vhdr->signature == cpu_to_be16(HFSP_WRAP_MAGIC)) {
112 if (!hfsplus_read_mdb(vhdr, &wd))
113 goto error;
114 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
115 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
116 part_size = wd.embed_count * wd.ablk_size;
117 brelse(bh);
118 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
119 if (!bh)
120 return -EIO;
121 }
122 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
123 break;
124 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
125 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
126 break;
127 }
128 brelse(bh);
129 145
130 /* check for a partition block 146 error = -ENOMEM;
147 sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
148 if (!sbi->s_vhdr)
149 goto out;
150 sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
151 if (!sbi->s_backup_vhdr)
152 goto out_free_vhdr;
153
154reread:
155 error = hfsplus_submit_bio(sb->s_bdev,
156 part_start + HFSPLUS_VOLHEAD_SECTOR,
157 sbi->s_vhdr, READ);
158 if (error)
159 goto out_free_backup_vhdr;
160
161 error = -EINVAL;
162 switch (sbi->s_vhdr->signature) {
163 case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX):
164 set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
165 /*FALLTHRU*/
166 case cpu_to_be16(HFSPLUS_VOLHEAD_SIG):
167 break;
168 case cpu_to_be16(HFSP_WRAP_MAGIC):
169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
170 goto out_free_backup_vhdr;
171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
173 part_size = wd.embed_count * wd.ablk_size;
174 goto reread;
175 default:
176 /*
177 * Check for a partition block.
178 *
131 * (should do this only for cdrom/loop though) 179 * (should do this only for cdrom/loop though)
132 */ 180 */
133 if (hfs_part_find(sb, &part_start, &part_size)) 181 if (hfs_part_find(sb, &part_start, &part_size))
134 return -EINVAL;
182 goto out_free_backup_vhdr;
183 goto reread;
135 } 184 }
136 185
137 blocksize = be32_to_cpu(vhdr->blocksize);
138 brelse(bh);
186 error = hfsplus_submit_bio(sb->s_bdev,
187 part_start + part_size - 2,
188 sbi->s_backup_vhdr, READ);
189 if (error)
190 goto out_free_backup_vhdr;
191
192 error = -EINVAL;
193 if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
194 printk(KERN_WARNING
195 "hfs: invalid secondary volume header\n");
196 goto out_free_backup_vhdr;
197 }
198
199 blocksize = be32_to_cpu(sbi->s_vhdr->blocksize);
139 200
140 /* block size must be at least as large as a sector
141 * and a multiple of 2
142 */
143 if (blocksize < HFSPLUS_SECTOR_SIZE ||
144 ((blocksize - 1) & blocksize))
145 return -EINVAL;
146 HFSPLUS_SB(sb).alloc_blksz = blocksize;
147 HFSPLUS_SB(sb).alloc_blksz_shift = 0;
201 /*
202 * Block size must be at least as large as a sector and a multiple of 2.
203 */
204 if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize))
205 goto out_free_backup_vhdr;
206 sbi->alloc_blksz = blocksize;
207 sbi->alloc_blksz_shift = 0;
148 while ((blocksize >>= 1) != 0) 208 while ((blocksize >>= 1) != 0)
149 HFSPLUS_SB(sb).alloc_blksz_shift++;
209 sbi->alloc_blksz_shift++;
150 blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
210 blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
151 211
152 /* align block size to block offset */
212 /*
213 * Align block size to block offset.
214 */
153 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) 215 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
154 blocksize >>= 1; 216 blocksize >>= 1;
155 217
156 if (sb_set_blocksize(sb, blocksize) != blocksize) { 218 if (sb_set_blocksize(sb, blocksize) != blocksize) {
157 printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", blocksize);
158 return -EINVAL;
219 printk(KERN_ERR "hfs: unable to set blocksize to %u!\n",
220 blocksize);
221 goto out_free_backup_vhdr;
159 } 222 }
160 223
161 HFSPLUS_SB(sb).blockoffset = part_start >>
162 (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
163 HFSPLUS_SB(sb).sect_count = part_size;
164 HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
165 sb->s_blocksize_bits;
166
167 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
168 if (!bh)
169 return -EIO;
170
171 /* should still be the same... */
172 if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
173 cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
174 cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
175 goto error;
176 HFSPLUS_SB(sb).s_vhbh = bh;
177 HFSPLUS_SB(sb).s_vhdr = vhdr;
178
224 sbi->blockoffset =
225 part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
226 sbi->part_start = part_start;
227 sbi->sect_count = part_size;
228 sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
179 return 0; 229 return 0;
180 error: 230
181 brelse(bh);
182 return -EINVAL;
231out_free_backup_vhdr:
232 kfree(sbi->s_backup_vhdr);
233out_free_vhdr:
234 kfree(sbi->s_vhdr);
235out:
236 return error;
183} 237}
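Two idioms in the rewritten block-size checks above are worth spelling out: ((blocksize - 1) & blocksize) is zero exactly when blocksize is a power of two, and the shift loop derives log2 of the block size. A minimal standalone sketch (the 4096 value is only an example, not taken from any volume header):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;		/* assumed example value */
	unsigned int shift = 0, tmp = blocksize;

	assert(blocksize >= 512);			/* sector-size floor */
	assert(((blocksize - 1) & blocksize) == 0);	/* power of two */

	while ((tmp >>= 1) != 0)			/* derives log2() */
		shift++;
	printf("alloc_blksz_shift = %u\n", shift);	/* prints 12 */
	return 0;
}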
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 6bbd75c5589b..bf15a43016b9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -28,12 +28,7 @@
28 * #define ATTR_KILL_SUID 2048 28 * #define ATTR_KILL_SUID 2048
29 * #define ATTR_KILL_SGID 4096 29 * #define ATTR_KILL_SGID 4096
30 * 30 *
31 * and this is because they were added in 2.5 development in this patch: 31 * and this is because they were added in 2.5 development.
32 *
33 * http://linux.bkbits.net:8080/linux-2.5/
34 * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html
35 * |src/.|src/include|src/include/linux|related/include/linux/fs.h
36 *
37 * Actually, they are not needed by most ->setattr() methods - they are set by 32 * Actually, they are not needed by most ->setattr() methods - they are set by
38 * callers of notify_change() to notify that the setuid/setgid bits must be 33 * callers of notify_change() to notify that the setuid/setgid bits must be
39 * dropped. 34 * dropped.
@@ -96,7 +91,6 @@ extern int rename_file(char *from, char *to);
96extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 91extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
97 long long *bfree_out, long long *bavail_out, 92 long long *bfree_out, long long *bavail_out,
98 long long *files_out, long long *ffree_out, 93 long long *files_out, long long *ffree_out,
99 void *fsid_out, int fsid_size, long *namelen_out, 94 void *fsid_out, int fsid_size, long *namelen_out);
100 long *spare_out);
101 95
102#endif 96#endif
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f7dc9b5f9ef8..2638c834ed28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -32,7 +32,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
32 32
33#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) 33#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
34 34
35static int hostfs_d_delete(struct dentry *dentry) 35static int hostfs_d_delete(const struct dentry *dentry)
36{ 36{
37 return 1; 37 return 1;
38} 38}
@@ -92,12 +92,10 @@ __uml_setup("hostfs=", hostfs_args,
92 92
93static char *__dentry_name(struct dentry *dentry, char *name) 93static char *__dentry_name(struct dentry *dentry, char *name)
94{ 94{
95 char *p = __dentry_path(dentry, name, PATH_MAX); 95 char *p = dentry_path_raw(dentry, name, PATH_MAX);
96 char *root; 96 char *root;
97 size_t len; 97 size_t len;
98 98
99 spin_unlock(&dcache_lock);
100
101 root = dentry->d_sb->s_fs_info; 99 root = dentry->d_sb->s_fs_info;
102 len = strlen(root); 100 len = strlen(root);
103 if (IS_ERR(p)) { 101 if (IS_ERR(p)) {
@@ -123,25 +121,23 @@ static char *dentry_name(struct dentry *dentry)
123 if (!name) 121 if (!name)
124 return NULL; 122 return NULL;
125 123
126 spin_lock(&dcache_lock);
127 return __dentry_name(dentry, name); /* will unlock */ 124 return __dentry_name(dentry, name); /* will unlock */
128} 125}
129 126
130static char *inode_name(struct inode *ino) 127static char *inode_name(struct inode *ino)
131{ 128{
132 struct dentry *dentry; 129 struct dentry *dentry;
133 char *name = __getname();
134 if (!name)
135 return NULL;
130 char *name;
136 131
137 spin_lock(&dcache_lock);
138 if (list_empty(&ino->i_dentry)) {
139 spin_unlock(&dcache_lock);
140 __putname(name);
132 dentry = d_find_alias(ino);
133 if (!dentry)
141 return NULL; 134 return NULL;
142 }
143 dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias);
144 return __dentry_name(dentry, name); /* will unlock */
135
136 name = dentry_name(dentry);
137
138 dput(dentry);
139
140 return name;
145} 141}
146 142
147static char *follow_link(char *link) 143static char *follow_link(char *link)
@@ -217,7 +213,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
217 err = do_statfs(dentry->d_sb->s_fs_info, 213 err = do_statfs(dentry->d_sb->s_fs_info,
218 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, 214 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
219 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), 215 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
220 &sf->f_namelen, sf->f_spare); 216 &sf->f_namelen);
221 if (err) 217 if (err)
222 return err; 218 return err;
223 sf->f_blocks = f_blocks; 219 sf->f_blocks = f_blocks;
@@ -251,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode)
251 } 247 }
252} 248}
253 249
254static void hostfs_destroy_inode(struct inode *inode) 250static void hostfs_i_callback(struct rcu_head *head)
255{ 251{
252 struct inode *inode = container_of(head, struct inode, i_rcu);
253 INIT_LIST_HEAD(&inode->i_dentry);
256 kfree(HOSTFS_I(inode)); 254 kfree(HOSTFS_I(inode));
257} 255}
258 256
257static void hostfs_destroy_inode(struct inode *inode)
258{
259 call_rcu(&inode->i_rcu, hostfs_i_callback);
260}
261
259static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 262static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
260{ 263{
261 const char *root_path = vfs->mnt_sb->s_fs_info; 264 const char *root_path = vfs->mnt_sb->s_fs_info;
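The hostfs_i_callback() added above recovers the inode from its embedded rcu_head via container_of(), then frees it once the RCU grace period has passed. A standalone sketch of just the pointer arithmetic; struct demo_inode and the direct callback invocation are illustrative, not kernel API:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head { struct rcu_head *next; };	/* shape only */

struct demo_inode {
	int ino;
	struct rcu_head i_rcu;			/* embedded, like inode->i_rcu */
};

static void i_callback(struct rcu_head *head)
{
	/* recover the enclosing structure from the embedded member */
	struct demo_inode *inode = container_of(head, struct demo_inode, i_rcu);

	printf("freeing inode %d\n", inode->ino);
	free(inode);
}

int main(void)
{
	struct demo_inode *inode = malloc(sizeof(*inode));

	if (!inode)
		return 1;
	inode->ino = 42;
	/* the kernel would pass &inode->i_rcu to call_rcu(); the callback
	 * is invoked directly here to show the pointer arithmetic */
	i_callback(&inode->i_rcu);
	return 0;
}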
@@ -609,7 +612,6 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
609 goto out_put; 612 goto out_put;
610 613
611 d_add(dentry, inode); 614 d_add(dentry, inode);
612 dentry->d_op = &hostfs_dentry_ops;
613 return NULL; 615 return NULL;
614 616
615 out_put: 617 out_put:
@@ -746,11 +748,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
746 return err; 748 return err;
747} 749}
748 750
749int hostfs_permission(struct inode *ino, int desired) 751int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
750{ 752{
751 char *name; 753 char *name;
752 int r = 0, w = 0, x = 0, err; 754 int r = 0, w = 0, x = 0, err;
753 755
756 if (flags & IPERM_FLAG_RCU)
757 return -ECHILD;
758
754 if (desired & MAY_READ) r = 1; 759 if (desired & MAY_READ) r = 1;
755 if (desired & MAY_WRITE) w = 1; 760 if (desired & MAY_WRITE) w = 1;
756 if (desired & MAY_EXEC) x = 1; 761 if (desired & MAY_EXEC) x = 1;
@@ -765,7 +770,7 @@ int hostfs_permission(struct inode *ino, int desired)
765 err = access_file(name, r, w, x); 770 err = access_file(name, r, w, x);
766 __putname(name); 771 __putname(name);
767 if (!err) 772 if (!err)
768 err = generic_permission(ino, desired, NULL); 773 err = generic_permission(ino, desired, flags, NULL);
769 return err; 774 return err;
770} 775}
771 776
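hostfs_permission() splits the requested access mask into read/write/execute bits before asking the host. A standalone sketch of that decoding; the MAY_* values below mirror include/linux/fs.h of this era but are restated here as assumptions:

#include <stdio.h>

#define MAY_EXEC  0x01	/* assumed to match include/linux/fs.h */
#define MAY_WRITE 0x02
#define MAY_READ  0x04

static void decode(int desired)
{
	int r = !!(desired & MAY_READ);
	int w = !!(desired & MAY_WRITE);
	int x = !!(desired & MAY_EXEC);

	printf("r=%d w=%d x=%d\n", r, w, x);
}

int main(void)
{
	decode(MAY_READ | MAY_EXEC);	/* prints r=1 w=0 x=1 */
	return 0;
}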
@@ -916,6 +921,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
916 sb->s_blocksize_bits = 10; 921 sb->s_blocksize_bits = 10;
917 sb->s_magic = HOSTFS_SUPER_MAGIC; 922 sb->s_magic = HOSTFS_SUPER_MAGIC;
918 sb->s_op = &hostfs_sbops; 923 sb->s_op = &hostfs_sbops;
924 sb->s_d_op = &hostfs_dentry_ops;
919 sb->s_maxbytes = MAX_LFS_FILESIZE; 925 sb->s_maxbytes = MAX_LFS_FILESIZE;
920 926
921 /* NULL is printed as <NULL> by sprintf: avoid that. */ 927 /* NULL is printed as <NULL> by sprintf: avoid that. */
@@ -962,11 +968,11 @@ out:
962 return err; 968 return err;
963} 969}
964 970
965static int hostfs_read_sb(struct file_system_type *type, 971static struct dentry *hostfs_read_sb(struct file_system_type *type,
966 int flags, const char *dev_name, 972 int flags, const char *dev_name,
967 void *data, struct vfsmount *mnt) 973 void *data)
968{ 974{
969 return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); 975 return mount_nodev(type, flags, data, hostfs_fill_sb_common);
970} 976}
971 977
972static void hostfs_kill_sb(struct super_block *s) 978static void hostfs_kill_sb(struct super_block *s)
@@ -978,7 +984,7 @@ static void hostfs_kill_sb(struct super_block *s)
978static struct file_system_type hostfs_type = { 984static struct file_system_type hostfs_type = {
979 .owner = THIS_MODULE, 985 .owner = THIS_MODULE,
980 .name = "hostfs", 986 .name = "hostfs",
981 .get_sb = hostfs_read_sb, 987 .mount = hostfs_read_sb,
982 .kill_sb = hostfs_kill_sb, 988 .kill_sb = hostfs_kill_sb,
983 .fs_flags = 0, 989 .fs_flags = 0,
984}; 990};
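The .get_sb to .mount conversion seen here recurs below for hpfs, hppfs, and hugetlbfs: the handler now returns the root dentry instead of filling in a vfsmount out-parameter. A kernel-style sketch of the new shape (examplefs and examplefs_fill_super are placeholders; this fragment is not standalone code):

static struct dentry *examplefs_mount(struct file_system_type *type,
	int flags, const char *dev_name, void *data)
{
	/* return the root dentry; no vfsmount out-parameter any more */
	return mount_nodev(type, flags, data, examplefs_fill_super);
}

static struct file_system_type examplefs_type = {
	.owner   = THIS_MODULE,
	.name    = "examplefs",
	.mount   = examplefs_mount,
	.kill_sb = kill_anon_super,
};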
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 6777aa06ce2c..d51a98384bc0 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out)
94 94
95 dir = opendir(path); 95 dir = opendir(path);
96 *err_out = errno; 96 *err_out = errno;
97 if (dir == NULL) 97
98 return NULL;
99 return dir; 98 return dir;
100} 99}
101 100
@@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
205 if (attrs->ia_valid & HOSTFS_ATTR_MODE) { 204 if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
206 if (fd >= 0) { 205 if (fd >= 0) {
207 if (fchmod(fd, attrs->ia_mode) != 0) 206 if (fchmod(fd, attrs->ia_mode) != 0)
208 return (-errno); 207 return -errno;
209 } else if (chmod(file, attrs->ia_mode) != 0) { 208 } else if (chmod(file, attrs->ia_mode) != 0) {
210 return -errno; 209 return -errno;
211 } 210 }
@@ -364,8 +363,7 @@ int rename_file(char *from, char *to)
364int do_statfs(char *root, long *bsize_out, long long *blocks_out, 363int do_statfs(char *root, long *bsize_out, long long *blocks_out,
365 long long *bfree_out, long long *bavail_out, 364 long long *bfree_out, long long *bavail_out,
366 long long *files_out, long long *ffree_out, 365 long long *files_out, long long *ffree_out,
367 void *fsid_out, int fsid_size, long *namelen_out, 366 void *fsid_out, int fsid_size, long *namelen_out)
368 long *spare_out)
369{ 367{
370 struct statfs64 buf; 368 struct statfs64 buf;
371 int err; 369 int err;
@@ -384,10 +382,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
384 sizeof(buf.f_fsid) > fsid_size ? fsid_size : 382 sizeof(buf.f_fsid) > fsid_size ? fsid_size :
385 sizeof(buf.f_fsid)); 383 sizeof(buf.f_fsid));
386 *namelen_out = buf.f_namelen; 384 *namelen_out = buf.f_namelen;
387 spare_out[0] = buf.f_spare[0]; 385
388 spare_out[1] = buf.f_spare[1];
389 spare_out[2] = buf.f_spare[2];
390 spare_out[3] = buf.f_spare[3];
391 spare_out[4] = buf.f_spare[4];
392 return 0; 386 return 0;
393} 387}
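do_statfs() is a thin wrapper over the host's statfs64(), and the f_spare plumbing could go because no caller consumed it. A standalone demo of the underlying call on a glibc system (the "/" path is arbitrary):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/statfs.h>

int main(void)
{
	struct statfs64 buf;

	if (statfs64("/", &buf) != 0) {
		perror("statfs64");
		return 1;
	}
	printf("bsize=%ld blocks=%lld bfree=%lld namelen=%ld\n",
	       (long)buf.f_bsize, (long long)buf.f_blocks,
	       (long long)buf.f_bfree, (long)buf.f_namelen);
	return 0;
}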
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6c..63b6f5632318 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix
4 help 5 help
5 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
6 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index eac5f96323e3..793cb9d943d2 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s)
14#ifdef DEBUG_LOCKS 14#ifdef DEBUG_LOCKS
15 printk("lock creation\n"); 15 printk("lock creation\n");
16#endif 16#endif
17 down(&hpfs_sb(s)->hpfs_creation_de); 17 mutex_lock(&hpfs_sb(s)->hpfs_creation_de);
18} 18}
19 19
20void hpfs_unlock_creation(struct super_block *s) 20void hpfs_unlock_creation(struct super_block *s)
@@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s)
22#ifdef DEBUG_LOCKS 22#ifdef DEBUG_LOCKS
23 printk("unlock creation\n"); 23 printk("unlock creation\n");
24#endif 24#endif
25 up(&hpfs_sb(s)->hpfs_creation_de); 25 mutex_unlock(&hpfs_sb(s)->hpfs_creation_de);
26} 26}
27 27
28/* Map a sector into a buffer and return pointers to it and to the buffer. */ 28/* Map a sector into a buffer and return pointers to it and to the buffer. */
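The down()/up() to mutex_lock()/mutex_unlock() change here pairs with the struct semaphore to struct mutex conversion in hpfs_fn.h below. A userspace analogue with pthreads; the names echo the hpfs helpers but the code is purely illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t creation_lock = PTHREAD_MUTEX_INITIALIZER;

static void lock_creation(void)
{
	pthread_mutex_lock(&creation_lock);
}

static void unlock_creation(void)
{
	pthread_mutex_unlock(&creation_lock);
}

int main(void)
{
	lock_creation();
	printf("dirent creation serialized\n");
	unlock_creation();
	return 0;
}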
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 67d9d36b3d5f..05d4816e4e77 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,7 +12,8 @@
12 * Note: the dentry argument is the parent dentry. 12 * Note: the dentry argument is the parent dentry.
13 */ 13 */
14 14
15static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) 15static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
16 struct qstr *qstr)
16{ 17{
17 unsigned long hash; 18 unsigned long hash;
18 int i; 19 int i;
@@ -34,29 +35,30 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
34 return 0; 35 return 0;
35} 36}
36 37
37static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
38static int hpfs_compare_dentry(const struct dentry *parent,
39 const struct inode *pinode,
40 const struct dentry *dentry, const struct inode *inode,
41 unsigned int len, const char *str, const struct qstr *name)
38{ 42{
39 unsigned al=a->len;
40 unsigned bl=b->len;
41 hpfs_adjust_length(a->name, &al);
43 unsigned al = len;
44 unsigned bl = name->len;
45
46 hpfs_adjust_length(str, &al);
42 /*hpfs_adjust_length(b->name, &bl);*/ 47 /*hpfs_adjust_length(b->name, &bl);*/
43 /* 'a' is the qstr of an already existing dentry, so the name
44 * must be valid. 'b' must be validated first.
45 */
48
49 /*
50 * 'str' is the name of an already existing dentry, so the name
51 * must be valid. 'name' must be validated first.
52 */
46 53
47 if (hpfs_chk_name(b->name, &bl))
54 if (hpfs_chk_name(name->name, &bl))
48 return 1; 55 return 1;
49 if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
56 if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0))
50 return 1; 57 return 1;
51 return 0; 58 return 0;
52} 59}
53 60
54static const struct dentry_operations hpfs_dentry_operations = { 61const struct dentry_operations hpfs_dentry_operations = {
55 .d_hash = hpfs_hash_dentry, 62 .d_hash = hpfs_hash_dentry,
56 .d_compare = hpfs_compare_dentry, 63 .d_compare = hpfs_compare_dentry,
57}; 64};
58
59void hpfs_set_dentry_operations(struct dentry *dentry)
60{
61 dentry->d_op = &hpfs_dentry_operations;
62}
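A case-insensitive filesystem needs ->d_hash and ->d_compare to agree: hash the folded form of the name, then compare with the same folding. A toy standalone version; partial_name_hash() below is a simplified stand-in for the kernel helper of the same name:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for the kernel's partial_name_hash() */
static unsigned long partial_name_hash(unsigned long c, unsigned long prev)
{
	return (prev + (c << 4) + (c >> 4)) * 11;
}

static unsigned long fold_hash(const char *name)
{
	unsigned long hash = 0;
	size_t i, len = strlen(name);

	for (i = 0; i < len; i++)
		hash = partial_name_hash(tolower((unsigned char)name[i]), hash);
	return hash;
}

int main(void)
{
	/* case variants must land in the same hash chain */
	printf("%d\n", fold_hash("README") == fold_hash("readme"));	/* 1 */
	return 0;
}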
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2338130cceba..d32f63a569f7 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -298,7 +298,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 298
299 end: 299 end:
300 end_add: 300 end_add:
301 hpfs_set_dentry_operations(dentry);
302 unlock_kernel(); 301 unlock_kernel();
303 d_add(dentry, result); 302 d_add(dentry, result);
304 return NULL; 303 return NULL;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b59eac0232a0..1c43dbea55e8 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -87,7 +87,7 @@ struct hpfs_sb_info {
87 unsigned *sb_bmp_dir; /* main bitmap directory */ 87 unsigned *sb_bmp_dir; /* main bitmap directory */
88 unsigned sb_c_bitmap; /* current bitmap */ 88 unsigned sb_c_bitmap; /* current bitmap */
89 unsigned sb_max_fwd_alloc; /* max forward allocation */ 89 unsigned sb_max_fwd_alloc; /* max forward allocation */
90 struct semaphore hpfs_creation_de; /* when creating dirents, nobody else 90 struct mutex hpfs_creation_de; /* when creating dirents, nobody else
91 can alloc blocks */ 91 can alloc blocks */
92 /*unsigned sb_mounting : 1;*/ 92 /*unsigned sb_mounting : 1;*/
93 int sb_timeshift; 93 int sb_timeshift;
@@ -233,7 +233,7 @@ void hpfs_mark_4buffers_dirty(struct quad_buffer_head *);
233 233
234/* dentry.c */ 234/* dentry.c */
235 235
236void hpfs_set_dentry_operations(struct dentry *); 236extern const struct dentry_operations hpfs_dentry_operations;
237 237
238/* dir.c */ 238/* dir.c */
239 239
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f0da1cfd10..1ae35baa539e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
281 attr->ia_size != i_size_read(inode)) { 281 attr->ia_size != i_size_read(inode)) {
282 error = vmtruncate(inode, attr->ia_size); 282 error = vmtruncate(inode, attr->ia_size);
283 if (error) 283 if (error)
284 return error; 284 goto out_unlock;
285 } 285 }
286 286
287 setattr_copy(inode, attr); 287 setattr_copy(inode, attr);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 11c2b4080f65..f4ad9e31ddc4 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -419,7 +419,7 @@ again:
419 unlock_kernel(); 419 unlock_kernel();
420 return -ENOSPC; 420 return -ENOSPC;
421 } 421 }
422 if (generic_permission(inode, MAY_WRITE, NULL) || 422 if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
423 !S_ISREG(inode->i_mode) || 423 !S_ISREG(inode->i_mode) ||
424 get_write_access(inode)) { 424 get_write_access(inode)) {
425 d_rehash(dentry); 425 d_rehash(dentry);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 2607010be2fe..b30426b1fc97 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb)
177 return &ei->vfs_inode; 177 return &ei->vfs_inode;
178} 178}
179 179
180static void hpfs_destroy_inode(struct inode *inode) 180static void hpfs_i_callback(struct rcu_head *head)
181{ 181{
182 struct inode *inode = container_of(head, struct inode, i_rcu);
183 INIT_LIST_HEAD(&inode->i_dentry);
182 kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); 184 kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
183} 185}
184 186
187static void hpfs_destroy_inode(struct inode *inode)
188{
189 call_rcu(&inode->i_rcu, hpfs_i_callback);
190}
191
185static void init_once(void *foo) 192static void init_once(void *foo)
186{ 193{
187 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; 194 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
@@ -477,17 +484,21 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
477 484
478 int o; 485 int o;
479 486
487 lock_kernel();
488
480 save_mount_options(s, options); 489 save_mount_options(s, options);
481 490
482 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
483 if (!sbi) 492 if (!sbi) {
493 unlock_kernel();
484 return -ENOMEM; 494 return -ENOMEM;
495 }
485 s->s_fs_info = sbi; 496 s->s_fs_info = sbi;
486 497
487 sbi->sb_bmp_dir = NULL; 498 sbi->sb_bmp_dir = NULL;
488 sbi->sb_cp_table = NULL; 499 sbi->sb_cp_table = NULL;
489 500
490 init_MUTEX(&sbi->hpfs_creation_de); 501 mutex_init(&sbi->hpfs_creation_de);
491 502
492 uid = current_uid(); 503 uid = current_uid();
493 gid = current_gid(); 504 gid = current_gid();
@@ -539,6 +550,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
539 /* Fill superblock stuff */ 550 /* Fill superblock stuff */
540 s->s_magic = HPFS_SUPER_MAGIC; 551 s->s_magic = HPFS_SUPER_MAGIC;
541 s->s_op = &hpfs_sops; 552 s->s_op = &hpfs_sops;
553 s->s_d_op = &hpfs_dentry_operations;
542 554
543 sbi->sb_root = superblock->root; 555 sbi->sb_root = superblock->root;
544 sbi->sb_fs_size = superblock->n_sectors; 556 sbi->sb_fs_size = superblock->n_sectors;
@@ -640,7 +652,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
640 iput(root); 652 iput(root);
641 goto bail0; 653 goto bail0;
642 } 654 }
643 hpfs_set_dentry_operations(s->s_root);
644 655
645 /* 656 /*
646 * find the root directory's . pointer & finish filling in the inode 657 * find the root directory's . pointer & finish filling in the inode
@@ -666,6 +677,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
666 root->i_blocks = 5; 677 root->i_blocks = 5;
667 hpfs_brelse4(&qbh); 678 hpfs_brelse4(&qbh);
668 } 679 }
680 unlock_kernel();
669 return 0; 681 return 0;
670 682
671bail4: brelse(bh2); 683bail4: brelse(bh2);
@@ -677,20 +689,20 @@ bail0:
677 kfree(sbi->sb_cp_table); 689 kfree(sbi->sb_cp_table);
678 s->s_fs_info = NULL; 690 s->s_fs_info = NULL;
679 kfree(sbi); 691 kfree(sbi);
692 unlock_kernel();
680 return -EINVAL; 693 return -EINVAL;
681} 694}
682 695
683static int hpfs_get_sb(struct file_system_type *fs_type, 696static struct dentry *hpfs_mount(struct file_system_type *fs_type,
684 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 697 int flags, const char *dev_name, void *data)
685{ 698{
686 return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super, 699 return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
687 mnt);
688} 700}
689 701
690static struct file_system_type hpfs_fs_type = { 702static struct file_system_type hpfs_fs_type = {
691 .owner = THIS_MODULE, 703 .owner = THIS_MODULE,
692 .name = "hpfs", 704 .name = "hpfs",
693 .get_sb = hpfs_get_sb, 705 .mount = hpfs_mount,
694 .kill_sb = kill_block_super, 706 .kill_sb = kill_block_super,
695 .fs_flags = FS_REQUIRES_DEV, 707 .fs_flags = FS_REQUIRES_DEV,
696}; 708};
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7b027720d820..87ed48e0343d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -598,6 +598,7 @@ static const struct file_operations hppfs_dir_fops = {
598 .readdir = hppfs_readdir, 598 .readdir = hppfs_readdir,
599 .open = hppfs_dir_open, 599 .open = hppfs_dir_open,
600 .fsync = hppfs_fsync, 600 .fsync = hppfs_fsync,
601 .llseek = default_llseek,
601}; 602};
602 603
603static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) 604static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
@@ -631,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino)
631 mntput(ino->i_sb->s_fs_info); 632 mntput(ino->i_sb->s_fs_info);
632} 633}
633 634
634static void hppfs_destroy_inode(struct inode *inode) 635static void hppfs_i_callback(struct rcu_head *head)
635{ 636{
637 struct inode *inode = container_of(head, struct inode, i_rcu);
638 INIT_LIST_HEAD(&inode->i_dentry);
636 kfree(HPPFS_I(inode)); 639 kfree(HPPFS_I(inode));
637} 640}
638 641
642static void hppfs_destroy_inode(struct inode *inode)
643{
644 call_rcu(&inode->i_rcu, hppfs_i_callback);
645}
646
639static const struct super_operations hppfs_sbops = { 647static const struct super_operations hppfs_sbops = {
640 .alloc_inode = hppfs_alloc_inode, 648 .alloc_inode = hppfs_alloc_inode,
641 .destroy_inode = hppfs_destroy_inode, 649 .destroy_inode = hppfs_destroy_inode,
@@ -747,17 +755,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
747 return(err); 755 return(err);
748} 756}
749 757
750static int hppfs_read_super(struct file_system_type *type, 758static struct dentry *hppfs_read_super(struct file_system_type *type,
751 int flags, const char *dev_name, 759 int flags, const char *dev_name,
752 void *data, struct vfsmount *mnt) 760 void *data)
753{ 761{
754 return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); 762 return mount_nodev(type, flags, data, hppfs_fill_super);
755} 763}
756 764
757static struct file_system_type hppfs_type = { 765static struct file_system_type hppfs_type = {
758 .owner = THIS_MODULE, 766 .owner = THIS_MODULE,
759 .name = "hppfs", 767 .name = "hppfs",
760 .get_sb = hppfs_read_super, 768 .mount = hppfs_read_super,
761 .kill_sb = kill_anon_super, 769 .kill_sb = kill_anon_super,
762 .fs_flags = 0, 770 .fs_flags = 0,
763}; 771};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6e5bd42f3860..9885082b470f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/statfs.h> 31#include <linux/statfs.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/magic.h> 33#include <linux/magic.h>
34#include <linux/migrate.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
@@ -455,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
455 inode = new_inode(sb); 456 inode = new_inode(sb);
456 if (inode) { 457 if (inode) {
457 struct hugetlbfs_inode_info *info; 458 struct hugetlbfs_inode_info *info;
459 inode->i_ino = get_next_ino();
458 inode->i_mode = mode; 460 inode->i_mode = mode;
459 inode->i_uid = uid; 461 inode->i_uid = uid;
460 inode->i_gid = gid; 462 inode->i_gid = gid;
@@ -573,6 +575,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
573 return 0; 575 return 0;
574} 576}
575 577
578static int hugetlbfs_migrate_page(struct address_space *mapping,
579 struct page *newpage, struct page *page)
580{
581 int rc;
582
583 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
584 if (rc)
585 return rc;
586 migrate_page_copy(newpage, page);
587
588 return 0;
589}
590
576static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 591static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
577{ 592{
578 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 593 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -648,17 +663,25 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
648 return &p->vfs_inode; 663 return &p->vfs_inode;
649} 664}
650 665
666static void hugetlbfs_i_callback(struct rcu_head *head)
667{
668 struct inode *inode = container_of(head, struct inode, i_rcu);
669 INIT_LIST_HEAD(&inode->i_dentry);
670 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
671}
672
651static void hugetlbfs_destroy_inode(struct inode *inode) 673static void hugetlbfs_destroy_inode(struct inode *inode)
652{ 674{
653 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 675 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
654 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 676 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
655 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 677 call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
656} 678}
657 679
658static const struct address_space_operations hugetlbfs_aops = { 680static const struct address_space_operations hugetlbfs_aops = {
659 .write_begin = hugetlbfs_write_begin, 681 .write_begin = hugetlbfs_write_begin,
660 .write_end = hugetlbfs_write_end, 682 .write_end = hugetlbfs_write_end,
661 .set_page_dirty = hugetlbfs_set_page_dirty, 683 .set_page_dirty = hugetlbfs_set_page_dirty,
684 .migratepage = hugetlbfs_migrate_page,
662}; 685};
663 686
664 687
@@ -674,6 +697,7 @@ const struct file_operations hugetlbfs_file_operations = {
674 .mmap = hugetlbfs_file_mmap, 697 .mmap = hugetlbfs_file_mmap,
675 .fsync = noop_fsync, 698 .fsync = noop_fsync,
676 .get_unmapped_area = hugetlb_get_unmapped_area, 699 .get_unmapped_area = hugetlb_get_unmapped_area,
700 .llseek = default_llseek,
677}; 701};
678 702
679static const struct inode_operations hugetlbfs_dir_inode_operations = { 703static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -879,15 +903,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta)
879 } 903 }
880} 904}
881 905
882static int hugetlbfs_get_sb(struct file_system_type *fs_type, 906static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
883 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 907 int flags, const char *dev_name, void *data)
884{ 908{
885 return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); 909 return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
886} 910}
887 911
888static struct file_system_type hugetlbfs_fs_type = { 912static struct file_system_type hugetlbfs_fs_type = {
889 .name = "hugetlbfs", 913 .name = "hugetlbfs",
890 .get_sb = hugetlbfs_get_sb, 914 .mount = hugetlbfs_mount,
891 .kill_sb = kill_litter_super, 915 .kill_sb = kill_litter_super,
892}; 916};
893 917
@@ -915,8 +939,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
915 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 939 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
916 *user = current_user(); 940 *user = current_user();
917 if (user_shm_lock(size, *user)) { 941 if (user_shm_lock(size, *user)) {
918 WARN_ONCE(1, 942 printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
919 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
920 } else { 943 } else {
921 *user = NULL; 944 *user = NULL;
922 return ERR_PTR(-EPERM); 945 return ERR_PTR(-EPERM);
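printk_once() used above emits its message a single time no matter how often the path runs; the mechanism is just a function-local static flag. A standalone rendition (print_once is a made-up name):

#include <stdio.h>

#define print_once(...) do {				\
	static int warned;				\
	if (!warned) {					\
		warned = 1;				\
		printf(__VA_ARGS__);			\
	}						\
} while (0)

int main(void)
{
	int i;

	for (i = 0; i < 3; i++)
		print_once("deprecated interface used\n");	/* prints once */
	return 0;
}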
diff --git a/fs/inode.c b/fs/inode.c
index 86464332e590..da85e56378f3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,11 +24,11 @@
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/async.h> 25#include <linux/async.h>
26#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/ima.h>
27 28
28/* 29/*
29 * This is needed for the following functions: 30 * This is needed for the following functions:
30 * - inode_has_buffers 31 * - inode_has_buffers
31 * - invalidate_inode_buffers
32 * - invalidate_bdev 32 * - invalidate_bdev
33 * 33 *
34 * FIXME: remove all knowledge of the buffer layer from this file 34 * FIXME: remove all knowledge of the buffer layer from this file
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
72 * allowing for low-overhead inode sync() operations. 72 * allowing for low-overhead inode sync() operations.
73 */ 73 */
74 74
75LIST_HEAD(inode_in_use); 75static LIST_HEAD(inode_lru);
76LIST_HEAD(inode_unused);
77static struct hlist_head *inode_hashtable __read_mostly; 76static struct hlist_head *inode_hashtable __read_mostly;
78 77
79/* 78/*
@@ -103,8 +102,43 @@ static DECLARE_RWSEM(iprune_sem);
103 */ 102 */
104struct inodes_stat_t inodes_stat; 103struct inodes_stat_t inodes_stat;
105 104
105static DEFINE_PER_CPU(unsigned int, nr_inodes);
106
106static struct kmem_cache *inode_cachep __read_mostly; 107static struct kmem_cache *inode_cachep __read_mostly;
107 108
109static int get_nr_inodes(void)
110{
111 int i;
112 int sum = 0;
113 for_each_possible_cpu(i)
114 sum += per_cpu(nr_inodes, i);
115 return sum < 0 ? 0 : sum;
116}
117
118static inline int get_nr_inodes_unused(void)
119{
120 return inodes_stat.nr_unused;
121}
122
123int get_nr_dirty_inodes(void)
124{
125 /* not actually dirty inodes, but a wild approximation */
126 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
127 return nr_dirty > 0 ? nr_dirty : 0;
128}
129
130/*
131 * Handle nr_inode sysctl
132 */
133#ifdef CONFIG_SYSCTL
134int proc_nr_inodes(ctl_table *table, int write,
135 void __user *buffer, size_t *lenp, loff_t *ppos)
136{
137 inodes_stat.nr_inodes = get_nr_inodes();
138 return proc_dointvec(table, write, buffer, lenp, ppos);
139}
140#endif
141
108static void wake_up_inode(struct inode *inode) 142static void wake_up_inode(struct inode *inode)
109{ 143{
110 /* 144 /*
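The new nr_inodes accounting trades exactness for scalability: every CPU bumps a private counter, and readers sum all of them, clamping at zero because the unsynchronized snapshot can be transiently negative. A standalone approximation with an array standing in for per-cpu storage:

#include <stdio.h>

#define NR_CPUS 4
static int nr_inodes[NR_CPUS];		/* one slot per CPU */

static int get_nr_inodes(void)
{
	int i, sum = 0;

	for (i = 0; i < NR_CPUS; i++)
		sum += nr_inodes[i];
	return sum < 0 ? 0 : sum;	/* snapshot may be transiently negative */
}

int main(void)
{
	nr_inodes[0] += 3;	/* allocations counted on CPU 0 */
	nr_inodes[2] -= 1;	/* a free on CPU 2 can drive its slot negative */
	printf("nr_inodes = %d\n", get_nr_inodes());	/* prints 2 */
	return 0;
}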
@@ -192,6 +226,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
192 inode->i_fsnotify_mask = 0; 226 inode->i_fsnotify_mask = 0;
193#endif 227#endif
194 228
229 this_cpu_inc(nr_inodes);
230
195 return 0; 231 return 0;
196out: 232out:
197 return -ENOMEM; 233 return -ENOMEM;
@@ -221,6 +257,12 @@ static struct inode *alloc_inode(struct super_block *sb)
221 return inode; 257 return inode;
222} 258}
223 259
260void free_inode_nonrcu(struct inode *inode)
261{
262 kmem_cache_free(inode_cachep, inode);
263}
264EXPORT_SYMBOL(free_inode_nonrcu);
265
224void __destroy_inode(struct inode *inode) 266void __destroy_inode(struct inode *inode)
225{ 267{
226 BUG_ON(inode_has_buffers(inode)); 268 BUG_ON(inode_has_buffers(inode));
@@ -232,16 +274,25 @@ void __destroy_inode(struct inode *inode)
232 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 274 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
233 posix_acl_release(inode->i_default_acl); 275 posix_acl_release(inode->i_default_acl);
234#endif 276#endif
277 this_cpu_dec(nr_inodes);
235} 278}
236EXPORT_SYMBOL(__destroy_inode); 279EXPORT_SYMBOL(__destroy_inode);
237 280
238void destroy_inode(struct inode *inode) 281static void i_callback(struct rcu_head *head)
239{ 282{
283 struct inode *inode = container_of(head, struct inode, i_rcu);
284 INIT_LIST_HEAD(&inode->i_dentry);
285 kmem_cache_free(inode_cachep, inode);
286}
287
288static void destroy_inode(struct inode *inode)
289{
290 BUG_ON(!list_empty(&inode->i_lru));
240 __destroy_inode(inode); 291 __destroy_inode(inode);
241 if (inode->i_sb->s_op->destroy_inode) 292 if (inode->i_sb->s_op->destroy_inode)
242 inode->i_sb->s_op->destroy_inode(inode); 293 inode->i_sb->s_op->destroy_inode(inode);
243 else 294 else
244 kmem_cache_free(inode_cachep, (inode)); 295 call_rcu(&inode->i_rcu, i_callback);
245} 296}
246 297
247/* 298/*
@@ -255,6 +306,8 @@ void inode_init_once(struct inode *inode)
255 INIT_HLIST_NODE(&inode->i_hash); 306 INIT_HLIST_NODE(&inode->i_hash);
256 INIT_LIST_HEAD(&inode->i_dentry); 307 INIT_LIST_HEAD(&inode->i_dentry);
257 INIT_LIST_HEAD(&inode->i_devices); 308 INIT_LIST_HEAD(&inode->i_devices);
309 INIT_LIST_HEAD(&inode->i_wb_list);
310 INIT_LIST_HEAD(&inode->i_lru);
258 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 311 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
259 spin_lock_init(&inode->i_data.tree_lock); 312 spin_lock_init(&inode->i_data.tree_lock);
260 spin_lock_init(&inode->i_data.i_mmap_lock); 313 spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -281,13 +334,108 @@ static void init_once(void *foo)
281 */ 334 */
282void __iget(struct inode *inode) 335void __iget(struct inode *inode)
283{ 336{
284 if (atomic_inc_return(&inode->i_count) != 1) 337 atomic_inc(&inode->i_count);
285 return; 338}
339
340/*
341 * get additional reference to inode; caller must already hold one.
342 */
343void ihold(struct inode *inode)
344{
345 WARN_ON(atomic_inc_return(&inode->i_count) < 2);
346}
347EXPORT_SYMBOL(ihold);
348
349static void inode_lru_list_add(struct inode *inode)
350{
351 if (list_empty(&inode->i_lru)) {
352 list_add(&inode->i_lru, &inode_lru);
353 inodes_stat.nr_unused++;
354 }
355}
356
357static void inode_lru_list_del(struct inode *inode)
358{
359 if (!list_empty(&inode->i_lru)) {
360 list_del_init(&inode->i_lru);
361 inodes_stat.nr_unused--;
362 }
363}
364
365static inline void __inode_sb_list_add(struct inode *inode)
366{
367 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
368}
369
370/**
371 * inode_sb_list_add - add inode to the superblock list of inodes
372 * @inode: inode to add
373 */
374void inode_sb_list_add(struct inode *inode)
375{
376 spin_lock(&inode_lock);
377 __inode_sb_list_add(inode);
378 spin_unlock(&inode_lock);
379}
380EXPORT_SYMBOL_GPL(inode_sb_list_add);
381
382static inline void __inode_sb_list_del(struct inode *inode)
383{
384 list_del_init(&inode->i_sb_list);
385}
386
387static unsigned long hash(struct super_block *sb, unsigned long hashval)
388{
389 unsigned long tmp;
390
391 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
392 L1_CACHE_BYTES;
393 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
394 return tmp & I_HASHMASK;
395}
396
397/**
398 * __insert_inode_hash - hash an inode
399 * @inode: unhashed inode
400 * @hashval: unsigned long value used to locate this object in the
401 * inode_hashtable.
402 *
403 * Add an inode to the inode hash for this superblock.
404 */
405void __insert_inode_hash(struct inode *inode, unsigned long hashval)
406{
407 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
408
409 spin_lock(&inode_lock);
410 hlist_add_head(&inode->i_hash, b);
411 spin_unlock(&inode_lock);
412}
413EXPORT_SYMBOL(__insert_inode_hash);
414
415/**
416 * __remove_inode_hash - remove an inode from the hash
417 * @inode: inode to unhash
418 *
419 * Remove an inode from the superblock.
420 */
421static void __remove_inode_hash(struct inode *inode)
422{
423 hlist_del_init(&inode->i_hash);
424}
286 425
287 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 426/**
288 list_move(&inode->i_list, &inode_in_use); 427 * remove_inode_hash - remove an inode from the hash
289 inodes_stat.nr_unused--; 428 * @inode: inode to unhash
429 *
430 * Remove an inode from the superblock.
431 */
432void remove_inode_hash(struct inode *inode)
433{
434 spin_lock(&inode_lock);
435 hlist_del_init(&inode->i_hash);
436 spin_unlock(&inode_lock);
290} 437}
438EXPORT_SYMBOL(remove_inode_hash);
291 439
292void end_writeback(struct inode *inode) 440void end_writeback(struct inode *inode)
293{ 441{
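The relocated hash() helper mixes the superblock pointer into the inode number and folds the high bits back down before masking to a table index. A standalone rendition; GOLDEN_RATIO_PRIME is the 32-bit kernel constant of this era, while I_HASHBITS is arbitrary here because the kernel sizes the table at boot:

#include <stdio.h>

#define GOLDEN_RATIO_PRIME 0x9e370001UL	/* 32-bit kernel value */
#define I_HASHBITS 14			/* arbitrary for this demo */
#define I_HASHMASK ((1UL << I_HASHBITS) - 1)
#define L1_CACHE_BYTES 64

static unsigned long hash(unsigned long sb, unsigned long hashval)
{
	unsigned long tmp;

	tmp = (hashval * sb) ^ (GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
	return tmp & I_HASHMASK;
}

int main(void)
{
	/* the superblock address acts as a per-filesystem salt */
	printf("bucket = %lu\n", hash(0xdeadbeefUL, 1234));
	return 0;
}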
@@ -297,6 +445,7 @@ void end_writeback(struct inode *inode)
297 BUG_ON(!(inode->i_state & I_FREEING)); 445 BUG_ON(!(inode->i_state & I_FREEING));
298 BUG_ON(inode->i_state & I_CLEAR); 446 BUG_ON(inode->i_state & I_CLEAR);
299 inode_sync_wait(inode); 447 inode_sync_wait(inode);
448 /* don't need i_lock here, no concurrent mods to i_state */
300 inode->i_state = I_FREEING | I_CLEAR; 449 inode->i_state = I_FREEING | I_CLEAR;
301} 450}
302EXPORT_SYMBOL(end_writeback); 451EXPORT_SYMBOL(end_writeback);
@@ -327,101 +476,113 @@ static void evict(struct inode *inode)
327 */ 476 */
328static void dispose_list(struct list_head *head) 477static void dispose_list(struct list_head *head)
329{ 478{
330 int nr_disposed = 0;
331
332 while (!list_empty(head)) { 479 while (!list_empty(head)) {
333 struct inode *inode; 480 struct inode *inode;
334 481
335 inode = list_first_entry(head, struct inode, i_list); 482 inode = list_first_entry(head, struct inode, i_lru);
336 list_del(&inode->i_list); 483 list_del_init(&inode->i_lru);
337 484
338 evict(inode); 485 evict(inode);
339 486
340 spin_lock(&inode_lock); 487 spin_lock(&inode_lock);
341 hlist_del_init(&inode->i_hash); 488 __remove_inode_hash(inode);
342 list_del_init(&inode->i_sb_list); 489 __inode_sb_list_del(inode);
343 spin_unlock(&inode_lock); 490 spin_unlock(&inode_lock);
344 491
345 wake_up_inode(inode); 492 wake_up_inode(inode);
346 destroy_inode(inode); 493 destroy_inode(inode);
347 nr_disposed++;
348 } 494 }
349 spin_lock(&inode_lock);
350 inodes_stat.nr_inodes -= nr_disposed;
351 spin_unlock(&inode_lock);
352} 495}
353 496
354/* 497/**
355 * Invalidate all inodes for a device. 498 * evict_inodes - evict all evictable inodes for a superblock
499 * @sb: superblock to operate on
500 *
501 * Make sure that no inodes with zero refcount are retained. This is
502 * called by superblock shutdown after having MS_ACTIVE flag removed,
503 * so any inode reaching zero refcount during or after that call will
504 * be immediately evicted.
356 */ 505 */
357static int invalidate_list(struct list_head *head, struct list_head *dispose) 506void evict_inodes(struct super_block *sb)
358{ 507{
359 struct list_head *next; 508 struct inode *inode, *next;
360 int busy = 0, count = 0; 509 LIST_HEAD(dispose);
361
362 next = head->next;
363 for (;;) {
364 struct list_head *tmp = next;
365 struct inode *inode;
366 510
367 /* 511 down_write(&iprune_sem);
368 * We can reschedule here without worrying about the list's
369 * consistency because the per-sb list of inodes must not
370 * change during umount anymore, and because iprune_sem keeps
371 * shrink_icache_memory() away.
372 */
373 cond_resched_lock(&inode_lock);
374 512
375 next = next->next; 513 spin_lock(&inode_lock);
376 if (tmp == head) 514 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
377 break; 515 if (atomic_read(&inode->i_count))
378 inode = list_entry(tmp, struct inode, i_sb_list);
379 if (inode->i_state & I_NEW)
380 continue; 516 continue;
381 invalidate_inode_buffers(inode); 517
382 if (!atomic_read(&inode->i_count)) { 518 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
383 list_move(&inode->i_list, dispose); 519 WARN_ON(1);
384 WARN_ON(inode->i_state & I_NEW);
385 inode->i_state |= I_FREEING;
386 count++;
387 continue; 520 continue;
388 } 521 }
389 busy = 1; 522
523 inode->i_state |= I_FREEING;
524
525 /*
526 * Move the inode off the IO lists and LRU once I_FREEING is
527 * set so that it won't get moved back on there if it is dirty.
528 */
529 list_move(&inode->i_lru, &dispose);
530 list_del_init(&inode->i_wb_list);
531 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
532 inodes_stat.nr_unused--;
390 } 533 }
391 /* only unused inodes may be cached with i_count zero */ 534 spin_unlock(&inode_lock);
392 inodes_stat.nr_unused -= count; 535
393 return busy; 536 dispose_list(&dispose);
537 up_write(&iprune_sem);
394} 538}
395 539
396/** 540/**
397 * invalidate_inodes - discard the inodes on a device 541 * invalidate_inodes - attempt to free all inodes on a superblock
398 * @sb: superblock 542 * @sb: superblock to operate on
399 * 543 *
400 * Discard all of the inodes for a given superblock. If the discard 544 * Attempts to free all inodes for a given superblock. If there were any
401 * fails because there are busy inodes then a non zero value is returned. 545 * busy inodes return a non-zero value, else zero.
402 * If the discard is successful all the inodes have been discarded.
403 */ 546 */
404int invalidate_inodes(struct super_block *sb) 547int invalidate_inodes(struct super_block *sb)
405{ 548{
406 int busy; 549 int busy = 0;
407 LIST_HEAD(throw_away); 550 struct inode *inode, *next;
551 LIST_HEAD(dispose);
408 552
409 down_write(&iprune_sem); 553 down_write(&iprune_sem);
554
410 spin_lock(&inode_lock); 555 spin_lock(&inode_lock);
411 fsnotify_unmount_inodes(&sb->s_inodes); 556 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
412 busy = invalidate_list(&sb->s_inodes, &throw_away); 557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
558 continue;
559 if (atomic_read(&inode->i_count)) {
560 busy = 1;
561 continue;
562 }
563
564 inode->i_state |= I_FREEING;
565
566 /*
567 * Move the inode off the IO lists and LRU once I_FREEING is
568 * set so that it won't get moved back on there if it is dirty.
569 */
570 list_move(&inode->i_lru, &dispose);
571 list_del_init(&inode->i_wb_list);
572 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
573 inodes_stat.nr_unused--;
574 }
413 spin_unlock(&inode_lock); 575 spin_unlock(&inode_lock);
414 576
415 dispose_list(&throw_away); 577 dispose_list(&dispose);
416 up_write(&iprune_sem); 578 up_write(&iprune_sem);
417 579
418 return busy; 580 return busy;
419} 581}
420EXPORT_SYMBOL(invalidate_inodes);
421 582
422static int can_unuse(struct inode *inode) 583static int can_unuse(struct inode *inode)
423{ 584{
424 if (inode->i_state) 585 if (inode->i_state & ~I_REFERENCED)
425 return 0; 586 return 0;
426 if (inode_has_buffers(inode)) 587 if (inode_has_buffers(inode))
427 return 0; 588 return 0;
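evict_inodes() and invalidate_inodes() share one pattern: while holding inode_lock, unlink victims onto a private dispose list, then do the expensive teardown after dropping the lock. A standalone model using a plain singly linked list in place of list_head:

#include <stdio.h>
#include <stdlib.h>

struct node {
	int busy;
	struct node *next;
};

int main(void)
{
	struct node *head = NULL, *dispose = NULL, **pp, *n;
	int i;

	for (i = 0; i < 4; i++) {
		n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->busy = (i == 2);	/* one pinned entry must survive */
		n->next = head;
		head = n;
	}

	/* "locked" phase: unlink victims onto a private dispose list */
	for (pp = &head; (n = *pp) != NULL; ) {
		if (!n->busy) {
			*pp = n->next;
			n->next = dispose;
			dispose = n;
		} else {
			pp = &n->next;
		}
	}

	/* "unlocked" phase: expensive teardown without the lock held */
	while ((n = dispose) != NULL) {
		dispose = n->next;
		free(n);
		printf("evicted one entry\n");	/* runs three times */
	}
	return 0;
}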
@@ -433,22 +594,24 @@ static int can_unuse(struct inode *inode)
433} 594}
434 595
435/* 596/*
436 * Scan `goal' inodes on the unused list for freeable ones. They are moved to 597 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
437 * a temporary list and then are freed outside inode_lock by dispose_list(). 598 * temporary list and then are freed outside inode_lock by dispose_list().
438 * 599 *
439 * Any inodes which are pinned purely because of attached pagecache have their 600 * Any inodes which are pinned purely because of attached pagecache have their
440 * pagecache removed. We expect the final iput() on that inode to add it to 601 * pagecache removed. If the inode has metadata buffers attached to
441 * the front of the inode_unused list. So look for it there and if the 602 * mapping->private_list then try to remove them.
442 * inode is still freeable, proceed. The right inode is found 99.9% of the
443 * time in testing on a 4-way.
444 * 603 *
445 * If the inode has metadata buffers attached to mapping->private_list then 604 * If the inode has the I_REFERENCED flag set, then it means that it has been
446 * try to remove them. 605 * used recently - the flag is set in iput_final(). When we encounter such an
606 * inode, clear the flag and move it to the back of the LRU so it gets another
607 * pass through the LRU before it gets reclaimed. This is necessary
608 * because we are doing lazy LRU updates to minimise lock contention so the
609 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
610 * with this flag set because they are the inodes that are out of order.
447 */ 611 */
448static void prune_icache(int nr_to_scan) 612static void prune_icache(int nr_to_scan)
449{ 613{
450 LIST_HEAD(freeable); 614 LIST_HEAD(freeable);
451 int nr_pruned = 0;
452 int nr_scanned; 615 int nr_scanned;
453 unsigned long reap = 0; 616 unsigned long reap = 0;
454 617
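The comment block above describes a second-chance LRU: reclaim scans from the cold end, and an inode found with I_REFERENCED set has the flag cleared and survives the pass instead of being freed. A compact standalone model of the policy (list movement elided):

#include <stdio.h>

struct entry {
	int id;
	int referenced;	/* models I_REFERENCED */
};

int main(void)
{
	struct entry lru[4] = { {1, 0}, {2, 1}, {3, 0}, {4, 0} };
	int i;

	/* scan from the cold end of the list */
	for (i = 3; i >= 0; i--) {
		if (lru[i].referenced) {
			lru[i].referenced = 0;	/* clear and skip this pass */
			printf("entry %d gets a second chance\n", lru[i].id);
			continue;
		}
		printf("entry %d reclaimed\n", lru[i].id);
	}
	return 0;
}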
@@ -457,13 +620,26 @@ static void prune_icache(int nr_to_scan)
457 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 620 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
458 struct inode *inode; 621 struct inode *inode;
459 622
460 if (list_empty(&inode_unused)) 623 if (list_empty(&inode_lru))
461 break; 624 break;
462 625
463 inode = list_entry(inode_unused.prev, struct inode, i_list); 626 inode = list_entry(inode_lru.prev, struct inode, i_lru);
627
628 /*
629 * Referenced or dirty inodes are still in use. Give them
630 * another pass through the LRU as we cannot reclaim them now.
631 */
632 if (atomic_read(&inode->i_count) ||
633 (inode->i_state & ~I_REFERENCED)) {
634 list_del_init(&inode->i_lru);
635 inodes_stat.nr_unused--;
636 continue;
637 }
464 638
465 if (inode->i_state || atomic_read(&inode->i_count)) { 639 /* recently referenced inodes get one more pass */
466 list_move(&inode->i_list, &inode_unused); 640 if (inode->i_state & I_REFERENCED) {
641 list_move(&inode->i_lru, &inode_lru);
642 inode->i_state &= ~I_REFERENCED;
467 continue; 643 continue;
468 } 644 }
469 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 645 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -475,18 +651,23 @@ static void prune_icache(int nr_to_scan)
475 iput(inode); 651 iput(inode);
476 spin_lock(&inode_lock); 652 spin_lock(&inode_lock);
477 653
478 if (inode != list_entry(inode_unused.next, 654 if (inode != list_entry(inode_lru.next,
479 struct inode, i_list)) 655 struct inode, i_lru))
480 continue; /* wrong inode or list_empty */ 656 continue; /* wrong inode or list_empty */
481 if (!can_unuse(inode)) 657 if (!can_unuse(inode))
482 continue; 658 continue;
483 } 659 }
484 list_move(&inode->i_list, &freeable);
485 WARN_ON(inode->i_state & I_NEW); 660 WARN_ON(inode->i_state & I_NEW);
486 inode->i_state |= I_FREEING; 661 inode->i_state |= I_FREEING;
487 nr_pruned++; 662
663 /*
664 * Move the inode off the IO lists and LRU once I_FREEING is
665 * set so that it won't get moved back on there if it is dirty.
666 */
667 list_move(&inode->i_lru, &freeable);
668 list_del_init(&inode->i_wb_list);
669 inodes_stat.nr_unused--;
488 } 670 }
489 inodes_stat.nr_unused -= nr_pruned;
490 if (current_is_kswapd()) 671 if (current_is_kswapd())
491 __count_vm_events(KSWAPD_INODESTEAL, reap); 672 __count_vm_events(KSWAPD_INODESTEAL, reap);
492 else 673 else
@@ -518,7 +699,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
518 return -1; 699 return -1;
519 prune_icache(nr); 700 prune_icache(nr);
520 } 701 }
521 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 702 return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
522} 703}
523 704
524static struct shrinker icache_shrinker = { 705static struct shrinker icache_shrinker = {
@@ -529,9 +710,6 @@ static struct shrinker icache_shrinker = {
529static void __wait_on_freeing_inode(struct inode *inode); 710static void __wait_on_freeing_inode(struct inode *inode);
530/* 711/*
531 * Called with the inode lock held. 712 * Called with the inode lock held.
532 * NOTE: we are not increasing the inode-refcount, you must call __iget()
533 * by hand after calling find_inode now! This simplifies iunique and won't
534 * add any additional branch in the common code.
535 */ 713 */
536static struct inode *find_inode(struct super_block *sb, 714static struct inode *find_inode(struct super_block *sb,
537 struct hlist_head *head, 715 struct hlist_head *head,
@@ -551,9 +729,10 @@ repeat:
551 __wait_on_freeing_inode(inode); 729 __wait_on_freeing_inode(inode);
552 goto repeat; 730 goto repeat;
553 } 731 }
554 break; 732 __iget(inode);
733 return inode;
555 } 734 }
556 return node ? inode : NULL; 735 return NULL;
557} 736}
558 737
559/* 738/*
@@ -576,53 +755,49 @@ repeat:
576 __wait_on_freeing_inode(inode); 755 __wait_on_freeing_inode(inode);
577 goto repeat; 756 goto repeat;
578 } 757 }
579 break; 758 __iget(inode);
759 return inode;
580 } 760 }
581 return node ? inode : NULL; 761 return NULL;
582}
583
584static unsigned long hash(struct super_block *sb, unsigned long hashval)
585{
586 unsigned long tmp;
587
588 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
589 L1_CACHE_BYTES;
590 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
591 return tmp & I_HASHMASK;
592}
593
594static inline void
595__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
596 struct inode *inode)
597{
598 inodes_stat.nr_inodes++;
599 list_add(&inode->i_list, &inode_in_use);
600 list_add(&inode->i_sb_list, &sb->s_inodes);
601 if (head)
602 hlist_add_head(&inode->i_hash, head);
603} 762}
604 763
605/** 764/*
606 * inode_add_to_lists - add a new inode to relevant lists 765 * Each cpu owns a range of LAST_INO_BATCH numbers.
607 * @sb: superblock inode belongs to 766 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
608 * @inode: inode to mark in use 767 * to renew the exhausted range.
609 * 768 *
610 * When an inode is allocated it needs to be accounted for, added to the in use 769 * This does not significantly increase overflow rate because every CPU can
611 * list, the owning superblock and the inode hash. This needs to be done under 770 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
612 * the inode_lock, so export a function to do this rather than the inode lock 771 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
613 * itself. We calculate the hash list to add to here so it is all internal 772 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
614 * which requires the caller to have already set up the inode number in the 773 * overflow rate by 2x, which does not seem too significant.
615 * inode to add. 774 *
775 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
776 * error if st_ino won't fit in target struct field. Use 32bit counter
777 * here to attempt to avoid that.
616 */ 778 */
617void inode_add_to_lists(struct super_block *sb, struct inode *inode) 779#define LAST_INO_BATCH 1024
780static DEFINE_PER_CPU(unsigned int, last_ino);
781
782unsigned int get_next_ino(void)
618{ 783{
619 struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino); 784 unsigned int *p = &get_cpu_var(last_ino);
785 unsigned int res = *p;
620 786
621 spin_lock(&inode_lock); 787#ifdef CONFIG_SMP
622 __inode_add_to_lists(sb, head, inode); 788 if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
623 spin_unlock(&inode_lock); 789 static atomic_t shared_last_ino;
790 int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
791
792 res = next - LAST_INO_BATCH;
793 }
794#endif
795
796 *p = ++res;
797 put_cpu_var(last_ino);
798 return res;
624} 799}
625EXPORT_SYMBOL_GPL(inode_add_to_lists); 800EXPORT_SYMBOL(get_next_ino);
626 801
627/** 802/**
628 * new_inode - obtain an inode 803 * new_inode - obtain an inode
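get_next_ino() amortizes the shared atomic: a CPU touches shared_last_ino only once per LAST_INO_BATCH allocations, then hands out numbers from its private block. A single-threaded standalone model (the cpu argument and NR_CPUS are simulation props, not kernel API):

#include <stdio.h>

#define LAST_INO_BATCH 1024
#define NR_CPUS 2

static unsigned int shared_last_ino;		/* atomic_t in the kernel */
static unsigned int last_ino[NR_CPUS];		/* per-cpu in the kernel */

static unsigned int get_next_ino(int cpu)
{
	unsigned int res = last_ino[cpu];

	if ((res & (LAST_INO_BATCH - 1)) == 0) {
		/* private block exhausted: take a fresh one */
		shared_last_ino += LAST_INO_BATCH;
		res = shared_last_ino - LAST_INO_BATCH;
	}
	last_ino[cpu] = ++res;
	return res;
}

int main(void)
{
	unsigned int a = get_next_ino(0);
	unsigned int b = get_next_ino(0);
	unsigned int c = get_next_ino(1);

	/* CPU 0 hands out 1 and 2 from its block; CPU 1 starts at 1025 */
	printf("%u %u %u\n", a, b, c);
	return 0;
}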
@@ -638,12 +813,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
638 */ 813 */
639struct inode *new_inode(struct super_block *sb) 814struct inode *new_inode(struct super_block *sb)
640{ 815{
641 /*
642 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
643 * error if st_ino won't fit in target struct field. Use 32bit counter
644 * here to attempt to avoid that.
645 */
646 static unsigned int last_ino;
647 struct inode *inode; 816 struct inode *inode;
648 817
649 spin_lock_prefetch(&inode_lock); 818 spin_lock_prefetch(&inode_lock);
@@ -651,8 +820,7 @@ struct inode *new_inode(struct super_block *sb)
651 inode = alloc_inode(sb); 820 inode = alloc_inode(sb);
652 if (inode) { 821 if (inode) {
653 spin_lock(&inode_lock); 822 spin_lock(&inode_lock);
654 __inode_add_to_lists(sb, NULL, inode); 823 __inode_sb_list_add(inode);
655 inode->i_ino = ++last_ino;
656 inode->i_state = 0; 824 inode->i_state = 0;
657 spin_unlock(&inode_lock); 825 spin_unlock(&inode_lock);
658 } 826 }
@@ -663,7 +831,7 @@ EXPORT_SYMBOL(new_inode);
663void unlock_new_inode(struct inode *inode) 831void unlock_new_inode(struct inode *inode)
664{ 832{
665#ifdef CONFIG_DEBUG_LOCK_ALLOC 833#ifdef CONFIG_DEBUG_LOCK_ALLOC
666 if (inode->i_mode & S_IFDIR) { 834 if (S_ISDIR(inode->i_mode)) {
667 struct file_system_type *type = inode->i_sb->s_type; 835 struct file_system_type *type = inode->i_sb->s_type;
668 836
669 /* Set new key only if filesystem hasn't already changed it */ 837 /* Set new key only if filesystem hasn't already changed it */
@@ -720,7 +888,8 @@ static struct inode *get_new_inode(struct super_block *sb,
720 if (set(inode, data)) 888 if (set(inode, data))
721 goto set_failed; 889 goto set_failed;
722 890
723 __inode_add_to_lists(sb, head, inode); 891 hlist_add_head(&inode->i_hash, head);
892 __inode_sb_list_add(inode);
724 inode->i_state = I_NEW; 893 inode->i_state = I_NEW;
725 spin_unlock(&inode_lock); 894 spin_unlock(&inode_lock);
726 895
@@ -735,7 +904,6 @@ static struct inode *get_new_inode(struct super_block *sb,
735 * us. Use the old inode instead of the one we just 904 * us. Use the old inode instead of the one we just
736 * allocated. 905 * allocated.
737 */ 906 */
738 __iget(old);
739 spin_unlock(&inode_lock); 907 spin_unlock(&inode_lock);
740 destroy_inode(inode); 908 destroy_inode(inode);
741 inode = old; 909 inode = old;
@@ -767,7 +935,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
767 old = find_inode_fast(sb, head, ino); 935 old = find_inode_fast(sb, head, ino);
768 if (!old) { 936 if (!old) {
769 inode->i_ino = ino; 937 inode->i_ino = ino;
770 __inode_add_to_lists(sb, head, inode); 938 hlist_add_head(&inode->i_hash, head);
939 __inode_sb_list_add(inode);
771 inode->i_state = I_NEW; 940 inode->i_state = I_NEW;
772 spin_unlock(&inode_lock); 941 spin_unlock(&inode_lock);
773 942
@@ -782,7 +951,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
782 * us. Use the old inode instead of the one we just 951 * us. Use the old inode instead of the one we just
783 * allocated. 952 * allocated.
784 */ 953 */
785 __iget(old);
786 spin_unlock(&inode_lock); 954 spin_unlock(&inode_lock);
787 destroy_inode(inode); 955 destroy_inode(inode);
788 inode = old; 956 inode = old;
@@ -791,6 +959,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
791 return inode; 959 return inode;
792} 960}
793 961
962/*
963 * search the inode cache for a matching inode number.
964 * If we find one, then the inode number we are trying to
965 * allocate is not unique and so we should not use it.
966 *
967 * Returns 1 if the inode number is unique, 0 if it is not.
968 */
969static int test_inode_iunique(struct super_block *sb, unsigned long ino)
970{
971 struct hlist_head *b = inode_hashtable + hash(sb, ino);
972 struct hlist_node *node;
973 struct inode *inode;
974
975 hlist_for_each_entry(inode, node, b, i_hash) {
976 if (inode->i_ino == ino && inode->i_sb == sb)
977 return 0;
978 }
979
980 return 1;
981}
982
794/** 983/**
795 * iunique - get a unique inode number 984 * iunique - get a unique inode number
796 * @sb: superblock 985 * @sb: superblock
@@ -812,19 +1001,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
812 * error if st_ino won't fit in target struct field. Use 32bit counter 1001 * error if st_ino won't fit in target struct field. Use 32bit counter
813 * here to attempt to avoid that. 1002 * here to attempt to avoid that.
814 */ 1003 */
1004 static DEFINE_SPINLOCK(iunique_lock);
815 static unsigned int counter; 1005 static unsigned int counter;
816 struct inode *inode;
817 struct hlist_head *head;
818 ino_t res; 1006 ino_t res;
819 1007
820 spin_lock(&inode_lock); 1008 spin_lock(&inode_lock);
1009 spin_lock(&iunique_lock);
821 do { 1010 do {
822 if (counter <= max_reserved) 1011 if (counter <= max_reserved)
823 counter = max_reserved + 1; 1012 counter = max_reserved + 1;
824 res = counter++; 1013 res = counter++;
825 head = inode_hashtable + hash(sb, res); 1014 } while (!test_inode_iunique(sb, res));
826 inode = find_inode_fast(sb, head, res); 1015 spin_unlock(&iunique_lock);
827 } while (inode != NULL);
828 spin_unlock(&inode_lock); 1016 spin_unlock(&inode_lock);
829 1017
830 return res; 1018 return res;
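iunique() probes the inode cache until it finds an unclaimed number, with the static counter now guarded by its own iunique_lock instead of a hash walk under inode_lock alone. A standalone miniature with a bitmap standing in for the hash table:

#include <stdio.h>

#define MAX_INO 64
static unsigned char in_use[MAX_INO];	/* stands in for the inode hash */

static unsigned int toy_iunique(unsigned int max_reserved)
{
	static unsigned int counter;
	unsigned int res;

	do {
		if (counter <= max_reserved)
			counter = max_reserved + 1;
		res = counter++;
	} while (in_use[res % MAX_INO]);
	return res;
}

int main(void)
{
	in_use[11] = 1;				/* pretend ino 11 is cached */
	printf("%u\n", toy_iunique(10));	/* skips 11, prints 12 */
	return 0;
}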
@@ -876,7 +1064,6 @@ static struct inode *ifind(struct super_block *sb,
876 spin_lock(&inode_lock); 1064 spin_lock(&inode_lock);
877 inode = find_inode(sb, head, test, data); 1065 inode = find_inode(sb, head, test, data);
878 if (inode) { 1066 if (inode) {
879 __iget(inode);
880 spin_unlock(&inode_lock); 1067 spin_unlock(&inode_lock);
881 if (likely(wait)) 1068 if (likely(wait))
882 wait_on_inode(inode); 1069 wait_on_inode(inode);
@@ -909,7 +1096,6 @@ static struct inode *ifind_fast(struct super_block *sb,
909 spin_lock(&inode_lock); 1096 spin_lock(&inode_lock);
910 inode = find_inode_fast(sb, head, ino); 1097 inode = find_inode_fast(sb, head, ino);
911 if (inode) { 1098 if (inode) {
912 __iget(inode);
913 spin_unlock(&inode_lock); 1099 spin_unlock(&inode_lock);
914 wait_on_inode(inode); 1100 wait_on_inode(inode);
915 return inode; 1101 return inode;
@@ -1095,7 +1281,7 @@ int insert_inode_locked(struct inode *inode)
1095 __iget(old); 1281 __iget(old);
1096 spin_unlock(&inode_lock); 1282 spin_unlock(&inode_lock);
1097 wait_on_inode(old); 1283 wait_on_inode(old);
1098 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1284 if (unlikely(!inode_unhashed(old))) {
1099 iput(old); 1285 iput(old);
1100 return -EBUSY; 1286 return -EBUSY;
1101 } 1287 }
@@ -1134,7 +1320,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1134 __iget(old); 1320 __iget(old);
1135 spin_unlock(&inode_lock); 1321 spin_unlock(&inode_lock);
1136 wait_on_inode(old); 1322 wait_on_inode(old);
1137 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1323 if (unlikely(!inode_unhashed(old))) {
1138 iput(old); 1324 iput(old);
1139 return -EBUSY; 1325 return -EBUSY;
1140 } 1326 }
@@ -1143,36 +1329,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1143} 1329}
1144EXPORT_SYMBOL(insert_inode_locked4); 1330EXPORT_SYMBOL(insert_inode_locked4);
1145 1331
1146/**
1147 * __insert_inode_hash - hash an inode
1148 * @inode: unhashed inode
1149 * @hashval: unsigned long value used to locate this object in the
1150 * inode_hashtable.
1151 *
1152 * Add an inode to the inode hash for this superblock.
1153 */
1154void __insert_inode_hash(struct inode *inode, unsigned long hashval)
1155{
1156 struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
1157 spin_lock(&inode_lock);
1158 hlist_add_head(&inode->i_hash, head);
1159 spin_unlock(&inode_lock);
1160}
1161EXPORT_SYMBOL(__insert_inode_hash);
1162
1163/**
1164 * remove_inode_hash - remove an inode from the hash
1165 * @inode: inode to unhash
1166 *
1167 * Remove an inode from the superblock.
1168 */
1169void remove_inode_hash(struct inode *inode)
1170{
1171 spin_lock(&inode_lock);
1172 hlist_del_init(&inode->i_hash);
1173 spin_unlock(&inode_lock);
1174}
1175EXPORT_SYMBOL(remove_inode_hash);
1176 1332
1177int generic_delete_inode(struct inode *inode) 1333int generic_delete_inode(struct inode *inode)
1178{ 1334{
@@ -1187,7 +1343,7 @@ EXPORT_SYMBOL(generic_delete_inode);
1187 */ 1343 */
1188int generic_drop_inode(struct inode *inode) 1344int generic_drop_inode(struct inode *inode)
1189{ 1345{
1190 return !inode->i_nlink || hlist_unhashed(&inode->i_hash); 1346 return !inode->i_nlink || inode_unhashed(inode);
1191} 1347}
1192EXPORT_SYMBOL_GPL(generic_drop_inode); 1348EXPORT_SYMBOL_GPL(generic_drop_inode);
1193 1349
@@ -1213,10 +1369,11 @@ static void iput_final(struct inode *inode)
1213 drop = generic_drop_inode(inode); 1369 drop = generic_drop_inode(inode);
1214 1370
1215 if (!drop) { 1371 if (!drop) {
1216 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1217 list_move(&inode->i_list, &inode_unused);
1218 inodes_stat.nr_unused++;
1219 if (sb->s_flags & MS_ACTIVE) { 1372 if (sb->s_flags & MS_ACTIVE) {
1373 inode->i_state |= I_REFERENCED;
1374 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1375 inode_lru_list_add(inode);
1376 }
1220 spin_unlock(&inode_lock); 1377 spin_unlock(&inode_lock);
1221 return; 1378 return;
1222 } 1379 }
@@ -1227,19 +1384,23 @@ static void iput_final(struct inode *inode)
1227 spin_lock(&inode_lock); 1384 spin_lock(&inode_lock);
1228 WARN_ON(inode->i_state & I_NEW); 1385 WARN_ON(inode->i_state & I_NEW);
1229 inode->i_state &= ~I_WILL_FREE; 1386 inode->i_state &= ~I_WILL_FREE;
1230 inodes_stat.nr_unused--; 1387 __remove_inode_hash(inode);
1231 hlist_del_init(&inode->i_hash);
1232 } 1388 }
1233 list_del_init(&inode->i_list); 1389
1234 list_del_init(&inode->i_sb_list);
1235 WARN_ON(inode->i_state & I_NEW); 1390 WARN_ON(inode->i_state & I_NEW);
1236 inode->i_state |= I_FREEING; 1391 inode->i_state |= I_FREEING;
1237 inodes_stat.nr_inodes--; 1392
1393 /*
1394 * Move the inode off the IO lists and LRU once I_FREEING is
1395 * set so that it won't get moved back on there if it is dirty.
1396 */
1397 inode_lru_list_del(inode);
1398 list_del_init(&inode->i_wb_list);
1399
1400 __inode_sb_list_del(inode);
1238 spin_unlock(&inode_lock); 1401 spin_unlock(&inode_lock);
1239 evict(inode); 1402 evict(inode);
1240 spin_lock(&inode_lock); 1403 remove_inode_hash(inode);
1241 hlist_del_init(&inode->i_hash);
1242 spin_unlock(&inode_lock);
1243 wake_up_inode(inode); 1404 wake_up_inode(inode);
1244 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 1405 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1245 destroy_inode(inode); 1406 destroy_inode(inode);
diff --git a/fs/internal.h b/fs/internal.h
index a6910e91cee8..0663568b1247 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -63,12 +63,17 @@ extern int copy_mount_string(const void __user *, char **);
63 63
64extern void free_vfsmnt(struct vfsmount *); 64extern void free_vfsmnt(struct vfsmount *);
65extern struct vfsmount *alloc_vfsmnt(const char *); 65extern struct vfsmount *alloc_vfsmnt(const char *);
66extern unsigned int mnt_get_count(struct vfsmount *mnt);
66extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); 67extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
67extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
68 struct vfsmount *); 69 struct vfsmount *);
69extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
70extern void umount_tree(struct vfsmount *, int, struct list_head *); 71extern void umount_tree(struct vfsmount *, int, struct list_head *);
71extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); 72extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
73extern int finish_automount(struct vfsmount *, struct path *);
74
75extern void mnt_make_longterm(struct vfsmount *);
76extern void mnt_make_shortterm(struct vfsmount *);
72 77
73extern void __init mnt_init(void); 78extern void __init mnt_init(void);
74 79
@@ -101,3 +106,10 @@ extern void put_super(struct super_block *sb);
101struct nameidata; 106struct nameidata;
102extern struct file *nameidata_to_filp(struct nameidata *); 107extern struct file *nameidata_to_filp(struct nameidata *);
103extern void release_open_intent(struct nameidata *); 108extern void release_open_intent(struct nameidata *);
109
110/*
111 * inode.c
112 */
113extern int get_nr_dirty_inodes(void);
114extern void evict_inodes(struct super_block *);
115extern int invalidate_inodes(struct super_block *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f855ea4fc888..1eebeb72b202 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/syscalls.h> 7#include <linux/syscalls.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/smp_lock.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/file.h> 10#include <linux/file.h>
12#include <linux/fs.h> 11#include <linux/fs.h>
@@ -87,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
87 u64 phys, u64 len, u32 flags) 86 u64 phys, u64 len, u32 flags)
88{ 87{
89 struct fiemap_extent extent; 88 struct fiemap_extent extent;
90 struct fiemap_extent *dest = fieinfo->fi_extents_start; 89 struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
91 90
92 /* only count the extents */ 91 /* only count the extents */
93 if (fieinfo->fi_extents_max == 0) { 92 if (fieinfo->fi_extents_max == 0) {
@@ -174,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb,
174static int ioctl_fiemap(struct file *filp, unsigned long arg) 173static int ioctl_fiemap(struct file *filp, unsigned long arg)
175{ 174{
176 struct fiemap fiemap; 175 struct fiemap fiemap;
176 struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
177 struct fiemap_extent_info fieinfo = { 0, }; 177 struct fiemap_extent_info fieinfo = { 0, };
178 struct inode *inode = filp->f_path.dentry->d_inode; 178 struct inode *inode = filp->f_path.dentry->d_inode;
179 struct super_block *sb = inode->i_sb; 179 struct super_block *sb = inode->i_sb;
@@ -183,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
183 if (!inode->i_op->fiemap) 183 if (!inode->i_op->fiemap)
184 return -EOPNOTSUPP; 184 return -EOPNOTSUPP;
185 185
186 if (copy_from_user(&fiemap, (struct fiemap __user *)arg, 186 if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
187 sizeof(struct fiemap)))
188 return -EFAULT; 187 return -EFAULT;
189 188
190 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) 189 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
@@ -197,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
197 196
198 fieinfo.fi_flags = fiemap.fm_flags; 197 fieinfo.fi_flags = fiemap.fm_flags;
199 fieinfo.fi_extents_max = fiemap.fm_extent_count; 198 fieinfo.fi_extents_max = fiemap.fm_extent_count;
200 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); 199 fieinfo.fi_extents_start = ufiemap->fm_extents;
201 200
202 if (fiemap.fm_extent_count != 0 && 201 if (fiemap.fm_extent_count != 0 &&
203 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, 202 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
@@ -210,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
210 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); 209 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
211 fiemap.fm_flags = fieinfo.fi_flags; 210 fiemap.fm_flags = fieinfo.fi_flags;
212 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; 211 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
213 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) 212 if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
214 error = -EFAULT; 213 error = -EFAULT;
215 214
216 return error; 215 return error;
@@ -274,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode,
274 len = isize; 273 len = isize;
275 } 274 }
276 275
276 /*
277 * Some filesystems can't deal with being asked to map less than
278 * blocksize, so make sure our len is at least block length.
279 */
280 if (logical_to_blk(inode, len) == 0)
281 len = blk_to_logical(inode, 1);
282
277 start_blk = logical_to_blk(inode, start); 283 start_blk = logical_to_blk(inode, start);
278 last_blk = logical_to_blk(inode, start + len - 1); 284 last_blk = logical_to_blk(inode, start + len - 1);
279 285
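
The __user annotations and the ufiemap->fm_extents rewrite both describe the same layout: the caller hands in one struct fiemap whose variable-length fm_extents[] tail the kernel fills in place. A sketch of such a caller, assuming the standard uapi headers and with error handling trimmed:

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    int main(int argc, char **argv)
    {
            struct fiemap *fm;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0)
                    return 1;

            /* header plus room for 32 extents directly after it */
            fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
            if (!fm)
                    return 1;
            fm->fm_start = 0;
            fm->fm_length = ~0ULL;          /* map the whole file */
            fm->fm_extent_count = 32;

            if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
                    printf("%u extents mapped\n", fm->fm_mapped_extents);

            free(fm);
            close(fd);
            return 0;
    }
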
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc6..7da2a06508e5 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
103 } 103 }
104 104
105 ret = -ESRCH; 105 ret = -ESRCH;
106 /* 106 rcu_read_lock();
107 * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic",
108 * so we can't use rcu_read_lock(). See re-copy of ->ioprio
109 * in copy_process().
110 */
111 read_lock(&tasklist_lock);
112 switch (which) { 107 switch (which) {
113 case IOPRIO_WHO_PROCESS: 108 case IOPRIO_WHO_PROCESS:
114 if (!who) 109 if (!who)
@@ -153,7 +148,7 @@ free_uid:
153 ret = -EINVAL; 148 ret = -EINVAL;
154 } 149 }
155 150
156 read_unlock(&tasklist_lock); 151 rcu_read_unlock();
157 return ret; 152 return ret;
158} 153}
159 154
@@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
197 int ret = -ESRCH; 192 int ret = -ESRCH;
198 int tmpio; 193 int tmpio;
199 194
200 read_lock(&tasklist_lock); 195 rcu_read_lock();
201 switch (which) { 196 switch (which) {
202 case IOPRIO_WHO_PROCESS: 197 case IOPRIO_WHO_PROCESS:
203 if (!who) 198 if (!who)
@@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
250 ret = -EINVAL; 245 ret = -EINVAL;
251 } 246 }
252 247
253 read_unlock(&tasklist_lock); 248 rcu_read_unlock();
254 return ret; 249 return ret;
255} 250}
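
The tasklist_lock read lock can be dropped because task lookup and traversal are RCU-safe here, and ->ioprio no longer depends on the "atomic" re-copy that the deleted comment referred to. The pattern both syscalls now follow, sketched as a fragment in the context of ioprio_get() (p, who and tmpio are the function's own locals):

    rcu_read_lock();
    p = find_task_by_vpid(who);
    if (p)
            tmpio = get_task_ioprio(p);  /* must not sleep in the RCU section */
    rcu_read_unlock();
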
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index e0aca9a0ac68..0542b6eedf80 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
10 * 10 *
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h>
14#include <linux/gfp.h> 13#include <linux/gfp.h>
15#include "isofs.h" 14#include "isofs.h"
16 15
@@ -255,18 +254,19 @@ static int isofs_readdir(struct file *filp,
255 char *tmpname; 254 char *tmpname;
256 struct iso_directory_record *tmpde; 255 struct iso_directory_record *tmpde;
257 struct inode *inode = filp->f_path.dentry->d_inode; 256 struct inode *inode = filp->f_path.dentry->d_inode;
257 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
258 258
259 tmpname = (char *)__get_free_page(GFP_KERNEL); 259 tmpname = (char *)__get_free_page(GFP_KERNEL);
260 if (tmpname == NULL) 260 if (tmpname == NULL)
261 return -ENOMEM; 261 return -ENOMEM;
262 262
263 lock_kernel(); 263 mutex_lock(&sbi->s_mutex);
264 tmpde = (struct iso_directory_record *) (tmpname+1024); 264 tmpde = (struct iso_directory_record *) (tmpname+1024);
265 265
266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); 266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
267 267
268 free_page((unsigned long) tmpname); 268 free_page((unsigned long) tmpname);
269 unlock_kernel(); 269 mutex_unlock(&sbi->s_mutex);
270 return result; 270 return result;
271} 271}
272 272
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 5a44811b5027..a0f3833c0dbf 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -17,7 +17,6 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/nls.h> 18#include <linux/nls.h>
19#include <linux/ctype.h> 19#include <linux/ctype.h>
20#include <linux/smp_lock.h>
21#include <linux/statfs.h> 20#include <linux/statfs.h>
22#include <linux/cdrom.h> 21#include <linux/cdrom.h>
23#include <linux/parser.h> 22#include <linux/parser.h>
@@ -27,16 +26,32 @@
27 26
28#define BEQUIET 27#define BEQUIET
29 28
30static int isofs_hashi(struct dentry *parent, struct qstr *qstr); 29static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
31static int isofs_hash(struct dentry *parent, struct qstr *qstr); 30 struct qstr *qstr);
32static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b); 31static int isofs_hash(const struct dentry *parent, const struct inode *inode,
33static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b); 32 struct qstr *qstr);
33static int isofs_dentry_cmpi(const struct dentry *parent,
34 const struct inode *pinode,
35 const struct dentry *dentry, const struct inode *inode,
36 unsigned int len, const char *str, const struct qstr *name);
37static int isofs_dentry_cmp(const struct dentry *parent,
38 const struct inode *pinode,
39 const struct dentry *dentry, const struct inode *inode,
40 unsigned int len, const char *str, const struct qstr *name);
34 41
35#ifdef CONFIG_JOLIET 42#ifdef CONFIG_JOLIET
36static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr); 43static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
37static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr); 44 struct qstr *qstr);
38static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); 45static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
39static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); 46 struct qstr *qstr);
47static int isofs_dentry_cmpi_ms(const struct dentry *parent,
48 const struct inode *pinode,
49 const struct dentry *dentry, const struct inode *inode,
50 unsigned int len, const char *str, const struct qstr *name);
51static int isofs_dentry_cmp_ms(const struct dentry *parent,
52 const struct inode *pinode,
53 const struct dentry *dentry, const struct inode *inode,
54 unsigned int len, const char *str, const struct qstr *name);
40#endif 55#endif
41 56
42static void isofs_put_super(struct super_block *sb) 57static void isofs_put_super(struct super_block *sb)
@@ -44,11 +59,7 @@ static void isofs_put_super(struct super_block *sb)
44 struct isofs_sb_info *sbi = ISOFS_SB(sb); 59 struct isofs_sb_info *sbi = ISOFS_SB(sb);
45 60
46#ifdef CONFIG_JOLIET 61#ifdef CONFIG_JOLIET
47 lock_kernel();
48
49 unload_nls(sbi->s_nls_iocharset); 62 unload_nls(sbi->s_nls_iocharset);
50
51 unlock_kernel();
52#endif 63#endif
53 64
54 kfree(sbi); 65 kfree(sbi);
@@ -70,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb)
70 return &ei->vfs_inode; 81 return &ei->vfs_inode;
71} 82}
72 83
73static void isofs_destroy_inode(struct inode *inode) 84static void isofs_i_callback(struct rcu_head *head)
74{ 85{
86 struct inode *inode = container_of(head, struct inode, i_rcu);
87 INIT_LIST_HEAD(&inode->i_dentry);
75 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); 88 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
76} 89}
77 90
91static void isofs_destroy_inode(struct inode *inode)
92{
93 call_rcu(&inode->i_rcu, isofs_i_callback);
94}
95
78static void init_once(void *foo) 96static void init_once(void *foo)
79{ 97{
80 struct iso_inode_info *ei = foo; 98 struct iso_inode_info *ei = foo;
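
The two-step destroy_inode is the standard shape for RCU-freed inodes in this series: i_rcu shares a union with i_dentry, so the callback reinitializes the list head before returning the object to the slab. The same pattern for a hypothetical filesystem "foofs" (names are illustrative, not from this patch):

    static void foofs_i_callback(struct rcu_head *head)
    {
            struct inode *inode = container_of(head, struct inode, i_rcu);

            /* i_rcu is union'd with i_dentry; restore it before reuse */
            INIT_LIST_HEAD(&inode->i_dentry);
            kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
    }

    static void foofs_destroy_inode(struct inode *inode)
    {
            /* defer the actual free until after an RCU grace period */
            call_rcu(&inode->i_rcu, foofs_i_callback);
    }
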
@@ -165,7 +183,7 @@ struct iso9660_options{
165 * Compute the hash for the isofs name corresponding to the dentry. 183 * Compute the hash for the isofs name corresponding to the dentry.
166 */ 184 */
167static int 185static int
168isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) 186isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
169{ 187{
170 const char *name; 188 const char *name;
171 int len; 189 int len;
@@ -186,7 +204,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
186 * Compute the hash for the isofs name corresponding to the dentry. 204 * Compute the hash for the isofs name corresponding to the dentry.
187 */ 205 */
188static int 206static int
189isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) 207isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
190{ 208{
191 const char *name; 209 const char *name;
192 int len; 210 int len;
@@ -211,100 +229,94 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
211} 229}
212 230
213/* 231/*
214 * Case insensitive compare of two isofs names. 232 * Compare of two isofs names.
215 */
216static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a,
217 struct qstr *b, int ms)
218{
219 int alen, blen;
220
221 /* A filename cannot end in '.' or we treat it like it has none */
222 alen = a->len;
223 blen = b->len;
224 if (ms) {
225 while (alen && a->name[alen-1] == '.')
226 alen--;
227 while (blen && b->name[blen-1] == '.')
228 blen--;
229 }
230 if (alen == blen) {
231 if (strnicmp(a->name, b->name, alen) == 0)
232 return 0;
233 }
234 return 1;
235}
236
237/*
238 * Case sensitive compare of two isofs names.
239 */ 233 */
240static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a, 234static int isofs_dentry_cmp_common(
241 struct qstr *b, int ms) 235 unsigned int len, const char *str,
236 const struct qstr *name, int ms, int ci)
242{ 237{
243 int alen, blen; 238 int alen, blen;
244 239
245 /* A filename cannot end in '.' or we treat it like it has none */ 240 /* A filename cannot end in '.' or we treat it like it has none */
246 alen = a->len; 241 alen = name->len;
247 blen = b->len; 242 blen = len;
248 if (ms) { 243 if (ms) {
249 while (alen && a->name[alen-1] == '.') 244 while (alen && name->name[alen-1] == '.')
250 alen--; 245 alen--;
251 while (blen && b->name[blen-1] == '.') 246 while (blen && str[blen-1] == '.')
252 blen--; 247 blen--;
253 } 248 }
254 if (alen == blen) { 249 if (alen == blen) {
255 if (strncmp(a->name, b->name, alen) == 0) 250 if (ci) {
256 return 0; 251 if (strnicmp(name->name, str, alen) == 0)
252 return 0;
253 } else {
254 if (strncmp(name->name, str, alen) == 0)
255 return 0;
256 }
257 } 257 }
258 return 1; 258 return 1;
259} 259}
260 260
261static int 261static int
262isofs_hash(struct dentry *dentry, struct qstr *qstr) 262isofs_hash(const struct dentry *dentry, const struct inode *inode,
263 struct qstr *qstr)
263{ 264{
264 return isofs_hash_common(dentry, qstr, 0); 265 return isofs_hash_common(dentry, qstr, 0);
265} 266}
266 267
267static int 268static int
268isofs_hashi(struct dentry *dentry, struct qstr *qstr) 269isofs_hashi(const struct dentry *dentry, const struct inode *inode,
270 struct qstr *qstr)
269{ 271{
270 return isofs_hashi_common(dentry, qstr, 0); 272 return isofs_hashi_common(dentry, qstr, 0);
271} 273}
272 274
273static int 275static int
274isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b) 276isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
277 const struct dentry *dentry, const struct inode *inode,
278 unsigned int len, const char *str, const struct qstr *name)
275{ 279{
276 return isofs_dentry_cmp_common(dentry, a, b, 0); 280 return isofs_dentry_cmp_common(len, str, name, 0, 0);
277} 281}
278 282
279static int 283static int
280isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b) 284isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
285 const struct dentry *dentry, const struct inode *inode,
286 unsigned int len, const char *str, const struct qstr *name)
281{ 287{
282 return isofs_dentry_cmpi_common(dentry, a, b, 0); 288 return isofs_dentry_cmp_common(len, str, name, 0, 1);
283} 289}
284 290
285#ifdef CONFIG_JOLIET 291#ifdef CONFIG_JOLIET
286static int 292static int
287isofs_hash_ms(struct dentry *dentry, struct qstr *qstr) 293isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
294 struct qstr *qstr)
288{ 295{
289 return isofs_hash_common(dentry, qstr, 1); 296 return isofs_hash_common(dentry, qstr, 1);
290} 297}
291 298
292static int 299static int
293isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr) 300isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
301 struct qstr *qstr)
294{ 302{
295 return isofs_hashi_common(dentry, qstr, 1); 303 return isofs_hashi_common(dentry, qstr, 1);
296} 304}
297 305
298static int 306static int
299isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) 307isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
308 const struct dentry *dentry, const struct inode *inode,
309 unsigned int len, const char *str, const struct qstr *name)
300{ 310{
301 return isofs_dentry_cmp_common(dentry, a, b, 1); 311 return isofs_dentry_cmp_common(len, str, name, 1, 0);
302} 312}
303 313
304static int 314static int
305isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) 315isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
316 const struct dentry *dentry, const struct inode *inode,
317 unsigned int len, const char *str, const struct qstr *name)
306{ 318{
307 return isofs_dentry_cmpi_common(dentry, a, b, 1); 319 return isofs_dentry_cmp_common(len, str, name, 1, 1);
308} 320}
309#endif 321#endif
310 322
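
Under the new prototypes, d_compare() receives the candidate name as an explicit (len, str) pair plus the qstr being looked up, and may be called under RCU, so it should only examine its arguments. A minimal case-insensitive implementation under the new signature, for a hypothetical filesystem (strnicmp being the comparison helper of this era):

    static int example_d_compare(const struct dentry *parent,
                                 const struct inode *pinode,
                                 const struct dentry *dentry,
                                 const struct inode *inode,
                                 unsigned int len, const char *str,
                                 const struct qstr *name)
    {
            /* 0 means "match", anything else means "no match" */
            if (len != name->len)
                    return 1;
            return strnicmp(str, name->name, len) ? 1 : 0;
    }
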
@@ -549,6 +561,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
549} 561}
550 562
551/* 563/*
 564 * Check if the root directory is empty (has fewer than 3 files).
 565 *
 566 * Used to detect broken CDs where the ISO root directory is empty but the
 567 * Joliet root directory is OK. If such a CD has Rock Ridge extensions, they
 568 * are disabled (and Joliet used instead); otherwise no files would be visible.
569 */
570static bool rootdir_empty(struct super_block *sb, unsigned long block)
571{
572 int offset = 0, files = 0, de_len;
573 struct iso_directory_record *de;
574 struct buffer_head *bh;
575
576 bh = sb_bread(sb, block);
577 if (!bh)
578 return true;
579 while (files < 3) {
580 de = (struct iso_directory_record *) (bh->b_data + offset);
581 de_len = *(unsigned char *) de;
582 if (de_len == 0)
583 break;
584 files++;
585 offset += de_len;
586 }
587 brelse(bh);
588 return files < 3;
589}
590
591/*
552 * Initialize the superblock and read the root inode. 592 * Initialize the superblock and read the root inode.
553 * 593 *
554 * Note: a check_disk_change() has been done immediately prior 594 * Note: a check_disk_change() has been done immediately prior
@@ -823,6 +863,7 @@ root_found:
823 sbi->s_utf8 = opt.utf8; 863 sbi->s_utf8 = opt.utf8;
824 sbi->s_nocompress = opt.nocompress; 864 sbi->s_nocompress = opt.nocompress;
825 sbi->s_overriderockperm = opt.overriderockperm; 865 sbi->s_overriderockperm = opt.overriderockperm;
866 mutex_init(&sbi->s_mutex);
826 /* 867 /*
827 * It would be incredibly stupid to allow people to mark every file 868 * It would be incredibly stupid to allow people to mark every file
828 * on the disk as suid, so we merely allow them to set the default 869 * on the disk as suid, so we merely allow them to set the default
@@ -847,6 +888,18 @@ root_found:
847 goto out_no_root; 888 goto out_no_root;
848 889
849 /* 890 /*
891 * Fix for broken CDs with Rock Ridge and empty ISO root directory but
892 * correct Joliet root directory.
893 */
894 if (sbi->s_rock == 1 && joliet_level &&
895 rootdir_empty(s, sbi->s_firstdatazone)) {
896 printk(KERN_NOTICE
897 "ISOFS: primary root directory is empty. "
898 "Disabling Rock Ridge and switching to Joliet.");
899 sbi->s_rock = 0;
900 }
901
902 /*
850 * If this disk has both Rock Ridge and Joliet on it, then we 903 * If this disk has both Rock Ridge and Joliet on it, then we
851 * want to use Rock Ridge by default. This can be overridden 904 * want to use Rock Ridge by default. This can be overridden
852 * by using the norock mount option. There is still one other 905 * by using the norock mount option. There is still one other
@@ -886,17 +939,18 @@ root_found:
886 goto out_iput; 939 goto out_iput;
887 } 940 }
888 941
889 /* get the root dentry */
890 s->s_root = d_alloc_root(inode);
891 if (!(s->s_root))
892 goto out_no_root;
893
894 table = 0; 942 table = 0;
895 if (joliet_level) 943 if (joliet_level)
896 table += 2; 944 table += 2;
897 if (opt.check == 'r') 945 if (opt.check == 'r')
898 table++; 946 table++;
899 s->s_root->d_op = &isofs_dentry_ops[table]; 947
948 s->s_d_op = &isofs_dentry_ops[table];
949
950 /* get the root dentry */
951 s->s_root = d_alloc_root(inode);
952 if (!(s->s_root))
953 goto out_no_root;
900 954
901 kfree(opt.iocharset); 955 kfree(opt.iocharset);
902 956
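
The reordering here is deliberate: s_d_op must be chosen before d_alloc_root() runs, because once the superblock carries a default dentry-operations table, d_alloc() and d_alloc_root() attach it to every new dentry automatically. That is also why the per-dentry "dentry->d_op = ..." assignment disappears from isofs_lookup() in the namei.c hunk further down. Reduced to its core:

    /* pick the table first ... */
    s->s_d_op = &isofs_dentry_ops[table];
    /* ... so the root dentry (and everything under it) inherits it */
    s->s_root = d_alloc_root(inode);
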
@@ -966,27 +1020,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
966 * or getblk() if they are not. Returns the number of blocks inserted 1020 * or getblk() if they are not. Returns the number of blocks inserted
967 * (-ve == error.) 1021 * (-ve == error.)
968 */ 1022 */
969int isofs_get_blocks(struct inode *inode, sector_t iblock_s, 1023int isofs_get_blocks(struct inode *inode, sector_t iblock,
970 struct buffer_head **bh, unsigned long nblocks) 1024 struct buffer_head **bh, unsigned long nblocks)
971{ 1025{
972 unsigned long b_off; 1026 unsigned long b_off = iblock;
973 unsigned offset, sect_size; 1027 unsigned offset, sect_size;
974 unsigned int firstext; 1028 unsigned int firstext;
975 unsigned long nextblk, nextoff; 1029 unsigned long nextblk, nextoff;
976 long iblock = (long)iblock_s;
977 int section, rv, error; 1030 int section, rv, error;
978 struct iso_inode_info *ei = ISOFS_I(inode); 1031 struct iso_inode_info *ei = ISOFS_I(inode);
979 1032
980 lock_kernel();
981
982 error = -EIO; 1033 error = -EIO;
983 rv = 0; 1034 rv = 0;
984 if (iblock < 0 || iblock != iblock_s) { 1035 if (iblock != b_off) {
985 printk(KERN_DEBUG "%s: block number too large\n", __func__); 1036 printk(KERN_DEBUG "%s: block number too large\n", __func__);
986 goto abort; 1037 goto abort;
987 } 1038 }
988 1039
989 b_off = iblock;
990 1040
991 offset = 0; 1041 offset = 0;
992 firstext = ei->i_first_extent; 1042 firstext = ei->i_first_extent;
@@ -1004,8 +1054,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1004 * I/O errors. 1054 * I/O errors.
1005 */ 1055 */
1006 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { 1056 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
1007 printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n", 1057 printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
1008 __func__, iblock, (unsigned long) inode->i_size); 1058 __func__, b_off,
1059 (unsigned long long)inode->i_size);
1009 goto abort; 1060 goto abort;
1010 } 1061 }
1011 1062
@@ -1031,9 +1082,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1031 if (++section > 100) { 1082 if (++section > 100) {
1032 printk(KERN_DEBUG "%s: More than 100 file sections ?!?" 1083 printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
1033 " aborting...\n", __func__); 1084 " aborting...\n", __func__);
1034 printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u " 1085 printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
1035 "nextblk=%lu nextoff=%lu\n", __func__, 1086 "nextblk=%lu nextoff=%lu\n", __func__,
1036 iblock, firstext, (unsigned) sect_size, 1087 b_off, firstext, (unsigned) sect_size,
1037 nextblk, nextoff); 1088 nextblk, nextoff);
1038 goto abort; 1089 goto abort;
1039 } 1090 }
@@ -1054,7 +1105,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1054 1105
1055 error = 0; 1106 error = 0;
1056abort: 1107abort:
1057 unlock_kernel();
1058 return rv != 0 ? rv : error; 1108 return rv != 0 ? rv : error;
1059} 1109}
1060 1110
@@ -1475,17 +1525,16 @@ struct inode *isofs_iget(struct super_block *sb,
1475 return inode; 1525 return inode;
1476} 1526}
1477 1527
1478static int isofs_get_sb(struct file_system_type *fs_type, 1528static struct dentry *isofs_mount(struct file_system_type *fs_type,
1479 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1529 int flags, const char *dev_name, void *data)
1480{ 1530{
1481 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, 1531 return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
1482 mnt);
1483} 1532}
1484 1533
1485static struct file_system_type iso9660_fs_type = { 1534static struct file_system_type iso9660_fs_type = {
1486 .owner = THIS_MODULE, 1535 .owner = THIS_MODULE,
1487 .name = "iso9660", 1536 .name = "iso9660",
1488 .get_sb = isofs_get_sb, 1537 .mount = isofs_mount,
1489 .kill_sb = kill_block_super, 1538 .kill_sb = kill_block_super,
1490 .fs_flags = FS_REQUIRES_DEV, 1539 .fs_flags = FS_REQUIRES_DEV,
1491}; 1540};
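
This is the mechanical part of the tree-wide .get_sb to .mount conversion: the new hook returns the root dentry (or an ERR_PTR) instead of filling in a vfsmount. The same conversion for a hypothetical block-device filesystem, assuming a foofs_fill_super() of the usual fill_super type:

    static struct dentry *foofs_mount(struct file_system_type *fs_type,
                                      int flags, const char *dev_name,
                                      void *data)
    {
            return mount_bdev(fs_type, flags, dev_name, data,
                              foofs_fill_super);
    }

    static struct file_system_type foofs_fs_type = {
            .owner    = THIS_MODULE,
            .name     = "foofs",
            .mount    = foofs_mount,
            .kill_sb  = kill_block_super,
            .fs_flags = FS_REQUIRES_DEV,
    };
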
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 7d33de84f52a..2882dc089f87 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -55,6 +55,7 @@ struct isofs_sb_info {
55 gid_t s_gid; 55 gid_t s_gid;
56 uid_t s_uid; 56 uid_t s_uid;
57 struct nls_table *s_nls_iocharset; /* Native language support table */ 57 struct nls_table *s_nls_iocharset; /* Native language support table */
58 struct mutex s_mutex; /* replaces BKL, please remove if possible */
58}; 59};
59 60
60#define ISOFS_INVALID_MODE ((mode_t) -1) 61#define ISOFS_INVALID_MODE ((mode_t) -1)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index ab438beb867c..4fb3e8074fd4 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
6 * (C) 1991 Linus Torvalds - minix filesystem 6 * (C) 1991 Linus Torvalds - minix filesystem
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/gfp.h> 9#include <linux/gfp.h>
11#include "isofs.h" 10#include "isofs.h"
12 11
@@ -38,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
38 37
39 qstr.name = compare; 38 qstr.name = compare;
40 qstr.len = dlen; 39 qstr.len = dlen;
41 return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr); 40 return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
41 dentry->d_name.len, dentry->d_name.name, &qstr);
42} 42}
43 43
44/* 44/*
@@ -168,16 +168,15 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
168 int found; 168 int found;
169 unsigned long uninitialized_var(block); 169 unsigned long uninitialized_var(block);
170 unsigned long uninitialized_var(offset); 170 unsigned long uninitialized_var(offset);
171 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
171 struct inode *inode; 172 struct inode *inode;
172 struct page *page; 173 struct page *page;
173 174
174 dentry->d_op = dir->i_sb->s_root->d_op;
175
176 page = alloc_page(GFP_USER); 175 page = alloc_page(GFP_USER);
177 if (!page) 176 if (!page)
178 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
179 178
180 lock_kernel(); 179 mutex_lock(&sbi->s_mutex);
181 found = isofs_find_entry(dir, dentry, 180 found = isofs_find_entry(dir, dentry,
182 &block, &offset, 181 &block, &offset,
183 page_address(page), 182 page_address(page),
@@ -188,10 +187,10 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
188 if (found) { 187 if (found) {
189 inode = isofs_iget(dir->i_sb, block, offset); 188 inode = isofs_iget(dir->i_sb, block, offset);
190 if (IS_ERR(inode)) { 189 if (IS_ERR(inode)) {
191 unlock_kernel(); 190 mutex_unlock(&sbi->s_mutex);
192 return ERR_CAST(inode); 191 return ERR_CAST(inode);
193 } 192 }
194 } 193 }
195 unlock_kernel(); 194 mutex_unlock(&sbi->s_mutex);
196 return d_splice_alias(inode, dentry); 195 return d_splice_alias(inode, dentry);
197} 196}
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 96a685c550fd..f9cd04db6eab 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -8,7 +8,6 @@
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/smp_lock.h>
12 11
13#include "isofs.h" 12#include "isofs.h"
14#include "rock.h" 13#include "rock.h"
@@ -661,6 +660,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
661{ 660{
662 struct inode *inode = page->mapping->host; 661 struct inode *inode = page->mapping->host;
663 struct iso_inode_info *ei = ISOFS_I(inode); 662 struct iso_inode_info *ei = ISOFS_I(inode);
663 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
664 char *link = kmap(page); 664 char *link = kmap(page);
665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
666 struct buffer_head *bh; 666 struct buffer_head *bh;
@@ -673,12 +673,12 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
673 struct rock_state rs; 673 struct rock_state rs;
674 int ret; 674 int ret;
675 675
676 if (!ISOFS_SB(inode->i_sb)->s_rock) 676 if (!sbi->s_rock)
677 goto error; 677 goto error;
678 678
679 init_rock_state(&rs, inode); 679 init_rock_state(&rs, inode);
680 block = ei->i_iget5_block; 680 block = ei->i_iget5_block;
681 lock_kernel(); 681 mutex_lock(&sbi->s_mutex);
682 bh = sb_bread(inode->i_sb, block); 682 bh = sb_bread(inode->i_sb, block);
683 if (!bh) 683 if (!bh)
684 goto out_noread; 684 goto out_noread;
@@ -748,7 +748,7 @@ repeat:
748 goto fail; 748 goto fail;
749 brelse(bh); 749 brelse(bh);
750 *rpnt = '\0'; 750 *rpnt = '\0';
751 unlock_kernel(); 751 mutex_unlock(&sbi->s_mutex);
752 SetPageUptodate(page); 752 SetPageUptodate(page);
753 kunmap(page); 753 kunmap(page);
754 unlock_page(page); 754 unlock_page(page);
@@ -765,7 +765,7 @@ out_bad_span:
765 printk("symlink spans iso9660 blocks\n"); 765 printk("symlink spans iso9660 blocks\n");
766fail: 766fail:
767 brelse(bh); 767 brelse(bh);
768 unlock_kernel(); 768 mutex_unlock(&sbi->s_mutex);
769error: 769error:
770 SetPageError(page); 770 SetPageError(page);
771 kunmap(page); 771 kunmap(page);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 05a38b9c4c0e..e4b87bc1fa56 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -221,7 +221,7 @@ restart:
221 goto restart; 221 goto restart;
222 } 222 }
223 if (buffer_locked(bh)) { 223 if (buffer_locked(bh)) {
224 atomic_inc(&bh->b_count); 224 get_bh(bh);
225 spin_unlock(&journal->j_list_lock); 225 spin_unlock(&journal->j_list_lock);
226 jbd_unlock_bh_state(bh); 226 jbd_unlock_bh_state(bh);
227 wait_on_buffer(bh); 227 wait_on_buffer(bh);
@@ -283,7 +283,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
283 int ret = 0; 283 int ret = 0;
284 284
285 if (buffer_locked(bh)) { 285 if (buffer_locked(bh)) {
286 atomic_inc(&bh->b_count); 286 get_bh(bh);
287 spin_unlock(&journal->j_list_lock); 287 spin_unlock(&journal->j_list_lock);
288 jbd_unlock_bh_state(bh); 288 jbd_unlock_bh_state(bh);
289 wait_on_buffer(bh); 289 wait_on_buffer(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..34a4861c14b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -137,34 +137,10 @@ static int journal_write_commit_record(journal_t *journal,
137 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "write commit block");
138 set_buffer_dirty(bh); 138 set_buffer_dirty(bh);
139 139
140 if (journal->j_flags & JFS_BARRIER) { 140 if (journal->j_flags & JFS_BARRIER)
141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER); 141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
142 142 else
143 /*
144 * Is it possible for another commit to fail at roughly
145 * the same time as this one? If so, we don't want to
146 * trust the barrier flag in the super, but instead want
147 * to remember if we sent a barrier request
148 */
149 if (ret == -EOPNOTSUPP) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n",
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JFS_BARRIER;
158 spin_unlock(&journal->j_state_lock);
159
160 /* And try again, without the barrier */
161 set_buffer_uptodate(bh);
162 set_buffer_dirty(bh);
163 ret = sync_dirty_buffer(bh);
164 }
165 } else {
166 ret = sync_dirty_buffer(bh); 143 ret = sync_dirty_buffer(bh);
167 }
168 144
169 put_bh(bh); /* One for getblk() */ 145 put_bh(bh); /* One for getblk() */
170 journal_put_journal_head(descriptor); 146 journal_put_journal_head(descriptor);
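
The -EOPNOTSUPP fallback can go because the old ordered WRITE_BARRIER semantics are gone: WRITE_FLUSH_FUA requests an explicit cache flush plus a forced-unit-access write, and the block layer degrades those flags itself on devices without a volatile write cache instead of failing the request. Roughly (the exact flag composition lives in the block headers):

    /* WRITE_FLUSH_FUA is approximately WRITE_SYNC | REQ_FLUSH | REQ_FUA:
     * REQ_FLUSH drains the device write cache before this I/O,
     * REQ_FUA forces the block itself to stable media. */
    ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
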
@@ -318,7 +294,7 @@ void journal_commit_transaction(journal_t *journal)
318 int first_tag = 0; 294 int first_tag = 0;
319 int tag_flag; 295 int tag_flag;
320 int i; 296 int i;
321 int write_op = WRITE; 297 int write_op = WRITE_SYNC;
322 298
323 /* 299 /*
324 * First job: lock down the current transaction and wait for 300 * First job: lock down the current transaction and wait for
@@ -611,13 +587,13 @@ void journal_commit_transaction(journal_t *journal)
611 /* Bump b_count to prevent truncate from stumbling over 587 /* Bump b_count to prevent truncate from stumbling over
612 the shadowed buffer! @@@ This can go if we ever get 588 the shadowed buffer! @@@ This can go if we ever get
613 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 589 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
614 atomic_inc(&jh2bh(jh)->b_count); 590 get_bh(jh2bh(jh));
615 591
616 /* Make a temporary IO buffer with which to write it out 592 /* Make a temporary IO buffer with which to write it out
617 (this will requeue both the metadata buffer and the 593 (this will requeue both the metadata buffer and the
618 temporary IO buffer). new_bh goes on BJ_IO*/ 594 temporary IO buffer). new_bh goes on BJ_IO*/
619 595
620 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 596 set_buffer_jwrite(jh2bh(jh));
621 /* 597 /*
622 * akpm: journal_write_metadata_buffer() sets 598 * akpm: journal_write_metadata_buffer() sets
623 * new_bh->b_transaction to commit_transaction. 599 * new_bh->b_transaction to commit_transaction.
@@ -627,7 +603,7 @@ void journal_commit_transaction(journal_t *journal)
627 JBUFFER_TRACE(jh, "ph3: write metadata"); 603 JBUFFER_TRACE(jh, "ph3: write metadata");
628 flags = journal_write_metadata_buffer(commit_transaction, 604 flags = journal_write_metadata_buffer(commit_transaction,
629 jh, &new_jh, blocknr); 605 jh, &new_jh, blocknr);
630 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 606 set_buffer_jwrite(jh2bh(new_jh));
631 wbuf[bufs++] = jh2bh(new_jh); 607 wbuf[bufs++] = jh2bh(new_jh);
632 608
633 /* Record the new block's tag in the current descriptor 609 /* Record the new block's tag in the current descriptor
@@ -737,7 +713,7 @@ wait_for_iobuf:
737 shadowed buffer */ 713 shadowed buffer */
738 jh = commit_transaction->t_shadow_list->b_tprev; 714 jh = commit_transaction->t_shadow_list->b_tprev;
739 bh = jh2bh(jh); 715 bh = jh2bh(jh);
740 clear_bit(BH_JWrite, &bh->b_state); 716 clear_buffer_jwrite(bh);
741 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 717 J_ASSERT_BH(bh, buffer_jbddirty(bh));
742 718
743 /* The metadata is now released for reuse, but we need 719 /* The metadata is now released for reuse, but we need
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2c4b1f109da9..da1b5e4ffce1 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -36,6 +36,7 @@
36#include <linux/poison.h> 36#include <linux/poison.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/ratelimit.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/page.h> 42#include <asm/page.h>
@@ -84,6 +85,7 @@ EXPORT_SYMBOL(journal_force_commit);
84 85
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 86static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno); 87static void __journal_abort_soft (journal_t *journal, int errno);
88static const char *journal_dev_name(journal_t *journal, char *buffer);
87 89
88/* 90/*
89 * Helper function used to manage commit timeouts 91 * Helper function used to manage commit timeouts
@@ -439,7 +441,7 @@ int __log_start_commit(journal_t *journal, tid_t target)
439 */ 441 */
440 if (!tid_geq(journal->j_commit_request, target)) { 442 if (!tid_geq(journal->j_commit_request, target)) {
441 /* 443 /*
442 * We want a new commit: OK, mark the request and wakup the 444 * We want a new commit: OK, mark the request and wakeup the
443 * commit thread. We do _not_ do the commit ourselves. 445 * commit thread. We do _not_ do the commit ourselves.
444 */ 446 */
445 447
@@ -950,6 +952,8 @@ int journal_create(journal_t *journal)
950 if (err) 952 if (err)
951 return err; 953 return err;
952 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 954 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
955 if (unlikely(!bh))
956 return -ENOMEM;
953 lock_buffer(bh); 957 lock_buffer(bh);
954 memset (bh->b_data, 0, journal->j_blocksize); 958 memset (bh->b_data, 0, journal->j_blocksize);
955 BUFFER_TRACE(bh, "marking dirty"); 959 BUFFER_TRACE(bh, "marking dirty");
@@ -1010,6 +1014,23 @@ void journal_update_superblock(journal_t *journal, int wait)
1010 goto out; 1014 goto out;
1011 } 1015 }
1012 1016
1017 if (buffer_write_io_error(bh)) {
1018 char b[BDEVNAME_SIZE];
1019 /*
1020 * Oh, dear. A previous attempt to write the journal
1021 * superblock failed. This could happen because the
1022 * USB device was yanked out. Or it could happen to
1023 * be a transient write error and maybe the block will
1024 * be remapped. Nothing we can do but to retry the
1025 * write and hope for the best.
1026 */
1027 printk(KERN_ERR "JBD: previous I/O error detected "
1028 "for journal superblock update for %s.\n",
1029 journal_dev_name(journal, b));
1030 clear_buffer_write_io_error(bh);
1031 set_buffer_uptodate(bh);
1032 }
1033
1013 spin_lock(&journal->j_state_lock); 1034 spin_lock(&journal->j_state_lock);
1014 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", 1035 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
1015 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1036 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1021,9 +1042,17 @@ void journal_update_superblock(journal_t *journal, int wait)
1021 1042
1022 BUFFER_TRACE(bh, "marking dirty"); 1043 BUFFER_TRACE(bh, "marking dirty");
1023 mark_buffer_dirty(bh); 1044 mark_buffer_dirty(bh);
1024 if (wait) 1045 if (wait) {
1025 sync_dirty_buffer(bh); 1046 sync_dirty_buffer(bh);
1026 else 1047 if (buffer_write_io_error(bh)) {
1048 char b[BDEVNAME_SIZE];
1049 printk(KERN_ERR "JBD: I/O error detected "
1050 "when updating journal superblock for %s.\n",
1051 journal_dev_name(journal, b));
1052 clear_buffer_write_io_error(bh);
1053 set_buffer_uptodate(bh);
1054 }
1055 } else
1027 write_dirty_buffer(bh, WRITE); 1056 write_dirty_buffer(bh, WRITE);
1028 1057
1029out: 1058out:
@@ -1719,7 +1748,6 @@ static void journal_destroy_journal_head_cache(void)
1719static struct journal_head *journal_alloc_journal_head(void) 1748static struct journal_head *journal_alloc_journal_head(void)
1720{ 1749{
1721 struct journal_head *ret; 1750 struct journal_head *ret;
1722 static unsigned long last_warning;
1723 1751
1724#ifdef CONFIG_JBD_DEBUG 1752#ifdef CONFIG_JBD_DEBUG
1725 atomic_inc(&nr_journal_heads); 1753 atomic_inc(&nr_journal_heads);
@@ -1727,11 +1755,9 @@ static struct journal_head *journal_alloc_journal_head(void)
1727 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1755 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
1728 if (ret == NULL) { 1756 if (ret == NULL) {
1729 jbd_debug(1, "out of memory for journal_head\n"); 1757 jbd_debug(1, "out of memory for journal_head\n");
1730 if (time_after(jiffies, last_warning + 5*HZ)) { 1758 printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1731 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", 1759 __func__);
1732 __func__); 1760
1733 last_warning = jiffies;
1734 }
1735 while (ret == NULL) { 1761 while (ret == NULL) {
1736 yield(); 1762 yield();
1737 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); 1763 ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
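
printk_ratelimited() (hence the new <linux/ratelimit.h> include at the top of this file) keeps static per-callsite ratelimit state, so the hand-rolled "last_warning + 5*HZ" throttle collapses to a single call:

    /* one line replaces the jiffies bookkeeping deleted above */
    printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", __func__);
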
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 81051dafebf5..5b43e96788e6 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -296,10 +296,10 @@ int journal_skip_recovery(journal_t *journal)
296#ifdef CONFIG_JBD_DEBUG 296#ifdef CONFIG_JBD_DEBUG
297 int dropped = info.end_transaction - 297 int dropped = info.end_transaction -
298 be32_to_cpu(journal->j_superblock->s_sequence); 298 be32_to_cpu(journal->j_superblock->s_sequence);
299#endif
300 jbd_debug(1, 299 jbd_debug(1,
301 "JBD: ignoring %d transaction%s from the journal.\n", 300 "JBD: ignoring %d transaction%s from the journal.\n",
302 dropped, (dropped == 1) ? "" : "s"); 301 dropped, (dropped == 1) ? "" : "s");
302#endif
303 journal->j_transaction_sequence = ++info.end_transaction; 303 journal->j_transaction_sequence = ++info.end_transaction;
304 } 304 }
305 305
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5ae71e75a491..5b2e4c30a2a1 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -207,7 +207,7 @@ repeat_locked:
207 * the committing transaction. Really, we only need to give it 207 * the committing transaction. Really, we only need to give it
208 * committing_transaction->t_outstanding_credits plus "enough" for 208 * committing_transaction->t_outstanding_credits plus "enough" for
209 * the log control blocks. 209 * the log control blocks.
210 * Also, this test is inconsitent with the matching one in 210 * Also, this test is inconsistent with the matching one in
211 * journal_extend(). 211 * journal_extend().
212 */ 212 */
213 if (__log_space_left(journal) < jbd_space_needed(journal)) { 213 if (__log_space_left(journal) < jbd_space_needed(journal)) {
@@ -293,9 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
293 jbd_free_handle(handle); 293 jbd_free_handle(handle);
294 current->journal_info = NULL; 294 current->journal_info = NULL;
295 handle = ERR_PTR(err); 295 handle = ERR_PTR(err);
296 goto out;
297 } 296 }
298out:
299 return handle; 297 return handle;
300} 298}
301 299
@@ -528,7 +526,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
528 transaction = handle->h_transaction; 526 transaction = handle->h_transaction;
529 journal = transaction->t_journal; 527 journal = transaction->t_journal;
530 528
531 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 529 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
532 530
533 JBUFFER_TRACE(jh, "entry"); 531 JBUFFER_TRACE(jh, "entry");
534repeat: 532repeat:
@@ -713,7 +711,7 @@ done:
713 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 711 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
714 "Possible IO failure.\n"); 712 "Possible IO failure.\n");
715 page = jh2bh(jh)->b_page; 713 page = jh2bh(jh)->b_page;
716 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 714 offset = offset_in_page(jh2bh(jh)->b_data);
717 source = kmap_atomic(page, KM_USER0); 715 source = kmap_atomic(page, KM_USER0);
718 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 716 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
719 kunmap_atomic(source, KM_USER0); 717 kunmap_atomic(source, KM_USER0);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb4..6a79fd0a1a32 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -299,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
299 transaction->t_chp_stats.cs_forced_to_close++; 299 transaction->t_chp_stats.cs_forced_to_close++;
300 spin_unlock(&journal->j_list_lock); 300 spin_unlock(&journal->j_list_lock);
301 jbd_unlock_bh_state(bh); 301 jbd_unlock_bh_state(bh);
302 if (unlikely(journal->j_flags & JBD2_UNMOUNT))
303 /*
 304 * The journal thread is dead, so starting and
305 * waiting for a commit to finish will cause
306 * us to wait for a _very_ long time.
307 */
308 printk(KERN_ERR "JBD2: %s: "
309 "Waiting for Godot: block %llu\n",
310 journal->j_devname,
311 (unsigned long long) bh->b_blocknr);
302 jbd2_log_start_commit(journal, tid); 312 jbd2_log_start_commit(journal, tid);
303 jbd2_log_wait_commit(journal, tid); 313 jbd2_log_wait_commit(journal, tid);
304 ret = 1; 314 ret = 1;
@@ -532,8 +542,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
532 */ 542 */
533 if ((journal->j_fs_dev != journal->j_dev) && 543 if ((journal->j_fs_dev != journal->j_dev) &&
534 (journal->j_flags & JBD2_BARRIER)) 544 (journal->j_flags & JBD2_BARRIER))
535 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 545 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
536 BLKDEV_IFL_WAIT);
537 if (!(journal->j_flags & JBD2_ABORT)) 546 if (!(journal->j_flags & JBD2_ABORT))
538 jbd2_journal_update_superblock(journal, 1); 547 jbd2_journal_update_superblock(journal, 1);
539 return 0; 548 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..f3ad1598b201 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -26,7 +26,9 @@
26#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
27#include <linux/bio.h> 27#include <linux/bio.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/bitops.h>
29#include <trace/events/jbd2.h> 30#include <trace/events/jbd2.h>
31#include <asm/system.h>
30 32
31/* 33/*
32 * Default IO end handler for temporary BJ_IO buffer_heads. 34 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -134,25 +136,11 @@ static int journal_submit_commit_record(journal_t *journal,
134 136
135 if (journal->j_flags & JBD2_BARRIER && 137 if (journal->j_flags & JBD2_BARRIER &&
136 !JBD2_HAS_INCOMPAT_FEATURE(journal, 138 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh); 140 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
139 if (ret == -EOPNOTSUPP) { 141 else
140 printk(KERN_WARNING
141 "JBD2: Disabling barriers on %s, "
142 "not supported by device\n", journal->j_devname);
143 write_lock(&journal->j_state_lock);
144 journal->j_flags &= ~JBD2_BARRIER;
145 write_unlock(&journal->j_state_lock);
146
147 /* And try again, without the barrier */
148 lock_buffer(bh);
149 set_buffer_uptodate(bh);
150 clear_buffer_dirty(bh);
151 ret = submit_bh(WRITE_SYNC_PLUG, bh);
152 }
153 } else {
154 ret = submit_bh(WRITE_SYNC_PLUG, bh); 142 ret = submit_bh(WRITE_SYNC_PLUG, bh);
155 } 143
156 *cbh = bh; 144 *cbh = bh;
157 return ret; 145 return ret;
158} 146}
@@ -166,29 +154,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
166{ 154{
167 int ret = 0; 155 int ret = 0;
168 156
169retry:
170 clear_buffer_dirty(bh); 157 clear_buffer_dirty(bh);
171 wait_on_buffer(bh); 158 wait_on_buffer(bh);
172 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
173 printk(KERN_WARNING
174 "JBD2: %s: disabling barries on %s - not supported "
175 "by device\n", __func__, journal->j_devname);
176 write_lock(&journal->j_state_lock);
177 journal->j_flags &= ~JBD2_BARRIER;
178 write_unlock(&journal->j_state_lock);
179
180 lock_buffer(bh);
181 clear_buffer_dirty(bh);
182 set_buffer_uptodate(bh);
183 bh->b_end_io = journal_end_buffer_io_sync;
184
185 ret = submit_bh(WRITE_SYNC_PLUG, bh);
186 if (ret) {
187 unlock_buffer(bh);
188 return ret;
189 }
190 goto retry;
191 }
192 159
193 if (unlikely(!buffer_uptodate(bh))) 160 if (unlikely(!buffer_uptodate(bh)))
194 ret = -EIO; 161 ret = -EIO;
@@ -236,7 +203,7 @@ static int journal_submit_data_buffers(journal_t *journal,
236 spin_lock(&journal->j_list_lock); 203 spin_lock(&journal->j_list_lock);
237 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 204 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
238 mapping = jinode->i_vfs_inode->i_mapping; 205 mapping = jinode->i_vfs_inode->i_mapping;
239 jinode->i_flags |= JI_COMMIT_RUNNING; 206 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
240 spin_unlock(&journal->j_list_lock); 207 spin_unlock(&journal->j_list_lock);
241 /* 208 /*
242 * submit the inode data buffers. We use writepage 209 * submit the inode data buffers. We use writepage
@@ -251,7 +218,8 @@ static int journal_submit_data_buffers(journal_t *journal,
251 spin_lock(&journal->j_list_lock); 218 spin_lock(&journal->j_list_lock);
252 J_ASSERT(jinode->i_transaction == commit_transaction); 219 J_ASSERT(jinode->i_transaction == commit_transaction);
253 commit_transaction->t_flushed_data_blocks = 1; 220 commit_transaction->t_flushed_data_blocks = 1;
254 jinode->i_flags &= ~JI_COMMIT_RUNNING; 221 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
222 smp_mb__after_clear_bit();
255 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 223 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
256 } 224 }
257 spin_unlock(&journal->j_list_lock); 225 spin_unlock(&journal->j_list_lock);
@@ -272,7 +240,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
272 /* For locking, see the comment in journal_submit_data_buffers() */ 240 /* For locking, see the comment in journal_submit_data_buffers() */
273 spin_lock(&journal->j_list_lock); 241 spin_lock(&journal->j_list_lock);
274 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 242 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
275 jinode->i_flags |= JI_COMMIT_RUNNING; 243 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
276 spin_unlock(&journal->j_list_lock); 244 spin_unlock(&journal->j_list_lock);
277 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); 245 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
278 if (err) { 246 if (err) {
@@ -288,7 +256,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
288 ret = err; 256 ret = err;
289 } 257 }
290 spin_lock(&journal->j_list_lock); 258 spin_lock(&journal->j_list_lock);
291 jinode->i_flags &= ~JI_COMMIT_RUNNING; 259 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
260 smp_mb__after_clear_bit();
292 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 261 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
293 } 262 }
294 263
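
Making JI_COMMIT_RUNNING a real atomic bit (__JI_COMMIT_RUNNING) is what lets the wake_up_bit() pairing work; the smp_mb__after_clear_bit() is needed because clear_bit() is not a memory barrier, and the waiter must observe the clear before its test. A sketch of the waiter side, the shape jbd2_journal_release_jbd_inode() uses elsewhere in this file (locking around schedule() omitted):

    DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
    wait_queue_head_t *wq = bit_waitqueue(&jinode->i_flags,
                                          __JI_COMMIT_RUNNING);

    prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
    if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags))
            schedule();     /* woken by wake_up_bit() on the commit side */
    finish_wait(wq, &wait.wait);
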
@@ -360,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
360 int tag_bytes = journal_tag_bytes(journal); 329 int tag_bytes = journal_tag_bytes(journal);
361 struct buffer_head *cbh = NULL; /* For transactional checksums */ 330 struct buffer_head *cbh = NULL; /* For transactional checksums */
362 __u32 crc32_sum = ~0; 331 __u32 crc32_sum = ~0;
363 int write_op = WRITE; 332 int write_op = WRITE_SYNC;
364 333
365 /* 334 /*
366 * First job: lock down the current transaction and wait for 335 * First job: lock down the current transaction and wait for
@@ -701,6 +670,16 @@ start_journal_io:
701 } 670 }
702 } 671 }
703 672
673 err = journal_finish_inode_data_buffers(journal, commit_transaction);
674 if (err) {
675 printk(KERN_WARNING
676 "JBD2: Detected IO errors while flushing file data "
677 "on %s\n", journal->j_devname);
678 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
679 jbd2_journal_abort(journal, err);
680 err = 0;
681 }
682
704 /* 683 /*
705 * If the journal is not located on the file system device, 684 * If the journal is not located on the file system device,
706 * then we must flush the file system device before we issue 685 * then we must flush the file system device before we issue
@@ -709,8 +688,7 @@ start_journal_io:
709 if (commit_transaction->t_flushed_data_blocks && 688 if (commit_transaction->t_flushed_data_blocks &&
710 (journal->j_fs_dev != journal->j_dev) && 689 (journal->j_fs_dev != journal->j_dev) &&
711 (journal->j_flags & JBD2_BARRIER)) 690 (journal->j_flags & JBD2_BARRIER))
712 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 691 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
713 BLKDEV_IFL_WAIT);
714 692
715 /* Done it all: now write the commit record asynchronously. */ 693 /* Done it all: now write the commit record asynchronously. */
716 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 694 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -719,19 +697,6 @@ start_journal_io:
719 &cbh, crc32_sum); 697 &cbh, crc32_sum);
720 if (err) 698 if (err)
721 __jbd2_journal_abort_hard(journal); 699 __jbd2_journal_abort_hard(journal);
722 if (journal->j_flags & JBD2_BARRIER)
723 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
724 BLKDEV_IFL_WAIT);
725 }
726
727 err = journal_finish_inode_data_buffers(journal, commit_transaction);
728 if (err) {
729 printk(KERN_WARNING
730 "JBD2: Detected IO errors while flushing file data "
731 "on %s\n", journal->j_devname);
732 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
733 jbd2_journal_abort(journal, err);
734 err = 0;
735 } 700 }
736 701
737 /* Lo and behold: we have just managed to send a transaction to 702 /* Lo and behold: we have just managed to send a transaction to
@@ -845,6 +810,11 @@ wait_for_iobuf:
845 } 810 }
846 if (!err && !is_journal_aborted(journal)) 811 if (!err && !is_journal_aborted(journal))
847 err = journal_wait_on_commit_record(journal, cbh); 812 err = journal_wait_on_commit_record(journal, cbh);
813 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
815 journal->j_flags & JBD2_BARRIER) {
816 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
817 }
848 818
849 if (err) 819 if (err)
850 jbd2_journal_abort(journal, err); 820 jbd2_journal_abort(journal, err);
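
The JI_COMMIT_RUNNING changes above move from plain bit arithmetic on i_flags to the kernel's bit-waitqueue pattern: the waker clears the bit atomically, issues smp_mb__after_clear_bit() so the clear is ordered before any waiter's re-test, then calls wake_up_bit(); the waiter pairs test_bit() with the bit waitqueue, exactly as jbd2_journal_release_jbd_inode() does further down. A minimal sketch of the pattern, with struct obj and __OBJ_BUSY as illustrative stand-ins rather than names from this patch:

    #include <linux/bitops.h>
    #include <linux/wait.h>
    #include <linux/sched.h>

    #define __OBJ_BUSY 0 /* bit number within obj->flags (illustrative) */

    struct obj {
            unsigned long flags;
    };

    /* Waker: clear the bit, order the clear, then wake bit-waiters. */
    static void obj_done(struct obj *o)
    {
            clear_bit(__OBJ_BUSY, &o->flags);
            smp_mb__after_clear_bit(); /* pairs with the waiter's test_bit() */
            wake_up_bit(&o->flags, __OBJ_BUSY);
    }

    /* Waiter: sleep until the bit is clear. */
    static void obj_wait(struct obj *o)
    {
            wait_queue_head_t *wq = bit_waitqueue(&o->flags, __OBJ_BUSY);
            DEFINE_WAIT_BIT(wait, &o->flags, __OBJ_BUSY);

            for (;;) {
                    prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
                    if (!test_bit(__OBJ_BUSY, &o->flags))
                            break;
                    schedule();
            }
            finish_wait(wq, &wait.wait);
    }

Without the barrier, the store clearing the bit could be reordered past the waiter's test on weakly ordered CPUs and the wakeup could be missed.
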
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0e8014ea6b94..9e4686900f18 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -42,12 +42,15 @@
42#include <linux/log2.h> 42#include <linux/log2.h>
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/bitops.h>
46#include <linux/ratelimit.h>
45 47
46#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
47#include <trace/events/jbd2.h> 49#include <trace/events/jbd2.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
50#include <asm/page.h> 52#include <asm/page.h>
53#include <asm/system.h>
51 54
52EXPORT_SYMBOL(jbd2_journal_extend); 55EXPORT_SYMBOL(jbd2_journal_extend);
53EXPORT_SYMBOL(jbd2_journal_stop); 56EXPORT_SYMBOL(jbd2_journal_stop);
@@ -91,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
91EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 94EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
92EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 95EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
93EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 96EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
97EXPORT_SYMBOL(jbd2_inode_cache);
94 98
95static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 99static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
96static void __journal_abort_soft (journal_t *journal, int errno); 100static void __journal_abort_soft (journal_t *journal, int errno);
@@ -478,7 +482,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
478 */ 482 */
479 if (!tid_geq(journal->j_commit_request, target)) { 483 if (!tid_geq(journal->j_commit_request, target)) {
480 /* 484 /*
481 * We want a new commit: OK, mark the request and wakup the 485 * We want a new commit: OK, mark the request and wakeup the
482 * commit thread. We do _not_ do the commit ourselves. 486 * commit thread. We do _not_ do the commit ourselves.
483 */ 487 */
484 488
@@ -825,7 +829,7 @@ static journal_t * journal_init_common (void)
825 829
826 journal = kzalloc(sizeof(*journal), GFP_KERNEL); 830 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
827 if (!journal) 831 if (!journal)
828 goto fail; 832 return NULL;
829 833
830 init_waitqueue_head(&journal->j_wait_transaction_locked); 834 init_waitqueue_head(&journal->j_wait_transaction_locked);
831 init_waitqueue_head(&journal->j_wait_logspace); 835 init_waitqueue_head(&journal->j_wait_logspace);
@@ -850,14 +854,12 @@ static journal_t * journal_init_common (void)
850 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); 854 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
851 if (err) { 855 if (err) {
852 kfree(journal); 856 kfree(journal);
853 goto fail; 857 return NULL;
854 } 858 }
855 859
856 spin_lock_init(&journal->j_history_lock); 860 spin_lock_init(&journal->j_history_lock);
857 861
858 return journal; 862 return journal;
859fail:
860 return NULL;
861} 863}
862 864
863/* jbd2_journal_init_dev and jbd2_journal_init_inode: 865/* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -897,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
897 899
898 /* journal descriptor can store up to n blocks -bzzz */ 900 /* journal descriptor can store up to n blocks -bzzz */
899 journal->j_blocksize = blocksize; 901 journal->j_blocksize = blocksize;
902 journal->j_dev = bdev;
903 journal->j_fs_dev = fs_dev;
904 journal->j_blk_offset = start;
905 journal->j_maxlen = len;
906 bdevname(journal->j_dev, journal->j_devname);
907 p = journal->j_devname;
908 while ((p = strchr(p, '/')))
909 *p = '!';
900 jbd2_stats_proc_init(journal); 910 jbd2_stats_proc_init(journal);
901 n = journal->j_blocksize / sizeof(journal_block_tag_t); 911 n = journal->j_blocksize / sizeof(journal_block_tag_t);
902 journal->j_wbufsize = n; 912 journal->j_wbufsize = n;
@@ -906,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
906 __func__); 916 __func__);
907 goto out_err; 917 goto out_err;
908 } 918 }
909 journal->j_dev = bdev;
910 journal->j_fs_dev = fs_dev;
911 journal->j_blk_offset = start;
912 journal->j_maxlen = len;
913 bdevname(journal->j_dev, journal->j_devname);
914 p = journal->j_devname;
915 while ((p = strchr(p, '/')))
916 *p = '!';
917 919
918 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 920 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
919 if (!bh) { 921 if (!bh) {
@@ -1371,6 +1373,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1371 1373
1372 if (!compat && !ro && !incompat) 1374 if (!compat && !ro && !incompat)
1373 return 1; 1375 return 1;
1376 /* Load journal superblock if it is not loaded yet. */
1377 if (journal->j_format_version == 0 &&
1378 journal_get_superblock(journal) != 0)
1379 return 0;
1374 if (journal->j_format_version == 1) 1380 if (journal->j_format_version == 1)
1375 return 0; 1381 return 0;
1376 1382
@@ -1832,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal)
1832 */ 1838 */
1833#define JBD2_MAX_SLABS 8 1839#define JBD2_MAX_SLABS 8
1834static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; 1840static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1835static DECLARE_MUTEX(jbd2_slab_create_sem);
1836 1841
1837static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { 1842static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1838 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", 1843 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
@@ -1853,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void)
1853 1858
1854static int jbd2_journal_create_slab(size_t size) 1859static int jbd2_journal_create_slab(size_t size)
1855{ 1860{
1861 static DEFINE_MUTEX(jbd2_slab_create_mutex);
1856 int i = order_base_2(size) - 10; 1862 int i = order_base_2(size) - 10;
1857 size_t slab_size; 1863 size_t slab_size;
1858 1864
@@ -1864,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size)
1864 1870
1865 if (unlikely(i < 0)) 1871 if (unlikely(i < 0))
1866 i = 0; 1872 i = 0;
1867 down(&jbd2_slab_create_sem); 1873 mutex_lock(&jbd2_slab_create_mutex);
1868 if (jbd2_slab[i]) { 1874 if (jbd2_slab[i]) {
1869 up(&jbd2_slab_create_sem); 1875 mutex_unlock(&jbd2_slab_create_mutex);
1870 return 0; /* Already created */ 1876 return 0; /* Already created */
1871 } 1877 }
1872 1878
1873 slab_size = 1 << (i+10); 1879 slab_size = 1 << (i+10);
1874 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, 1880 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1875 slab_size, 0, NULL); 1881 slab_size, 0, NULL);
1876 up(&jbd2_slab_create_sem); 1882 mutex_unlock(&jbd2_slab_create_mutex);
1877 if (!jbd2_slab[i]) { 1883 if (!jbd2_slab[i]) {
1878 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); 1884 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1879 return -ENOMEM; 1885 return -ENOMEM;
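
DECLARE_MUTEX() actually declared a counting semaphore initialized to 1, so down()/up() carried none of the strict ownership semantics or lockdep coverage of a real mutex; the hunk above swaps it for a function-local static DEFINE_MUTEX(). The transformation, in sketch form (create_something() is an illustrative name):

    #include <linux/mutex.h>

    /* Before: static DECLARE_MUTEX(sem); down(&sem); ... up(&sem); */
    static int create_something(void)
    {
            static DEFINE_MUTEX(create_mutex); /* scoped to its only user */

            mutex_lock(&create_mutex);
            /* ...slow path that must be single-threaded... */
            mutex_unlock(&create_mutex);
            return 0;
    }
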
@@ -1976,7 +1982,6 @@ static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1976static struct journal_head *journal_alloc_journal_head(void) 1982static struct journal_head *journal_alloc_journal_head(void)
1977{ 1983{
1978 struct journal_head *ret; 1984 struct journal_head *ret;
1979 static unsigned long last_warning;
1980 1985
1981#ifdef CONFIG_JBD2_DEBUG 1986#ifdef CONFIG_JBD2_DEBUG
1982 atomic_inc(&nr_journal_heads); 1987 atomic_inc(&nr_journal_heads);
@@ -1984,11 +1989,7 @@ static struct journal_head *journal_alloc_journal_head(void)
1984 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1989 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1985 if (!ret) { 1990 if (!ret) {
1986 jbd_debug(1, "out of memory for journal_head\n"); 1991 jbd_debug(1, "out of memory for journal_head\n");
1987 if (time_after(jiffies, last_warning + 5*HZ)) { 1992 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
1988 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1989 __func__);
1990 last_warning = jiffies;
1991 }
1992 while (!ret) { 1993 while (!ret) {
1993 yield(); 1994 yield();
1994 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1995 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
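
pr_notice_ratelimited() (hence the new <linux/ratelimit.h> include at the top of this file's diff) replaces the hand-rolled once-per-5*HZ jiffies throttle; the default ratelimit state allows a short burst of messages per interval and silently drops the rest. A sketch, with demo_cache as an illustrative cache pointer:

    #include <linux/ratelimit.h>
    #include <linux/slab.h>

    static void *alloc_with_warning(struct kmem_cache *demo_cache)
    {
            void *p = kmem_cache_alloc(demo_cache, GFP_NOFS);

            if (!p) /* throttled internally; no static jiffies bookkeeping */
                    pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
            return p;
    }
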
@@ -2206,7 +2207,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
2206restart: 2207restart:
2207 spin_lock(&journal->j_list_lock); 2208 spin_lock(&journal->j_list_lock);
2208 /* Is commit writing out inode - we have to wait */ 2209 /* Is commit writing out inode - we have to wait */
2209 if (jinode->i_flags & JI_COMMIT_RUNNING) { 2210 if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
2210 wait_queue_head_t *wq; 2211 wait_queue_head_t *wq;
2211 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); 2212 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2212 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); 2213 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -2286,17 +2287,19 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2286 2287
2287#endif 2288#endif
2288 2289
2289struct kmem_cache *jbd2_handle_cache; 2290struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2290 2291
2291static int __init journal_init_handle_cache(void) 2292static int __init journal_init_handle_cache(void)
2292{ 2293{
2293 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", 2294 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2294 sizeof(handle_t),
2295 0, /* offset */
2296 SLAB_TEMPORARY, /* flags */
2297 NULL); /* ctor */
2298 if (jbd2_handle_cache == NULL) { 2295 if (jbd2_handle_cache == NULL) {
2299 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 2296 printk(KERN_EMERG "JBD2: failed to create handle cache\n");
2297 return -ENOMEM;
2298 }
2299 jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
2300 if (jbd2_inode_cache == NULL) {
2301 printk(KERN_EMERG "JBD2: failed to create inode cache\n");
2302 kmem_cache_destroy(jbd2_handle_cache);
2300 return -ENOMEM; 2303 return -ENOMEM;
2301 } 2304 }
2302 return 0; 2305 return 0;
@@ -2306,6 +2309,9 @@ static void jbd2_journal_destroy_handle_cache(void)
2306{ 2309{
2307 if (jbd2_handle_cache) 2310 if (jbd2_handle_cache)
2308 kmem_cache_destroy(jbd2_handle_cache); 2311 kmem_cache_destroy(jbd2_handle_cache);
2312 if (jbd2_inode_cache)
2313 kmem_cache_destroy(jbd2_inode_cache);
2314
2309} 2315}
2310 2316
2311/* 2317/*
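
KMEM_CACHE(struct_name, flags) expands to kmem_cache_create() with the cache name, object size, and alignment all derived from the struct itself, which is why the five-argument call collapses to one line above. An equivalent sketch (struct jbd2_demo is illustrative):

    #include <linux/slab.h>

    struct jbd2_demo {
            int a;
            void *b;
    };

    static struct kmem_cache *demo_cache;

    static int __init demo_cache_init(void)
    {
            /* Same as kmem_cache_create("jbd2_demo", sizeof(struct jbd2_demo),
             * __alignof__(struct jbd2_demo), 0, NULL); */
            demo_cache = KMEM_CACHE(jbd2_demo, 0);
            return demo_cache ? 0 : -ENOMEM;
    }
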
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 2bc4d5f116f1..1cad869494f0 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -299,10 +299,10 @@ int jbd2_journal_skip_recovery(journal_t *journal)
299#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
300 int dropped = info.end_transaction - 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence); 301 be32_to_cpu(journal->j_superblock->s_sequence);
302#endif
303 jbd_debug(1, 302 jbd_debug(1,
304 "JBD: ignoring %d transaction%s from the journal.\n", 303 "JBD: ignoring %d transaction%s from the journal.\n",
305 dropped, (dropped == 1) ? "" : "s"); 304 dropped, (dropped == 1) ? "" : "s");
305#endif
306 journal->j_transaction_sequence = ++info.end_transaction; 306 journal->j_transaction_sequence = ++info.end_transaction;
307 } 307 }
308 308
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index f3479d6e0a83..faad2bd787c7 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -156,6 +156,7 @@ alloc_transaction:
156 */ 156 */
157repeat: 157repeat:
158 read_lock(&journal->j_state_lock); 158 read_lock(&journal->j_state_lock);
159 BUG_ON(journal->j_flags & JBD2_UNMOUNT);
159 if (is_journal_aborted(journal) || 160 if (is_journal_aborted(journal) ||
160 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 161 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
161 read_unlock(&journal->j_state_lock); 162 read_unlock(&journal->j_state_lock);
@@ -250,7 +251,7 @@ repeat:
250 * the committing transaction. Really, we only need to give it 251 * the committing transaction. Really, we only need to give it
251 * committing_transaction->t_outstanding_credits plus "enough" for 252 * committing_transaction->t_outstanding_credits plus "enough" for
252 * the log control blocks. 253 * the log control blocks.
253 * Also, this test is inconsitent with the matching one in 254 * Also, this test is inconsistent with the matching one in
254 * jbd2_journal_extend(). 255 * jbd2_journal_extend().
255 */ 256 */
256 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { 257 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
@@ -339,9 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
339 jbd2_free_handle(handle); 340 jbd2_free_handle(handle);
340 current->journal_info = NULL; 341 current->journal_info = NULL;
341 handle = ERR_PTR(err); 342 handle = ERR_PTR(err);
342 goto out;
343 } 343 }
344out:
345 return handle; 344 return handle;
346} 345}
347EXPORT_SYMBOL(jbd2__journal_start); 346EXPORT_SYMBOL(jbd2__journal_start);
@@ -588,7 +587,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
588 transaction = handle->h_transaction; 587 transaction = handle->h_transaction;
589 journal = transaction->t_journal; 588 journal = transaction->t_journal;
590 589
591 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); 590 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
592 591
593 JBUFFER_TRACE(jh, "entry"); 592 JBUFFER_TRACE(jh, "entry");
594repeat: 593repeat:
@@ -773,7 +772,7 @@ done:
773 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 772 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
774 "Possible IO failure.\n"); 773 "Possible IO failure.\n");
775 page = jh2bh(jh)->b_page; 774 page = jh2bh(jh)->b_page;
776 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 775 offset = offset_in_page(jh2bh(jh)->b_data);
777 source = kmap_atomic(page, KM_USER0); 776 source = kmap_atomic(page, KM_USER0);
778 /* Fire data frozen trigger just before we copy the data */ 777 /* Fire data frozen trigger just before we copy the data */
779 jbd2_buffer_frozen_trigger(jh, source + offset, 778 jbd2_buffer_frozen_trigger(jh, source + offset,
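
offset_in_page(p) is simply ((unsigned long)p & ~PAGE_MASK), so the do_get_write_access() change above is cosmetic; with 4 KiB pages, offset_in_page((void *)0x12345678) is 0x678. Typical use next to an atomic kmap, sketched here (copy_from_bh() is an illustrative helper, and the two-argument kmap_atomic()/kunmap_atomic() form with KM_USER0 matches this kernel era):

    #include <linux/buffer_head.h>
    #include <linux/highmem.h>
    #include <linux/string.h>

    static void copy_from_bh(struct buffer_head *bh, void *dst, size_t len)
    {
            struct page *page = bh->b_page;
            size_t off = offset_in_page(bh->b_data); /* low bits of the address */
            char *src = kmap_atomic(page, KM_USER0);

            memcpy(dst, src + off, len);
            kunmap_atomic(src, KM_USER0);
    }
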
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 54a92fd02bbd..95b79672150a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
259 return rc; 259 return rc;
260} 260}
261 261
262int jffs2_check_acl(struct inode *inode, int mask) 262int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags)
263{ 263{
264 struct posix_acl *acl; 264 struct posix_acl *acl;
265 int rc; 265 int rc;
266 266
267 if (flags & IPERM_FLAG_RCU)
268 return -ECHILD;
269
267 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); 270 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
268 if (IS_ERR(acl)) 271 if (IS_ERR(acl))
269 return PTR_ERR(acl); 272 return PTR_ERR(acl);
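
The new unsigned int flags argument exists because permission checks can now be called during RCU-walk path lookup, where the caller holds no locks or references and must not sleep; returning -ECHILD tells the VFS to retry in ref-walk mode. Every ->check_acl() touched by this patch follows the same shape, sketched here (demo_get_acl() stands in for the filesystem's own ACL fetch):

    #include <linux/fs.h>
    #include <linux/posix_acl.h>

    static int demo_check_acl(struct inode *inode, int mask, unsigned int flags)
    {
            struct posix_acl *acl;

            if (flags & IPERM_FLAG_RCU)
                    return -ECHILD; /* may block below; ask for ref-walk retry */

            acl = demo_get_acl(inode, ACL_TYPE_ACCESS); /* illustrative helper */
            if (IS_ERR(acl))
                    return PTR_ERR(acl);
            if (acl) {
                    int rc = posix_acl_permission(inode, acl, mask);
                    posix_acl_release(acl);
                    return rc;
            }
            return -EAGAIN; /* no ACL: fall back to mode-bit checks */
    }
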
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 5e42de8d9541..3119f59253d3 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
26 26
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29extern int jffs2_check_acl(struct inode *, int); 29extern int jffs2_check_acl(struct inode *, int, unsigned int);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a906f538d11c..3005ec4520ad 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -23,7 +23,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
23static inline struct jffs2_inode_cache * 23static inline struct jffs2_inode_cache *
24first_inode_chain(int *i, struct jffs2_sb_info *c) 24first_inode_chain(int *i, struct jffs2_sb_info *c)
25{ 25{
26 for (; *i < INOCACHE_HASHSIZE; (*i)++) { 26 for (; *i < c->inocache_hashsize; (*i)++) {
27 if (c->inocache_list[*i]) 27 if (c->inocache_list[*i])
28 return c->inocache_list[*i]; 28 return c->inocache_list[*i];
29 } 29 }
@@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; 336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
337#ifndef __ECOS 337#ifndef __ECOS
338 if (jffs2_blocks_use_vmalloc(c)) 338 if (jffs2_blocks_use_vmalloc(c))
339 c->blocks = vmalloc(size); 339 c->blocks = vzalloc(size);
340 else 340 else
341#endif 341#endif
342 c->blocks = kmalloc(size, GFP_KERNEL); 342 c->blocks = kzalloc(size, GFP_KERNEL);
343 if (!c->blocks) 343 if (!c->blocks)
344 return -ENOMEM; 344 return -ENOMEM;
345 345
346 memset(c->blocks, 0, size);
347 for (i=0; i<c->nr_blocks; i++) { 346 for (i=0; i<c->nr_blocks; i++) {
348 INIT_LIST_HEAD(&c->blocks[i].list); 347 INIT_LIST_HEAD(&c->blocks[i].list);
349 c->blocks[i].offset = i * c->sector_size; 348 c->blocks[i].offset = i * c->sector_size;
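
vzalloc() and kzalloc() fold the zeroing into the allocation, which drops the separate memset() above and lets the allocator hand back already-clear pages where it can. The transformation, sketched as a helper (alloc_blocks() is illustrative):

    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    static void *alloc_blocks(size_t size, bool use_vmalloc)
    {
            /* Before: p = vmalloc(size) or kmalloc(size, GFP_KERNEL),
             * then memset(p, 0, size); */
            return use_vmalloc ? vzalloc(size) : kzalloc(size, GFP_KERNEL);
    }
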
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 617a1e5694c1..de4247021d25 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -103,7 +103,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
103 spin_unlock(&jffs2_compressor_list_lock); 103 spin_unlock(&jffs2_compressor_list_lock);
104 *datalen = orig_slen; 104 *datalen = orig_slen;
105 *cdatalen = orig_dlen; 105 *cdatalen = orig_dlen;
106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen, NULL); 106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
107 spin_lock(&jffs2_compressor_list_lock); 107 spin_lock(&jffs2_compressor_list_lock);
108 this->usecount--; 108 this->usecount--;
109 if (!compr_ret) { 109 if (!compr_ret) {
@@ -152,7 +152,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
152 spin_unlock(&jffs2_compressor_list_lock); 152 spin_unlock(&jffs2_compressor_list_lock);
153 *datalen = orig_slen; 153 *datalen = orig_slen;
154 *cdatalen = orig_dlen; 154 *cdatalen = orig_dlen;
155 compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen, NULL); 155 compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen);
156 spin_lock(&jffs2_compressor_list_lock); 156 spin_lock(&jffs2_compressor_list_lock);
157 this->usecount--; 157 this->usecount--;
158 if (!compr_ret) { 158 if (!compr_ret) {
@@ -220,7 +220,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
220 if (comprtype == this->compr) { 220 if (comprtype == this->compr) {
221 this->usecount++; 221 this->usecount++;
222 spin_unlock(&jffs2_compressor_list_lock); 222 spin_unlock(&jffs2_compressor_list_lock);
223 ret = this->decompress(cdata_in, data_out, cdatalen, datalen, NULL); 223 ret = this->decompress(cdata_in, data_out, cdatalen, datalen);
224 spin_lock(&jffs2_compressor_list_lock); 224 spin_lock(&jffs2_compressor_list_lock);
225 if (ret) { 225 if (ret) {
226 printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret); 226 printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret);
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index e471a9106fd9..13bb7597ab39 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -49,9 +49,9 @@ struct jffs2_compressor {
49 char *name; 49 char *name;
50 char compr; /* JFFS2_COMPR_XXX */ 50 char compr; /* JFFS2_COMPR_XXX */
51 int (*compress)(unsigned char *data_in, unsigned char *cpage_out, 51 int (*compress)(unsigned char *data_in, unsigned char *cpage_out,
52 uint32_t *srclen, uint32_t *destlen, void *model); 52 uint32_t *srclen, uint32_t *destlen);
53 int (*decompress)(unsigned char *cdata_in, unsigned char *data_out, 53 int (*decompress)(unsigned char *cdata_in, unsigned char *data_out,
54 uint32_t cdatalen, uint32_t datalen, void *model); 54 uint32_t cdatalen, uint32_t datalen);
55 int usecount; 55 int usecount;
56 int disabled; /* if set the compressor won't compress */ 56 int disabled; /* if set the compressor won't compress */
57 unsigned char *compr_buf; /* used by size compr. mode */ 57 unsigned char *compr_buf; /* used by size compr. mode */
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index ed25ae7c98eb..af186ee674d8 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -42,7 +42,7 @@ static int __init alloc_workspace(void)
42} 42}
43 43
44static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, 44static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
45 uint32_t *sourcelen, uint32_t *dstlen, void *model) 45 uint32_t *sourcelen, uint32_t *dstlen)
46{ 46{
47 size_t compress_size; 47 size_t compress_size;
48 int ret; 48 int ret;
@@ -67,7 +67,7 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
67} 67}
68 68
69static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out, 69static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
70 uint32_t srclen, uint32_t destlen, void *model) 70 uint32_t srclen, uint32_t destlen)
71{ 71{
72 size_t dl = destlen; 72 size_t dl = destlen;
73 int ret; 73 int ret;
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 9696ad9ef5f7..16a5047903a6 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -31,8 +31,7 @@
31/* _compress returns the compressed size, -1 if bigger */ 31/* _compress returns the compressed size, -1 if bigger */
32static int jffs2_rtime_compress(unsigned char *data_in, 32static int jffs2_rtime_compress(unsigned char *data_in,
33 unsigned char *cpage_out, 33 unsigned char *cpage_out,
34 uint32_t *sourcelen, uint32_t *dstlen, 34 uint32_t *sourcelen, uint32_t *dstlen)
35 void *model)
36{ 35{
37 short positions[256]; 36 short positions[256];
38 int outpos = 0; 37 int outpos = 0;
@@ -73,8 +72,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
73 72
74static int jffs2_rtime_decompress(unsigned char *data_in, 73static int jffs2_rtime_decompress(unsigned char *data_in,
75 unsigned char *cpage_out, 74 unsigned char *cpage_out,
76 uint32_t srclen, uint32_t destlen, 75 uint32_t srclen, uint32_t destlen)
77 void *model)
78{ 76{
79 short positions[256]; 77 short positions[256];
80 int outpos = 0; 78 int outpos = 0;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index a12b4f763373..9e7cec808c4c 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -298,7 +298,7 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
298#if 0 298#if 0
299/* _compress returns the compressed size, -1 if bigger */ 299/* _compress returns the compressed size, -1 if bigger */
300int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, 300int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
301 uint32_t *sourcelen, uint32_t *dstlen, void *model) 301 uint32_t *sourcelen, uint32_t *dstlen)
302{ 302{
303 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, 303 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
304 cpage_out, sourcelen, dstlen); 304 cpage_out, sourcelen, dstlen);
@@ -306,8 +306,7 @@ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
306#endif 306#endif
307static int jffs2_dynrubin_compress(unsigned char *data_in, 307static int jffs2_dynrubin_compress(unsigned char *data_in,
308 unsigned char *cpage_out, 308 unsigned char *cpage_out,
309 uint32_t *sourcelen, uint32_t *dstlen, 309 uint32_t *sourcelen, uint32_t *dstlen)
310 void *model)
311{ 310{
312 int bits[8]; 311 int bits[8];
313 unsigned char histo[256]; 312 unsigned char histo[256];
@@ -387,8 +386,7 @@ static void rubin_do_decompress(int bit_divider, int *bits,
387 386
388static int jffs2_rubinmips_decompress(unsigned char *data_in, 387static int jffs2_rubinmips_decompress(unsigned char *data_in,
389 unsigned char *cpage_out, 388 unsigned char *cpage_out,
390 uint32_t sourcelen, uint32_t dstlen, 389 uint32_t sourcelen, uint32_t dstlen)
391 void *model)
392{ 390{
393 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, 391 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
394 cpage_out, sourcelen, dstlen); 392 cpage_out, sourcelen, dstlen);
@@ -397,8 +395,7 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
397 395
398static int jffs2_dynrubin_decompress(unsigned char *data_in, 396static int jffs2_dynrubin_decompress(unsigned char *data_in,
399 unsigned char *cpage_out, 397 unsigned char *cpage_out,
400 uint32_t sourcelen, uint32_t dstlen, 398 uint32_t sourcelen, uint32_t dstlen)
401 void *model)
402{ 399{
403 int bits[8]; 400 int bits[8];
404 int c; 401 int c;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 97fc45de6f81..fd05a0b9431d 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -68,8 +68,7 @@ static void free_workspaces(void)
68 68
69static int jffs2_zlib_compress(unsigned char *data_in, 69static int jffs2_zlib_compress(unsigned char *data_in,
70 unsigned char *cpage_out, 70 unsigned char *cpage_out,
71 uint32_t *sourcelen, uint32_t *dstlen, 71 uint32_t *sourcelen, uint32_t *dstlen)
72 void *model)
73{ 72{
74 int ret; 73 int ret;
75 74
@@ -136,8 +135,7 @@ static int jffs2_zlib_compress(unsigned char *data_in,
136 135
137static int jffs2_zlib_decompress(unsigned char *data_in, 136static int jffs2_zlib_decompress(unsigned char *data_in,
138 unsigned char *cpage_out, 137 unsigned char *cpage_out,
139 uint32_t srclen, uint32_t destlen, 138 uint32_t srclen, uint32_t destlen)
140 void *model)
141{ 139{
142 int ret; 140 int ret;
143 int wbits = MAX_WBITS; 141 int wbits = MAX_WBITS;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3cb0..92978658ed18 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
289 mutex_unlock(&f->sem); 289 mutex_unlock(&f->sem);
290 d_instantiate(dentry, old_dentry->d_inode); 290 d_instantiate(dentry, old_dentry->d_inode);
291 dir_i->i_mtime = dir_i->i_ctime = ITIME(now); 291 dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
292 atomic_inc(&old_dentry->d_inode->i_count); 292 ihold(old_dentry->d_inode);
293 } 293 }
294 return ret; 294 return ret;
295} 295}
@@ -367,7 +367,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
367 } 367 }
368 368
369 /* We use f->target field to store the target path. */ 369 /* We use f->target field to store the target path. */
370 f->target = kmalloc(targetlen + 1, GFP_KERNEL); 370 f->target = kmemdup(target, targetlen + 1, GFP_KERNEL);
371 if (!f->target) { 371 if (!f->target) {
372 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 372 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
373 mutex_unlock(&f->sem); 373 mutex_unlock(&f->sem);
@@ -376,7 +376,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
376 goto fail; 376 goto fail;
377 } 377 }
378 378
379 memcpy(f->target, target, targetlen + 1);
380 D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target)); 379 D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target));
381 380
382 /* No data here. Only a metadata node, which will be 381 /* No data here. Only a metadata node, which will be
@@ -864,7 +863,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
864 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); 863 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
865 /* Might as well let the VFS know */ 864 /* Might as well let the VFS know */
866 d_instantiate(new_dentry, old_dentry->d_inode); 865 d_instantiate(new_dentry, old_dentry->d_inode);
867 atomic_inc(&old_dentry->d_inode->i_count); 866 ihold(old_dentry->d_inode);
868 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); 867 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
869 return ret; 868 return ret;
870 } 869 }
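
ihold() is now the sanctioned way to take an extra inode reference (i_count stopped being a bare atomic_t for filesystems to increment directly), and kmemdup() in the symlink hunk similarly folds a kmalloc()+memcpy() pair into one call. The link-path usage, sketched (demo_link() is illustrative):

    #include <linux/fs.h>

    static int demo_link(struct dentry *old_dentry, struct dentry *new_dentry)
    {
            struct inode *inode = old_dentry->d_inode;

            ihold(inode); /* was: atomic_inc(&inode->i_count) */
            d_instantiate(new_dentry, inode); /* consumes that reference */
            return 0;
    }
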
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index abac961f617b..e513f1913c15 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -151,7 +151,7 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
151 } 151 }
152 152
153 /* Be nice */ 153 /* Be nice */
154 yield(); 154 cond_resched();
155 mutex_lock(&c->erase_free_sem); 155 mutex_lock(&c->erase_free_sem);
156 spin_lock(&c->erase_completion_lock); 156 spin_lock(&c->erase_completion_lock);
157 } 157 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 6b2964a19850..e896e67767eb 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -21,7 +21,6 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/vfs.h> 22#include <linux/vfs.h>
23#include <linux/crc32.h> 23#include <linux/crc32.h>
24#include <linux/smp_lock.h>
25#include "nodelist.h" 24#include "nodelist.h"
26 25
27static int jffs2_flash_setup(struct jffs2_sb_info *c); 26static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -391,7 +390,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
391 This also catches the case where it was stopped and this 390 This also catches the case where it was stopped and this
392 is just a remount to restart it. 391 is just a remount to restart it.
393 Flush the writebuffer, if neccecary, else we loose it */ 392 Flush the writebuffer, if neccecary, else we loose it */
394 lock_kernel();
395 if (!(sb->s_flags & MS_RDONLY)) { 393 if (!(sb->s_flags & MS_RDONLY)) {
396 jffs2_stop_garbage_collect_thread(c); 394 jffs2_stop_garbage_collect_thread(c);
397 mutex_lock(&c->alloc_sem); 395 mutex_lock(&c->alloc_sem);
@@ -403,8 +401,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
403 jffs2_start_garbage_collect_thread(c); 401 jffs2_start_garbage_collect_thread(c);
404 402
405 *flags |= MS_NOATIME; 403 *flags |= MS_NOATIME;
406
407 unlock_kernel();
408 return 0; 404 return 0;
409} 405}
410 406
@@ -478,6 +474,25 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
478 return inode; 474 return inode;
479} 475}
480 476
477static int calculate_inocache_hashsize(uint32_t flash_size)
478{
479 /*
480 * Pick a inocache hash size based on the size of the medium.
481 * Count how many megabytes we're dealing with, apply a hashsize twice
482 * that size, but rounding down to the usual big powers of 2. And keep
483 * to sensible bounds.
484 */
485
486 int size_mb = flash_size / 1024 / 1024;
487 int hashsize = (size_mb * 2) & ~0x3f;
488
489 if (hashsize < INOCACHE_HASHSIZE_MIN)
490 return INOCACHE_HASHSIZE_MIN;
491 if (hashsize > INOCACHE_HASHSIZE_MAX)
492 return INOCACHE_HASHSIZE_MAX;
493
494 return hashsize;
495}
481 496
482int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) 497int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
483{ 498{
@@ -524,7 +539,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
524 if (ret) 539 if (ret)
525 return ret; 540 return ret;
526 541
527 c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); 542 c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size);
543 c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
528 if (!c->inocache_list) { 544 if (!c->inocache_list) {
529 ret = -ENOMEM; 545 ret = -ENOMEM;
530 goto out_wbuf; 546 goto out_wbuf;
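
Worked through, calculate_inocache_hashsize() rounds twice the size in MiB down to a multiple of 64 and clamps: a 32 MiB medium gives 64, clamped up to 128; 256 MiB gives 512; 1 GiB gives 2048, clamped down to 1024. A quick userspace replica of the same arithmetic (not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define INOCACHE_HASHSIZE_MIN 128
    #define INOCACHE_HASHSIZE_MAX 1024

    static int hashsize(uint32_t flash_size)
    {
            int size_mb = flash_size / 1024 / 1024;
            int hs = (size_mb * 2) & ~0x3f; /* round down to a multiple of 64 */

            if (hs < INOCACHE_HASHSIZE_MIN)
                    return INOCACHE_HASHSIZE_MIN;
            if (hs > INOCACHE_HASHSIZE_MAX)
                    return INOCACHE_HASHSIZE_MAX;
            return hs;
    }

    int main(void)
    {
            uint32_t sizes[] = { 32u << 20, 256u << 20, 1024u << 20 };
            int i;

            for (i = 0; i < 3; i++) /* prints 128, 512, 1024 */
                    printf("%u MiB -> %d buckets\n",
                           (unsigned)(sizes[i] >> 20), hashsize(sizes[i]));
            return 0;
    }
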
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 846a79452497..31dce611337c 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -219,13 +219,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
219 if (!list_empty(&c->erase_complete_list) || 219 if (!list_empty(&c->erase_complete_list) ||
220 !list_empty(&c->erase_pending_list)) { 220 !list_empty(&c->erase_pending_list)) {
221 spin_unlock(&c->erase_completion_lock); 221 spin_unlock(&c->erase_completion_lock);
222 mutex_unlock(&c->alloc_sem);
222 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); 223 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
223 if (jffs2_erase_pending_blocks(c, 1)) { 224 if (jffs2_erase_pending_blocks(c, 1))
224 mutex_unlock(&c->alloc_sem);
225 return 0; 225 return 0;
226 } 226
227 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); 227 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
228 spin_lock(&c->erase_completion_lock); 228 spin_lock(&c->erase_completion_lock);
229 mutex_lock(&c->alloc_sem);
229 } 230 }
230 231
231 /* First, work out which block we're garbage-collecting */ 232 /* First, work out which block we're garbage-collecting */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 6784bc89add1..0bc6a6c80a56 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -100,6 +100,7 @@ struct jffs2_sb_info {
100 wait_queue_head_t erase_wait; /* For waiting for erases to complete */ 100 wait_queue_head_t erase_wait; /* For waiting for erases to complete */
101 101
102 wait_queue_head_t inocache_wq; 102 wait_queue_head_t inocache_wq;
103 int inocache_hashsize;
103 struct jffs2_inode_cache **inocache_list; 104 struct jffs2_inode_cache **inocache_list;
104 spinlock_t inocache_lock; 105 spinlock_t inocache_lock;
105 106
@@ -143,4 +144,4 @@ struct jffs2_sb_info {
143 void *os_priv; 144 void *os_priv;
144}; 145};
145 146
146#endif /* _JFFS2_FB_SB */ 147#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index af02bd138469..5e03233c2363 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -420,7 +420,7 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t
420{ 420{
421 struct jffs2_inode_cache *ret; 421 struct jffs2_inode_cache *ret;
422 422
423 ret = c->inocache_list[ino % INOCACHE_HASHSIZE]; 423 ret = c->inocache_list[ino % c->inocache_hashsize];
424 while (ret && ret->ino < ino) { 424 while (ret && ret->ino < ino) {
425 ret = ret->next; 425 ret = ret->next;
426 } 426 }
@@ -441,7 +441,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new
441 441
442 dbg_inocache("add %p (ino #%u)\n", new, new->ino); 442 dbg_inocache("add %p (ino #%u)\n", new, new->ino);
443 443
444 prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE]; 444 prev = &c->inocache_list[new->ino % c->inocache_hashsize];
445 445
446 while ((*prev) && (*prev)->ino < new->ino) { 446 while ((*prev) && (*prev)->ino < new->ino) {
447 prev = &(*prev)->next; 447 prev = &(*prev)->next;
@@ -462,7 +462,7 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
462 dbg_inocache("del %p (ino #%u)\n", old, old->ino); 462 dbg_inocache("del %p (ino #%u)\n", old, old->ino);
463 spin_lock(&c->inocache_lock); 463 spin_lock(&c->inocache_lock);
464 464
465 prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE]; 465 prev = &c->inocache_list[old->ino % c->inocache_hashsize];
466 466
467 while ((*prev) && (*prev)->ino < old->ino) { 467 while ((*prev) && (*prev)->ino < old->ino) {
468 prev = &(*prev)->next; 468 prev = &(*prev)->next;
@@ -487,7 +487,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
487 int i; 487 int i;
488 struct jffs2_inode_cache *this, *next; 488 struct jffs2_inode_cache *this, *next;
489 489
490 for (i=0; i<INOCACHE_HASHSIZE; i++) { 490 for (i=0; i < c->inocache_hashsize; i++) {
491 this = c->inocache_list[i]; 491 this = c->inocache_list[i];
492 while (this) { 492 while (this) {
493 next = this->next; 493 next = this->next;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 523a91691052..5a53d9bdb2b5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -199,7 +199,8 @@ struct jffs2_inode_cache {
199#define RAWNODE_CLASS_XATTR_DATUM 1 199#define RAWNODE_CLASS_XATTR_DATUM 1
200#define RAWNODE_CLASS_XATTR_REF 2 200#define RAWNODE_CLASS_XATTR_REF 2
201 201
202#define INOCACHE_HASHSIZE 128 202#define INOCACHE_HASHSIZE_MIN 128
203#define INOCACHE_HASHSIZE_MAX 1024
203 204
204#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size) 205#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
205 206
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 46f870d1cc36..b632dddcb482 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -20,7 +20,7 @@
20#include "summary.h" 20#include "summary.h"
21#include "debug.h" 21#include "debug.h"
22 22
23#define DEFAULT_EMPTY_SCAN_SIZE 1024 23#define DEFAULT_EMPTY_SCAN_SIZE 256
24 24
25#define noisy_printk(noise, args...) do { \ 25#define noisy_printk(noise, args...) do { \
26 if (*(noise)) { \ 26 if (*(noise)) { \
@@ -435,7 +435,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
435 unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) { 435 unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
436 struct jffs2_unknown_node *node; 436 struct jffs2_unknown_node *node;
437 struct jffs2_unknown_node crcnode; 437 struct jffs2_unknown_node crcnode;
438 uint32_t ofs, prevofs; 438 uint32_t ofs, prevofs, max_ofs;
439 uint32_t hdr_crc, buf_ofs, buf_len; 439 uint32_t hdr_crc, buf_ofs, buf_len;
440 int err; 440 int err;
441 int noise = 0; 441 int noise = 0;
@@ -550,12 +550,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
550 550
551 /* We temporarily use 'ofs' as a pointer into the buffer/jeb */ 551 /* We temporarily use 'ofs' as a pointer into the buffer/jeb */
552 ofs = 0; 552 ofs = 0;
553 553 max_ofs = EMPTY_SCAN_SIZE(c->sector_size);
554 /* Scan only 4KiB of 0xFF before declaring it's empty */ 554 /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */
555 while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) 555 while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
556 ofs += 4; 556 ofs += 4;
557 557
558 if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) { 558 if (ofs == max_ofs) {
559#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 559#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
560 if (jffs2_cleanmarker_oob(c)) { 560 if (jffs2_cleanmarker_oob(c)) {
561 /* scan oob, take care of cleanmarker */ 561 /* scan oob, take care of cleanmarker */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 662bba099501..853b8e300084 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/list.h> 16#include <linux/list.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
@@ -41,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
41 return &f->vfs_inode; 40 return &f->vfs_inode;
42} 41}
43 42
44static void jffs2_destroy_inode(struct inode *inode) 43static void jffs2_i_callback(struct rcu_head *head)
45{ 44{
45 struct inode *inode = container_of(head, struct inode, i_rcu);
46 INIT_LIST_HEAD(&inode->i_dentry);
46 kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); 47 kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
47} 48}
48 49
50static void jffs2_destroy_inode(struct inode *inode)
51{
52 call_rcu(&inode->i_rcu, jffs2_i_callback);
53}
54
49static void jffs2_i_init_once(void *foo) 55static void jffs2_i_init_once(void *foo)
50{ 56{
51 struct jffs2_inode_info *f = foo; 57 struct jffs2_inode_info *f = foo;
@@ -146,6 +152,7 @@ static const struct super_operations jffs2_super_operations =
146static int jffs2_fill_super(struct super_block *sb, void *data, int silent) 152static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
147{ 153{
148 struct jffs2_sb_info *c; 154 struct jffs2_sb_info *c;
155 int ret;
149 156
150 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():" 157 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
151 " New superblock for device %d (\"%s\")\n", 158 " New superblock for device %d (\"%s\")\n",
@@ -175,15 +182,15 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
175#ifdef CONFIG_JFFS2_FS_POSIX_ACL 182#ifdef CONFIG_JFFS2_FS_POSIX_ACL
176 sb->s_flags |= MS_POSIXACL; 183 sb->s_flags |= MS_POSIXACL;
177#endif 184#endif
178 return jffs2_do_fill_super(sb, data, silent); 185 ret = jffs2_do_fill_super(sb, data, silent);
186 return ret;
179} 187}
180 188
181static int jffs2_get_sb(struct file_system_type *fs_type, 189static struct dentry *jffs2_mount(struct file_system_type *fs_type,
182 int flags, const char *dev_name, 190 int flags, const char *dev_name,
183 void *data, struct vfsmount *mnt) 191 void *data)
184{ 192{
185 return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super, 193 return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
186 mnt);
187} 194}
188 195
189static void jffs2_put_super (struct super_block *sb) 196static void jffs2_put_super (struct super_block *sb)
@@ -192,8 +199,6 @@ static void jffs2_put_super (struct super_block *sb)
192 199
193 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); 200 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
194 201
195 lock_kernel();
196
197 if (sb->s_dirt) 202 if (sb->s_dirt)
198 jffs2_write_super(sb); 203 jffs2_write_super(sb);
199 204
@@ -215,8 +220,6 @@ static void jffs2_put_super (struct super_block *sb)
215 if (c->mtd->sync) 220 if (c->mtd->sync)
216 c->mtd->sync(c->mtd); 221 c->mtd->sync(c->mtd);
217 222
218 unlock_kernel();
219
220 D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); 223 D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
221} 224}
222 225
@@ -232,7 +235,7 @@ static void jffs2_kill_sb(struct super_block *sb)
232static struct file_system_type jffs2_fs_type = { 235static struct file_system_type jffs2_fs_type = {
233 .owner = THIS_MODULE, 236 .owner = THIS_MODULE,
234 .name = "jffs2", 237 .name = "jffs2",
235 .get_sb = jffs2_get_sb, 238 .mount = jffs2_mount,
236 .kill_sb = jffs2_kill_sb, 239 .kill_sb = jffs2_kill_sb,
237}; 240};
238 241
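
Freeing the inode through call_rcu() is what makes lockless RCU-walk lookups safe: a walker that raced with an unlink may still dereference the inode until a grace period has passed, so the actual kmem_cache_free() is deferred to an RCU callback. Both jffs2 above and jfs below adopt the same pattern, sketched here (demo_inode_cachep is illustrative):

    #include <linux/fs.h>
    #include <linux/slab.h>

    static struct kmem_cache *demo_inode_cachep;

    static void demo_i_callback(struct rcu_head *head)
    {
            struct inode *inode = container_of(head, struct inode, i_rcu);

            INIT_LIST_HEAD(&inode->i_dentry); /* expected by the VFS of this era */
            kmem_cache_free(demo_inode_cachep, inode); /* or the fs's container */
    }

    static void demo_destroy_inode(struct inode *inode)
    {
            call_rcu(&inode->i_rcu, demo_i_callback); /* free after grace period */
    }
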
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9b572ca40a49..4f9cc0482949 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
152 offset, je32_to_cpu(rx.hdr_crc), crc); 152 offset, je32_to_cpu(rx.hdr_crc), crc);
153 xd->flags |= JFFS2_XFLAGS_INVALID; 153 xd->flags |= JFFS2_XFLAGS_INVALID;
154 return EIO; 154 return -EIO;
155 } 155 }
156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); 156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK 157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
@@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
167 je32_to_cpu(rx.xid), xd->xid, 167 je32_to_cpu(rx.xid), xd->xid,
168 je32_to_cpu(rx.version), xd->version); 168 je32_to_cpu(rx.version), xd->version);
169 xd->flags |= JFFS2_XFLAGS_INVALID; 169 xd->flags |= JFFS2_XFLAGS_INVALID;
170 return EIO; 170 return -EIO;
171 } 171 }
172 xd->xprefix = rx.xprefix; 172 xd->xprefix = rx.xprefix;
173 xd->name_len = rx.name_len; 173 xd->name_len = rx.name_len;
@@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
230 ref_offset(xd->node), xd->data_crc, crc); 230 ref_offset(xd->node), xd->data_crc, crc);
231 kfree(data); 231 kfree(data);
232 xd->flags |= JFFS2_XFLAGS_INVALID; 232 xd->flags |= JFFS2_XFLAGS_INVALID;
233 return EIO; 233 return -EIO;
234 } 234 }
235 235
236 xd->flags |= JFFS2_XFLAGS_HOT; 236 xd->flags |= JFFS2_XFLAGS_HOT;
@@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
268 if (xd->xname) 268 if (xd->xname)
269 return 0; 269 return 0;
270 if (xd->flags & JFFS2_XFLAGS_INVALID) 270 if (xd->flags & JFFS2_XFLAGS_INVALID)
271 return EIO; 271 return -EIO;
272 if (unlikely(is_xattr_datum_unchecked(c, xd))) 272 if (unlikely(is_xattr_datum_unchecked(c, xd)))
273 rc = do_verify_xattr_datum(c, xd); 273 rc = do_verify_xattr_datum(c, xd);
274 if (!rc) 274 if (!rc)
@@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
460 if (crc != je32_to_cpu(rr.node_crc)) { 460 if (crc != je32_to_cpu(rr.node_crc)) {
461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
462 offset, je32_to_cpu(rr.node_crc), crc); 462 offset, je32_to_cpu(rr.node_crc), crc);
463 return EIO; 463 return -EIO;
464 } 464 }
465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK 465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF 466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
@@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, 470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, 471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
472 je32_to_cpu(rr.totlen), PAD(sizeof(rr))); 472 je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
473 return EIO; 473 return -EIO;
474 } 474 }
475 ref->ino = je32_to_cpu(rr.ino); 475 ref->ino = je32_to_cpu(rr.ino);
476 ref->xid = je32_to_cpu(rr.xid); 476 ref->xid = je32_to_cpu(rr.xid);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 1057a4998e4e..e5de9422fa32 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,10 +114,14 @@ out:
114 return rc; 114 return rc;
115} 115}
116 116
117int jfs_check_acl(struct inode *inode, int mask) 117int jfs_check_acl(struct inode *inode, int mask, unsigned int flags)
118{ 118{
119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); 119 struct posix_acl *acl;
120
121 if (flags & IPERM_FLAG_RCU)
122 return -ECHILD;
120 123
124 acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
121 if (IS_ERR(acl)) 125 if (IS_ERR(acl))
122 return PTR_ERR(acl); 126 return PTR_ERR(acl);
123 if (acl) { 127 if (acl) {
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 54e07559878d..f9285c4900fa 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
20 20
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23int jfs_check_acl(struct inode *, int); 23int jfs_check_acl(struct inode *, int, unsigned int flags);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_acl_chmod(struct inode *inode); 25int jfs_acl_chmod(struct inode *inode);
26 26
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb2..3a09423b6c22 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
497 * appear hashed, but do not put on any lists. hlist_del() 497 * appear hashed, but do not put on any lists. hlist_del()
498 * will work fine and require no locking. 498 * will work fine and require no locking.
499 */ 499 */
500 ip->i_hash.pprev = &ip->i_hash.next; 500 hlist_add_fake(&ip->i_hash);
501 501
502 return (ip); 502 return (ip);
503} 503}
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c51af2a14516..278e3fb40b71 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1010,15 +1010,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
1010 * option 2 - shutdown file systems 1010 * option 2 - shutdown file systems
1011 * associated with log ? 1011 * associated with log ?
1012 * option 3 - extend log ? 1012 * option 3 - extend log ?
1013 */
1014 /*
1015 * option 4 - second chance 1013 * option 4 - second chance
1016 * 1014 *
1017 * mark log wrapped, and continue. 1015 * mark log wrapped, and continue.
1018 * when all active transactions are completed, 1016 * when all active transactions are completed,
1019 * mark log vaild for recovery. 1017 * mark log valid for recovery.
1020 * if crashed during invalid state, log state 1018 * if crashed during invalid state, log state
1021 * implies invald log, forcing fsck(). 1019 * implies invalid log, forcing fsck().
1022 */ 1020 */
1023 /* mark log state log wrap in log superblock */ 1021 /* mark log state log wrap in log superblock */
1024 /* log->state = LOGWRAP; */ 1022 /* log->state = LOGWRAP; */
@@ -1122,16 +1120,13 @@ int lmLogOpen(struct super_block *sb)
1122 * file systems to log may have n-to-1 relationship; 1120 * file systems to log may have n-to-1 relationship;
1123 */ 1121 */
1124 1122
1125 bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); 1123 bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1124 log);
1126 if (IS_ERR(bdev)) { 1125 if (IS_ERR(bdev)) {
1127 rc = -PTR_ERR(bdev); 1126 rc = -PTR_ERR(bdev);
1128 goto free; 1127 goto free;
1129 } 1128 }
1130 1129
1131 if ((rc = bd_claim(bdev, log))) {
1132 goto close;
1133 }
1134
1135 log->bdev = bdev; 1130 log->bdev = bdev;
1136 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); 1131 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
1137 1132
@@ -1139,7 +1134,7 @@ int lmLogOpen(struct super_block *sb)
1139 * initialize log: 1134 * initialize log:
1140 */ 1135 */
1141 if ((rc = lmLogInit(log))) 1136 if ((rc = lmLogInit(log)))
1142 goto unclaim; 1137 goto close;
1143 1138
1144 list_add(&log->journal_list, &jfs_external_logs); 1139 list_add(&log->journal_list, &jfs_external_logs);
1145 1140
@@ -1165,11 +1160,8 @@ journal_found:
1165 list_del(&log->journal_list); 1160 list_del(&log->journal_list);
1166 lbmLogShutdown(log); 1161 lbmLogShutdown(log);
1167 1162
1168 unclaim:
1169 bd_release(bdev);
1170
1171 close: /* close external log device */ 1163 close: /* close external log device */
1172 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1164 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1173 1165
1174 free: /* free log descriptor */ 1166 free: /* free log descriptor */
1175 mutex_unlock(&jfs_log_mutex); 1167 mutex_unlock(&jfs_log_mutex);
@@ -1514,8 +1506,7 @@ int lmLogClose(struct super_block *sb)
1514 bdev = log->bdev; 1506 bdev = log->bdev;
1515 rc = lmLogShutdown(log); 1507 rc = lmLogShutdown(log);
1516 1508
1517 bd_release(bdev); 1509 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1518 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1519 1510
1520 kfree(log); 1511 kfree(log);
1521 1512
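
blkdev_get_by_dev() with FMODE_EXCL folds the old open_by_devnum() + bd_claim() pair into a single call: the last argument is the holder cookie identifying the exclusive claim, and the matching blkdev_put() must pass FMODE_EXCL to drop it, which is why the unclaim: label disappears above. Sketched (open_log_dev()/close_log_dev() are illustrative names):

    #include <linux/fs.h>
    #include <linux/blkdev.h>

    static struct block_device *open_log_dev(dev_t devt, void *holder)
    {
            /* Before: bdev = open_by_devnum(devt, mode); bd_claim(bdev, holder); */
            return blkdev_get_by_dev(devt,
                                     FMODE_READ | FMODE_WRITE | FMODE_EXCL,
                                     holder); /* IS_ERR() on failure */
    }

    static void close_log_dev(struct block_device *bdev)
    {
            /* FMODE_EXCL here also releases the claim
             * (was: bd_release() followed by blkdev_put()) */
            blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
    }
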
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 7b698f2ec45a..9895595fd2f2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -97,7 +97,7 @@ int jfs_mount(struct super_block *sb)
97 97
98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); 98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
99 if (ipaimap == NULL) { 99 if (ipaimap == NULL) {
100 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 100 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
101 rc = -EIO; 101 rc = -EIO;
102 goto errout20; 102 goto errout20;
103 } 103 }
@@ -148,7 +148,7 @@ int jfs_mount(struct super_block *sb)
148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { 148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); 149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
150 if (!ipaimap2) { 150 if (!ipaimap2) {
151 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 151 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
152 rc = -EIO; 152 rc = -EIO;
153 goto errout35; 153 goto errout35;
154 } 154 }
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b445..9466957ec841 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1279 * lazy commit thread finishes processing 1279 * lazy commit thread finishes processing
1280 */ 1280 */
1281 if (tblk->xflag & COMMIT_DELETE) { 1281 if (tblk->xflag & COMMIT_DELETE) {
1282 atomic_inc(&tblk->u.ip->i_count); 1282 ihold(tblk->u.ip);
1283 /* 1283 /*
1284 * Avoid a rare deadlock 1284 * Avoid a rare deadlock
1285 * 1285 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675be..81ead850ddb6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/namei.h>
21#include <linux/ctype.h> 22#include <linux/ctype.h>
22#include <linux/quotaops.h> 23#include <linux/quotaops.h>
23#include <linux/exportfs.h> 24#include <linux/exportfs.h>
@@ -839,7 +840,7 @@ static int jfs_link(struct dentry *old_dentry,
839 ip->i_ctime = CURRENT_TIME; 840 ip->i_ctime = CURRENT_TIME;
840 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 841 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
841 mark_inode_dirty(dir); 842 mark_inode_dirty(dir);
842 atomic_inc(&ip->i_count); 843 ihold(ip);
843 844
844 iplist[0] = ip; 845 iplist[0] = ip;
845 iplist[1] = dir; 846 iplist[1] = dir;
@@ -1464,9 +1465,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1464 1465
1465 jfs_info("jfs_lookup: name = %s", name); 1466 jfs_info("jfs_lookup: name = %s", name);
1466 1467
1467 if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
1468 dentry->d_op = &jfs_ci_dentry_operations;
1469
1470 if ((name[0] == '.') && (len == 1)) 1468 if ((name[0] == '.') && (len == 1))
1471 inum = dip->i_ino; 1469 inum = dip->i_ino;
1472 else if (strcmp(name, "..") == 0) 1470 else if (strcmp(name, "..") == 0)
@@ -1491,12 +1489,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1491 return ERR_CAST(ip); 1489 return ERR_CAST(ip);
1492 } 1490 }
1493 1491
1494 dentry = d_splice_alias(ip, dentry); 1492 return d_splice_alias(ip, dentry);
1495
1496 if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
1497 dentry->d_op = &jfs_ci_dentry_operations;
1498
1499 return dentry;
1500} 1493}
1501 1494
1502static struct inode *jfs_nfs_get_inode(struct super_block *sb, 1495static struct inode *jfs_nfs_get_inode(struct super_block *sb,
@@ -1573,7 +1566,8 @@ const struct file_operations jfs_dir_operations = {
1573 .llseek = generic_file_llseek, 1566 .llseek = generic_file_llseek,
1574}; 1567};
1575 1568
1576static int jfs_ci_hash(struct dentry *dir, struct qstr *this) 1569static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
1570 struct qstr *this)
1577{ 1571{
1578 unsigned long hash; 1572 unsigned long hash;
1579 int i; 1573 int i;
@@ -1586,32 +1580,63 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
1586 return 0; 1580 return 0;
1587} 1581}
1588 1582
1589static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b) 1583static int jfs_ci_compare(const struct dentry *parent,
1584 const struct inode *pinode,
1585 const struct dentry *dentry, const struct inode *inode,
1586 unsigned int len, const char *str, const struct qstr *name)
1590{ 1587{
1591 int i, result = 1; 1588 int i, result = 1;
1592 1589
1593 if (a->len != b->len) 1590 if (len != name->len)
1594 goto out; 1591 goto out;
1595 for (i=0; i < a->len; i++) { 1592 for (i=0; i < len; i++) {
1596 if (tolower(a->name[i]) != tolower(b->name[i])) 1593 if (tolower(str[i]) != tolower(name->name[i]))
1597 goto out; 1594 goto out;
1598 } 1595 }
1599 result = 0; 1596 result = 0;
1597out:
1598 return result;
1599}
1600 1600
1601static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
1602{
1603 if (nd->flags & LOOKUP_RCU)
1604 return -ECHILD;
1601 /* 1605 /*
1602 * We want creates to preserve case. A negative dentry, a, that 1606 * This is not negative dentry. Always valid.
1603 * has a different case than b may cause a new entry to be created 1607 *
1604 * with the wrong case. Since we can't tell if a comes from a negative 1608 * Note, rename() to existing directory entry will have ->d_inode,
1605 * dentry, we blindly replace it with b. This should be harmless if 1609 * and will use existing name which isn't specified name by user.
1606 * a is not a negative dentry. 1610 *
1611 * We may be able to drop this positive dentry here. But dropping
1612 * positive dentry isn't good idea. So it's unsupported like
1613 * rename("filename", "FILENAME") for now.
1607 */ 1614 */
1608 memcpy((unsigned char *)a->name, b->name, a->len); 1615 if (dentry->d_inode)
1609out: 1616 return 1;
1610 return result; 1617
1618 /*
1619 * This may be nfsd (or something), anyway, we can't see the
1620 * intent of this. So, since this can be for creation, drop it.
1621 */
1622 if (!nd)
1623 return 0;
1624
1625 /*
1626 * Drop the negative dentry, in order to make sure to use the
1627 * case sensitive name which is specified by user if this is
1628 * for creation.
1629 */
1630 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
1631 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
1632 return 0;
1633 }
1634 return 1;
1611} 1635}
1612 1636
1613const struct dentry_operations jfs_ci_dentry_operations = 1637const struct dentry_operations jfs_ci_dentry_operations =
1614{ 1638{
1615 .d_hash = jfs_ci_hash, 1639 .d_hash = jfs_ci_hash,
1616 .d_compare = jfs_ci_compare, 1640 .d_compare = jfs_ci_compare,
1641 .d_revalidate = jfs_ci_revalidate,
1617}; 1642};
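
[editor's note] The jfs_ci_* rework above tracks two VFS changes at once: ->d_hash and ->d_compare now take const arguments because they may run during lockless rcu-walk path lookup, and a new ->d_revalidate replaces the old case-preserving trick of memcpy()ing the requested name over the dentry's name in d_compare. A minimal sketch of the rcu-walk guard that any such d_revalidate needs; foo_d_revalidate and the slow-path comment are illustrative, not from the patch:

	static int foo_d_revalidate(struct dentry *dentry, struct nameidata *nd)
	{
		/* rcu-walk holds no references and may not sleep;
		 * -ECHILD tells the VFS to retry in ref-walk mode */
		if (nd && (nd->flags & LOOKUP_RCU))
			return -ECHILD;
		/* ... ordinary validation under normal refcounting ... */
		return 1;
	}
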
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index ec8c3e4baca3..eeca48a031ab 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -33,7 +33,6 @@
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <asm/uaccess.h> 34#include <asm/uaccess.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/smp_lock.h>
37 36
38#include "jfs_incore.h" 37#include "jfs_incore.h"
39#include "jfs_filsys.h" 38#include "jfs_filsys.h"
@@ -116,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
116 return &jfs_inode->vfs_inode; 115 return &jfs_inode->vfs_inode;
117} 116}
118 117
118static void jfs_i_callback(struct rcu_head *head)
119{
120 struct inode *inode = container_of(head, struct inode, i_rcu);
121 struct jfs_inode_info *ji = JFS_IP(inode);
122 INIT_LIST_HEAD(&inode->i_dentry);
123 kmem_cache_free(jfs_inode_cachep, ji);
124}
125
119static void jfs_destroy_inode(struct inode *inode) 126static void jfs_destroy_inode(struct inode *inode)
120{ 127{
121 struct jfs_inode_info *ji = JFS_IP(inode); 128 struct jfs_inode_info *ji = JFS_IP(inode);
@@ -129,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode)
129 ji->active_ag = -1; 136 ji->active_ag = -1;
130 } 137 }
131 spin_unlock_irq(&ji->ag_lock); 138 spin_unlock_irq(&ji->ag_lock);
132 kmem_cache_free(jfs_inode_cachep, ji); 139 call_rcu(&inode->i_rcu, jfs_i_callback);
133} 140}
134 141
135static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 142static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -176,8 +183,6 @@ static void jfs_put_super(struct super_block *sb)
176 183
177 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 184 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
178 185
179 lock_kernel();
180
181 rc = jfs_umount(sb); 186 rc = jfs_umount(sb);
182 if (rc) 187 if (rc)
183 jfs_err("jfs_umount failed with return code %d", rc); 188 jfs_err("jfs_umount failed with return code %d", rc);
@@ -188,8 +193,6 @@ static void jfs_put_super(struct super_block *sb)
188 iput(sbi->direct_inode); 193 iput(sbi->direct_inode);
189 194
190 kfree(sbi); 195 kfree(sbi);
191
192 unlock_kernel();
193} 196}
194 197
195enum { 198enum {
@@ -369,19 +372,16 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
369 if (!parse_options(data, sb, &newLVSize, &flag)) { 372 if (!parse_options(data, sb, &newLVSize, &flag)) {
370 return -EINVAL; 373 return -EINVAL;
371 } 374 }
372 lock_kernel(); 375
373 if (newLVSize) { 376 if (newLVSize) {
374 if (sb->s_flags & MS_RDONLY) { 377 if (sb->s_flags & MS_RDONLY) {
375 printk(KERN_ERR 378 printk(KERN_ERR
376 "JFS: resize requires volume to be mounted read-write\n"); 379 "JFS: resize requires volume to be mounted read-write\n");
377 unlock_kernel();
378 return -EROFS; 380 return -EROFS;
379 } 381 }
380 rc = jfs_extendfs(sb, newLVSize, 0); 382 rc = jfs_extendfs(sb, newLVSize, 0);
381 if (rc) { 383 if (rc)
382 unlock_kernel();
383 return rc; 384 return rc;
384 }
385 } 385 }
386 386
387 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 387 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -397,36 +397,30 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
397 /* mark the fs r/w for quota activity */ 397 /* mark the fs r/w for quota activity */
398 sb->s_flags &= ~MS_RDONLY; 398 sb->s_flags &= ~MS_RDONLY;
399 399
400 unlock_kernel();
401 dquot_resume(sb, -1); 400 dquot_resume(sb, -1);
402 return ret; 401 return ret;
403 } 402 }
404 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 403 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
405 rc = dquot_suspend(sb, -1); 404 rc = dquot_suspend(sb, -1);
406 if (rc < 0) { 405 if (rc < 0) {
407 unlock_kernel();
408 return rc; 406 return rc;
409 } 407 }
410 rc = jfs_umount_rw(sb); 408 rc = jfs_umount_rw(sb);
411 JFS_SBI(sb)->flag = flag; 409 JFS_SBI(sb)->flag = flag;
412 unlock_kernel();
413 return rc; 410 return rc;
414 } 411 }
415 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) 412 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
416 if (!(sb->s_flags & MS_RDONLY)) { 413 if (!(sb->s_flags & MS_RDONLY)) {
417 rc = jfs_umount_rw(sb); 414 rc = jfs_umount_rw(sb);
418 if (rc) { 415 if (rc)
419 unlock_kernel();
420 return rc; 416 return rc;
421 } 417
422 JFS_SBI(sb)->flag = flag; 418 JFS_SBI(sb)->flag = flag;
423 ret = jfs_mount_rw(sb, 1); 419 ret = jfs_mount_rw(sb, 1);
424 unlock_kernel();
425 return ret; 420 return ret;
426 } 421 }
427 JFS_SBI(sb)->flag = flag; 422 JFS_SBI(sb)->flag = flag;
428 423
429 unlock_kernel();
430 return 0; 424 return 0;
431} 425}
432 426
@@ -446,6 +440,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
446 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); 440 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
447 if (!sbi) 441 if (!sbi)
448 return -ENOMEM; 442 return -ENOMEM;
443
449 sb->s_fs_info = sbi; 444 sb->s_fs_info = sbi;
450 sbi->sb = sb; 445 sbi->sb = sb;
451 sbi->uid = sbi->gid = sbi->umask = -1; 446 sbi->uid = sbi->gid = sbi->umask = -1;
@@ -520,6 +515,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
520 515
521 sb->s_magic = JFS_SUPER_MAGIC; 516 sb->s_magic = JFS_SUPER_MAGIC;
522 517
518 if (sbi->mntflag & JFS_OS2)
519 sb->s_d_op = &jfs_ci_dentry_operations;
520
523 inode = jfs_iget(sb, ROOT_I); 521 inode = jfs_iget(sb, ROOT_I);
524 if (IS_ERR(inode)) { 522 if (IS_ERR(inode)) {
525 ret = PTR_ERR(inode); 523 ret = PTR_ERR(inode);
@@ -529,9 +527,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
529 if (!sb->s_root) 527 if (!sb->s_root)
530 goto out_no_root; 528 goto out_no_root;
531 529
532 if (sbi->mntflag & JFS_OS2)
533 sb->s_root->d_op = &jfs_ci_dentry_operations;
534
535 /* logical blocks are represented by 40 bits in pxd_t, etc. */ 530 /* logical blocks are represented by 40 bits in pxd_t, etc. */
536 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; 531 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
537#if BITS_PER_LONG == 32 532#if BITS_PER_LONG == 32
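
[editor's note] Moving the case-insensitive dentry_operations out of jfs_lookup() and off the root dentry onto sb->s_d_op means d_alloc() now tags every dentry on the superblock automatically, instead of each lookup path doing it by hand:

	/* before: every lookup had to remember to set it */
	dentry->d_op = &jfs_ci_dentry_operations;

	/* after: set once in jfs_fill_super(); d_alloc() copies
	 * sb->s_d_op into each newly allocated dentry */
	sb->s_d_op = &jfs_ci_dentry_operations;
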
@@ -596,11 +591,10 @@ static int jfs_unfreeze(struct super_block *sb)
596 return 0; 591 return 0;
597} 592}
598 593
599static int jfs_get_sb(struct file_system_type *fs_type, 594static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
600 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 595 int flags, const char *dev_name, void *data)
601{ 596{
602 return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super, 597 return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
603 mnt);
604} 598}
605 599
606static int jfs_sync_fs(struct super_block *sb, int wait) 600static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -783,7 +777,7 @@ static const struct export_operations jfs_export_operations = {
783static struct file_system_type jfs_fs_type = { 777static struct file_system_type jfs_fs_type = {
784 .owner = THIS_MODULE, 778 .owner = THIS_MODULE,
785 .name = "jfs", 779 .name = "jfs",
786 .get_sb = jfs_get_sb, 780 .mount = jfs_do_mount,
787 .kill_sb = kill_block_super, 781 .kill_sb = kill_block_super,
788 .fs_flags = FS_REQUIRES_DEV, 782 .fs_flags = FS_REQUIRES_DEV,
789}; 783};
diff --git a/fs/libfs.c b/fs/libfs.c
index 0a9da95317f7..c88eab55aec9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,6 +16,11 @@
16 16
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19static inline int simple_positive(struct dentry *dentry)
20{
21 return dentry->d_inode && !d_unhashed(dentry);
22}
23
19int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, 24int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
20 struct kstat *stat) 25 struct kstat *stat)
21{ 26{
@@ -37,7 +42,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
37 * Retaining negative dentries for an in-memory filesystem just wastes 42 * Retaining negative dentries for an in-memory filesystem just wastes
38 * memory and lookup time: arrange for them to be deleted immediately. 43 * memory and lookup time: arrange for them to be deleted immediately.
39 */ 44 */
40static int simple_delete_dentry(struct dentry *dentry) 45static int simple_delete_dentry(const struct dentry *dentry)
41{ 46{
42 return 1; 47 return 1;
43} 48}
@@ -54,7 +59,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
54 59
55 if (dentry->d_name.len > NAME_MAX) 60 if (dentry->d_name.len > NAME_MAX)
56 return ERR_PTR(-ENAMETOOLONG); 61 return ERR_PTR(-ENAMETOOLONG);
57 dentry->d_op = &simple_dentry_operations; 62 d_set_d_op(dentry, &simple_dentry_operations);
58 d_add(dentry, NULL); 63 d_add(dentry, NULL);
59 return NULL; 64 return NULL;
60} 65}
@@ -76,7 +81,8 @@ int dcache_dir_close(struct inode *inode, struct file *file)
76 81
77loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 82loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
78{ 83{
79 mutex_lock(&file->f_path.dentry->d_inode->i_mutex); 84 struct dentry *dentry = file->f_path.dentry;
85 mutex_lock(&dentry->d_inode->i_mutex);
80 switch (origin) { 86 switch (origin) {
81 case 1: 87 case 1:
82 offset += file->f_pos; 88 offset += file->f_pos;
@@ -84,7 +90,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
84 if (offset >= 0) 90 if (offset >= 0)
85 break; 91 break;
86 default: 92 default:
87 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 93 mutex_unlock(&dentry->d_inode->i_mutex);
88 return -EINVAL; 94 return -EINVAL;
89 } 95 }
90 if (offset != file->f_pos) { 96 if (offset != file->f_pos) {
@@ -94,21 +100,24 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
94 struct dentry *cursor = file->private_data; 100 struct dentry *cursor = file->private_data;
95 loff_t n = file->f_pos - 2; 101 loff_t n = file->f_pos - 2;
96 102
97 spin_lock(&dcache_lock); 103 spin_lock(&dentry->d_lock);
104 /* d_lock not required for cursor */
98 list_del(&cursor->d_u.d_child); 105 list_del(&cursor->d_u.d_child);
99 p = file->f_path.dentry->d_subdirs.next; 106 p = dentry->d_subdirs.next;
100 while (n && p != &file->f_path.dentry->d_subdirs) { 107 while (n && p != &dentry->d_subdirs) {
101 struct dentry *next; 108 struct dentry *next;
102 next = list_entry(p, struct dentry, d_u.d_child); 109 next = list_entry(p, struct dentry, d_u.d_child);
103 if (!d_unhashed(next) && next->d_inode) 110 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
111 if (simple_positive(next))
104 n--; 112 n--;
113 spin_unlock(&next->d_lock);
105 p = p->next; 114 p = p->next;
106 } 115 }
107 list_add_tail(&cursor->d_u.d_child, p); 116 list_add_tail(&cursor->d_u.d_child, p);
108 spin_unlock(&dcache_lock); 117 spin_unlock(&dentry->d_lock);
109 } 118 }
110 } 119 }
111 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 120 mutex_unlock(&dentry->d_inode->i_mutex);
112 return offset; 121 return offset;
113} 122}
114 123
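
[editor's note] With dcache_lock gone, a directory's child list is walked under the parent's d_lock, and each child's d_lock is taken with the DENTRY_D_LOCK_NESTED subclass so lockdep accepts the parent-then-child ordering. The idiom used throughout the new dcache_dir_lseek(), dcache_readdir() and simple_empty(), as a standalone sketch:

	spin_lock(&parent->d_lock);
	list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
		/* child's name and inode are stable here */
		spin_unlock(&child->d_lock);
	}
	spin_unlock(&parent->d_lock);
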
@@ -148,29 +157,35 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
148 i++; 157 i++;
149 /* fallthrough */ 158 /* fallthrough */
150 default: 159 default:
151 spin_lock(&dcache_lock); 160 spin_lock(&dentry->d_lock);
152 if (filp->f_pos == 2) 161 if (filp->f_pos == 2)
153 list_move(q, &dentry->d_subdirs); 162 list_move(q, &dentry->d_subdirs);
154 163
155 for (p=q->next; p != &dentry->d_subdirs; p=p->next) { 164 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
156 struct dentry *next; 165 struct dentry *next;
157 next = list_entry(p, struct dentry, d_u.d_child); 166 next = list_entry(p, struct dentry, d_u.d_child);
158 if (d_unhashed(next) || !next->d_inode) 167 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
168 if (!simple_positive(next)) {
169 spin_unlock(&next->d_lock);
159 continue; 170 continue;
171 }
160 172
161 spin_unlock(&dcache_lock); 173 spin_unlock(&next->d_lock);
174 spin_unlock(&dentry->d_lock);
162 if (filldir(dirent, next->d_name.name, 175 if (filldir(dirent, next->d_name.name,
163 next->d_name.len, filp->f_pos, 176 next->d_name.len, filp->f_pos,
164 next->d_inode->i_ino, 177 next->d_inode->i_ino,
165 dt_type(next->d_inode)) < 0) 178 dt_type(next->d_inode)) < 0)
166 return 0; 179 return 0;
167 spin_lock(&dcache_lock); 180 spin_lock(&dentry->d_lock);
181 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
168 /* next is still alive */ 182 /* next is still alive */
169 list_move(q, p); 183 list_move(q, p);
184 spin_unlock(&next->d_lock);
170 p = q; 185 p = q;
171 filp->f_pos++; 186 filp->f_pos++;
172 } 187 }
173 spin_unlock(&dcache_lock); 188 spin_unlock(&dentry->d_lock);
174 } 189 }
175 return 0; 190 return 0;
176} 191}
@@ -201,9 +216,9 @@ static const struct super_operations simple_super_operations = {
201 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that 216 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
202 * will never be mountable) 217 * will never be mountable)
203 */ 218 */
204int get_sb_pseudo(struct file_system_type *fs_type, char *name, 219struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
205 const struct super_operations *ops, unsigned long magic, 220 const struct super_operations *ops,
206 struct vfsmount *mnt) 221 const struct dentry_operations *dops, unsigned long magic)
207{ 222{
208 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 223 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
209 struct dentry *dentry; 224 struct dentry *dentry;
@@ -211,7 +226,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
211 struct qstr d_name = {.name = name, .len = strlen(name)}; 226 struct qstr d_name = {.name = name, .len = strlen(name)};
212 227
213 if (IS_ERR(s)) 228 if (IS_ERR(s))
214 return PTR_ERR(s); 229 return ERR_CAST(s);
215 230
216 s->s_flags = MS_NOUSER; 231 s->s_flags = MS_NOUSER;
217 s->s_maxbytes = MAX_LFS_FILESIZE; 232 s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -240,13 +255,13 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
240 dentry->d_parent = dentry; 255 dentry->d_parent = dentry;
241 d_instantiate(dentry, root); 256 d_instantiate(dentry, root);
242 s->s_root = dentry; 257 s->s_root = dentry;
258 s->s_d_op = dops;
243 s->s_flags |= MS_ACTIVE; 259 s->s_flags |= MS_ACTIVE;
244 simple_set_mnt(mnt, s); 260 return dget(s->s_root);
245 return 0;
246 261
247Enomem: 262Enomem:
248 deactivate_locked_super(s); 263 deactivate_locked_super(s);
249 return -ENOMEM; 264 return ERR_PTR(-ENOMEM);
250} 265}
251 266
252int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 267int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
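
[editor's note] Callers of the old get_sb_pseudo() convert the same way as block filesystems: return the dentry from mount_pseudo() instead of filling a vfsmount. A sketch of a converted pseudo filesystem's ->mount; the foofs names and FOOFS_MAGIC are hypothetical. Note the new dops argument, which lets pseudo filesystems install superblock-wide dentry_operations via s_d_op:

	static struct dentry *foofs_mount(struct file_system_type *fs_type,
			int flags, const char *dev_name, void *data)
	{
		return mount_pseudo(fs_type, "foofs:", &foofs_super_ops,
				    &foofs_dentry_ops, FOOFS_MAGIC);
	}
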
@@ -255,29 +270,29 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
255 270
256 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 271 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
257 inc_nlink(inode); 272 inc_nlink(inode);
258 atomic_inc(&inode->i_count); 273 ihold(inode);
259 dget(dentry); 274 dget(dentry);
260 d_instantiate(dentry, inode); 275 d_instantiate(dentry, inode);
261 return 0; 276 return 0;
262} 277}
263 278
264static inline int simple_positive(struct dentry *dentry)
265{
266 return dentry->d_inode && !d_unhashed(dentry);
267}
268
269int simple_empty(struct dentry *dentry) 279int simple_empty(struct dentry *dentry)
270{ 280{
271 struct dentry *child; 281 struct dentry *child;
272 int ret = 0; 282 int ret = 0;
273 283
274 spin_lock(&dcache_lock); 284 spin_lock(&dentry->d_lock);
275 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) 285 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
276 if (simple_positive(child)) 286 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
287 if (simple_positive(child)) {
288 spin_unlock(&child->d_lock);
277 goto out; 289 goto out;
290 }
291 spin_unlock(&child->d_lock);
292 }
278 ret = 1; 293 ret = 1;
279out: 294out:
280 spin_unlock(&dcache_lock); 295 spin_unlock(&dentry->d_lock);
281 return ret; 296 return ret;
282} 297}
283 298
@@ -892,10 +907,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
892 */ 907 */
893int generic_file_fsync(struct file *file, int datasync) 908int generic_file_fsync(struct file *file, int datasync)
894{ 909{
895 struct writeback_control wbc = {
896 .sync_mode = WB_SYNC_ALL,
897 .nr_to_write = 0, /* metadata-only; caller takes care of data */
898 };
899 struct inode *inode = file->f_mapping->host; 910 struct inode *inode = file->f_mapping->host;
900 int err; 911 int err;
901 int ret; 912 int ret;
@@ -906,13 +917,42 @@ int generic_file_fsync(struct file *file, int datasync)
906 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 917 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
907 return ret; 918 return ret;
908 919
909 err = sync_inode(inode, &wbc); 920 err = sync_inode_metadata(inode, 1);
910 if (ret == 0) 921 if (ret == 0)
911 ret = err; 922 ret = err;
912 return ret; 923 return ret;
913} 924}
914EXPORT_SYMBOL(generic_file_fsync); 925EXPORT_SYMBOL(generic_file_fsync);
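
[editor's note] sync_inode_metadata() wraps the writeback_control boilerplate that generic_file_fsync() used to build by hand; roughly, its fs/fs-writeback.c form is:

	int sync_inode_metadata(struct inode *inode, int wait)
	{
		struct writeback_control wbc = {
			.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
			.nr_to_write = 0, /* metadata-only */
		};

		return sync_inode(inode, &wbc);
	}
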
915 926
927/**
928 * generic_check_addressable - Check addressability of file system
929 * @blocksize_bits: log of file system block size
930 * @num_blocks: number of blocks in file system
931 *
932 * Determine whether a file system with @num_blocks blocks (and a
933 * block size of 2**@blocksize_bits) is addressable by the sector_t
934 * and page cache of the system. Return 0 if so and -EFBIG otherwise.
935 */
936int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
937{
938 u64 last_fs_block = num_blocks - 1;
939 u64 last_fs_page =
940 last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
941
942 if (unlikely(num_blocks == 0))
943 return 0;
944
945 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
946 return -EINVAL;
947
948 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
949 (last_fs_page > (pgoff_t)(~0ULL))) {
950 return -EFBIG;
951 }
952 return 0;
953}
954EXPORT_SYMBOL(generic_check_addressable);
955
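
[editor's note] generic_check_addressable() consolidates an overflow check that disk filesystems such as ext3/ext4 previously carried privately: a filesystem whose last block does not fit in sector_t, or whose last page index does not fit in pgoff_t, cannot be addressed safely on the running kernel. A hedged example of a fill_super caller; the es/s_blocks_count names are illustrative:

	err = generic_check_addressable(sb->s_blocksize_bits,
					le64_to_cpu(es->s_blocks_count));
	if (err) {
		printk(KERN_ERR "filesystem too large to mount safely\n");
		goto failed_mount;
	}
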
916/* 956/*
917 * No-op implementation of ->fsync for in-memory filesystems. 957 * No-op implementation of ->fsync for in-memory filesystems.
918 */ 958 */
@@ -926,7 +966,7 @@ EXPORT_SYMBOL(dcache_dir_lseek);
926EXPORT_SYMBOL(dcache_dir_open); 966EXPORT_SYMBOL(dcache_dir_open);
927EXPORT_SYMBOL(dcache_readdir); 967EXPORT_SYMBOL(dcache_readdir);
928EXPORT_SYMBOL(generic_read_dir); 968EXPORT_SYMBOL(generic_read_dir);
929EXPORT_SYMBOL(get_sb_pseudo); 969EXPORT_SYMBOL(mount_pseudo);
930EXPORT_SYMBOL(simple_write_begin); 970EXPORT_SYMBOL(simple_write_begin);
931EXPORT_SYMBOL(simple_write_end); 971EXPORT_SYMBOL(simple_write_end);
932EXPORT_SYMBOL(simple_dir_inode_operations); 972EXPORT_SYMBOL(simple_dir_inode_operations);
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 97f6073ab339..ca58d64374ca 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs := $(lockd-objs-y) 10lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
new file mode 100644
index 000000000000..f848b52c67b1
--- /dev/null
+++ b/fs/lockd/clnt4xdr.c
@@ -0,0 +1,605 @@
1/*
2 * linux/fs/lockd/clnt4xdr.c
3 *
4 * XDR functions to encode/decode NLM version 4 RPC arguments and results.
5 *
6 * NLM client-side only.
7 *
8 * Copyright (C) 2010, Oracle. All rights reserved.
9 */
10
11#include <linux/types.h>
12#include <linux/sunrpc/xdr.h>
13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/stats.h>
15#include <linux/lockd/lockd.h>
16
17#define NLMDBG_FACILITY NLMDBG_XDR
18
19#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
20# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
21#endif
22
23#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
24# error "NLM host name cannot be larger than NLM's maximum string length!"
25#endif
26
27/*
28 * Declare the space requirements for NLM arguments and replies as
29 * number of 32bit-words
30 */
31#define NLM4_void_sz (0)
32#define NLM4_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
33#define NLM4_caller_sz (1+(NLMCLNT_OHSIZE>>2))
34#define NLM4_owner_sz (1+(NLMCLNT_OHSIZE>>2))
35#define NLM4_fhandle_sz (1+(NFS3_FHSIZE>>2))
36#define NLM4_lock_sz (5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz)
37#define NLM4_holder_sz (6+NLM4_owner_sz)
38
39#define NLM4_testargs_sz (NLM4_cookie_sz+1+NLM4_lock_sz)
40#define NLM4_lockargs_sz (NLM4_cookie_sz+4+NLM4_lock_sz)
41#define NLM4_cancargs_sz (NLM4_cookie_sz+2+NLM4_lock_sz)
42#define NLM4_unlockargs_sz (NLM4_cookie_sz+NLM4_lock_sz)
43
44#define NLM4_testres_sz (NLM4_cookie_sz+1+NLM4_holder_sz)
45#define NLM4_res_sz (NLM4_cookie_sz+1)
46#define NLM4_norep_sz (0)
47
48
49static s64 loff_t_to_s64(loff_t offset)
50{
51 s64 res;
52
53 if (offset >= NLM4_OFFSET_MAX)
54 res = NLM4_OFFSET_MAX;
55 else if (offset <= -NLM4_OFFSET_MAX)
56 res = -NLM4_OFFSET_MAX;
57 else
58 res = offset;
59 return res;
60}
61
62static void nlm4_compute_offsets(const struct nlm_lock *lock,
63 u64 *l_offset, u64 *l_len)
64{
65 const struct file_lock *fl = &lock->fl;
66
67 BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
68 BUG_ON(fl->fl_end > NLM4_OFFSET_MAX &&
69 fl->fl_end != OFFSET_MAX);
70
71 *l_offset = loff_t_to_s64(fl->fl_start);
72 if (fl->fl_end == OFFSET_MAX)
73 *l_len = 0;
74 else
75 *l_len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
76}
77
78/*
79 * Handle decode buffer overflows out-of-line.
80 */
81static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
82{
83 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
84 "Remaining buffer length is %tu words.\n",
85 func, xdr->end - xdr->p);
86}
87
88
89/*
90 * Encode/decode NLMv4 basic data types
91 *
92 * Basic NLMv4 data types are defined in Appendix II, section 6.1.4
93 * of RFC 1813: "NFS Version 3 Protocol Specification" and in Chapter
94 * 10 of X/Open's "Protocols for Interworking: XNFS, Version 3W".
95 *
96 * Not all basic data types have their own encoding and decoding
97 * functions. For run-time efficiency, some data types are encoded
98 * or decoded inline.
99 */
100
101static void encode_bool(struct xdr_stream *xdr, const int value)
102{
103 __be32 *p;
104
105 p = xdr_reserve_space(xdr, 4);
106 *p = value ? xdr_one : xdr_zero;
107}
108
109static void encode_int32(struct xdr_stream *xdr, const s32 value)
110{
111 __be32 *p;
112
113 p = xdr_reserve_space(xdr, 4);
114 *p = cpu_to_be32(value);
115}
116
117/*
118 * typedef opaque netobj<MAXNETOBJ_SZ>
119 */
120static void encode_netobj(struct xdr_stream *xdr,
121 const u8 *data, const unsigned int length)
122{
123 __be32 *p;
124
125 BUG_ON(length > XDR_MAX_NETOBJ);
126 p = xdr_reserve_space(xdr, 4 + length);
127 xdr_encode_opaque(p, data, length);
128}
129
130static int decode_netobj(struct xdr_stream *xdr,
131 struct xdr_netobj *obj)
132{
133 u32 length;
134 __be32 *p;
135
136 p = xdr_inline_decode(xdr, 4);
137 if (unlikely(p == NULL))
138 goto out_overflow;
139 length = be32_to_cpup(p++);
140 if (unlikely(length > XDR_MAX_NETOBJ))
141 goto out_size;
142 obj->len = length;
143 obj->data = (u8 *)p;
144 return 0;
145out_size:
146 dprintk("NFS: returned netobj was too long: %u\n", length);
147 return -EIO;
148out_overflow:
149 print_overflow_msg(__func__, xdr);
150 return -EIO;
151}
152
153/*
154 * netobj cookie;
155 */
156static void encode_cookie(struct xdr_stream *xdr,
157 const struct nlm_cookie *cookie)
158{
159 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
160 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
161}
162
163static int decode_cookie(struct xdr_stream *xdr,
164 struct nlm_cookie *cookie)
165{
166 u32 length;
167 __be32 *p;
168
169 p = xdr_inline_decode(xdr, 4);
170 if (unlikely(p == NULL))
171 goto out_overflow;
172 length = be32_to_cpup(p++);
173 /* apparently HPUX can return empty cookies */
174 if (length == 0)
175 goto out_hpux;
176 if (length > NLM_MAXCOOKIELEN)
177 goto out_size;
178 p = xdr_inline_decode(xdr, length);
179 if (unlikely(p == NULL))
180 goto out_overflow;
181 cookie->len = length;
182 memcpy(cookie->data, p, length);
183 return 0;
184out_hpux:
185 cookie->len = 4;
186 memset(cookie->data, 0, 4);
187 return 0;
188out_size:
189 dprintk("NFS: returned cookie was too long: %u\n", length);
190 return -EIO;
191out_overflow:
192 print_overflow_msg(__func__, xdr);
193 return -EIO;
194}
195
196/*
197 * netobj fh;
198 */
199static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
200{
201 BUG_ON(fh->size > NFS3_FHSIZE);
202 encode_netobj(xdr, (u8 *)&fh->data, fh->size);
203}
204
205/*
206 * enum nlm4_stats {
207 * NLM4_GRANTED = 0,
208 * NLM4_DENIED = 1,
209 * NLM4_DENIED_NOLOCKS = 2,
210 * NLM4_BLOCKED = 3,
211 * NLM4_DENIED_GRACE_PERIOD = 4,
212 * NLM4_DEADLCK = 5,
213 * NLM4_ROFS = 6,
214 * NLM4_STALE_FH = 7,
215 * NLM4_FBIG = 8,
216 * NLM4_FAILED = 9
217 * };
218 *
219 * struct nlm4_stat {
220 * nlm4_stats stat;
221 * };
222 *
223 * NB: we don't swap bytes for the NLM status values. The upper
224 * layers deal directly with the status value in network byte
225 * order.
226 */
227static void encode_nlm4_stat(struct xdr_stream *xdr,
228 const __be32 stat)
229{
230 __be32 *p;
231
232 BUG_ON(be32_to_cpu(stat) > NLM_FAILED);
233 p = xdr_reserve_space(xdr, 4);
234 *p = stat;
235}
236
237static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
238{
239 __be32 *p;
240
241 p = xdr_inline_decode(xdr, 4);
242 if (unlikely(p == NULL))
243 goto out_overflow;
244 if (unlikely(*p > nlm4_failed))
245 goto out_bad_xdr;
246 *stat = *p;
247 return 0;
248out_bad_xdr:
249 dprintk("%s: server returned invalid nlm4_stats value: %u\n",
250 __func__, be32_to_cpup(p));
251 return -EIO;
252out_overflow:
253 print_overflow_msg(__func__, xdr);
254 return -EIO;
255}
256
257/*
258 * struct nlm4_holder {
259 * bool exclusive;
260 * int32 svid;
261 * netobj oh;
262 * uint64 l_offset;
263 * uint64 l_len;
264 * };
265 */
266static void encode_nlm4_holder(struct xdr_stream *xdr,
267 const struct nlm_res *result)
268{
269 const struct nlm_lock *lock = &result->lock;
270 u64 l_offset, l_len;
271 __be32 *p;
272
273 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
274 encode_int32(xdr, lock->svid);
275 encode_netobj(xdr, lock->oh.data, lock->oh.len);
276
277 p = xdr_reserve_space(xdr, 4 + 4);
278 nlm4_compute_offsets(lock, &l_offset, &l_len);
279 p = xdr_encode_hyper(p, l_offset);
280 xdr_encode_hyper(p, l_len);
281}
282
283static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
284{
285 struct nlm_lock *lock = &result->lock;
286 struct file_lock *fl = &lock->fl;
287 u64 l_offset, l_len;
288 u32 exclusive;
289 int error;
290 __be32 *p;
291 s32 end;
292
293 memset(lock, 0, sizeof(*lock));
294 locks_init_lock(fl);
295
296 p = xdr_inline_decode(xdr, 4 + 4);
297 if (unlikely(p == NULL))
298 goto out_overflow;
299 exclusive = be32_to_cpup(p++);
300 lock->svid = be32_to_cpup(p);
301 fl->fl_pid = (pid_t)lock->svid;
302
303 error = decode_netobj(xdr, &lock->oh);
304 if (unlikely(error))
305 goto out;
306
307 p = xdr_inline_decode(xdr, 8 + 8);
308 if (unlikely(p == NULL))
309 goto out_overflow;
310
311 fl->fl_flags = FL_POSIX;
312 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
313 p = xdr_decode_hyper(p, &l_offset);
314 xdr_decode_hyper(p, &l_len);
315 end = l_offset + l_len - 1;
316
317 fl->fl_start = (loff_t)l_offset;
318 if (l_len == 0 || end < 0)
319 fl->fl_end = OFFSET_MAX;
320 else
321 fl->fl_end = (loff_t)end;
322 error = 0;
323out:
324 return error;
325out_overflow:
326 print_overflow_msg(__func__, xdr);
327 return -EIO;
328}
329
330/*
331 * string caller_name<LM_MAXSTRLEN>;
332 */
333static void encode_caller_name(struct xdr_stream *xdr, const char *name)
334{
335 /* NB: client-side does not set lock->len */
336 u32 length = strlen(name);
337 __be32 *p;
338
339 BUG_ON(length > NLM_MAXSTRLEN);
340 p = xdr_reserve_space(xdr, 4 + length);
341 xdr_encode_opaque(p, name, length);
342}
343
344/*
345 * struct nlm4_lock {
346 * string caller_name<LM_MAXSTRLEN>;
347 * netobj fh;
348 * netobj oh;
349 * int32 svid;
350 * uint64 l_offset;
351 * uint64 l_len;
352 * };
353 */
354static void encode_nlm4_lock(struct xdr_stream *xdr,
355 const struct nlm_lock *lock)
356{
357 u64 l_offset, l_len;
358 __be32 *p;
359
360 encode_caller_name(xdr, lock->caller);
361 encode_fh(xdr, &lock->fh);
362 encode_netobj(xdr, lock->oh.data, lock->oh.len);
363
364 p = xdr_reserve_space(xdr, 4 + 8 + 8);
365 *p++ = cpu_to_be32(lock->svid);
366
367 nlm4_compute_offsets(lock, &l_offset, &l_len);
368 p = xdr_encode_hyper(p, l_offset);
369 xdr_encode_hyper(p, l_len);
370}
371
372
373/*
374 * NLMv4 XDR encode functions
375 *
376 * NLMv4 argument types are defined in Appendix II of RFC 1813:
377 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
378 * "Protocols for Interworking: XNFS, Version 3W".
379 */
380
381/*
382 * struct nlm4_testargs {
383 * netobj cookie;
384 * bool exclusive;
385 * struct nlm4_lock alock;
386 * };
387 */
388static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
389 struct xdr_stream *xdr,
390 const struct nlm_args *args)
391{
392 const struct nlm_lock *lock = &args->lock;
393
394 encode_cookie(xdr, &args->cookie);
395 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
396 encode_nlm4_lock(xdr, lock);
397}
398
399/*
400 * struct nlm4_lockargs {
401 * netobj cookie;
402 * bool block;
403 * bool exclusive;
404 * struct nlm4_lock alock;
405 * bool reclaim;
406 * int state;
407 * };
408 */
409static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
410 struct xdr_stream *xdr,
411 const struct nlm_args *args)
412{
413 const struct nlm_lock *lock = &args->lock;
414
415 encode_cookie(xdr, &args->cookie);
416 encode_bool(xdr, args->block);
417 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
418 encode_nlm4_lock(xdr, lock);
419 encode_bool(xdr, args->reclaim);
420 encode_int32(xdr, args->state);
421}
422
423/*
424 * struct nlm4_cancargs {
425 * netobj cookie;
426 * bool block;
427 * bool exclusive;
428 * struct nlm4_lock alock;
429 * };
430 */
431static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
432 struct xdr_stream *xdr,
433 const struct nlm_args *args)
434{
435 const struct nlm_lock *lock = &args->lock;
436
437 encode_cookie(xdr, &args->cookie);
438 encode_bool(xdr, args->block);
439 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
440 encode_nlm4_lock(xdr, lock);
441}
442
443/*
444 * struct nlm4_unlockargs {
445 * netobj cookie;
446 * struct nlm4_lock alock;
447 * };
448 */
449static void nlm4_xdr_enc_unlockargs(struct rpc_rqst *req,
450 struct xdr_stream *xdr,
451 const struct nlm_args *args)
452{
453 const struct nlm_lock *lock = &args->lock;
454
455 encode_cookie(xdr, &args->cookie);
456 encode_nlm4_lock(xdr, lock);
457}
458
459/*
460 * struct nlm4_res {
461 * netobj cookie;
462 * nlm4_stat stat;
463 * };
464 */
465static void nlm4_xdr_enc_res(struct rpc_rqst *req,
466 struct xdr_stream *xdr,
467 const struct nlm_res *result)
468{
469 encode_cookie(xdr, &result->cookie);
470 encode_nlm4_stat(xdr, result->status);
471}
472
473/*
474 * union nlm4_testrply switch (nlm4_stats stat) {
475 * case NLM4_DENIED:
476 * struct nlm4_holder holder;
477 * default:
478 * void;
479 * };
480 *
481 * struct nlm4_testres {
482 * netobj cookie;
483 * nlm4_testrply test_stat;
484 * };
485 */
486static void nlm4_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm4_stat(xdr, result->status);
492 if (result->status == nlm_lck_denied)
493 encode_nlm4_holder(xdr, result);
494}
495
496
497/*
498 * NLMv4 XDR decode functions
499 *
500 * NLMv4 argument types are defined in Appendix II of RFC 1813:
501 * "NFS Version 3 Protocol Specification" and Chapter 10 of X/Open's
502 * "Protocols for Interworking: XNFS, Version 3W".
503 */
504
505/*
506 * union nlm4_testrply switch (nlm4_stats stat) {
507 * case NLM4_DENIED:
508 * struct nlm4_holder holder;
509 * default:
510 * void;
511 * };
512 *
513 * struct nlm4_testres {
514 * netobj cookie;
515 * nlm4_testrply test_stat;
516 * };
517 */
518static int decode_nlm4_testrply(struct xdr_stream *xdr,
519 struct nlm_res *result)
520{
521 int error;
522
523 error = decode_nlm4_stat(xdr, &result->status);
524 if (unlikely(error))
525 goto out;
526 if (result->status == nlm_lck_denied)
527 error = decode_nlm4_holder(xdr, result);
528out:
529 return error;
530}
531
532static int nlm4_xdr_dec_testres(struct rpc_rqst *req,
533 struct xdr_stream *xdr,
534 struct nlm_res *result)
535{
536 int error;
537
538 error = decode_cookie(xdr, &result->cookie);
539 if (unlikely(error))
540 goto out;
541 error = decode_nlm4_testrply(xdr, result);
542out:
543 return error;
544}
545
546/*
547 * struct nlm4_res {
548 * netobj cookie;
549 * nlm4_stat stat;
550 * };
551 */
552static int nlm4_xdr_dec_res(struct rpc_rqst *req,
553 struct xdr_stream *xdr,
554 struct nlm_res *result)
555{
556 int error;
557
558 error = decode_cookie(xdr, &result->cookie);
559 if (unlikely(error))
560 goto out;
561 error = decode_nlm4_stat(xdr, &result->status);
562out:
563 return error;
564}
565
566
567/*
568 * For NLM, a void procedure really returns nothing
569 */
570#define nlm4_xdr_dec_norep NULL
571
572#define PROC(proc, argtype, restype) \
573[NLMPROC_##proc] = { \
574 .p_proc = NLMPROC_##proc, \
575 .p_encode = (kxdreproc_t)nlm4_xdr_enc_##argtype, \
576 .p_decode = (kxdrdproc_t)nlm4_xdr_dec_##restype, \
577 .p_arglen = NLM4_##argtype##_sz, \
578 .p_replen = NLM4_##restype##_sz, \
579 .p_statidx = NLMPROC_##proc, \
580 .p_name = #proc, \
581 }
582
583static struct rpc_procinfo nlm4_procedures[] = {
584 PROC(TEST, testargs, testres),
585 PROC(LOCK, lockargs, res),
586 PROC(CANCEL, cancargs, res),
587 PROC(UNLOCK, unlockargs, res),
588 PROC(GRANTED, testargs, res),
589 PROC(TEST_MSG, testargs, norep),
590 PROC(LOCK_MSG, lockargs, norep),
591 PROC(CANCEL_MSG, cancargs, norep),
592 PROC(UNLOCK_MSG, unlockargs, norep),
593 PROC(GRANTED_MSG, testargs, norep),
594 PROC(TEST_RES, testres, norep),
595 PROC(LOCK_RES, res, norep),
596 PROC(CANCEL_RES, res, norep),
597 PROC(UNLOCK_RES, res, norep),
598 PROC(GRANTED_RES, res, norep),
599};
600
601struct rpc_version nlm_version4 = {
602 .number = 4,
603 .nrprocs = ARRAY_SIZE(nlm4_procedures),
604 .procs = nlm4_procedures,
605};
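
[editor's note] The _sz macros above are buffer budgets in 32-bit XDR words; the RPC client multiplies p_arglen/p_replen by four when sizing send and receive buffers. With NLM_MAXCOOKIELEN at 32, for instance, NLM4_cookie_sz is 1 + (32 >> 2) = 9 words, so an NLM4_res reply (cookie plus status) budgets 10 words, i.e. 40 bytes on the wire. Dispatch through this table looks roughly like the sketch below (req and host are illustrative; in practice lockd reaches the entry through the rpc_clnt's cl_procinfo rather than indexing nlm4_procedures directly):

	struct rpc_message msg = {
		.rpc_proc = &nlm4_procedures[NLMPROC_TEST],
		.rpc_argp = &req->a_args,
		.rpc_resp = &req->a_res,
	};

	status = rpc_call_sync(host->h_rpcclnt, &msg, 0);
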
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 64fd427c993c..8d4ea8351e3d 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,7 +14,6 @@
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
16#include <linux/lockd/lockd.h> 16#include <linux/lockd/lockd.h>
17#include <linux/smp_lock.h>
18#include <linux/kthread.h> 17#include <linux/kthread.h>
19 18
20#define NLMDBG_FACILITY NLMDBG_CLIENT 19#define NLMDBG_FACILITY NLMDBG_CLIENT
@@ -42,6 +41,7 @@ struct nlm_wait {
42}; 41};
43 42
44static LIST_HEAD(nlm_blocked); 43static LIST_HEAD(nlm_blocked);
44static DEFINE_SPINLOCK(nlm_blocked_lock);
45 45
46/** 46/**
47 * nlmclnt_init - Set up per-NFS mount point lockd data structures 47 * nlmclnt_init - Set up per-NFS mount point lockd data structures
@@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(nlmclnt_init);
79 */ 79 */
80void nlmclnt_done(struct nlm_host *host) 80void nlmclnt_done(struct nlm_host *host)
81{ 81{
82 nlm_release_host(host); 82 nlmclnt_release_host(host);
83 lockd_down(); 83 lockd_down();
84} 84}
85EXPORT_SYMBOL_GPL(nlmclnt_done); 85EXPORT_SYMBOL_GPL(nlmclnt_done);
@@ -97,7 +97,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *
97 block->b_lock = fl; 97 block->b_lock = fl;
98 init_waitqueue_head(&block->b_wait); 98 init_waitqueue_head(&block->b_wait);
99 block->b_status = nlm_lck_blocked; 99 block->b_status = nlm_lck_blocked;
100
101 spin_lock(&nlm_blocked_lock);
100 list_add(&block->b_list, &nlm_blocked); 102 list_add(&block->b_list, &nlm_blocked);
103 spin_unlock(&nlm_blocked_lock);
101 } 104 }
102 return block; 105 return block;
103} 106}
@@ -106,7 +109,9 @@ void nlmclnt_finish_block(struct nlm_wait *block)
106{ 109{
107 if (block == NULL) 110 if (block == NULL)
108 return; 111 return;
112 spin_lock(&nlm_blocked_lock);
109 list_del(&block->b_list); 113 list_del(&block->b_list);
114 spin_unlock(&nlm_blocked_lock);
110 kfree(block); 115 kfree(block);
111} 116}
112 117
@@ -154,6 +159,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
154 * Look up blocked request based on arguments. 159 * Look up blocked request based on arguments.
155 * Warning: must not use cookie to match it! 160 * Warning: must not use cookie to match it!
156 */ 161 */
162 spin_lock(&nlm_blocked_lock);
157 list_for_each_entry(block, &nlm_blocked, b_list) { 163 list_for_each_entry(block, &nlm_blocked, b_list) {
158 struct file_lock *fl_blocked = block->b_lock; 164 struct file_lock *fl_blocked = block->b_lock;
159 165
@@ -178,6 +184,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
178 wake_up(&block->b_wait); 184 wake_up(&block->b_wait);
179 res = nlm_granted; 185 res = nlm_granted;
180 } 186 }
187 spin_unlock(&nlm_blocked_lock);
181 return res; 188 return res;
182} 189}
183 190
@@ -216,10 +223,6 @@ reclaimer(void *ptr)
216 allow_signal(SIGKILL); 223 allow_signal(SIGKILL);
217 224
218 down_write(&host->h_rwsem); 225 down_write(&host->h_rwsem);
219
220 /* This one ensures that our parent doesn't terminate while the
221 * reclaim is in progress */
222 lock_kernel();
223 lockd_up(); /* note: this cannot fail as lockd is already running */ 226 lockd_up(); /* note: this cannot fail as lockd is already running */
224 227
225 dprintk("lockd: reclaiming locks for host %s\n", host->h_name); 228 dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
@@ -260,16 +263,17 @@ restart:
260 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); 263 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
261 264
262 /* Now, wake up all processes that sleep on a blocked lock */ 265 /* Now, wake up all processes that sleep on a blocked lock */
266 spin_lock(&nlm_blocked_lock);
263 list_for_each_entry(block, &nlm_blocked, b_list) { 267 list_for_each_entry(block, &nlm_blocked, b_list) {
264 if (block->b_host == host) { 268 if (block->b_host == host) {
265 block->b_status = nlm_lck_denied_grace_period; 269 block->b_status = nlm_lck_denied_grace_period;
266 wake_up(&block->b_wait); 270 wake_up(&block->b_wait);
267 } 271 }
268 } 272 }
273 spin_unlock(&nlm_blocked_lock);
269 274
270 /* Release host handle after use */ 275 /* Release host handle after use */
271 nlm_release_host(host); 276 nlmclnt_release_host(host);
272 lockd_down(); 277 lockd_down();
273 unlock_kernel();
274 return 0; 278 return 0;
275} 279}
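
[editor's note] clntlock.c is part of the BKL-removal arc of this series: nlm_blocked was previously serialized by lock_kernel(), so a dedicated nlm_blocked_lock now covers every list_add()/list_del() and the traversals in nlmclnt_grant() and the reclaimer. Waking sleepers while holding the spinlock is fine because waitqueues take their own internal lock; the resulting pattern, restated compactly:

	spin_lock(&nlm_blocked_lock);
	list_for_each_entry(block, &nlm_blocked, b_list) {
		if (block->b_host == host) {
			block->b_status = nlm_lck_denied_grace_period;
			wake_up(&block->b_wait);	/* safe under spinlock */
		}
	}
	spin_unlock(&nlm_blocked_lock);
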
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7932c399fab4..adb45ec9038c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/types.h> 11#include <linux/types.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
@@ -59,7 +58,7 @@ static void nlm_put_lockowner(struct nlm_lockowner *lockowner)
59 return; 58 return;
60 list_del(&lockowner->list); 59 list_del(&lockowner->list);
61 spin_unlock(&lockowner->host->h_lock); 60 spin_unlock(&lockowner->host->h_lock);
62 nlm_release_host(lockowner->host); 61 nlmclnt_release_host(lockowner->host);
63 kfree(lockowner); 62 kfree(lockowner);
64} 63}
65 64
@@ -166,7 +165,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
166 /* Set up the argument struct */ 165 /* Set up the argument struct */
167 nlmclnt_setlockargs(call, fl); 166 nlmclnt_setlockargs(call, fl);
168 167
169 lock_kernel();
170 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { 168 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
171 if (fl->fl_type != F_UNLCK) { 169 if (fl->fl_type != F_UNLCK) {
172 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; 170 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -177,10 +175,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
177 status = nlmclnt_test(call, fl); 175 status = nlmclnt_test(call, fl);
178 else 176 else
179 status = -EINVAL; 177 status = -EINVAL;
180
181 fl->fl_ops->fl_release_private(fl); 178 fl->fl_ops->fl_release_private(fl);
182 fl->fl_ops = NULL; 179 fl->fl_ops = NULL;
183 unlock_kernel();
184 180
185 dprintk("lockd: clnt proc returns %d\n", status); 181 dprintk("lockd: clnt proc returns %d\n", status);
186 return status; 182 return status;
@@ -211,24 +207,22 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
211 printk("nlm_alloc_call: failed, waiting for memory\n"); 207 printk("nlm_alloc_call: failed, waiting for memory\n");
212 schedule_timeout_interruptible(5*HZ); 208 schedule_timeout_interruptible(5*HZ);
213 } 209 }
214 nlm_release_host(host); 210 nlmclnt_release_host(host);
215 return NULL; 211 return NULL;
216} 212}
217 213
218void nlm_release_call(struct nlm_rqst *call) 214void nlmclnt_release_call(struct nlm_rqst *call)
219{ 215{
220 if (!atomic_dec_and_test(&call->a_count)) 216 if (!atomic_dec_and_test(&call->a_count))
221 return; 217 return;
222 nlm_release_host(call->a_host); 218 nlmclnt_release_host(call->a_host);
223 nlmclnt_release_lockargs(call); 219 nlmclnt_release_lockargs(call);
224 kfree(call); 220 kfree(call);
225} 221}
226 222
227static void nlmclnt_rpc_release(void *data) 223static void nlmclnt_rpc_release(void *data)
228{ 224{
229 lock_kernel(); 225 nlmclnt_release_call(data);
230 nlm_release_call(data);
231 unlock_kernel();
232} 226}
233 227
234static int nlm_wait_on_grace(wait_queue_head_t *queue) 228static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -442,20 +436,24 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
442 status = nlm_stat_to_errno(req->a_res.status); 436 status = nlm_stat_to_errno(req->a_res.status);
443 } 437 }
444out: 438out:
445 nlm_release_call(req); 439 nlmclnt_release_call(req);
446 return status; 440 return status;
447} 441}
448 442
449static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) 443static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
450{ 444{
445 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
451 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; 446 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
452 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); 447 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
453 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); 448 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
449 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
454} 450}
455 451
456static void nlmclnt_locks_release_private(struct file_lock *fl) 452static void nlmclnt_locks_release_private(struct file_lock *fl)
457{ 453{
454 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
458 list_del(&fl->fl_u.nfs_fl.list); 455 list_del(&fl->fl_u.nfs_fl.list);
456 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
459 nlm_put_lockowner(fl->fl_u.nfs_fl.owner); 457 nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
460} 458}
461 459
@@ -595,7 +593,7 @@ again:
595out_unblock: 593out_unblock:
596 nlmclnt_finish_block(block); 594 nlmclnt_finish_block(block);
597out: 595out:
598 nlm_release_call(req); 596 nlmclnt_release_call(req);
599 return status; 597 return status;
600out_unlock: 598out_unlock:
601 /* Fatal error: ensure that we remove the lock altogether */ 599 /* Fatal error: ensure that we remove the lock altogether */
@@ -696,7 +694,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
696 /* What to do now? I'm out of my depth... */ 694 /* What to do now? I'm out of my depth... */
697 status = -ENOLCK; 695 status = -ENOLCK;
698out: 696out:
699 nlm_release_call(req); 697 nlmclnt_release_call(req);
700 return status; 698 return status;
701} 699}
702 700
@@ -721,9 +719,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
721die: 719die:
722 return; 720 return;
723 retry_rebind: 721 retry_rebind:
724 lock_kernel();
725 nlm_rebind_host(req->a_host); 722 nlm_rebind_host(req->a_host);
726 unlock_kernel();
727 retry_unlock: 723 retry_unlock:
728 rpc_restart_call(task); 724 rpc_restart_call(task);
729} 725}
@@ -759,7 +755,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
759 NLMPROC_CANCEL, &nlmclnt_cancel_ops); 755 NLMPROC_CANCEL, &nlmclnt_cancel_ops);
760 if (status == 0 && req->a_res.status == nlm_lck_denied) 756 if (status == 0 && req->a_res.status == nlm_lck_denied)
761 status = -ENOLCK; 757 status = -ENOLCK;
762 nlm_release_call(req); 758 nlmclnt_release_call(req);
763 return status; 759 return status;
764} 760}
765 761
@@ -801,9 +797,7 @@ retry_cancel:
801 /* Don't ever retry more than 3 times */ 797 /* Don't ever retry more than 3 times */
802 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) 798 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
803 goto die; 799 goto die;
804 lock_kernel();
805 nlm_rebind_host(req->a_host); 800 nlm_rebind_host(req->a_host);
806 unlock_kernel();
807 rpc_restart_call(task); 801 rpc_restart_call(task);
808 rpc_delay(task, 30 * HZ); 802 rpc_delay(task, 30 * HZ);
809} 803}
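
[editor's note] Two related cleanups land here together: nlm_release_call()/nlm_release_host() become nlmclnt_release_call()/nlmclnt_release_host(), separating client-side nlm_host refcounting from the server side, and the lock_kernel()/unlock_kernel() pairs around rebinds and the RPC release callback disappear because the data they guarded is now covered by the per-host h_lock. The h_granted list handling shows the replacement locking; restated as a sketch:

	spin_lock(&host->h_lock);
	list_add_tail(&new->fl_u.nfs_fl.list, &host->h_granted);
	spin_unlock(&host->h_lock);
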
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
new file mode 100644
index 000000000000..180ac34feb9a
--- /dev/null
+++ b/fs/lockd/clntxdr.c
@@ -0,0 +1,627 @@
1/*
2 * linux/fs/lockd/clntxdr.c
3 *
4 * XDR functions to encode/decode NLM version 3 RPC arguments and results.
5 * NLM version 3 is backwards compatible with NLM versions 1 and 2.
6 *
7 * NLM client-side only.
8 *
9 * Copyright (C) 2010, Oracle. All rights reserved.
10 */
11
12#include <linux/types.h>
13#include <linux/sunrpc/xdr.h>
14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/stats.h>
16#include <linux/lockd/lockd.h>
17
18#define NLMDBG_FACILITY NLMDBG_XDR
19
20#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
21# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
22#endif
23
24/*
25 * Declare the space requirements for NLM arguments and replies as
26 * number of 32bit-words
27 */
28#define NLM_cookie_sz (1+(NLM_MAXCOOKIELEN>>2))
29#define NLM_caller_sz (1+(NLMCLNT_OHSIZE>>2))
30#define NLM_owner_sz (1+(NLMCLNT_OHSIZE>>2))
31#define NLM_fhandle_sz (1+(NFS2_FHSIZE>>2))
32#define NLM_lock_sz (3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz)
33#define NLM_holder_sz (4+NLM_owner_sz)
34
35#define NLM_testargs_sz (NLM_cookie_sz+1+NLM_lock_sz)
36#define NLM_lockargs_sz (NLM_cookie_sz+4+NLM_lock_sz)
37#define NLM_cancargs_sz (NLM_cookie_sz+2+NLM_lock_sz)
38#define NLM_unlockargs_sz (NLM_cookie_sz+NLM_lock_sz)
39
40#define NLM_testres_sz (NLM_cookie_sz+1+NLM_holder_sz)
41#define NLM_res_sz (NLM_cookie_sz+1)
42#define NLM_norep_sz (0)
43
44
45static s32 loff_t_to_s32(loff_t offset)
46{
47 s32 res;
48
49 if (offset >= NLM_OFFSET_MAX)
50 res = NLM_OFFSET_MAX;
51 else if (offset <= -NLM_OFFSET_MAX)
52 res = -NLM_OFFSET_MAX;
53 else
54 res = offset;
55 return res;
56}
57
58static void nlm_compute_offsets(const struct nlm_lock *lock,
59 u32 *l_offset, u32 *l_len)
60{
61 const struct file_lock *fl = &lock->fl;
62
63 BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
64 BUG_ON(fl->fl_end > NLM_OFFSET_MAX &&
65 fl->fl_end != OFFSET_MAX);
66
67 *l_offset = loff_t_to_s32(fl->fl_start);
68 if (fl->fl_end == OFFSET_MAX)
69 *l_len = 0;
70 else
71 *l_len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
72}
73
74/*
75 * Handle decode buffer overflows out-of-line.
76 */
77static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
78{
79 dprintk("lockd: %s prematurely hit the end of our receive buffer. "
80 "Remaining buffer length is %tu words.\n",
81 func, xdr->end - xdr->p);
82}
83
84
85/*
86 * Encode/decode NLMv3 basic data types
87 *
88 * Basic NLMv3 data types are not defined in an IETF standards
89 * document. X/Open has a description of these data types that
90 * is useful. See Chapter 10 of "Protocols for Interworking:
91 * XNFS, Version 3W".
92 *
93 * Not all basic data types have their own encoding and decoding
94 * functions. For run-time efficiency, some data types are encoded
95 * or decoded inline.
96 */
97
98static void encode_bool(struct xdr_stream *xdr, const int value)
99{
100 __be32 *p;
101
102 p = xdr_reserve_space(xdr, 4);
103 *p = value ? xdr_one : xdr_zero;
104}
105
106static void encode_int32(struct xdr_stream *xdr, const s32 value)
107{
108 __be32 *p;
109
110 p = xdr_reserve_space(xdr, 4);
111 *p = cpu_to_be32(value);
112}
113
114/*
115 * typedef opaque netobj<MAXNETOBJ_SZ>
116 */
117static void encode_netobj(struct xdr_stream *xdr,
118 const u8 *data, const unsigned int length)
119{
120 __be32 *p;
121
122 BUG_ON(length > XDR_MAX_NETOBJ);
123 p = xdr_reserve_space(xdr, 4 + length);
124 xdr_encode_opaque(p, data, length);
125}
126
127static int decode_netobj(struct xdr_stream *xdr,
128 struct xdr_netobj *obj)
129{
130 u32 length;
131 __be32 *p;
132
133 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(p == NULL))
135 goto out_overflow;
136 length = be32_to_cpup(p++);
137 if (unlikely(length > XDR_MAX_NETOBJ))
138 goto out_size;
139 obj->len = length;
140 obj->data = (u8 *)p;
141 return 0;
142out_size:
143 dprintk("NFS: returned netobj was too long: %u\n", length);
144 return -EIO;
145out_overflow:
146 print_overflow_msg(__func__, xdr);
147 return -EIO;
148}
149
150/*
151 * netobj cookie;
152 */
153static void encode_cookie(struct xdr_stream *xdr,
154 const struct nlm_cookie *cookie)
155{
156 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
157 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
158}
159
160static int decode_cookie(struct xdr_stream *xdr,
161 struct nlm_cookie *cookie)
162{
163 u32 length;
164 __be32 *p;
165
166 p = xdr_inline_decode(xdr, 4);
167 if (unlikely(p == NULL))
168 goto out_overflow;
169 length = be32_to_cpup(p++);
170 /* apparently HPUX can return empty cookies */
171 if (length == 0)
172 goto out_hpux;
173 if (length > NLM_MAXCOOKIELEN)
174 goto out_size;
175 p = xdr_inline_decode(xdr, length);
176 if (unlikely(p == NULL))
177 goto out_overflow;
178 cookie->len = length;
179 memcpy(cookie->data, p, length);
180 return 0;
181out_hpux:
182 cookie->len = 4;
183 memset(cookie->data, 0, 4);
184 return 0;
185out_size:
186 dprintk("NFS: returned cookie was too long: %u\n", length);
187 return -EIO;
188out_overflow:
189 print_overflow_msg(__func__, xdr);
190 return -EIO;
191}
192
193/*
194 * netobj fh;
195 */
196static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
200}
201
202/*
203 * enum nlm_stats {
204 * LCK_GRANTED = 0,
205 * LCK_DENIED = 1,
206 * LCK_DENIED_NOLOCKS = 2,
207 * LCK_BLOCKED = 3,
208 * LCK_DENIED_GRACE_PERIOD = 4
209 * };
210 *
211 *
212 * struct nlm_stat {
213 * nlm_stats stat;
214 * };
215 *
216 * NB: we don't swap bytes for the NLM status values. The upper
217 * layers deal directly with the status value in network byte
218 * order.
219 */
220
221static void encode_nlm_stat(struct xdr_stream *xdr,
222 const __be32 stat)
223{
224 __be32 *p;
225
226 BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD);
227 p = xdr_reserve_space(xdr, 4);
228 *p = stat;
229}
230
231static int decode_nlm_stat(struct xdr_stream *xdr,
232 __be32 *stat)
233{
234 __be32 *p;
235
236 p = xdr_inline_decode(xdr, 4);
237 if (unlikely(p == NULL))
238 goto out_overflow;
239 if (unlikely(*p > nlm_lck_denied_grace_period))
240 goto out_enum;
241 *stat = *p;
242 return 0;
243out_enum:
244 dprintk("%s: server returned invalid nlm_stats value: %u\n",
245 __func__, be32_to_cpup(p));
246 return -EIO;
247out_overflow:
248 print_overflow_msg(__func__, xdr);
249 return -EIO;
250}
251
252/*
253 * struct nlm_holder {
254 * bool exclusive;
255 * int uppid;
256 * netobj oh;
257 * unsigned l_offset;
258 * unsigned l_len;
259 * };
260 */
261static void encode_nlm_holder(struct xdr_stream *xdr,
262 const struct nlm_res *result)
263{
264 const struct nlm_lock *lock = &result->lock;
265 u32 l_offset, l_len;
266 __be32 *p;
267
268 encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
269 encode_int32(xdr, lock->svid);
270 encode_netobj(xdr, lock->oh.data, lock->oh.len);
271
272 p = xdr_reserve_space(xdr, 4 + 4);
273 nlm_compute_offsets(lock, &l_offset, &l_len);
274 *p++ = cpu_to_be32(l_offset);
275 *p = cpu_to_be32(l_len);
276}
277
278static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
279{
280 struct nlm_lock *lock = &result->lock;
281 struct file_lock *fl = &lock->fl;
282 u32 exclusive, l_offset, l_len;
283 int error;
284 __be32 *p;
285 s32 end;
286
287 memset(lock, 0, sizeof(*lock));
288 locks_init_lock(fl);
289
290 p = xdr_inline_decode(xdr, 4 + 4);
291 if (unlikely(p == NULL))
292 goto out_overflow;
293 exclusive = be32_to_cpup(p++);
294 lock->svid = be32_to_cpup(p);
295 fl->fl_pid = (pid_t)lock->svid;
296
297 error = decode_netobj(xdr, &lock->oh);
298 if (unlikely(error))
299 goto out;
300
301 p = xdr_inline_decode(xdr, 4 + 4);
302 if (unlikely(p == NULL))
303 goto out_overflow;
304
305 fl->fl_flags = FL_POSIX;
306 fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
307 l_offset = be32_to_cpup(p++);
308 l_len = be32_to_cpup(p);
309 end = l_offset + l_len - 1;
310
311 fl->fl_start = (loff_t)l_offset;
312 if (l_len == 0 || end < 0)
313 fl->fl_end = OFFSET_MAX;
314 else
315 fl->fl_end = (loff_t)end;
316 error = 0;
317out:
318 return error;
319out_overflow:
320 print_overflow_msg(__func__, xdr);
321 return -EIO;
322}
323
324/*
325 * string caller_name<LM_MAXSTRLEN>;
326 */
327static void encode_caller_name(struct xdr_stream *xdr, const char *name)
328{
329 /* NB: client-side does not set lock->len */
330 u32 length = strlen(name);
331 __be32 *p;
332
333 BUG_ON(length > NLM_MAXSTRLEN);
334 p = xdr_reserve_space(xdr, 4 + length);
335 xdr_encode_opaque(p, name, length);
336}
337
338/*
339 * struct nlm_lock {
340 * string caller_name<LM_MAXSTRLEN>;
341 * netobj fh;
342 * netobj oh;
343 * int uppid;
344 * unsigned l_offset;
345 * unsigned l_len;
346 * };
347 */
348static void encode_nlm_lock(struct xdr_stream *xdr,
349 const struct nlm_lock *lock)
350{
351 u32 l_offset, l_len;
352 __be32 *p;
353
354 encode_caller_name(xdr, lock->caller);
355 encode_fh(xdr, &lock->fh);
356 encode_netobj(xdr, lock->oh.data, lock->oh.len);
357
358 p = xdr_reserve_space(xdr, 4 + 4 + 4);
359 *p++ = cpu_to_be32(lock->svid);
360
361 nlm_compute_offsets(lock, &l_offset, &l_len);
362 *p++ = cpu_to_be32(l_offset);
363 *p = cpu_to_be32(l_len);
364}
365
366
367/*
368 * NLMv3 XDR encode functions
369 *
370 * NLMv3 argument types are defined in Chapter 10 of The Open Group's
371 * "Protocols for Interworking: XNFS, Version 3W".
372 */
373
374/*
375 * struct nlm_testargs {
376 * netobj cookie;
377 * bool exclusive;
378 * struct nlm_lock alock;
379 * };
380 */
381static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
382 struct xdr_stream *xdr,
383 const struct nlm_args *args)
384{
385 const struct nlm_lock *lock = &args->lock;
386
387 encode_cookie(xdr, &args->cookie);
388 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
389 encode_nlm_lock(xdr, lock);
390}
391
392/*
393 * struct nlm_lockargs {
394 * netobj cookie;
395 * bool block;
396 * bool exclusive;
397 * struct nlm_lock alock;
398 * bool reclaim;
399 * int state;
400 * };
401 */
402static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
403 struct xdr_stream *xdr,
404 const struct nlm_args *args)
405{
406 const struct nlm_lock *lock = &args->lock;
407
408 encode_cookie(xdr, &args->cookie);
409 encode_bool(xdr, args->block);
410 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
411 encode_nlm_lock(xdr, lock);
412 encode_bool(xdr, args->reclaim);
413 encode_int32(xdr, args->state);
414}
415
416/*
417 * struct nlm_cancargs {
418 * netobj cookie;
419 * bool block;
420 * bool exclusive;
421 * struct nlm_lock alock;
422 * };
423 */
424static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
425 struct xdr_stream *xdr,
426 const struct nlm_args *args)
427{
428 const struct nlm_lock *lock = &args->lock;
429
430 encode_cookie(xdr, &args->cookie);
431 encode_bool(xdr, args->block);
432 encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
433 encode_nlm_lock(xdr, lock);
434}
435
436/*
437 * struct nlm_unlockargs {
438 * netobj cookie;
439 * struct nlm_lock alock;
440 * };
441 */
442static void nlm_xdr_enc_unlockargs(struct rpc_rqst *req,
443 struct xdr_stream *xdr,
444 const struct nlm_args *args)
445{
446 const struct nlm_lock *lock = &args->lock;
447
448 encode_cookie(xdr, &args->cookie);
449 encode_nlm_lock(xdr, lock);
450}
451
452/*
453 * struct nlm_res {
454 * netobj cookie;
455 * nlm_stat stat;
456 * };
457 */
458static void nlm_xdr_enc_res(struct rpc_rqst *req,
459 struct xdr_stream *xdr,
460 const struct nlm_res *result)
461{
462 encode_cookie(xdr, &result->cookie);
463 encode_nlm_stat(xdr, result->status);
464}
465
466/*
467 * union nlm_testrply switch (nlm_stats stat) {
468 * case LCK_DENIED:
469 * struct nlm_holder holder;
470 * default:
471 * void;
472 * };
473 *
474 * struct nlm_testres {
475 * netobj cookie;
476 * nlm_testrply test_stat;
477 * };
478 */
479static void encode_nlm_testrply(struct xdr_stream *xdr,
480 const struct nlm_res *result)
481{
482 if (result->status == nlm_lck_denied)
483 encode_nlm_holder(xdr, result);
484}
485
486static void nlm_xdr_enc_testres(struct rpc_rqst *req,
487 struct xdr_stream *xdr,
488 const struct nlm_res *result)
489{
490 encode_cookie(xdr, &result->cookie);
491 encode_nlm_stat(xdr, result->status);
492 encode_nlm_testrply(xdr, result);
493}
494
495
496/*
497 * NLMv3 XDR decode functions
498 *
499 * NLMv3 result types are defined in Chapter 10 of The Open Group's
500 * "Protocols for Interworking: XNFS, Version 3W".
501 */
502
503/*
504 * union nlm_testrply switch (nlm_stats stat) {
505 * case LCK_DENIED:
506 * struct nlm_holder holder;
507 * default:
508 * void;
509 * };
510 *
511 * struct nlm_testres {
512 * netobj cookie;
513 * nlm_testrply test_stat;
514 * };
515 */
516static int decode_nlm_testrply(struct xdr_stream *xdr,
517 struct nlm_res *result)
518{
519 int error;
520
521 error = decode_nlm_stat(xdr, &result->status);
522 if (unlikely(error))
523 goto out;
524 if (result->status == nlm_lck_denied)
525 error = decode_nlm_holder(xdr, result);
526out:
527 return error;
528}
529
530static int nlm_xdr_dec_testres(struct rpc_rqst *req,
531 struct xdr_stream *xdr,
532 struct nlm_res *result)
533{
534 int error;
535
536 error = decode_cookie(xdr, &result->cookie);
537 if (unlikely(error))
538 goto out;
539 error = decode_nlm_testrply(xdr, result);
540out:
541 return error;
542}
543
544/*
545 * struct nlm_res {
546 * netobj cookie;
547 * nlm_stat stat;
548 * };
549 */
550static int nlm_xdr_dec_res(struct rpc_rqst *req,
551 struct xdr_stream *xdr,
552 struct nlm_res *result)
553{
554 int error;
555
556 error = decode_cookie(xdr, &result->cookie);
557 if (unlikely(error))
558 goto out;
559 error = decode_nlm_stat(xdr, &result->status);
560out:
561 return error;
562}
563
564
565/*
566 * For NLM, a void procedure really returns nothing
567 */
568#define nlm_xdr_dec_norep NULL
569
570#define PROC(proc, argtype, restype) \
571[NLMPROC_##proc] = { \
572 .p_proc = NLMPROC_##proc, \
573 .p_encode = (kxdreproc_t)nlm_xdr_enc_##argtype, \
574 .p_decode = (kxdrdproc_t)nlm_xdr_dec_##restype, \
575 .p_arglen = NLM_##argtype##_sz, \
576 .p_replen = NLM_##restype##_sz, \
577 .p_statidx = NLMPROC_##proc, \
578 .p_name = #proc, \
579 }
580
581static struct rpc_procinfo nlm_procedures[] = {
582 PROC(TEST, testargs, testres),
583 PROC(LOCK, lockargs, res),
584 PROC(CANCEL, cancargs, res),
585 PROC(UNLOCK, unlockargs, res),
586 PROC(GRANTED, testargs, res),
587 PROC(TEST_MSG, testargs, norep),
588 PROC(LOCK_MSG, lockargs, norep),
589 PROC(CANCEL_MSG, cancargs, norep),
590 PROC(UNLOCK_MSG, unlockargs, norep),
591 PROC(GRANTED_MSG, testargs, norep),
592 PROC(TEST_RES, testres, norep),
593 PROC(LOCK_RES, res, norep),
594 PROC(CANCEL_RES, res, norep),
595 PROC(UNLOCK_RES, res, norep),
596 PROC(GRANTED_RES, res, norep),
597};
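
[Editorial aside, not part of the source file: the PROC() macro above relies on token pasting (##), stringification (#) and designated array initializers, so each table slot is indexed by its own procedure number. A small, runnable user-space analogue with invented names, showing how one PROC() invocation stamps out a whole initializer:]

	#include <stdio.h>

	struct procinfo {
		unsigned int	num;
		const char	*name;
	};

	#define MYPROC_TEST	0
	#define MYPROC_LOCK	1

	/* Same shape as the kernel's PROC() macro: paste the procedure
	 * name to build both the index and .num, stringify it for .name. */
	#define PROC(proc)				\
	[MYPROC_##proc] = {				\
		.num	= MYPROC_##proc,		\
		.name	= #proc,			\
		}

	static struct procinfo procedures[] = {
		PROC(TEST),
		PROC(LOCK),
	};

	int main(void)
	{
		printf("%u %s\n", procedures[1].num, procedures[1].name);
		return 0;	/* prints: 1 LOCK */
	}
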
598
599static struct rpc_version nlm_version1 = {
600 .number = 1,
601 .nrprocs = ARRAY_SIZE(nlm_procedures),
602 .procs = nlm_procedures,
603};
604
605static struct rpc_version nlm_version3 = {
606 .number = 3,
607 .nrprocs = ARRAY_SIZE(nlm_procedures),
608 .procs = nlm_procedures,
609};
610
611static struct rpc_version *nlm_versions[] = {
612 [1] = &nlm_version1,
613 [3] = &nlm_version3,
614#ifdef CONFIG_LOCKD_V4
615 [4] = &nlm_version4,
616#endif
617};
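
[Editorial aside, not part of the source file: nlm_versions[] is deliberately sparse. Designated initializers leave the unlisted slots ([0] and [2], and [4] unless CONFIG_LOCKD_V4 is set) as NULL, so the RPC core can index the table directly by protocol version and reject the gaps. Illustration in user-space C:]

	#include <stdio.h>

	static const char *nlm_version_names[] = {
		[1] = "NLMv1",
		[3] = "NLMv3",
		/* [0] and [2] are implicitly NULL, as is any other gap. */
	};

	int main(void)
	{
		for (unsigned int i = 0; i < 4; i++)
			printf("vers %u -> %s\n", i,
			       nlm_version_names[i] ? nlm_version_names[i]
						    : "unsupported");
		return 0;
	}
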
618
619static struct rpc_stat nlm_rpc_stats;
620
621struct rpc_program nlm_program = {
622 .name = "lockd",
623 .number = NLM_PROGRAM,
624 .nrvers = ARRAY_SIZE(nlm_versions),
625 .version = nlm_versions,
626 .stats = &nlm_rpc_stats,
627};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index bb464d12104c..b7c99bfb3da6 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -25,9 +25,22 @@
 #define NLM_HOST_EXPIRE		(300 * HZ)
 #define NLM_HOST_COLLECT	(120 * HZ)
 
-static struct hlist_head	nlm_hosts[NLM_HOST_NRHASH];
+static struct hlist_head	nlm_server_hosts[NLM_HOST_NRHASH];
+static struct hlist_head	nlm_client_hosts[NLM_HOST_NRHASH];
+
+#define for_each_host(host, pos, chain, table) \
+	for ((chain) = (table); \
+	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+		hlist_for_each_entry((host), (pos), (chain), h_hash)
+
+#define for_each_host_safe(host, pos, next, chain, table) \
+	for ((chain) = (table); \
+	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
+		hlist_for_each_entry_safe((host), (pos), (next), \
+						(chain), h_hash)
+
 static unsigned long		next_gc;
-static int			nrhosts;
+static unsigned long		nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
 static void			nlm_gc_hosts(void);
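
[Editorial aside on the hunk above, not part of the patch: for_each_host() nests hlist_for_each_entry() inside a for loop over the NLM_HOST_NRHASH buckets, so one macro walks the entire hash table. A runnable user-space analogue of the same double loop over an array of singly linked chains; the toy types here are not the kernel's:]

	#include <stdio.h>

	#define NRHASH 4

	struct node {
		int		val;
		struct node	*next;
	};

	/* Walk every chain, then every node within the chain: the same
	 * shape as the kernel's for_each_host() macro. */
	#define for_each_node(n, chain, table)				\
		for ((chain) = (table);					\
		     (chain) < (table) + NRHASH; ++(chain))		\
			for ((n) = *(chain); (n) != NULL; (n) = (n)->next)

	int main(void)
	{
		struct node a = { 1, NULL }, b = { 2, NULL }, c = { 3, &b };
		struct node *table[NRHASH] = { &a, NULL, &c, NULL };
		struct node **chain, *n;

		for_each_node(n, chain, table)
			printf("%d\n", n->val);		/* 1 3 2 */
		return 0;
	}
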
@@ -40,8 +53,6 @@ struct nlm_lookup_host_info {
 	const u32		version;	/* NLM version to search for */
 	const char		*hostname;	/* remote's hostname */
 	const size_t		hostname_len;	/* it's length */
-	const struct sockaddr	*src_sap;	/* our address (optional) */
-	const size_t		src_len;	/* it's length */
 	const int		noresvport;	/* use non-priv port */
 };
 
@@ -88,126 +99,83 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
 }
 
 /*
- * Common host lookup routine for server & client
+ * Allocate and initialize an nlm_host.  Common to both client and server.
  */
-static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
+static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
+				       struct nsm_handle *nsm)
 {
-	struct hlist_head *chain;
-	struct hlist_node *pos;
-	struct nlm_host	*host;
-	struct nsm_handle *nsm = NULL;
-
-	mutex_lock(&nlm_host_mutex);
-
-	if (time_after_eq(jiffies, next_gc))
-		nlm_gc_hosts();
-
-	/* We may keep several nlm_host objects for a peer, because each
-	 * nlm_host is identified by
-	 * (address, protocol, version, server/client)
-	 * We could probably simplify this a little by putting all those
-	 * different NLM rpc_clients into one single nlm_host object.
-	 * This would allow us to have one nlm_host per address.
-	 */
-	chain = &nlm_hosts[nlm_hash_address(ni->sap)];
-	hlist_for_each_entry(host, pos, chain, h_hash) {
-		if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
-			continue;
-
-		/* See if we have an NSM handle for this client */
-		if (!nsm)
-			nsm = host->h_nsmhandle;
-
-		if (host->h_proto != ni->protocol)
-			continue;
-		if (host->h_version != ni->version)
-			continue;
-		if (host->h_server != ni->server)
-			continue;
-		if (ni->server &&
-		    !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
-			continue;
-
-		/* Move to head of hash chain. */
-		hlist_del(&host->h_hash);
-		hlist_add_head(&host->h_hash, chain);
-
-		nlm_get_host(host);
-		dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
-				host->h_name, host->h_addrbuf);
-		goto out;
-	}
+	struct nlm_host *host = NULL;
+	unsigned long now = jiffies;
 
-	/*
-	 * The host wasn't in our hash table.  If we don't
-	 * have an NSM handle for it yet, create one.
-	 */
-	if (nsm)
+	if (nsm != NULL)
 		atomic_inc(&nsm->sm_count);
 	else {
 		host = NULL;
 		nsm = nsm_get_handle(ni->sap, ni->salen,
					ni->hostname, ni->hostname_len);
-		if (!nsm) {
-			dprintk("lockd: nlm_lookup_host failed; "
-				"no nsm handle\n");
+		if (unlikely(nsm == NULL)) {
+			dprintk("lockd: %s failed; no nsm handle\n",
+				__func__);
 			goto out;
 		}
 	}
 
-	host = kzalloc(sizeof(*host), GFP_KERNEL);
-	if (!host) {
+	host = kmalloc(sizeof(*host), GFP_KERNEL);
+	if (unlikely(host == NULL)) {
+		dprintk("lockd: %s failed; no memory\n", __func__);
 		nsm_release(nsm);
-		dprintk("lockd: nlm_lookup_host failed; no memory\n");
 		goto out;
 	}
-	host->h_name	   = nsm->sm_name;
-	host->h_addrbuf    = nsm->sm_addrbuf;
+
 	memcpy(nlm_addr(host), ni->sap, ni->salen);
 	host->h_addrlen = ni->salen;
 	rpc_set_port(nlm_addr(host), 0);
-	memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+	host->h_srcaddrlen = 0;
+
+	host->h_rpcclnt = NULL;
+	host->h_name = nsm->sm_name;
 	host->h_version = ni->version;
 	host->h_proto = ni->protocol;
-	host->h_rpcclnt = NULL;
-	mutex_init(&host->h_mutex);
-	host->h_nextrebind = jiffies + NLM_HOST_REBIND;
-	host->h_expires = jiffies + NLM_HOST_EXPIRE;
-	atomic_set(&host->h_count, 1);
+	host->h_reclaiming = 0;
+	host->h_server = ni->server;
+	host->h_noresvport = ni->noresvport;
+	host->h_inuse = 0;
 	init_waitqueue_head(&host->h_gracewait);
 	init_rwsem(&host->h_rwsem);
-	host->h_state = 0;		/* pseudo NSM state */
-	host->h_nsmstate = 0;		/* real NSM state */
-	host->h_nsmhandle = nsm;
-	host->h_server = ni->server;
-	host->h_noresvport = ni->noresvport;
-	hlist_add_head(&host->h_hash, chain);
+	host->h_state = 0;
+	host->h_nsmstate = 0;
+	host->h_pidcount = 0;
+	atomic_set(&host->h_count, 1);
+	mutex_init(&host->h_mutex);
+	host->h_nextrebind = now + NLM_HOST_REBIND;
+	host->h_expires = now + NLM_HOST_EXPIRE;
 	INIT_LIST_HEAD(&host->h_lockowners);
 	spin_lock_init(&host->h_lock);
 	INIT_LIST_HEAD(&host->h_granted);
 	INIT_LIST_HEAD(&host->h_reclaim);
-
-	nrhosts++;
-
-	dprintk("lockd: nlm_lookup_host created host %s\n",
-			host->h_name);
+	host->h_nsmhandle = nsm;
+	host->h_addrbuf = nsm->sm_addrbuf;
 
 out:
-	mutex_unlock(&nlm_host_mutex);
 	return host;
 }
 
 /*
- * Destroy a host
+ * Destroy an nlm_host and free associated resources
+ *
+ * Caller must hold nlm_host_mutex.
  */
-static void
-nlm_destroy_host(struct nlm_host *host)
+static void nlm_destroy_host_locked(struct nlm_host *host)
 {
 	struct rpc_clnt	*clnt;
 
+	dprintk("lockd: destroy host %s\n", host->h_name);
+
 	BUG_ON(!list_empty(&host->h_lockowners));
 	BUG_ON(atomic_read(&host->h_count));
 
+	hlist_del_init(&host->h_hash);
+
 	nsm_unmonitor(host);
 	nsm_release(host->h_nsmhandle);
 
@@ -215,6 +183,8 @@ nlm_destroy_host(struct nlm_host *host)
 	if (clnt != NULL)
 		rpc_shutdown_client(clnt);
 	kfree(host);
+
+	nrhosts--;
 }
 
 /**
@@ -238,9 +208,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
 				     const char *hostname,
 				     int noresvport)
 {
-	const struct sockaddr source = {
-		.sa_family	= AF_UNSPEC,
-	};
 	struct nlm_lookup_host_info ni = {
 		.server		= 0,
 		.sap		= sap,
@@ -249,16 +216,78 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
 		.version	= version,
 		.hostname	= hostname,
 		.hostname_len	= strlen(hostname),
-		.src_sap	= &source,
-		.src_len	= sizeof(source),
 		.noresvport	= noresvport,
 	};
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nlm_host *host;
+	struct nsm_handle *nsm = NULL;
 
 	dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
 			(hostname ? hostname : "<none>"), version,
 			(protocol == IPPROTO_UDP ? "udp" : "tcp"));
 
-	return nlm_lookup_host(&ni);
+	mutex_lock(&nlm_host_mutex);
+
+	chain = &nlm_client_hosts[nlm_hash_address(sap)];
+	hlist_for_each_entry(host, pos, chain, h_hash) {
+		if (!rpc_cmp_addr(nlm_addr(host), sap))
+			continue;
+
+		/* Same address. Share an NSM handle if we already have one */
+		if (nsm == NULL)
+			nsm = host->h_nsmhandle;
+
+		if (host->h_proto != protocol)
+			continue;
+		if (host->h_version != version)
+			continue;
+
+		nlm_get_host(host);
+		dprintk("lockd: %s found host %s (%s)\n", __func__,
+			host->h_name, host->h_addrbuf);
+		goto out;
+	}
+
+	host = nlm_alloc_host(&ni, nsm);
+	if (unlikely(host == NULL))
+		goto out;
+
+	hlist_add_head(&host->h_hash, chain);
+	nrhosts++;
+
+	dprintk("lockd: %s created host %s (%s)\n", __func__,
+		host->h_name, host->h_addrbuf);
+
+out:
+	mutex_unlock(&nlm_host_mutex);
+	return host;
+}
+
+/**
+ * nlmclnt_release_host - release client nlm_host
+ * @host: nlm_host to release
+ *
+ */
+void nlmclnt_release_host(struct nlm_host *host)
+{
+	if (host == NULL)
+		return;
+
+	dprintk("lockd: release client host %s\n", host->h_name);
+
+	BUG_ON(atomic_read(&host->h_count) < 0);
+	BUG_ON(host->h_server);
+
+	if (atomic_dec_and_test(&host->h_count)) {
+		BUG_ON(!list_empty(&host->h_lockowners));
+		BUG_ON(!list_empty(&host->h_granted));
+		BUG_ON(!list_empty(&host->h_reclaim));
+
+		mutex_lock(&nlm_host_mutex);
+		nlm_destroy_host_locked(host);
+		mutex_unlock(&nlm_host_mutex);
+	}
 }
 
 /**
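
[Editorial aside on nlmclnt_release_host() above, not part of the patch: this is the common "drop a reference, and only the thread that drops the last one takes the table mutex and destroys the object" idiom. A runnable user-space sketch with C11 atomics; the table unlink is elided and the names are invented:]

	#include <stdatomic.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;

	struct host {
		atomic_int refcount;
		/* ... hash-table linkage would live here ... */
	};

	static void release_host(struct host *h)
	{
		if (h == NULL)
			return;
		/* Only the caller who drops the count to zero destroys;
		 * the mutex protects the (elided) unlink from the table. */
		if (atomic_fetch_sub(&h->refcount, 1) == 1) {
			pthread_mutex_lock(&table_mutex);
			/* unlink from the table here */
			pthread_mutex_unlock(&table_mutex);
			free(h);
		}
	}

	int main(void)
	{
		struct host *h = malloc(sizeof(*h));

		atomic_init(&h->refcount, 2);	/* two users */
		release_host(h);		/* 2 -> 1: no destroy */
		release_host(h);		/* 1 -> 0: destroyed */
		printf("done\n");
		return 0;
	}
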
@@ -283,12 +312,18 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 				    const char *hostname,
 				    const size_t hostname_len)
 {
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nlm_host *host = NULL;
+	struct nsm_handle *nsm = NULL;
 	struct sockaddr_in sin = {
 		.sin_family	= AF_INET,
 	};
 	struct sockaddr_in6 sin6 = {
 		.sin6_family	= AF_INET6,
 	};
+	struct sockaddr *src_sap;
+	size_t src_len = rqstp->rq_addrlen;
 	struct nlm_lookup_host_info ni = {
 		.server		= 1,
 		.sap		= svc_addr(rqstp),
@@ -297,27 +332,91 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 		.version	= rqstp->rq_vers,
 		.hostname	= hostname,
 		.hostname_len	= hostname_len,
-		.src_len	= rqstp->rq_addrlen,
 	};
 
 	dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
 			(int)hostname_len, hostname, rqstp->rq_vers,
 			(rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
 
+	mutex_lock(&nlm_host_mutex);
+
 	switch (ni.sap->sa_family) {
 	case AF_INET:
 		sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
-		ni.src_sap = (struct sockaddr *)&sin;
+		src_sap = (struct sockaddr *)&sin;
 		break;
 	case AF_INET6:
 		ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
-		ni.src_sap = (struct sockaddr *)&sin6;
+		src_sap = (struct sockaddr *)&sin6;
 		break;
 	default:
-		return NULL;
+		dprintk("lockd: %s failed; unrecognized address family\n",
+			__func__);
+		goto out;
 	}
 
-	return nlm_lookup_host(&ni);
+	if (time_after_eq(jiffies, next_gc))
+		nlm_gc_hosts();
+
+	chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
+	hlist_for_each_entry(host, pos, chain, h_hash) {
+		if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
+			continue;
+
+		/* Same address. Share an NSM handle if we already have one */
+		if (nsm == NULL)
+			nsm = host->h_nsmhandle;
+
+		if (host->h_proto != ni.protocol)
+			continue;
+		if (host->h_version != ni.version)
+			continue;
+		if (!rpc_cmp_addr(nlm_srcaddr(host), src_sap))
+			continue;
+
+		/* Move to head of hash chain. */
+		hlist_del(&host->h_hash);
+		hlist_add_head(&host->h_hash, chain);
+
+		nlm_get_host(host);
+		dprintk("lockd: %s found host %s (%s)\n",
+			__func__, host->h_name, host->h_addrbuf);
+		goto out;
+	}
+
+	host = nlm_alloc_host(&ni, nsm);
+	if (unlikely(host == NULL))
+		goto out;
+
+	memcpy(nlm_srcaddr(host), src_sap, src_len);
+	host->h_srcaddrlen = src_len;
+	hlist_add_head(&host->h_hash, chain);
+	nrhosts++;
+
+	dprintk("lockd: %s created host %s (%s)\n",
+		__func__, host->h_name, host->h_addrbuf);
+
+out:
+	mutex_unlock(&nlm_host_mutex);
+	return host;
+}
+
+/**
+ * nlmsvc_release_host - release server nlm_host
+ * @host: nlm_host to release
+ *
+ * Host is destroyed later in nlm_gc_host().
+ */
+void nlmsvc_release_host(struct nlm_host *host)
+{
+	if (host == NULL)
+		return;
+
+	dprintk("lockd: release server host %s\n", host->h_name);
+
+	BUG_ON(atomic_read(&host->h_count) < 0);
+	BUG_ON(!host->h_server);
+	atomic_dec(&host->h_count);
 }
 
 /*
@@ -353,10 +452,10 @@ nlm_bind_host(struct nlm_host *host)
 		.to_retries	= 5U,
 	};
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= host->h_proto,
 		.address	= nlm_addr(host),
 		.addrsize	= host->h_addrlen,
-		.saddress	= nlm_srcaddr(host),
 		.timeout	= &timeparms,
 		.servername	= host->h_name,
 		.program	= &nlm_program,
@@ -375,6 +474,8 @@ nlm_bind_host(struct nlm_host *host)
 			args.flags |= RPC_CLNT_CREATE_HARDRTRY;
 		if (host->h_noresvport)
 			args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+		if (host->h_srcaddrlen)
+			args.saddress = nlm_srcaddr(host);
 
 		clnt = rpc_create(&args);
 		if (!IS_ERR(clnt))
@@ -415,20 +516,29 @@ struct nlm_host * nlm_get_host(struct nlm_host *host)
 	return host;
 }
 
-/*
- * Release NLM host after use
- */
-void nlm_release_host(struct nlm_host *host)
+static struct nlm_host *next_host_state(struct hlist_head *cache,
+					struct nsm_handle *nsm,
+					const struct nlm_reboot *info)
 {
-	if (host != NULL) {
-		dprintk("lockd: release host %s\n", host->h_name);
-		BUG_ON(atomic_read(&host->h_count) < 0);
-		if (atomic_dec_and_test(&host->h_count)) {
-			BUG_ON(!list_empty(&host->h_lockowners));
-			BUG_ON(!list_empty(&host->h_granted));
-			BUG_ON(!list_empty(&host->h_reclaim));
+	struct nlm_host *host;
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+
+	mutex_lock(&nlm_host_mutex);
+	for_each_host(host, pos, chain, cache) {
+		if (host->h_nsmhandle == nsm
+		    && host->h_nsmstate != info->state) {
+			host->h_nsmstate = info->state;
+			host->h_state++;
+
+			nlm_get_host(host);
+			mutex_unlock(&nlm_host_mutex);
+			return host;
 		}
 	}
+
+	mutex_unlock(&nlm_host_mutex);
+	return NULL;
 }
 
 /**
@@ -440,8 +550,6 @@ void nlm_release_host(struct nlm_host *host)
  */
 void nlm_host_rebooted(const struct nlm_reboot *info)
 {
-	struct hlist_head *chain;
-	struct hlist_node *pos;
 	struct nsm_handle *nsm;
 	struct nlm_host	*host;
 
@@ -454,32 +562,15 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
 	 * lock for this.
 	 * To avoid processing a host several times, we match the nsmstate.
 	 */
-again:	mutex_lock(&nlm_host_mutex);
-	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash) {
-			if (host->h_nsmhandle == nsm
-			 && host->h_nsmstate != info->state) {
-				host->h_nsmstate = info->state;
-				host->h_state++;
-
-				nlm_get_host(host);
-				mutex_unlock(&nlm_host_mutex);
-
-				if (host->h_server) {
-					/* We're server for this guy, just ditch
-					 * all the locks he held. */
-					nlmsvc_free_host_resources(host);
-				} else {
-					/* He's the server, initiate lock recovery. */
-					nlmclnt_recovery(host);
-				}
-
-				nlm_release_host(host);
-				goto again;
-			}
-		}
+	while ((host = next_host_state(nlm_server_hosts, nsm, info)) != NULL) {
+		nlmsvc_free_host_resources(host);
+		nlmsvc_release_host(host);
 	}
-	mutex_unlock(&nlm_host_mutex);
+	while ((host = next_host_state(nlm_client_hosts, nsm, info)) != NULL) {
+		nlmclnt_recovery(host);
+		nlmclnt_release_host(host);
+	}
+
 	nsm_release(nsm);
 }
 
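
[Editorial aside on the rewritten nlm_host_rebooted() above, not part of the patch: instead of restarting a full table scan after every drop of the mutex (the old goto again), next_host_state() hands back one matching, already-marked host at a time, with the lock released in between, so the caller's loop stays flat. A runnable user-space sketch of that "pop one item at a time under a lock" pattern, with a toy array in place of the hash table:]

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t list_mutex = PTHREAD_MUTEX_INITIALIZER;

	struct item {
		int	state;
		int	seen;
	};

	static struct item items[] = { {1, 0}, {0, 0}, {1, 0} };

	/* Return the next unprocessed item matching 'state', marking it
	 * under the lock so each item is handed out exactly once. */
	static struct item *next_item(int state)
	{
		pthread_mutex_lock(&list_mutex);
		for (size_t i = 0; i < sizeof(items)/sizeof(items[0]); i++) {
			if (items[i].state == state && !items[i].seen) {
				items[i].seen = 1;
				pthread_mutex_unlock(&list_mutex);
				return &items[i];
			}
		}
		pthread_mutex_unlock(&list_mutex);
		return NULL;
	}

	int main(void)
	{
		struct item *it;

		/* Caller may sleep or take other locks between items. */
		while ((it = next_item(1)) != NULL)
			printf("processed item at %p\n", (void *)it);
		return 0;
	}
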
@@ -499,13 +590,11 @@ nlm_shutdown_hosts(void)
 
 	/* First, make all hosts eligible for gc */
 	dprintk("lockd: nuking all hosts...\n");
-	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash) {
-			host->h_expires = jiffies - 1;
-			if (host->h_rpcclnt) {
-				rpc_shutdown_client(host->h_rpcclnt);
-				host->h_rpcclnt = NULL;
-			}
+	for_each_host(host, pos, chain, nlm_server_hosts) {
+		host->h_expires = jiffies - 1;
+		if (host->h_rpcclnt) {
+			rpc_shutdown_client(host->h_rpcclnt);
+			host->h_rpcclnt = NULL;
 		}
 	}
 
@@ -514,15 +603,13 @@ nlm_shutdown_hosts(void)
 	mutex_unlock(&nlm_host_mutex);
 
 	/* complain if any hosts are left */
-	if (nrhosts) {
+	if (nrhosts != 0) {
 		printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
-		dprintk("lockd: %d hosts left:\n", nrhosts);
-		for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-			hlist_for_each_entry(host, pos, chain, h_hash) {
-				dprintk("       %s (cnt %d use %d exp %ld)\n",
-					host->h_name, atomic_read(&host->h_count),
-					host->h_inuse, host->h_expires);
-			}
+		dprintk("lockd: %lu hosts left:\n", nrhosts);
+		for_each_host(host, pos, chain, nlm_server_hosts) {
+			dprintk("       %s (cnt %d use %d exp %ld)\n",
+				host->h_name, atomic_read(&host->h_count),
+				host->h_inuse, host->h_expires);
 		}
 	}
 }
@@ -540,29 +627,22 @@ nlm_gc_hosts(void)
 	struct nlm_host	*host;
 
 	dprintk("lockd: host garbage collection\n");
-	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash)
-			host->h_inuse = 0;
-	}
+	for_each_host(host, pos, chain, nlm_server_hosts)
+		host->h_inuse = 0;
 
 	/* Mark all hosts that hold locks, blocks or shares */
 	nlmsvc_mark_resources();
 
-	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry_safe(host, pos, next, chain, h_hash) {
-			if (atomic_read(&host->h_count) || host->h_inuse
-			 || time_before(jiffies, host->h_expires)) {
-				dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n",
-					host->h_name, atomic_read(&host->h_count),
-					host->h_inuse, host->h_expires);
-				continue;
-			}
-			dprintk("lockd: delete host %s\n", host->h_name);
-			hlist_del_init(&host->h_hash);
-
-			nlm_destroy_host(host);
-			nrhosts--;
+	for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
+		if (atomic_read(&host->h_count) || host->h_inuse
+		    || time_before(jiffies, host->h_expires)) {
+			dprintk("nlm_gc_hosts skipping %s "
+				"(cnt %d use %d exp %ld)\n",
+				host->h_name, atomic_read(&host->h_count),
+				host->h_inuse, host->h_expires);
+			continue;
 		}
+		nlm_destroy_host_locked(host);
 	}
 
 	next_gc = jiffies + NLM_HOST_COLLECT;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e3015464fbab..23d7451b2938 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void)
 		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
 	};
 	struct rpc_create_args args = {
+		.net			= &init_net,
 		.protocol		= XPRT_TRANSPORT_UDP,
 		.address		= (struct sockaddr *)&sin,
 		.addrsize		= sizeof(sin),
@@ -400,26 +401,22 @@ void nsm_release(struct nsm_handle *nsm)
  * Status Monitor wire protocol.
  */
 
-static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
+static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
 {
 	const u32 len = strlen(string);
 	__be32 *p;
 
-	if (unlikely(len > SM_MAXSTRLEN))
-		return -EIO;
-	p = xdr_reserve_space(xdr, sizeof(u32) + len);
-	if (unlikely(p == NULL))
-		return -EIO;
+	BUG_ON(len > SM_MAXSTRLEN);
+	p = xdr_reserve_space(xdr, 4 + len);
 	xdr_encode_opaque(p, string, len);
-	return 0;
 }
 
 /*
  * "mon_name" specifies the host to be monitored.
  */
-static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	return encode_nsm_string(xdr, argp->mon_name);
+	encode_nsm_string(xdr, argp->mon_name);
 }
 
 /*
@@ -428,35 +425,25 @@ static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
  * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
  * has changed.
  */
-static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	int status;
 	__be32 *p;
 
-	status = encode_nsm_string(xdr, utsname()->nodename);
-	if (unlikely(status != 0))
-		return status;
-	p = xdr_reserve_space(xdr, 3 * sizeof(u32));
-	if (unlikely(p == NULL))
-		return -EIO;
-	*p++ = htonl(argp->prog);
-	*p++ = htonl(argp->vers);
-	*p++ = htonl(argp->proc);
-	return 0;
+	encode_nsm_string(xdr, utsname()->nodename);
+	p = xdr_reserve_space(xdr, 4 + 4 + 4);
+	*p++ = cpu_to_be32(argp->prog);
+	*p++ = cpu_to_be32(argp->vers);
+	*p = cpu_to_be32(argp->proc);
 }
 
 /*
  * The "mon_id" argument specifies the non-private arguments
  * of an NSMPROC_MON or NSMPROC_UNMON call.
  */
-static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	int status;
-
-	status = encode_mon_name(xdr, argp);
-	if (unlikely(status != 0))
-		return status;
-	return encode_my_id(xdr, argp);
+	encode_mon_name(xdr, argp);
+	encode_my_id(xdr, argp);
 }
 
 /*
@@ -464,68 +451,56 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
  * by the NSMPROC_MON call. This information will be supplied in the
  * NLMPROC_SM_NOTIFY call.
  */
-static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
+static void encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
-	if (unlikely(p == NULL))
-		return -EIO;
 	xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
-	return 0;
 }
 
-static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
-		       const struct nsm_args *argp)
+static void nsm_xdr_enc_mon(struct rpc_rqst *req, struct xdr_stream *xdr,
+			    const struct nsm_args *argp)
 {
-	struct xdr_stream xdr;
-	int status;
-
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	status = encode_mon_id(&xdr, argp);
-	if (unlikely(status))
-		return status;
-	return encode_priv(&xdr, argp);
+	encode_mon_id(xdr, argp);
+	encode_priv(xdr, argp);
 }
 
-static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
-			 const struct nsm_args *argp)
+static void nsm_xdr_enc_unmon(struct rpc_rqst *req, struct xdr_stream *xdr,
+			      const struct nsm_args *argp)
 {
-	struct xdr_stream xdr;
-
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	return encode_mon_id(&xdr, argp);
+	encode_mon_id(xdr, argp);
 }
 
-static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
-			    struct nsm_res *resp)
+static int nsm_xdr_dec_stat_res(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct nsm_res *resp)
 {
-	struct xdr_stream xdr;
+	__be32 *p;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
+	p = xdr_inline_decode(xdr, 4 + 4);
 	if (unlikely(p == NULL))
 		return -EIO;
-	resp->status = ntohl(*p++);
-	resp->state = ntohl(*p);
+	resp->status = be32_to_cpup(p++);
+	resp->state = be32_to_cpup(p);
 
-	dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
-		resp->status, resp->state);
+	dprintk("lockd: %s status %d state %d\n",
+		__func__, resp->status, resp->state);
 	return 0;
 }
 
-static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
-			struct nsm_res *resp)
+static int nsm_xdr_dec_stat(struct rpc_rqst *rqstp,
+			    struct xdr_stream *xdr,
+			    struct nsm_res *resp)
 {
-	struct xdr_stream xdr;
+	__be32 *p;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	p = xdr_inline_decode(&xdr, sizeof(u32));
+	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
		return -EIO;
-	resp->state = ntohl(*p);
+	resp->state = be32_to_cpup(p);
 
-	dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
+	dprintk("lockd: %s state %d\n", __func__, resp->state);
 	return 0;
 }
 
@@ -541,8 +516,8 @@ static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
 static struct rpc_procinfo	nsm_procedures[] = {
 [NSMPROC_MON] = {
 		.p_proc		= NSMPROC_MON,
-		.p_encode	= (kxdrproc_t)xdr_enc_mon,
-		.p_decode	= (kxdrproc_t)xdr_dec_stat_res,
+		.p_encode	= (kxdreproc_t)nsm_xdr_enc_mon,
+		.p_decode	= (kxdrdproc_t)nsm_xdr_dec_stat_res,
 		.p_arglen	= SM_mon_sz,
 		.p_replen	= SM_monres_sz,
 		.p_statidx	= NSMPROC_MON,
@@ -550,8 +525,8 @@ static struct rpc_procinfo nsm_procedures[] = {
 	},
 [NSMPROC_UNMON] = {
 		.p_proc		= NSMPROC_UNMON,
-		.p_encode	= (kxdrproc_t)xdr_enc_unmon,
-		.p_decode	= (kxdrproc_t)xdr_dec_stat,
+		.p_encode	= (kxdreproc_t)nsm_xdr_enc_unmon,
+		.p_decode	= (kxdrdproc_t)nsm_xdr_dec_stat,
 		.p_arglen	= SM_mon_id_sz,
 		.p_replen	= SM_unmonres_sz,
 		.p_statidx	= NSMPROC_UNMON,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index f1bacf1a0391..abfff9d7979d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -22,7 +22,6 @@
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/smp.h> 24#include <linux/smp.h>
25#include <linux/smp_lock.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/kthread.h> 26#include <linux/kthread.h>
28#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -130,15 +129,6 @@ lockd(void *vrqstp)
130 129
131 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); 130 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
132 131
133 /*
134 * FIXME: it would be nice if lockd didn't spend its entire life
135 * running under the BKL. At the very least, it would be good to
136 * have someone clarify what it's intended to protect here. I've
137 * seen some handwavy posts about posix locking needing to be
138 * done under the BKL, but it's far from clear.
139 */
140 lock_kernel();
141
142 if (!nlm_timeout) 132 if (!nlm_timeout)
143 nlm_timeout = LOCKD_DFLT_TIMEO; 133 nlm_timeout = LOCKD_DFLT_TIMEO;
144 nlmsvc_timeout = nlm_timeout * HZ; 134 nlmsvc_timeout = nlm_timeout * HZ;
@@ -195,7 +185,6 @@ lockd(void *vrqstp)
195 if (nlmsvc_ops) 185 if (nlmsvc_ops)
196 nlmsvc_invalidate_all(); 186 nlmsvc_invalidate_all();
197 nlm_shutdown_hosts(); 187 nlm_shutdown_hosts();
198 unlock_kernel();
199 return 0; 188 return 0;
200} 189}
201 190
@@ -206,7 +195,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
206 195
207 xprt = svc_find_xprt(serv, name, family, 0); 196 xprt = svc_find_xprt(serv, name, family, 0);
208 if (xprt == NULL) 197 if (xprt == NULL)
209 return svc_create_xprt(serv, name, family, port, 198 return svc_create_xprt(serv, name, &init_net, family, port,
210 SVC_SOCK_DEFAULTS); 199 SVC_SOCK_DEFAULTS);
211 svc_xprt_put(xprt); 200 svc_xprt_put(xprt);
212 return 0; 201 return 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 031c6569a134..9a41fdc19511 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
@@ -52,7 +51,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
52 return 0; 51 return 0;
53 52
54no_locks: 53no_locks:
55 nlm_release_host(host); 54 nlmsvc_release_host(host);
56 if (error) 55 if (error)
57 return error; 56 return error;
58 return nlm_lck_denied_nolocks; 57 return nlm_lck_denied_nolocks;
@@ -93,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
93 else 92 else
94 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); 93 dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
95 94
96 nlm_release_host(host); 95 nlmsvc_release_host(host);
97 nlm_release_file(file); 96 nlm_release_file(file);
98 return rc; 97 return rc;
99} 98}
@@ -135,7 +134,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
135 else 134 else
136 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 135 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
137 136
138 nlm_release_host(host); 137 nlmsvc_release_host(host);
139 nlm_release_file(file); 138 nlm_release_file(file);
140 return rc; 139 return rc;
141} 140}
@@ -165,7 +164,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
165 resp->status = nlmsvc_cancel_blocked(file, &argp->lock); 164 resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
166 165
167 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 166 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
168 nlm_release_host(host); 167 nlmsvc_release_host(host);
169 nlm_release_file(file); 168 nlm_release_file(file);
170 return rpc_success; 169 return rpc_success;
171} 170}
@@ -198,7 +197,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
198 resp->status = nlmsvc_unlock(file, &argp->lock); 197 resp->status = nlmsvc_unlock(file, &argp->lock);
199 198
200 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 199 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
201 nlm_release_host(host); 200 nlmsvc_release_host(host);
202 nlm_release_file(file); 201 nlm_release_file(file);
203 return rpc_success; 202 return rpc_success;
204} 203}
@@ -230,9 +229,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
230 229
231static void nlm4svc_callback_release(void *data) 230static void nlm4svc_callback_release(void *data)
232{ 231{
233 lock_kernel(); 232 nlmsvc_release_call(data);
234 nlm_release_call(data);
235 unlock_kernel();
236} 233}
237 234
238static const struct rpc_call_ops nlm4svc_callback_ops = { 235static const struct rpc_call_ops nlm4svc_callback_ops = {
@@ -264,7 +261,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
264 261
265 stat = func(rqstp, argp, &call->a_res); 262 stat = func(rqstp, argp, &call->a_res);
266 if (stat != 0) { 263 if (stat != 0) {
267 nlm_release_call(call); 264 nlmsvc_release_call(call);
268 return stat; 265 return stat;
269 } 266 }
270 267
@@ -337,7 +334,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
337 resp->status = nlmsvc_share_file(host, file, argp); 334 resp->status = nlmsvc_share_file(host, file, argp);
338 335
339 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 336 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
340 nlm_release_host(host); 337 nlmsvc_release_host(host);
341 nlm_release_file(file); 338 nlm_release_file(file);
342 return rpc_success; 339 return rpc_success;
343} 340}
@@ -370,7 +367,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
370 resp->status = nlmsvc_unshare_file(host, file, argp); 367 resp->status = nlmsvc_unshare_file(host, file, argp);
371 368
372 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 369 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
373 nlm_release_host(host); 370 nlmsvc_release_host(host);
374 nlm_release_file(file); 371 nlm_release_file(file);
375 return rpc_success; 372 return rpc_success;
376} 373}
@@ -402,7 +399,7 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
402 return rpc_success; 399 return rpc_success;
403 400
404 nlmsvc_free_host_resources(host); 401 nlmsvc_free_host_resources(host);
405 nlm_release_host(host); 402 nlmsvc_release_host(host);
406 return rpc_success; 403 return rpc_success;
407} 404}
408 405
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84055d31bfc5..6e31695d046f 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -25,7 +25,6 @@
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/smp_lock.h>
29#include <linux/sunrpc/clnt.h> 28#include <linux/sunrpc/clnt.h>
30#include <linux/sunrpc/svc.h> 29#include <linux/sunrpc/svc.h>
31#include <linux/lockd/nlm.h> 30#include <linux/lockd/nlm.h>
@@ -47,17 +46,19 @@ static void nlmsvc_remove_block(struct nlm_block *block);
47static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); 46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
48static void nlmsvc_freegrantargs(struct nlm_rqst *call); 47static void nlmsvc_freegrantargs(struct nlm_rqst *call);
49static const struct rpc_call_ops nlmsvc_grant_ops; 48static const struct rpc_call_ops nlmsvc_grant_ops;
49static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
50 50
51/* 51/*
52 * The list of blocked locks to retry 52 * The list of blocked locks to retry
53 */ 53 */
54static LIST_HEAD(nlm_blocked); 54static LIST_HEAD(nlm_blocked);
55static DEFINE_SPINLOCK(nlm_blocked_lock);
55 56
56/* 57/*
57 * Insert a blocked lock into the global list 58 * Insert a blocked lock into the global list
58 */ 59 */
59static void 60static void
60nlmsvc_insert_block(struct nlm_block *block, unsigned long when) 61nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
61{ 62{
62 struct nlm_block *b; 63 struct nlm_block *b;
63 struct list_head *pos; 64 struct list_head *pos;
@@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
87 block->b_when = when; 88 block->b_when = when;
88} 89}
89 90
91static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
92{
93 spin_lock(&nlm_blocked_lock);
94 nlmsvc_insert_block_locked(block, when);
95 spin_unlock(&nlm_blocked_lock);
96}
97
90/* 98/*
91 * Remove a block from the global list 99 * Remove a block from the global list
92 */ 100 */
@@ -94,7 +102,9 @@ static inline void
94nlmsvc_remove_block(struct nlm_block *block) 102nlmsvc_remove_block(struct nlm_block *block)
95{ 103{
96 if (!list_empty(&block->b_list)) { 104 if (!list_empty(&block->b_list)) {
105 spin_lock(&nlm_blocked_lock);
97 list_del_init(&block->b_list); 106 list_del_init(&block->b_list);
107 spin_unlock(&nlm_blocked_lock);
98 nlmsvc_release_block(block); 108 nlmsvc_release_block(block);
99 } 109 }
100} 110}
@@ -224,7 +234,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
224failed_free: 234failed_free:
225 kfree(block); 235 kfree(block);
226failed: 236failed:
227 nlm_release_call(call); 237 nlmsvc_release_call(call);
228 return NULL; 238 return NULL;
229} 239}
230 240
@@ -257,7 +267,7 @@ static void nlmsvc_free_block(struct kref *kref)
257 mutex_unlock(&file->f_mutex); 267 mutex_unlock(&file->f_mutex);
258 268
259 nlmsvc_freegrantargs(block->b_call); 269 nlmsvc_freegrantargs(block->b_call);
260 nlm_release_call(block->b_call); 270 nlmsvc_release_call(block->b_call);
261 nlm_release_file(block->b_file); 271 nlm_release_file(block->b_file);
262 kfree(block->b_fl); 272 kfree(block->b_fl);
263 kfree(block); 273 kfree(block);
@@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
651 struct nlm_block *block; 661 struct nlm_block *block;
652 int rc = -ENOENT; 662 int rc = -ENOENT;
653 663
654 lock_kernel(); 664 spin_lock(&nlm_blocked_lock);
655 list_for_each_entry(block, &nlm_blocked, b_list) { 665 list_for_each_entry(block, &nlm_blocked, b_list) {
656 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 666 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
657 dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", 667 dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
@@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
665 } else if (result == 0) 675 } else if (result == 0)
666 block->b_granted = 1; 676 block->b_granted = 1;
667 677
668 nlmsvc_insert_block(block, 0); 678 nlmsvc_insert_block_locked(block, 0);
669 svc_wake_up(block->b_daemon); 679 svc_wake_up(block->b_daemon);
670 rc = 0; 680 rc = 0;
671 break; 681 break;
672 } 682 }
673 } 683 }
674 unlock_kernel(); 684 spin_unlock(&nlm_blocked_lock);
675 if (rc == -ENOENT) 685 if (rc == -ENOENT)
676 printk(KERN_WARNING "lockd: grant for unknown block\n"); 686 printk(KERN_WARNING "lockd: grant for unknown block\n");
677 return rc; 687 return rc;
@@ -690,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl)
690 struct nlm_block *block; 700 struct nlm_block *block;
691 701
692 dprintk("lockd: VFS unblock notification for block %p\n", fl); 702 dprintk("lockd: VFS unblock notification for block %p\n", fl);
703 spin_lock(&nlm_blocked_lock);
693 list_for_each_entry(block, &nlm_blocked, b_list) { 704 list_for_each_entry(block, &nlm_blocked, b_list) {
694 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 705 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
695 nlmsvc_insert_block(block, 0); 706 nlmsvc_insert_block_locked(block, 0);
707 spin_unlock(&nlm_blocked_lock);
696 svc_wake_up(block->b_daemon); 708 svc_wake_up(block->b_daemon);
697 return; 709 return;
698 } 710 }
699 } 711 }
700 712 spin_unlock(&nlm_blocked_lock);
701 printk(KERN_WARNING "lockd: notification for unknown block!\n"); 713 printk(KERN_WARNING "lockd: notification for unknown block!\n");
702} 714}
703 715
@@ -803,7 +815,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
803 815
804 dprintk("lockd: GRANT_MSG RPC callback\n"); 816 dprintk("lockd: GRANT_MSG RPC callback\n");
805 817
806 lock_kernel(); 818 spin_lock(&nlm_blocked_lock);
807 /* if the block is not on a list at this point then it has 819 /* if the block is not on a list at this point then it has
808 * been invalidated. Don't try to requeue it. 820 * been invalidated. Don't try to requeue it.
809 * 821 *
@@ -825,19 +837,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
825 /* Call was successful, now wait for client callback */ 837 /* Call was successful, now wait for client callback */
826 timeout = 60 * HZ; 838 timeout = 60 * HZ;
827 } 839 }
828 nlmsvc_insert_block(block, timeout); 840 nlmsvc_insert_block_locked(block, timeout);
829 svc_wake_up(block->b_daemon); 841 svc_wake_up(block->b_daemon);
830out: 842out:
831 unlock_kernel(); 843 spin_unlock(&nlm_blocked_lock);
832} 844}
833 845
846/*
847 * FIXME: nlmsvc_release_block() grabs a mutex. This is not allowed for an
848 * .rpc_release rpc_call_op
849 */
834static void nlmsvc_grant_release(void *data) 850static void nlmsvc_grant_release(void *data)
835{ 851{
836 struct nlm_rqst *call = data; 852 struct nlm_rqst *call = data;
837
838 lock_kernel();
839 nlmsvc_release_block(call->a_block); 853 nlmsvc_release_block(call->a_block);
840 unlock_kernel();
841} 854}
842 855
843static const struct rpc_call_ops nlmsvc_grant_ops = { 856static const struct rpc_call_ops nlmsvc_grant_ops = {
@@ -922,3 +935,32 @@ nlmsvc_retry_blocked(void)
922 935
923 return timeout; 936 return timeout;
924} 937}
938
939#ifdef RPC_DEBUG
940static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
941{
942 /*
943 * We can get away with a static buffer because we're only
944 * called with BKL held.
945 */
946 static char buf[2*NLM_MAXCOOKIELEN+1];
947 unsigned int i, len = sizeof(buf);
948 char *p = buf;
949
950 len--; /* allow for trailing \0 */
951 if (len < 3)
952 return "???";
953 for (i = 0 ; i < cookie->len ; i++) {
954 if (len < 2) {
955 strcpy(p-3, "...");
956 break;
957 }
958 sprintf(p, "%02x", cookie->data[i]);
959 p += 2;
960 len -= 2;
961 }
962 *p = '\0';
963
964 return buf;
965}
966#endif
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0f2ab741ae7c..d27aab11f324 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
@@ -81,7 +80,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
81 return 0; 80 return 0;
82 81
83no_locks: 82no_locks:
84 nlm_release_host(host); 83 nlmsvc_release_host(host);
85 if (error) 84 if (error)
86 return error; 85 return error;
87 return nlm_lck_denied_nolocks; 86 return nlm_lck_denied_nolocks;
@@ -123,7 +122,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
123 dprintk("lockd: TEST status %d vers %d\n", 122 dprintk("lockd: TEST status %d vers %d\n",
124 ntohl(resp->status), rqstp->rq_vers); 123 ntohl(resp->status), rqstp->rq_vers);
125 124
126 nlm_release_host(host); 125 nlmsvc_release_host(host);
127 nlm_release_file(file); 126 nlm_release_file(file);
128 return rc; 127 return rc;
129} 128}
@@ -165,7 +164,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
165 else 164 else
166 dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 165 dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
167 166
168 nlm_release_host(host); 167 nlmsvc_release_host(host);
169 nlm_release_file(file); 168 nlm_release_file(file);
170 return rc; 169 return rc;
171} 170}
@@ -195,7 +194,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
195 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); 194 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
196 195
197 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); 196 dprintk("lockd: CANCEL status %d\n", ntohl(resp->status));
198 nlm_release_host(host); 197 nlmsvc_release_host(host);
199 nlm_release_file(file); 198 nlm_release_file(file);
200 return rpc_success; 199 return rpc_success;
201} 200}
@@ -228,7 +227,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
228 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); 227 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
229 228
230 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); 229 dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status));
231 nlm_release_host(host); 230 nlmsvc_release_host(host);
232 nlm_release_file(file); 231 nlm_release_file(file);
233 return rpc_success; 232 return rpc_success;
234} 233}
@@ -258,11 +257,17 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
258 -task->tk_status); 257 -task->tk_status);
259} 258}
260 259
260void nlmsvc_release_call(struct nlm_rqst *call)
261{
262 if (!atomic_dec_and_test(&call->a_count))
263 return;
264 nlmsvc_release_host(call->a_host);
265 kfree(call);
266}
267
261static void nlmsvc_callback_release(void *data) 268static void nlmsvc_callback_release(void *data)
262{ 269{
263 lock_kernel(); 270 nlmsvc_release_call(data);
264 nlm_release_call(data);
265 unlock_kernel();
266} 271}
267 272
268static const struct rpc_call_ops nlmsvc_callback_ops = { 273static const struct rpc_call_ops nlmsvc_callback_ops = {
@@ -294,7 +299,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
294 299
295 stat = func(rqstp, argp, &call->a_res); 300 stat = func(rqstp, argp, &call->a_res);
296 if (stat != 0) { 301 if (stat != 0) {
297 nlm_release_call(call); 302 nlmsvc_release_call(call);
298 return stat; 303 return stat;
299 } 304 }
300 305
@@ -369,7 +374,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
369 resp->status = cast_status(nlmsvc_share_file(host, file, argp)); 374 resp->status = cast_status(nlmsvc_share_file(host, file, argp));
370 375
371 dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); 376 dprintk("lockd: SHARE status %d\n", ntohl(resp->status));
372 nlm_release_host(host); 377 nlmsvc_release_host(host);
373 nlm_release_file(file); 378 nlm_release_file(file);
374 return rpc_success; 379 return rpc_success;
375} 380}
@@ -402,7 +407,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
402 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); 407 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
403 408
404 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); 409 dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status));
405 nlm_release_host(host); 410 nlmsvc_release_host(host);
406 nlm_release_file(file); 411 nlm_release_file(file);
407 return rpc_success; 412 return rpc_success;
408} 413}
@@ -434,7 +439,7 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
434 return rpc_success; 439 return rpc_success;
435 440
436 nlmsvc_free_host_resources(host); 441 nlmsvc_free_host_resources(host);
437 nlm_release_host(host); 442 nlmsvc_release_host(host);
438 return rpc_success; 443 return rpc_success;
439} 444}
440 445
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d0ef94cfb3da..1ca0679c80bf 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
170 170
171again: 171again:
172 file->f_locks = 0; 172 file->f_locks = 0;
173 lock_flocks(); /* protects i_flock list */
173 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 174 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
174 if (fl->fl_lmops != &nlmsvc_lock_operations) 175 if (fl->fl_lmops != &nlmsvc_lock_operations)
175 continue; 176 continue;
@@ -181,6 +182,7 @@ again:
181 if (match(lockhost, host)) { 182 if (match(lockhost, host)) {
182 struct file_lock lock = *fl; 183 struct file_lock lock = *fl;
183 184
185 unlock_flocks();
184 lock.fl_type = F_UNLCK; 186 lock.fl_type = F_UNLCK;
185 lock.fl_start = 0; 187 lock.fl_start = 0;
186 lock.fl_end = OFFSET_MAX; 188 lock.fl_end = OFFSET_MAX;
@@ -192,6 +194,7 @@ again:
192 goto again; 194 goto again;
193 } 195 }
194 } 196 }
197 unlock_flocks();
195 198
196 return 0; 199 return 0;
197} 200}
@@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file)
226 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) 229 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
227 return 1; 230 return 1;
228 231
232 lock_flocks();
229 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 233 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
230 if (fl->fl_lmops == &nlmsvc_lock_operations) 234 if (fl->fl_lmops == &nlmsvc_lock_operations) {
235 unlock_flocks();
231 return 1; 236 return 1;
237 }
232 } 238 }
239 unlock_flocks();
233 file->f_locks = 0; 240 file->f_locks = 0;
234 return 0; 241 return 0;
235} 242}
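The svcsubs.c hunks above show the price of trading the BKL for a real lock: the i_flock walk is now bracketed by lock_flocks()/unlock_flocks(), the lock is dropped before the blocking unlock operation, and the walk restarts from scratch ("goto again") because the list may have mutated while it was unlocked. A self-contained userspace sketch of that drop-and-rescan idiom, with a pthread mutex standing in for the spinlock and hypothetical names throughout:

#include <pthread.h>

struct item { struct item *next; int needs_work; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *items;

static void blocking_work(struct item *it)
{
	it->needs_work = 0;	/* stand-in for work that may sleep */
}

static void process_all(void)
{
restart:
	pthread_mutex_lock(&list_lock);
	for (struct item *it = items; it; it = it->next) {
		if (!it->needs_work)
			continue;
		pthread_mutex_unlock(&list_lock);	/* never block under the lock */
		blocking_work(it);
		goto restart;	/* list may have changed meanwhile: rescan */
	}
	pthread_mutex_unlock(&list_lock);
}

Rescanning is O(n) per dropped lock, but it is the simplest correct answer when the traversal cannot hold the lock across a blocking operation.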
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index b583ab0a4cbb..964666c68a86 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -149,37 +149,6 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
149} 149}
150 150
151/* 151/*
152 * Encode a lock as part of an NLM call
153 */
154static __be32 *
155nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
156{
157 struct file_lock *fl = &lock->fl;
158 __s32 start, len;
159
160 if (!(p = xdr_encode_string(p, lock->caller))
161 || !(p = nlm_encode_fh(p, &lock->fh))
162 || !(p = nlm_encode_oh(p, &lock->oh)))
163 return NULL;
164
165 if (fl->fl_start > NLM_OFFSET_MAX
166 || (fl->fl_end > NLM_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
167 return NULL;
168
169 start = loff_t_to_s32(fl->fl_start);
170 if (fl->fl_end == OFFSET_MAX)
171 len = 0;
172 else
173 len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
174
175 *p++ = htonl(lock->svid);
176 *p++ = htonl(start);
177 *p++ = htonl(len);
178
179 return p;
180}
181
182/*
183 * Encode result of a TEST/TEST_MSG call 152 * Encode result of a TEST/TEST_MSG call
184 */ 153 */
185static __be32 * 154static __be32 *
@@ -372,259 +341,3 @@ nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
372{ 341{
373 return xdr_ressize_check(rqstp, p); 342 return xdr_ressize_check(rqstp, p);
374} 343}
375
376/*
377 * Now, the client side XDR functions
378 */
379#ifdef NLMCLNT_SUPPORT_SHARES
380static int
381nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
382{
383 return 0;
384}
385#endif
386
387static int
388nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
389{
390 struct nlm_lock *lock = &argp->lock;
391
392 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
393 return -EIO;
394 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
395 if (!(p = nlm_encode_lock(p, lock)))
396 return -EIO;
397 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
398 return 0;
399}
400
401static int
402nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
403{
404 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
405 return -EIO;
406 resp->status = *p++;
407 if (resp->status == nlm_lck_denied) {
408 struct file_lock *fl = &resp->lock.fl;
409 u32 excl;
410 s32 start, len, end;
411
412 memset(&resp->lock, 0, sizeof(resp->lock));
413 locks_init_lock(fl);
414 excl = ntohl(*p++);
415 resp->lock.svid = ntohl(*p++);
416 fl->fl_pid = (pid_t)resp->lock.svid;
417 if (!(p = nlm_decode_oh(p, &resp->lock.oh)))
418 return -EIO;
419
420 fl->fl_flags = FL_POSIX;
421 fl->fl_type = excl? F_WRLCK : F_RDLCK;
422 start = ntohl(*p++);
423 len = ntohl(*p++);
424 end = start + len - 1;
425
426 fl->fl_start = s32_to_loff_t(start);
427 if (len == 0 || end < 0)
428 fl->fl_end = OFFSET_MAX;
429 else
430 fl->fl_end = s32_to_loff_t(end);
431 }
432 return 0;
433}
434
435
436static int
437nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
438{
439 struct nlm_lock *lock = &argp->lock;
440
441 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
442 return -EIO;
443 *p++ = argp->block? xdr_one : xdr_zero;
444 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
445 if (!(p = nlm_encode_lock(p, lock)))
446 return -EIO;
447 *p++ = argp->reclaim? xdr_one : xdr_zero;
448 *p++ = htonl(argp->state);
449 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
450 return 0;
451}
452
453static int
454nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
455{
456 struct nlm_lock *lock = &argp->lock;
457
458 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
459 return -EIO;
460 *p++ = argp->block? xdr_one : xdr_zero;
461 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
462 if (!(p = nlm_encode_lock(p, lock)))
463 return -EIO;
464 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
465 return 0;
466}
467
468static int
469nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
470{
471 struct nlm_lock *lock = &argp->lock;
472
473 if (!(p = nlm_encode_cookie(p, &argp->cookie)))
474 return -EIO;
475 if (!(p = nlm_encode_lock(p, lock)))
476 return -EIO;
477 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
478 return 0;
479}
480
481static int
482nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
483{
484 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
485 return -EIO;
486 *p++ = resp->status;
487 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
488 return 0;
489}
490
491static int
492nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
493{
494 if (!(p = nlm_encode_testres(p, resp)))
495 return -EIO;
496 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
497 return 0;
498}
499
500static int
501nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
502{
503 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
504 return -EIO;
505 resp->status = *p++;
506 return 0;
507}
508
509#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
510# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
511#endif
512
513/*
514 * Buffer requirements for NLM
515 */
516#define NLM_void_sz 0
517#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
518#define NLM_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
519#define NLM_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
520#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE)
521#define NLM_lock_sz 3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
522#define NLM_holder_sz 4+NLM_owner_sz
523
524#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz
525#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz
526#define NLM_cancargs_sz NLM_cookie_sz+2+NLM_lock_sz
527#define NLM_unlockargs_sz NLM_cookie_sz+NLM_lock_sz
528
529#define NLM_testres_sz NLM_cookie_sz+1+NLM_holder_sz
530#define NLM_res_sz NLM_cookie_sz+1
531#define NLM_norep_sz 0
532
533/*
534 * For NLM, a void procedure really returns nothing
535 */
536#define nlmclt_decode_norep NULL
537
538#define PROC(proc, argtype, restype) \
539[NLMPROC_##proc] = { \
540 .p_proc = NLMPROC_##proc, \
541 .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \
542 .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \
543 .p_arglen = NLM_##argtype##_sz, \
544 .p_replen = NLM_##restype##_sz, \
545 .p_statidx = NLMPROC_##proc, \
546 .p_name = #proc, \
547 }
548
549static struct rpc_procinfo nlm_procedures[] = {
550 PROC(TEST, testargs, testres),
551 PROC(LOCK, lockargs, res),
552 PROC(CANCEL, cancargs, res),
553 PROC(UNLOCK, unlockargs, res),
554 PROC(GRANTED, testargs, res),
555 PROC(TEST_MSG, testargs, norep),
556 PROC(LOCK_MSG, lockargs, norep),
557 PROC(CANCEL_MSG, cancargs, norep),
558 PROC(UNLOCK_MSG, unlockargs, norep),
559 PROC(GRANTED_MSG, testargs, norep),
560 PROC(TEST_RES, testres, norep),
561 PROC(LOCK_RES, res, norep),
562 PROC(CANCEL_RES, res, norep),
563 PROC(UNLOCK_RES, res, norep),
564 PROC(GRANTED_RES, res, norep),
565#ifdef NLMCLNT_SUPPORT_SHARES
566 PROC(SHARE, shareargs, shareres),
567 PROC(UNSHARE, shareargs, shareres),
568 PROC(NM_LOCK, lockargs, res),
569 PROC(FREE_ALL, notify, void),
570#endif
571};
572
573static struct rpc_version nlm_version1 = {
574 .number = 1,
575 .nrprocs = 16,
576 .procs = nlm_procedures,
577};
578
579static struct rpc_version nlm_version3 = {
580 .number = 3,
581 .nrprocs = 24,
582 .procs = nlm_procedures,
583};
584
585static struct rpc_version * nlm_versions[] = {
586 [1] = &nlm_version1,
587 [3] = &nlm_version3,
588#ifdef CONFIG_LOCKD_V4
589 [4] = &nlm_version4,
590#endif
591};
592
593static struct rpc_stat nlm_stats;
594
595struct rpc_program nlm_program = {
596 .name = "lockd",
597 .number = NLM_PROGRAM,
598 .nrvers = ARRAY_SIZE(nlm_versions),
599 .version = nlm_versions,
600 .stats = &nlm_stats,
601};
602
603#ifdef RPC_DEBUG
604const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
605{
606 /*
607 * We can get away with a static buffer because we're only
608 * called with BKL held.
609 */
610 static char buf[2*NLM_MAXCOOKIELEN+1];
611 unsigned int i, len = sizeof(buf);
612 char *p = buf;
613
614 len--; /* allow for trailing \0 */
615 if (len < 3)
616 return "???";
617 for (i = 0 ; i < cookie->len ; i++) {
618 if (len < 2) {
619 strcpy(p-3, "...");
620 break;
621 }
622 sprintf(p, "%02x", cookie->data[i]);
623 p += 2;
624 len -= 2;
625 }
626 *p = '\0';
627
628 return buf;
629}
630#endif
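The client-side encoders removed above (presumably superseded by the reworked client XDR routines elsewhere in this series; the server-side decoders stay) also happen to document the NLM v1/v3 wire format for a byte range: a 32-bit offset plus a 32-bit length, where length 0 means "to end of file" and fl_end == OFFSET_MAX is the in-kernel marker for that case. A standalone sketch of just that range conversion, mirroring the deleted nlm_encode_lock() (the real code additionally rejects ranges above NLM_OFFSET_MAX, omitted here):

#include <stdint.h>

#define OFFSET_MAX INT64_MAX	/* "lock to end of file" sentinel, as in the kernel */

/* Convert an inclusive [fl_start, fl_end] range to NLM's (offset, length). */
static void range_to_nlm(int64_t fl_start, int64_t fl_end,
			 uint32_t *offset, uint32_t *length)
{
	*offset = (uint32_t)fl_start;
	if (fl_end == OFFSET_MAX)
		*length = 0;	/* 0 == whole rest of the file */
	else
		*length = (uint32_t)(fl_end - fl_start + 1);
}

nlm4_encode_lock() in xdr4.c, deleted below, was the 64-bit twin of this, emitting start and length with xdr_encode_hyper().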
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index ad9dbbc9145d..dfa4789cd460 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -93,15 +93,6 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
93 return p + XDR_QUADLEN(f->size); 93 return p + XDR_QUADLEN(f->size);
94} 94}
95 95
96static __be32 *
97nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
98{
99 *p++ = htonl(f->size);
100 if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
101 memcpy(p, f->data, f->size);
102 return p + XDR_QUADLEN(f->size);
103}
104
105/* 96/*
106 * Encode and decode owner handle 97 * Encode and decode owner handle
107 */ 98 */
@@ -112,12 +103,6 @@ nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
112} 103}
113 104
114static __be32 * 105static __be32 *
115nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
116{
117 return xdr_encode_netobj(p, oh);
118}
119
120static __be32 *
121nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) 106nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
122{ 107{
123 struct file_lock *fl = &lock->fl; 108 struct file_lock *fl = &lock->fl;
@@ -150,38 +135,6 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
150} 135}
151 136
152/* 137/*
153 * Encode a lock as part of an NLM call
154 */
155static __be32 *
156nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
157{
158 struct file_lock *fl = &lock->fl;
159 __s64 start, len;
160
161 if (!(p = xdr_encode_string(p, lock->caller))
162 || !(p = nlm4_encode_fh(p, &lock->fh))
163 || !(p = nlm4_encode_oh(p, &lock->oh)))
164 return NULL;
165
166 if (fl->fl_start > NLM4_OFFSET_MAX
167 || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
168 return NULL;
169
170 *p++ = htonl(lock->svid);
171
172 start = loff_t_to_s64(fl->fl_start);
173 if (fl->fl_end == OFFSET_MAX)
174 len = 0;
175 else
176 len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
177
178 p = xdr_encode_hyper(p, start);
179 p = xdr_encode_hyper(p, len);
180
181 return p;
182}
183
184/*
185 * Encode result of a TEST/TEST_MSG call 138 * Encode result of a TEST/TEST_MSG call
186 */ 139 */
187static __be32 * 140static __be32 *
@@ -379,211 +332,3 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
379{ 332{
380 return xdr_ressize_check(rqstp, p); 333 return xdr_ressize_check(rqstp, p);
381} 334}
382
383/*
384 * Now, the client side XDR functions
385 */
386#ifdef NLMCLNT_SUPPORT_SHARES
387static int
388nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
389{
390 return 0;
391}
392#endif
393
394static int
395nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
396{
397 struct nlm_lock *lock = &argp->lock;
398
399 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
400 return -EIO;
401 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
402 if (!(p = nlm4_encode_lock(p, lock)))
403 return -EIO;
404 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
405 return 0;
406}
407
408static int
409nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
410{
411 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
412 return -EIO;
413 resp->status = *p++;
414 if (resp->status == nlm_lck_denied) {
415 struct file_lock *fl = &resp->lock.fl;
416 u32 excl;
417 __u64 start, len;
418 __s64 end;
419
420 memset(&resp->lock, 0, sizeof(resp->lock));
421 locks_init_lock(fl);
422 excl = ntohl(*p++);
423 resp->lock.svid = ntohl(*p++);
424 fl->fl_pid = (pid_t)resp->lock.svid;
425 if (!(p = nlm4_decode_oh(p, &resp->lock.oh)))
426 return -EIO;
427
428 fl->fl_flags = FL_POSIX;
429 fl->fl_type = excl? F_WRLCK : F_RDLCK;
430 p = xdr_decode_hyper(p, &start);
431 p = xdr_decode_hyper(p, &len);
432 end = start + len - 1;
433
434 fl->fl_start = s64_to_loff_t(start);
435 if (len == 0 || end < 0)
436 fl->fl_end = OFFSET_MAX;
437 else
438 fl->fl_end = s64_to_loff_t(end);
439 }
440 return 0;
441}
442
443
444static int
445nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
446{
447 struct nlm_lock *lock = &argp->lock;
448
449 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
450 return -EIO;
451 *p++ = argp->block? xdr_one : xdr_zero;
452 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
453 if (!(p = nlm4_encode_lock(p, lock)))
454 return -EIO;
455 *p++ = argp->reclaim? xdr_one : xdr_zero;
456 *p++ = htonl(argp->state);
457 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
458 return 0;
459}
460
461static int
462nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
463{
464 struct nlm_lock *lock = &argp->lock;
465
466 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
467 return -EIO;
468 *p++ = argp->block? xdr_one : xdr_zero;
469 *p++ = (lock->fl.fl_type == F_WRLCK)? xdr_one : xdr_zero;
470 if (!(p = nlm4_encode_lock(p, lock)))
471 return -EIO;
472 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
473 return 0;
474}
475
476static int
477nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
478{
479 struct nlm_lock *lock = &argp->lock;
480
481 if (!(p = nlm4_encode_cookie(p, &argp->cookie)))
482 return -EIO;
483 if (!(p = nlm4_encode_lock(p, lock)))
484 return -EIO;
485 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
486 return 0;
487}
488
489static int
490nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
491{
492 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
493 return -EIO;
494 *p++ = resp->status;
495 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
496 return 0;
497}
498
499static int
500nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
501{
502 if (!(p = nlm4_encode_testres(p, resp)))
503 return -EIO;
504 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
505 return 0;
506}
507
508static int
509nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
510{
511 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
512 return -EIO;
513 resp->status = *p++;
514 return 0;
515}
516
517#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
518# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
519#endif
520
521#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
522# error "NLM host name cannot be larger than NLM's maximum string length!"
523#endif
524
525/*
526 * Buffer requirements for NLM
527 */
528#define NLM4_void_sz 0
529#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
530#define NLM4_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
531#define NLM4_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
532#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE)
533#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
534#define NLM4_holder_sz 6+NLM4_owner_sz
535
536#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz
537#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz
538#define NLM4_cancargs_sz NLM4_cookie_sz+2+NLM4_lock_sz
539#define NLM4_unlockargs_sz NLM4_cookie_sz+NLM4_lock_sz
540
541#define NLM4_testres_sz NLM4_cookie_sz+1+NLM4_holder_sz
542#define NLM4_res_sz NLM4_cookie_sz+1
543#define NLM4_norep_sz 0
544
545/*
546 * For NLM, a void procedure really returns nothing
547 */
548#define nlm4clt_decode_norep NULL
549
550#define PROC(proc, argtype, restype) \
551[NLMPROC_##proc] = { \
552 .p_proc = NLMPROC_##proc, \
553 .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \
554 .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \
555 .p_arglen = NLM4_##argtype##_sz, \
556 .p_replen = NLM4_##restype##_sz, \
557 .p_statidx = NLMPROC_##proc, \
558 .p_name = #proc, \
559 }
560
561static struct rpc_procinfo nlm4_procedures[] = {
562 PROC(TEST, testargs, testres),
563 PROC(LOCK, lockargs, res),
564 PROC(CANCEL, cancargs, res),
565 PROC(UNLOCK, unlockargs, res),
566 PROC(GRANTED, testargs, res),
567 PROC(TEST_MSG, testargs, norep),
568 PROC(LOCK_MSG, lockargs, norep),
569 PROC(CANCEL_MSG, cancargs, norep),
570 PROC(UNLOCK_MSG, unlockargs, norep),
571 PROC(GRANTED_MSG, testargs, norep),
572 PROC(TEST_RES, testres, norep),
573 PROC(LOCK_RES, res, norep),
574 PROC(CANCEL_RES, res, norep),
575 PROC(UNLOCK_RES, res, norep),
576 PROC(GRANTED_RES, res, norep),
577#ifdef NLMCLNT_SUPPORT_SHARES
578 PROC(SHARE, shareargs, shareres),
579 PROC(UNSHARE, shareargs, shareres),
580 PROC(NM_LOCK, lockargs, res),
581 PROC(FREE_ALL, notify, void),
582#endif
583};
584
585struct rpc_version nlm_version4 = {
586 .number = 4,
587 .nrprocs = 24,
588 .procs = nlm4_procedures,
589};
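The NLM4_*_sz macros kept above size RPC buffers in 32-bit XDR words, not bytes: XDR_QUADLEN() rounds a byte count up to whole words, and every variable-length opaque costs one extra word for its on-wire length field. A quick standalone check of that arithmetic, assuming the kernel's definition of the macro:

#include <assert.h>

#define XDR_QUADLEN(l)	(((l) + 3) >> 2)	/* bytes -> 32-bit words, rounded up */

int main(void)
{
	/* a 5-byte opaque occupies 1 length word + 2 padded data words */
	assert(1 + XDR_QUADLEN(5) == 3);
	/* NLM4 offsets are 64-bit "hypers": two words each on the wire */
	assert(XDR_QUADLEN(8) == 2);
	return 0;
}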
diff --git a/fs/locks.c b/fs/locks.c
index ab24d49fc048..0f3998291f78 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -122,7 +122,6 @@
122#include <linux/module.h> 122#include <linux/module.h>
123#include <linux/security.h> 123#include <linux/security.h>
124#include <linux/slab.h> 124#include <linux/slab.h>
125#include <linux/smp_lock.h>
126#include <linux/syscalls.h> 125#include <linux/syscalls.h>
127#include <linux/time.h> 126#include <linux/time.h>
128#include <linux/rcupdate.h> 127#include <linux/rcupdate.h>
@@ -142,14 +141,32 @@ int lease_break_time = 45;
142 141
143static LIST_HEAD(file_lock_list); 142static LIST_HEAD(file_lock_list);
144static LIST_HEAD(blocked_list); 143static LIST_HEAD(blocked_list);
144static DEFINE_SPINLOCK(file_lock_lock);
145
146/*
147 * Protects the two list heads above, plus the inode->i_flock list
148 * FIXME: should use a spinlock, once lockd and ceph are ready.
149 */
150void lock_flocks(void)
151{
152 spin_lock(&file_lock_lock);
153}
154EXPORT_SYMBOL_GPL(lock_flocks);
155
156void unlock_flocks(void)
157{
158 spin_unlock(&file_lock_lock);
159}
160EXPORT_SYMBOL_GPL(unlock_flocks);
145 161
146static struct kmem_cache *filelock_cache __read_mostly; 162static struct kmem_cache *filelock_cache __read_mostly;
147 163
148/* Allocate an empty lock structure. */ 164/* Allocate an empty lock structure. */
149static struct file_lock *locks_alloc_lock(void) 165struct file_lock *locks_alloc_lock(void)
150{ 166{
151 return kmem_cache_alloc(filelock_cache, GFP_KERNEL); 167 return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
152} 168}
169EXPORT_SYMBOL_GPL(locks_alloc_lock);
153 170
154void locks_release_private(struct file_lock *fl) 171void locks_release_private(struct file_lock *fl)
155{ 172{
@@ -168,7 +185,7 @@ void locks_release_private(struct file_lock *fl)
168EXPORT_SYMBOL_GPL(locks_release_private); 185EXPORT_SYMBOL_GPL(locks_release_private);
169 186
170/* Free a lock which is not in use. */ 187/* Free a lock which is not in use. */
171static void locks_free_lock(struct file_lock *fl) 188void locks_free_lock(struct file_lock *fl)
172{ 189{
173 BUG_ON(waitqueue_active(&fl->fl_wait)); 190 BUG_ON(waitqueue_active(&fl->fl_wait));
174 BUG_ON(!list_empty(&fl->fl_block)); 191 BUG_ON(!list_empty(&fl->fl_block));
@@ -177,6 +194,7 @@ static void locks_free_lock(struct file_lock *fl)
177 locks_release_private(fl); 194 locks_release_private(fl);
178 kmem_cache_free(filelock_cache, fl); 195 kmem_cache_free(filelock_cache, fl);
179} 196}
197EXPORT_SYMBOL(locks_free_lock);
180 198
181void locks_init_lock(struct file_lock *fl) 199void locks_init_lock(struct file_lock *fl)
182{ 200{
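Alongside the new file_lock_lock, the hunks above un-static locks_alloc_lock() and locks_free_lock() and export them, so lock managers and filesystems built as modules can allocate struct file_lock themselves. A sketch of what a caller might look like, hypothetical module code with error handling trimmed:

#include <linux/fs.h>

static int example_take_whole_file_lock(struct file *filp)
{
	struct file_lock *fl = locks_alloc_lock();	/* now exported */

	if (!fl)
		return -ENOMEM;
	locks_init_lock(fl);
	fl->fl_file  = filp;
	fl->fl_flags = FL_FLOCK;
	fl->fl_type  = F_RDLCK;
	fl->fl_start = 0;
	fl->fl_end   = OFFSET_MAX;
	/* ... hand fl to the VFS here ... */
	locks_free_lock(fl);
	return 0;
}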
@@ -216,11 +234,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
216 fl->fl_ops->fl_copy_lock(new, fl); 234 fl->fl_ops->fl_copy_lock(new, fl);
217 new->fl_ops = fl->fl_ops; 235 new->fl_ops = fl->fl_ops;
218 } 236 }
219 if (fl->fl_lmops) { 237 if (fl->fl_lmops)
220 if (fl->fl_lmops->fl_copy_lock)
221 fl->fl_lmops->fl_copy_lock(new, fl);
222 new->fl_lmops = fl->fl_lmops; 238 new->fl_lmops = fl->fl_lmops;
223 }
224} 239}
225 240
226/* 241/*
@@ -429,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl)
429 fl->fl_file->f_owner.signum = 0; 444 fl->fl_file->f_owner.signum = 0;
430} 445}
431 446
432static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
433{
434 return fl->fl_file == try->fl_file;
435}
436
437static const struct lock_manager_operations lease_manager_ops = { 447static const struct lock_manager_operations lease_manager_ops = {
438 .fl_break = lease_break_callback, 448 .fl_break = lease_break_callback,
439 .fl_release_private = lease_release_private_callback, 449 .fl_release_private = lease_release_private_callback,
440 .fl_mylease = lease_mylease_callback,
441 .fl_change = lease_modify, 450 .fl_change = lease_modify,
442}; 451};
443 452
@@ -511,9 +520,9 @@ static void __locks_delete_block(struct file_lock *waiter)
511 */ 520 */
512static void locks_delete_block(struct file_lock *waiter) 521static void locks_delete_block(struct file_lock *waiter)
513{ 522{
514 lock_kernel(); 523 lock_flocks();
515 __locks_delete_block(waiter); 524 __locks_delete_block(waiter);
516 unlock_kernel(); 525 unlock_flocks();
517} 526}
518 527
519/* Insert waiter into blocker's block list. 528/* Insert waiter into blocker's block list.
@@ -644,7 +653,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
644{ 653{
645 struct file_lock *cfl; 654 struct file_lock *cfl;
646 655
647 lock_kernel(); 656 lock_flocks();
648 for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { 657 for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
649 if (!IS_POSIX(cfl)) 658 if (!IS_POSIX(cfl))
650 continue; 659 continue;
@@ -657,7 +666,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
657 fl->fl_pid = pid_vnr(cfl->fl_nspid); 666 fl->fl_pid = pid_vnr(cfl->fl_nspid);
658 } else 667 } else
659 fl->fl_type = F_UNLCK; 668 fl->fl_type = F_UNLCK;
660 unlock_kernel(); 669 unlock_flocks();
661 return; 670 return;
662} 671}
663EXPORT_SYMBOL(posix_test_lock); 672EXPORT_SYMBOL(posix_test_lock);
@@ -730,18 +739,16 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
730 int error = 0; 739 int error = 0;
731 int found = 0; 740 int found = 0;
732 741
733 lock_kernel(); 742 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
734 if (request->fl_flags & FL_ACCESS)
735 goto find_conflict;
736
737 if (request->fl_type != F_UNLCK) {
738 error = -ENOMEM;
739 new_fl = locks_alloc_lock(); 743 new_fl = locks_alloc_lock();
740 if (new_fl == NULL) 744 if (!new_fl)
741 goto out; 745 return -ENOMEM;
742 error = 0;
743 } 746 }
744 747
748 lock_flocks();
749 if (request->fl_flags & FL_ACCESS)
750 goto find_conflict;
751
745 for_each_lock(inode, before) { 752 for_each_lock(inode, before) {
746 struct file_lock *fl = *before; 753 struct file_lock *fl = *before;
747 if (IS_POSIX(fl)) 754 if (IS_POSIX(fl))
@@ -767,8 +774,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
767 * If a higher-priority process was blocked on the old file lock, 774 * If a higher-priority process was blocked on the old file lock,
768 * give it the opportunity to lock the file. 775 * give it the opportunity to lock the file.
769 */ 776 */
770 if (found) 777 if (found) {
778 unlock_flocks();
771 cond_resched(); 779 cond_resched();
780 lock_flocks();
781 }
772 782
773find_conflict: 783find_conflict:
774 for_each_lock(inode, before) { 784 for_each_lock(inode, before) {
@@ -794,7 +804,7 @@ find_conflict:
794 error = 0; 804 error = 0;
795 805
796out: 806out:
797 unlock_kernel(); 807 unlock_flocks();
798 if (new_fl) 808 if (new_fl)
799 locks_free_lock(new_fl); 809 locks_free_lock(new_fl);
800 return error; 810 return error;
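This flock_lock_file() hunk is the canonical consequence of swapping the (sleepable) BKL for a spinlock: the GFP_KERNEL allocation inside locks_alloc_lock() may sleep, so it must move in front of lock_flocks(), and cond_resched() now needs an explicit unlock/relock bracket. The same allocate-outside-the-lock rule in a self-contained userspace sketch, with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static int add_node(void)
{
	/* Allocate before taking the lock: malloc (like GFP_KERNEL in
	 * the kernel) may block, and blocking while holding a spinlock
	 * is forbidden. */
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return -1;
	pthread_mutex_lock(&lock);
	n->next = head;		/* critical section: list update only */
	head = n;
	pthread_mutex_unlock(&lock);
	return 0;
}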
@@ -823,7 +833,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
823 new_fl2 = locks_alloc_lock(); 833 new_fl2 = locks_alloc_lock();
824 } 834 }
825 835
826 lock_kernel(); 836 lock_flocks();
827 if (request->fl_type != F_UNLCK) { 837 if (request->fl_type != F_UNLCK) {
828 for_each_lock(inode, before) { 838 for_each_lock(inode, before) {
829 fl = *before; 839 fl = *before;
@@ -991,7 +1001,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
991 locks_wake_up_blocks(left); 1001 locks_wake_up_blocks(left);
992 } 1002 }
993 out: 1003 out:
994 unlock_kernel(); 1004 unlock_flocks();
995 /* 1005 /*
996 * Free any unused locks. 1006 * Free any unused locks.
997 */ 1007 */
@@ -1066,14 +1076,14 @@ int locks_mandatory_locked(struct inode *inode)
1066 /* 1076 /*
1067 * Search the lock list for this inode for any POSIX locks. 1077 * Search the lock list for this inode for any POSIX locks.
1068 */ 1078 */
1069 lock_kernel(); 1079 lock_flocks();
1070 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1080 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1071 if (!IS_POSIX(fl)) 1081 if (!IS_POSIX(fl))
1072 continue; 1082 continue;
1073 if (fl->fl_owner != owner) 1083 if (fl->fl_owner != owner)
1074 break; 1084 break;
1075 } 1085 }
1076 unlock_kernel(); 1086 unlock_flocks();
1077 return fl ? -EAGAIN : 0; 1087 return fl ? -EAGAIN : 0;
1078} 1088}
1079 1089
@@ -1186,7 +1196,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
1186 1196
1187 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); 1197 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
1188 1198
1189 lock_kernel(); 1199 lock_flocks();
1190 1200
1191 time_out_leases(inode); 1201 time_out_leases(inode);
1192 1202
@@ -1247,8 +1257,10 @@ restart:
1247 break_time++; 1257 break_time++;
1248 } 1258 }
1249 locks_insert_block(flock, new_fl); 1259 locks_insert_block(flock, new_fl);
1260 unlock_flocks();
1250 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1261 error = wait_event_interruptible_timeout(new_fl->fl_wait,
1251 !new_fl->fl_next, break_time); 1262 !new_fl->fl_next, break_time);
1263 lock_flocks();
1252 __locks_delete_block(new_fl); 1264 __locks_delete_block(new_fl);
1253 if (error >= 0) { 1265 if (error >= 0) {
1254 if (error == 0) 1266 if (error == 0)
@@ -1263,7 +1275,7 @@ restart:
1263 } 1275 }
1264 1276
1265out: 1277out:
1266 unlock_kernel(); 1278 unlock_flocks();
1267 if (!IS_ERR(new_fl)) 1279 if (!IS_ERR(new_fl))
1268 locks_free_lock(new_fl); 1280 locks_free_lock(new_fl);
1269 return error; 1281 return error;
@@ -1319,7 +1331,7 @@ int fcntl_getlease(struct file *filp)
1319 struct file_lock *fl; 1331 struct file_lock *fl;
1320 int type = F_UNLCK; 1332 int type = F_UNLCK;
1321 1333
1322 lock_kernel(); 1334 lock_flocks();
1323 time_out_leases(filp->f_path.dentry->d_inode); 1335 time_out_leases(filp->f_path.dentry->d_inode);
1324 for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); 1336 for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
1325 fl = fl->fl_next) { 1337 fl = fl->fl_next) {
@@ -1328,7 +1340,7 @@ int fcntl_getlease(struct file *filp)
1328 break; 1340 break;
1329 } 1341 }
1330 } 1342 }
1331 unlock_kernel(); 1343 unlock_flocks();
1332 return type; 1344 return type;
1333} 1345}
1334 1346
@@ -1341,41 +1353,37 @@ int fcntl_getlease(struct file *filp)
1341 * The (input) flp->fl_lmops->fl_break function is required 1353 * The (input) flp->fl_lmops->fl_break function is required
1342 * by break_lease(). 1354 * by break_lease().
1343 * 1355 *
1344 * Called with kernel lock held. 1356 * Called with file_lock_lock held.
1345 */ 1357 */
1346int generic_setlease(struct file *filp, long arg, struct file_lock **flp) 1358int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1347{ 1359{
1348 struct file_lock *fl, **before, **my_before = NULL, *lease; 1360 struct file_lock *fl, **before, **my_before = NULL, *lease;
1349 struct file_lock *new_fl = NULL;
1350 struct dentry *dentry = filp->f_path.dentry; 1361 struct dentry *dentry = filp->f_path.dentry;
1351 struct inode *inode = dentry->d_inode; 1362 struct inode *inode = dentry->d_inode;
1352 int error, rdlease_count = 0, wrlease_count = 0; 1363 int error, rdlease_count = 0, wrlease_count = 0;
1353 1364
1365 lease = *flp;
1366
1367 error = -EACCES;
1354 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) 1368 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
1355 return -EACCES; 1369 goto out;
1370 error = -EINVAL;
1356 if (!S_ISREG(inode->i_mode)) 1371 if (!S_ISREG(inode->i_mode))
1357 return -EINVAL; 1372 goto out;
1358 error = security_file_lock(filp, arg); 1373 error = security_file_lock(filp, arg);
1359 if (error) 1374 if (error)
1360 return error; 1375 goto out;
1361 1376
1362 time_out_leases(inode); 1377 time_out_leases(inode);
1363 1378
1364 BUG_ON(!(*flp)->fl_lmops->fl_break); 1379 BUG_ON(!(*flp)->fl_lmops->fl_break);
1365 1380
1366 lease = *flp;
1367
1368 if (arg != F_UNLCK) { 1381 if (arg != F_UNLCK) {
1369 error = -ENOMEM;
1370 new_fl = locks_alloc_lock();
1371 if (new_fl == NULL)
1372 goto out;
1373
1374 error = -EAGAIN; 1382 error = -EAGAIN;
1375 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1383 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1376 goto out; 1384 goto out;
1377 if ((arg == F_WRLCK) 1385 if ((arg == F_WRLCK)
1378 && ((atomic_read(&dentry->d_count) > 1) 1386 && ((dentry->d_count > 1)
1379 || (atomic_read(&inode->i_count) > 1))) 1387 || (atomic_read(&inode->i_count) > 1)))
1380 goto out; 1388 goto out;
1381 } 1389 }
@@ -1391,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1391 for (before = &inode->i_flock; 1399 for (before = &inode->i_flock;
1392 ((fl = *before) != NULL) && IS_LEASE(fl); 1400 ((fl = *before) != NULL) && IS_LEASE(fl);
1393 before = &fl->fl_next) { 1401 before = &fl->fl_next) {
1394 if (lease->fl_lmops->fl_mylease(fl, lease)) 1402 if (fl->fl_file == filp)
1395 my_before = before; 1403 my_before = before;
1396 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) 1404 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
1397 /* 1405 /*
@@ -1410,12 +1418,12 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1410 goto out; 1418 goto out;
1411 1419
1412 if (my_before != NULL) { 1420 if (my_before != NULL) {
1413 *flp = *my_before;
1414 error = lease->fl_lmops->fl_change(my_before, arg); 1421 error = lease->fl_lmops->fl_change(my_before, arg);
1422 if (!error)
1423 *flp = *my_before;
1415 goto out; 1424 goto out;
1416 } 1425 }
1417 1426
1418 error = 0;
1419 if (arg == F_UNLCK) 1427 if (arg == F_UNLCK)
1420 goto out; 1428 goto out;
1421 1429
@@ -1423,20 +1431,23 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1423 if (!leases_enable) 1431 if (!leases_enable)
1424 goto out; 1432 goto out;
1425 1433
1426 locks_copy_lock(new_fl, lease); 1434 locks_insert_lock(before, lease);
1427 locks_insert_lock(before, new_fl);
1428
1429 *flp = new_fl;
1430 return 0; 1435 return 0;
1431 1436
1432out: 1437out:
1433 if (new_fl != NULL)
1434 locks_free_lock(new_fl);
1435 return error; 1438 return error;
1436} 1439}
1437EXPORT_SYMBOL(generic_setlease); 1440EXPORT_SYMBOL(generic_setlease);
1438 1441
1439 /** 1442static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
1443{
1444 if (filp->f_op && filp->f_op->setlease)
1445 return filp->f_op->setlease(filp, arg, lease);
1446 else
1447 return generic_setlease(filp, arg, lease);
1448}
1449
1450/**
1440 * vfs_setlease - sets a lease on an open file 1451 * vfs_setlease - sets a lease on an open file
1441 * @filp: file pointer 1452 * @filp: file pointer
1442 * @arg: type of lease to obtain 1453 * @arg: type of lease to obtain
@@ -1467,17 +1478,67 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
1467{ 1478{
1468 int error; 1479 int error;
1469 1480
1470 lock_kernel(); 1481 lock_flocks();
1471 if (filp->f_op && filp->f_op->setlease) 1482 error = __vfs_setlease(filp, arg, lease);
1472 error = filp->f_op->setlease(filp, arg, lease); 1483 unlock_flocks();
1473 else
1474 error = generic_setlease(filp, arg, lease);
1475 unlock_kernel();
1476 1484
1477 return error; 1485 return error;
1478} 1486}
1479EXPORT_SYMBOL_GPL(vfs_setlease); 1487EXPORT_SYMBOL_GPL(vfs_setlease);
1480 1488
1489static int do_fcntl_delete_lease(struct file *filp)
1490{
1491 struct file_lock fl, *flp = &fl;
1492
1493 lease_init(filp, F_UNLCK, flp);
1494
1495 return vfs_setlease(filp, F_UNLCK, &flp);
1496}
1497
1498static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1499{
1500 struct file_lock *fl, *ret;
1501 struct fasync_struct *new;
1502 int error;
1503
1504 fl = lease_alloc(filp, arg);
1505 if (IS_ERR(fl))
1506 return PTR_ERR(fl);
1507
1508 new = fasync_alloc();
1509 if (!new) {
1510 locks_free_lock(fl);
1511 return -ENOMEM;
1512 }
1513 ret = fl;
1514 lock_flocks();
1515 error = __vfs_setlease(filp, arg, &ret);
1516 if (error) {
1517 unlock_flocks();
1518 locks_free_lock(fl);
1519 goto out_free_fasync;
1520 }
1521 if (ret != fl)
1522 locks_free_lock(fl);
1523
1524 /*
1525 * fasync_insert_entry() returns the old entry if any.
1526 * If there was no old entry, then it used 'new' and
1527 * inserted it into the fasync list. Clear new so that
1528 * we don't release it here.
1529 */
1530 if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
1531 new = NULL;
1532
1533 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1534 unlock_flocks();
1535
1536out_free_fasync:
1537 if (new)
1538 fasync_free(new);
1539 return error;
1540}
1541
1481/** 1542/**
1482 * fcntl_setlease - sets a lease on an open file 1543 * fcntl_setlease - sets a lease on an open file
1483 * @fd: open file descriptor 1544 * @fd: open file descriptor
@@ -1490,34 +1551,9 @@ EXPORT_SYMBOL_GPL(vfs_setlease);
1490 */ 1551 */
1491int fcntl_setlease(unsigned int fd, struct file *filp, long arg) 1552int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1492{ 1553{
1493 struct file_lock fl, *flp = &fl; 1554 if (arg == F_UNLCK)
1494 struct inode *inode = filp->f_path.dentry->d_inode; 1555 return do_fcntl_delete_lease(filp);
1495 int error; 1556 return do_fcntl_add_lease(fd, filp, arg);
1496
1497 locks_init_lock(&fl);
1498 error = lease_init(filp, arg, &fl);
1499 if (error)
1500 return error;
1501
1502 lock_kernel();
1503
1504 error = vfs_setlease(filp, arg, &flp);
1505 if (error || arg == F_UNLCK)
1506 goto out_unlock;
1507
1508 error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
1509 if (error < 0) {
1510 /* remove lease just inserted by setlease */
1511 flp->fl_type = F_UNLCK | F_INPROGRESS;
1512 flp->fl_break_time = jiffies - 10;
1513 time_out_leases(inode);
1514 goto out_unlock;
1515 }
1516
1517 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1518out_unlock:
1519 unlock_kernel();
1520 return error;
1521} 1557}
1522 1558
1523/** 1559/**
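do_fcntl_add_lease() above applies the same discipline to F_SETLEASE: both the lease and the fasync entry are allocated before lock_flocks(), fasync_insert_entry() under the lock either consumes the preallocation or returns the existing entry, and an unconsumed preallocation is freed after unlocking. A standalone sketch of that optimistic-preallocation shape (hypothetical table and names):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry { int key; struct entry *next; } *table;

/* Insert key if absent; returns the existing entry if already present,
 * NULL if 'new' was consumed. Must be called with the lock held. */
static struct entry *insert_entry(int key, struct entry *new)
{
	for (struct entry *e = table; e; e = e->next)
		if (e->key == key)
			return e;
	new->key = key;
	new->next = table;
	table = new;
	return NULL;
}

static int add_key(int key)
{
	struct entry *new = malloc(sizeof(*new));	/* preallocate outside the lock */

	if (!new)
		return -1;
	pthread_mutex_lock(&lock);
	struct entry *old = insert_entry(key, new);
	pthread_mutex_unlock(&lock);
	if (old)		/* preallocation not consumed: give it back */
		free(new);
	return 0;
}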
@@ -2020,7 +2056,7 @@ void locks_remove_flock(struct file *filp)
2020 fl.fl_ops->fl_release_private(&fl); 2056 fl.fl_ops->fl_release_private(&fl);
2021 } 2057 }
2022 2058
2023 lock_kernel(); 2059 lock_flocks();
2024 before = &inode->i_flock; 2060 before = &inode->i_flock;
2025 2061
2026 while ((fl = *before) != NULL) { 2062 while ((fl = *before) != NULL) {
@@ -2038,7 +2074,7 @@ void locks_remove_flock(struct file *filp)
2038 } 2074 }
2039 before = &fl->fl_next; 2075 before = &fl->fl_next;
2040 } 2076 }
2041 unlock_kernel(); 2077 unlock_flocks();
2042} 2078}
2043 2079
2044/** 2080/**
@@ -2053,12 +2089,12 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
2053{ 2089{
2054 int status = 0; 2090 int status = 0;
2055 2091
2056 lock_kernel(); 2092 lock_flocks();
2057 if (waiter->fl_next) 2093 if (waiter->fl_next)
2058 __locks_delete_block(waiter); 2094 __locks_delete_block(waiter);
2059 else 2095 else
2060 status = -ENOENT; 2096 status = -ENOENT;
2061 unlock_kernel(); 2097 unlock_flocks();
2062 return status; 2098 return status;
2063} 2099}
2064 2100
@@ -2085,7 +2121,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
2085#include <linux/seq_file.h> 2121#include <linux/seq_file.h>
2086 2122
2087static void lock_get_status(struct seq_file *f, struct file_lock *fl, 2123static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2088 int id, char *pfx) 2124 loff_t id, char *pfx)
2089{ 2125{
2090 struct inode *inode = NULL; 2126 struct inode *inode = NULL;
2091 unsigned int fl_pid; 2127 unsigned int fl_pid;
@@ -2098,7 +2134,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2098 if (fl->fl_file != NULL) 2134 if (fl->fl_file != NULL)
2099 inode = fl->fl_file->f_path.dentry->d_inode; 2135 inode = fl->fl_file->f_path.dentry->d_inode;
2100 2136
2101 seq_printf(f, "%d:%s ", id, pfx); 2137 seq_printf(f, "%lld:%s ", id, pfx);
2102 if (IS_POSIX(fl)) { 2138 if (IS_POSIX(fl)) {
2103 seq_printf(f, "%6s %s ", 2139 seq_printf(f, "%6s %s ",
2104 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", 2140 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
@@ -2161,30 +2197,33 @@ static int locks_show(struct seq_file *f, void *v)
2161 2197
2162 fl = list_entry(v, struct file_lock, fl_link); 2198 fl = list_entry(v, struct file_lock, fl_link);
2163 2199
2164 lock_get_status(f, fl, (long)f->private, ""); 2200 lock_get_status(f, fl, *((loff_t *)f->private), "");
2165 2201
2166 list_for_each_entry(bfl, &fl->fl_block, fl_block) 2202 list_for_each_entry(bfl, &fl->fl_block, fl_block)
2167 lock_get_status(f, bfl, (long)f->private, " ->"); 2203 lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
2168 2204
2169 f->private++;
2170 return 0; 2205 return 0;
2171} 2206}
2172 2207
2173static void *locks_start(struct seq_file *f, loff_t *pos) 2208static void *locks_start(struct seq_file *f, loff_t *pos)
2174{ 2209{
2175 lock_kernel(); 2210 loff_t *p = f->private;
2176 f->private = (void *)1; 2211
2212 lock_flocks();
2213 *p = (*pos + 1);
2177 return seq_list_start(&file_lock_list, *pos); 2214 return seq_list_start(&file_lock_list, *pos);
2178} 2215}
2179 2216
2180static void *locks_next(struct seq_file *f, void *v, loff_t *pos) 2217static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
2181{ 2218{
2219 loff_t *p = f->private;
2220 ++*p;
2182 return seq_list_next(v, &file_lock_list, pos); 2221 return seq_list_next(v, &file_lock_list, pos);
2183} 2222}
2184 2223
2185static void locks_stop(struct seq_file *f, void *v) 2224static void locks_stop(struct seq_file *f, void *v)
2186{ 2225{
2187 unlock_kernel(); 2226 unlock_flocks();
2188} 2227}
2189 2228
2190static const struct seq_operations locks_seq_operations = { 2229static const struct seq_operations locks_seq_operations = {
@@ -2196,14 +2235,14 @@ static const struct seq_operations locks_seq_operations = {
2196 2235
2197static int locks_open(struct inode *inode, struct file *filp) 2236static int locks_open(struct inode *inode, struct file *filp)
2198{ 2237{
2199 return seq_open(filp, &locks_seq_operations); 2238 return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
2200} 2239}
2201 2240
2202static const struct file_operations proc_locks_operations = { 2241static const struct file_operations proc_locks_operations = {
2203 .open = locks_open, 2242 .open = locks_open,
2204 .read = seq_read, 2243 .read = seq_read,
2205 .llseek = seq_lseek, 2244 .llseek = seq_lseek,
2206 .release = seq_release, 2245 .release = seq_release_private,
2207}; 2246};
2208 2247
2209static int __init proc_locks_init(void) 2248static int __init proc_locks_init(void)
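The /proc/locks iterator used to smuggle its position counter through f->private as a cast integer; the hunk above gives it a real per-open loff_t via seq_open_private(), which allocates a zeroed buffer of the requested size and which seq_release_private() later frees. The minimal pairing, sketched for a hypothetical proc file (iterator callbacks omitted):

#include <linux/fs.h>
#include <linux/seq_file.h>

static const struct seq_operations example_seq_ops;	/* start/next/stop/show elided */

static int example_open(struct inode *inode, struct file *filp)
{
	/* allocates sizeof(loff_t) zeroed bytes as seq_file->private */
	return seq_open_private(filp, &example_seq_ops, sizeof(loff_t));
}

static const struct file_operations example_fops = {
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,	/* frees the private buffer */
};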
@@ -2231,7 +2270,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2231{ 2270{
2232 struct file_lock *fl; 2271 struct file_lock *fl;
2233 int result = 1; 2272 int result = 1;
2234 lock_kernel(); 2273 lock_flocks();
2235 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2274 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2236 if (IS_POSIX(fl)) { 2275 if (IS_POSIX(fl)) {
2237 if (fl->fl_type == F_RDLCK) 2276 if (fl->fl_type == F_RDLCK)
@@ -2248,7 +2287,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2248 result = 0; 2287 result = 0;
2249 break; 2288 break;
2250 } 2289 }
2251 unlock_kernel(); 2290 unlock_flocks();
2252 return result; 2291 return result;
2253} 2292}
2254 2293
@@ -2271,7 +2310,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2271{ 2310{
2272 struct file_lock *fl; 2311 struct file_lock *fl;
2273 int result = 1; 2312 int result = 1;
2274 lock_kernel(); 2313 lock_flocks();
2275 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2314 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2276 if (IS_POSIX(fl)) { 2315 if (IS_POSIX(fl)) {
2277 if ((fl->fl_end < start) || (fl->fl_start > (start + len))) 2316 if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2286,7 +2325,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2286 result = 0; 2325 result = 0;
2287 break; 2326 break;
2288 } 2327 }
2289 unlock_kernel(); 2328 unlock_flocks();
2290 return result; 2329 return result;
2291} 2330}
2292 2331
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9bd2ce2a3040..723bc5bca09a 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
298 return sync_request(page, bdev, WRITE); 298 return sync_request(page, bdev, WRITE);
299} 299}
300 300
301static void bdev_put_device(struct super_block *sb) 301static void bdev_put_device(struct logfs_super *s)
302{ 302{
303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); 303 blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
304} 304}
305 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs) 306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -320,20 +320,24 @@ static const struct logfs_device_ops bd_devops = {
320 .put_device = bdev_put_device, 320 .put_device = bdev_put_device,
321}; 321};
322 322
323int logfs_get_sb_bdev(struct file_system_type *type, int flags, 323int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
324 const char *devname, struct vfsmount *mnt) 324 const char *devname)
325{ 325{
326 struct block_device *bdev; 326 struct block_device *bdev;
327 327
328 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); 328 bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
329 type);
329 if (IS_ERR(bdev)) 330 if (IS_ERR(bdev))
330 return PTR_ERR(bdev); 331 return PTR_ERR(bdev);
331 332
332 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { 333 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
333 int mtdnr = MINOR(bdev->bd_dev); 334 int mtdnr = MINOR(bdev->bd_dev);
334 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 335 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
335 return logfs_get_sb_mtd(type, flags, mtdnr, mnt); 336 return logfs_get_sb_mtd(p, mtdnr);
336 } 337 }
337 338
338 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); 339 p->s_bdev = bdev;
340 p->s_mtd = NULL;
341 p->s_devops = &bd_devops;
342 return 0;
339} 343}
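open_bdev_exclusive()/close_bdev_exclusive() are replaced above by blkdev_get_by_path()/blkdev_put(): exclusivity is now requested explicitly with FMODE_EXCL, and the holder cookie (logfs passes its file_system_type) identifies the claim. A sketch of the open/use/close pairing for a hypothetical caller:

#include <linux/fs.h>
#include <linux/err.h>

static int example_claim_device(const char *path, void *holder)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);	/* holder identifies the exclusive claim */
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* ... use the device ... */

	/* the mode must match the get, including FMODE_EXCL */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	return 0;
}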
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index a85d47d13e4b..7466e9dcc8c5 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
230 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 230 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
231} 231}
232 232
233static void mtd_put_device(struct super_block *sb) 233static void mtd_put_device(struct logfs_super *s)
234{ 234{
235 put_mtd_device(logfs_super(sb)->s_mtd); 235 put_mtd_device(s->s_mtd);
236} 236}
237 237
238static int mtd_can_write_buf(struct super_block *sb, u64 ofs) 238static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
@@ -265,14 +265,14 @@ static const struct logfs_device_ops mtd_devops = {
265 .put_device = mtd_put_device, 265 .put_device = mtd_put_device,
266}; 266};
267 267
268int logfs_get_sb_mtd(struct file_system_type *type, int flags, 268int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
269 int mtdnr, struct vfsmount *mnt)
270{ 269{
271 struct mtd_info *mtd; 270 struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
272 const struct logfs_device_ops *devops = &mtd_devops;
273
274 mtd = get_mtd_device(NULL, mtdnr);
275 if (IS_ERR(mtd)) 271 if (IS_ERR(mtd))
276 return PTR_ERR(mtd); 272 return PTR_ERR(mtd);
277 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); 273
274 s->s_bdev = NULL;
275 s->s_mtd = mtd;
276 s->s_devops = &mtd_devops;
277 return 0;
278} 278}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9777eb5b5522..f9ddf0c388c8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
555 return __logfs_create(dir, dentry, inode, target, destlen); 555 return __logfs_create(dir, dentry, inode, target, destlen);
556} 556}
557 557
558static int logfs_permission(struct inode *inode, int mask) 558static int logfs_permission(struct inode *inode, int mask, unsigned int flags)
559{ 559{
560 return generic_permission(inode, mask, NULL); 560 if (flags & IPERM_FLAG_RCU)
561 return -ECHILD;
562 return generic_permission(inode, mask, flags, NULL);
561} 563}
562 564
563static int logfs_link(struct dentry *old_dentry, struct inode *dir, 565static int logfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -569,7 +571,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
569 return -EMLINK; 571 return -EMLINK;
570 572
571 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 573 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
572 atomic_inc(&inode->i_count); 574 ihold(inode);
573 inode->i_nlink++; 575 inode->i_nlink++;
574 mark_inode_dirty_sync(inode); 576 mark_inode_dirty_sync(inode);
575 577
@@ -827,4 +829,5 @@ const struct file_operations logfs_dir_fops = {
827 .unlocked_ioctl = logfs_ioctl, 829 .unlocked_ioctl = logfs_ioctl,
828 .readdir = logfs_readdir, 830 .readdir = logfs_readdir,
829 .read = generic_read_dir, 831 .read = generic_read_dir,
832 .llseek = default_llseek,
830}; 833};
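logfs_permission() above is the minimal response to the new RCU path walk: a ->permission() callback invoked with IPERM_FLAG_RCU may not sleep or take references, and returning -ECHILD asks the VFS to retry the lookup in ref-walk mode, where the flag is clear. The caller side of that protocol, sketched for illustration only (this is not the actual VFS code):

#include <linux/fs.h>

static int example_check_access(struct inode *inode, int mask)
{
	/* try the lockless (RCU-walk) variant first */
	int err = inode->i_op->permission(inode, mask, IPERM_FLAG_RCU);

	if (err == -ECHILD)	/* callback opted out: retry in ref-walk mode */
		err = inode->i_op->permission(inode, mask, 0);
	return err;
}

The same hunk also switches atomic_inc(&inode->i_count) to ihold(), the accessor introduced for taking an extra inode reference.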
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index d8c71ece098f..03b8c240aeda 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
141 return __logfs_iget(sb, ino); 141 return __logfs_iget(sb, ino);
142} 142}
143 143
144static void logfs_i_callback(struct rcu_head *head)
145{
146 struct inode *inode = container_of(head, struct inode, i_rcu);
147 INIT_LIST_HEAD(&inode->i_dentry);
148 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
149}
150
144static void __logfs_destroy_inode(struct inode *inode) 151static void __logfs_destroy_inode(struct inode *inode)
145{ 152{
146 struct logfs_inode *li = logfs_inode(inode); 153 struct logfs_inode *li = logfs_inode(inode);
147 154
148 BUG_ON(li->li_block); 155 BUG_ON(li->li_block);
149 list_del(&li->li_freeing_list); 156 list_del(&li->li_freeing_list);
150 kmem_cache_free(logfs_inode_cache, li); 157 call_rcu(&inode->i_rcu, logfs_i_callback);
151} 158}
152 159
153static void logfs_destroy_inode(struct inode *inode) 160static void logfs_destroy_inode(struct inode *inode)
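Freeing the inode through call_rcu() rather than kmem_cache_free() defers the free past an RCU grace period, so lockless (RCU-walk) lookups that may still be dereferencing the inode never touch freed memory; i_dentry is reinitialized in the callback because i_rcu shares its storage in a union. The same shape, sketched for a hypothetical filesystem (the minix hunk further down gets the identical treatment):

#include <linux/fs.h>
#include <linux/slab.h>

static struct kmem_cache *example_inode_cache;

static void example_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);	/* i_rcu overlays i_dentry */
	kmem_cache_free(example_inode_cache, inode);
}

static void example_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, example_i_callback);	/* free after grace period */
}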
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index f46ee8b0e135..9da29706f91c 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
828 super->s_journal_seg[i] = segno; 828 super->s_journal_seg[i] = segno;
829 super->s_journal_ec[i] = ec; 829 super->s_journal_ec[i] = ec;
830 logfs_set_segment_reserved(sb, segno); 830 logfs_set_segment_reserved(sb, segno);
831 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL); 831 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
832 BUG_ON(err); /* mempool should prevent this */ 832 BUG_ON(err); /* mempool should prevent this */
833 err = logfs_erase_segment(sb, segno, 1); 833 err = logfs_erase_segment(sb, segno, 1);
834 BUG_ON(err); /* FIXME: remount-ro would be nicer */ 834 BUG_ON(err); /* FIXME: remount-ro would be nicer */
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index b8786264d243..57afd4a6fabb 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -136,6 +136,7 @@ struct logfs_area_ops {
136 int (*erase_segment)(struct logfs_area *area); 136 int (*erase_segment)(struct logfs_area *area);
137}; 137};
138 138
139struct logfs_super; /* forward */
139/** 140/**
140 * struct logfs_device_ops - device access operations 141 * struct logfs_device_ops - device access operations
141 * 142 *
@@ -156,7 +157,7 @@ struct logfs_device_ops {
156 int ensure_write); 157 int ensure_write);
157 int (*can_write_buf)(struct super_block *sb, u64 ofs); 158 int (*can_write_buf)(struct super_block *sb, u64 ofs);
158 void (*sync)(struct super_block *sb); 159 void (*sync)(struct super_block *sb);
159 void (*put_device)(struct super_block *sb); 160 void (*put_device)(struct logfs_super *s);
160}; 161};
161 162
162/** 163/**
@@ -471,11 +472,13 @@ void logfs_compr_exit(void);
471 472
472/* dev_bdev.c */ 473/* dev_bdev.c */
473#ifdef CONFIG_BLOCK 474#ifdef CONFIG_BLOCK
474int logfs_get_sb_bdev(struct file_system_type *type, int flags, 475int logfs_get_sb_bdev(struct logfs_super *s,
475 const char *devname, struct vfsmount *mnt); 476 struct file_system_type *type,
477 const char *devname);
476#else 478#else
477static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, 479static inline int logfs_get_sb_bdev(struct logfs_super *s,
478 const char *devname, struct vfsmount *mnt) 480 struct file_system_type *type,
481 const char *devname)
479{ 482{
480 return -ENODEV; 483 return -ENODEV;
481} 484}
@@ -483,11 +486,9 @@ static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
483 486
484/* dev_mtd.c */ 487/* dev_mtd.c */
485#ifdef CONFIG_MTD 488#ifdef CONFIG_MTD
486int logfs_get_sb_mtd(struct file_system_type *type, int flags, 489int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
487 int mtdnr, struct vfsmount *mnt);
488#else 490#else
489static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags, 491static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
490 int mtdnr, struct vfsmount *mnt)
491{ 492{
492 return -ENODEV; 493 return -ENODEV;
493} 494}
@@ -619,9 +620,6 @@ void emergency_read_end(struct page *page);
619void logfs_crash_dump(struct super_block *sb); 620void logfs_crash_dump(struct super_block *sb);
620void *memchr_inv(const void *s, int c, size_t n); 621void *memchr_inv(const void *s, int c, size_t n);
621int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); 622int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
622int logfs_get_sb_device(struct file_system_type *type, int flags,
623 struct mtd_info *mtd, struct block_device *bdev,
624 const struct logfs_device_ops *devops, struct vfsmount *mnt);
625int logfs_check_ds(struct logfs_disk_super *ds); 623int logfs_check_ds(struct logfs_disk_super *ds);
626int logfs_write_sb(struct super_block *sb); 624int logfs_write_sb(struct super_block *sb);
627 625
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 6127baf0e188..ee99a9f5dfd3 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode)
1994 1994
1995 /* FIXME: transaction is part of logfs_block now. Is that enough? */ 1995 /* FIXME: transaction is part of logfs_block now. Is that enough? */
1996 err = logfs_write_buf(master_inode, page, 0); 1996 err = logfs_write_buf(master_inode, page, 0);
1997 if (err)
1998 move_page_to_inode(inode, page);
1999
1997 logfs_put_write_page(page); 2000 logfs_put_write_page(page);
1998 return err; 2001 return err;
1999} 2002}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 5336155c5d81..33435e4b14d2 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb)
325 return 0; 325 return 0;
326} 326}
327 327
328static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) 328static int logfs_get_sb_final(struct super_block *sb)
329{ 329{
330 struct logfs_super *super = logfs_super(sb); 330 struct logfs_super *super = logfs_super(sb);
331 struct inode *rootdir; 331 struct inode *rootdir;
@@ -356,7 +356,6 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
356 } 356 }
357 357
358 log_super("LogFS: Finished mounting\n"); 358 log_super("LogFS: Finished mounting\n");
359 simple_set_mnt(mnt, sb);
360 return 0; 359 return 0;
361 360
362fail: 361fail:
@@ -529,43 +528,37 @@ static void logfs_kill_sb(struct super_block *sb)
529 logfs_cleanup_rw(sb); 528 logfs_cleanup_rw(sb);
530 if (super->s_erase_page) 529 if (super->s_erase_page)
531 __free_page(super->s_erase_page); 530 __free_page(super->s_erase_page);
532 super->s_devops->put_device(sb); 531 super->s_devops->put_device(super);
533 logfs_mempool_destroy(super->s_btree_pool); 532 logfs_mempool_destroy(super->s_btree_pool);
534 logfs_mempool_destroy(super->s_alias_pool); 533 logfs_mempool_destroy(super->s_alias_pool);
535 kfree(super); 534 kfree(super);
536 log_super("LogFS: Finished unmounting\n"); 535 log_super("LogFS: Finished unmounting\n");
537} 536}
538 537
539int logfs_get_sb_device(struct file_system_type *type, int flags, 538static struct dentry *logfs_get_sb_device(struct logfs_super *super,
540 struct mtd_info *mtd, struct block_device *bdev, 539 struct file_system_type *type, int flags)
541 const struct logfs_device_ops *devops, struct vfsmount *mnt)
542{ 540{
543 struct logfs_super *super;
544 struct super_block *sb; 541 struct super_block *sb;
545 int err = -ENOMEM; 542 int err = -ENOMEM;
546 static int mount_count; 543 static int mount_count;
547 544
548 log_super("LogFS: Start mount %x\n", mount_count++); 545 log_super("LogFS: Start mount %x\n", mount_count++);
549 super = kzalloc(sizeof(*super), GFP_KERNEL);
550 if (!super)
551 goto err0;
552 546
553 super->s_mtd = mtd;
554 super->s_bdev = bdev;
555 err = -EINVAL; 547 err = -EINVAL;
556 sb = sget(type, logfs_sb_test, logfs_sb_set, super); 548 sb = sget(type, logfs_sb_test, logfs_sb_set, super);
557 if (IS_ERR(sb)) 549 if (IS_ERR(sb)) {
558 goto err0; 550 super->s_devops->put_device(super);
551 kfree(super);
552 return ERR_CAST(sb);
553 }
559 554
560 if (sb->s_root) { 555 if (sb->s_root) {
561 /* Device is already in use */ 556 /* Device is already in use */
562 err = 0; 557 super->s_devops->put_device(super);
563 simple_set_mnt(mnt, sb); 558 kfree(super);
564 goto err0; 559 return dget(sb->s_root);
565 } 560 }
566 561
567 super->s_devops = devops;
568
569 /* 562 /*
570 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache 563 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
571 * only covers 16TB and the upper 8TB are used for indirect blocks. 564 * only covers 16TB and the upper 8TB are used for indirect blocks.
@@ -581,10 +574,12 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
581 goto err1; 574 goto err1;
582 575
583 sb->s_flags |= MS_ACTIVE; 576 sb->s_flags |= MS_ACTIVE;
584 err = logfs_get_sb_final(sb, mnt); 577 err = logfs_get_sb_final(sb);
585 if (err) 578 if (err) {
586 deactivate_locked_super(sb); 579 deactivate_locked_super(sb);
587 return err; 580 return ERR_PTR(err);
581 }
582 return dget(sb->s_root);
588 583
589err1: 584err1:
590 /* no ->s_root, no ->put_super() */ 585 /* no ->s_root, no ->put_super() */
@@ -592,37 +587,45 @@ err1:
592 iput(super->s_segfile_inode); 587 iput(super->s_segfile_inode);
593 iput(super->s_mapping_inode); 588 iput(super->s_mapping_inode);
594 deactivate_locked_super(sb); 589 deactivate_locked_super(sb);
595 return err; 590 return ERR_PTR(err);
596err0:
597 kfree(super);
598 //devops->put_device(sb);
599 return err;
600} 591}
601 592
602static int logfs_get_sb(struct file_system_type *type, int flags, 593static struct dentry *logfs_mount(struct file_system_type *type, int flags,
603 const char *devname, void *data, struct vfsmount *mnt) 594 const char *devname, void *data)
604{ 595{
605 ulong mtdnr; 596 ulong mtdnr;
597 struct logfs_super *super;
598 int err;
606 599
607 if (!devname) 600 super = kzalloc(sizeof(*super), GFP_KERNEL);
608 return logfs_get_sb_bdev(type, flags, devname, mnt); 601 if (!super)
609 if (strncmp(devname, "mtd", 3)) 602 return ERR_PTR(-ENOMEM);
610 return logfs_get_sb_bdev(type, flags, devname, mnt);
611 603
612 { 604 if (!devname)
605 err = logfs_get_sb_bdev(super, type, devname);
606 else if (strncmp(devname, "mtd", 3))
607 err = logfs_get_sb_bdev(super, type, devname);
608 else {
613 char *garbage; 609 char *garbage;
614 mtdnr = simple_strtoul(devname+3, &garbage, 0); 610 mtdnr = simple_strtoul(devname+3, &garbage, 0);
615 if (*garbage) 611 if (*garbage)
616 return -EINVAL; 612 err = -EINVAL;
613 else
614 err = logfs_get_sb_mtd(super, mtdnr);
615 }
616
617 if (err) {
618 kfree(super);
619 return ERR_PTR(err);
617 } 620 }
618 621
619 return logfs_get_sb_mtd(type, flags, mtdnr, mnt); 622 return logfs_get_sb_device(super, type, flags);
620} 623}
621 624
622static struct file_system_type logfs_fs_type = { 625static struct file_system_type logfs_fs_type = {
623 .owner = THIS_MODULE, 626 .owner = THIS_MODULE,
624 .name = "logfs", 627 .name = "logfs",
625 .get_sb = logfs_get_sb, 628 .mount = logfs_mount,
626 .kill_sb = logfs_kill_sb, 629 .kill_sb = logfs_kill_sb,
627 .fs_flags = FS_REQUIRES_DEV, 630 .fs_flags = FS_REQUIRES_DEV,
628 631
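
The logfs hunks above follow the tree-wide conversion from ->get_sb() to ->mount(): the filesystem no longer fills in a vfsmount, it returns the root dentry of the mounted superblock (or an ERR_PTR). logfs has to hand-roll the sget() path because it also supports MTD devices, but a plain block-device filesystem reduces to a one-line wrapper. A minimal sketch of that common case, where "examplefs" and examplefs_fill_super are hypothetical names, not part of this patch:

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
{
	/* mount_bdev() opens the device and calls back into fill_super */
	return mount_bdev(fs_type, flags, dev_name, data,
			  examplefs_fill_super);
}

static struct file_system_type examplefs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
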
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 93444747237b..a25444ab2baf 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -76,18 +76,6 @@ EXPORT_SYMBOL(mb_cache_entry_find_first);
76EXPORT_SYMBOL(mb_cache_entry_find_next); 76EXPORT_SYMBOL(mb_cache_entry_find_next);
77#endif 77#endif
78 78
79struct mb_cache {
80 struct list_head c_cache_list;
81 const char *c_name;
82 atomic_t c_entry_count;
83 int c_max_entries;
84 int c_bucket_bits;
85 struct kmem_cache *c_entry_cache;
86 struct list_head *c_block_hash;
87 struct list_head *c_index_hash;
88};
89
90
91/* 79/*
92 * Global data: list of all mbcache's, lru list, and a spinlock for 80 * Global data: list of all mbcache's, lru list, and a spinlock for
93 * accessing cache data structures on SMP machines. The lru list is 81 * accessing cache data structures on SMP machines. The lru list is
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e39d6bf2e8fb..ae0b83f476a6 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb)
68 return &ei->vfs_inode; 68 return &ei->vfs_inode;
69} 69}
70 70
71static void minix_destroy_inode(struct inode *inode) 71static void minix_i_callback(struct rcu_head *head)
72{ 72{
73 struct inode *inode = container_of(head, struct inode, i_rcu);
74 INIT_LIST_HEAD(&inode->i_dentry);
73 kmem_cache_free(minix_inode_cachep, minix_i(inode)); 75 kmem_cache_free(minix_inode_cachep, minix_i(inode));
74} 76}
75 77
78static void minix_destroy_inode(struct inode *inode)
79{
80 call_rcu(&inode->i_rcu, minix_i_callback);
81}
82
76static void init_once(void *foo) 83static void init_once(void *foo)
77{ 84{
78 struct minix_inode_info *ei = (struct minix_inode_info *) foo; 85 struct minix_inode_info *ei = (struct minix_inode_info *) foo;
@@ -614,17 +621,16 @@ void minix_truncate(struct inode * inode)
614 V2_minix_truncate(inode); 621 V2_minix_truncate(inode);
615} 622}
616 623
617static int minix_get_sb(struct file_system_type *fs_type, 624static struct dentry *minix_mount(struct file_system_type *fs_type,
618 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 625 int flags, const char *dev_name, void *data)
619{ 626{
620 return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super, 627 return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
621 mnt);
622} 628}
623 629
624static struct file_system_type minix_fs_type = { 630static struct file_system_type minix_fs_type = {
625 .owner = THIS_MODULE, 631 .owner = THIS_MODULE,
626 .name = "minix", 632 .name = "minix",
627 .get_sb = minix_get_sb, 633 .mount = minix_mount,
628 .kill_sb = kill_block_super, 634 .kill_sb = kill_block_super,
629 .fs_flags = FS_REQUIRES_DEV, 635 .fs_flags = FS_REQUIRES_DEV,
630}; 636};
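
Two independent changes land in minix/inode.c above: RCU-delayed inode freeing and the ->mount() conversion (the minix_mount()/mount_bdev() pairing is the same pattern sketched after the logfs diff). The RCU part matters for the rcu-walk code later in this series: a lock-free lookup may still dereference an inode after ->destroy_inode() runs, so the actual slab free is deferred past a grace period. A hedged sketch of the same pattern for a hypothetical filesystem, with "foofs" names as placeholders:

static void foofs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	/* i_rcu shares storage with i_dentry, so reinitialize the list */
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
}

static void foofs_destroy_inode(struct inode *inode)
{
	/* free only after all current rcu-walk readers are done */
	call_rcu(&inode->i_rcu, foofs_i_callback);
}
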
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f3f3578393a4..ce7337ddfdbf 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,8 +23,6 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
23 struct inode * inode = NULL; 23 struct inode * inode = NULL;
24 ino_t ino; 24 ino_t ino;
25 25
26 dentry->d_op = dir->i_sb->s_root->d_op;
27
28 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) 26 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
29 return ERR_PTR(-ENAMETOOLONG); 27 return ERR_PTR(-ENAMETOOLONG);
30 28
@@ -101,7 +99,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
101 99
102 inode->i_ctime = CURRENT_TIME_SEC; 100 inode->i_ctime = CURRENT_TIME_SEC;
103 inode_inc_link_count(inode); 101 inode_inc_link_count(inode);
104 atomic_inc(&inode->i_count); 102 ihold(inode);
105 return add_nondir(dentry, inode); 103 return add_nondir(dentry, inode);
106} 104}
107 105
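
The minix/namei.c hunks drop the per-lookup d_op assignment (dentry operations are now set generically rather than by each ->lookup()) and switch from the open-coded atomic_inc(&inode->i_count) to the ihold() helper for taking an extra inode reference. A hedged sketch of the ihold() idiom in a hypothetical link operation, where foofs_add_link is a placeholder:

static int foofs_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
	struct inode *inode = old_dentry->d_inode;

	inode->i_ctime = CURRENT_TIME_SEC;
	inode_inc_link_count(inode);
	ihold(inode);		/* reference held by the new dentry */
	return foofs_add_link(dentry, inode);
}
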
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
40 * status of that page is hard. See end_buffer_async_read() for the details. 40 * status of that page is hard. See end_buffer_async_read() for the details.
41 * There is no point in duplicating all that complexity. 41 * There is no point in duplicating all that complexity.
42 */ 42 */
43static void mpage_end_io_read(struct bio *bio, int err) 43static void mpage_end_io(struct bio *bio, int err)
44{ 44{
45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
50 50
51 if (--bvec >= bio->bi_io_vec) 51 if (--bvec >= bio->bi_io_vec)
52 prefetchw(&bvec->bv_page->flags); 52 prefetchw(&bvec->bv_page->flags);
53 53 if (bio_data_dir(bio) == READ) {
54 if (uptodate) { 54 if (uptodate) {
55 SetPageUptodate(page); 55 SetPageUptodate(page);
56 } else { 56 } else {
57 ClearPageUptodate(page); 57 ClearPageUptodate(page);
58 SetPageError(page); 58 SetPageError(page);
59 } 59 }
60 unlock_page(page); 60 unlock_page(page);
61 } while (bvec >= bio->bi_io_vec); 61 } else { /* bio_data_dir(bio) == WRITE */
62 bio_put(bio); 62 if (!uptodate) {
63} 63 SetPageError(page);
64 64 if (page->mapping)
65static void mpage_end_io_write(struct bio *bio, int err) 65 set_bit(AS_EIO, &page->mapping->flags);
66{ 66 }
67 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 67 end_page_writeback(page);
68 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
69
70 do {
71 struct page *page = bvec->bv_page;
72
73 if (--bvec >= bio->bi_io_vec)
74 prefetchw(&bvec->bv_page->flags);
75
76 if (!uptodate){
77 SetPageError(page);
78 if (page->mapping)
79 set_bit(AS_EIO, &page->mapping->flags);
80 } 68 }
81 end_page_writeback(page);
82 } while (bvec >= bio->bi_io_vec); 69 } while (bvec >= bio->bi_io_vec);
83 bio_put(bio); 70 bio_put(bio);
84} 71}
85 72
86static struct bio *mpage_bio_submit(int rw, struct bio *bio) 73static struct bio *mpage_bio_submit(int rw, struct bio *bio)
87{ 74{
88 bio->bi_end_io = mpage_end_io_read; 75 bio->bi_end_io = mpage_end_io;
89 if (rw == WRITE)
90 bio->bi_end_io = mpage_end_io_write;
91 submit_bio(rw, bio); 76 submit_bio(rw, bio);
92 return NULL; 77 return NULL;
93} 78}
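
The mpage change collapses the separate read and write completion handlers into one mpage_end_io() that branches on bio_data_dir(), so mpage_bio_submit() no longer has to pick a callback per direction. Filesystems consuming these helpers are unaffected; for reference, a hedged sketch of typical callers, where foofs_get_block is a hypothetical get_block_t:

static int foofs_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, foofs_get_block);
}

static int foofs_writepage(struct page *page, struct writeback_control *wbc)
{
	return mpage_writepage(page, foofs_get_block, wbc);
}
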
diff --git a/fs/namei.c b/fs/namei.c
index 24896e833565..7d77f24d32a9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
169/* 169/*
170 * This does basic POSIX ACL permission checking 170 * This does basic POSIX ACL permission checking
171 */ 171 */
172static int acl_permission_check(struct inode *inode, int mask, 172static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
173 int (*check_acl)(struct inode *inode, int mask)) 173 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
174{ 174{
175 umode_t mode = inode->i_mode; 175 umode_t mode = inode->i_mode;
176 176
@@ -180,7 +180,7 @@ static int acl_permission_check(struct inode *inode, int mask,
180 mode >>= 6; 180 mode >>= 6;
181 else { 181 else {
182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
183 int error = check_acl(inode, mask); 183 int error = check_acl(inode, mask, flags);
184 if (error != -EAGAIN) 184 if (error != -EAGAIN)
185 return error; 185 return error;
186 } 186 }
@@ -198,25 +198,30 @@ static int acl_permission_check(struct inode *inode, int mask,
198} 198}
199 199
200/** 200/**
201 * generic_permission - check for access rights on a Posix-like filesystem 201 * generic_permission - check for access rights on a Posix-like filesystem
202 * @inode: inode to check access rights for 202 * @inode: inode to check access rights for
203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
204 * @check_acl: optional callback to check for Posix ACLs 204 * @check_acl: optional callback to check for Posix ACLs
205 * @flags: IPERM_FLAG_ flags.
205 * 206 *
206 * Used to check for read/write/execute permissions on a file. 207 * Used to check for read/write/execute permissions on a file.
207 * We use "fsuid" for this, letting us set arbitrary permissions 208 * We use "fsuid" for this, letting us set arbitrary permissions
208 * for filesystem access without changing the "normal" uids which 209 * for filesystem access without changing the "normal" uids which
209 * are used for other things.. 210 * are used for other things.
211 *
212 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
213 * request cannot be satisfied (eg. requires blocking or too much complexity).
214 * It would then be called again in ref-walk mode.
210 */ 215 */
211int generic_permission(struct inode *inode, int mask, 216int generic_permission(struct inode *inode, int mask, unsigned int flags,
212 int (*check_acl)(struct inode *inode, int mask)) 217 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
213{ 218{
214 int ret; 219 int ret;
215 220
216 /* 221 /*
217 * Do the basic POSIX ACL permission checks. 222 * Do the basic POSIX ACL permission checks.
218 */ 223 */
219 ret = acl_permission_check(inode, mask, check_acl); 224 ret = acl_permission_check(inode, mask, flags, check_acl);
220 if (ret != -EACCES) 225 if (ret != -EACCES)
221 return ret; 226 return ret;
222 227
@@ -271,9 +276,10 @@ int inode_permission(struct inode *inode, int mask)
271 } 276 }
272 277
273 if (inode->i_op->permission) 278 if (inode->i_op->permission)
274 retval = inode->i_op->permission(inode, mask); 279 retval = inode->i_op->permission(inode, mask, 0);
275 else 280 else
276 retval = generic_permission(inode, mask, inode->i_op->check_acl); 281 retval = generic_permission(inode, mask, 0,
282 inode->i_op->check_acl);
277 283
278 if (retval) 284 if (retval)
279 return retval; 285 return retval;
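
These namei.c hunks thread a flags argument through the permission paths so rcu-walk can be signalled: under IPERM_FLAG_RCU the callee runs inside rcu_read_lock() and must not block, returning -ECHILD instead so the walk can retry under references. A hedged sketch of a filesystem ->permission() against the new three-argument prototype, with "foofs" and foofs_check_acl as placeholders:

static int foofs_permission(struct inode *inode, int mask, unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;	/* might have to read ACLs from disk */
	return generic_permission(inode, mask, flags, foofs_check_acl);
}
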
@@ -375,6 +381,181 @@ void path_put(struct path *path)
375EXPORT_SYMBOL(path_put); 381EXPORT_SYMBOL(path_put);
376 382
377/** 383/**
384 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
385 * @nd: nameidata pathwalk data to drop
386 * Returns: 0 on success, -ECHILD on failure
387 *
388 * Path walking has 2 modes, rcu-walk and ref-walk (see
389 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
390 * to drop out of rcu-walk mode and take normal reference counts on dentries
391 * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
392 * refcounts at the last known good point before rcu-walk got stuck, so
393 * ref-walk may continue from there. If this is not successful (eg. a seqcount
394 * has changed), then failure is returned and path walk restarts from the
395 * beginning in ref-walk mode.
396 *
397 * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
398 * ref-walk. Must be called from rcu-walk context.
399 */
400static int nameidata_drop_rcu(struct nameidata *nd)
401{
402 struct fs_struct *fs = current->fs;
403 struct dentry *dentry = nd->path.dentry;
404
405 BUG_ON(!(nd->flags & LOOKUP_RCU));
406 if (nd->root.mnt) {
407 spin_lock(&fs->lock);
408 if (nd->root.mnt != fs->root.mnt ||
409 nd->root.dentry != fs->root.dentry)
410 goto err_root;
411 }
412 spin_lock(&dentry->d_lock);
413 if (!__d_rcu_to_refcount(dentry, nd->seq))
414 goto err;
415 BUG_ON(nd->inode != dentry->d_inode);
416 spin_unlock(&dentry->d_lock);
417 if (nd->root.mnt) {
418 path_get(&nd->root);
419 spin_unlock(&fs->lock);
420 }
421 mntget(nd->path.mnt);
422
423 rcu_read_unlock();
424 br_read_unlock(vfsmount_lock);
425 nd->flags &= ~LOOKUP_RCU;
426 return 0;
427err:
428 spin_unlock(&dentry->d_lock);
429err_root:
430 if (nd->root.mnt)
431 spin_unlock(&fs->lock);
432 return -ECHILD;
433}
434
435/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
436static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
437{
438 if (nd->flags & LOOKUP_RCU)
439 return nameidata_drop_rcu(nd);
440 return 0;
441}
442
443/**
444 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
445 * @nd: nameidata pathwalk data to drop
446 * @dentry: dentry to drop
447 * Returns: 0 on success, -ECHILD on failure
448 *
449 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
450 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
451 * @nd. Must be called from rcu-walk context.
452 */
453static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
454{
455 struct fs_struct *fs = current->fs;
456 struct dentry *parent = nd->path.dentry;
457
458 /*
 459 * It is possible to revalidate the dentry that we started

460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465
466 BUG_ON(!(nd->flags & LOOKUP_RCU));
467 if (nd->root.mnt) {
468 spin_lock(&fs->lock);
469 if (nd->root.mnt != fs->root.mnt ||
470 nd->root.dentry != fs->root.dentry)
471 goto err_root;
472 }
473 spin_lock(&parent->d_lock);
474 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
475 if (!__d_rcu_to_refcount(dentry, nd->seq))
476 goto err;
477 /*
478 * If the sequence check on the child dentry passed, then the child has
479 * not been removed from its parent. This means the parent dentry must
480 * be valid and able to take a reference at this point.
481 */
482 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
483 BUG_ON(!parent->d_count);
484 parent->d_count++;
485 spin_unlock(&dentry->d_lock);
486 spin_unlock(&parent->d_lock);
487 if (nd->root.mnt) {
488 path_get(&nd->root);
489 spin_unlock(&fs->lock);
490 }
491 mntget(nd->path.mnt);
492
493 rcu_read_unlock();
494 br_read_unlock(vfsmount_lock);
495 nd->flags &= ~LOOKUP_RCU;
496 return 0;
497err:
498 spin_unlock(&dentry->d_lock);
499 spin_unlock(&parent->d_lock);
500err_root:
501 if (nd->root.mnt)
502 spin_unlock(&fs->lock);
503 return -ECHILD;
504}
505
506/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
507static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
508{
509 if (nd->flags & LOOKUP_RCU)
510 return nameidata_dentry_drop_rcu(nd, dentry);
511 return 0;
512}
513
514/**
515 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
516 * @nd: nameidata pathwalk data to drop
517 * Returns: 0 on success, -ECHILD on failure
518 *
519 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
520 * nd->path should be the final element of the lookup, so nd->root is discarded.
521 * Must be called from rcu-walk context.
522 */
523static int nameidata_drop_rcu_last(struct nameidata *nd)
524{
525 struct dentry *dentry = nd->path.dentry;
526
527 BUG_ON(!(nd->flags & LOOKUP_RCU));
528 nd->flags &= ~LOOKUP_RCU;
529 nd->root.mnt = NULL;
530 spin_lock(&dentry->d_lock);
531 if (!__d_rcu_to_refcount(dentry, nd->seq))
532 goto err_unlock;
533 BUG_ON(nd->inode != dentry->d_inode);
534 spin_unlock(&dentry->d_lock);
535
536 mntget(nd->path.mnt);
537
538 rcu_read_unlock();
539 br_read_unlock(vfsmount_lock);
540
541 return 0;
542
543err_unlock:
544 spin_unlock(&dentry->d_lock);
545 rcu_read_unlock();
546 br_read_unlock(vfsmount_lock);
547 return -ECHILD;
548}
549
550/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
551static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
552{
553 if (likely(nd->flags & LOOKUP_RCU))
554 return nameidata_drop_rcu_last(nd);
555 return 0;
556}
557
558/**
378 * release_open_intent - free up open intent resources 559 * release_open_intent - free up open intent resources
379 * @nd: pointer to nameidata 560 * @nd: pointer to nameidata
380 */ 561 */
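
All of the drop-out helpers above rely on the same two-step discipline: validate a seqcount-protected snapshot, then upgrade to real references (via __d_rcu_to_refcount() on the dentry, path_get()/mntget() on the mounts) only if nothing changed underneath. The read side of that seqcount idiom, as this patch applies it to fs->seq, looks like the following sketch; it is a plain copy loop, with no references taken:

static void sample_root(struct path *root)
{
	struct fs_struct *fs = current->fs;
	unsigned seq;

	do {
		seq = read_seqcount_begin(&fs->seq);
		*root = fs->root;	/* snapshot only; no path_get() */
	} while (read_seqcount_retry(&fs->seq, seq));
}
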
@@ -386,10 +567,33 @@ void release_open_intent(struct nameidata *nd)
386 fput(nd->intent.open.file); 567 fput(nd->intent.open.file);
387} 568}
388 569
570/*
571 * Call d_revalidate and handle filesystems that request rcu-walk
572 * to be dropped. This may be called and return in rcu-walk mode,
573 * regardless of success or error. If -ECHILD is returned, the caller
574 * must return -ECHILD back up the path walk stack so path walk may
575 * be restarted in ref-walk mode.
576 */
577static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
578{
579 int status;
580
581 status = dentry->d_op->d_revalidate(dentry, nd);
582 if (status == -ECHILD) {
583 if (nameidata_dentry_drop_rcu(nd, dentry))
584 return status;
585 status = dentry->d_op->d_revalidate(dentry, nd);
586 }
587
588 return status;
589}
590
389static inline struct dentry * 591static inline struct dentry *
390do_revalidate(struct dentry *dentry, struct nameidata *nd) 592do_revalidate(struct dentry *dentry, struct nameidata *nd)
391{ 593{
392 int status = dentry->d_op->d_revalidate(dentry, nd); 594 int status;
595
596 status = d_revalidate(dentry, nd);
393 if (unlikely(status <= 0)) { 597 if (unlikely(status <= 0)) {
394 /* 598 /*
395 * The dentry failed validation. 599 * The dentry failed validation.
@@ -397,19 +601,36 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
397 * the dentry otherwise d_revalidate is asking us 601 * the dentry otherwise d_revalidate is asking us
398 * to return a fail status. 602 * to return a fail status.
399 */ 603 */
400 if (!status) { 604 if (status < 0) {
605 /* If we're in rcu-walk, we don't have a ref */
606 if (!(nd->flags & LOOKUP_RCU))
607 dput(dentry);
608 dentry = ERR_PTR(status);
609
610 } else {
611 /* Don't d_invalidate in rcu-walk mode */
612 if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
613 return ERR_PTR(-ECHILD);
401 if (!d_invalidate(dentry)) { 614 if (!d_invalidate(dentry)) {
402 dput(dentry); 615 dput(dentry);
403 dentry = NULL; 616 dentry = NULL;
404 } 617 }
405 } else {
406 dput(dentry);
407 dentry = ERR_PTR(status);
408 } 618 }
409 } 619 }
410 return dentry; 620 return dentry;
411} 621}
412 622
623static inline int need_reval_dot(struct dentry *dentry)
624{
625 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
626 return 0;
627
628 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
629 return 0;
630
631 return 1;
632}
633
413/* 634/*
414 * force_reval_path - force revalidation of a dentry 635 * force_reval_path - force revalidation of a dentry
415 * 636 *
@@ -433,17 +654,19 @@ force_reval_path(struct path *path, struct nameidata *nd)
433 654
434 /* 655 /*
435 * only check on filesystems where it's possible for the dentry to 656 * only check on filesystems where it's possible for the dentry to
436 * become stale. It's assumed that if this flag is set then the 657 * become stale.
437 * d_revalidate op will also be defined.
438 */ 658 */
439 if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) 659 if (!need_reval_dot(dentry))
440 return 0; 660 return 0;
441 661
442 status = dentry->d_op->d_revalidate(dentry, nd); 662 status = d_revalidate(dentry, nd);
443 if (status > 0) 663 if (status > 0)
444 return 0; 664 return 0;
445 665
446 if (!status) { 666 if (!status) {
667 /* Don't d_invalidate in rcu-walk mode */
668 if (nameidata_drop_rcu(nd))
669 return -ECHILD;
447 d_invalidate(dentry); 670 d_invalidate(dentry);
448 status = -ESTALE; 671 status = -ESTALE;
449 } 672 }
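
Both do_revalidate() and force_reval_path() now funnel through the d_revalidate() wrapper, which retries in ref-walk whenever the filesystem answers -ECHILD. The counterpart on the filesystem side is an rcu-walk-aware ->d_revalidate(); a hedged sketch, where foofs_is_fresh is a placeholder that may sleep and returns 1 for a valid dentry:

static int foofs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	if (nd && (nd->flags & LOOKUP_RCU))
		return -ECHILD;		/* cannot block in rcu-walk */
	return foofs_is_fresh(dentry);
}
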
@@ -459,26 +682,27 @@ force_reval_path(struct path *path, struct nameidata *nd)
459 * short-cut DAC fails, then call ->permission() to do more 682 * short-cut DAC fails, then call ->permission() to do more
460 * complete permission check. 683 * complete permission check.
461 */ 684 */
462static int exec_permission(struct inode *inode) 685static inline int exec_permission(struct inode *inode, unsigned int flags)
463{ 686{
464 int ret; 687 int ret;
465 688
466 if (inode->i_op->permission) { 689 if (inode->i_op->permission) {
467 ret = inode->i_op->permission(inode, MAY_EXEC); 690 ret = inode->i_op->permission(inode, MAY_EXEC, flags);
468 if (!ret) 691 } else {
469 goto ok; 692 ret = acl_permission_check(inode, MAY_EXEC, flags,
470 return ret; 693 inode->i_op->check_acl);
471 } 694 }
472 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); 695 if (likely(!ret))
473 if (!ret)
474 goto ok; 696 goto ok;
697 if (ret == -ECHILD)
698 return ret;
475 699
476 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 700 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
477 goto ok; 701 goto ok;
478 702
479 return ret; 703 return ret;
480ok: 704ok:
481 return security_inode_permission(inode, MAY_EXEC); 705 return security_inode_exec_permission(inode, flags);
482} 706}
483 707
484static __always_inline void set_root(struct nameidata *nd) 708static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +713,23 @@ static __always_inline void set_root(struct nameidata *nd)
489 713
490static int link_path_walk(const char *, struct nameidata *); 714static int link_path_walk(const char *, struct nameidata *);
491 715
716static __always_inline void set_root_rcu(struct nameidata *nd)
717{
718 if (!nd->root.mnt) {
719 struct fs_struct *fs = current->fs;
720 unsigned seq;
721
722 do {
723 seq = read_seqcount_begin(&fs->seq);
724 nd->root = fs->root;
725 } while (read_seqcount_retry(&fs->seq, seq));
726 }
727}
728
492static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 729static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
493{ 730{
731 int ret;
732
494 if (IS_ERR(link)) 733 if (IS_ERR(link))
495 goto fail; 734 goto fail;
496 735
@@ -500,8 +739,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
500 nd->path = nd->root; 739 nd->path = nd->root;
501 path_get(&nd->root); 740 path_get(&nd->root);
502 } 741 }
742 nd->inode = nd->path.dentry->d_inode;
503 743
504 return link_path_walk(link, nd); 744 ret = link_path_walk(link, nd);
745 return ret;
505fail: 746fail:
506 path_put(&nd->path); 747 path_put(&nd->path);
507 return PTR_ERR(link); 748 return PTR_ERR(link);
@@ -514,30 +755,30 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
514 mntput(path->mnt); 755 mntput(path->mnt);
515} 756}
516 757
517static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 758static inline void path_to_nameidata(const struct path *path,
759 struct nameidata *nd)
518{ 760{
519 dput(nd->path.dentry); 761 if (!(nd->flags & LOOKUP_RCU)) {
520 if (nd->path.mnt != path->mnt) { 762 dput(nd->path.dentry);
521 mntput(nd->path.mnt); 763 if (nd->path.mnt != path->mnt)
522 nd->path.mnt = path->mnt; 764 mntput(nd->path.mnt);
523 } 765 }
766 nd->path.mnt = path->mnt;
524 nd->path.dentry = path->dentry; 767 nd->path.dentry = path->dentry;
525} 768}
526 769
527static __always_inline int 770static __always_inline int
528__do_follow_link(struct path *path, struct nameidata *nd, void **p) 771__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
529{ 772{
530 int error; 773 int error;
531 struct dentry *dentry = path->dentry; 774 struct dentry *dentry = link->dentry;
532 775
533 touch_atime(path->mnt, dentry); 776 touch_atime(link->mnt, dentry);
534 nd_set_link(nd, NULL); 777 nd_set_link(nd, NULL);
535 778
536 if (path->mnt != nd->path.mnt) { 779 if (link->mnt == nd->path.mnt)
537 path_to_nameidata(path, nd); 780 mntget(link->mnt);
538 dget(dentry); 781
539 }
540 mntget(path->mnt);
541 nd->last_type = LAST_BIND; 782 nd->last_type = LAST_BIND;
542 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 783 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
543 error = PTR_ERR(*p); 784 error = PTR_ERR(*p);
@@ -591,6 +832,20 @@ loop:
591 return err; 832 return err;
592} 833}
593 834
835static int follow_up_rcu(struct path *path)
836{
837 struct vfsmount *parent;
838 struct dentry *mountpoint;
839
840 parent = path->mnt->mnt_parent;
841 if (parent == path->mnt)
842 return 0;
843 mountpoint = path->mnt->mnt_mountpoint;
844 path->dentry = mountpoint;
845 path->mnt = parent;
846 return 1;
847}
848
594int follow_up(struct path *path) 849int follow_up(struct path *path)
595{ 850{
596 struct vfsmount *parent; 851 struct vfsmount *parent;
@@ -612,58 +867,295 @@ int follow_up(struct path *path)
612 return 1; 867 return 1;
613} 868}
614 869
615/* no need for dcache_lock, as serialization is taken care in 870/*
616 * namespace.c 871 * Perform an automount
872 * - return -EISDIR to tell follow_managed() to stop and return the path we
873 * were called with.
617 */ 874 */
618static int __follow_mount(struct path *path) 875static int follow_automount(struct path *path, unsigned flags,
876 bool *need_mntput)
619{ 877{
620 int res = 0; 878 struct vfsmount *mnt;
621 while (d_mountpoint(path->dentry)) { 879 int err;
622 struct vfsmount *mounted = lookup_mnt(path); 880
623 if (!mounted) 881 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
624 break; 882 return -EREMOTE;
883
884 /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
885 * and this is the terminal part of the path.
886 */
887 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
888 return -EISDIR; /* we actually want to stop here */
889
890 /* We want to mount if someone is trying to open/create a file of any
891 * type under the mountpoint, wants to traverse through the mountpoint
892 * or wants to open the mounted directory.
893 *
894 * We don't want to mount if someone's just doing a stat and they've
895 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
896 * appended a '/' to the name.
897 */
898 if (!(flags & LOOKUP_FOLLOW) &&
899 !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
900 LOOKUP_OPEN | LOOKUP_CREATE)))
901 return -EISDIR;
902
903 current->total_link_count++;
904 if (current->total_link_count >= 40)
905 return -ELOOP;
906
907 mnt = path->dentry->d_op->d_automount(path);
908 if (IS_ERR(mnt)) {
909 /*
910 * The filesystem is allowed to return -EISDIR here to indicate
911 * it doesn't want to automount. For instance, autofs would do
912 * this so that its userspace daemon can mount on this dentry.
913 *
914 * However, we can only permit this if it's a terminal point in
915 * the path being looked up; if it wasn't then the remainder of
916 * the path is inaccessible and we should say so.
917 */
918 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
919 return -EREMOTE;
920 return PTR_ERR(mnt);
921 }
922
923 if (!mnt) /* mount collision */
924 return 0;
925
926 err = finish_automount(mnt, path);
927
928 switch (err) {
929 case -EBUSY:
930 /* Someone else made a mount here whilst we were busy */
931 return 0;
932 case 0:
625 dput(path->dentry); 933 dput(path->dentry);
626 if (res) 934 if (*need_mntput)
627 mntput(path->mnt); 935 mntput(path->mnt);
936 path->mnt = mnt;
937 path->dentry = dget(mnt->mnt_root);
938 *need_mntput = true;
939 return 0;
940 default:
941 return err;
942 }
943
944}
945
946/*
947 * Handle a dentry that is managed in some way.
948 * - Flagged for transit management (autofs)
949 * - Flagged as mountpoint
950 * - Flagged as automount point
951 *
 952 * This may only be called in ref-walk mode.
953 *
954 * Serialization is taken care of in namespace.c
955 */
956static int follow_managed(struct path *path, unsigned flags)
957{
958 unsigned managed;
959 bool need_mntput = false;
960 int ret;
961
962 /* Given that we're not holding a lock here, we retain the value in a
963 * local variable for each dentry as we look at it so that we don't see
964 * the components of that value change under us */
965 while (managed = ACCESS_ONCE(path->dentry->d_flags),
966 managed &= DCACHE_MANAGED_DENTRY,
967 unlikely(managed != 0)) {
968 /* Allow the filesystem to manage the transit without i_mutex
969 * being held. */
970 if (managed & DCACHE_MANAGE_TRANSIT) {
971 BUG_ON(!path->dentry->d_op);
972 BUG_ON(!path->dentry->d_op->d_manage);
973 ret = path->dentry->d_op->d_manage(path->dentry,
974 false, false);
975 if (ret < 0)
976 return ret == -EISDIR ? 0 : ret;
977 }
978
979 /* Transit to a mounted filesystem. */
980 if (managed & DCACHE_MOUNTED) {
981 struct vfsmount *mounted = lookup_mnt(path);
982 if (mounted) {
983 dput(path->dentry);
984 if (need_mntput)
985 mntput(path->mnt);
986 path->mnt = mounted;
987 path->dentry = dget(mounted->mnt_root);
988 need_mntput = true;
989 continue;
990 }
991
992 /* Something is mounted on this dentry in another
993 * namespace and/or whatever was mounted there in this
994 * namespace got unmounted before we managed to get the
995 * vfsmount_lock */
996 }
997
998 /* Handle an automount point */
999 if (managed & DCACHE_NEED_AUTOMOUNT) {
1000 ret = follow_automount(path, flags, &need_mntput);
1001 if (ret < 0)
1002 return ret == -EISDIR ? 0 : ret;
1003 continue;
1004 }
1005
1006 /* We didn't change the current path point */
1007 break;
1008 }
1009 return 0;
1010}
1011
1012int follow_down_one(struct path *path)
1013{
1014 struct vfsmount *mounted;
1015
1016 mounted = lookup_mnt(path);
1017 if (mounted) {
1018 dput(path->dentry);
1019 mntput(path->mnt);
628 path->mnt = mounted; 1020 path->mnt = mounted;
629 path->dentry = dget(mounted->mnt_root); 1021 path->dentry = dget(mounted->mnt_root);
630 res = 1; 1022 return 1;
631 } 1023 }
632 return res; 1024 return 0;
633} 1025}
634 1026
635static void follow_mount(struct path *path) 1027/*
1028 * Skip to top of mountpoint pile in rcu-walk mode. We abort the rcu-walk if we
1029 * meet a managed dentry and we're not walking to "..". True is returned to
1030 * continue, false to abort.
1031 */
1032static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1033 struct inode **inode, bool reverse_transit)
636{ 1034{
637 while (d_mountpoint(path->dentry)) { 1035 while (d_mountpoint(path->dentry)) {
638 struct vfsmount *mounted = lookup_mnt(path); 1036 struct vfsmount *mounted;
1037 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
1038 !reverse_transit &&
1039 path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
1040 return false;
1041 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
639 if (!mounted) 1042 if (!mounted)
640 break; 1043 break;
641 dput(path->dentry);
642 mntput(path->mnt);
643 path->mnt = mounted; 1044 path->mnt = mounted;
644 path->dentry = dget(mounted->mnt_root); 1045 path->dentry = mounted->mnt_root;
1046 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1047 *inode = path->dentry->d_inode;
1048 }
1049
1050 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1051 return reverse_transit;
1052 return true;
1053}
1054
1055static int follow_dotdot_rcu(struct nameidata *nd)
1056{
1057 struct inode *inode = nd->inode;
1058
1059 set_root_rcu(nd);
1060
1061 while (1) {
1062 if (nd->path.dentry == nd->root.dentry &&
1063 nd->path.mnt == nd->root.mnt) {
1064 break;
1065 }
1066 if (nd->path.dentry != nd->path.mnt->mnt_root) {
1067 struct dentry *old = nd->path.dentry;
1068 struct dentry *parent = old->d_parent;
1069 unsigned seq;
1070
1071 seq = read_seqcount_begin(&parent->d_seq);
1072 if (read_seqcount_retry(&old->d_seq, nd->seq))
1073 return -ECHILD;
1074 inode = parent->d_inode;
1075 nd->path.dentry = parent;
1076 nd->seq = seq;
1077 break;
1078 }
1079 if (!follow_up_rcu(&nd->path))
1080 break;
1081 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1082 inode = nd->path.dentry->d_inode;
645 } 1083 }
1084 __follow_mount_rcu(nd, &nd->path, &inode, true);
1085 nd->inode = inode;
1086
1087 return 0;
646} 1088}
647 1089
648/* no need for dcache_lock, as serialization is taken care in 1090/*
649 * namespace.c 1091 * Follow down to the covering mount currently visible to userspace. At each
1092 * point, the filesystem owning that dentry may be queried as to whether the
1093 * caller is permitted to proceed or not.
1094 *
1095 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1096 * being true).
650 */ 1097 */
651int follow_down(struct path *path) 1098int follow_down(struct path *path, bool mounting_here)
652{ 1099{
653 struct vfsmount *mounted; 1100 unsigned managed;
1101 int ret;
654 1102
655 mounted = lookup_mnt(path); 1103 while (managed = ACCESS_ONCE(path->dentry->d_flags),
656 if (mounted) { 1104 unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1105 /* Allow the filesystem to manage the transit without i_mutex
1106 * being held.
1107 *
1108 * We indicate to the filesystem if someone is trying to mount
1109 * something here. This gives autofs the chance to deny anyone
1110 * other than its daemon the right to mount on its
1111 * superstructure.
1112 *
1113 * The filesystem may sleep at this point.
1114 */
1115 if (managed & DCACHE_MANAGE_TRANSIT) {
1116 BUG_ON(!path->dentry->d_op);
1117 BUG_ON(!path->dentry->d_op->d_manage);
1118 ret = path->dentry->d_op->d_manage(
1119 path->dentry, mounting_here, false);
1120 if (ret < 0)
1121 return ret == -EISDIR ? 0 : ret;
1122 }
1123
1124 /* Transit to a mounted filesystem. */
1125 if (managed & DCACHE_MOUNTED) {
1126 struct vfsmount *mounted = lookup_mnt(path);
1127 if (!mounted)
1128 break;
1129 dput(path->dentry);
1130 mntput(path->mnt);
1131 path->mnt = mounted;
1132 path->dentry = dget(mounted->mnt_root);
1133 continue;
1134 }
1135
1136 /* Don't handle automount points here */
1137 break;
1138 }
1139 return 0;
1140}
1141
1142/*
1143 * Skip to top of mountpoint pile in ref-walk mode for follow_dotdot()
1144 */
1145static void follow_mount(struct path *path)
1146{
1147 while (d_mountpoint(path->dentry)) {
1148 struct vfsmount *mounted = lookup_mnt(path);
1149 if (!mounted)
1150 break;
657 dput(path->dentry); 1151 dput(path->dentry);
658 mntput(path->mnt); 1152 mntput(path->mnt);
659 path->mnt = mounted; 1153 path->mnt = mounted;
660 path->dentry = dget(mounted->mnt_root); 1154 path->dentry = dget(mounted->mnt_root);
661 return 1;
662 } 1155 }
663 return 0;
664} 1156}
665 1157
666static __always_inline void follow_dotdot(struct nameidata *nd) 1158static void follow_dotdot(struct nameidata *nd)
667{ 1159{
668 set_root(nd); 1160 set_root(nd);
669 1161
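
follow_managed() and follow_down() above give filesystems two hooks on DCACHE_MANAGED_DENTRY dentries: ->d_manage() to gate transit (it may sleep, and -EISDIR means "stop here, the path as it stands is the answer") and ->d_automount() to produce a vfsmount on demand. A hedged sketch of a d_manage implementation honouring both the rcu_walk and mounting_here arguments, where foofs_wait_ready is a placeholder:

static int foofs_d_manage(struct dentry *dentry, bool mounting_here,
			  bool rcu_walk)
{
	if (rcu_walk)
		return -ECHILD;		/* we may sleep; retry in ref-walk */
	if (mounting_here)
		return 0;		/* never block a mount on top of us */
	foofs_wait_ready(dentry);	/* may sleep until transit is allowed */
	return 0;
}

static const struct dentry_operations foofs_dentry_ops = {
	.d_manage	= foofs_d_manage,
};
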
@@ -684,6 +1176,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
684 break; 1176 break;
685 } 1177 }
686 follow_mount(&nd->path); 1178 follow_mount(&nd->path);
1179 nd->inode = nd->path.dentry->d_inode;
687} 1180}
688 1181
689/* 1182/*
@@ -721,17 +1214,19 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
721 * It _is_ time-critical. 1214 * It _is_ time-critical.
722 */ 1215 */
723static int do_lookup(struct nameidata *nd, struct qstr *name, 1216static int do_lookup(struct nameidata *nd, struct qstr *name,
724 struct path *path) 1217 struct path *path, struct inode **inode)
725{ 1218{
726 struct vfsmount *mnt = nd->path.mnt; 1219 struct vfsmount *mnt = nd->path.mnt;
727 struct dentry *dentry, *parent; 1220 struct dentry *dentry, *parent = nd->path.dentry;
728 struct inode *dir; 1221 struct inode *dir;
1222 int err;
1223
729 /* 1224 /*
730 * See if the low-level filesystem might want 1225 * See if the low-level filesystem might want
731 * to use its own hash.. 1226 * to use its own hash..
732 */ 1227 */
733 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { 1228 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
734 int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name); 1229 err = parent->d_op->d_hash(parent, nd->inode, name);
735 if (err < 0) 1230 if (err < 0)
736 return err; 1231 return err;
737 } 1232 }
@@ -741,21 +1236,52 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
741 * of a false negative due to a concurrent rename, we're going to 1236 * of a false negative due to a concurrent rename, we're going to
742 * do the non-racy lookup, below. 1237 * do the non-racy lookup, below.
743 */ 1238 */
744 dentry = __d_lookup(nd->path.dentry, name); 1239 if (nd->flags & LOOKUP_RCU) {
1240 unsigned seq;
1241
1242 *inode = nd->inode;
1243 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1244 if (!dentry) {
1245 if (nameidata_drop_rcu(nd))
1246 return -ECHILD;
1247 goto need_lookup;
1248 }
1249 /* Memory barrier in read_seqcount_begin of child is enough */
1250 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1251 return -ECHILD;
1252
1253 nd->seq = seq;
1254 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1255 goto need_revalidate;
1256done2:
1257 path->mnt = mnt;
1258 path->dentry = dentry;
1259 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1260 return 0;
1261 if (nameidata_drop_rcu(nd))
1262 return -ECHILD;
1263 /* fallthru */
1264 }
1265 dentry = __d_lookup(parent, name);
745 if (!dentry) 1266 if (!dentry)
746 goto need_lookup; 1267 goto need_lookup;
747found: 1268found:
748 if (dentry->d_op && dentry->d_op->d_revalidate) 1269 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
749 goto need_revalidate; 1270 goto need_revalidate;
750done: 1271done:
751 path->mnt = mnt; 1272 path->mnt = mnt;
752 path->dentry = dentry; 1273 path->dentry = dentry;
753 __follow_mount(path); 1274 err = follow_managed(path, nd->flags);
1275 if (unlikely(err < 0)) {
1276 path_put_conditional(path, nd);
1277 return err;
1278 }
1279 *inode = path->dentry->d_inode;
754 return 0; 1280 return 0;
755 1281
756need_lookup: 1282need_lookup:
757 parent = nd->path.dentry;
758 dir = parent->d_inode; 1283 dir = parent->d_inode;
1284 BUG_ON(nd->inode != dir);
759 1285
760 mutex_lock(&dir->i_mutex); 1286 mutex_lock(&dir->i_mutex);
761 /* 1287 /*
@@ -789,6 +1315,8 @@ need_revalidate:
789 goto need_lookup; 1315 goto need_lookup;
790 if (IS_ERR(dentry)) 1316 if (IS_ERR(dentry))
791 goto fail; 1317 goto fail;
1318 if (nd->flags & LOOKUP_RCU)
1319 goto done2;
792 goto done; 1320 goto done;
793 1321
794fail: 1322fail:
@@ -796,17 +1324,6 @@ fail:
796} 1324}
797 1325
798/* 1326/*
799 * This is a temporary kludge to deal with "automount" symlinks; proper
800 * solution is to trigger them on follow_mount(), so that do_lookup()
801 * would DTRT. To be killed before 2.6.34-final.
802 */
803static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
804{
805 return inode && unlikely(inode->i_op->follow_link) &&
806 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
807}
808
809/*
810 * Name resolution. 1327 * Name resolution.
811 * This is the basic name resolution function, turning a pathname into 1328 * This is the basic name resolution function, turning a pathname into
812 * the final dentry. We expect 'base' to be positive and a directory. 1329 * the final dentry. We expect 'base' to be positive and a directory.
@@ -817,7 +1334,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
817static int link_path_walk(const char *name, struct nameidata *nd) 1334static int link_path_walk(const char *name, struct nameidata *nd)
818{ 1335{
819 struct path next; 1336 struct path next;
820 struct inode *inode;
821 int err; 1337 int err;
822 unsigned int lookup_flags = nd->flags; 1338 unsigned int lookup_flags = nd->flags;
823 1339
@@ -826,18 +1342,28 @@ static int link_path_walk(const char *name, struct nameidata *nd)
826 if (!*name) 1342 if (!*name)
827 goto return_reval; 1343 goto return_reval;
828 1344
829 inode = nd->path.dentry->d_inode;
830 if (nd->depth) 1345 if (nd->depth)
831 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); 1346 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
832 1347
833 /* At this point we know we have a real path component. */ 1348 /* At this point we know we have a real path component. */
834 for(;;) { 1349 for(;;) {
1350 struct inode *inode;
835 unsigned long hash; 1351 unsigned long hash;
836 struct qstr this; 1352 struct qstr this;
837 unsigned int c; 1353 unsigned int c;
838 1354
839 nd->flags |= LOOKUP_CONTINUE; 1355 nd->flags |= LOOKUP_CONTINUE;
840 err = exec_permission(inode); 1356 if (nd->flags & LOOKUP_RCU) {
1357 err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1358 if (err == -ECHILD) {
1359 if (nameidata_drop_rcu(nd))
1360 return -ECHILD;
1361 goto exec_again;
1362 }
1363 } else {
1364exec_again:
1365 err = exec_permission(nd->inode, 0);
1366 }
841 if (err) 1367 if (err)
842 break; 1368 break;
843 1369
@@ -868,37 +1394,44 @@ static int link_path_walk(const char *name, struct nameidata *nd)
868 if (this.name[0] == '.') switch (this.len) { 1394 if (this.name[0] == '.') switch (this.len) {
869 default: 1395 default:
870 break; 1396 break;
871 case 2: 1397 case 2:
872 if (this.name[1] != '.') 1398 if (this.name[1] != '.')
873 break; 1399 break;
874 follow_dotdot(nd); 1400 if (nd->flags & LOOKUP_RCU) {
875 inode = nd->path.dentry->d_inode; 1401 if (follow_dotdot_rcu(nd))
1402 return -ECHILD;
1403 } else
1404 follow_dotdot(nd);
876 /* fallthrough */ 1405 /* fallthrough */
877 case 1: 1406 case 1:
878 continue; 1407 continue;
879 } 1408 }
880 /* This does the actual lookups.. */ 1409 /* This does the actual lookups.. */
881 err = do_lookup(nd, &this, &next); 1410 err = do_lookup(nd, &this, &next, &inode);
882 if (err) 1411 if (err)
883 break; 1412 break;
884
885 err = -ENOENT; 1413 err = -ENOENT;
886 inode = next.dentry->d_inode;
887 if (!inode) 1414 if (!inode)
888 goto out_dput; 1415 goto out_dput;
889 1416
890 if (inode->i_op->follow_link) { 1417 if (inode->i_op->follow_link) {
1418 /* We commonly drop rcu-walk here */
1419 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1420 return -ECHILD;
1421 BUG_ON(inode != next.dentry->d_inode);
891 err = do_follow_link(&next, nd); 1422 err = do_follow_link(&next, nd);
892 if (err) 1423 if (err)
893 goto return_err; 1424 goto return_err;
1425 nd->inode = nd->path.dentry->d_inode;
894 err = -ENOENT; 1426 err = -ENOENT;
895 inode = nd->path.dentry->d_inode; 1427 if (!nd->inode)
896 if (!inode)
897 break; 1428 break;
898 } else 1429 } else {
899 path_to_nameidata(&next, nd); 1430 path_to_nameidata(&next, nd);
1431 nd->inode = inode;
1432 }
900 err = -ENOTDIR; 1433 err = -ENOTDIR;
901 if (!inode->i_op->lookup) 1434 if (!nd->inode->i_op->lookup)
902 break; 1435 break;
903 continue; 1436 continue;
904 /* here ends the main loop */ 1437 /* here ends the main loop */
@@ -913,32 +1446,40 @@ last_component:
913 if (this.name[0] == '.') switch (this.len) { 1446 if (this.name[0] == '.') switch (this.len) {
914 default: 1447 default:
915 break; 1448 break;
916 case 2: 1449 case 2:
917 if (this.name[1] != '.') 1450 if (this.name[1] != '.')
918 break; 1451 break;
919 follow_dotdot(nd); 1452 if (nd->flags & LOOKUP_RCU) {
920 inode = nd->path.dentry->d_inode; 1453 if (follow_dotdot_rcu(nd))
1454 return -ECHILD;
1455 } else
1456 follow_dotdot(nd);
921 /* fallthrough */ 1457 /* fallthrough */
922 case 1: 1458 case 1:
923 goto return_reval; 1459 goto return_reval;
924 } 1460 }
925 err = do_lookup(nd, &this, &next); 1461 err = do_lookup(nd, &this, &next, &inode);
926 if (err) 1462 if (err)
927 break; 1463 break;
928 inode = next.dentry->d_inode; 1464 if (inode && unlikely(inode->i_op->follow_link) &&
929 if (follow_on_final(inode, lookup_flags)) { 1465 (lookup_flags & LOOKUP_FOLLOW)) {
1466 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1467 return -ECHILD;
1468 BUG_ON(inode != next.dentry->d_inode);
930 err = do_follow_link(&next, nd); 1469 err = do_follow_link(&next, nd);
931 if (err) 1470 if (err)
932 goto return_err; 1471 goto return_err;
933 inode = nd->path.dentry->d_inode; 1472 nd->inode = nd->path.dentry->d_inode;
934 } else 1473 } else {
935 path_to_nameidata(&next, nd); 1474 path_to_nameidata(&next, nd);
1475 nd->inode = inode;
1476 }
936 err = -ENOENT; 1477 err = -ENOENT;
937 if (!inode) 1478 if (!nd->inode)
938 break; 1479 break;
939 if (lookup_flags & LOOKUP_DIRECTORY) { 1480 if (lookup_flags & LOOKUP_DIRECTORY) {
940 err = -ENOTDIR; 1481 err = -ENOTDIR;
941 if (!inode->i_op->lookup) 1482 if (!nd->inode->i_op->lookup)
942 break; 1483 break;
943 } 1484 }
944 goto return_base; 1485 goto return_base;
@@ -958,25 +1499,43 @@ return_reval:
958 * We bypassed the ordinary revalidation routines. 1499 * We bypassed the ordinary revalidation routines.
959 * We may need to check the cached dentry for staleness. 1500 * We may need to check the cached dentry for staleness.
960 */ 1501 */
961 if (nd->path.dentry && nd->path.dentry->d_sb && 1502 if (need_reval_dot(nd->path.dentry)) {
962 (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
963 err = -ESTALE;
964 /* Note: we do not d_invalidate() */ 1503 /* Note: we do not d_invalidate() */
965 if (!nd->path.dentry->d_op->d_revalidate( 1504 err = d_revalidate(nd->path.dentry, nd);
966 nd->path.dentry, nd)) 1505 if (!err)
1506 err = -ESTALE;
1507 if (err < 0)
967 break; 1508 break;
968 } 1509 }
969return_base: 1510return_base:
1511 if (nameidata_drop_rcu_last_maybe(nd))
1512 return -ECHILD;
970 return 0; 1513 return 0;
971out_dput: 1514out_dput:
972 path_put_conditional(&next, nd); 1515 if (!(nd->flags & LOOKUP_RCU))
1516 path_put_conditional(&next, nd);
973 break; 1517 break;
974 } 1518 }
975 path_put(&nd->path); 1519 if (!(nd->flags & LOOKUP_RCU))
1520 path_put(&nd->path);
976return_err: 1521return_err:
977 return err; 1522 return err;
978} 1523}
979 1524
1525static inline int path_walk_rcu(const char *name, struct nameidata *nd)
1526{
1527 current->total_link_count = 0;
1528
1529 return link_path_walk(name, nd);
1530}
1531
1532static inline int path_walk_simple(const char *name, struct nameidata *nd)
1533{
1534 current->total_link_count = 0;
1535
1536 return link_path_walk(name, nd);
1537}
1538
980static int path_walk(const char *name, struct nameidata *nd) 1539static int path_walk(const char *name, struct nameidata *nd)
981{ 1540{
982 struct path save = nd->path; 1541 struct path save = nd->path;
@@ -1002,6 +1561,93 @@ static int path_walk(const char *name, struct nameidata *nd)
1002 return result; 1561 return result;
1003} 1562}
1004 1563
1564static void path_finish_rcu(struct nameidata *nd)
1565{
1566 if (nd->flags & LOOKUP_RCU) {
1567 /* RCU dangling. Cancel it. */
1568 nd->flags &= ~LOOKUP_RCU;
1569 nd->root.mnt = NULL;
1570 rcu_read_unlock();
1571 br_read_unlock(vfsmount_lock);
1572 }
1573 if (nd->file)
1574 fput(nd->file);
1575}
1576
1577static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1578{
1579 int retval = 0;
1580 int fput_needed;
1581 struct file *file;
1582
1583 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1584 nd->flags = flags | LOOKUP_RCU;
1585 nd->depth = 0;
1586 nd->root.mnt = NULL;
1587 nd->file = NULL;
1588
1589 if (*name=='/') {
1590 struct fs_struct *fs = current->fs;
1591 unsigned seq;
1592
1593 br_read_lock(vfsmount_lock);
1594 rcu_read_lock();
1595
1596 do {
1597 seq = read_seqcount_begin(&fs->seq);
1598 nd->root = fs->root;
1599 nd->path = nd->root;
1600 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1601 } while (read_seqcount_retry(&fs->seq, seq));
1602
1603 } else if (dfd == AT_FDCWD) {
1604 struct fs_struct *fs = current->fs;
1605 unsigned seq;
1606
1607 br_read_lock(vfsmount_lock);
1608 rcu_read_lock();
1609
1610 do {
1611 seq = read_seqcount_begin(&fs->seq);
1612 nd->path = fs->pwd;
1613 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1614 } while (read_seqcount_retry(&fs->seq, seq));
1615
1616 } else {
1617 struct dentry *dentry;
1618
1619 file = fget_light(dfd, &fput_needed);
1620 retval = -EBADF;
1621 if (!file)
1622 goto out_fail;
1623
1624 dentry = file->f_path.dentry;
1625
1626 retval = -ENOTDIR;
1627 if (!S_ISDIR(dentry->d_inode->i_mode))
1628 goto fput_fail;
1629
1630 retval = file_permission(file, MAY_EXEC);
1631 if (retval)
1632 goto fput_fail;
1633
1634 nd->path = file->f_path;
1635 if (fput_needed)
1636 nd->file = file;
1637
1638 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1639 br_read_lock(vfsmount_lock);
1640 rcu_read_lock();
1641 }
1642 nd->inode = nd->path.dentry->d_inode;
1643 return 0;
1644
1645fput_fail:
1646 fput_light(file, fput_needed);
1647out_fail:
1648 return retval;
1649}
1650
1005static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1651static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1006{ 1652{
1007 int retval = 0; 1653 int retval = 0;
@@ -1042,6 +1688,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
1042 1688
1043 fput_light(file, fput_needed); 1689 fput_light(file, fput_needed);
1044 } 1690 }
1691 nd->inode = nd->path.dentry->d_inode;
1045 return 0; 1692 return 0;
1046 1693
1047fput_fail: 1694fput_fail:
@@ -1054,16 +1701,53 @@ out_fail:
1054static int do_path_lookup(int dfd, const char *name, 1701static int do_path_lookup(int dfd, const char *name,
1055 unsigned int flags, struct nameidata *nd) 1702 unsigned int flags, struct nameidata *nd)
1056{ 1703{
1057 int retval = path_init(dfd, name, flags, nd); 1704 int retval;
1058 if (!retval) 1705
1059 retval = path_walk(name, nd); 1706 /*
1060 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1707 * Path walking is largely split up into 2 different synchronisation
1061 nd->path.dentry->d_inode)) 1708 * schemes, rcu-walk and ref-walk (explained in
1062 audit_inode(name, nd->path.dentry); 1709 * Documentation/filesystems/path-lookup.txt). These share much of the
1710 * path walk code, but some things particularly setup, cleanup, and
1711 * following mounts are sufficiently divergent that functions are
1712 * duplicated. Typically there is a function foo(), and its RCU
1713 * analogue, foo_rcu().
1714 *
1715 * -ECHILD is the error number of choice (just to avoid clashes) that
1716 * is returned if some aspect of an rcu-walk fails. Such an error must
1717 * be handled by restarting a traditional ref-walk (which will always
1718 * be able to complete).
1719 */
1720 retval = path_init_rcu(dfd, name, flags, nd);
1721 if (unlikely(retval))
1722 return retval;
1723 retval = path_walk_rcu(name, nd);
1724 path_finish_rcu(nd);
1063 if (nd->root.mnt) { 1725 if (nd->root.mnt) {
1064 path_put(&nd->root); 1726 path_put(&nd->root);
1065 nd->root.mnt = NULL; 1727 nd->root.mnt = NULL;
1066 } 1728 }
1729
1730 if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
1731 /* slower, locked walk */
1732 if (retval == -ESTALE)
1733 flags |= LOOKUP_REVAL;
1734 retval = path_init(dfd, name, flags, nd);
1735 if (unlikely(retval))
1736 return retval;
1737 retval = path_walk(name, nd);
1738 if (nd->root.mnt) {
1739 path_put(&nd->root);
1740 nd->root.mnt = NULL;
1741 }
1742 }
1743
1744 if (likely(!retval)) {
1745 if (unlikely(!audit_dummy_context())) {
1746 if (nd->path.dentry && nd->inode)
1747 audit_inode(name, nd->path.dentry);
1748 }
1749 }
1750
1067 return retval; 1751 return retval;
1068} 1752}
1069 1753
@@ -1106,10 +1790,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1106 path_get(&nd->path); 1790 path_get(&nd->path);
1107 nd->root = nd->path; 1791 nd->root = nd->path;
1108 path_get(&nd->root); 1792 path_get(&nd->root);
1793 nd->inode = nd->path.dentry->d_inode;
1109 1794
1110 retval = path_walk(name, nd); 1795 retval = path_walk(name, nd);
1111 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1796 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1112 nd->path.dentry->d_inode)) 1797 nd->inode))
1113 audit_inode(name, nd->path.dentry); 1798 audit_inode(name, nd->path.dentry);
1114 1799
1115 path_put(&nd->root); 1800 path_put(&nd->root);
@@ -1121,18 +1806,20 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1121static struct dentry *__lookup_hash(struct qstr *name, 1806static struct dentry *__lookup_hash(struct qstr *name,
1122 struct dentry *base, struct nameidata *nd) 1807 struct dentry *base, struct nameidata *nd)
1123{ 1808{
1809 struct inode *inode = base->d_inode;
1124 struct dentry *dentry; 1810 struct dentry *dentry;
1125 struct inode *inode;
1126 int err; 1811 int err;
1127 1812
1128 inode = base->d_inode; 1813 err = exec_permission(inode, 0);
1814 if (err)
1815 return ERR_PTR(err);
1129 1816
1130 /* 1817 /*
1131 * See if the low-level filesystem might want 1818 * See if the low-level filesystem might want
1132 * to use its own hash.. 1819 * to use its own hash..
1133 */ 1820 */
1134 if (base->d_op && base->d_op->d_hash) { 1821 if (base->d_flags & DCACHE_OP_HASH) {
1135 err = base->d_op->d_hash(base, name); 1822 err = base->d_op->d_hash(base, inode, name);
1136 dentry = ERR_PTR(err); 1823 dentry = ERR_PTR(err);
1137 if (err < 0) 1824 if (err < 0)
1138 goto out; 1825 goto out;
@@ -1145,7 +1832,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1145 */ 1832 */
1146 dentry = d_lookup(base, name); 1833 dentry = d_lookup(base, name);
1147 1834
1148 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 1835 if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
1149 dentry = do_revalidate(dentry, nd); 1836 dentry = do_revalidate(dentry, nd);
1150 1837
1151 if (!dentry) 1838 if (!dentry)
@@ -1161,11 +1848,6 @@ out:
1161 */ 1848 */
1162static struct dentry *lookup_hash(struct nameidata *nd) 1849static struct dentry *lookup_hash(struct nameidata *nd)
1163{ 1850{
1164 int err;
1165
1166 err = exec_permission(nd->path.dentry->d_inode);
1167 if (err)
1168 return ERR_PTR(err);
1169 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1851 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1170} 1852}
1171 1853
@@ -1213,9 +1895,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1213 if (err) 1895 if (err)
1214 return ERR_PTR(err); 1896 return ERR_PTR(err);
1215 1897
1216 err = exec_permission(base->d_inode);
1217 if (err)
1218 return ERR_PTR(err);
1219 return __lookup_hash(&this, base, NULL); 1898 return __lookup_hash(&this, base, NULL);
1220} 1899}
1221 1900
@@ -1454,8 +2133,9 @@ int may_open(struct path *path, int acc_mode, int flag)
1454 return break_lease(inode, flag); 2133 return break_lease(inode, flag);
1455} 2134}
1456 2135
1457static int handle_truncate(struct path *path) 2136static int handle_truncate(struct file *filp)
1458{ 2137{
2138 struct path *path = &filp->f_path;
1459 struct inode *inode = path->dentry->d_inode; 2139 struct inode *inode = path->dentry->d_inode;
1460 int error = get_write_access(inode); 2140 int error = get_write_access(inode);
1461 if (error) 2141 if (error)
@@ -1469,7 +2149,7 @@ static int handle_truncate(struct path *path)
1469 if (!error) { 2149 if (!error) {
1470 error = do_truncate(path->dentry, 0, 2150 error = do_truncate(path->dentry, 0,
1471 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 2151 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1472 NULL); 2152 filp);
1473 } 2153 }
1474 put_write_access(inode); 2154 put_write_access(inode);
1475 return error; 2155 return error;
@@ -1496,6 +2176,7 @@ out_unlock:
1496 mutex_unlock(&dir->d_inode->i_mutex); 2176 mutex_unlock(&dir->d_inode->i_mutex);
1497 dput(nd->path.dentry); 2177 dput(nd->path.dentry);
1498 nd->path.dentry = path->dentry; 2178 nd->path.dentry = path->dentry;
2179
1499 if (error) 2180 if (error)
1500 return error; 2181 return error;
1501 /* Don't check for write permission, don't truncate */ 2182 /* Don't check for write permission, don't truncate */
@@ -1566,7 +2247,7 @@ static struct file *finish_open(struct nameidata *nd,
1566 } 2247 }
1567 if (!IS_ERR(filp)) { 2248 if (!IS_ERR(filp)) {
1568 if (will_truncate) { 2249 if (will_truncate) {
1569 error = handle_truncate(&nd->path); 2250 error = handle_truncate(filp);
1570 if (error) { 2251 if (error) {
1571 fput(filp); 2252 fput(filp);
1572 filp = ERR_PTR(error); 2253 filp = ERR_PTR(error);
@@ -1580,6 +2261,7 @@ static struct file *finish_open(struct nameidata *nd,
1580 */ 2261 */
1581 if (will_truncate) 2262 if (will_truncate)
1582 mnt_drop_write(nd->path.mnt); 2263 mnt_drop_write(nd->path.mnt);
2264 path_put(&nd->path);
1583 return filp; 2265 return filp;
1584 2266
1585exit: 2267exit:
@@ -1589,6 +2271,9 @@ exit:
1589 return ERR_PTR(error); 2271 return ERR_PTR(error);
1590} 2272}
1591 2273
2274/*
2275 * Handle O_CREAT case for do_filp_open
2276 */
1592static struct file *do_last(struct nameidata *nd, struct path *path, 2277static struct file *do_last(struct nameidata *nd, struct path *path,
1593 int open_flag, int acc_mode, 2278 int open_flag, int acc_mode,
1594 int mode, const char *pathname) 2279 int mode, const char *pathname)
@@ -1602,50 +2287,27 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1602 follow_dotdot(nd); 2287 follow_dotdot(nd);
1603 dir = nd->path.dentry; 2288 dir = nd->path.dentry;
1604 case LAST_DOT: 2289 case LAST_DOT:
1605 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 2290 if (need_reval_dot(dir)) {
1606 if (!dir->d_op->d_revalidate(dir, nd)) { 2291 int status = d_revalidate(nd->path.dentry, nd);
1607 error = -ESTALE; 2292 if (!status)
2293 status = -ESTALE;
2294 if (status < 0) {
2295 error = status;
1608 goto exit; 2296 goto exit;
1609 } 2297 }
1610 } 2298 }
1611 /* fallthrough */ 2299 /* fallthrough */
1612 case LAST_ROOT: 2300 case LAST_ROOT:
1613 if (open_flag & O_CREAT) 2301 goto exit;
1614 goto exit;
1615 /* fallthrough */
1616 case LAST_BIND: 2302 case LAST_BIND:
1617 audit_inode(pathname, dir); 2303 audit_inode(pathname, dir);
1618 goto ok; 2304 goto ok;
1619 } 2305 }
1620 2306
1621 /* trailing slashes? */ 2307 /* trailing slashes? */
1622 if (nd->last.name[nd->last.len]) { 2308 if (nd->last.name[nd->last.len])
1623 if (open_flag & O_CREAT) 2309 goto exit;
1624 goto exit;
1625 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1626 }
1627
1628 /* just plain open? */
1629 if (!(open_flag & O_CREAT)) {
1630 error = do_lookup(nd, &nd->last, path);
1631 if (error)
1632 goto exit;
1633 error = -ENOENT;
1634 if (!path->dentry->d_inode)
1635 goto exit_dput;
1636 if (path->dentry->d_inode->i_op->follow_link)
1637 return NULL;
1638 error = -ENOTDIR;
1639 if (nd->flags & LOOKUP_DIRECTORY) {
1640 if (!path->dentry->d_inode->i_op->lookup)
1641 goto exit_dput;
1642 }
1643 path_to_nameidata(path, nd);
1644 audit_inode(pathname, nd->path.dentry);
1645 goto ok;
1646 }
1647 2310
1648 /* OK, it's O_CREAT */
1649 mutex_lock(&dir->d_inode->i_mutex); 2311 mutex_lock(&dir->d_inode->i_mutex);
1650 2312
1651 path->dentry = lookup_hash(nd); 2313 path->dentry = lookup_hash(nd);
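
In the reworked LAST_DOT branch above, a d_revalidate() result is folded into an errno: zero (the dentry is no longer valid) becomes -ESTALE, and negative results pass through unchanged. A minimal sketch of that mapping; the helper name is illustrative, not a kernel function:

#include <errno.h>

/* Fold a ->d_revalidate()-style result into an error code: 0 means
 * "stale", negative values are already errors, positive means valid. */
static int reval_to_errno(int status)
{
	if (status == 0)
		return -ESTALE;
	return status < 0 ? status : 0;
}
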
@@ -1681,6 +2343,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1681 } 2343 }
1682 filp = nameidata_to_filp(nd); 2344 filp = nameidata_to_filp(nd);
1683 mnt_drop_write(nd->path.mnt); 2345 mnt_drop_write(nd->path.mnt);
2346 path_put(&nd->path);
1684 if (!IS_ERR(filp)) { 2347 if (!IS_ERR(filp)) {
1685 error = ima_file_check(filp, acc_mode); 2348 error = ima_file_check(filp, acc_mode);
1686 if (error) { 2349 if (error) {
@@ -1701,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1701 if (open_flag & O_EXCL) 2364 if (open_flag & O_EXCL)
1702 goto exit_dput; 2365 goto exit_dput;
1703 2366
1704 if (__follow_mount(path)) { 2367 error = follow_managed(path, nd->flags);
1705 error = -ELOOP; 2368 if (error < 0)
1706 if (open_flag & O_NOFOLLOW) 2369 goto exit_dput;
1707 goto exit_dput;
1708 }
1709 2370
1710 error = -ENOENT; 2371 error = -ENOENT;
1711 if (!path->dentry->d_inode) 2372 if (!path->dentry->d_inode)
@@ -1715,8 +2376,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1715 return NULL; 2376 return NULL;
1716 2377
1717 path_to_nameidata(path, nd); 2378 path_to_nameidata(path, nd);
2379 nd->inode = path->dentry->d_inode;
1718 error = -EISDIR; 2380 error = -EISDIR;
1719 if (S_ISDIR(path->dentry->d_inode->i_mode)) 2381 if (S_ISDIR(nd->inode->i_mode))
1720 goto exit; 2382 goto exit;
1721ok: 2383ok:
1722 filp = finish_open(nd, open_flag, acc_mode); 2384 filp = finish_open(nd, open_flag, acc_mode);
@@ -1747,11 +2409,14 @@ struct file *do_filp_open(int dfd, const char *pathname,
1747 struct path path; 2409 struct path path;
1748 int count = 0; 2410 int count = 0;
1749 int flag = open_to_namei_flags(open_flag); 2411 int flag = open_to_namei_flags(open_flag);
1750 int force_reval = 0; 2412 int flags;
1751 2413
1752 if (!(open_flag & O_CREAT)) 2414 if (!(open_flag & O_CREAT))
1753 mode = 0; 2415 mode = 0;
1754 2416
2417 /* Must never be set by userspace */
2418 open_flag &= ~FMODE_NONOTIFY;
2419
1755 /* 2420 /*
1756 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only 2421 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
 1757 * check for O_DSYNC if they need any syncing at all we enforce it's 2422 * check for O_DSYNC if they need any syncing at all we enforce it's
@@ -1773,54 +2438,84 @@ struct file *do_filp_open(int dfd, const char *pathname,
1773 if (open_flag & O_APPEND) 2438 if (open_flag & O_APPEND)
1774 acc_mode |= MAY_APPEND; 2439 acc_mode |= MAY_APPEND;
1775 2440
1776 /* find the parent */ 2441 flags = LOOKUP_OPEN;
1777reval: 2442 if (open_flag & O_CREAT) {
1778 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 2443 flags |= LOOKUP_CREATE;
2444 if (open_flag & O_EXCL)
2445 flags |= LOOKUP_EXCL;
2446 }
2447 if (open_flag & O_DIRECTORY)
2448 flags |= LOOKUP_DIRECTORY;
2449 if (!(open_flag & O_NOFOLLOW))
2450 flags |= LOOKUP_FOLLOW;
2451
2452 filp = get_empty_filp();
2453 if (!filp)
2454 return ERR_PTR(-ENFILE);
2455
2456 filp->f_flags = open_flag;
2457 nd.intent.open.file = filp;
2458 nd.intent.open.flags = flag;
2459 nd.intent.open.create_mode = mode;
2460
2461 if (open_flag & O_CREAT)
2462 goto creat;
2463
2464 /* !O_CREAT, simple open */
2465 error = do_path_lookup(dfd, pathname, flags, &nd);
2466 if (unlikely(error))
2467 goto out_filp;
2468 error = -ELOOP;
2469 if (!(nd.flags & LOOKUP_FOLLOW)) {
2470 if (nd.inode->i_op->follow_link)
2471 goto out_path;
2472 }
2473 error = -ENOTDIR;
2474 if (nd.flags & LOOKUP_DIRECTORY) {
2475 if (!nd.inode->i_op->lookup)
2476 goto out_path;
2477 }
2478 audit_inode(pathname, nd.path.dentry);
2479 filp = finish_open(&nd, open_flag, acc_mode);
2480 return filp;
2481
2482creat:
2483 /* OK, have to create the file. Find the parent. */
2484 error = path_init_rcu(dfd, pathname,
2485 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
1779 if (error) 2486 if (error)
1780 return ERR_PTR(error); 2487 goto out_filp;
1781 if (force_reval) 2488 error = path_walk_rcu(pathname, &nd);
1782 nd.flags |= LOOKUP_REVAL; 2489 path_finish_rcu(&nd);
2490 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2491 /* slower, locked walk */
2492 if (error == -ESTALE) {
2493reval:
2494 flags |= LOOKUP_REVAL;
2495 }
2496 error = path_init(dfd, pathname,
2497 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2498 if (error)
2499 goto out_filp;
1783 2500
1784 current->total_link_count = 0; 2501 error = path_walk_simple(pathname, &nd);
1785 error = link_path_walk(pathname, &nd);
1786 if (error) {
1787 filp = ERR_PTR(error);
1788 goto out;
1789 } 2502 }
1790 if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) 2503 if (unlikely(error))
2504 goto out_filp;
2505 if (unlikely(!audit_dummy_context()))
1791 audit_inode(pathname, nd.path.dentry); 2506 audit_inode(pathname, nd.path.dentry);
1792 2507
1793 /* 2508 /*
1794 * We have the parent and last component. 2509 * We have the parent and last component.
1795 */ 2510 */
1796 2511 nd.flags = flags;
1797 error = -ENFILE;
1798 filp = get_empty_filp();
1799 if (filp == NULL)
1800 goto exit_parent;
1801 nd.intent.open.file = filp;
1802 filp->f_flags = open_flag;
1803 nd.intent.open.flags = flag;
1804 nd.intent.open.create_mode = mode;
1805 nd.flags &= ~LOOKUP_PARENT;
1806 nd.flags |= LOOKUP_OPEN;
1807 if (open_flag & O_CREAT) {
1808 nd.flags |= LOOKUP_CREATE;
1809 if (open_flag & O_EXCL)
1810 nd.flags |= LOOKUP_EXCL;
1811 }
1812 if (open_flag & O_DIRECTORY)
1813 nd.flags |= LOOKUP_DIRECTORY;
1814 if (!(open_flag & O_NOFOLLOW))
1815 nd.flags |= LOOKUP_FOLLOW;
1816 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1817 while (unlikely(!filp)) { /* trailing symlink */ 2513 while (unlikely(!filp)) { /* trailing symlink */
1818 struct path holder; 2514 struct path link = path;
1819 struct inode *inode = path.dentry->d_inode; 2515 struct inode *linki = link.dentry->d_inode;
1820 void *cookie; 2516 void *cookie;
1821 error = -ELOOP; 2517 error = -ELOOP;
1822 /* S_ISDIR part is a temporary automount kludge */ 2518 if (!(nd.flags & LOOKUP_FOLLOW))
1823 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
1824 goto exit_dput; 2519 goto exit_dput;
1825 if (count++ == 32) 2520 if (count++ == 32)
1826 goto exit_dput; 2521 goto exit_dput;
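
The creat path above encodes the rcu-walk retry discipline: try the lock-free walk first, fall back to the locked walk on -ECHILD, and force revalidation before retrying on -ESTALE. A stripped-down sketch of that control flow; the walker callbacks and the REVAL bit are stand-ins, not kernel interfaces:

#include <errno.h>

#define WALK_REVAL 0x01	/* stand-in for LOOKUP_REVAL */

static int walk_with_fallback(const char *path, int flags,
			      int (*walk_rcu)(const char *, int),
			      int (*walk_locked)(const char *, int))
{
	int err = walk_rcu(path, flags);

	while (err == -ECHILD || err == -ESTALE) {
		if (err == -ESTALE)
			flags |= WALK_REVAL;	/* distrust cached state */
		err = walk_locked(path, flags);
		if (err != -ESTALE || (flags & WALK_REVAL))
			break;			/* at most one reval retry */
	}
	return err;
}
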
@@ -1836,41 +2531,37 @@ reval:
1836 * just set LAST_BIND. 2531 * just set LAST_BIND.
1837 */ 2532 */
1838 nd.flags |= LOOKUP_PARENT; 2533 nd.flags |= LOOKUP_PARENT;
1839 error = security_inode_follow_link(path.dentry, &nd); 2534 error = security_inode_follow_link(link.dentry, &nd);
1840 if (error) 2535 if (error)
1841 goto exit_dput; 2536 goto exit_dput;
1842 error = __do_follow_link(&path, &nd, &cookie); 2537 error = __do_follow_link(&link, &nd, &cookie);
1843 if (unlikely(error)) { 2538 if (unlikely(error)) {
2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2540 linki->i_op->put_link(link.dentry, &nd, cookie);
1844 /* nd.path had been dropped */ 2541 /* nd.path had been dropped */
1845 if (!IS_ERR(cookie) && inode->i_op->put_link) 2542 nd.path = link;
1846 inode->i_op->put_link(path.dentry, &nd, cookie); 2543 goto out_path;
1847 path_put(&path);
1848 release_open_intent(&nd);
1849 filp = ERR_PTR(error);
1850 goto out;
1851 } 2544 }
1852 holder = path;
1853 nd.flags &= ~LOOKUP_PARENT; 2545 nd.flags &= ~LOOKUP_PARENT;
1854 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1855 if (inode->i_op->put_link) 2547 if (linki->i_op->put_link)
1856 inode->i_op->put_link(holder.dentry, &nd, cookie); 2548 linki->i_op->put_link(link.dentry, &nd, cookie);
1857 path_put(&holder); 2549 path_put(&link);
1858 } 2550 }
1859out: 2551out:
1860 if (nd.root.mnt) 2552 if (nd.root.mnt)
1861 path_put(&nd.root); 2553 path_put(&nd.root);
1862 if (filp == ERR_PTR(-ESTALE) && !force_reval) { 2554 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
1863 force_reval = 1;
1864 goto reval; 2555 goto reval;
1865 }
1866 return filp; 2556 return filp;
1867 2557
1868exit_dput: 2558exit_dput:
1869 path_put_conditional(&path, &nd); 2559 path_put_conditional(&path, &nd);
2560out_path:
2561 path_put(&nd.path);
2562out_filp:
1870 if (!IS_ERR(nd.intent.open.file)) 2563 if (!IS_ERR(nd.intent.open.file))
1871 release_open_intent(&nd); 2564 release_open_intent(&nd);
1872exit_parent:
1873 path_put(&nd.path);
1874 filp = ERR_PTR(error); 2565 filp = ERR_PTR(error);
1875 goto out; 2566 goto out;
1876} 2567}
@@ -2131,12 +2822,10 @@ void dentry_unhash(struct dentry *dentry)
2131{ 2822{
2132 dget(dentry); 2823 dget(dentry);
2133 shrink_dcache_parent(dentry); 2824 shrink_dcache_parent(dentry);
2134 spin_lock(&dcache_lock);
2135 spin_lock(&dentry->d_lock); 2825 spin_lock(&dentry->d_lock);
2136 if (atomic_read(&dentry->d_count) == 2) 2826 if (dentry->d_count == 2)
2137 __d_drop(dentry); 2827 __d_drop(dentry);
2138 spin_unlock(&dentry->d_lock); 2828 spin_unlock(&dentry->d_lock);
2139 spin_unlock(&dcache_lock);
2140} 2829}
2141 2830
2142int vfs_rmdir(struct inode *dir, struct dentry *dentry) 2831int vfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -2291,7 +2980,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2291 goto slashes; 2980 goto slashes;
2292 inode = dentry->d_inode; 2981 inode = dentry->d_inode;
2293 if (inode) 2982 if (inode)
2294 atomic_inc(&inode->i_count); 2983 ihold(inode);
2295 error = mnt_want_write(nd.path.mnt); 2984 error = mnt_want_write(nd.path.mnt);
2296 if (error) 2985 if (error)
2297 goto exit2; 2986 goto exit2;
@@ -2885,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = {
2885}; 3574};
2886 3575
2887EXPORT_SYMBOL(user_path_at); 3576EXPORT_SYMBOL(user_path_at);
3577EXPORT_SYMBOL(follow_down_one);
2888EXPORT_SYMBOL(follow_down); 3578EXPORT_SYMBOL(follow_down);
2889EXPORT_SYMBOL(follow_up); 3579EXPORT_SYMBOL(follow_up);
2890EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 3580EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
diff --git a/fs/namespace.c b/fs/namespace.c
index a72eaabfe8f2..7b0b95371696 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,7 +13,6 @@
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/smp_lock.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/kernel.h> 17#include <linux/kernel.h>
19#include <linux/acct.h> 18#include <linux/acct.h>
@@ -139,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt)
139 mnt->mnt_group_id = 0; 138 mnt->mnt_group_id = 0;
140} 139}
141 140
141/*
142 * vfsmount lock must be held for read
143 */
144static inline void mnt_add_count(struct vfsmount *mnt, int n)
145{
146#ifdef CONFIG_SMP
147 this_cpu_add(mnt->mnt_pcp->mnt_count, n);
148#else
149 preempt_disable();
150 mnt->mnt_count += n;
151 preempt_enable();
152#endif
153}
154
155static inline void mnt_set_count(struct vfsmount *mnt, int n)
156{
157#ifdef CONFIG_SMP
158 this_cpu_write(mnt->mnt_pcp->mnt_count, n);
159#else
160 mnt->mnt_count = n;
161#endif
162}
163
164/*
165 * vfsmount lock must be held for read
166 */
167static inline void mnt_inc_count(struct vfsmount *mnt)
168{
169 mnt_add_count(mnt, 1);
170}
171
172/*
173 * vfsmount lock must be held for read
174 */
175static inline void mnt_dec_count(struct vfsmount *mnt)
176{
177 mnt_add_count(mnt, -1);
178}
179
180/*
181 * vfsmount lock must be held for write
182 */
183unsigned int mnt_get_count(struct vfsmount *mnt)
184{
185#ifdef CONFIG_SMP
186 unsigned int count = 0;
187 int cpu;
188
189 for_each_possible_cpu(cpu) {
190 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
191 }
192
193 return count;
194#else
195 return mnt->mnt_count;
196#endif
197}
198
142struct vfsmount *alloc_vfsmnt(const char *name) 199struct vfsmount *alloc_vfsmnt(const char *name)
143{ 200{
144 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
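
mnt_add_count()/mnt_get_count() above form a distributed counter: updates touch only the local CPU's slot under the shared (read) side of the lock, and an exact total takes the exclusive (write) side so no slot can move mid-sum. A userspace model of the same pattern with pthreads; the slot index and rwlock stand in for per-cpu data and vfsmount_lock:

#include <pthread.h>

#define NR_SLOTS 64	/* stand-in for the number of CPUs */

struct pcp_counter {
	pthread_rwlock_t lock;	/* models vfsmount_lock */
	long slot[NR_SLOTS];	/* models the per-cpu mnt_count */
};

/* Cheap path: many adders run concurrently under the read lock, each
 * touching only its own slot (callers pass a unique slot index). */
static void pcp_add(struct pcp_counter *c, int self, long n)
{
	pthread_rwlock_rdlock(&c->lock);
	c->slot[self] += n;
	pthread_rwlock_unlock(&c->lock);
}

/* Exact total: the write lock excludes all adders, so the sum of the
 * slots cannot change while we scan. */
static long pcp_total(struct pcp_counter *c)
{
	long sum = 0;
	int i;

	pthread_rwlock_wrlock(&c->lock);
	for (i = 0; i < NR_SLOTS; i++)
		sum += c->slot[i];
	pthread_rwlock_unlock(&c->lock);
	return sum;
}
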
@@ -155,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name)
155 goto out_free_id; 212 goto out_free_id;
156 } 213 }
157 214
158 atomic_set(&mnt->mnt_count, 1); 215#ifdef CONFIG_SMP
216 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
217 if (!mnt->mnt_pcp)
218 goto out_free_devname;
219
220 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
221#else
222 mnt->mnt_count = 1;
223 mnt->mnt_writers = 0;
224#endif
225
159 INIT_LIST_HEAD(&mnt->mnt_hash); 226 INIT_LIST_HEAD(&mnt->mnt_hash);
160 INIT_LIST_HEAD(&mnt->mnt_child); 227 INIT_LIST_HEAD(&mnt->mnt_child);
161 INIT_LIST_HEAD(&mnt->mnt_mounts); 228 INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -167,13 +234,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)
167#ifdef CONFIG_FSNOTIFY 234#ifdef CONFIG_FSNOTIFY
168 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 235 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
169#endif 236#endif
170#ifdef CONFIG_SMP
171 mnt->mnt_writers = alloc_percpu(int);
172 if (!mnt->mnt_writers)
173 goto out_free_devname;
174#else
175 mnt->mnt_writers = 0;
176#endif
177 } 237 }
178 return mnt; 238 return mnt;
179 239
@@ -217,32 +277,32 @@ int __mnt_is_readonly(struct vfsmount *mnt)
217} 277}
218EXPORT_SYMBOL_GPL(__mnt_is_readonly); 278EXPORT_SYMBOL_GPL(__mnt_is_readonly);
219 279
220static inline void inc_mnt_writers(struct vfsmount *mnt) 280static inline void mnt_inc_writers(struct vfsmount *mnt)
221{ 281{
222#ifdef CONFIG_SMP 282#ifdef CONFIG_SMP
223 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; 283 this_cpu_inc(mnt->mnt_pcp->mnt_writers);
224#else 284#else
225 mnt->mnt_writers++; 285 mnt->mnt_writers++;
226#endif 286#endif
227} 287}
228 288
229static inline void dec_mnt_writers(struct vfsmount *mnt) 289static inline void mnt_dec_writers(struct vfsmount *mnt)
230{ 290{
231#ifdef CONFIG_SMP 291#ifdef CONFIG_SMP
232 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; 292 this_cpu_dec(mnt->mnt_pcp->mnt_writers);
233#else 293#else
234 mnt->mnt_writers--; 294 mnt->mnt_writers--;
235#endif 295#endif
236} 296}
237 297
238static unsigned int count_mnt_writers(struct vfsmount *mnt) 298static unsigned int mnt_get_writers(struct vfsmount *mnt)
239{ 299{
240#ifdef CONFIG_SMP 300#ifdef CONFIG_SMP
241 unsigned int count = 0; 301 unsigned int count = 0;
242 int cpu; 302 int cpu;
243 303
244 for_each_possible_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
245 count += *per_cpu_ptr(mnt->mnt_writers, cpu); 305 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
246 } 306 }
247 307
248 return count; 308 return count;
@@ -274,9 +334,9 @@ int mnt_want_write(struct vfsmount *mnt)
274 int ret = 0; 334 int ret = 0;
275 335
276 preempt_disable(); 336 preempt_disable();
277 inc_mnt_writers(mnt); 337 mnt_inc_writers(mnt);
278 /* 338 /*
279 * The store to inc_mnt_writers must be visible before we pass 339 * The store to mnt_inc_writers must be visible before we pass
280 * MNT_WRITE_HOLD loop below, so that the slowpath can see our 340 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
281 * incremented count after it has set MNT_WRITE_HOLD. 341 * incremented count after it has set MNT_WRITE_HOLD.
282 */ 342 */
@@ -290,7 +350,7 @@ int mnt_want_write(struct vfsmount *mnt)
290 */ 350 */
291 smp_rmb(); 351 smp_rmb();
292 if (__mnt_is_readonly(mnt)) { 352 if (__mnt_is_readonly(mnt)) {
293 dec_mnt_writers(mnt); 353 mnt_dec_writers(mnt);
294 ret = -EROFS; 354 ret = -EROFS;
295 goto out; 355 goto out;
296 } 356 }
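
The two hunks above keep the mnt_want_write() handshake intact across the rename: a prospective writer publishes its increment, spins while MNT_WRITE_HOLD is set, then re-checks read-only state, so the remount-r/o slowpath sees either the increment or the writer backing out. A C11-atomics sketch of that handshake; the fences approximate smp_mb()/smp_rmb() and all names are illustrative:

#include <stdatomic.h>

static atomic_long writers;	/* models the per-mount writer count */
static atomic_int  write_hold;	/* models MNT_WRITE_HOLD */
static atomic_int  readonly;	/* models MNT_READONLY */

static int want_write(void)
{
	atomic_fetch_add_explicit(&writers, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb() */
	while (atomic_load_explicit(&write_hold, memory_order_relaxed))
		;	/* wait for the r/o transition to decide */
	atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
	if (atomic_load_explicit(&readonly, memory_order_relaxed)) {
		atomic_fetch_sub_explicit(&writers, 1, memory_order_relaxed);
		return -1;	/* went read-only: back out (-EROFS) */
	}
	return 0;
}
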
@@ -318,7 +378,7 @@ int mnt_clone_write(struct vfsmount *mnt)
318 if (__mnt_is_readonly(mnt)) 378 if (__mnt_is_readonly(mnt))
319 return -EROFS; 379 return -EROFS;
320 preempt_disable(); 380 preempt_disable();
321 inc_mnt_writers(mnt); 381 mnt_inc_writers(mnt);
322 preempt_enable(); 382 preempt_enable();
323 return 0; 383 return 0;
324} 384}
@@ -352,7 +412,7 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
352void mnt_drop_write(struct vfsmount *mnt) 412void mnt_drop_write(struct vfsmount *mnt)
353{ 413{
354 preempt_disable(); 414 preempt_disable();
355 dec_mnt_writers(mnt); 415 mnt_dec_writers(mnt);
356 preempt_enable(); 416 preempt_enable();
357} 417}
358EXPORT_SYMBOL_GPL(mnt_drop_write); 418EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -385,7 +445,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
385 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while 445 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
386 * we're counting up here. 446 * we're counting up here.
387 */ 447 */
388 if (count_mnt_writers(mnt) > 0) 448 if (mnt_get_writers(mnt) > 0)
389 ret = -EBUSY; 449 ret = -EBUSY;
390 else 450 else
391 mnt->mnt_flags |= MNT_READONLY; 451 mnt->mnt_flags |= MNT_READONLY;
@@ -419,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt)
419 kfree(mnt->mnt_devname); 479 kfree(mnt->mnt_devname);
420 mnt_free_id(mnt); 480 mnt_free_id(mnt);
421#ifdef CONFIG_SMP 481#ifdef CONFIG_SMP
422 free_percpu(mnt->mnt_writers); 482 free_percpu(mnt->mnt_pcp);
423#endif 483#endif
424 kmem_cache_free(mnt_cache, mnt); 484 kmem_cache_free(mnt_cache, mnt);
425} 485}
@@ -493,6 +553,27 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
493} 553}
494 554
495/* 555/*
556 * Clear dentry's mounted state if it has no remaining mounts.
557 * vfsmount_lock must be held for write.
558 */
559static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
560{
561 unsigned u;
562
563 for (u = 0; u < HASH_SIZE; u++) {
564 struct vfsmount *p;
565
566 list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
567 if (p->mnt_mountpoint == dentry)
568 return;
569 }
570 }
571 spin_lock(&dentry->d_lock);
572 dentry->d_flags &= ~DCACHE_MOUNTED;
573 spin_unlock(&dentry->d_lock);
574}
575
576/*
496 * vfsmount lock must be held for write 577 * vfsmount lock must be held for write
497 */ 578 */
498static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 579static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
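
dentry_reset_mounted() above replaces the old d_mounted counter with a flag that may only be cleared after proving no entry in the mount hash still names this dentry as a mountpoint. A compact, self-contained sketch of that scan-then-clear rule (all types here are illustrative):

#define FLAG_MOUNTED 0x1	/* stand-in for DCACHE_MOUNTED */

struct mount_rec {
	struct mount_rec *next;	/* hash-chain link */
	const void *mountpoint;	/* the dentry this mount sits on */
};

/* Clear the "something is mounted here" hint only when no record in
 * any bucket still points at this dentry; the caller holds the
 * table's write-side lock, as the comment above requires. */
static void reset_mounted(struct mount_rec *const *buckets,
			  unsigned int nbuckets,
			  const void *dentry, unsigned int *d_flags)
{
	unsigned int u;

	for (u = 0; u < nbuckets; u++) {
		const struct mount_rec *p;

		for (p = buckets[u]; p; p = p->next)
			if (p->mountpoint == dentry)
				return;	/* still a mountpoint: keep it */
	}
	*d_flags &= ~FLAG_MOUNTED;
}
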
@@ -503,7 +584,7 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
503 mnt->mnt_mountpoint = mnt->mnt_root; 584 mnt->mnt_mountpoint = mnt->mnt_root;
504 list_del_init(&mnt->mnt_child); 585 list_del_init(&mnt->mnt_child);
505 list_del_init(&mnt->mnt_hash); 586 list_del_init(&mnt->mnt_hash);
506 old_path->dentry->d_mounted--; 587 dentry_reset_mounted(old_path->mnt, old_path->dentry);
507} 588}
508 589
509/* 590/*
@@ -514,7 +595,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
514{ 595{
515 child_mnt->mnt_parent = mntget(mnt); 596 child_mnt->mnt_parent = mntget(mnt);
516 child_mnt->mnt_mountpoint = dget(dentry); 597 child_mnt->mnt_mountpoint = dget(dentry);
517 dentry->d_mounted++; 598 spin_lock(&dentry->d_lock);
599 dentry->d_flags |= DCACHE_MOUNTED;
600 spin_unlock(&dentry->d_lock);
518} 601}
519 602
520/* 603/*
@@ -528,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
528 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); 611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
529} 612}
530 613
614static inline void __mnt_make_longterm(struct vfsmount *mnt)
615{
616#ifdef CONFIG_SMP
617 atomic_inc(&mnt->mnt_longterm);
618#endif
619}
620
621/* needs vfsmount lock for write */
622static inline void __mnt_make_shortterm(struct vfsmount *mnt)
623{
624#ifdef CONFIG_SMP
625 atomic_dec(&mnt->mnt_longterm);
626#endif
627}
628
531/* 629/*
532 * vfsmount lock must be held for write 630 * vfsmount lock must be held for write
533 */ 631 */
@@ -541,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt)
541 BUG_ON(parent == mnt); 639 BUG_ON(parent == mnt);
542 640
543 list_add_tail(&head, &mnt->mnt_list); 641 list_add_tail(&head, &mnt->mnt_list);
544 list_for_each_entry(m, &head, mnt_list) 642 list_for_each_entry(m, &head, mnt_list) {
545 m->mnt_ns = n; 643 m->mnt_ns = n;
644 __mnt_make_longterm(m);
645 }
646
546 list_splice(&head, n->list.prev); 647 list_splice(&head, n->list.prev);
547 648
548 list_add_tail(&mnt->mnt_hash, mount_hashtable + 649 list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -595,7 +696,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
595 goto out_free; 696 goto out_free;
596 } 697 }
597 698
598 mnt->mnt_flags = old->mnt_flags; 699 mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
599 atomic_inc(&sb->s_active); 700 atomic_inc(&sb->s_active);
600 mnt->mnt_sb = sb; 701 mnt->mnt_sb = sb;
601 mnt->mnt_root = dget(root); 702 mnt->mnt_root = dget(root);
@@ -630,9 +731,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
630 return NULL; 731 return NULL;
631} 732}
632 733
633static inline void __mntput(struct vfsmount *mnt) 734static inline void mntfree(struct vfsmount *mnt)
634{ 735{
635 struct super_block *sb = mnt->mnt_sb; 736 struct super_block *sb = mnt->mnt_sb;
737
636 /* 738 /*
637 * This probably indicates that somebody messed 739 * This probably indicates that somebody messed
638 * up a mnt_want/drop_write() pair. If this 740 * up a mnt_want/drop_write() pair. If this
@@ -640,38 +742,69 @@ static inline void __mntput(struct vfsmount *mnt)
640 * to make r/w->r/o transitions. 742 * to make r/w->r/o transitions.
641 */ 743 */
642 /* 744 /*
643 * atomic_dec_and_lock() used to deal with ->mnt_count decrements 745 * The locking used to deal with mnt_count decrement provides barriers,
644 * provides barriers, so count_mnt_writers() below is safe. AV 746 * so mnt_get_writers() below is safe.
645 */ 747 */
646 WARN_ON(count_mnt_writers(mnt)); 748 WARN_ON(mnt_get_writers(mnt));
647 fsnotify_vfsmount_delete(mnt); 749 fsnotify_vfsmount_delete(mnt);
648 dput(mnt->mnt_root); 750 dput(mnt->mnt_root);
649 free_vfsmnt(mnt); 751 free_vfsmnt(mnt);
650 deactivate_super(sb); 752 deactivate_super(sb);
651} 753}
652 754
653void mntput_no_expire(struct vfsmount *mnt) 755static void mntput_no_expire(struct vfsmount *mnt)
654{ 756{
655repeat: 757put_again:
656 if (atomic_add_unless(&mnt->mnt_count, -1, 1)) 758#ifdef CONFIG_SMP
759 br_read_lock(vfsmount_lock);
760 if (likely(atomic_read(&mnt->mnt_longterm))) {
761 mnt_dec_count(mnt);
762 br_read_unlock(vfsmount_lock);
657 return; 763 return;
764 }
765 br_read_unlock(vfsmount_lock);
766
658 br_write_lock(vfsmount_lock); 767 br_write_lock(vfsmount_lock);
659 if (!atomic_dec_and_test(&mnt->mnt_count)) { 768 mnt_dec_count(mnt);
769 if (mnt_get_count(mnt)) {
660 br_write_unlock(vfsmount_lock); 770 br_write_unlock(vfsmount_lock);
661 return; 771 return;
662 } 772 }
663 if (likely(!mnt->mnt_pinned)) { 773#else
664 br_write_unlock(vfsmount_lock); 774 mnt_dec_count(mnt);
665 __mntput(mnt); 775 if (likely(mnt_get_count(mnt)))
666 return; 776 return;
777 br_write_lock(vfsmount_lock);
778#endif
779 if (unlikely(mnt->mnt_pinned)) {
780 mnt_add_count(mnt, mnt->mnt_pinned + 1);
781 mnt->mnt_pinned = 0;
782 br_write_unlock(vfsmount_lock);
783 acct_auto_close_mnt(mnt);
784 goto put_again;
667 } 785 }
668 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
669 mnt->mnt_pinned = 0;
670 br_write_unlock(vfsmount_lock); 786 br_write_unlock(vfsmount_lock);
671 acct_auto_close_mnt(mnt); 787 mntfree(mnt);
672 goto repeat;
673} 788}
674EXPORT_SYMBOL(mntput_no_expire); 789
790void mntput(struct vfsmount *mnt)
791{
792 if (mnt) {
793 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
794 if (unlikely(mnt->mnt_expiry_mark))
795 mnt->mnt_expiry_mark = 0;
796 mntput_no_expire(mnt);
797 }
798}
799EXPORT_SYMBOL(mntput);
800
801struct vfsmount *mntget(struct vfsmount *mnt)
802{
803 if (mnt)
804 mnt_inc_count(mnt);
805 return mnt;
806}
807EXPORT_SYMBOL(mntget);
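
The mntput_no_expire() rewrite above is the payoff of the per-cpu counters: while a mount is marked long-term (held by a namespace), a put only decrements the local slot under the shared lock, and the exact is-it-zero question is deferred to the rare case where the mount can actually die. A pthread model of the two paths; types, slot indexing, and names are illustrative:

#include <pthread.h>
#include <stdbool.h>

#define NR_SLOTS 64

struct mnt_model {
	pthread_rwlock_t lock;	/* models vfsmount_lock */
	int longterm;		/* models mnt_longterm > 0 */
	long count[NR_SLOTS];	/* models the per-cpu mnt_count */
};

/* Returns true when the caller dropped the final reference and must
 * free the mount, mirroring the fast/slow split above. */
static bool mnt_put(struct mnt_model *m, int self)
{
	long sum = 0;
	int i;

	pthread_rwlock_rdlock(&m->lock);
	if (m->longterm) {		/* fast path: namespace pins it */
		m->count[self]--;
		pthread_rwlock_unlock(&m->lock);
		return false;
	}
	pthread_rwlock_unlock(&m->lock);

	pthread_rwlock_wrlock(&m->lock);	/* slow path: exact sum */
	m->count[self]--;
	for (i = 0; i < NR_SLOTS; i++)
		sum += m->count[i];
	pthread_rwlock_unlock(&m->lock);
	return sum == 0;
}
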
675 808
676void mnt_pin(struct vfsmount *mnt) 809void mnt_pin(struct vfsmount *mnt)
677{ 810{
@@ -679,19 +812,17 @@ void mnt_pin(struct vfsmount *mnt)
679 mnt->mnt_pinned++; 812 mnt->mnt_pinned++;
680 br_write_unlock(vfsmount_lock); 813 br_write_unlock(vfsmount_lock);
681} 814}
682
683EXPORT_SYMBOL(mnt_pin); 815EXPORT_SYMBOL(mnt_pin);
684 816
685void mnt_unpin(struct vfsmount *mnt) 817void mnt_unpin(struct vfsmount *mnt)
686{ 818{
687 br_write_lock(vfsmount_lock); 819 br_write_lock(vfsmount_lock);
688 if (mnt->mnt_pinned) { 820 if (mnt->mnt_pinned) {
689 atomic_inc(&mnt->mnt_count); 821 mnt_inc_count(mnt);
690 mnt->mnt_pinned--; 822 mnt->mnt_pinned--;
691 } 823 }
692 br_write_unlock(vfsmount_lock); 824 br_write_unlock(vfsmount_lock);
693} 825}
694
695EXPORT_SYMBOL(mnt_unpin); 826EXPORT_SYMBOL(mnt_unpin);
696 827
697static inline void mangle(struct seq_file *m, const char *s) 828static inline void mangle(struct seq_file *m, const char *s)
@@ -986,12 +1117,13 @@ int may_umount_tree(struct vfsmount *mnt)
986 int minimum_refs = 0; 1117 int minimum_refs = 0;
987 struct vfsmount *p; 1118 struct vfsmount *p;
988 1119
989 br_read_lock(vfsmount_lock); 1120 /* write lock needed for mnt_get_count */
1121 br_write_lock(vfsmount_lock);
990 for (p = mnt; p; p = next_mnt(p, mnt)) { 1122 for (p = mnt; p; p = next_mnt(p, mnt)) {
991 actual_refs += atomic_read(&p->mnt_count); 1123 actual_refs += mnt_get_count(p);
992 minimum_refs += 2; 1124 minimum_refs += 2;
993 } 1125 }
994 br_read_unlock(vfsmount_lock); 1126 br_write_unlock(vfsmount_lock);
995 1127
996 if (actual_refs > minimum_refs) 1128 if (actual_refs > minimum_refs)
997 return 0; 1129 return 0;
@@ -1018,10 +1150,10 @@ int may_umount(struct vfsmount *mnt)
1018{ 1150{
1019 int ret = 1; 1151 int ret = 1;
1020 down_read(&namespace_sem); 1152 down_read(&namespace_sem);
1021 br_read_lock(vfsmount_lock); 1153 br_write_lock(vfsmount_lock);
1022 if (propagate_mount_busy(mnt, 2)) 1154 if (propagate_mount_busy(mnt, 2))
1023 ret = 0; 1155 ret = 0;
1024 br_read_unlock(vfsmount_lock); 1156 br_write_unlock(vfsmount_lock);
1025 up_read(&namespace_sem); 1157 up_read(&namespace_sem);
1026 return ret; 1158 return ret;
1027} 1159}
@@ -1058,26 +1190,29 @@ void release_mounts(struct list_head *head)
1058 */ 1190 */
1059void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1191void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1060{ 1192{
1193 LIST_HEAD(tmp_list);
1061 struct vfsmount *p; 1194 struct vfsmount *p;
1062 1195
1063 for (p = mnt; p; p = next_mnt(p, mnt)) 1196 for (p = mnt; p; p = next_mnt(p, mnt))
1064 list_move(&p->mnt_hash, kill); 1197 list_move(&p->mnt_hash, &tmp_list);
1065 1198
1066 if (propagate) 1199 if (propagate)
1067 propagate_umount(kill); 1200 propagate_umount(&tmp_list);
1068 1201
1069 list_for_each_entry(p, kill, mnt_hash) { 1202 list_for_each_entry(p, &tmp_list, mnt_hash) {
1070 list_del_init(&p->mnt_expire); 1203 list_del_init(&p->mnt_expire);
1071 list_del_init(&p->mnt_list); 1204 list_del_init(&p->mnt_list);
1072 __touch_mnt_namespace(p->mnt_ns); 1205 __touch_mnt_namespace(p->mnt_ns);
1073 p->mnt_ns = NULL; 1206 p->mnt_ns = NULL;
1207 __mnt_make_shortterm(p);
1074 list_del_init(&p->mnt_child); 1208 list_del_init(&p->mnt_child);
1075 if (p->mnt_parent != p) { 1209 if (p->mnt_parent != p) {
1076 p->mnt_parent->mnt_ghosts++; 1210 p->mnt_parent->mnt_ghosts++;
1077 p->mnt_mountpoint->d_mounted--; 1211 dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
1078 } 1212 }
1079 change_mnt_propagation(p, MS_PRIVATE); 1213 change_mnt_propagation(p, MS_PRIVATE);
1080 } 1214 }
1215 list_splice(&tmp_list, kill);
1081} 1216}
1082 1217
1083static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); 1218static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1103,8 +1238,16 @@ static int do_umount(struct vfsmount *mnt, int flags)
1103 flags & (MNT_FORCE | MNT_DETACH)) 1238 flags & (MNT_FORCE | MNT_DETACH))
1104 return -EINVAL; 1239 return -EINVAL;
1105 1240
1106 if (atomic_read(&mnt->mnt_count) != 2) 1241 /*
1242 * probably don't strictly need the lock here if we examined
1243 * all race cases, but it's a slowpath.
1244 */
1245 br_write_lock(vfsmount_lock);
1246 if (mnt_get_count(mnt) != 2) {
 1247 br_write_unlock(vfsmount_lock);
1107 return -EBUSY; 1248 return -EBUSY;
1249 }
1250 br_write_unlock(vfsmount_lock);
1108 1251
1109 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1252 if (!xchg(&mnt->mnt_expiry_mark, 1))
1110 return -EAGAIN; 1253 return -EAGAIN;
@@ -1668,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name)
1668 return err; 1811 return err;
1669 1812
1670 down_write(&namespace_sem); 1813 down_write(&namespace_sem);
1671 while (d_mountpoint(path->dentry) && 1814 err = follow_down(path, true);
1672 follow_down(path)) 1815 if (err < 0)
1673 ; 1816 goto out;
1817
1674 err = -EINVAL; 1818 err = -EINVAL;
1675 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1676 goto out; 1820 goto out;
@@ -1728,6 +1872,8 @@ out:
1728 return err; 1872 return err;
1729} 1873}
1730 1874
1875static int do_add_mount(struct vfsmount *, struct path *, int);
1876
1731/* 1877/*
1732 * create a new mount for userspace and request it to be added into the 1878 * create a new mount for userspace and request it to be added into the
1733 * namespace's tree 1879 * namespace's tree
@@ -1736,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
1736 int mnt_flags, char *name, void *data) 1882 int mnt_flags, char *name, void *data)
1737{ 1883{
1738 struct vfsmount *mnt; 1884 struct vfsmount *mnt;
1885 int err;
1739 1886
1740 if (!type) 1887 if (!type)
1741 return -EINVAL; 1888 return -EINVAL;
@@ -1744,21 +1891,51 @@ static int do_new_mount(struct path *path, char *type, int flags,
1744 if (!capable(CAP_SYS_ADMIN)) 1891 if (!capable(CAP_SYS_ADMIN))
1745 return -EPERM; 1892 return -EPERM;
1746 1893
1747 lock_kernel();
1748 mnt = do_kern_mount(type, flags, name, data); 1894 mnt = do_kern_mount(type, flags, name, data);
1749 unlock_kernel();
1750 if (IS_ERR(mnt)) 1895 if (IS_ERR(mnt))
1751 return PTR_ERR(mnt); 1896 return PTR_ERR(mnt);
1752 1897
1753 return do_add_mount(mnt, path, mnt_flags, NULL); 1898 err = do_add_mount(mnt, path, mnt_flags);
1899 if (err)
1900 mntput(mnt);
1901 return err;
1902}
1903
1904int finish_automount(struct vfsmount *m, struct path *path)
1905{
1906 int err;
 1907 /* The new mount record should have at least 2 refs to prevent it from
 1908 * being expired before we get a chance to add it
1909 */
1910 BUG_ON(mnt_get_count(m) < 2);
1911
1912 if (m->mnt_sb == path->mnt->mnt_sb &&
1913 m->mnt_root == path->dentry) {
1914 err = -ELOOP;
1915 goto fail;
1916 }
1917
1918 err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
1919 if (!err)
1920 return 0;
1921fail:
1922 /* remove m from any expiration list it may be on */
1923 if (!list_empty(&m->mnt_expire)) {
1924 down_write(&namespace_sem);
1925 br_write_lock(vfsmount_lock);
1926 list_del_init(&m->mnt_expire);
1927 br_write_unlock(vfsmount_lock);
1928 up_write(&namespace_sem);
1929 }
1930 mntput(m);
1931 mntput(m);
1932 return err;
1754} 1933}
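
finish_automount() above receives a mount carrying two references, one pinning it against expiry and one for the caller, which is why its failure path ends in two mntput() calls. A tiny refcount sketch of that ownership contract; this is an illustrative model, not kernel code:

#include <stdlib.h>

struct obj {
	int refs;
};

static void obj_put(struct obj *o)
{
	if (--o->refs == 0)
		free(o);
}

/* On success the mount tree adopts the references; on failure the
 * helper must drop both the anti-expiry reference and the caller's. */
static int attach_or_drop(struct obj *o, int attached)
{
	if (attached)
		return 0;
	obj_put(o);	/* reference that blocked expiry */
	obj_put(o);	/* reference the caller handed in */
	return -1;
}
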
1755 1934
1756/* 1935/*
1757 * add a mount into a namespace's mount tree 1936 * add a mount into a namespace's mount tree
1758 * - provide the option of adding the new mount to an expiration list
1759 */ 1937 */
1760int do_add_mount(struct vfsmount *newmnt, struct path *path, 1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1761 int mnt_flags, struct list_head *fslist)
1762{ 1939{
1763 int err; 1940 int err;
1764 1941
@@ -1766,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1766 1943
1767 down_write(&namespace_sem); 1944 down_write(&namespace_sem);
1768 /* Something was mounted here while we slept */ 1945 /* Something was mounted here while we slept */
1769 while (d_mountpoint(path->dentry) && 1946 err = follow_down(path, true);
1770 follow_down(path)) 1947 if (err < 0)
1771 ; 1948 goto unlock;
1949
1772 err = -EINVAL; 1950 err = -EINVAL;
1773 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) 1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1774 goto unlock; 1952 goto unlock;
@@ -1784,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1784 goto unlock; 1962 goto unlock;
1785 1963
1786 newmnt->mnt_flags = mnt_flags; 1964 newmnt->mnt_flags = mnt_flags;
1787 if ((err = graft_tree(newmnt, path))) 1965 err = graft_tree(newmnt, path);
1788 goto unlock;
1789
1790 if (fslist) /* add to the specified expiration list */
1791 list_add_tail(&newmnt->mnt_expire, fslist);
1792
1793 up_write(&namespace_sem);
1794 return 0;
1795 1966
1796unlock: 1967unlock:
1797 up_write(&namespace_sem); 1968 up_write(&namespace_sem);
1798 mntput(newmnt);
1799 return err; 1969 return err;
1800} 1970}
1801 1971
1802EXPORT_SYMBOL_GPL(do_add_mount); 1972/**
1973 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list.
1975 * @expiry_list: The list to add the mount to.
1976 */
1977void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
1978{
1979 down_write(&namespace_sem);
1980 br_write_lock(vfsmount_lock);
1981
1982 list_add_tail(&mnt->mnt_expire, expiry_list);
1983
1984 br_write_unlock(vfsmount_lock);
1985 up_write(&namespace_sem);
1986}
1987EXPORT_SYMBOL(mnt_set_expiry);
1803 1988
1804/* 1989/*
1805 * process a list of expirable mountpoints with the intent of discarding any 1990 * process a list of expirable mountpoints with the intent of discarding any
@@ -2088,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2088 return new_ns; 2273 return new_ns;
2089} 2274}
2090 2275
2276void mnt_make_longterm(struct vfsmount *mnt)
2277{
2278 __mnt_make_longterm(mnt);
2279}
2280
2281void mnt_make_shortterm(struct vfsmount *mnt)
2282{
2283#ifdef CONFIG_SMP
2284 if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
2285 return;
2286 br_write_lock(vfsmount_lock);
2287 atomic_dec(&mnt->mnt_longterm);
2288 br_write_unlock(vfsmount_lock);
2289#endif
2290}
2291
2091/* 2292/*
2092 * Allocate a new namespace structure and populate it with contents 2293 * Allocate a new namespace structure and populate it with contents
2093 * copied from the namespace of the passed in task structure. 2294 * copied from the namespace of the passed in task structure.
@@ -2125,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2125 q = new_ns->root; 2326 q = new_ns->root;
2126 while (p) { 2327 while (p) {
2127 q->mnt_ns = new_ns; 2328 q->mnt_ns = new_ns;
2329 __mnt_make_longterm(q);
2128 if (fs) { 2330 if (fs) {
2129 if (p == fs->root.mnt) { 2331 if (p == fs->root.mnt) {
2130 rootmnt = p;
2131 fs->root.mnt = mntget(q); 2332 fs->root.mnt = mntget(q);
2333 __mnt_make_longterm(q);
2334 mnt_make_shortterm(p);
2335 rootmnt = p;
2132 } 2336 }
2133 if (p == fs->pwd.mnt) { 2337 if (p == fs->pwd.mnt) {
2134 pwdmnt = p;
2135 fs->pwd.mnt = mntget(q); 2338 fs->pwd.mnt = mntget(q);
2339 __mnt_make_longterm(q);
2340 mnt_make_shortterm(p);
2341 pwdmnt = p;
2136 } 2342 }
2137 } 2343 }
2138 p = next_mnt(p, mnt_ns->root); 2344 p = next_mnt(p, mnt_ns->root);
@@ -2176,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2176 new_ns = alloc_mnt_ns(); 2382 new_ns = alloc_mnt_ns();
2177 if (!IS_ERR(new_ns)) { 2383 if (!IS_ERR(new_ns)) {
2178 mnt->mnt_ns = new_ns; 2384 mnt->mnt_ns = new_ns;
2385 __mnt_make_longterm(mnt);
2179 new_ns->root = mnt; 2386 new_ns->root = mnt;
2180 list_add(&new_ns->list, &new_ns->root->mnt_list); 2387 list_add(&new_ns->list, &new_ns->root->mnt_list);
2181 } 2388 }
@@ -2330,6 +2537,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2330 touch_mnt_namespace(current->nsproxy->mnt_ns); 2537 touch_mnt_namespace(current->nsproxy->mnt_ns);
2331 br_write_unlock(vfsmount_lock); 2538 br_write_unlock(vfsmount_lock);
2332 chroot_fs_refs(&root, &new); 2539 chroot_fs_refs(&root, &new);
2540
2333 error = 0; 2541 error = 0;
2334 path_put(&root_parent); 2542 path_put(&root_parent);
2335 path_put(&parent_path); 2543 path_put(&parent_path);
@@ -2356,6 +2564,7 @@ static void __init init_mount_tree(void)
2356 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2564 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
2357 if (IS_ERR(mnt)) 2565 if (IS_ERR(mnt))
2358 panic("Can't create rootfs"); 2566 panic("Can't create rootfs");
2567
2359 ns = create_mnt_ns(mnt); 2568 ns = create_mnt_ns(mnt);
2360 if (IS_ERR(ns)) 2569 if (IS_ERR(ns))
2361 panic("Can't allocate initial namespace"); 2570 panic("Can't allocate initial namespace");
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9578cbe0cd58..f6946bb5cb55 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -17,13 +17,11 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/namei.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/byteorder.h> 22#include <asm/byteorder.h>
22#include <linux/smp_lock.h>
23 23
24#include <linux/ncp_fs.h> 24#include "ncp_fs.h"
25
26#include "ncplib_kernel.h"
27 25
28static void ncp_read_volume_list(struct file *, void *, filldir_t, 26static void ncp_read_volume_list(struct file *, void *, filldir_t,
29 struct ncp_cache_control *); 27 struct ncp_cache_control *);
@@ -75,11 +73,14 @@ const struct inode_operations ncp_dir_inode_operations =
75 * Dentry operations routines 73 * Dentry operations routines
76 */ 74 */
77static int ncp_lookup_validate(struct dentry *, struct nameidata *); 75static int ncp_lookup_validate(struct dentry *, struct nameidata *);
78static int ncp_hash_dentry(struct dentry *, struct qstr *); 76static int ncp_hash_dentry(const struct dentry *, const struct inode *,
79static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); 77 struct qstr *);
80static int ncp_delete_dentry(struct dentry *); 78static int ncp_compare_dentry(const struct dentry *, const struct inode *,
81 79 const struct dentry *, const struct inode *,
82static const struct dentry_operations ncp_dentry_operations = 80 unsigned int, const char *, const struct qstr *);
81static int ncp_delete_dentry(const struct dentry *);
82
83const struct dentry_operations ncp_dentry_operations =
83{ 84{
84 .d_revalidate = ncp_lookup_validate, 85 .d_revalidate = ncp_lookup_validate,
85 .d_hash = ncp_hash_dentry, 86 .d_hash = ncp_hash_dentry,
@@ -87,28 +88,49 @@ static const struct dentry_operations ncp_dentry_operations =
87 .d_delete = ncp_delete_dentry, 88 .d_delete = ncp_delete_dentry,
88}; 89};
89 90
90const struct dentry_operations ncp_root_dentry_operations = 91#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
92
93static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
91{ 94{
92 .d_hash = ncp_hash_dentry, 95#ifdef CONFIG_NCPFS_SMALLDOS
93 .d_compare = ncp_compare_dentry, 96 int ns = ncp_namespace(i);
94 .d_delete = ncp_delete_dentry, 97
95}; 98 if ((ns == NW_NS_DOS)
99#ifdef CONFIG_NCPFS_OS2_NS
100 || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
101#endif /* CONFIG_NCPFS_OS2_NS */
102 )
103 return 0;
104#endif /* CONFIG_NCPFS_SMALLDOS */
105 return 1;
106}
107
108#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS)
96 109
110static inline int ncp_case_sensitive(const struct inode *i)
111{
112#ifdef CONFIG_NCPFS_NFS_NS
113 return ncp_namespace(i) == NW_NS_NFS;
114#else
115 return 0;
116#endif /* CONFIG_NCPFS_NFS_NS */
117}
97 118
98/* 119/*
99 * Note: leave the hash unchanged if the directory 120 * Note: leave the hash unchanged if the directory
100 * is case-sensitive. 121 * is case-sensitive.
101 */ 122 */
102static int 123static int
103ncp_hash_dentry(struct dentry *dentry, struct qstr *this) 124ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
125 struct qstr *this)
104{ 126{
105 struct nls_table *t; 127 if (!ncp_case_sensitive(inode)) {
106 unsigned long hash; 128 struct super_block *sb = dentry->d_sb;
107 int i; 129 struct nls_table *t;
130 unsigned long hash;
131 int i;
108 132
109 t = NCP_IO_TABLE(dentry); 133 t = NCP_IO_TABLE(sb);
110
111 if (!ncp_case_sensitive(dentry->d_inode)) {
112 hash = init_name_hash(); 134 hash = init_name_hash();
113 for (i=0; i<this->len ; i++) 135 for (i=0; i<this->len ; i++)
114 hash = partial_name_hash(ncp_tolower(t, this->name[i]), 136 hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -119,15 +141,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
119} 141}
120 142
121static int 143static int
122ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 144ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
145 const struct dentry *dentry, const struct inode *inode,
146 unsigned int len, const char *str, const struct qstr *name)
123{ 147{
124 if (a->len != b->len) 148 if (len != name->len)
125 return 1; 149 return 1;
126 150
127 if (ncp_case_sensitive(dentry->d_inode)) 151 if (ncp_case_sensitive(pinode))
128 return strncmp(a->name, b->name, a->len); 152 return strncmp(str, name->name, len);
129 153
130 return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len); 154 return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len);
131} 155}
132 156
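
The reworked d_hash/d_compare pair above has to stay mutually consistent on case-preserving but case-insensitive volumes: both must fold case the same way or a cached name would never be found again. A userspace sketch of matched fold-then-hash and fold-compare helpers; the mixing step is a toy, and ncpfs folds through its NLS table rather than tolower():

#include <ctype.h>
#include <stddef.h>

/* Hash the case-folded bytes so "Readme" and "README" collide. */
static unsigned long ci_hash(const unsigned char *name, size_t len)
{
	unsigned long hash = 0;
	size_t i;

	for (i = 0; i < len; i++)
		hash = hash * 31 + tolower(name[i]);	/* toy mix step */
	return hash;
}

/* Compare under the same folding; returns 0 when the names match,
 * following the d_compare convention above. */
static int ci_compare(const unsigned char *a, size_t alen,
		      const unsigned char *b, size_t blen)
{
	size_t i;

	if (alen != blen)
		return 1;
	for (i = 0; i < alen; i++)
		if (tolower(a[i]) != tolower(b[i]))
			return 1;
	return 0;
}
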
133/* 157/*
@@ -136,7 +160,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
136 * Closing files can be safely postponed until iput() - it's done there anyway. 160 * Closing files can be safely postponed until iput() - it's done there anyway.
137 */ 161 */
138static int 162static int
139ncp_delete_dentry(struct dentry * dentry) 163ncp_delete_dentry(const struct dentry * dentry)
140{ 164{
141 struct inode *inode = dentry->d_inode; 165 struct inode *inode = dentry->d_inode;
142 166
@@ -266,7 +290,7 @@ leave_me:;
266 290
267 291
268static int 292static int
269__ncp_lookup_validate(struct dentry *dentry) 293ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
270{ 294{
271 struct ncp_server *server; 295 struct ncp_server *server;
272 struct dentry *parent; 296 struct dentry *parent;
@@ -275,6 +299,12 @@ __ncp_lookup_validate(struct dentry *dentry)
275 int res, val = 0, len; 299 int res, val = 0, len;
276 __u8 __name[NCP_MAXPATHLEN + 1]; 300 __u8 __name[NCP_MAXPATHLEN + 1];
277 301
302 if (dentry == dentry->d_sb->s_root)
303 return 1;
304
305 if (nd->flags & LOOKUP_RCU)
306 return -ECHILD;
307
278 parent = dget_parent(dentry); 308 parent = dget_parent(dentry);
279 dir = parent->d_inode; 309 dir = parent->d_inode;
280 310
@@ -283,9 +313,6 @@ __ncp_lookup_validate(struct dentry *dentry)
283 313
284 server = NCP_SERVER(dir); 314 server = NCP_SERVER(dir);
285 315
286 if (!ncp_conn_valid(server))
287 goto finished;
288
289 /* 316 /*
290 * Inspired by smbfs: 317 * Inspired by smbfs:
291 * The default validation is based on dentry age: 318 * The default validation is based on dentry age:
@@ -304,8 +331,11 @@ __ncp_lookup_validate(struct dentry *dentry)
304 if (ncp_is_server_root(dir)) { 331 if (ncp_is_server_root(dir)) {
305 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 332 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
306 dentry->d_name.len, 1); 333 dentry->d_name.len, 1);
307 if (!res) 334 if (!res) {
308 res = ncp_lookup_volume(server, __name, &(finfo.i)); 335 res = ncp_lookup_volume(server, __name, &(finfo.i));
336 if (!res)
337 ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
338 }
309 } else { 339 } else {
310 res = ncp_io2vol(server, __name, &len, dentry->d_name.name, 340 res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
311 dentry->d_name.len, !ncp_preserve_case(dir)); 341 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -320,13 +350,17 @@ __ncp_lookup_validate(struct dentry *dentry)
320 * what we remember, it's not valid any more. 350 * what we remember, it's not valid any more.
321 */ 351 */
322 if (!res) { 352 if (!res) {
323 if (finfo.i.dirEntNum == NCP_FINFO(dentry->d_inode)->dirEntNum) { 353 struct inode *inode = dentry->d_inode;
354
355 mutex_lock(&inode->i_mutex);
356 if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
324 ncp_new_dentry(dentry); 357 ncp_new_dentry(dentry);
325 val=1; 358 val=1;
326 } else 359 } else
327 DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n"); 360 DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
328 361
329 ncp_update_inode2(dentry->d_inode, &finfo); 362 ncp_update_inode2(inode, &finfo);
363 mutex_unlock(&inode->i_mutex);
330 } 364 }
331 365
332finished: 366finished:
@@ -335,16 +369,6 @@ finished:
335 return val; 369 return val;
336} 370}
337 371
338static int
339ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
340{
341 int res;
342 lock_kernel();
343 res = __ncp_lookup_validate(dentry);
344 unlock_kernel();
345 return res;
346}
347
348static struct dentry * 372static struct dentry *
349ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) 373ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
350{ 374{
@@ -364,21 +388,21 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
364 } 388 }
365 389
366 /* If a pointer is invalid, we search the dentry. */ 390 /* If a pointer is invalid, we search the dentry. */
367 spin_lock(&dcache_lock); 391 spin_lock(&parent->d_lock);
368 next = parent->d_subdirs.next; 392 next = parent->d_subdirs.next;
369 while (next != &parent->d_subdirs) { 393 while (next != &parent->d_subdirs) {
370 dent = list_entry(next, struct dentry, d_u.d_child); 394 dent = list_entry(next, struct dentry, d_u.d_child);
371 if ((unsigned long)dent->d_fsdata == fpos) { 395 if ((unsigned long)dent->d_fsdata == fpos) {
372 if (dent->d_inode) 396 if (dent->d_inode)
373 dget_locked(dent); 397 dget(dent);
374 else 398 else
375 dent = NULL; 399 dent = NULL;
376 spin_unlock(&dcache_lock); 400 spin_unlock(&parent->d_lock);
377 goto out; 401 goto out;
378 } 402 }
379 next = next->next; 403 next = next->next;
380 } 404 }
381 spin_unlock(&dcache_lock); 405 spin_unlock(&parent->d_lock);
382 return NULL; 406 return NULL;
383 407
384out: 408out:
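
The hunk above trades the global dcache_lock for the parent's d_lock, so a scan of one directory's children serializes only against that directory. A pthread sketch of the narrowed critical section; the node type and cookie field are illustrative stand-ins for the dentry and its d_fsdata tag:

#include <pthread.h>
#include <stddef.h>

struct node {
	pthread_mutex_t lock;		/* per-parent, models d_lock */
	struct node *first_child;
	struct node *next_sibling;
	unsigned long cookie;		/* models the d_fsdata fpos tag */
	int live;			/* models "has an inode" */
};

/* Scan one directory's children; unrelated directories stay unlocked,
 * where the old global lock would have stalled every dcache user. */
static struct node *find_child(struct node *parent, unsigned long cookie)
{
	struct node *c, *found = NULL;

	pthread_mutex_lock(&parent->lock);
	for (c = parent->first_child; c; c = c->next_sibling) {
		if (c->cookie == cookie) {
			if (c->live)
				found = c;	/* real code takes a ref here */
			break;
		}
	}
	pthread_mutex_unlock(&parent->lock);
	return found;
}
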
@@ -411,8 +435,6 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
411 int result, mtime_valid = 0; 435 int result, mtime_valid = 0;
412 time_t mtime = 0; 436 time_t mtime = 0;
413 437
414 lock_kernel();
415
416 ctl.page = NULL; 438 ctl.page = NULL;
417 ctl.cache = NULL; 439 ctl.cache = NULL;
418 440
@@ -421,6 +443,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
421 (int) filp->f_pos); 443 (int) filp->f_pos);
422 444
423 result = -EIO; 445 result = -EIO;
446 /* Do not generate '.' and '..' when server is dead. */
424 if (!ncp_conn_valid(server)) 447 if (!ncp_conn_valid(server))
425 goto out; 448 goto out;
426 449
@@ -532,6 +555,12 @@ read_really:
532 ctl.head.end = ctl.fpos - 1; 555 ctl.head.end = ctl.fpos - 1;
533 ctl.head.eof = ctl.valid; 556 ctl.head.eof = ctl.valid;
534finished: 557finished:
558 if (ctl.page) {
559 kunmap(ctl.page);
560 SetPageUptodate(ctl.page);
561 unlock_page(ctl.page);
562 page_cache_release(ctl.page);
563 }
535 if (page) { 564 if (page) {
536 cache->head = ctl.head; 565 cache->head = ctl.head;
537 kunmap(page); 566 kunmap(page);
@@ -539,23 +568,17 @@ finished:
539 unlock_page(page); 568 unlock_page(page);
540 page_cache_release(page); 569 page_cache_release(page);
541 } 570 }
542 if (ctl.page) {
543 kunmap(ctl.page);
544 SetPageUptodate(ctl.page);
545 unlock_page(ctl.page);
546 page_cache_release(ctl.page);
547 }
548out: 571out:
549 unlock_kernel();
550 return result; 572 return result;
551} 573}
552 574
553static int 575static int
554ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 576ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
555 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry) 577 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
578 int inval_childs)
556{ 579{
557 struct dentry *newdent, *dentry = filp->f_path.dentry; 580 struct dentry *newdent, *dentry = filp->f_path.dentry;
558 struct inode *newino, *inode = dentry->d_inode; 581 struct inode *dir = dentry->d_inode;
559 struct ncp_cache_control ctl = *ctrl; 582 struct ncp_cache_control ctl = *ctrl;
560 struct qstr qname; 583 struct qstr qname;
561 int valid = 0; 584 int valid = 0;
@@ -564,16 +587,16 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
564 __u8 __name[NCP_MAXPATHLEN + 1]; 587 __u8 __name[NCP_MAXPATHLEN + 1];
565 588
566 qname.len = sizeof(__name); 589 qname.len = sizeof(__name);
567 if (ncp_vol2io(NCP_SERVER(inode), __name, &qname.len, 590 if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len,
568 entry->i.entryName, entry->i.nameLen, 591 entry->i.entryName, entry->i.nameLen,
569 !ncp_preserve_entry_case(inode, entry->i.NSCreator))) 592 !ncp_preserve_entry_case(dir, entry->i.NSCreator)))
570 return 1; /* I'm not sure */ 593 return 1; /* I'm not sure */
571 594
572 qname.name = __name; 595 qname.name = __name;
573 qname.hash = full_name_hash(qname.name, qname.len); 596 qname.hash = full_name_hash(qname.name, qname.len);
574 597
575 if (dentry->d_op && dentry->d_op->d_hash) 598 if (dentry->d_op && dentry->d_op->d_hash)
576 if (dentry->d_op->d_hash(dentry, &qname) != 0) 599 if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0)
577 goto end_advance; 600 goto end_advance;
578 601
579 newdent = d_lookup(dentry, &qname); 602 newdent = d_lookup(dentry, &qname);
@@ -584,22 +607,40 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
584 goto end_advance; 607 goto end_advance;
585 } else { 608 } else {
586 hashed = 1; 609 hashed = 1;
587 memcpy((char *) newdent->d_name.name, qname.name, 610
588 newdent->d_name.len); 611 /* If case sensitivity changed for this volume, all entries below this one
612 should be thrown away. This entry itself is not affected, as its case
613 sensitivity is controlled by its own parent. */
614 if (inval_childs)
615 shrink_dcache_parent(newdent);
616
617 /*
618 * NetWare's OS2 namespace is case preserving yet case
619 * insensitive. So we update dentry's name as received from
620 * server. Parent dir's i_mutex is locked because we're in
621 * readdir.
622 */
623 dentry_update_name_case(newdent, &qname);
589 } 624 }
590 625
591 if (!newdent->d_inode) { 626 if (!newdent->d_inode) {
627 struct inode *inode;
628
592 entry->opened = 0; 629 entry->opened = 0;
593 entry->ino = iunique(inode->i_sb, 2); 630 entry->ino = iunique(dir->i_sb, 2);
594 newino = ncp_iget(inode->i_sb, entry); 631 inode = ncp_iget(dir->i_sb, entry);
595 if (newino) { 632 if (inode) {
596 newdent->d_op = &ncp_dentry_operations; 633 d_instantiate(newdent, inode);
597 d_instantiate(newdent, newino);
598 if (!hashed) 634 if (!hashed)
599 d_rehash(newdent); 635 d_rehash(newdent);
600 } 636 }
601 } else 637 } else {
602 ncp_update_inode2(newdent->d_inode, entry); 638 struct inode *inode = newdent->d_inode;
639
640 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
641 ncp_update_inode2(inode, entry);
642 mutex_unlock(&inode->i_mutex);
643 }
603 644
604 if (newdent->d_inode) { 645 if (newdent->d_inode) {
605 ino = newdent->d_inode->i_ino; 646 ino = newdent->d_inode->i_ino;
@@ -617,7 +658,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
617 ctl.cache = NULL; 658 ctl.cache = NULL;
618 ctl.idx -= NCP_DIRCACHE_SIZE; 659 ctl.idx -= NCP_DIRCACHE_SIZE;
619 ctl.ofs += 1; 660 ctl.ofs += 1;
620 ctl.page = grab_cache_page(&inode->i_data, ctl.ofs); 661 ctl.page = grab_cache_page(&dir->i_data, ctl.ofs);
621 if (ctl.page) 662 if (ctl.page)
 		ctl.cache = kmap(ctl.page);
 	}
@@ -633,7 +674,7 @@ end_advance:
 		if (!ino)
 			ino = find_inode_number(dentry, &qname);
 		if (!ino)
-			ino = iunique(inode->i_sb, 2);
+			ino = iunique(dir->i_sb, 2);
 		ctl.filled = filldir(dirent, qname.name, qname.len,
 				     filp->f_pos, ino, DT_UNKNOWN);
 		if (!ctl.filled)
@@ -660,6 +701,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 		(unsigned long) filp->f_pos);

 	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
+		int inval_dentry;

 		if (ncp_get_volume_info_with_number(server, i, &info) != 0)
 			return;
@@ -675,8 +717,9 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 				info.volume_name);
 			continue;
 		}
+		inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
 		entry.volume = entry.i.volNumber;
-		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
 			return;
 	}
 }
@@ -739,7 +782,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
 			rpl += onerpl;
 			rpls -= onerpl;
 			entry.volume = entry.i.volNumber;
-			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
 				break;
 		}
 	} while (more);
@@ -775,17 +818,19 @@ int ncp_conn_logged_in(struct super_block *sb)
 		if (dent) {
 			struct inode* ino = dent->d_inode;
 			if (ino) {
+				ncp_update_known_namespace(server, volNumber, NULL);
 				NCP_FINFO(ino)->volNumber = volNumber;
 				NCP_FINFO(ino)->dirEntNum = dirEntNum;
 				NCP_FINFO(ino)->DosDirNum = DosDirNum;
+				result = 0;
 			} else {
 				DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
 			}
 		} else {
 			DPRINTK("ncpfs: sb->s_root == NULL!\n");
 		}
-	}
-	result = 0;
+	} else
+		result = 0;

 out:
 	return result;
@@ -799,7 +844,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 	int error, res, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];

-	lock_kernel();
 	error = -EIO;
 	if (!ncp_conn_valid(server))
 		goto finished;
@@ -813,6 +857,8 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 				 dentry->d_name.len, 1);
 		if (!res)
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
+		if (!res)
+			ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -839,14 +885,12 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 	if (inode) {
 		ncp_new_dentry(dentry);
 add_entry:
-		dentry->d_op = &ncp_dentry_operations;
 		d_add(dentry, inode);
 		error = 0;
 	}

 finished:
 	PPRINTK("ncp_lookup: result=%d\n", error);
-	unlock_kernel();
 	return ERR_PTR(error);
 }

@@ -887,11 +931,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 	PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name, mode);

-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -917,6 +956,8 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 	if (result) {
 		if (result == 0x87)
 			error = -ENAMETOOLONG;
+		else if (result < 0)
+			error = result;
 		DPRINTK("ncp_create: %s/%s failed\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name);
 		goto out;
@@ -935,7 +976,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,

 	error = ncp_instantiate(dir, dentry, &finfo);
 out:
-	unlock_kernel();
 	return error;
 }

@@ -955,11 +995,6 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DPRINTK("ncp_mkdir: making %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);

-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -967,12 +1002,11 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		goto out;

-	error = -EACCES;
-	if (ncp_open_create_file_or_subdir(server, dir, __name,
+	error = ncp_open_create_file_or_subdir(server, dir, __name,
 				   OC_MODE_CREATE, aDIR,
 				   cpu_to_le16(0xffff),
-				   &finfo) == 0)
-	{
+				   &finfo);
+	if (error == 0) {
 		if (ncp_is_nfs_extras(server, finfo.volume)) {
 			mode |= S_IFDIR;
 			finfo.i.nfs.mode = mode;
@@ -983,9 +1017,10 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 				goto out;
 		}
 		error = ncp_instantiate(dir, dentry, &finfo);
+	} else if (error > 0) {
+		error = -EACCES;
 	}
 out:
-	unlock_kernel();
 	return error;
 }

@@ -998,11 +1033,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	DPRINTK("ncp_rmdir: removing %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);

-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	error = -EBUSY;
 	if (!d_unhashed(dentry))
 		goto out;
@@ -1036,11 +1066,10 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 			error = -ENOENT;
 			break;
 		default:
-			error = -EACCES;
+			error = result < 0 ? result : -EACCES;
 			break;
 	}
 out:
-	unlock_kernel();
 	return error;
 }

@@ -1050,15 +1079,10 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 	struct ncp_server *server;
 	int error;

-	lock_kernel();
 	server = NCP_SERVER(dir);
 	DPRINTK("ncp_unlink: unlinking %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);

-	error = -EIO;
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	/*
 	 * Check whether to close the file ...
 	 */
@@ -1097,12 +1121,9 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 		error = -ENOENT;
 		break;
 	default:
-		error = -EACCES;
+		error = error < 0 ? error : -EACCES;
 		break;
 	}
-
-out:
-	unlock_kernel();
 	return error;
 }

@@ -1118,11 +1139,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		new_dentry->d_parent->d_name.name, new_dentry->d_name.name);

-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);

@@ -1161,11 +1177,10 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		error = -ENOENT;
 		break;
 	default:
-		error = -EACCES;
+		error = error < 0 ? error : -EACCES;
 		break;
 	}
 out:
-	unlock_kernel();
 	return error;
 }

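
A note on the error-handling change that recurs through the dir.c hunks above: a blanket `error = -EACCES;` becomes `error = result < 0 ? result : -EACCES;`. The convention being preserved is that the NCP helpers return a negative errno on transport failure and a positive NetWare status byte when the server refuses the operation. A standalone sketch of that mapping in plain userspace C (only the 0x87 case comes from the patch itself; 0x9c is an arbitrary stand-in for any other status byte):

#include <errno.h>
#include <stdio.h>

/*
 * Illustrative mapping: negative values are already -errno and are
 * passed through; positive NetWare status bytes collapse to -EACCES,
 * except for statuses with a better translation.
 */
static int map_ncp_result(int result)
{
	if (result == 0)
		return 0;			/* success */
	if (result == 0x87)
		return -ENAMETOOLONG;		/* NetWare: name too long */
	return result < 0 ? result : -EACCES;
}

int main(void)
{
	printf("%d\n", map_ncp_result(0));	/* 0 */
	printf("%d\n", map_ncp_result(0x87));	/* -ENAMETOOLONG */
	printf("%d\n", map_ncp_result(-EIO));	/* -EIO preserved */
	printf("%d\n", map_ncp_result(0x9c));	/* generic -EACCES */
	return 0;
}
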
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3639cc5cbdae..0ed65e0c3dfe 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -17,10 +17,8 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>

-#include <linux/ncp_fs.h>
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"

 static int ncp_fsync(struct file *file, int datasync)
 {
@@ -113,9 +111,6 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	DPRINTK("ncp_file_read: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);

-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
-
 	pos = *ppos;

 	if ((ssize_t) count < 0) {
@@ -192,13 +187,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *

 	DPRINTK("ncp_file_write: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 	pos = *ppos;
 	if (file->f_flags & O_APPEND) {
-		pos = inode->i_size;
+		pos = i_size_read(inode);
 	}

 	if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
@@ -264,8 +257,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *

 	*ppos = pos;

-	if (pos > inode->i_size) {
-		inode->i_size = pos;
+	if (pos > i_size_read(inode)) {
+		mutex_lock(&inode->i_mutex);
+		if (pos > i_size_read(inode))
+			i_size_write(inode, pos);
+		mutex_unlock(&inode->i_mutex);
 	}
 	DPRINTK("ncp_file_write: exit %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -281,18 +277,9 @@ static int ncp_release(struct inode *inode, struct file *file) {
 	return 0;
 }

-static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t ret;
-	lock_kernel();
-	ret = generic_file_llseek_unlocked(file, offset, origin);
-	unlock_kernel();
-	return ret;
-}
-
 const struct file_operations ncp_file_operations =
 {
-	.llseek = ncp_remote_llseek,
+	.llseek = generic_file_llseek,
 	.read = ncp_file_read,
 	.write = ncp_file_write,
 	.unlocked_ioctl = ncp_ioctl,
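
The last ncp_file_write() hunk replaces the bare `inode->i_size = pos;` with a locked, double-checked update: re-test the condition after taking i_mutex, because another writer may have extended the file between the unlocked read and the lock acquisition, while keeping the common no-growth case lock-free. A minimal userspace sketch of the same shape, with a plain pthread mutex where the kernel uses i_mutex plus the seqcount-backed i_size_read()/i_size_write() helpers (the unlocked first read is a simplification here; the kernel helpers are what make it safe):

#include <pthread.h>
#include <stdio.h>

static long long file_size;	/* stand-in for inode->i_size */
static pthread_mutex_t file_lock = PTHREAD_MUTEX_INITIALIZER;

static void update_size_after_write(long long pos)
{
	if (pos > file_size) {			/* cheap unlocked check */
		pthread_mutex_lock(&file_lock);
		if (pos > file_size)		/* re-check under the lock */
			file_size = pos;
		pthread_mutex_unlock(&file_lock);
	}
}

int main(void)
{
	update_size_after_write(4096);
	update_size_after_write(1024);		/* no-op: file already larger */
	printf("size=%lld\n", file_size);	/* 4096 */
	return 0;
}
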
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b4de38cf49f5..00a1d1c3d3a4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -26,16 +26,14 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
-
-#include <linux/ncp_fs.h>
+#include <linux/namei.h>

 #include <net/sock.h>

-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
 #include "getopt.h"

 #define NCP_DEFAULT_FILE_MODE 0600
@@ -59,11 +57,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }

-static void ncp_destroy_inode(struct inode *inode)
+static void ncp_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
 }

+static void ncp_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ncp_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
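
ncp_i_callback() above shows the standard RCU-deferred-free shape: ncp_destroy_inode() queues the callback with call_rcu(), and after a grace period the callback receives a pointer to the embedded rcu_head and recovers the enclosing inode with container_of() before freeing it. A self-contained sketch of that recovery step (simplified types; the kernel's real container_of additionally type-checks the member):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head { void *next; };	/* placeholder for the real struct */

struct inode_like {
	int ino;
	struct rcu_head i_rcu;		/* embedded, as in struct inode */
};

static void i_callback(struct rcu_head *head)
{
	/* recover the enclosing object from the embedded member */
	struct inode_like *inode = container_of(head, struct inode_like, i_rcu);

	printf("freeing inode %d\n", inode->ino);
	/* kmem_cache_free(...) would go here in the kernel */
}

int main(void)
{
	struct inode_like in = { .ino = 42 };

	i_callback(&in.i_rcu);		/* as if invoked after a grace period */
	return 0;
}
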
@@ -139,7 +144,7 @@ static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
 		inode->i_mode = nwi->nfs.mode;
 	}

-	inode->i_blocks = (inode->i_size + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
+	inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;

 	inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate);
 	inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate);
@@ -158,18 +163,21 @@ static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo)
 		inode->i_mode = server->m.dir_mode;
 		/* for directories dataStreamSize seems to be some
 		   Object ID ??? */
-		inode->i_size = NCP_BLOCK_SIZE;
+		i_size_write(inode, NCP_BLOCK_SIZE);
 	} else {
+		u32 size;
+
 		inode->i_mode = server->m.file_mode;
-		inode->i_size = le32_to_cpu(nwi->dataStreamSize);
+		size = le32_to_cpu(nwi->dataStreamSize);
+		i_size_write(inode, size);
 #ifdef CONFIG_NCPFS_EXTRAS
 		if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS))
 		 && (nwi->attributes & aSHARED)) {
 			switch (nwi->attributes & (aHIDDEN|aSYSTEM)) {
 			case aHIDDEN:
 				if (server->m.flags & NCP_MOUNT_SYMLINKS) {
-					if (/* (inode->i_size >= NCP_MIN_SYMLINK_SIZE)
-					 && */ (inode->i_size <= NCP_MAX_SYMLINK_SIZE)) {
+					if (/* (size >= NCP_MIN_SYMLINK_SIZE)
+					 && */ (size <= NCP_MAX_SYMLINK_SIZE)) {
 						inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
 						NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK;
 						break;
@@ -208,7 +216,7 @@ void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo)
 }

 /*
- * Fill in the inode based on the ncp_entry_info structure.
+ * Fill in the inode based on the ncp_entry_info structure. Used only for brand new inodes.
  */
 static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 {
@@ -254,6 +262,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 	if (inode) {
 		atomic_set(&NCP_FINFO(inode)->opened, info->opened);

+		inode->i_mapping->backing_dev_info = sb->s_bdi;
 		inode->i_ino = info->ino;
 		ncp_set_attr(inode, info);
 		if (S_ISREG(inode->i_mode)) {
@@ -299,12 +308,19 @@ ncp_evict_inode(struct inode *inode)

 static void ncp_stop_tasks(struct ncp_server *server) {
 	struct sock* sk = server->ncp_sock->sk;

+	lock_sock(sk);
 	sk->sk_error_report = server->error_report;
 	sk->sk_data_ready = server->data_ready;
 	sk->sk_write_space = server->write_space;
+	release_sock(sk);
 	del_timer_sync(&server->timeout_tm);
-	flush_scheduled_work();
+
+	flush_work_sync(&server->rcv.tq);
+	if (sk->sk_socket->type == SOCK_STREAM)
+		flush_work_sync(&server->tx.tq);
+	else
+		flush_work_sync(&server->timeout_tq);
 }

 static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
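
The rewritten ncp_stop_tasks() encodes a strict teardown order: first restore the socket callbacks under the socket lock so no new work can be queued, then flush each work item that may still be in flight (flush_work_sync() replacing the coarser flush_scheduled_work()). A userspace sketch of the same two-step shutdown, with a pthread mutex standing in for lock_sock() and a joined thread standing in for the flush (illustrative names only, none of this is ncpfs API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sock_lock = PTHREAD_MUTEX_INITIALIZER;
static void (*data_ready)(void);	/* the "callback" being detached */

static void drain(void) { puts("draining queued work"); }

static void *worker(void *arg)
{
	(void)arg;
	drain();			/* pretend this was queued earlier */
	return NULL;
}

int main(void)
{
	pthread_t w;

	pthread_create(&w, NULL, worker, NULL);

	/* step 1: detach callbacks under the lock, so no new work arrives */
	pthread_mutex_lock(&sock_lock);
	data_ready = NULL;
	pthread_mutex_unlock(&sock_lock);

	/* step 2: wait for work already in flight, as flush_work_sync() does */
	pthread_join(w, NULL);

	if (!data_ready)
		puts("teardown complete");
	return 0;
}
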
@@ -526,6 +542,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	sb->s_blocksize_bits = 10;
 	sb->s_magic = NCP_SUPER_MAGIC;
 	sb->s_op = &ncp_sops;
+	sb->s_d_op = &ncp_dentry_operations;
 	sb->s_bdi = &server->bdi;

 	server = NCP_SBP(sb);
@@ -565,10 +582,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 /*	server->conn_status = 0;	*/
 /*	server->root_dentry = NULL;	*/
 /*	server->root_setuped = 0;	*/
+	mutex_init(&server->root_setup_lock);
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
 /*	server->sign_wanted = 0;	*/
 /*	server->sign_active = 0;	*/
 #endif
+	init_rwsem(&server->auth_rwsem);
 	server->auth.auth_type = NCP_AUTH_NONE;
 /*	server->auth.object_name_len = 0;	*/
 /*	server->auth.object_name = NULL;	*/
@@ -593,16 +612,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	server->nls_io = load_nls_default();
 #endif /* CONFIG_NCPFS_NLS */

-	server->dentry_ttl = 0;	/* no caching */
+	atomic_set(&server->dentry_ttl, 0);	/* no caching */

 	INIT_LIST_HEAD(&server->tx.requests);
 	mutex_init(&server->rcv.creq_mutex);
 	server->tx.creq = NULL;
 	server->rcv.creq = NULL;
-	server->data_ready = sock->sk->sk_data_ready;
-	server->write_space = sock->sk->sk_write_space;
-	server->error_report = sock->sk->sk_error_report;
-	sock->sk->sk_user_data = server;

 	init_timer(&server->timeout_tm);
 #undef NCP_PACKET_SIZE
@@ -619,6 +634,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (server->rxbuf == NULL)
 		goto out_txbuf;

+	lock_sock(sock->sk);
+	server->data_ready = sock->sk->sk_data_ready;
+	server->write_space = sock->sk->sk_write_space;
+	server->error_report = sock->sk->sk_error_report;
+	sock->sk->sk_user_data = server;
 	sock->sk->sk_data_ready = ncp_tcp_data_ready;
 	sock->sk->sk_error_report = ncp_tcp_error_report;
 	if (sock->type == SOCK_STREAM) {
@@ -634,6 +654,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 		server->timeout_tm.data = (unsigned long)server;
 		server->timeout_tm.function = ncpdgram_timeout_call;
 	}
+	release_sock(sock->sk);

 	ncp_lock_server(server);
 	error = ncp_connect(server);
@@ -658,8 +679,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 			goto out_disconnect;
 		}
 	}
+		ncp_lock_server(server);
 		if (options & 2)
 			server->sign_wanted = 1;
+		ncp_unlock_server(server);
 	}
 	else
 #endif /* CONFIG_NCPFS_PACKET_SIGNING */
@@ -699,7 +722,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	sb->s_root = d_alloc_root(root_inode);
 	if (!sb->s_root)
 		goto out_no_root;
-	sb->s_root->d_op = &ncp_root_dentry_operations;
 	return 0;

 out_no_root:
@@ -720,6 +742,9 @@ out_nls:
 	unload_nls(server->nls_io);
 	unload_nls(server->nls_vol);
 #endif
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);
 out_fput2:
 	if (server->info_filp)
 		fput(server->info_filp);
@@ -743,8 +768,6 @@ static void ncp_put_super(struct super_block *sb)
 {
 	struct ncp_server *server = NCP_SBP(sb);

-	lock_kernel();
-
 	ncp_lock_server(server);
 	ncp_disconnect(server);
 	ncp_unlock_server(server);
@@ -756,6 +779,9 @@ static void ncp_put_super(struct super_block *sb)
 	unload_nls(server->nls_vol);
 	unload_nls(server->nls_io);
 #endif /* CONFIG_NCPFS_NLS */
+	mutex_destroy(&server->rcv.creq_mutex);
+	mutex_destroy(&server->root_setup_lock);
+	mutex_destroy(&server->mutex);

 	if (server->info_filp)
 		fput(server->info_filp);
@@ -771,8 +797,6 @@ static void ncp_put_super(struct super_block *sb)
 	vfree(server->packet);
 	sb->s_fs_info = NULL;
 	kfree(server);
-
-	unlock_kernel();
 }

 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -851,10 +875,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)

 	result = -EIO;

-	lock_kernel();
-
 	server = NCP_SERVER(inode);
-	if ((!server) || !ncp_conn_valid(server))
+	if (!server)	/* How this could happen? */
 		goto out;

 	/* ageing the dentry to force validation */
@@ -981,8 +1003,6 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 		result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode),
 				      inode, info_mask, &info);
 		if (result != 0) {
-			result = -EACCES;
-
 			if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) {
 				/* NetWare seems not to allow this. I
 				   do not know why. So, just tell the
@@ -1005,20 +1025,21 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 		mark_inode_dirty(inode);

 out:
-	unlock_kernel();
+	if (result > 0)
+		result = -EACCES;
 	return result;
 }

-static int ncp_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *ncp_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
+	return mount_nodev(fs_type, flags, data, ncp_fill_super);
 }

 static struct file_system_type ncp_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ncpfs",
-	.get_sb		= ncp_get_sb,
+	.mount		= ncp_mount,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 84a8cfc4e38e..790e92a9ec63 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -17,15 +17,12 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/highuid.h>
-#include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>

-#include <linux/ncp_fs.h>
-
 #include <asm/uaccess.h>

-#include "ncplib_kernel.h"
+#include "ncp_fs.h"

 /* maximum limit for ncp_objectname_ioctl */
 #define NCP_OBJECT_NAME_MAX_LEN 4096
@@ -35,16 +32,11 @@
 #define NCP_PACKET_SIZE_INTERNAL 65536

 static int
-ncp_get_fs_info(struct ncp_server * server, struct file *file,
+ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
 		struct ncp_fs_info __user *arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ncp_fs_info info;

-	if (file_permission(file, MAY_WRITE) != 0
-	 && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info, arg, sizeof(info)))
 		return -EFAULT;

@@ -65,16 +57,11 @@ ncp_get_fs_info(struct ncp_server * server, struct file *file,
 }

 static int
-ncp_get_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		   struct ncp_fs_info_v2 __user * arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ncp_fs_info_v2 info2;

-	if (file_permission(file, MAY_WRITE) != 0
-	 && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info2, arg, sizeof(info2)))
 		return -EFAULT;

@@ -136,16 +123,11 @@ struct compat_ncp_privatedata_ioctl
 #define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl)

 static int
-ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		   struct compat_ncp_fs_info_v2 __user * arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
 	struct compat_ncp_fs_info_v2 info2;

-	if (file_permission(file, MAY_WRITE) != 0
-	 && current_uid() != server->m.mounted_uid)
-		return -EACCES;
-
 	if (copy_from_user(&info2, arg, sizeof(info2)))
 		return -EFAULT;

@@ -182,11 +164,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	struct nls_table *iocharset;
 	struct nls_table *oldset_io;
 	struct nls_table *oldset_cp;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-	if (server->root_setuped)
-		return -EBUSY;
+	int utf8;
+	int err;

 	if (copy_from_user(&user, arg, sizeof(user)))
 		return -EFAULT;
@@ -206,28 +185,40 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	user.iocharset[NCP_IOCSNAME_LEN] = 0;
 	if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) {
 		iocharset = load_nls_default();
-		NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 0;
 	} else if (!strcmp(user.iocharset, "utf8")) {
 		iocharset = load_nls_default();
-		NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 1;
 	} else {
 		iocharset = load_nls(user.iocharset);
 		if (!iocharset) {
 			unload_nls(codepage);
 			return -EBADRQC;
 		}
-		NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		utf8 = 0;
 	}

-	oldset_cp = server->nls_vol;
-	server->nls_vol = codepage;
-	oldset_io = server->nls_io;
-	server->nls_io = iocharset;
-
+	mutex_lock(&server->root_setup_lock);
+	if (server->root_setuped) {
+		oldset_cp = codepage;
+		oldset_io = iocharset;
+		err = -EBUSY;
+	} else {
+		if (utf8)
+			NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+		else
+			NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+		oldset_cp = server->nls_vol;
+		server->nls_vol = codepage;
+		oldset_io = server->nls_io;
+		server->nls_io = iocharset;
+		err = 0;
+	}
+	mutex_unlock(&server->root_setup_lock);
 	unload_nls(oldset_cp);
 	unload_nls(oldset_io);

-	return 0;
+	return err;
 }

 static int
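
The reworked ncp_set_charsets() follows a strict resource-swap discipline: load the replacement NLS tables before taking root_setup_lock, swap (or refuse) while holding it, and unload the displaced tables only after dropping it, so nothing that can sleep or fail happens inside the critical section. A minimal userspace sketch of the pattern, with strdup()/free() standing in for load_nls()/unload_nls() (illustrative names, not ncpfs API):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t setup_lock = PTHREAD_MUTEX_INITIALIZER;
static char *current_charset;

static int set_charset(const char *name)
{
	char *newcs = strdup(name);	/* prepare outside the lock */
	char *oldcs;

	if (!newcs)
		return -1;

	pthread_mutex_lock(&setup_lock);
	oldcs = current_charset;	/* swap under the lock */
	current_charset = newcs;
	pthread_mutex_unlock(&setup_lock);

	free(oldcs);			/* free outside the lock */
	return 0;
}

int main(void)
{
	set_charset("cp437");
	set_charset("utf8");
	printf("charset=%s\n", current_charset);
	free(current_charset);
	return 0;
}
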
@@ -237,6 +228,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 	int len;

 	memset(&user, 0, sizeof(user));
+	mutex_lock(&server->root_setup_lock);
 	if (server->nls_vol && server->nls_vol->charset) {
 		len = strlen(server->nls_vol->charset);
 		if (len > NCP_IOCSNAME_LEN)
@@ -254,6 +246,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 		strncpy(user.iocharset, server->nls_io->charset, len);
 		user.iocharset[len] = 0;
 	}
+	mutex_unlock(&server->root_setup_lock);

 	if (copy_to_user(arg, &user, sizeof(user)))
 		return -EFAULT;
@@ -261,25 +254,19 @@
 }
 #endif /* CONFIG_NCPFS_NLS */

-static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	int result;
 	struct ncp_ioctl_request request;
 	char* bouncebuffer;
 	void __user *argp = (void __user *)arg;
-	uid_t uid = current_uid();

 	switch (cmd) {
 #ifdef CONFIG_COMPAT
 	case NCP_IOC_NCPREQUEST_32:
 #endif
 	case NCP_IOC_NCPREQUEST:
-		if (file_permission(filp, MAY_WRITE) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
 #ifdef CONFIG_COMPAT
 		if (cmd == NCP_IOC_NCPREQUEST_32) {
 			struct compat_ncp_ioctl_request request32;
@@ -314,7 +301,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		server->current_size = request.size;
 		memcpy(server->packet, bouncebuffer, request.size);

-		result = ncp_request2(server, request.function, 
+		result = ncp_request2(server, request.function,
 				bouncebuffer, NCP_PACKET_SIZE_INTERNAL);
 		if (result < 0)
 			result = -EIO;
@@ -331,69 +318,69 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)

 	case NCP_IOC_CONN_LOGGED_IN:

-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
 		if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE))
 			return -EINVAL;
+		mutex_lock(&server->root_setup_lock);
 		if (server->root_setuped)
-			return -EBUSY;
-		server->root_setuped = 1;
-		return ncp_conn_logged_in(inode->i_sb);
+			result = -EBUSY;
+		else {
+			result = ncp_conn_logged_in(inode->i_sb);
+			if (result == 0)
+				server->root_setuped = 1;
+		}
+		mutex_unlock(&server->root_setup_lock);
+		return result;

 	case NCP_IOC_GET_FS_INFO:
-		return ncp_get_fs_info(server, filp, argp);
+		return ncp_get_fs_info(server, inode, argp);

 	case NCP_IOC_GET_FS_INFO_V2:
-		return ncp_get_fs_info_v2(server, filp, argp);
+		return ncp_get_fs_info_v2(server, inode, argp);

 #ifdef CONFIG_COMPAT
 	case NCP_IOC_GET_FS_INFO_V2_32:
-		return ncp_get_compat_fs_info_v2(server, filp, argp);
+		return ncp_get_compat_fs_info_v2(server, inode, argp);
 #endif
 	/* we have too many combinations of CONFIG_COMPAT,
 	 * CONFIG_64BIT and CONFIG_UID16, so just handle
 	 * any of the possible ioctls */
 	case NCP_IOC_GETMOUNTUID16:
-	case NCP_IOC_GETMOUNTUID32:
-	case NCP_IOC_GETMOUNTUID64:
-		if (file_permission(filp, MAY_READ) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (cmd == NCP_IOC_GETMOUNTUID16) {
-			u16 uid;
+	{
+		u16 uid;
+
 		SET_UID(uid, server->m.mounted_uid);
 		if (put_user(uid, (u16 __user *)argp))
 			return -EFAULT;
-		} else if (cmd == NCP_IOC_GETMOUNTUID32) {
-			if (put_user(server->m.mounted_uid,
-				(u32 __user *)argp))
-				return -EFAULT;
-		} else {
-			if (put_user(server->m.mounted_uid,
-				(u64 __user *)argp))
-				return -EFAULT;
-		}
+		return 0;
 	}
+	case NCP_IOC_GETMOUNTUID32:
+		if (put_user(server->m.mounted_uid,
+			     (u32 __user *)argp))
+			return -EFAULT;
+		return 0;
+	case NCP_IOC_GETMOUNTUID64:
+		if (put_user(server->m.mounted_uid,
+			     (u64 __user *)argp))
+			return -EFAULT;
 		return 0;

 	case NCP_IOC_GETROOT:
 	{
 		struct ncp_setroot_ioctl sr;

-		if (file_permission(filp, MAY_READ) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
+		result = -EACCES;
+		mutex_lock(&server->root_setup_lock);
 		if (server->m.mounted_vol[0]) {
 			struct dentry* dentry = inode->i_sb->s_root;

 			if (dentry) {
 				struct inode* s_inode = dentry->d_inode;

 				if (s_inode) {
 					sr.volNumber = NCP_FINFO(s_inode)->volNumber;
 					sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum;
 					sr.namespace = server->name_space[sr.volNumber];
+					result = 0;
 				} else
 					DPRINTK("ncpfs: s_root->d_inode==NULL\n");
 			} else
@@ -402,10 +389,12 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			sr.volNumber = -1;
 			sr.namespace = 0;
 			sr.dirEntNum = 0;
+			result = 0;
 		}
-		if (copy_to_user(argp, &sr, sizeof(sr)))
-			return -EFAULT;
-		return 0;
+		mutex_unlock(&server->root_setup_lock);
+		if (!result && copy_to_user(argp, &sr, sizeof(sr)))
+			result = -EFAULT;
+		return result;
 	}

 	case NCP_IOC_SETROOT:
@@ -416,103 +405,114 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		__le32 dosde;
 		struct dentry* dentry;

-		if (!capable(CAP_SYS_ADMIN))
-		{
-			return -EACCES;
-		}
-		if (server->root_setuped) return -EBUSY;
 		if (copy_from_user(&sr, argp, sizeof(sr)))
 			return -EFAULT;
-		if (sr.volNumber < 0) {
-			server->m.mounted_vol[0] = 0;
-			vnum = NCP_NUMBER_OF_VOLUMES;
-			de = 0;
-			dosde = 0;
-		} else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
-			return -EINVAL;
-		} else if (ncp_mount_subdir(server, sr.volNumber,
-					sr.namespace, sr.dirEntNum,
-					&vnum, &de, &dosde)) {
-			return -ENOENT;
-		}
-
-		dentry = inode->i_sb->s_root;
-		server->root_setuped = 1;
-		if (dentry) {
-			struct inode* s_inode = dentry->d_inode;
-
-			if (s_inode) {
-				NCP_FINFO(s_inode)->volNumber = vnum;
-				NCP_FINFO(s_inode)->dirEntNum = de;
-				NCP_FINFO(s_inode)->DosDirNum = dosde;
+		mutex_lock(&server->root_setup_lock);
+		if (server->root_setuped)
+			result = -EBUSY;
+		else {
+			if (sr.volNumber < 0) {
+				server->m.mounted_vol[0] = 0;
+				vnum = NCP_NUMBER_OF_VOLUMES;
+				de = 0;
+				dosde = 0;
+				result = 0;
+			} else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
+				result = -EINVAL;
+			} else if (ncp_mount_subdir(server, sr.volNumber,
+						    sr.namespace, sr.dirEntNum,
+						    &vnum, &de, &dosde)) {
+				result = -ENOENT;
 			} else
-				DPRINTK("ncpfs: s_root->d_inode==NULL\n");
-		} else
-			DPRINTK("ncpfs: s_root==NULL\n");
+				result = 0;
+
+			if (result == 0) {
+				dentry = inode->i_sb->s_root;
+				if (dentry) {
+					struct inode* s_inode = dentry->d_inode;
+
+					if (s_inode) {
+						NCP_FINFO(s_inode)->volNumber = vnum;
+						NCP_FINFO(s_inode)->dirEntNum = de;
+						NCP_FINFO(s_inode)->DosDirNum = dosde;
+						server->root_setuped = 1;
+					} else {
+						DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+						result = -EIO;
+					}
+				} else {
+					DPRINTK("ncpfs: s_root==NULL\n");
+					result = -EIO;
+				}
+			}
+			result = 0;
+		}
+		mutex_unlock(&server->root_setup_lock);

-		return 0;
+		return result;
 	}

 #ifdef CONFIG_NCPFS_PACKET_SIGNING
 	case NCP_IOC_SIGN_INIT:
-		if (file_permission(filp, MAY_WRITE) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (argp) {
-			if (server->sign_wanted)
-			{
-				struct ncp_sign_init sign;
+	{
+		struct ncp_sign_init sign;

+		if (argp)
 			if (copy_from_user(&sign, argp, sizeof(sign)))
 				return -EFAULT;
-			memcpy(server->sign_root,sign.sign_root,8);
-			memcpy(server->sign_last,sign.sign_last,16);
-			server->sign_active = 1;
+		ncp_lock_server(server);
+		mutex_lock(&server->rcv.creq_mutex);
+		if (argp) {
+			if (server->sign_wanted) {
+				memcpy(server->sign_root,sign.sign_root,8);
+				memcpy(server->sign_last,sign.sign_last,16);
+				server->sign_active = 1;
 			}
 			/* ignore when signatures not wanted */
 		} else {
 			server->sign_active = 0;
 		}
-		return 0;
-
+		mutex_unlock(&server->rcv.creq_mutex);
+		ncp_unlock_server(server);
+		return 0;
+	}
+
 	case NCP_IOC_SIGN_WANTED:
-		if (file_permission(filp, MAY_READ) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
-		if (put_user(server->sign_wanted, (int __user *)argp))
-			return -EFAULT;
-		return 0;
+	{
+		int state;
+
+		ncp_lock_server(server);
+		state = server->sign_wanted;
+		ncp_unlock_server(server);
+		if (put_user(state, (int __user *)argp))
+			return -EFAULT;
+		return 0;
+	}

 	case NCP_IOC_SET_SIGN_WANTED:
 	{
 		int newstate;

-		if (file_permission(filp, MAY_WRITE) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
 		/* get only low 8 bits... */
 		if (get_user(newstate, (unsigned char __user *)argp))
 			return -EFAULT;
+		result = 0;
+		ncp_lock_server(server);
 		if (server->sign_active) {
 			/* cannot turn signatures OFF when active */
-			if (!newstate) return -EINVAL;
+			if (!newstate)
+				result = -EINVAL;
 		} else {
 			server->sign_wanted = newstate != 0;
 		}
-		return 0;
+		ncp_unlock_server(server);
+		return result;
 	}

 #endif /* CONFIG_NCPFS_PACKET_SIGNING */

 #ifdef CONFIG_NCPFS_IOCTL_LOCKING
 	case NCP_IOC_LOCKUNLOCK:
-		if (file_permission(filp, MAY_WRITE) != 0
-		 && uid != server->m.mounted_uid)
-			return -EACCES;
-
 	{
 		struct ncp_lock_ioctl rqdata;

@@ -541,16 +541,13 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		{
 			return result;
 		}
-		result = -EIO;
-		if (!ncp_conn_valid(server))
-			goto outrel;
 		result = -EISDIR;
 		if (!S_ISREG(inode->i_mode))
 			goto outrel;
 		if (rqdata.cmd == NCP_LOCK_CLEAR)
 		{
 			result = ncp_ClearPhysicalRecord(NCP_SERVER(inode),
 					NCP_FINFO(inode)->file_handle,
 					rqdata.offset,
 					rqdata.length);
 			if (result > 0) result = 0;	/* no such lock */
@@ -573,7 +570,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 					rqdata.timeout);
 			if (result > 0) result = -EAGAIN;
 		}
 outrel:
 		ncp_inode_close(inode);
 		return result;
 	}
@@ -581,60 +578,62 @@ outrel:

 #ifdef CONFIG_COMPAT
 	case NCP_IOC_GETOBJECTNAME_32:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct compat_ncp_objectname_ioctl user;
 		size_t outl;

 		if (copy_from_user(&user, argp, sizeof(user)))
 			return -EFAULT;
+		down_read(&server->auth_rwsem);
 		user.auth_type = server->auth.auth_type;
 		outl = user.object_name_len;
 		user.object_name_len = server->auth.object_name_len;
 		if (outl > user.object_name_len)
 			outl = user.object_name_len;
+		result = 0;
 		if (outl) {
 			if (copy_to_user(compat_ptr(user.object_name),
 					 server->auth.object_name,
-					 outl)) return -EFAULT;
+					 outl))
+				result = -EFAULT;
 		}
-		if (copy_to_user(argp, &user, sizeof(user)))
-			return -EFAULT;
-		return 0;
+		up_read(&server->auth_rwsem);
+		if (!result && copy_to_user(argp, &user, sizeof(user)))
+			result = -EFAULT;
+		return result;
 	}
 #endif

 	case NCP_IOC_GETOBJECTNAME:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct ncp_objectname_ioctl user;
 		size_t outl;

 		if (copy_from_user(&user, argp, sizeof(user)))
 			return -EFAULT;
+		down_read(&server->auth_rwsem);
 		user.auth_type = server->auth.auth_type;
 		outl = user.object_name_len;
 		user.object_name_len = server->auth.object_name_len;
 		if (outl > user.object_name_len)
 			outl = user.object_name_len;
+		result = 0;
 		if (outl) {
 			if (copy_to_user(user.object_name,
 					 server->auth.object_name,
-					 outl)) return -EFAULT;
+					 outl))
+				result = -EFAULT;
 		}
-		if (copy_to_user(argp, &user, sizeof(user)))
-			return -EFAULT;
-		return 0;
+		up_read(&server->auth_rwsem);
+		if (!result && copy_to_user(argp, &user, sizeof(user)))
+			result = -EFAULT;
+		return result;
 	}

 #ifdef CONFIG_COMPAT
 	case NCP_IOC_SETOBJECTNAME_32:
 #endif
 	case NCP_IOC_SETOBJECTNAME:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct ncp_objectname_ioctl user;
 		void* newname;
@@ -666,9 +665,7 @@ outrel:
 		} else {
 			newname = NULL;
 		}
-		/* enter critical section */
-		/* maybe that kfree can sleep so do that this way */
-		/* it is at least more SMP friendly (in future...) */
+		down_write(&server->auth_rwsem);
 		oldname = server->auth.object_name;
 		oldnamelen = server->auth.object_name_len;
 		oldprivate = server->priv.data;
@@ -678,7 +675,7 @@ outrel:
 		server->auth.object_name = newname;
 		server->priv.len = 0;
 		server->priv.data = NULL;
-		/* leave critical section */
+		up_write(&server->auth_rwsem);
 		kfree(oldprivate);
 		kfree(oldname);
 		return 0;
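
The GETOBJECTNAME/SETOBJECTNAME hunks above replace the old "enter/leave critical section" comments with a real reader-writer semaphore: readers copy the stored name under down_read(), the writer swaps the pointers under down_write(), and the displaced buffers are freed only after the lock is released. A compact userspace rendering of the same protocol, with pthread_rwlock_t in place of the kernel rw_semaphore (illustrative names only):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_rwlock_t auth_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static char *object_name;

static void get_object_name(char *out, size_t outlen)
{
	pthread_rwlock_rdlock(&auth_rwsem);	/* many readers allowed */
	snprintf(out, outlen, "%s", object_name ? object_name : "");
	pthread_rwlock_unlock(&auth_rwsem);
}

static void set_object_name(const char *name)
{
	char *newname = strdup(name);	/* may sleep/fail: do it unlocked */
	char *oldname;

	pthread_rwlock_wrlock(&auth_rwsem);
	oldname = object_name;		/* swap under the write lock */
	object_name = newname;
	pthread_rwlock_unlock(&auth_rwsem);

	free(oldname);			/* never free under the lock */
}

int main(void)
{
	char buf[64];

	set_object_name("ADMIN.ACME");
	get_object_name(buf, sizeof(buf));
	printf("object name: %s\n", buf);
	free(object_name);
	return 0;
}
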
@@ -688,8 +685,6 @@ outrel:
 	case NCP_IOC_GETPRIVATEDATA_32:
 #endif
 	case NCP_IOC_GETPRIVATEDATA:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct ncp_privatedata_ioctl user;
 		size_t outl;
@@ -706,14 +701,20 @@ outrel:
 		if (copy_from_user(&user, argp, sizeof(user)))
 			return -EFAULT;

+		down_read(&server->auth_rwsem);
 		outl = user.len;
 		user.len = server->priv.len;
 		if (outl > user.len) outl = user.len;
+		result = 0;
 		if (outl) {
 			if (copy_to_user(user.data,
 					 server->priv.data,
-					 outl)) return -EFAULT;
+					 outl))
+				result = -EFAULT;
 		}
+		up_read(&server->auth_rwsem);
+		if (result)
+			return result;
 #ifdef CONFIG_COMPAT
 		if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
 			struct compat_ncp_privatedata_ioctl user32;
@@ -733,8 +734,6 @@ outrel:
 	case NCP_IOC_SETPRIVATEDATA_32:
 #endif
 	case NCP_IOC_SETPRIVATEDATA:
-		if (uid != server->m.mounted_uid)
-			return -EACCES;
 	{
 		struct ncp_privatedata_ioctl user;
 		void* new;
@@ -762,12 +761,12 @@ outrel:
 		} else {
 			new = NULL;
 		}
-		/* enter critical section */
+		down_write(&server->auth_rwsem);
 		old = server->priv.data;
 		oldlen = server->priv.len;
 		server->priv.len = user.len;
 		server->priv.data = new;
-		/* leave critical section */
+		up_write(&server->auth_rwsem);
 		kfree(old);
 		return 0;
 	}
@@ -775,17 +774,13 @@ outrel:
 #ifdef CONFIG_NCPFS_NLS
 	case NCP_IOC_SETCHARSETS:
 		return ncp_set_charsets(server, argp);

 	case NCP_IOC_GETCHARSETS:
 		return ncp_get_charsets(server, argp);

 #endif /* CONFIG_NCPFS_NLS */

 	case NCP_IOC_SETDENTRYTTL:
-		if (file_permission(filp, MAY_WRITE) != 0 &&
-		    uid != server->m.mounted_uid)
-			return -EACCES;
-
 	{
 		u_int32_t user;

@@ -795,13 +790,13 @@ outrel:
 		if (user > 20000)
 			return -EINVAL;
 		user = (user * HZ) / 1000;
-		server->dentry_ttl = user;
+		atomic_set(&server->dentry_ttl, user);
 		return 0;
 	}

 	case NCP_IOC_GETDENTRYTTL:
 	{
-		u_int32_t user = (server->dentry_ttl * 1000) / HZ;
+		u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ;
 		if (copy_to_user(argp, &user, sizeof(user)))
 			return -EFAULT;
 		return 0;
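
dentry_ttl becomes an atomic_t here because one ioctl writes it while every dentry revalidation reads it; a single atomic load/store removes any need for a lock. The same idea expressed with C11 atomics (HZ is given a placeholder value; the kernel constant is configuration-dependent):

#include <stdatomic.h>
#include <stdio.h>

#define HZ 100				/* placeholder for the kernel constant */

static _Atomic unsigned int dentry_ttl;	/* in "jiffies" */

static void set_ttl_ms(unsigned int ms)
{
	atomic_store(&dentry_ttl, ms * HZ / 1000);
}

static unsigned int get_ttl_ms(void)
{
	return atomic_load(&dentry_ttl) * 1000 / HZ;
}

int main(void)
{
	set_ttl_ms(250);
	printf("ttl=%ums\n", get_ttl_ms());	/* 250 */
	return 0;
}
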
@@ -811,59 +806,103 @@ outrel:
 	return -EINVAL;
 }

-static int ncp_ioctl_need_write(unsigned int cmd)
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ncp_server *server = NCP_SERVER(inode);
+	uid_t uid = current_uid();
+	int need_drop_write = 0;
+	long ret;
+
 	switch (cmd) {
-	case NCP_IOC_GET_FS_INFO:
-	case NCP_IOC_GET_FS_INFO_V2:
-	case NCP_IOC_NCPREQUEST:
-	case NCP_IOC_SETDENTRYTTL:
-	case NCP_IOC_SIGN_INIT:
-	case NCP_IOC_LOCKUNLOCK:
-	case NCP_IOC_SET_SIGN_WANTED:
-		return 1;
-	case NCP_IOC_GETOBJECTNAME:
-	case NCP_IOC_SETOBJECTNAME:
-	case NCP_IOC_GETPRIVATEDATA:
-	case NCP_IOC_SETPRIVATEDATA:
 	case NCP_IOC_SETCHARSETS:
-	case NCP_IOC_GETCHARSETS:
 	case NCP_IOC_CONN_LOGGED_IN:
-	case NCP_IOC_GETDENTRYTTL:
-	case NCP_IOC_GETMOUNTUID2:
-	case NCP_IOC_SIGN_WANTED:
-	case NCP_IOC_GETROOT:
 	case NCP_IOC_SETROOT:
-		return 0;
-	default:
-		/* unknown IOCTL command, assume write */
-		return 1;
+		if (!capable(CAP_SYS_ADMIN)) {
+			ret = -EACCES;
+			goto out;
+		}
+		break;
 	}
-}
-
-long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	long ret;
-
-	lock_kernel();
-	if (ncp_ioctl_need_write(cmd)) {
+	if (server->m.mounted_uid != uid) {
+		switch (cmd) {
 		/*
-		 * inside the ioctl(), any failures which
-		 * are because of file_permission() are
-		 * -EACCESS, so it seems consistent to keep
-		 * that here.
+		 * Only mount owner can issue these ioctls. Information
+		 * necessary to authenticate to other NDS servers are
+		 * stored here.
 		 */
-		if (mnt_want_write(filp->f_path.mnt)) {
+		case NCP_IOC_GETOBJECTNAME:
+		case NCP_IOC_SETOBJECTNAME:
+		case NCP_IOC_GETPRIVATEDATA:
+		case NCP_IOC_SETPRIVATEDATA:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GETOBJECTNAME_32:
+		case NCP_IOC_SETOBJECTNAME_32:
+		case NCP_IOC_GETPRIVATEDATA_32:
+		case NCP_IOC_SETPRIVATEDATA_32:
+#endif
 			ret = -EACCES;
 			goto out;
+		/*
+		 * These require write access on the inode if user id
+		 * does not match. Note that they do not write to the
+		 * file... But old code did mnt_want_write, so I keep
+		 * it as is. Of course not for mountpoint owner, as
+		 * that breaks read-only mounts altogether as ncpmount
+		 * needs working NCP_IOC_NCPREQUEST and
+		 * NCP_IOC_GET_FS_INFO. Some of these codes (setdentryttl,
+		 * signinit, setsignwanted) should be probably restricted
+		 * to owner only, or even more to CAP_SYS_ADMIN).
+		 */
+		case NCP_IOC_GET_FS_INFO:
+		case NCP_IOC_GET_FS_INFO_V2:
+		case NCP_IOC_NCPREQUEST:
+		case NCP_IOC_SETDENTRYTTL:
+		case NCP_IOC_SIGN_INIT:
+		case NCP_IOC_LOCKUNLOCK:
+		case NCP_IOC_SET_SIGN_WANTED:
+#ifdef CONFIG_COMPAT
+		case NCP_IOC_GET_FS_INFO_V2_32:
+		case NCP_IOC_NCPREQUEST_32:
+#endif
+			ret = mnt_want_write_file(filp);
+			if (ret)
+				goto out;
+			need_drop_write = 1;
+			ret = inode_permission(inode, MAY_WRITE);
+			if (ret)
+				goto outDropWrite;
+			break;
+		/*
+		 * Read access required.
+		 */
+		case NCP_IOC_GETMOUNTUID16:
+		case NCP_IOC_GETMOUNTUID32:
+		case NCP_IOC_GETMOUNTUID64:
+		case NCP_IOC_GETROOT:
+		case NCP_IOC_SIGN_WANTED:
+			ret = inode_permission(inode, MAY_READ);
+			if (ret)
+				goto out;
+			break;
+		/*
+		 * Anybody can read these.
+		 */
+		case NCP_IOC_GETCHARSETS:
+		case NCP_IOC_GETDENTRYTTL:
+		default:
+		/* Three codes below are protected by CAP_SYS_ADMIN above. */
+		case NCP_IOC_SETCHARSETS:
+		case NCP_IOC_CONN_LOGGED_IN:
+		case NCP_IOC_SETROOT:
+			break;
 		}
 	}
-	ret = __ncp_ioctl(filp, cmd, arg);
-	if (ncp_ioctl_need_write(cmd))
+	ret = __ncp_ioctl(inode, cmd, arg);
+outDropWrite:
+	if (need_drop_write)
 		mnt_drop_write(filp->f_path.mnt);
-
 out:
-	unlock_kernel();
 	return ret;
 }

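
The rewritten ncp_ioctl() above concentrates all access control in one place: each command is classified up front (CAP_SYS_ADMIN, write access, read access, or no check at all) and only then dispatched to __ncp_ioctl(). A toy classifier showing the shape of that dispatch (the numeric command values and the enum are invented for illustration; the real code switches directly on the NCP_IOC_* constants):

#include <stdio.h>

enum ioctl_class {
	NEEDS_ADMIN,	/* capable(CAP_SYS_ADMIN) */
	NEEDS_WRITE,	/* owner, or MAY_WRITE + mnt_want_write_file() */
	NEEDS_READ,	/* owner, or MAY_READ */
	NEEDS_NOTHING,	/* anybody */
};

static enum ioctl_class classify(unsigned int cmd)
{
	switch (cmd) {
	case 1: return NEEDS_ADMIN;	/* e.g. SETROOT */
	case 2: return NEEDS_WRITE;	/* e.g. NCPREQUEST */
	case 3: return NEEDS_READ;	/* e.g. GETROOT */
	default: return NEEDS_NOTHING;	/* e.g. GETDENTRYTTL */
	}
}

int main(void)
{
	static const char *name[] = { "admin", "write", "read", "nothing" };

	for (unsigned int cmd = 1; cmd <= 4; cmd++)
		printf("cmd %u requires: %s\n", cmd, name[classify(cmd)]);
	return 0;
}
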
@@ -872,10 +911,8 @@ long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	long ret;

-	lock_kernel();
 	arg = (unsigned long) compat_ptr(arg);
 	ret = ncp_ioctl(file, cmd, arg);
-	unlock_kernel();
 	return ret;
 }
 #endif
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 56f5b3a0e1ee..a7c07b44b100 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -16,12 +16,12 @@
 #include <linux/mman.h>
 #include <linux/string.h>
 #include <linux/fcntl.h>
-#include <linux/ncp_fs.h>

-#include "ncplib_kernel.h"
 #include <asm/uaccess.h>
 #include <asm/system.h>

+#include "ncp_fs.h"
+
 /*
  * Fill in the supplied page for mmap
  * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
new file mode 100644
index 000000000000..31831afe1c3b
--- /dev/null
+++ b/fs/ncpfs/ncp_fs.h
@@ -0,0 +1,98 @@
1#include <linux/ncp_fs.h>
2#include "ncp_fs_i.h"
3#include "ncp_fs_sb.h"
4
 5/* defined so that it is easy to change PRINTK into any {*}PRINTK variant */
6#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
7
8#undef NCPFS_PARANOIA
9#ifdef NCPFS_PARANOIA
10#define PPRINTK(format, args...) PRINTK(format , ## args)
11#else
12#define PPRINTK(format, args...)
13#endif
14
15#ifndef DEBUG_NCP
16#define DEBUG_NCP 0
17#endif
18#if DEBUG_NCP > 0
19#define DPRINTK(format, args...) PRINTK(format , ## args)
20#else
21#define DPRINTK(format, args...)
22#endif
23#if DEBUG_NCP > 1
24#define DDPRINTK(format, args...) PRINTK(format , ## args)
25#else
26#define DDPRINTK(format, args...)
27#endif
28
29#define NCP_MAX_RPC_TIMEOUT (6*HZ)
30
31
32struct ncp_entry_info {
33 struct nw_info_struct i;
34 ino_t ino;
35 int opened;
36 int access;
37 unsigned int volume;
38 __u8 file_handle[6];
39};
40
41static inline struct ncp_server *NCP_SBP(const struct super_block *sb)
42{
43 return sb->s_fs_info;
44}
45
46#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb)
47static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode)
48{
49 return container_of(inode, struct ncp_inode_info, vfs_inode);
50}
51
52/* linux/fs/ncpfs/inode.c */
53int ncp_notify_change(struct dentry *, struct iattr *);
54struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *);
55void ncp_update_inode(struct inode *, struct ncp_entry_info *);
56void ncp_update_inode2(struct inode *, struct ncp_entry_info *);
57
58/* linux/fs/ncpfs/dir.c */
59extern const struct inode_operations ncp_dir_inode_operations;
60extern const struct file_operations ncp_dir_operations;
61extern const struct dentry_operations ncp_dentry_operations;
62int ncp_conn_logged_in(struct super_block *);
63int ncp_date_dos2unix(__le16 time, __le16 date);
64void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date);
65
66/* linux/fs/ncpfs/ioctl.c */
67long ncp_ioctl(struct file *, unsigned int, unsigned long);
68long ncp_compat_ioctl(struct file *, unsigned int, unsigned long);
69
70/* linux/fs/ncpfs/sock.c */
71int ncp_request2(struct ncp_server *server, int function,
72 void* reply, int max_reply_size);
73static inline int ncp_request(struct ncp_server *server, int function) {
74 return ncp_request2(server, function, server->packet, server->packet_size);
75}
76int ncp_connect(struct ncp_server *server);
77int ncp_disconnect(struct ncp_server *server);
78void ncp_lock_server(struct ncp_server *server);
79void ncp_unlock_server(struct ncp_server *server);
80
81/* linux/fs/ncpfs/symlink.c */
82#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
83extern const struct address_space_operations ncp_symlink_aops;
84int ncp_symlink(struct inode*, struct dentry*, const char*);
85#endif
86
87/* linux/fs/ncpfs/file.c */
88extern const struct inode_operations ncp_file_inode_operations;
89extern const struct file_operations ncp_file_operations;
90int ncp_make_open(struct inode *, int);
91
92/* linux/fs/ncpfs/mmap.c */
93int ncp_mmap(struct file *, struct vm_area_struct *);
94
95/* linux/fs/ncpfs/ncplib_kernel.c */
96int ncp_make_closed(struct inode *);
97
98#include "ncplib_kernel.h"
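
The accessors in this new header tie the ncpfs private state to the generic VFS objects: NCP_SBP() reads it back out of sb->s_fs_info, NCP_SERVER() goes via inode->i_sb, and NCP_FINFO() inverts the embedded-inode layout with container_of(). A short hypothetical routine showing how ncpfs code typically recovers its state from an inode:

/* Hypothetical helper; the accessors and fields are the ones declared
 * in ncp_fs.h, ncp_fs_i.h and ncp_fs_sb.h. */
static int example_private_state(struct inode *inode)
{
	struct ncp_server *server = NCP_SERVER(inode);   /* sb->s_fs_info */
	struct ncp_inode_info *ni = NCP_FINFO(inode);    /* container_of() */

	DPRINTK("vol %u, dirEntNum %u\n",
		ni->volNumber, le32_to_cpu(ni->dirEntNum));
	return ncp_conn_valid(server) ? 0 : -EIO;
}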
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
new file mode 100644
index 000000000000..4b0bec477846
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -0,0 +1,29 @@
1/*
2 * ncp_fs_i.h
3 *
4 * Copyright (C) 1995 Volker Lendecke
5 *
6 */
7
8#ifndef _LINUX_NCP_FS_I
9#define _LINUX_NCP_FS_I
10
11/*
12 * This is the ncpfs part of the inode structure. This must contain
13 * all the information we need to work with an inode after creation.
14 */
15struct ncp_inode_info {
16 __le32 dirEntNum;
17 __le32 DosDirNum;
18 __u8 volNumber;
19 __le32 nwattr;
20 struct mutex open_mutex;
21 atomic_t opened;
22 int access;
23 int flags;
24#define NCPI_KLUDGE_SYMLINK 0x0001
25 __u8 file_handle[6];
26 struct inode vfs_inode;
27};
28
29#endif /* _LINUX_NCP_FS_I */
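
struct ncp_inode_info embeds the VFS inode, the usual pattern for per-filesystem inode state: alloc_inode allocates the outer structure and hands the VFS only the embedded member, while container_of() (NCP_FINFO() in ncp_fs.h) maps back. A generic sketch of the allocation side, not the actual ncpfs code:

/* Generic sketch of the embedded-inode pattern; the real ncpfs
 * alloc_inode lives in inode.c and initializes more fields. */
static struct kmem_cache *example_inode_cachep;

static struct inode *example_alloc_inode(struct super_block *sb)
{
	struct ncp_inode_info *ni;

	ni = kmem_cache_alloc(example_inode_cachep, GFP_KERNEL);
	if (!ni)
		return NULL;
	mutex_init(&ni->open_mutex);
	atomic_set(&ni->opened, 0);
	return &ni->vfs_inode;   /* the VFS only ever sees this member */
}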
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
new file mode 100644
index 000000000000..4af803f13516
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -0,0 +1,176 @@
1/*
2 * ncp_fs_sb.h
3 *
4 * Copyright (C) 1995, 1996 by Volker Lendecke
5 *
6 */
7
8#ifndef _NCP_FS_SB
9#define _NCP_FS_SB
10
11#include <linux/types.h>
12#include <linux/ncp_mount.h>
13#include <linux/net.h>
14#include <linux/mutex.h>
15#include <linux/backing-dev.h>
16#include <linux/workqueue.h>
17
18#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */
19
20struct sock;
21
22struct ncp_mount_data_kernel {
23 unsigned long flags; /* NCP_MOUNT_* flags */
24 unsigned int int_flags; /* internal flags */
25#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001
26 __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */
27 struct pid *wdog_pid; /* Who cares for our watchdog packets? */
28 unsigned int ncp_fd; /* The socket to the ncp port */
 29 unsigned int time_out; /* How long should I wait after
 30 sending an NCP request? */
31 unsigned int retry_count; /* And how often should I retry? */
32 unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
33 __kernel_uid32_t uid;
34 __kernel_gid32_t gid;
35 __kernel_mode_t file_mode;
36 __kernel_mode_t dir_mode;
37 int info_fd;
38};
39
40struct ncp_server {
41
 42 struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of
 43 interest to us later, so we store
 44 it completely. */
45
46 __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
47
48 struct file *ncp_filp; /* File pointer to ncp socket */
49 struct socket *ncp_sock;/* ncp socket */
50 struct file *info_filp;
51 struct socket *info_sock;
52
53 u8 sequence;
54 u8 task;
55 u16 connection; /* Remote connection number */
56
57 u8 completion; /* Status message from server */
58 u8 conn_status; /* Bit 4 = 1 ==> Server going down, no
59 requests allowed anymore.
60 Bit 0 = 1 ==> Server is down. */
61
62 int buffer_size; /* Negotiated bufsize */
63
64 int reply_size; /* Size of last reply */
65
66 int packet_size;
67 unsigned char *packet; /* Here we prepare requests and
68 receive replies */
69 unsigned char *txbuf; /* Storage for current request */
70 unsigned char *rxbuf; /* Storage for reply to current request */
71
72 int lock; /* To prevent mismatch in protocols. */
73 struct mutex mutex;
74
75 int current_size; /* for packet preparation */
76 int has_subfunction;
77 int ncp_reply_size;
78
79 int root_setuped;
80 struct mutex root_setup_lock;
81
82 /* info for packet signing */
83 int sign_wanted; /* 1=Server needs signed packets */
84 int sign_active; /* 0=don't do signing, 1=do */
85 char sign_root[8]; /* generated from password and encr. key */
86 char sign_last[16];
87
88 /* Authentication info: NDS or BINDERY, username */
89 struct {
90 int auth_type;
91 size_t object_name_len;
92 void* object_name;
93 int object_type;
94 } auth;
95 /* Password info */
96 struct {
97 size_t len;
98 void* data;
99 } priv;
100 struct rw_semaphore auth_rwsem;
101
102 /* nls info: codepage for volume and charset for I/O */
103 struct nls_table *nls_vol;
104 struct nls_table *nls_io;
105
106 /* maximum age in jiffies */
107 atomic_t dentry_ttl;
108
109 /* miscellaneous */
110 unsigned int flags;
111
 112 spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq in STREAM mode */
113
114 void (*data_ready)(struct sock* sk, int len);
115 void (*error_report)(struct sock* sk);
116 void (*write_space)(struct sock* sk); /* STREAM mode only */
117 struct {
118 struct work_struct tq; /* STREAM/DGRAM: data/error ready */
119 struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */
120 struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */
121
122 unsigned int state; /* STREAM only: receiver state */
123 struct {
124 __u32 magic __packed;
125 __u32 len __packed;
126 __u16 type __packed;
127 __u16 p1 __packed;
128 __u16 p2 __packed;
129 __u16 p3 __packed;
130 __u16 type2 __packed;
131 } buf; /* STREAM only: temporary buffer */
132 unsigned char* ptr; /* STREAM only: pointer to data */
133 size_t len; /* STREAM only: length of data to receive */
134 } rcv;
135 struct {
136 struct list_head requests; /* STREAM only: queued requests */
137 struct work_struct tq; /* STREAM only: transmitter ready */
138 struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */
139 } tx;
140 struct timer_list timeout_tm; /* DGRAM only: timeout timer */
141 struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */
142 int timeout_last; /* DGRAM only: current timeout length */
143 int timeout_retries; /* DGRAM only: retries left */
144 struct {
145 size_t len;
146 __u8 data[128];
147 } unexpected_packet;
148 struct backing_dev_info bdi;
149};
150
151extern void ncp_tcp_rcv_proc(struct work_struct *work);
152extern void ncp_tcp_tx_proc(struct work_struct *work);
153extern void ncpdgram_rcv_proc(struct work_struct *work);
154extern void ncpdgram_timeout_proc(struct work_struct *work);
155extern void ncpdgram_timeout_call(unsigned long server);
156extern void ncp_tcp_data_ready(struct sock* sk, int len);
157extern void ncp_tcp_write_space(struct sock* sk);
158extern void ncp_tcp_error_report(struct sock* sk);
159
160#define NCP_FLAG_UTF8 1
161
162#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag))
163#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag))
164#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag))
165
166static inline int ncp_conn_valid(struct ncp_server *server)
167{
168 return ((server->conn_status & 0x11) == 0);
169}
170
171static inline void ncp_invalidate_conn(struct ncp_server *server)
172{
173 server->conn_status |= 0x01;
174}
175
176#endif
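
ncp_conn_valid() above treats conn_status as a bit set, matching the comment in struct ncp_server: bit 0 (0x01) means the server is down, bit 4 (0x10) means it is going down and rejecting new requests, and the 0x11 mask tests both at once. Spelled out:

/* conn_status bit semantics, per the struct ncp_server comment above. */
server->conn_status = 0x00;   /* fresh connection                              */
                              /* ncp_conn_valid(): (0x00 & 0x11) == 0, valid   */
server->conn_status = 0x10;   /* server announced it is going down             */
                              /* ncp_conn_valid(): (0x10 & 0x11) != 0, invalid */
ncp_invalidate_conn(server);  /* ORs in 0x01: server now treated as down       */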
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 0ec6237a5970..981a95617fc9 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -11,7 +11,7 @@
11 11
12 12
13 13
14#include "ncplib_kernel.h" 14#include "ncp_fs.h"
15 15
16static inline void assert_server_locked(struct ncp_server *server) 16static inline void assert_server_locked(struct ncp_server *server)
17{ 17{
@@ -107,17 +107,17 @@ ncp_reply_data(struct ncp_server *server, int offset)
107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]); 107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]);
108} 108}
109 109
110static inline u8 BVAL(void *data) 110static inline u8 BVAL(const void *data)
111{ 111{
112 return *(u8 *)data; 112 return *(const u8 *)data;
113} 113}
114 114
115static u8 ncp_reply_byte(struct ncp_server *server, int offset) 115static u8 ncp_reply_byte(struct ncp_server *server, int offset)
116{ 116{
117 return *(u8 *)ncp_reply_data(server, offset); 117 return *(const u8 *)ncp_reply_data(server, offset);
118} 118}
119 119
120static inline u16 WVAL_LH(void *data) 120static inline u16 WVAL_LH(const void *data)
121{ 121{
122 return get_unaligned_le16(data); 122 return get_unaligned_le16(data);
123} 123}
@@ -134,7 +134,7 @@ ncp_reply_be16(struct ncp_server *server, int offset)
134 return get_unaligned_be16(ncp_reply_data(server, offset)); 134 return get_unaligned_be16(ncp_reply_data(server, offset));
135} 135}
136 136
137static inline u32 DVAL_LH(void *data) 137static inline u32 DVAL_LH(const void *data)
138{ 138{
139 return get_unaligned_le32(data); 139 return get_unaligned_le32(data);
140} 140}
@@ -349,9 +349,9 @@ int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) {
349 return result; 349 return result;
350} 350}
351 351
352void ncp_extract_file_info(void *structure, struct nw_info_struct *target) 352void ncp_extract_file_info(const void *structure, struct nw_info_struct *target)
353{ 353{
354 __u8 *name_len; 354 const __u8 *name_len;
355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen); 355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen);
356 356
357 memcpy(target, structure, info_struct_size); 357 memcpy(target, structure, info_struct_size);
@@ -364,7 +364,7 @@ void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
364} 364}
365 365
366#ifdef CONFIG_NCPFS_NFS_NS 366#ifdef CONFIG_NCPFS_NFS_NS
367static inline void ncp_extract_nfs_info(unsigned char *structure, 367static inline void ncp_extract_nfs_info(const unsigned char *structure,
368 struct nw_nfs_info *target) 368 struct nw_nfs_info *target)
369{ 369{
370 target->mode = DVAL_LH(structure); 370 target->mode = DVAL_LH(structure);
@@ -417,7 +417,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
417 * Returns information for a (one-component) name relative to 417 * Returns information for a (one-component) name relative to
418 * the specified directory. 418 * the specified directory.
419 */ 419 */
420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, char *path, 420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path,
421 struct nw_info_struct *target) 421 struct nw_info_struct *target)
422{ 422{
423 __u8 volnum = NCP_FINFO(dir)->volNumber; 423 __u8 volnum = NCP_FINFO(dir)->volNumber;
@@ -452,16 +452,16 @@ out:
452#ifdef CONFIG_NCPFS_NFS_NS 452#ifdef CONFIG_NCPFS_NFS_NS
453static int 453static int
454ncp_obtain_DOS_dir_base(struct ncp_server *server, 454ncp_obtain_DOS_dir_base(struct ncp_server *server,
455 __u8 volnum, __le32 dirent, 455 __u8 ns, __u8 volnum, __le32 dirent,
456 char *path, /* At most 1 component */ 456 const char *path, /* At most 1 component */
457 __le32 *DOS_dir_base) 457 __le32 *DOS_dir_base)
458{ 458{
459 int result; 459 int result;
460 460
461 ncp_init_request(server); 461 ncp_init_request(server);
462 ncp_add_byte(server, 6); /* subfunction */ 462 ncp_add_byte(server, 6); /* subfunction */
463 ncp_add_byte(server, server->name_space[volnum]); 463 ncp_add_byte(server, ns);
464 ncp_add_byte(server, server->name_space[volnum]); 464 ncp_add_byte(server, ns);
465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ 465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
466 ncp_add_dword(server, RIM_DIRECTORY); 466 ncp_add_dword(server, RIM_DIRECTORY);
467 ncp_add_handle_path(server, volnum, dirent, 1, path); 467 ncp_add_handle_path(server, volnum, dirent, 1, path);
@@ -523,10 +523,27 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */ 523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */
524} 524}
525 525
526int
527ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
528{
529 int ns = ncp_get_known_namespace(server, volume);
530
531 if (ret_ns)
532 *ret_ns = ns;
533
534 DPRINTK("lookup_vol: namespace[%d] = %d\n",
535 volume, server->name_space[volume]);
536
537 if (server->name_space[volume] == ns)
538 return 0;
539 server->name_space[volume] = ns;
540 return 1;
541}
542
526static int 543static int
527ncp_ObtainSpecificDirBase(struct ncp_server *server, 544ncp_ObtainSpecificDirBase(struct ncp_server *server,
528 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base, 545 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base,
529 char *path, /* At most 1 component */ 546 const char *path, /* At most 1 component */
530 __le32 *dirEntNum, __le32 *DosDirNum) 547 __le32 *dirEntNum, __le32 *DosDirNum)
531{ 548{
532 int result; 549 int result;
@@ -560,14 +577,13 @@ ncp_mount_subdir(struct ncp_server *server,
560{ 577{
561 int dstNS; 578 int dstNS;
562 int result; 579 int result;
563 580
564 dstNS = ncp_get_known_namespace(server, volNumber); 581 ncp_update_known_namespace(server, volNumber, &dstNS);
565 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, 582 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber,
566 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0) 583 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0)
567 { 584 {
568 return result; 585 return result;
569 } 586 }
570 server->name_space[volNumber] = dstNS;
571 *volume = volNumber; 587 *volume = volNumber;
572 server->m.mounted_vol[1] = 0; 588 server->m.mounted_vol[1] = 0;
573 server->m.mounted_vol[0] = 'X'; 589 server->m.mounted_vol[0] = 'X';
@@ -575,11 +591,10 @@ ncp_mount_subdir(struct ncp_server *server,
575} 591}
576 592
577int 593int
578ncp_get_volume_root(struct ncp_server *server, const char *volname, 594ncp_get_volume_root(struct ncp_server *server,
579 __u32* volume, __le32* dirent, __le32* dosdirent) 595 const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent)
580{ 596{
581 int result; 597 int result;
582 __u8 volnum;
583 598
584 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); 599 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
585 600
@@ -601,21 +616,14 @@ ncp_get_volume_root(struct ncp_server *server, const char *volname,
601 return result; 616 return result;
602 } 617 }
603 *dirent = *dosdirent = ncp_reply_dword(server, 4); 618 *dirent = *dosdirent = ncp_reply_dword(server, 4);
604 volnum = ncp_reply_byte(server, 8); 619 *volume = ncp_reply_byte(server, 8);
605 ncp_unlock_server(server); 620 ncp_unlock_server(server);
606 *volume = volnum;
607
608 server->name_space[volnum] = ncp_get_known_namespace(server, volnum);
609
610 DPRINTK("lookup_vol: namespace[%d] = %d\n",
611 volnum, server->name_space[volnum]);
612
613 return 0; 621 return 0;
614} 622}
615 623
616int 624int
617ncp_lookup_volume(struct ncp_server *server, const char *volname, 625ncp_lookup_volume(struct ncp_server *server,
618 struct nw_info_struct *target) 626 const char *volname, struct nw_info_struct *target)
619{ 627{
620 int result; 628 int result;
621 629
@@ -625,6 +633,7 @@ ncp_lookup_volume(struct ncp_server *server, const char *volname,
625 if (result) { 633 if (result) {
626 return result; 634 return result;
627 } 635 }
636 ncp_update_known_namespace(server, target->volNumber, NULL);
628 target->nameLen = strlen(volname); 637 target->nameLen = strlen(volname);
629 memcpy(target->entryName, volname, target->nameLen+1); 638 memcpy(target->entryName, volname, target->nameLen+1);
630 target->attributes = aDIR; 639 target->attributes = aDIR;
@@ -676,8 +685,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
676{ 685{
677 int result = 0; 686 int result = 0;
678 687
688 ncp_init_request(server);
679 if (server->name_space[volnum] == NW_NS_NFS) { 689 if (server->name_space[volnum] == NW_NS_NFS) {
680 ncp_init_request(server);
681 ncp_add_byte(server, 25); /* subfunction */ 690 ncp_add_byte(server, 25); /* subfunction */
682 ncp_add_byte(server, server->name_space[volnum]); 691 ncp_add_byte(server, server->name_space[volnum]);
683 ncp_add_byte(server, NW_NS_NFS); 692 ncp_add_byte(server, NW_NS_NFS);
@@ -690,8 +699,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
690 ncp_add_dword_lh(server, 1); /* nlinks */ 699 ncp_add_dword_lh(server, 1); /* nlinks */
691 ncp_add_dword_lh(server, rdev); 700 ncp_add_dword_lh(server, rdev);
692 result = ncp_request(server, 87); 701 result = ncp_request(server, 87);
693 ncp_unlock_server(server);
694 } 702 }
703 ncp_unlock_server(server);
695 return result; 704 return result;
696} 705}
697#endif 706#endif
@@ -700,7 +709,7 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
700static int 709static int
701ncp_DeleteNSEntry(struct ncp_server *server, 710ncp_DeleteNSEntry(struct ncp_server *server,
702 __u8 have_dir_base, __u8 volnum, __le32 dirent, 711 __u8 have_dir_base, __u8 volnum, __le32 dirent,
703 char* name, __u8 ns, __le16 attr) 712 const char* name, __u8 ns, __le16 attr)
704{ 713{
705 int result; 714 int result;
706 715
@@ -734,23 +743,25 @@ ncp_del_file_or_subdir2(struct ncp_server *server,
734 743
735int 744int
736ncp_del_file_or_subdir(struct ncp_server *server, 745ncp_del_file_or_subdir(struct ncp_server *server,
737 struct inode *dir, char *name) 746 struct inode *dir, const char *name)
738{ 747{
739 __u8 volnum = NCP_FINFO(dir)->volNumber; 748 __u8 volnum = NCP_FINFO(dir)->volNumber;
740 __le32 dirent = NCP_FINFO(dir)->dirEntNum; 749 __le32 dirent = NCP_FINFO(dir)->dirEntNum;
750 int name_space;
741 751
752 name_space = server->name_space[volnum];
742#ifdef CONFIG_NCPFS_NFS_NS 753#ifdef CONFIG_NCPFS_NFS_NS
743 if (server->name_space[volnum]==NW_NS_NFS) 754 if (name_space == NW_NS_NFS)
744 { 755 {
745 int result; 756 int result;
746 757
747 result=ncp_obtain_DOS_dir_base(server, volnum, dirent, name, &dirent); 758 result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent);
748 if (result) return result; 759 if (result) return result;
749 return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006)); 760 name = NULL;
761 name_space = NW_NS_DOS;
750 } 762 }
751 else
752#endif /* CONFIG_NCPFS_NFS_NS */ 763#endif /* CONFIG_NCPFS_NFS_NS */
753 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, server->name_space[volnum], cpu_to_le16(0x8006)); 764 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006));
754} 765}
755 766
756static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6]) 767static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
@@ -765,7 +776,7 @@ static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
765/* If both dir and name are NULL, then in target there's already a 776/* If both dir and name are NULL, then in target there's already a
766 looked-up entry that wants to be opened. */ 777 looked-up entry that wants to be opened. */
767int ncp_open_create_file_or_subdir(struct ncp_server *server, 778int ncp_open_create_file_or_subdir(struct ncp_server *server,
768 struct inode *dir, char *name, 779 struct inode *dir, const char *name,
769 int open_create_mode, 780 int open_create_mode,
770 __le32 create_attributes, 781 __le32 create_attributes,
771 __le16 desired_acc_rights, 782 __le16 desired_acc_rights,
@@ -890,8 +901,8 @@ int ncp_search_for_fileset(struct ncp_server *server,
890 901
891static int 902static int
892ncp_RenameNSEntry(struct ncp_server *server, 903ncp_RenameNSEntry(struct ncp_server *server,
893 struct inode *old_dir, char *old_name, __le16 old_type, 904 struct inode *old_dir, const char *old_name, __le16 old_type,
894 struct inode *new_dir, char *new_name) 905 struct inode *new_dir, const char *new_name)
895{ 906{
896 int result = -EINVAL; 907 int result = -EINVAL;
897 908
@@ -929,8 +940,8 @@ out:
929} 940}
930 941
931int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 942int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
932 struct inode *old_dir, char *old_name, 943 struct inode *old_dir, const char *old_name,
933 struct inode *new_dir, char *new_name) 944 struct inode *new_dir, const char *new_name)
934{ 945{
935 int result; 946 int result;
936 __le16 old_type = cpu_to_le16(0x06); 947 __le16 old_type = cpu_to_le16(0x06);
@@ -958,7 +969,7 @@ int
958ncp_read_kernel(struct ncp_server *server, const char *file_id, 969ncp_read_kernel(struct ncp_server *server, const char *file_id,
959 __u32 offset, __u16 to_read, char *target, int *bytes_read) 970 __u32 offset, __u16 to_read, char *target, int *bytes_read)
960{ 971{
961 char *source; 972 const char *source;
962 int result; 973 int result;
963 974
964 ncp_init_request(server); 975 ncp_init_request(server);
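
Most of this hunk is const-propagation through the reply-buffer accessors; BVAL/WVAL_LH/DVAL_LH wrap get_unaligned_le16/32 because NCP replies are raw byte streams whose multi-byte fields can start at any offset. A minimal sketch of why the wrappers exist:

#include <asm/unaligned.h>

/* A field inside an NCP reply carries no alignment guarantee, so a
 * plain (u32 *) cast would fault on strict-alignment architectures;
 * get_unaligned_le32() reads it safely and converts from little endian. */
static u32 example_read_field(const unsigned char *reply, int offset)
{
	return get_unaligned_le32(reply + offset);
}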
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 2441d1ab57dc..09881e6aa5ad 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -32,8 +32,6 @@
32#include <linux/ctype.h> 32#include <linux/ctype.h>
33#endif /* CONFIG_NCPFS_NLS */ 33#endif /* CONFIG_NCPFS_NLS */
34 34
35#include <linux/ncp_fs.h>
36
37#define NCP_MIN_SYMLINK_SIZE 8 35#define NCP_MIN_SYMLINK_SIZE 8
38#define NCP_MAX_SYMLINK_SIZE 512 36#define NCP_MAX_SYMLINK_SIZE 512
39 37
@@ -65,10 +63,11 @@ static inline void ncp_inode_close(struct inode *inode) {
65 atomic_dec(&NCP_FINFO(inode)->opened); 63 atomic_dec(&NCP_FINFO(inode)->opened);
66} 64}
67 65
68void ncp_extract_file_info(void* src, struct nw_info_struct* target); 66void ncp_extract_file_info(const void* src, struct nw_info_struct* target);
69int ncp_obtain_info(struct ncp_server *server, struct inode *, char *, 67int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *,
70 struct nw_info_struct *target); 68 struct nw_info_struct *target);
71int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target); 69int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target);
70int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns);
72int ncp_get_volume_root(struct ncp_server *server, const char *volname, 71int ncp_get_volume_root(struct ncp_server *server, const char *volname,
73 __u32 *volume, __le32 *dirent, __le32 *dosdirent); 72 __u32 *volume, __le32 *dirent, __le32 *dosdirent);
74int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *); 73int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *);
@@ -80,8 +79,8 @@ int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent,
80 __u32 mode, __u32 rdev); 79 __u32 mode, __u32 rdev);
81 80
82int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*); 81int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*);
83int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, char *); 82int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *);
84int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, char *, 83int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *,
85 int, __le32, __le16, struct ncp_entry_info *); 84 int, __le32, __le16, struct ncp_entry_info *);
86 85
87int ncp_initialize_search(struct ncp_server *, struct inode *, 86int ncp_initialize_search(struct ncp_server *, struct inode *,
@@ -93,7 +92,7 @@ int ncp_search_for_fileset(struct ncp_server *server,
93 char** rbuf, size_t* rsize); 92 char** rbuf, size_t* rsize);
94 93
95int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 94int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
96 struct inode *, char *, struct inode *, char *); 95 struct inode *, const char *, struct inode *, const char *);
97 96
98 97
99int 98int
@@ -134,7 +133,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *,
134 const unsigned char *, unsigned int, int); 133 const unsigned char *, unsigned int, int);
135 134
136#define NCP_ESC ':' 135#define NCP_ESC ':'
137#define NCP_IO_TABLE(dentry) (NCP_SERVER((dentry)->d_inode)->nls_io) 136#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io)
138#define ncp_tolower(t, c) nls_tolower(t, c) 137#define ncp_tolower(t, c) nls_tolower(t, c)
139#define ncp_toupper(t, c) nls_toupper(t, c) 138#define ncp_toupper(t, c) nls_toupper(t, c)
140#define ncp_strnicmp(t, s1, s2, len) \ 139#define ncp_strnicmp(t, s1, s2, len) \
@@ -149,15 +148,15 @@ int ncp__io2vol(unsigned char *, unsigned int *,
149int ncp__vol2io(unsigned char *, unsigned int *, 148int ncp__vol2io(unsigned char *, unsigned int *,
150 const unsigned char *, unsigned int, int); 149 const unsigned char *, unsigned int, int);
151 150
152#define NCP_IO_TABLE(dentry) NULL 151#define NCP_IO_TABLE(sb) NULL
153#define ncp_tolower(t, c) tolower(c) 152#define ncp_tolower(t, c) tolower(c)
154#define ncp_toupper(t, c) toupper(c) 153#define ncp_toupper(t, c) toupper(c)
155#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) 154#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U)
156#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) 155#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U)
157 156
158 157
159static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1, 158static inline int ncp_strnicmp(const struct nls_table *t,
160 const unsigned char *s2, int len) 159 const unsigned char *s1, const unsigned char *s2, int len)
161{ 160{
162 while (len--) { 161 while (len--) {
163 if (tolower(*s1++) != tolower(*s2++)) 162 if (tolower(*s1++) != tolower(*s2++))
@@ -170,13 +169,13 @@ static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
170#endif /* CONFIG_NCPFS_NLS */ 169#endif /* CONFIG_NCPFS_NLS */
171 170
172#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time) 171#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time)
173#define NCP_MAX_AGE(server) ((server)->dentry_ttl) 172#define NCP_MAX_AGE(server) atomic_read(&(server)->dentry_ttl)
174#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server)) 173#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server))
175 174
176static inline void 175static inline void
177ncp_age_dentry(struct ncp_server* server, struct dentry* dentry) 176ncp_age_dentry(struct ncp_server* server, struct dentry* dentry)
178{ 177{
179 dentry->d_time = jiffies - server->dentry_ttl; 178 dentry->d_time = jiffies - NCP_MAX_AGE(server);
180} 179}
181 180
182static inline void 181static inline void
@@ -192,7 +191,7 @@ ncp_renew_dentries(struct dentry *parent)
192 struct list_head *next; 191 struct list_head *next;
193 struct dentry *dentry; 192 struct dentry *dentry;
194 193
195 spin_lock(&dcache_lock); 194 spin_lock(&parent->d_lock);
196 next = parent->d_subdirs.next; 195 next = parent->d_subdirs.next;
197 while (next != &parent->d_subdirs) { 196 while (next != &parent->d_subdirs) {
198 dentry = list_entry(next, struct dentry, d_u.d_child); 197 dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -204,7 +203,7 @@ ncp_renew_dentries(struct dentry *parent)
204 203
205 next = next->next; 204 next = next->next;
206 } 205 }
207 spin_unlock(&dcache_lock); 206 spin_unlock(&parent->d_lock);
208} 207}
209 208
210static inline void 209static inline void
@@ -214,7 +213,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
214 struct list_head *next; 213 struct list_head *next;
215 struct dentry *dentry; 214 struct dentry *dentry;
216 215
217 spin_lock(&dcache_lock); 216 spin_lock(&parent->d_lock);
218 next = parent->d_subdirs.next; 217 next = parent->d_subdirs.next;
219 while (next != &parent->d_subdirs) { 218 while (next != &parent->d_subdirs) {
220 dentry = list_entry(next, struct dentry, d_u.d_child); 219 dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -222,7 +221,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
222 ncp_age_dentry(server, dentry); 221 ncp_age_dentry(server, dentry);
223 next = next->next; 222 next = next->next;
224 } 223 }
225 spin_unlock(&dcache_lock); 224 spin_unlock(&parent->d_lock);
226} 225}
227 226
228struct ncp_cache_head { 227struct ncp_cache_head {
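
The dentry aging above is plain jiffies arithmetic: d_time records when an entry was last validated, NCP_GET_AGE() is the elapsed time, NCP_TEST_AGE() compares it against the server TTL (now read atomically, since it can change at runtime via NCP_IOC_SETDENTRYTTL), and ncp_age_dentry() backdates d_time by one full TTL so the entry fails its next test. Restated in open code:

/* Open-coded restatement of the TTL macros above; 'server' and
 * 'dentry' are the usual ncpfs objects. */
unsigned long age = jiffies - dentry->d_time;            /* NCP_GET_AGE  */
unsigned long ttl = atomic_read(&server->dentry_ttl);    /* NCP_MAX_AGE  */
int still_fresh  = age < ttl;                            /* NCP_TEST_AGE */

dentry->d_time = jiffies - ttl;  /* ncp_age_dentry: force revalidation */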
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index 7c0b5c21e6cf..08907599dcd2 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -11,25 +11,26 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/ncp.h> 12#include <linux/ncp.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include "ncp_fs.h"
14#include "ncpsign_kernel.h" 15#include "ncpsign_kernel.h"
15 16
16/* i386: 32-bit, little endian, handles mis-alignment */ 17/* i386: 32-bit, little endian, handles mis-alignment */
17#ifdef __i386__ 18#ifdef __i386__
18#define GET_LE32(p) (*(int *)(p)) 19#define GET_LE32(p) (*(const int *)(p))
19#define PUT_LE32(p,v) { *(int *)(p)=v; } 20#define PUT_LE32(p,v) { *(int *)(p)=v; }
20#else 21#else
21/* from include/ncplib.h */ 22/* from include/ncplib.h */
22#define BVAL(buf,pos) (((__u8 *)(buf))[pos]) 23#define BVAL(buf,pos) (((const __u8 *)(buf))[pos])
23#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos)) 24#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos))
24#define BSET(buf,pos,val) (BVAL(buf,pos) = (val)) 25#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val))
25 26
26static inline __u16 27static inline __u16
27WVAL_LH(__u8 * buf, int pos) 28WVAL_LH(const __u8 * buf, int pos)
28{ 29{
29 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8; 30 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
30} 31}
31static inline __u32 32static inline __u32
32DVAL_LH(__u8 * buf, int pos) 33DVAL_LH(const __u8 * buf, int pos)
33{ 34{
34 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16; 35 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
35} 36}
diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h
index 6451a68381cc..d9a1438bb1f6 100644
--- a/fs/ncpfs/ncpsign_kernel.h
+++ b/fs/ncpfs/ncpsign_kernel.h
@@ -8,8 +8,6 @@
8#ifndef _NCPSIGN_KERNEL_H 8#ifndef _NCPSIGN_KERNEL_H
9#define _NCPSIGN_KERNEL_H 9#define _NCPSIGN_KERNEL_H
10 10
11#include <linux/ncp_fs.h>
12
13#ifdef CONFIG_NCPFS_PACKET_SIGNING 11#ifdef CONFIG_NCPFS_PACKET_SIGNING
14void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff); 12void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff);
15int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff); 13int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff);
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index c7ff6c700a6e..3a1587222c8a 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -28,7 +28,7 @@
28#include <linux/poll.h> 28#include <linux/poll.h>
29#include <linux/file.h> 29#include <linux/file.h>
30 30
31#include <linux/ncp_fs.h> 31#include "ncp_fs.h"
32 32
33#include "ncpsign_kernel.h" 33#include "ncpsign_kernel.h"
34 34
@@ -746,7 +746,6 @@ static int ncp_do_request(struct ncp_server *server, int size,
746 return -EIO; 746 return -EIO;
747 } 747 }
748 if (!ncp_conn_valid(server)) { 748 if (!ncp_conn_valid(server)) {
749 printk(KERN_ERR "ncpfs: Connection invalid!\n");
750 return -EIO; 749 return -EIO;
751 } 750 }
752 { 751 {
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index c634fd17b337..661f861d80c6 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -25,13 +25,11 @@
25 25
26#include <linux/errno.h> 26#include <linux/errno.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h>
29#include <linux/time.h> 28#include <linux/time.h>
30#include <linux/slab.h> 29#include <linux/slab.h>
31#include <linux/mm.h> 30#include <linux/mm.h>
32#include <linux/stat.h> 31#include <linux/stat.h>
33#include "ncplib_kernel.h" 32#include "ncp_fs.h"
34
35 33
36/* these magic numbers must appear in the symlink file -- this makes it a bit 34/* these magic numbers must appear in the symlink file -- this makes it a bit
37 more resilient against the magic attributes being set on random files. */ 35 more resilient against the magic attributes being set on random files. */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f7e13db613cb..ba306658a6db 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -76,13 +76,17 @@ config NFS_V4
76 76
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_V4 && EXPERIMENTAL 79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL
80 select PNFS_FILE_LAYOUT
80 help 81 help
81 This option enables support for minor version 1 of the NFSv4 protocol 82 This option enables support for minor version 1 of the NFSv4 protocol
82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. 83 (RFC 5661) in the kernel's NFS client.
83 84
84 If unsure, say N. 85 If unsure, say N.
85 86
87config PNFS_FILE_LAYOUT
88 tristate
89
86config ROOT_NFS 90config ROOT_NFS
87 bool "Root file system on NFS" 91 bool "Root file system on NFS"
88 depends on NFS_FS=y && IP_PNP 92 depends on NFS_FS=y && IP_PNP
@@ -117,3 +121,14 @@ config NFS_USE_KERNEL_DNS
117 select DNS_RESOLVER 121 select DNS_RESOLVER
118 select KEYS 122 select KEYS
119 default y 123 default y
124
125config NFS_USE_NEW_IDMAPPER
126 bool "Use the new idmapper upcall routine"
127 depends on NFS_V4 && KEYS
128 help
129 Say Y here if you want NFS to use the new idmapper upcall functions.
130 You will need /sbin/request-key (usually provided by the keyutils
131 package). For details, read
132 <file:Documentation/filesystems/nfs/idmapper.txt>.
133
134 If you are unsure, say N.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639eac..4776ff9e3814 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 delegation.o idmap.o \ 15 delegation.o idmap.o \
16 callback.o callback_xdr.o callback_proc.o \ 16 callback.o callback_xdr.o callback_proc.o \
17 nfs4namespace.o 17 nfs4namespace.o
18nfs-$(CONFIG_NFS_V4_1) += pnfs.o
18nfs-$(CONFIG_SYSCTL) += sysctl.o 19nfs-$(CONFIG_SYSCTL) += sysctl.o
19nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
21
22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e17b49e2eabd..e3d294269058 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -9,7 +9,6 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/ip.h> 10#include <linux/ip.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/smp_lock.h>
13#include <linux/sunrpc/svc.h> 12#include <linux/sunrpc/svc.h>
14#include <linux/sunrpc/svcsock.h> 13#include <linux/sunrpc/svcsock.h>
15#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
@@ -17,9 +16,7 @@
17#include <linux/freezer.h> 16#include <linux/freezer.h>
18#include <linux/kthread.h> 17#include <linux/kthread.h>
19#include <linux/sunrpc/svcauth_gss.h> 18#include <linux/sunrpc/svcauth_gss.h>
20#if defined(CONFIG_NFS_V4_1)
21#include <linux/sunrpc/bc_xprt.h> 19#include <linux/sunrpc/bc_xprt.h>
22#endif
23 20
24#include <net/inet_sock.h> 21#include <net/inet_sock.h>
25 22
@@ -109,7 +106,7 @@ nfs4_callback_up(struct svc_serv *serv)
109{ 106{
110 int ret; 107 int ret;
111 108
112 ret = svc_create_xprt(serv, "tcp", PF_INET, 109 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
113 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 110 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
114 if (ret <= 0) 111 if (ret <= 0)
115 goto out_err; 112 goto out_err;
@@ -117,7 +114,7 @@ nfs4_callback_up(struct svc_serv *serv)
117 dprintk("NFS: Callback listener port = %u (af %u)\n", 114 dprintk("NFS: Callback listener port = %u (af %u)\n",
118 nfs_callback_tcpport, PF_INET); 115 nfs_callback_tcpport, PF_INET);
119 116
120 ret = svc_create_xprt(serv, "tcp", PF_INET6, 117 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
121 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 118 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
122 if (ret > 0) { 119 if (ret > 0) {
123 nfs_callback_tcpport6 = ret; 120 nfs_callback_tcpport6 = ret;
@@ -178,30 +175,38 @@ nfs41_callback_svc(void *vrqstp)
178struct svc_rqst * 175struct svc_rqst *
179nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) 176nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
180{ 177{
181 struct svc_xprt *bc_xprt; 178 struct svc_rqst *rqstp;
182 struct svc_rqst *rqstp = ERR_PTR(-ENOMEM); 179 int ret;
183 180
184 dprintk("--> %s\n", __func__); 181 /*
185 /* Create a svc_sock for the service */ 182 * Create an svc_sock for the back channel service that shares the
186 bc_xprt = svc_sock_create(serv, xprt->prot); 183 * fore channel connection.
187 if (!bc_xprt) 184 * Returns the input port (0) and sets the svc_serv bc_xprt on success
185 */
186 ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
187 SVC_SOCK_ANONYMOUS);
188 if (ret < 0) {
189 rqstp = ERR_PTR(ret);
188 goto out; 190 goto out;
191 }
189 192
190 /* 193 /*
191 * Save the svc_serv in the transport so that it can 194 * Save the svc_serv in the transport so that it can
192 * be referenced when the session backchannel is initialized 195 * be referenced when the session backchannel is initialized
193 */ 196 */
194 serv->bc_xprt = bc_xprt;
195 xprt->bc_serv = serv; 197 xprt->bc_serv = serv;
196 198
197 INIT_LIST_HEAD(&serv->sv_cb_list); 199 INIT_LIST_HEAD(&serv->sv_cb_list);
198 spin_lock_init(&serv->sv_cb_lock); 200 spin_lock_init(&serv->sv_cb_lock);
199 init_waitqueue_head(&serv->sv_cb_waitq); 201 init_waitqueue_head(&serv->sv_cb_waitq);
200 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); 202 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
201 if (IS_ERR(rqstp)) 203 if (IS_ERR(rqstp)) {
202 svc_sock_destroy(bc_xprt); 204 svc_xprt_put(serv->sv_bc_xprt);
205 serv->sv_bc_xprt = NULL;
206 }
203out: 207out:
204 dprintk("--> %s return %p\n", __func__, rqstp); 208 dprintk("--> %s return %ld\n", __func__,
209 IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
205 return rqstp; 210 return rqstp;
206} 211}
207 212
@@ -323,58 +328,58 @@ void nfs_callback_down(int minorversion)
323 mutex_unlock(&nfs_callback_mutex); 328 mutex_unlock(&nfs_callback_mutex);
324} 329}
325 330
326static int check_gss_callback_principal(struct nfs_client *clp, 331/* Boolean check of RPC_AUTH_GSS principal */
327 struct svc_rqst *rqstp) 332int
333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
328{ 334{
329 struct rpc_clnt *r = clp->cl_rpcclient; 335 struct rpc_clnt *r = clp->cl_rpcclient;
330 char *p = svc_gss_principal(rqstp); 336 char *p = svc_gss_principal(rqstp);
331 337
338 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
339 return 1;
340
341 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
342 if (clp->cl_minorversion != 0)
343 return 0;
332 /* 344 /*
333 * It might just be a normal user principal, in which case 345 * It might just be a normal user principal, in which case
334 * userspace won't bother to tell us the name at all. 346 * userspace won't bother to tell us the name at all.
335 */ 347 */
336 if (p == NULL) 348 if (p == NULL)
337 return SVC_DENIED; 349 return 0;
338 350
339 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ 351 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
340 352
341 if (memcmp(p, "nfs@", 4) != 0) 353 if (memcmp(p, "nfs@", 4) != 0)
342 return SVC_DENIED; 354 return 0;
343 p += 4; 355 p += 4;
344 if (strcmp(p, r->cl_server) != 0) 356 if (strcmp(p, r->cl_server) != 0)
345 return SVC_DENIED; 357 return 0;
346 return SVC_OK; 358 return 1;
347} 359}
348 360
361/*
362 * pg_authenticate method for nfsv4 callback threads.
363 *
364 * The authflavor has been negotiated, so an incorrect flavor is a server
365 * bug. Drop packets with incorrect authflavor.
366 *
 367 * All other checking is done after NFS decoding, where the nfs_client can be
368 * found in nfs4_callback_compound
369 */
349static int nfs_callback_authenticate(struct svc_rqst *rqstp) 370static int nfs_callback_authenticate(struct svc_rqst *rqstp)
350{ 371{
351 struct nfs_client *clp;
352 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
353 int ret = SVC_OK;
354
355 /* Don't talk to strangers */
356 clp = nfs_find_client(svc_addr(rqstp), 4);
357 if (clp == NULL)
358 return SVC_DROP;
359
360 dprintk("%s: %s NFSv4 callback!\n", __func__,
361 svc_print_addr(rqstp, buf, sizeof(buf)));
362
363 switch (rqstp->rq_authop->flavour) { 372 switch (rqstp->rq_authop->flavour) {
364 case RPC_AUTH_NULL: 373 case RPC_AUTH_NULL:
365 if (rqstp->rq_proc != CB_NULL) 374 if (rqstp->rq_proc != CB_NULL)
366 ret = SVC_DENIED; 375 return SVC_DROP;
367 break; 376 break;
368 case RPC_AUTH_UNIX: 377 case RPC_AUTH_GSS:
369 break; 378 /* No RPC_AUTH_GSS support yet in NFSv4.1 */
370 case RPC_AUTH_GSS: 379 if (svc_is_backchannel(rqstp))
371 ret = check_gss_callback_principal(clp, rqstp); 380 return SVC_DROP;
372 break;
373 default:
374 ret = SVC_DENIED;
375 } 381 }
376 nfs_put_client(clp); 382 return SVC_OK;
377 return ret;
378} 383}
379 384
380/* 385/*
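
check_gss_callback_principal() now returns a plain boolean instead of an SVC_* verdict, and accepts a callback only when the GSS principal is a host-based name of the form "nfs@serverhostname" matching the client's server (a missing principal, normal for user principals, is simply rejected). The string check in isolation:

/* Isolated sketch of the "nfs@serverhostname" check above;
 * cl_server corresponds to clp->cl_rpcclient->cl_server. */
static int principal_matches(const char *p, const char *cl_server)
{
	if (p == NULL)                   /* normal user principal: no name given */
		return 0;
	if (memcmp(p, "nfs@", 4) != 0)   /* expect GSS_C_NT_HOSTBASED_NAME */
		return 0;
	return strcmp(p + 4, cl_server) == 0;
}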
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 85a7cfd1b8dd..46d93ce7311b 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -7,6 +7,7 @@
7 */ 7 */
8#ifndef __LINUX_FS_NFS_CALLBACK_H 8#ifndef __LINUX_FS_NFS_CALLBACK_H
9#define __LINUX_FS_NFS_CALLBACK_H 9#define __LINUX_FS_NFS_CALLBACK_H
10#include <linux/sunrpc/svc.h>
10 11
11#define NFS4_CALLBACK 0x40000000 12#define NFS4_CALLBACK 0x40000000
12#define NFS4_CALLBACK_XDRSIZE 2048 13#define NFS4_CALLBACK_XDRSIZE 2048
@@ -34,10 +35,16 @@ enum nfs4_callback_opnum {
34 OP_CB_ILLEGAL = 10044, 35 OP_CB_ILLEGAL = 10044,
35}; 36};
36 37
38struct cb_process_state {
39 __be32 drc_status;
40 struct nfs_client *clp;
41};
42
37struct cb_compound_hdr_arg { 43struct cb_compound_hdr_arg {
38 unsigned int taglen; 44 unsigned int taglen;
39 const char *tag; 45 const char *tag;
40 unsigned int minorversion; 46 unsigned int minorversion;
47 unsigned int cb_ident; /* v4.0 callback identifier */
41 unsigned nops; 48 unsigned nops;
42}; 49};
43 50
@@ -103,14 +110,23 @@ struct cb_sequenceres {
103 uint32_t csr_target_highestslotid; 110 uint32_t csr_target_highestslotid;
104}; 111};
105 112
106extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 113extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
107 struct cb_sequenceres *res); 114 struct cb_sequenceres *res,
115 struct cb_process_state *cps);
108 116
109extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, 117extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
110 const nfs4_stateid *stateid); 118 const nfs4_stateid *stateid);
111 119
112#define RCA4_TYPE_MASK_RDATA_DLG 0 120#define RCA4_TYPE_MASK_RDATA_DLG 0
113#define RCA4_TYPE_MASK_WDATA_DLG 1 121#define RCA4_TYPE_MASK_WDATA_DLG 1
122#define RCA4_TYPE_MASK_DIR_DLG 2
123#define RCA4_TYPE_MASK_FILE_LAYOUT 3
124#define RCA4_TYPE_MASK_BLK_LAYOUT 4
125#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
126#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
127#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
128#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
129#define RCA4_TYPE_MASK_ALL 0xf31f
114 130
115struct cb_recallanyargs { 131struct cb_recallanyargs {
116 struct sockaddr *craa_addr; 132 struct sockaddr *craa_addr;
@@ -118,25 +134,52 @@ struct cb_recallanyargs {
118 uint32_t craa_type_mask; 134 uint32_t craa_type_mask;
119}; 135};
120 136
121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); 137extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
138 void *dummy,
139 struct cb_process_state *cps);
122 140
123struct cb_recallslotargs { 141struct cb_recallslotargs {
124 struct sockaddr *crsa_addr; 142 struct sockaddr *crsa_addr;
125 uint32_t crsa_target_max_slots; 143 uint32_t crsa_target_max_slots;
126}; 144};
127extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, 145extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
128 void *dummy); 146 void *dummy,
129 147 struct cb_process_state *cps);
130#endif /* CONFIG_NFS_V4_1 */ 148
149struct cb_layoutrecallargs {
150 struct sockaddr *cbl_addr;
151 uint32_t cbl_recall_type;
152 uint32_t cbl_layout_type;
153 uint32_t cbl_layoutchanged;
154 union {
155 struct {
156 struct nfs_fh cbl_fh;
157 struct pnfs_layout_range cbl_range;
158 nfs4_stateid cbl_stateid;
159 };
160 struct nfs_fsid cbl_fsid;
161 };
162};
131 163
132extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 164extern unsigned nfs4_callback_layoutrecall(
133extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 165 struct cb_layoutrecallargs *args,
166 void *dummy, struct cb_process_state *cps);
134 167
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170#endif /* CONFIG_NFS_V4_1 */
171extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
173 struct cb_getattrres *res,
174 struct cb_process_state *cps);
175extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
176 struct cb_process_state *cps);
135#ifdef CONFIG_NFS_V4 177#ifdef CONFIG_NFS_V4
136extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); 178extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
137extern void nfs_callback_down(int minorversion); 179extern void nfs_callback_down(int minorversion);
138extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, 180extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
139 const nfs4_stateid *stateid); 181 const nfs4_stateid *stateid);
182extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
140#endif /* CONFIG_NFS_V4 */ 183#endif /* CONFIG_NFS_V4 */
141/* 184/*
142 * nfs41: Callbacks are expected to not cause substantial latency, 185 * nfs41: Callbacks are expected to not cause substantial latency,
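
The new RCA4_TYPE_MASK_* values are bit positions in the CB_RECALL_ANY type mask, and RCA4_TYPE_MASK_ALL is simply every defined position set: bits 0-4, 8-9 and 12-15. A quick check of the constant and of how a handler would test one class:

/* 0x001f (bits 0-4) | 0x0300 (bits 8-9) | 0xf000 (bits 12-15) == 0xf31f,
 * i.e. RCA4_TYPE_MASK_ALL as defined above. */

/* Testing one recall class in a received CB_RECALL_ANY mask: */
if (args->craa_type_mask & (1u << RCA4_TYPE_MASK_FILE_LAYOUT))
	handle_file_layout_recall();   /* hypothetical handler */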
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 930d10fecdaf..89587573fe50 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,30 +12,33 @@
12#include "callback.h" 12#include "callback.h"
13#include "delegation.h" 13#include "delegation.h"
14#include "internal.h" 14#include "internal.h"
15#include "pnfs.h"
15 16
16#ifdef NFS_DEBUG 17#ifdef NFS_DEBUG
17#define NFSDBG_FACILITY NFSDBG_CALLBACK 18#define NFSDBG_FACILITY NFSDBG_CALLBACK
18#endif 19#endif
19 20
20__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) 21__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
22 struct cb_getattrres *res,
23 struct cb_process_state *cps)
21{ 24{
22 struct nfs_client *clp;
23 struct nfs_delegation *delegation; 25 struct nfs_delegation *delegation;
24 struct nfs_inode *nfsi; 26 struct nfs_inode *nfsi;
25 struct inode *inode; 27 struct inode *inode;
26 28
29 res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
30 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
31 goto out;
32
27 res->bitmap[0] = res->bitmap[1] = 0; 33 res->bitmap[0] = res->bitmap[1] = 0;
28 res->status = htonl(NFS4ERR_BADHANDLE); 34 res->status = htonl(NFS4ERR_BADHANDLE);
29 clp = nfs_find_client(args->addr, 4);
30 if (clp == NULL)
31 goto out;
32 35
33 dprintk("NFS: GETATTR callback request from %s\n", 36 dprintk("NFS: GETATTR callback request from %s\n",
34 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 37 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
35 38
36 inode = nfs_delegation_find_inode(clp, &args->fh); 39 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
37 if (inode == NULL) 40 if (inode == NULL)
38 goto out_putclient; 41 goto out;
39 nfsi = NFS_I(inode); 42 nfsi = NFS_I(inode);
40 rcu_read_lock(); 43 rcu_read_lock();
41 delegation = rcu_dereference(nfsi->delegation); 44 delegation = rcu_dereference(nfsi->delegation);
@@ -55,49 +58,41 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
55out_iput: 58out_iput:
56 rcu_read_unlock(); 59 rcu_read_unlock();
57 iput(inode); 60 iput(inode);
58out_putclient:
59 nfs_put_client(clp);
60out: 61out:
61 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); 62 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
62 return res->status; 63 return res->status;
63} 64}
64 65
65__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 66__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
67 struct cb_process_state *cps)
66{ 68{
67 struct nfs_client *clp;
68 struct inode *inode; 69 struct inode *inode;
69 __be32 res; 70 __be32 res;
70 71
71 res = htonl(NFS4ERR_BADHANDLE); 72 res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
72 clp = nfs_find_client(args->addr, 4); 73 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
73 if (clp == NULL)
74 goto out; 74 goto out;
75 75
76 dprintk("NFS: RECALL callback request from %s\n", 76 dprintk("NFS: RECALL callback request from %s\n",
77 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 77 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
78 78
79 do { 79 res = htonl(NFS4ERR_BADHANDLE);
80 struct nfs_client *prev = clp; 80 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
81 81 if (inode == NULL)
82 inode = nfs_delegation_find_inode(clp, &args->fh); 82 goto out;
83 if (inode != NULL) { 83 /* Set up a helper thread to actually return the delegation */
84 /* Set up a helper thread to actually return the delegation */ 84 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
85 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { 85 case 0:
86 case 0: 86 res = 0;
87 res = 0; 87 break;
88 break; 88 case -ENOENT:
89 case -ENOENT: 89 if (res != 0)
90 if (res != 0) 90 res = htonl(NFS4ERR_BAD_STATEID);
91 res = htonl(NFS4ERR_BAD_STATEID); 91 break;
92 break; 92 default:
93 default: 93 res = htonl(NFS4ERR_RESOURCE);
94 res = htonl(NFS4ERR_RESOURCE); 94 }
95 } 95 iput(inode);
96 iput(inode);
97 }
98 clp = nfs_find_client_next(prev);
99 nfs_put_client(prev);
100 } while (clp != NULL);
101out: 96out:
102 dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); 97 dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
103 return res; 98 return res;
@@ -113,16 +108,149 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
113 108
114#if defined(CONFIG_NFS_V4_1) 109#if defined(CONFIG_NFS_V4_1)
115 110
111static u32 initiate_file_draining(struct nfs_client *clp,
112 struct cb_layoutrecallargs *args)
113{
114 struct pnfs_layout_hdr *lo;
115 struct inode *ino;
116 bool found = false;
117 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
118 LIST_HEAD(free_me_list);
119
120 spin_lock(&clp->cl_lock);
121 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
122 if (nfs_compare_fh(&args->cbl_fh,
123 &NFS_I(lo->plh_inode)->fh))
124 continue;
125 ino = igrab(lo->plh_inode);
126 if (!ino)
127 continue;
128 found = true;
129 /* Without this, layout can be freed as soon
130 * as we release cl_lock.
131 */
132 get_layout_hdr(lo);
133 break;
134 }
135 spin_unlock(&clp->cl_lock);
136 if (!found)
137 return NFS4ERR_NOMATCHING_LAYOUT;
138
139 spin_lock(&ino->i_lock);
140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
141 mark_matching_lsegs_invalid(lo, &free_me_list,
142 args->cbl_range.iomode))
143 rv = NFS4ERR_DELAY;
144 else
145 rv = NFS4ERR_NOMATCHING_LAYOUT;
146 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
147 spin_unlock(&ino->i_lock);
148 pnfs_free_lseg_list(&free_me_list);
149 put_layout_hdr(lo);
150 iput(ino);
151 return rv;
152}
153
154static u32 initiate_bulk_draining(struct nfs_client *clp,
155 struct cb_layoutrecallargs *args)
156{
157 struct pnfs_layout_hdr *lo;
158 struct inode *ino;
159 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
160 struct pnfs_layout_hdr *tmp;
161 LIST_HEAD(recall_list);
162 LIST_HEAD(free_me_list);
163 struct pnfs_layout_range range = {
164 .iomode = IOMODE_ANY,
165 .offset = 0,
166 .length = NFS4_MAX_UINT64,
167 };
168
169 spin_lock(&clp->cl_lock);
170 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
171 if ((args->cbl_recall_type == RETURN_FSID) &&
172 memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
173 &args->cbl_fsid, sizeof(struct nfs_fsid)))
174 continue;
175 if (!igrab(lo->plh_inode))
176 continue;
177 get_layout_hdr(lo);
178 BUG_ON(!list_empty(&lo->plh_bulk_recall));
179 list_add(&lo->plh_bulk_recall, &recall_list);
180 }
181 spin_unlock(&clp->cl_lock);
182 list_for_each_entry_safe(lo, tmp,
183 &recall_list, plh_bulk_recall) {
184 ino = lo->plh_inode;
185 spin_lock(&ino->i_lock);
186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
187 if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock);
191 put_layout_hdr(lo);
192 iput(ino);
193 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv;
196}
197
198static u32 do_callback_layoutrecall(struct nfs_client *clp,
199 struct cb_layoutrecallargs *args)
200{
201 u32 res = NFS4ERR_DELAY;
202
203 dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
204 if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
205 goto out;
206 if (args->cbl_recall_type == RETURN_FILE)
207 res = initiate_file_draining(clp, args);
208 else
209 res = initiate_bulk_draining(clp, args);
210 clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
211out:
212 dprintk("%s returning %i\n", __func__, res);
213 return res;
214
215}
216
217__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
218 void *dummy, struct cb_process_state *cps)
219{
220 u32 res;
221
222 dprintk("%s: -->\n", __func__);
223
224 if (cps->clp)
225 res = do_callback_layoutrecall(cps->clp, args);
226 else
227 res = NFS4ERR_OP_NOT_IN_SESSION;
228
229 dprintk("%s: exit with status = %d\n", __func__, res);
230 return cpu_to_be32(res);
231}
232
233static void pnfs_recall_all_layouts(struct nfs_client *clp)
234{
235 struct cb_layoutrecallargs args;
236
237 /* Pretend we got a CB_LAYOUTRECALL(ALL) */
238 memset(&args, 0, sizeof(args));
239 args.cbl_recall_type = RETURN_ALL;
240 /* FIXME we ignore errors, what should we do? */
241 do_callback_layoutrecall(clp, &args);
242}
243
116int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) 244int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
117{ 245{
118 if (delegation == NULL) 246 if (delegation == NULL)
119 return 0; 247 return 0;
120 248
121 /* seqid is 4-bytes long */ 249 if (stateid->stateid.seqid != 0)
122 if (((u32 *) &stateid->data)[0] != 0)
123 return 0; 250 return 0;
124 if (memcmp(&delegation->stateid.data[4], &stateid->data[4], 251 if (memcmp(&delegation->stateid.stateid.other,
125 sizeof(stateid->data)-4)) 252 &stateid->stateid.other,
253 NFS4_STATEID_OTHER_SIZE))
126 return 0; 254 return 0;
127 255
128 return 1; 256 return 1;
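The rewritten check reads naturally once the stateid layout is kept in mind: per RFC 5661 a stateid4 is a 4-byte seqid followed by 12 opaque bytes ("other"), and a v4.1 delegation stateid presented in a callback must carry seqid 0 and match the held delegation's "other" field. A standalone model of the comparison follows; the struct is a local stand-in, not the kernel's nfs4_stateid:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define STATEID_OTHER_SIZE 12	/* RFC 5661: opaque other[12] */

struct stateid {
	uint32_t seqid;
	uint8_t other[STATEID_OTHER_SIZE];
};

static int validate_delegation_stateid(const struct stateid *held,
				       const struct stateid *presented)
{
	if (presented->seqid != 0)
		return 0;
	if (memcmp(held->other, presented->other, STATEID_OTHER_SIZE) != 0)
		return 0;
	return 1;
}

int main(void)
{
	struct stateid held = { 7, "delegation!" };	/* 11 chars + NUL = 12 bytes */
	struct stateid ok = { 0, "delegation!" };
	struct stateid bad = { 1, "delegation!" };

	printf("%d %d\n",
	       validate_delegation_stateid(&held, &ok),	/* 1 */
	       validate_delegation_stateid(&held, &bad));	/* 0 */
	return 0;
}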
@@ -185,42 +313,6 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
185} 313}
186 314
187/* 315/*
188 * Returns a pointer to a held 'struct nfs_client' that matches the server's
189 * address, major version number, and session ID. It is the caller's
190 * responsibility to release the returned reference.
191 *
192 * Returns NULL if there are no connections with sessions, or if no session
193 * matches the one of interest.
194 */
195 static struct nfs_client *find_client_with_session(
196 const struct sockaddr *addr, u32 nfsversion,
197 struct nfs4_sessionid *sessionid)
198{
199 struct nfs_client *clp;
200
201 clp = nfs_find_client(addr, 4);
202 if (clp == NULL)
203 return NULL;
204
205 do {
206 struct nfs_client *prev = clp;
207
208 if (clp->cl_session != NULL) {
209 if (memcmp(clp->cl_session->sess_id.data,
210 sessionid->data,
211 NFS4_MAX_SESSIONID_LEN) == 0) {
212 /* Returns a held reference to clp */
213 return clp;
214 }
215 }
216 clp = nfs_find_client_next(prev);
217 nfs_put_client(prev);
218 } while (clp != NULL);
219
220 return NULL;
221}
222
223/*
224 * For each referring call triple, check the session's slot table for 316 * For each referring call triple, check the session's slot table for
225 * a match. If the slot is in use and the sequence numbers match, the 317 * a match. If the slot is in use and the sequence numbers match, the
226 * client is still waiting for a response to the original request. 318 * client is still waiting for a response to the original request.
@@ -276,20 +368,28 @@ out:
276} 368}
277 369
278__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, 370__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
279 struct cb_sequenceres *res) 371 struct cb_sequenceres *res,
372 struct cb_process_state *cps)
280{ 373{
281 struct nfs_client *clp; 374 struct nfs_client *clp;
282 int i; 375 int i;
283 __be32 status; 376 __be32 status = htonl(NFS4ERR_BADSESSION);
284 377
285 status = htonl(NFS4ERR_BADSESSION); 378 cps->clp = NULL;
286 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); 379
380 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
287 if (clp == NULL) 381 if (clp == NULL)
288 goto out; 382 goto out;
289 383
384 /* state manager is resetting the session */
385 if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
386 status = NFS4ERR_DELAY;
387 goto out;
388 }
389
290 status = validate_seqid(&clp->cl_session->bc_slot_table, args); 390 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
291 if (status) 391 if (status)
292 goto out_putclient; 392 goto out;
293 393
294 /* 394 /*
295 * Check for pending referring calls. If a match is found, a 395 * Check for pending referring calls. If a match is found, a
@@ -298,7 +398,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
298 */ 398 */
299 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { 399 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
300 status = htonl(NFS4ERR_DELAY); 400 status = htonl(NFS4ERR_DELAY);
301 goto out_putclient; 401 goto out;
302 } 402 }
303 403
304 memcpy(&res->csr_sessionid, &args->csa_sessionid, 404 memcpy(&res->csr_sessionid, &args->csa_sessionid,
@@ -307,83 +407,93 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
307 res->csr_slotid = args->csa_slotid; 407 res->csr_slotid = args->csa_slotid;
308 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 408 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
309 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 409 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
410 nfs4_cb_take_slot(clp);
310 411
311out_putclient:
312 nfs_put_client(clp);
313out: 412out:
413 cps->clp = clp; /* put in nfs4_callback_compound */
314 for (i = 0; i < args->csa_nrclists; i++) 414 for (i = 0; i < args->csa_nrclists; i++)
315 kfree(args->csa_rclists[i].rcl_refcalls); 415 kfree(args->csa_rclists[i].rcl_refcalls);
316 kfree(args->csa_rclists); 416 kfree(args->csa_rclists);
317 417
318 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) 418 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
319 res->csr_status = 0; 419 cps->drc_status = status;
320 else 420 status = 0;
421 } else
321 res->csr_status = status; 422 res->csr_status = status;
423
322 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, 424 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
323 ntohl(status), ntohl(res->csr_status)); 425 ntohl(status), ntohl(res->csr_status));
324 return status; 426 return status;
325} 427}
326 428
327__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) 429static bool
430validate_bitmap_values(unsigned long mask)
431{
432 return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
433}
434
435__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
436 struct cb_process_state *cps)
328{ 437{
329 struct nfs_client *clp;
330 __be32 status; 438 __be32 status;
331 fmode_t flags = 0; 439 fmode_t flags = 0;
332 440
333 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 441 status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
334 clp = nfs_find_client(args->craa_addr, 4); 442 if (!cps->clp) /* set in cb_sequence */
335 if (clp == NULL)
336 goto out; 443 goto out;
337 444
338 dprintk("NFS: RECALL_ANY callback request from %s\n", 445 dprintk("NFS: RECALL_ANY callback request from %s\n",
339 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 446 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
447
448 status = cpu_to_be32(NFS4ERR_INVAL);
449 if (!validate_bitmap_values(args->craa_type_mask))
450 goto out;
340 451
452 status = cpu_to_be32(NFS4_OK);
341 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) 453 if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
342 &args->craa_type_mask)) 454 &args->craa_type_mask))
343 flags = FMODE_READ; 455 flags = FMODE_READ;
344 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) 456 if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
345 &args->craa_type_mask)) 457 &args->craa_type_mask))
346 flags |= FMODE_WRITE; 458 flags |= FMODE_WRITE;
347 459 if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
460 &args->craa_type_mask))
461 pnfs_recall_all_layouts(cps->clp);
348 if (flags) 462 if (flags)
349 nfs_expire_all_delegation_types(clp, flags); 463 nfs_expire_all_delegation_types(cps->clp, flags);
350 status = htonl(NFS4_OK);
351out: 464out:
352 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 465 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
353 return status; 466 return status;
354} 467}
355 468
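Taken together, validate_bitmap_values() and the flag mapping above implement a simple contract: reject any CB_RECALL_ANY mask containing bits we do not understand, then translate the known bits into delegation and layout recalls. The sketch below models that flow; bit positions follow RFC 5661 (RDATA_DLG = 0, WDATA_DLG = 1, FILE_LAYOUT = 3), and the RCA4_TYPE_MASK_ALL shown is a simplified stand-in covering only these three bits:

#include <stdio.h>

#define RCA4_TYPE_MASK_RDATA_DLG	(1u << 0)
#define RCA4_TYPE_MASK_WDATA_DLG	(1u << 1)
#define RCA4_TYPE_MASK_FILE_LAYOUT	(1u << 3)
#define RCA4_TYPE_MASK_ALL \
	(RCA4_TYPE_MASK_RDATA_DLG | RCA4_TYPE_MASK_WDATA_DLG | \
	 RCA4_TYPE_MASK_FILE_LAYOUT)

#define FMODE_READ	1u
#define FMODE_WRITE	2u

static int recall_any(unsigned int mask)
{
	unsigned int flags = 0;

	if (mask & ~RCA4_TYPE_MASK_ALL)
		return -1;			/* NFS4ERR_INVAL */
	if (mask & RCA4_TYPE_MASK_RDATA_DLG)
		flags |= FMODE_READ;
	if (mask & RCA4_TYPE_MASK_WDATA_DLG)
		flags |= FMODE_WRITE;
	if (mask & RCA4_TYPE_MASK_FILE_LAYOUT)
		puts("recall all layouts");
	if (flags)
		printf("expire delegations, fmode 0x%x\n", flags);
	return 0;				/* NFS4_OK */
}

int main(void)
{
	recall_any(RCA4_TYPE_MASK_WDATA_DLG | RCA4_TYPE_MASK_FILE_LAYOUT);
	printf("unknown bit -> %d\n", recall_any(1u << 30));
	return 0;
}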
356/* Reduce the fore channel's max_slots to the target value */ 469/* Reduce the fore channel's max_slots to the target value */
357__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) 470__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
471 struct cb_process_state *cps)
358{ 472{
359 struct nfs_client *clp;
360 struct nfs4_slot_table *fc_tbl; 473 struct nfs4_slot_table *fc_tbl;
361 __be32 status; 474 __be32 status;
362 475
363 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 476 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
364 clp = nfs_find_client(args->crsa_addr, 4); 477 if (!cps->clp) /* set in cb_sequence */
365 if (clp == NULL)
366 goto out; 478 goto out;
367 479
368 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 480 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
369 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), 481 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
370 args->crsa_target_max_slots); 482 args->crsa_target_max_slots);
371 483
372 fc_tbl = &clp->cl_session->fc_slot_table; 484 fc_tbl = &cps->clp->cl_session->fc_slot_table;
373 485
374 status = htonl(NFS4ERR_BAD_HIGH_SLOT); 486 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
375 if (args->crsa_target_max_slots > fc_tbl->max_slots || 487 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
376 args->crsa_target_max_slots < 1) 488 args->crsa_target_max_slots < 1)
377 goto out_putclient; 489 goto out;
378 490
379 status = htonl(NFS4_OK); 491 status = htonl(NFS4_OK);
380 if (args->crsa_target_max_slots == fc_tbl->max_slots) 492 if (args->crsa_target_max_slots == fc_tbl->max_slots)
381 goto out_putclient; 493 goto out;
382 494
383 fc_tbl->target_max_slots = args->crsa_target_max_slots; 495 fc_tbl->target_max_slots = args->crsa_target_max_slots;
384 nfs41_handle_recall_slot(clp); 496 nfs41_handle_recall_slot(cps->clp);
385out_putclient:
386 nfs_put_client(clp); /* balance nfs_find_client */
387out: 497out:
388 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 498 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
389 return status; 499 return status;
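The bounds logic in nfs4_callback_recallslot() reduces to a small pure function: the requested target must lie in [1, max_slots], a request equal to the current table size is a no-op success, and anything else records the new target for the state manager to act on. A standalone model, with integer statuses standing in for the NFS4ERR_* codes:

#include <stdio.h>

enum { OK, BAD_HIGH_SLOT };

static int recall_slot(unsigned int *target_max, unsigned int max_slots,
		       unsigned int requested)
{
	if (requested > max_slots || requested < 1)
		return BAD_HIGH_SLOT;
	if (requested != max_slots)
		*target_max = requested;	/* the state manager shrinks later */
	return OK;
}

int main(void)
{
	unsigned int target = 0;

	printf("%d\n", recall_slot(&target, 16, 8));	/* OK, target = 8 */
	printf("%d\n", recall_slot(&target, 16, 0));	/* BAD_HIGH_SLOT */
	printf("target = %u\n", target);
	return 0;
}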
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05af212f0edf..14e0f9371d14 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -10,8 +10,10 @@
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sunrpc/bc_xprt.h>
13#include "nfs4_fs.h" 14#include "nfs4_fs.h"
14#include "callback.h" 15#include "callback.h"
16#include "internal.h"
15 17
16#define CB_OP_TAGLEN_MAXSZ (512) 18#define CB_OP_TAGLEN_MAXSZ (512)
17#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) 19#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
@@ -22,6 +24,7 @@
22#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 24#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
23 25
24#if defined(CONFIG_NFS_V4_1) 26#if defined(CONFIG_NFS_V4_1)
27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
25#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 28#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
26 4 + 1 + 3) 29 4 + 1 + 3)
27#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 30#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -33,7 +36,8 @@
33/* Internal error code */ 36/* Internal error code */
34#define NFS4ERR_RESOURCE_HDR 11050 37#define NFS4ERR_RESOURCE_HDR 11050
35 38
36typedef __be32 (*callback_process_op_t)(void *, void *); 39typedef __be32 (*callback_process_op_t)(void *, void *,
40 struct cb_process_state *);
37typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 41typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
38typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 42typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
39 43
@@ -160,7 +164,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
160 hdr->minorversion = ntohl(*p++); 164 hdr->minorversion = ntohl(*p++);
161 /* Check minor version is zero or one. */ 165 /* Check minor version is zero or one. */
162 if (hdr->minorversion <= 1) { 166 if (hdr->minorversion <= 1) {
163 p++; /* skip callback_ident */ 167 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
164 } else { 168 } else {
165 printk(KERN_WARNING "%s: NFSv4 server callback with " 169 printk(KERN_WARNING "%s: NFSv4 server callback with "
166 "illegal minor version %u!\n", 170 "illegal minor version %u!\n",
@@ -220,6 +224,66 @@ out:
220 224
221#if defined(CONFIG_NFS_V4_1) 225#if defined(CONFIG_NFS_V4_1)
222 226
227static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
228 struct xdr_stream *xdr,
229 struct cb_layoutrecallargs *args)
230{
231 __be32 *p;
232 __be32 status = 0;
233 uint32_t iomode;
234
235 args->cbl_addr = svc_addr(rqstp);
236 p = read_buf(xdr, 4 * sizeof(uint32_t));
237 if (unlikely(p == NULL)) {
238 status = htonl(NFS4ERR_BADXDR);
239 goto out;
240 }
241
242 args->cbl_layout_type = ntohl(*p++);
 243 /* Despite the spec's xdr, iomode really belongs in the FILE switch,
 244 * as it is unusable and ignored with the other types.
245 */
246 iomode = ntohl(*p++);
247 args->cbl_layoutchanged = ntohl(*p++);
248 args->cbl_recall_type = ntohl(*p++);
249
250 if (args->cbl_recall_type == RETURN_FILE) {
251 args->cbl_range.iomode = iomode;
252 status = decode_fh(xdr, &args->cbl_fh);
253 if (unlikely(status != 0))
254 goto out;
255
256 p = read_buf(xdr, 2 * sizeof(uint64_t));
257 if (unlikely(p == NULL)) {
258 status = htonl(NFS4ERR_BADXDR);
259 goto out;
260 }
261 p = xdr_decode_hyper(p, &args->cbl_range.offset);
262 p = xdr_decode_hyper(p, &args->cbl_range.length);
263 status = decode_stateid(xdr, &args->cbl_stateid);
264 if (unlikely(status != 0))
265 goto out;
266 } else if (args->cbl_recall_type == RETURN_FSID) {
267 p = read_buf(xdr, 2 * sizeof(uint64_t));
268 if (unlikely(p == NULL)) {
269 status = htonl(NFS4ERR_BADXDR);
270 goto out;
271 }
272 p = xdr_decode_hyper(p, &args->cbl_fsid.major);
273 p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
274 } else if (args->cbl_recall_type != RETURN_ALL) {
275 status = htonl(NFS4ERR_BADXDR);
276 goto out;
277 }
278 dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
279 __func__,
280 args->cbl_layout_type, iomode,
281 args->cbl_layoutchanged, args->cbl_recall_type);
282out:
283 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
284 return status;
285}
286
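decode_layoutrecall_args() pulls fixed-size chunks from the XDR stream with read_buf() and decodes the 64-bit offset/length pair with xdr_decode_hyper(). XDR encodes a hyper as eight big-endian bytes; the standalone decoder below shows the byte handling that helper performs:

#include <stdint.h>
#include <stdio.h>

static const uint8_t *decode_hyper(const uint8_t *p, uint64_t *out)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v = (v << 8) | p[i];	/* network byte order: MSB first */
	*out = v;
	return p + 8;
}

int main(void)
{
	/* offset = 0x1000, length = 2 on the wire */
	const uint8_t wire[16] = {
		0, 0, 0, 0, 0, 0, 0x10, 0x00,
		0, 0, 0, 0, 0, 0, 0, 2,
	};
	const uint8_t *p = wire;
	uint64_t offset, length;

	p = decode_hyper(p, &offset);
	p = decode_hyper(p, &length);
	printf("offset=0x%llx length=%llu\n",
	       (unsigned long long)offset, (unsigned long long)length);
	return 0;
}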
223static __be32 decode_sessionid(struct xdr_stream *xdr, 287static __be32 decode_sessionid(struct xdr_stream *xdr,
224 struct nfs4_sessionid *sid) 288 struct nfs4_sessionid *sid)
225{ 289{
@@ -574,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
574 case OP_CB_SEQUENCE: 638 case OP_CB_SEQUENCE:
575 case OP_CB_RECALL_ANY: 639 case OP_CB_RECALL_ANY:
576 case OP_CB_RECALL_SLOT: 640 case OP_CB_RECALL_SLOT:
641 case OP_CB_LAYOUTRECALL:
577 *op = &callback_ops[op_nr]; 642 *op = &callback_ops[op_nr];
578 break; 643 break;
579 644
580 case OP_CB_LAYOUTRECALL:
581 case OP_CB_NOTIFY_DEVICEID: 645 case OP_CB_NOTIFY_DEVICEID:
582 case OP_CB_NOTIFY: 646 case OP_CB_NOTIFY:
583 case OP_CB_PUSH_DELEG: 647 case OP_CB_PUSH_DELEG:
@@ -593,6 +657,37 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
593 return htonl(NFS_OK); 657 return htonl(NFS_OK);
594} 658}
595 659
660static void nfs4_callback_free_slot(struct nfs4_session *session)
661{
662 struct nfs4_slot_table *tbl = &session->bc_slot_table;
663
664 spin_lock(&tbl->slot_tbl_lock);
665 /*
 666 * Let the state manager know callback processing is done.
667 * A single slot, so highest used slotid is either 0 or -1
668 */
669 tbl->highest_used_slotid--;
670 nfs4_check_drain_bc_complete(session);
671 spin_unlock(&tbl->slot_tbl_lock);
672}
673
674static void nfs4_cb_free_slot(struct nfs_client *clp)
675{
676 if (clp && clp->cl_session)
677 nfs4_callback_free_slot(clp->cl_session);
678}
679
680/* A single slot, so highest used slotid is either 0 or -1 */
681void nfs4_cb_take_slot(struct nfs_client *clp)
682{
683 struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
684
685 spin_lock(&tbl->slot_tbl_lock);
686 tbl->highest_used_slotid++;
687 BUG_ON(tbl->highest_used_slotid != 0);
688 spin_unlock(&tbl->slot_tbl_lock);
689}
690
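Because the backchannel slot table has exactly one slot, highest_used_slotid only ever toggles between -1 (free) and 0 (busy); nfs4_cb_take_slot() and nfs4_callback_free_slot() move it one step each way under slot_tbl_lock, and the BUG_ON() asserts that invariant. A tiny single-threaded model of the pair, with asserts in place of the lock and BUG_ON:

#include <assert.h>
#include <stdio.h>

static int highest_used_slotid = -1;	/* -1 means the slot is free */

static void cb_take_slot(void)
{
	highest_used_slotid++;
	assert(highest_used_slotid == 0);	/* single slot only */
}

static void cb_free_slot(void)
{
	highest_used_slotid--;
	assert(highest_used_slotid == -1);
	/* this is where nfs4_check_drain_bc_complete() would run */
}

int main(void)
{
	cb_take_slot();		/* CB_SEQUENCE accepted the request */
	puts("processing callback compound");
	cb_free_slot();		/* end of nfs4_callback_compound() */
	return 0;
}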
596#else /* CONFIG_NFS_V4_1 */ 691#else /* CONFIG_NFS_V4_1 */
597 692
598static __be32 693static __be32
@@ -601,6 +696,9 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
601 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 696 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
602} 697}
603 698
699static void nfs4_cb_free_slot(struct nfs_client *clp)
700{
701}
604#endif /* CONFIG_NFS_V4_1 */ 702#endif /* CONFIG_NFS_V4_1 */
605 703
606static __be32 704static __be32
@@ -621,7 +719,8 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
621static __be32 process_op(uint32_t minorversion, int nop, 719static __be32 process_op(uint32_t minorversion, int nop,
622 struct svc_rqst *rqstp, 720 struct svc_rqst *rqstp,
623 struct xdr_stream *xdr_in, void *argp, 721 struct xdr_stream *xdr_in, void *argp,
624 struct xdr_stream *xdr_out, void *resp, int* drc_status) 722 struct xdr_stream *xdr_out, void *resp,
723 struct cb_process_state *cps)
625{ 724{
626 struct callback_op *op = &callback_ops[0]; 725 struct callback_op *op = &callback_ops[0];
627 unsigned int op_nr; 726 unsigned int op_nr;
@@ -644,8 +743,8 @@ static __be32 process_op(uint32_t minorversion, int nop,
644 if (status) 743 if (status)
645 goto encode_hdr; 744 goto encode_hdr;
646 745
647 if (*drc_status) { 746 if (cps->drc_status) {
648 status = *drc_status; 747 status = cps->drc_status;
649 goto encode_hdr; 748 goto encode_hdr;
650 } 749 }
651 750
@@ -653,16 +752,10 @@ static __be32 process_op(uint32_t minorversion, int nop,
653 if (maxlen > 0 && maxlen < PAGE_SIZE) { 752 if (maxlen > 0 && maxlen < PAGE_SIZE) {
654 status = op->decode_args(rqstp, xdr_in, argp); 753 status = op->decode_args(rqstp, xdr_in, argp);
655 if (likely(status == 0)) 754 if (likely(status == 0))
656 status = op->process_op(argp, resp); 755 status = op->process_op(argp, resp, cps);
657 } else 756 } else
658 status = htonl(NFS4ERR_RESOURCE); 757 status = htonl(NFS4ERR_RESOURCE);
659 758
660 /* Only set by OP_CB_SEQUENCE processing */
661 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
662 *drc_status = status;
663 status = 0;
664 }
665
666encode_hdr: 759encode_hdr:
667 res = encode_op_hdr(xdr_out, op_nr, status); 760 res = encode_op_hdr(xdr_out, op_nr, status);
668 if (unlikely(res)) 761 if (unlikely(res))
@@ -681,8 +774,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
681 struct cb_compound_hdr_arg hdr_arg = { 0 }; 774 struct cb_compound_hdr_arg hdr_arg = { 0 };
682 struct cb_compound_hdr_res hdr_res = { NULL }; 775 struct cb_compound_hdr_res hdr_res = { NULL };
683 struct xdr_stream xdr_in, xdr_out; 776 struct xdr_stream xdr_in, xdr_out;
684 __be32 *p; 777 __be32 *p, status;
685 __be32 status, drc_status = 0; 778 struct cb_process_state cps = {
779 .drc_status = 0,
780 .clp = NULL,
781 };
686 unsigned int nops = 0; 782 unsigned int nops = 0;
687 783
688 dprintk("%s: start\n", __func__); 784 dprintk("%s: start\n", __func__);
@@ -696,6 +792,12 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
696 if (status == __constant_htonl(NFS4ERR_RESOURCE)) 792 if (status == __constant_htonl(NFS4ERR_RESOURCE))
697 return rpc_garbage_args; 793 return rpc_garbage_args;
698 794
795 if (hdr_arg.minorversion == 0) {
796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
797 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
798 return rpc_drop_reply;
799 }
800
699 hdr_res.taglen = hdr_arg.taglen; 801 hdr_res.taglen = hdr_arg.taglen;
700 hdr_res.tag = hdr_arg.tag; 802 hdr_res.tag = hdr_arg.tag;
701 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 803 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
@@ -703,7 +805,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
703 805
704 while (status == 0 && nops != hdr_arg.nops) { 806 while (status == 0 && nops != hdr_arg.nops) {
705 status = process_op(hdr_arg.minorversion, nops, rqstp, 807 status = process_op(hdr_arg.minorversion, nops, rqstp,
706 &xdr_in, argp, &xdr_out, resp, &drc_status); 808 &xdr_in, argp, &xdr_out, resp, &cps);
707 nops++; 809 nops++;
708 } 810 }
709 811
@@ -716,6 +818,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
716 818
717 *hdr_res.status = status; 819 *hdr_res.status = status;
718 *hdr_res.nops = htonl(nops); 820 *hdr_res.nops = htonl(nops);
821 nfs4_cb_free_slot(cps.clp);
822 nfs_put_client(cps.clp);
719 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 823 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
720 return rpc_success; 824 return rpc_success;
721} 825}
@@ -739,6 +843,12 @@ static struct callback_op callback_ops[] = {
739 .res_maxsize = CB_OP_RECALL_RES_MAXSZ, 843 .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
740 }, 844 },
741#if defined(CONFIG_NFS_V4_1) 845#if defined(CONFIG_NFS_V4_1)
846 [OP_CB_LAYOUTRECALL] = {
847 .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
848 .decode_args =
849 (callback_decode_arg_t)decode_layoutrecall_args,
850 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
851 },
742 [OP_CB_SEQUENCE] = { 852 [OP_CB_SEQUENCE] = {
743 .process_op = (callback_process_op_t)nfs4_callback_sequence, 853 .process_op = (callback_process_op_t)nfs4_callback_sequence,
744 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, 854 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e7340729af89..bd3ca32879e7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
48#include "iostat.h" 48#include "iostat.h"
49#include "internal.h" 49#include "internal.h"
50#include "fscache.h" 50#include "fscache.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_CLIENT 53#define NFSDBG_FACILITY NFSDBG_CLIENT
53 54
@@ -55,6 +56,30 @@ static DEFINE_SPINLOCK(nfs_client_lock);
55static LIST_HEAD(nfs_client_list); 56static LIST_HEAD(nfs_client_list);
56static LIST_HEAD(nfs_volume_list); 57static LIST_HEAD(nfs_volume_list);
57static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); 58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
59#ifdef CONFIG_NFS_V4
60static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
61
62/*
63 * Get a unique NFSv4.0 callback identifier which will be used
64 * by the V4.0 callback service to lookup the nfs_client struct
65 */
66static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
67{
68 int ret = 0;
69
70 if (clp->rpc_ops->version != 4 || minorversion != 0)
71 return ret;
72retry:
73 if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
74 return -ENOMEM;
75 spin_lock(&nfs_client_lock);
76 ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident);
77 spin_unlock(&nfs_client_lock);
78 if (ret == -EAGAIN)
79 goto retry;
80 return ret;
81}
82#endif /* CONFIG_NFS_V4 */
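nfs_get_cb_ident_idr() uses the two-step IDR API of this era (later kernels replaced it with idr_alloc()): idr_pre_get() preallocates outside the spinlock because it may sleep, idr_get_new() publishes under the lock, and -EAGAIN sends the caller back around if a racing allocator consumed the preallocation. A loose userspace analogue of the prepare-outside, commit-under-lock shape; all names here are invented:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define TABLE_SIZE 8

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *table[TABLE_SIZE];

/* The "idr_get_new" step: publish under the lock, fail if no slot. */
static int table_insert(void *obj)
{
	int i, id = -1;

	pthread_mutex_lock(&table_lock);
	for (i = 0; i < TABLE_SIZE; i++) {
		if (table[i] == NULL) {
			table[i] = obj;
			id = i;
			break;
		}
	}
	pthread_mutex_unlock(&table_lock);
	return id;
}

int main(void)
{
	struct client { int dummy; } *clp = malloc(sizeof(*clp));
	int id;

	/* The "idr_pre_get" step happens here, outside the lock,
	 * because allocation may block; on a lost race the kernel
	 * caller loops back and preallocates again (-EAGAIN). */
	id = table_insert(clp);
	if (id < 0)
		return 1;	/* table full; the kernel would retry */

	printf("assigned cb_ident %d\n", id);
	free(clp);
	return 0;
}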
58 83
59/* 84/*
60 * RPC cruft for NFS 85 * RPC cruft for NFS
@@ -143,7 +168,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
143 clp->cl_proto = cl_init->proto; 168 clp->cl_proto = cl_init->proto;
144 169
145#ifdef CONFIG_NFS_V4 170#ifdef CONFIG_NFS_V4
146 INIT_LIST_HEAD(&clp->cl_delegations); 171 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
172 if (err)
173 goto error_cleanup;
174
147 spin_lock_init(&clp->cl_lock); 175 spin_lock_init(&clp->cl_lock);
148 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 176 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
149 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 177 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -155,7 +183,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
155 cred = rpc_lookup_machine_cred(); 183 cred = rpc_lookup_machine_cred();
156 if (!IS_ERR(cred)) 184 if (!IS_ERR(cred))
157 clp->cl_machine_cred = cred; 185 clp->cl_machine_cred = cred;
158 186#if defined(CONFIG_NFS_V4_1)
187 INIT_LIST_HEAD(&clp->cl_layouts);
188#endif
159 nfs_fscache_get_client_cookie(clp); 189 nfs_fscache_get_client_cookie(clp);
160 190
161 return clp; 191 return clp;
@@ -167,21 +197,17 @@ error_0:
167} 197}
168 198
169#ifdef CONFIG_NFS_V4 199#ifdef CONFIG_NFS_V4
170/*
171 * Clears/puts all minor version specific parts from an nfs_client struct
172 * reverting it to minorversion 0.
173 */
174static void nfs4_clear_client_minor_version(struct nfs_client *clp)
175{
176#ifdef CONFIG_NFS_V4_1 200#ifdef CONFIG_NFS_V4_1
177 if (nfs4_has_session(clp)) { 201static void nfs4_shutdown_session(struct nfs_client *clp)
202{
203 if (nfs4_has_session(clp))
178 nfs4_destroy_session(clp->cl_session); 204 nfs4_destroy_session(clp->cl_session);
179 clp->cl_session = NULL;
180 }
181
182 clp->cl_mvops = nfs_v4_minor_ops[0];
183#endif /* CONFIG_NFS_V4_1 */
184} 205}
206#else /* CONFIG_NFS_V4_1 */
207static void nfs4_shutdown_session(struct nfs_client *clp)
208{
209}
210#endif /* CONFIG_NFS_V4_1 */
185 211
186/* 212/*
187 * Destroy the NFS4 callback service 213 * Destroy the NFS4 callback service
@@ -196,17 +222,49 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
196{ 222{
197 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) 223 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
198 nfs4_kill_renewd(clp); 224 nfs4_kill_renewd(clp);
199 nfs4_clear_client_minor_version(clp); 225 nfs4_shutdown_session(clp);
200 nfs4_destroy_callback(clp); 226 nfs4_destroy_callback(clp);
201 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) 227 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
202 nfs_idmap_delete(clp); 228 nfs_idmap_delete(clp);
203 229
204 rpc_destroy_wait_queue(&clp->cl_rpcwaitq); 230 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
205} 231}
232
 233/* idr_remove_all is not needed as all ids are removed by nfs_put_client */
234void nfs_cleanup_cb_ident_idr(void)
235{
236 idr_destroy(&cb_ident_idr);
237}
238
239/* nfs_client_lock held */
240static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
241{
242 if (clp->cl_cb_ident)
243 idr_remove(&cb_ident_idr, clp->cl_cb_ident);
244}
245
246static void pnfs_init_server(struct nfs_server *server)
247{
248 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
249}
250
206#else 251#else
207static void nfs4_shutdown_client(struct nfs_client *clp) 252static void nfs4_shutdown_client(struct nfs_client *clp)
208{ 253{
209} 254}
255
256void nfs_cleanup_cb_ident_idr(void)
257{
258}
259
260static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
261{
262}
263
264static void pnfs_init_server(struct nfs_server *server)
265{
266}
267
210#endif /* CONFIG_NFS_V4 */ 268#endif /* CONFIG_NFS_V4 */
211 269
212/* 270/*
@@ -245,6 +303,7 @@ void nfs_put_client(struct nfs_client *clp)
245 303
246 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { 304 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
247 list_del(&clp->cl_share_link); 305 list_del(&clp->cl_share_link);
306 nfs_cb_idr_remove_locked(clp);
248 spin_unlock(&nfs_client_lock); 307 spin_unlock(&nfs_client_lock);
249 308
250 BUG_ON(!list_empty(&clp->cl_superblocks)); 309 BUG_ON(!list_empty(&clp->cl_superblocks));
@@ -252,6 +311,7 @@ void nfs_put_client(struct nfs_client *clp)
252 nfs_free_client(clp); 311 nfs_free_client(clp);
253 } 312 }
254} 313}
314EXPORT_SYMBOL_GPL(nfs_put_client);
255 315
256#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 316#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
257/* 317/*
@@ -359,70 +419,28 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
359 return 0; 419 return 0;
360} 420}
361 421
362/* 422/* Common match routine for v4.0 and v4.1 callback services */
363 * Find a client by IP address and protocol version 423bool
364 * - returns NULL if no such client 424nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
365 */ 425 u32 minorversion)
366struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
367{ 426{
368 struct nfs_client *clp; 427 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
369
370 spin_lock(&nfs_client_lock);
371 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
372 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
373 428
374 /* Don't match clients that failed to initialise properly */ 429 /* Don't match clients that failed to initialise */
375 if (!(clp->cl_cons_state == NFS_CS_READY || 430 if (!(clp->cl_cons_state == NFS_CS_READY ||
376 clp->cl_cons_state == NFS_CS_SESSION_INITING)) 431 clp->cl_cons_state == NFS_CS_SESSION_INITING))
377 continue; 432 return false;
378 433
379 /* Different NFS versions cannot share the same nfs_client */ 434 /* Match the version and minorversion */
380 if (clp->rpc_ops->version != nfsversion) 435 if (clp->rpc_ops->version != 4 ||
381 continue; 436 clp->cl_minorversion != minorversion)
437 return false;
382 438
383 /* Match only the IP address, not the port number */ 439 /* Match only the IP address, not the port number */
384 if (!nfs_sockaddr_match_ipaddr(addr, clap)) 440 if (!nfs_sockaddr_match_ipaddr(addr, clap))
385 continue; 441 return false;
386 442
387 atomic_inc(&clp->cl_count); 443 return true;
388 spin_unlock(&nfs_client_lock);
389 return clp;
390 }
391 spin_unlock(&nfs_client_lock);
392 return NULL;
393}
394
395/*
396 * Find a client by IP address and protocol version
397 * - returns NULL if no such client
398 */
399struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
400{
401 struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;
402 u32 nfsvers = clp->rpc_ops->version;
403
404 spin_lock(&nfs_client_lock);
405 list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {
406 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
407
408 /* Don't match clients that failed to initialise properly */
409 if (clp->cl_cons_state != NFS_CS_READY)
410 continue;
411
412 /* Different NFS versions cannot share the same nfs_client */
413 if (clp->rpc_ops->version != nfsvers)
414 continue;
415
416 /* Match only the IP address, not the port number */
417 if (!nfs_sockaddr_match_ipaddr(sap, clap))
418 continue;
419
420 atomic_inc(&clp->cl_count);
421 spin_unlock(&nfs_client_lock);
422 return clp;
423 }
424 spin_unlock(&nfs_client_lock);
425 return NULL;
426} 444}
427 445
428/* 446/*
@@ -601,6 +619,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
601{ 619{
602 struct rpc_clnt *clnt = NULL; 620 struct rpc_clnt *clnt = NULL;
603 struct rpc_create_args args = { 621 struct rpc_create_args args = {
622 .net = &init_net,
604 .protocol = clp->cl_proto, 623 .protocol = clp->cl_proto,
605 .address = (struct sockaddr *)&clp->cl_addr, 624 .address = (struct sockaddr *)&clp->cl_addr,
606 .addrsize = clp->cl_addrlen, 625 .addrsize = clp->cl_addrlen,
@@ -635,7 +654,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
635 */ 654 */
636static void nfs_destroy_server(struct nfs_server *server) 655static void nfs_destroy_server(struct nfs_server *server)
637{ 656{
638 if (!(server->flags & NFS_MOUNT_NONLM)) 657 if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) ||
658 !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
639 nlmclnt_done(server->nlm_host); 659 nlmclnt_done(server->nlm_host);
640} 660}
641 661
@@ -657,7 +677,8 @@ static int nfs_start_lockd(struct nfs_server *server)
657 677
658 if (nlm_init.nfs_version > 3) 678 if (nlm_init.nfs_version > 3)
659 return 0; 679 return 0;
660 if (server->flags & NFS_MOUNT_NONLM) 680 if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
681 (server->flags & NFS_MOUNT_LOCAL_FCNTL))
661 return 0; 682 return 0;
662 683
663 switch (clp->cl_proto) { 684 switch (clp->cl_proto) {
@@ -898,11 +919,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
898 if (server->wsize > NFS_MAX_FILE_IO_SIZE) 919 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
899 server->wsize = NFS_MAX_FILE_IO_SIZE; 920 server->wsize = NFS_MAX_FILE_IO_SIZE;
900 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 921 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
922 set_pnfs_layoutdriver(server, fsinfo->layouttype);
923
901 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); 924 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
902 925
903 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); 926 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
904 if (server->dtsize > PAGE_CACHE_SIZE) 927 if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
905 server->dtsize = PAGE_CACHE_SIZE; 928 server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
906 if (server->dtsize > server->rsize) 929 if (server->dtsize > server->rsize)
907 server->dtsize = server->rsize; 930 server->dtsize = server->rsize;
908 931
@@ -913,6 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
913 936
914 server->maxfilesize = fsinfo->maxfilesize; 937 server->maxfilesize = fsinfo->maxfilesize;
915 938
939 server->time_delta = fsinfo->time_delta;
940
 916 /* We're airborne: set socket buffer size */ 941 /* We're airborne: set socket buffer size */
917 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); 942 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
918} 943}
@@ -935,6 +960,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
935 } 960 }
936 961
937 fsinfo.fattr = fattr; 962 fsinfo.fattr = fattr;
963 fsinfo.layouttype = 0;
938 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); 964 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
939 if (error < 0) 965 if (error < 0)
940 goto out_error; 966 goto out_error;
@@ -976,6 +1002,27 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
976 target->options = source->options; 1002 target->options = source->options;
977} 1003}
978 1004
1005static void nfs_server_insert_lists(struct nfs_server *server)
1006{
1007 struct nfs_client *clp = server->nfs_client;
1008
1009 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list);
1012 spin_unlock(&nfs_client_lock);
1013
1014}
1015
1016static void nfs_server_remove_lists(struct nfs_server *server)
1017{
1018 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link);
1020 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock);
1022
1023 synchronize_rcu();
1024}
1025
979/* 1026/*
980 * Allocate and initialise a server record 1027 * Allocate and initialise a server record
981 */ 1028 */
@@ -992,6 +1039,7 @@ static struct nfs_server *nfs_alloc_server(void)
992 /* Zero out the NFS state stuff */ 1039 /* Zero out the NFS state stuff */
993 INIT_LIST_HEAD(&server->client_link); 1040 INIT_LIST_HEAD(&server->client_link);
994 INIT_LIST_HEAD(&server->master_link); 1041 INIT_LIST_HEAD(&server->master_link);
1042 INIT_LIST_HEAD(&server->delegations);
995 1043
996 atomic_set(&server->active, 0); 1044 atomic_set(&server->active, 0);
997 1045
@@ -1007,6 +1055,8 @@ static struct nfs_server *nfs_alloc_server(void)
1007 return NULL; 1055 return NULL;
1008 } 1056 }
1009 1057
1058 pnfs_init_server(server);
1059
1010 return server; 1060 return server;
1011} 1061}
1012 1062
@@ -1017,10 +1067,8 @@ void nfs_free_server(struct nfs_server *server)
1017{ 1067{
1018 dprintk("--> nfs_free_server()\n"); 1068 dprintk("--> nfs_free_server()\n");
1019 1069
1020 spin_lock(&nfs_client_lock); 1070 nfs_server_remove_lists(server);
1021 list_del(&server->client_link); 1071 unset_pnfs_layoutdriver(server);
1022 list_del(&server->master_link);
1023 spin_unlock(&nfs_client_lock);
1024 1072
1025 if (server->destroy != NULL) 1073 if (server->destroy != NULL)
1026 server->destroy(server); 1074 server->destroy(server);
@@ -1095,11 +1143,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1095 (unsigned long long) server->fsid.major, 1143 (unsigned long long) server->fsid.major,
1096 (unsigned long long) server->fsid.minor); 1144 (unsigned long long) server->fsid.minor);
1097 1145
1098 spin_lock(&nfs_client_lock); 1146 nfs_server_insert_lists(server);
1099 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1100 list_add_tail(&server->master_link, &nfs_volume_list);
1101 spin_unlock(&nfs_client_lock);
1102
1103 server->mount_time = jiffies; 1147 server->mount_time = jiffies;
1104 nfs_free_fattr(fattr); 1148 nfs_free_fattr(fattr);
1105 return server; 1149 return server;
@@ -1112,6 +1156,96 @@ error:
1112 1156
1113#ifdef CONFIG_NFS_V4 1157#ifdef CONFIG_NFS_V4
1114/* 1158/*
1159 * NFSv4.0 callback thread helper
1160 *
1161 * Find a client by IP address, protocol version, and minorversion
1162 *
1163 * Called from the pg_authenticate method. The callback identifier
1164 * is not used as it has not been decoded.
1165 *
1166 * Returns NULL if no such client
1167 */
1168struct nfs_client *
1169nfs4_find_client_no_ident(const struct sockaddr *addr)
1170{
1171 struct nfs_client *clp;
1172
1173 spin_lock(&nfs_client_lock);
1174 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1175 if (nfs4_cb_match_client(addr, clp, 0) == false)
1176 continue;
1177 atomic_inc(&clp->cl_count);
1178 spin_unlock(&nfs_client_lock);
1179 return clp;
1180 }
1181 spin_unlock(&nfs_client_lock);
1182 return NULL;
1183}
1184
1185/*
1186 * NFSv4.0 callback thread helper
1187 *
1188 * Find a client by callback identifier
1189 */
1190struct nfs_client *
1191nfs4_find_client_ident(int cb_ident)
1192{
1193 struct nfs_client *clp;
1194
1195 spin_lock(&nfs_client_lock);
1196 clp = idr_find(&cb_ident_idr, cb_ident);
1197 if (clp)
1198 atomic_inc(&clp->cl_count);
1199 spin_unlock(&nfs_client_lock);
1200 return clp;
1201}
1202
1203#if defined(CONFIG_NFS_V4_1)
1204/*
1205 * NFSv4.1 callback thread helper
1206 * For CB_COMPOUND calls, find a client by IP address, protocol version,
1207 * minorversion, and sessionID
1208 *
1209 * Returns NULL if no such client
1210 */
1211struct nfs_client *
1212nfs4_find_client_sessionid(const struct sockaddr *addr,
1213 struct nfs4_sessionid *sid)
1214{
1215 struct nfs_client *clp;
1216
1217 spin_lock(&nfs_client_lock);
1218 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1219 if (nfs4_cb_match_client(addr, clp, 1) == false)
1220 continue;
1221
1222 if (!nfs4_has_session(clp))
1223 continue;
1224
 1225 /* Match sessionid */
1226 if (memcmp(clp->cl_session->sess_id.data,
1227 sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
1228 continue;
1229
1230 atomic_inc(&clp->cl_count);
1231 spin_unlock(&nfs_client_lock);
1232 return clp;
1233 }
1234 spin_unlock(&nfs_client_lock);
1235 return NULL;
1236}
1237
1238#else /* CONFIG_NFS_V4_1 */
1239
1240struct nfs_client *
1241nfs4_find_client_sessionid(const struct sockaddr *addr,
1242 struct nfs4_sessionid *sid)
1243{
1244 return NULL;
1245}
1246#endif /* CONFIG_NFS_V4_1 */
1247
1248/*
1115 * Initialize the NFS4 callback service 1249 * Initialize the NFS4 callback service
1116 */ 1250 */
1117static int nfs4_init_callback(struct nfs_client *clp) 1251static int nfs4_init_callback(struct nfs_client *clp)
@@ -1329,11 +1463,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1329 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) 1463 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1330 server->namelen = NFS4_MAXNAMLEN; 1464 server->namelen = NFS4_MAXNAMLEN;
1331 1465
1332 spin_lock(&nfs_client_lock); 1466 nfs_server_insert_lists(server);
1333 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1334 list_add_tail(&server->master_link, &nfs_volume_list);
1335 spin_unlock(&nfs_client_lock);
1336
1337 server->mount_time = jiffies; 1467 server->mount_time = jiffies;
1338out: 1468out:
1339 nfs_free_fattr(fattr); 1469 nfs_free_fattr(fattr);
@@ -1356,8 +1486,9 @@ static int nfs4_init_server(struct nfs_server *server,
1356 1486
1357 /* Initialise the client representation from the mount data */ 1487 /* Initialise the client representation from the mount data */
1358 server->flags = data->flags; 1488 server->flags = data->flags;
1359 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR| 1489 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
1360 NFS_CAP_POSIX_LOCK; 1490 if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
1491 server->caps |= NFS_CAP_READDIRPLUS;
1361 server->options = data->options; 1492 server->options = data->options;
1362 1493
1363 /* Get a client record */ 1494 /* Get a client record */
@@ -1537,11 +1668,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1537 if (error < 0) 1668 if (error < 0)
1538 goto out_free_server; 1669 goto out_free_server;
1539 1670
1540 spin_lock(&nfs_client_lock); 1671 nfs_server_insert_lists(server);
1541 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1542 list_add_tail(&server->master_link, &nfs_volume_list);
1543 spin_unlock(&nfs_client_lock);
1544
1545 server->mount_time = jiffies; 1672 server->mount_time = jiffies;
1546 1673
1547 nfs_free_fattr(fattr_fsinfo); 1674 nfs_free_fattr(fattr_fsinfo);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index b9c3c43cea1d..bbbc6bf5cb2e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/smp_lock.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16 15
17#include <linux/nfs4.h> 16#include <linux/nfs4.h>
@@ -24,8 +23,6 @@
24 23
25static void nfs_do_free_delegation(struct nfs_delegation *delegation) 24static void nfs_do_free_delegation(struct nfs_delegation *delegation)
26{ 25{
27 if (delegation->cred)
28 put_rpccred(delegation->cred);
29 kfree(delegation); 26 kfree(delegation);
30} 27}
31 28
@@ -38,14 +35,30 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
38 35
39static void nfs_free_delegation(struct nfs_delegation *delegation) 36static void nfs_free_delegation(struct nfs_delegation *delegation)
40{ 37{
38 if (delegation->cred) {
39 put_rpccred(delegation->cred);
40 delegation->cred = NULL;
41 }
41 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 42 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
42} 43}
43 44
45/**
46 * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
47 * @delegation: delegation to process
48 *
49 */
44void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 50void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
45{ 51{
46 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); 52 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
47} 53}
48 54
55/**
56 * nfs_have_delegation - check if inode has a delegation
57 * @inode: inode to check
58 * @flags: delegation types to check for
59 *
60 * Returns one if inode has the indicated delegation, otherwise zero.
61 */
49int nfs_have_delegation(struct inode *inode, fmode_t flags) 62int nfs_have_delegation(struct inode *inode, fmode_t flags)
50{ 63{
51 struct nfs_delegation *delegation; 64 struct nfs_delegation *delegation;
@@ -71,20 +84,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
71 if (inode->i_flock == NULL) 84 if (inode->i_flock == NULL)
72 goto out; 85 goto out;
73 86
74 /* Protect inode->i_flock using the BKL */ 87 /* Protect inode->i_flock using the file locks lock */
75 lock_kernel(); 88 lock_flocks();
76 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 89 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
77 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 90 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
78 continue; 91 continue;
79 if (nfs_file_open_context(fl->fl_file) != ctx) 92 if (nfs_file_open_context(fl->fl_file) != ctx)
80 continue; 93 continue;
81 unlock_kernel(); 94 unlock_flocks();
82 status = nfs4_lock_delegation_recall(state, fl); 95 status = nfs4_lock_delegation_recall(state, fl);
83 if (status < 0) 96 if (status < 0)
84 goto out; 97 goto out;
85 lock_kernel(); 98 lock_flocks();
86 } 99 }
87 unlock_kernel(); 100 unlock_flocks();
88out: 101out:
89 return status; 102 return status;
90} 103}
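The lock_kernel()-to-lock_flocks() conversion keeps the original shape of nfs_delegation_claim_locks(): the i_flock chain is guarded by a lock that cannot be held across the blocking nfs4_lock_delegation_recall() call, so the walk releases it around the RPC and retakes it before advancing. A userspace sketch of that unlock-around-the-slow-path pattern, with a fixed array standing in for the lock chain:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t flock_lock = PTHREAD_MUTEX_INITIALIZER;
static int locks[3] = { 10, 20, 30 };

static int recall_one(int lock_id)	/* stands in for the RPC call */
{
	printf("recalling lock %d (may block)\n", lock_id);
	return 0;
}

static int claim_locks(void)
{
	int status = 0;
	int i;

	pthread_mutex_lock(&flock_lock);
	for (i = 0; i < 3; i++) {
		int id = locks[i];

		pthread_mutex_unlock(&flock_lock);	/* recall_one() can block */
		status = recall_one(id);
		if (status < 0)
			return status;
		pthread_mutex_lock(&flock_lock);	/* retake before the next step */
	}
	pthread_mutex_unlock(&flock_lock);
	return status;
}

int main(void)
{
	return claim_locks();
}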
@@ -120,10 +133,15 @@ again:
120 return 0; 133 return 0;
121} 134}
122 135
123/* 136/**
124 * Set up a delegation on an inode 137 * nfs_inode_reclaim_delegation - process a delegation reclaim request
138 * @inode: inode to process
139 * @cred: credential to use for request
140 * @res: new delegation state from server
141 *
125 */ 142 */
126void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 143void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
144 struct nfs_openres *res)
127{ 145{
128 struct nfs_delegation *delegation; 146 struct nfs_delegation *delegation;
129 struct rpc_cred *oldcred = NULL; 147 struct rpc_cred *oldcred = NULL;
@@ -176,38 +194,52 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
176 return inode; 194 return inode;
177} 195}
178 196
179static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, 197static struct nfs_delegation *
180 const nfs4_stateid *stateid, 198nfs_detach_delegation_locked(struct nfs_inode *nfsi,
181 struct nfs_client *clp) 199 struct nfs_server *server)
182{ 200{
183 struct nfs_delegation *delegation = 201 struct nfs_delegation *delegation =
184 rcu_dereference_protected(nfsi->delegation, 202 rcu_dereference_protected(nfsi->delegation,
185 lockdep_is_held(&clp->cl_lock)); 203 lockdep_is_held(&server->nfs_client->cl_lock));
186 204
187 if (delegation == NULL) 205 if (delegation == NULL)
188 goto nomatch; 206 goto nomatch;
207
189 spin_lock(&delegation->lock); 208 spin_lock(&delegation->lock);
190 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
191 sizeof(delegation->stateid.data)) != 0)
192 goto nomatch_unlock;
193 list_del_rcu(&delegation->super_list); 209 list_del_rcu(&delegation->super_list);
194 delegation->inode = NULL; 210 delegation->inode = NULL;
195 nfsi->delegation_state = 0; 211 nfsi->delegation_state = 0;
196 rcu_assign_pointer(nfsi->delegation, NULL); 212 rcu_assign_pointer(nfsi->delegation, NULL);
197 spin_unlock(&delegation->lock); 213 spin_unlock(&delegation->lock);
198 return delegation; 214 return delegation;
199nomatch_unlock:
200 spin_unlock(&delegation->lock);
201nomatch: 215nomatch:
202 return NULL; 216 return NULL;
203} 217}
204 218
205/* 219static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
206 * Set up a delegation on an inode 220 struct nfs_server *server)
221{
222 struct nfs_client *clp = server->nfs_client;
223 struct nfs_delegation *delegation;
224
225 spin_lock(&clp->cl_lock);
226 delegation = nfs_detach_delegation_locked(nfsi, server);
227 spin_unlock(&clp->cl_lock);
228 return delegation;
229}
230
231/**
232 * nfs_inode_set_delegation - set up a delegation on an inode
233 * @inode: inode to which delegation applies
234 * @cred: cred to use for subsequent delegation processing
235 * @res: new delegation state from server
236 *
237 * Returns zero on success, or a negative errno value.
207 */ 238 */
208int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 239int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
209{ 240{
210 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 241 struct nfs_server *server = NFS_SERVER(inode);
242 struct nfs_client *clp = server->nfs_client;
211 struct nfs_inode *nfsi = NFS_I(inode); 243 struct nfs_inode *nfsi = NFS_I(inode);
212 struct nfs_delegation *delegation, *old_delegation; 244 struct nfs_delegation *delegation, *old_delegation;
213 struct nfs_delegation *freeme = NULL; 245 struct nfs_delegation *freeme = NULL;
@@ -228,7 +260,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
228 260
229 spin_lock(&clp->cl_lock); 261 spin_lock(&clp->cl_lock);
230 old_delegation = rcu_dereference_protected(nfsi->delegation, 262 old_delegation = rcu_dereference_protected(nfsi->delegation,
231 lockdep_is_held(&clp->cl_lock)); 263 lockdep_is_held(&clp->cl_lock));
232 if (old_delegation != NULL) { 264 if (old_delegation != NULL) {
233 if (memcmp(&delegation->stateid, &old_delegation->stateid, 265 if (memcmp(&delegation->stateid, &old_delegation->stateid,
234 sizeof(old_delegation->stateid)) == 0 && 266 sizeof(old_delegation->stateid)) == 0 &&
@@ -247,9 +279,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
247 delegation = NULL; 279 delegation = NULL;
248 goto out; 280 goto out;
249 } 281 }
250 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); 282 freeme = nfs_detach_delegation_locked(nfsi, server);
251 } 283 }
252 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 284 list_add_rcu(&delegation->super_list, &server->delegations);
253 nfsi->delegation_state = delegation->type; 285 nfsi->delegation_state = delegation->type;
254 rcu_assign_pointer(nfsi->delegation, delegation); 286 rcu_assign_pointer(nfsi->delegation, delegation);
255 delegation = NULL; 287 delegation = NULL;
@@ -291,73 +323,85 @@ out:
291 return err; 323 return err;
292} 324}
293 325
294/* 326/**
295 * Return all delegations that have been marked for return 327 * nfs_client_return_marked_delegations - return previously marked delegations
328 * @clp: nfs_client to process
329 *
330 * Returns zero on success, or a negative errno value.
296 */ 331 */
297int nfs_client_return_marked_delegations(struct nfs_client *clp) 332int nfs_client_return_marked_delegations(struct nfs_client *clp)
298{ 333{
299 struct nfs_delegation *delegation; 334 struct nfs_delegation *delegation;
335 struct nfs_server *server;
300 struct inode *inode; 336 struct inode *inode;
301 int err = 0; 337 int err = 0;
302 338
303restart: 339restart:
304 rcu_read_lock(); 340 rcu_read_lock();
305 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 341 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
306 if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) 342 list_for_each_entry_rcu(delegation, &server->delegations,
307 continue; 343 super_list) {
308 inode = nfs_delegation_grab_inode(delegation); 344 if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
309 if (inode == NULL) 345 &delegation->flags))
310 continue; 346 continue;
311 spin_lock(&clp->cl_lock); 347 inode = nfs_delegation_grab_inode(delegation);
312 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 348 if (inode == NULL)
313 spin_unlock(&clp->cl_lock); 349 continue;
314 rcu_read_unlock(); 350 delegation = nfs_detach_delegation(NFS_I(inode),
315 if (delegation != NULL) { 351 server);
316 filemap_flush(inode->i_mapping); 352 rcu_read_unlock();
317 err = __nfs_inode_return_delegation(inode, delegation, 0); 353
354 if (delegation != NULL) {
355 filemap_flush(inode->i_mapping);
356 err = __nfs_inode_return_delegation(inode,
357 delegation, 0);
358 }
359 iput(inode);
360 if (!err)
361 goto restart;
362 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
363 return err;
318 } 364 }
319 iput(inode);
320 if (!err)
321 goto restart;
322 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
323 return err;
324 } 365 }
325 rcu_read_unlock(); 366 rcu_read_unlock();
326 return 0; 367 return 0;
327} 368}
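The rewritten nfs_client_return_marked_delegations() iterates the per-server delegation lists under rcu_read_lock(), but the actual return can block, so after detaching one delegation it drops the RCU read lock and restarts the whole walk; forward progress is guaranteed because NFS_DELEGATION_RETURN is test-and-cleared before processing. The goto-restart idiom in miniature:

#include <stdio.h>

#define N 4

static int marked[N] = { 0, 1, 0, 1 };

static void process(int i)	/* stands in for the blocking return */
{
	printf("returning delegation %d\n", i);
}

int main(void)
{
	int i;

restart:
	for (i = 0; i < N; i++) {
		if (!marked[i])
			continue;
		marked[i] = 0;	/* test-and-clear: won't be seen again */
		/* in the kernel, rcu_read_unlock() happens here */
		process(i);
		goto restart;	/* the cursor is stale after blocking */
	}
	return 0;
}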
328 369
329/* 370/**
330 * This function returns the delegation without reclaiming opens 371 * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
331 * or protecting against delegation reclaims. 372 * @inode: inode to process
332 * It is therefore really only safe to be called from 373 *
333 * nfs4_clear_inode() 374 * Does not protect against delegation reclaims, therefore really only safe
375 * to be called from nfs4_clear_inode().
334 */ 376 */
335void nfs_inode_return_delegation_noreclaim(struct inode *inode) 377void nfs_inode_return_delegation_noreclaim(struct inode *inode)
336{ 378{
337 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 379 struct nfs_server *server = NFS_SERVER(inode);
338 struct nfs_inode *nfsi = NFS_I(inode); 380 struct nfs_inode *nfsi = NFS_I(inode);
339 struct nfs_delegation *delegation; 381 struct nfs_delegation *delegation;
340 382
341 if (rcu_access_pointer(nfsi->delegation) != NULL) { 383 if (rcu_access_pointer(nfsi->delegation) != NULL) {
342 spin_lock(&clp->cl_lock); 384 delegation = nfs_detach_delegation(nfsi, server);
343 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
344 spin_unlock(&clp->cl_lock);
345 if (delegation != NULL) 385 if (delegation != NULL)
346 nfs_do_return_delegation(inode, delegation, 0); 386 nfs_do_return_delegation(inode, delegation, 0);
347 } 387 }
348} 388}
349 389
390/**
391 * nfs_inode_return_delegation - synchronously return a delegation
392 * @inode: inode to process
393 *
394 * Returns zero on success, or a negative errno value.
395 */
350int nfs_inode_return_delegation(struct inode *inode) 396int nfs_inode_return_delegation(struct inode *inode)
351{ 397{
352 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 398 struct nfs_server *server = NFS_SERVER(inode);
353 struct nfs_inode *nfsi = NFS_I(inode); 399 struct nfs_inode *nfsi = NFS_I(inode);
354 struct nfs_delegation *delegation; 400 struct nfs_delegation *delegation;
355 int err = 0; 401 int err = 0;
356 402
357 if (rcu_access_pointer(nfsi->delegation) != NULL) { 403 if (rcu_access_pointer(nfsi->delegation) != NULL) {
358 spin_lock(&clp->cl_lock); 404 delegation = nfs_detach_delegation(nfsi, server);
359 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
360 spin_unlock(&clp->cl_lock);
361 if (delegation != NULL) { 405 if (delegation != NULL) {
362 nfs_wb_all(inode); 406 nfs_wb_all(inode);
363 err = __nfs_inode_return_delegation(inode, delegation, 1); 407 err = __nfs_inode_return_delegation(inode, delegation, 1);
@@ -366,46 +410,61 @@ int nfs_inode_return_delegation(struct inode *inode)
366 return err; 410 return err;
367} 411}
368 412
369static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) 413static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
370{ 414{
415 struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
416
371 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 417 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
372 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 418 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
373} 419}
374 420
375/* 421/**
376 * Return all delegations associated to a super block 422 * nfs_super_return_all_delegations - return delegations for one superblock
423 * @sb: sb to process
424 *
377 */ 425 */
378void nfs_super_return_all_delegations(struct super_block *sb) 426void nfs_super_return_all_delegations(struct super_block *sb)
379{ 427{
380 struct nfs_client *clp = NFS_SB(sb)->nfs_client; 428 struct nfs_server *server = NFS_SB(sb);
429 struct nfs_client *clp = server->nfs_client;
381 struct nfs_delegation *delegation; 430 struct nfs_delegation *delegation;
382 431
383 if (clp == NULL) 432 if (clp == NULL)
384 return; 433 return;
434
385 rcu_read_lock(); 435 rcu_read_lock();
386 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 436 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
387 spin_lock(&delegation->lock); 437 spin_lock(&delegation->lock);
388 if (delegation->inode != NULL && delegation->inode->i_sb == sb) 438 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
389 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
390 spin_unlock(&delegation->lock); 439 spin_unlock(&delegation->lock);
391 } 440 }
392 rcu_read_unlock(); 441 rcu_read_unlock();
442
393 if (nfs_client_return_marked_delegations(clp) != 0) 443 if (nfs_client_return_marked_delegations(clp) != 0)
394 nfs4_schedule_state_manager(clp); 444 nfs4_schedule_state_manager(clp);
395} 445}
396 446
397static 447static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
398void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) 448 fmode_t flags)
399{ 449{
400 struct nfs_delegation *delegation; 450 struct nfs_delegation *delegation;
401 451
402 rcu_read_lock(); 452 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
403 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
404 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) 453 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
405 continue; 454 continue;
406 if (delegation->type & flags) 455 if (delegation->type & flags)
407 nfs_mark_return_delegation(clp, delegation); 456 nfs_mark_return_delegation(delegation);
408 } 457 }
458}
459
460static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
461 fmode_t flags)
462{
463 struct nfs_server *server;
464
465 rcu_read_lock();
466 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
467 nfs_mark_return_all_delegation_types(server, flags);
409 rcu_read_unlock(); 468 rcu_read_unlock();
410} 469}
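
The pattern introduced here recurs throughout the patch: the per-client clp->cl_delegations list is gone, and every walker now takes the RCU-protected list of nfs_server structures on clp->cl_superblocks, then walks each server's ->delegations list. A minimal user-space model of that two-level walk and of the fmode filter above, with plain singly linked lists standing in for the kernel's RCU lists (all names and types below are illustrative, not the kernel's):

#include <stdio.h>

#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

struct delegation {
	int type;                 /* FMODE_READ, FMODE_WRITE, or both */
	int marked_for_return;
	struct delegation *next;
};

struct server {
	struct delegation *delegations;
	struct server *next;
};

struct client {
	struct server *superblocks;
};

/* Per-server helper, cf. nfs_mark_return_all_delegation_types(). */
static void mark_server(struct server *s, int flags)
{
	for (struct delegation *d = s->delegations; d != NULL; d = d->next) {
		/* a READ|WRITE delegation is only returned when WRITE expires */
		if (d->type == (FMODE_READ | FMODE_WRITE) && !(flags & FMODE_WRITE))
			continue;
		if (d->type & flags)
			d->marked_for_return = 1;
	}
}

/* Per-client wrapper, cf. the walk over clp->cl_superblocks. */
static void mark_client(struct client *c, int flags)
{
	for (struct server *s = c->superblocks; s != NULL; s = s->next)
		mark_server(s, flags);
}

int main(void)
{
	struct delegation d1 = { FMODE_READ, 0, NULL };
	struct delegation d2 = { FMODE_READ | FMODE_WRITE, 0, &d1 };
	struct server srv = { &d2, NULL };
	struct client cli = { &srv };

	mark_client(&cli, FMODE_READ);    /* marks d1; d2 survives the filter */
	printf("d1=%d d2=%d\n", d1.marked_for_return, d2.marked_for_return);
	return 0;
}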
411 470
@@ -420,19 +479,32 @@ static void nfs_delegation_run_state_manager(struct nfs_client *clp)
420 nfs4_schedule_state_manager(clp); 479 nfs4_schedule_state_manager(clp);
421} 480}
422 481
482/**
483 * nfs_expire_all_delegation_types - expire delegations of the given types
484 * @clp: client to process
485 * @flags: delegation types to expire
486 *
487 */
423void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) 488void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
424{ 489{
425 nfs_client_mark_return_all_delegation_types(clp, flags); 490 nfs_client_mark_return_all_delegation_types(clp, flags);
426 nfs_delegation_run_state_manager(clp); 491 nfs_delegation_run_state_manager(clp);
427} 492}
428 493
494/**
495 * nfs_expire_all_delegations - expire all of a client's delegations
496 * @clp: client to process
497 *
498 */
429void nfs_expire_all_delegations(struct nfs_client *clp) 499void nfs_expire_all_delegations(struct nfs_client *clp)
430{ 500{
431 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); 501 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
432} 502}
433 503
434/* 504/**
435 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 505 * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
506 * @clp: client to process
507 *
436 */ 508 */
437void nfs_handle_cb_pathdown(struct nfs_client *clp) 509void nfs_handle_cb_pathdown(struct nfs_client *clp)
438{ 510{
@@ -441,29 +513,43 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp)
441 nfs_client_mark_return_all_delegations(clp); 513 nfs_client_mark_return_all_delegations(clp);
442} 514}
443 515
444static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) 516static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
445{ 517{
446 struct nfs_delegation *delegation; 518 struct nfs_delegation *delegation;
447 519
448 rcu_read_lock(); 520 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
449 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
450 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) 521 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
451 continue; 522 continue;
452 nfs_mark_return_delegation(clp, delegation); 523 nfs_mark_return_delegation(delegation);
453 } 524 }
454 rcu_read_unlock();
455} 525}
456 526
527/**
528 * nfs_expire_unreferenced_delegations - Eliminate unused delegations
529 * @clp: nfs_client to process
530 *
531 */
457void nfs_expire_unreferenced_delegations(struct nfs_client *clp) 532void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
458{ 533{
459 nfs_client_mark_return_unreferenced_delegations(clp); 534 struct nfs_server *server;
535
536 rcu_read_lock();
537 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
538 nfs_mark_return_unreferenced_delegations(server);
539 rcu_read_unlock();
540
460 nfs_delegation_run_state_manager(clp); 541 nfs_delegation_run_state_manager(clp);
461} 542}
462 543
463/* 544/**
464 * Asynchronous delegation recall! 545 * nfs_async_inode_return_delegation - asynchronously return a delegation
546 * @inode: inode to process
547 * @stateid: state ID information from CB_RECALL arguments
548 *
549 * Returns zero on success, or a negative errno value.
465 */ 550 */
466int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 551int nfs_async_inode_return_delegation(struct inode *inode,
552 const nfs4_stateid *stateid)
467{ 553{
468 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 554 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
469 struct nfs_delegation *delegation; 555 struct nfs_delegation *delegation;
@@ -475,22 +561,21 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
475 rcu_read_unlock(); 561 rcu_read_unlock();
476 return -ENOENT; 562 return -ENOENT;
477 } 563 }
478 564 nfs_mark_return_delegation(delegation);
479 nfs_mark_return_delegation(clp, delegation);
480 rcu_read_unlock(); 565 rcu_read_unlock();
566
481 nfs_delegation_run_state_manager(clp); 567 nfs_delegation_run_state_manager(clp);
482 return 0; 568 return 0;
483} 569}
484 570
485/* 571static struct inode *
486 * Retrieve the inode associated with a delegation 572nfs_delegation_find_inode_server(struct nfs_server *server,
487 */ 573 const struct nfs_fh *fhandle)
488struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
489{ 574{
490 struct nfs_delegation *delegation; 575 struct nfs_delegation *delegation;
491 struct inode *res = NULL; 576 struct inode *res = NULL;
492 rcu_read_lock(); 577
493 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 578 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
494 spin_lock(&delegation->lock); 579 spin_lock(&delegation->lock);
495 if (delegation->inode != NULL && 580 if (delegation->inode != NULL &&
496 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { 581 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
@@ -500,49 +585,121 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
500 if (res != NULL) 585 if (res != NULL)
501 break; 586 break;
502 } 587 }
588 return res;
589}
590
591/**
592 * nfs_delegation_find_inode - retrieve the inode associated with a delegation
593 * @clp: client state handle
594 * @fhandle: filehandle from a delegation recall
595 *
596 * Returns pointer to inode matching "fhandle," or NULL if a matching inode
597 * cannot be found.
598 */
599struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
600 const struct nfs_fh *fhandle)
601{
602 struct nfs_server *server;
603 struct inode *res = NULL;
604
605 rcu_read_lock();
606 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
607 res = nfs_delegation_find_inode_server(server, fhandle);
608 if (res != NULL)
609 break;
610 }
503 rcu_read_unlock(); 611 rcu_read_unlock();
504 return res; 612 return res;
505} 613}
506 614
507/* 615static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
508 * Mark all delegations as needing to be reclaimed 616{
617 struct nfs_delegation *delegation;
618
619 list_for_each_entry_rcu(delegation, &server->delegations, super_list)
620 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
621}
622
623/**
624 * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
625 * @clp: nfs_client to process
626 *
509 */ 627 */
510void nfs_delegation_mark_reclaim(struct nfs_client *clp) 628void nfs_delegation_mark_reclaim(struct nfs_client *clp)
511{ 629{
512 struct nfs_delegation *delegation; 630 struct nfs_server *server;
631
513 rcu_read_lock(); 632 rcu_read_lock();
514 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) 633 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
515 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); 634 nfs_delegation_mark_reclaim_server(server);
516 rcu_read_unlock(); 635 rcu_read_unlock();
517} 636}
518 637
519/* 638/**
520 * Reap all unclaimed delegations after reboot recovery is done 639 * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
640 * @clp: nfs_client to process
641 *
521 */ 642 */
522void nfs_delegation_reap_unclaimed(struct nfs_client *clp) 643void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
523{ 644{
524 struct nfs_delegation *delegation; 645 struct nfs_delegation *delegation;
646 struct nfs_server *server;
525 struct inode *inode; 647 struct inode *inode;
648
526restart: 649restart:
527 rcu_read_lock(); 650 rcu_read_lock();
528 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 651 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
529 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) 652 list_for_each_entry_rcu(delegation, &server->delegations,
530 continue; 653 super_list) {
531 inode = nfs_delegation_grab_inode(delegation); 654 if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
532 if (inode == NULL) 655 &delegation->flags) == 0)
533 continue; 656 continue;
534 spin_lock(&clp->cl_lock); 657 inode = nfs_delegation_grab_inode(delegation);
535 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); 658 if (inode == NULL)
536 spin_unlock(&clp->cl_lock); 659 continue;
537 rcu_read_unlock(); 660 delegation = nfs_detach_delegation(NFS_I(inode),
538 if (delegation != NULL) 661 server);
539 nfs_free_delegation(delegation); 662 rcu_read_unlock();
540 iput(inode); 663
541 goto restart; 664 if (delegation != NULL)
665 nfs_free_delegation(delegation);
666 iput(inode);
667 goto restart;
668 }
542 } 669 }
543 rcu_read_unlock(); 670 rcu_read_unlock();
544} 671}
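
nfs_delegation_reap_unclaimed() keeps the restart discipline of the old loop: once a delegation is detached, the RCU read lock is dropped, the delegation is freed, the inode is released with iput(), and the whole scan starts over, because the lists may have changed while the lock was not held. A condensed user-space model of the restart-after-removal pattern over one plain list (simplified types; no RCU or locking here):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int need_reclaim;
	struct node *next;
};

/* Remove every flagged node, restarting the scan after each removal,
 * just as the kernel rescans after dropping rcu_read_lock(). */
static void reap(struct node **head)
{
restart:
	for (struct node **pp = head; *pp != NULL; pp = &(*pp)->next) {
		if (!(*pp)->need_reclaim)
			continue;
		struct node *victim = *pp;

		*pp = victim->next;     /* cf. nfs_detach_delegation() */
		free(victim);           /* cf. nfs_free_delegation() + iput() */
		goto restart;           /* the list may have changed; rescan */
	}
}

int main(void)
{
	struct node *head = NULL;

	for (int i = 0; i < 5; i++) {
		struct node *n = malloc(sizeof(*n));

		if (n == NULL)
			return 1;
		n->need_reclaim = (i % 2 == 0);   /* flag three of five nodes */
		n->next = head;
		head = n;
	}
	reap(&head);

	int left = 0;
	for (struct node *n = head; n != NULL; n = n->next)
		left++;
	printf("%d nodes left\n", left);          /* prints "2 nodes left" */
	return 0;
}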
545 672
673/**
674 * nfs_delegations_present - check for existence of delegations
675 * @clp: client state handle
676 *
677 * Returns one if there are any nfs_delegation structures attached
678 * to this nfs_client.
679 */
680int nfs_delegations_present(struct nfs_client *clp)
681{
682 struct nfs_server *server;
683 int ret = 0;
684
685 rcu_read_lock();
686 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
687 if (!list_empty(&server->delegations)) {
688 ret = 1;
689 break;
690 }
691 rcu_read_unlock();
692 return ret;
693}
694
695/**
696 * nfs4_copy_delegation_stateid - Copy inode's state ID information
697 * @dst: stateid data structure to fill in
698 * @inode: inode to check
699 *
700 * Returns one and fills in "dst->data" if inode had a delegation,
701 * otherwise zero is returned.
702 */
546int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) 703int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
547{ 704{
548 struct nfs_inode *nfsi = NFS_I(inode); 705 struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2026304bda19..d9322e490c56 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -44,6 +44,7 @@ void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
44void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 44void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
45void nfs_handle_cb_pathdown(struct nfs_client *clp); 45void nfs_handle_cb_pathdown(struct nfs_client *clp);
46int nfs_client_return_marked_delegations(struct nfs_client *clp); 46int nfs_client_return_marked_delegations(struct nfs_client *clp);
47int nfs_delegations_present(struct nfs_client *clp);
47 48
48void nfs_delegation_mark_reclaim(struct nfs_client *clp); 49void nfs_delegation_mark_reclaim(struct nfs_client *clp);
49void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 50void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e257172d438c..2c3eb33b904d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,11 +33,13 @@
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/kmemleak.h>
37#include <linux/xattr.h>
36 38
37#include "nfs4_fs.h"
38#include "delegation.h" 39#include "delegation.h"
39#include "iostat.h" 40#include "iostat.h"
40#include "internal.h" 41#include "internal.h"
42#include "fscache.h"
41 43
42/* #define NFS_DEBUG_VERBOSE 1 */ 44/* #define NFS_DEBUG_VERBOSE 1 */
43 45
@@ -55,6 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 57 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, int); 58static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 59static loff_t nfs_llseek_dir(struct file *, loff_t, int);
60static void nfs_readdir_clear_array(struct page*);
58 61
59const struct file_operations nfs_dir_operations = { 62const struct file_operations nfs_dir_operations = {
60 .llseek = nfs_llseek_dir, 63 .llseek = nfs_llseek_dir,
@@ -80,6 +83,10 @@ const struct inode_operations nfs_dir_inode_operations = {
80 .setattr = nfs_setattr, 83 .setattr = nfs_setattr,
81}; 84};
82 85
86const struct address_space_operations nfs_dir_aops = {
87 .freepage = nfs_readdir_clear_array,
88};
89
83#ifdef CONFIG_NFS_V3 90#ifdef CONFIG_NFS_V3
84const struct inode_operations nfs3_dir_inode_operations = { 91const struct inode_operations nfs3_dir_inode_operations = {
85 .create = nfs_create, 92 .create = nfs_create,
@@ -104,8 +111,9 @@ const struct inode_operations nfs3_dir_inode_operations = {
104#ifdef CONFIG_NFS_V4 111#ifdef CONFIG_NFS_V4
105 112
106static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); 113static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
114static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
107const struct inode_operations nfs4_dir_inode_operations = { 115const struct inode_operations nfs4_dir_inode_operations = {
108 .create = nfs_create, 116 .create = nfs_open_create,
109 .lookup = nfs_atomic_lookup, 117 .lookup = nfs_atomic_lookup,
110 .link = nfs_link, 118 .link = nfs_link,
111 .unlink = nfs_unlink, 119 .unlink = nfs_unlink,
@@ -117,9 +125,10 @@ const struct inode_operations nfs4_dir_inode_operations = {
117 .permission = nfs_permission, 125 .permission = nfs_permission,
118 .getattr = nfs_getattr, 126 .getattr = nfs_getattr,
119 .setattr = nfs_setattr, 127 .setattr = nfs_setattr,
120 .getxattr = nfs4_getxattr, 128 .getxattr = generic_getxattr,
121 .setxattr = nfs4_setxattr, 129 .setxattr = generic_setxattr,
122 .listxattr = nfs4_listxattr, 130 .listxattr = generic_listxattr,
131 .removexattr = generic_removexattr,
123}; 132};
124 133
125#endif /* CONFIG_NFS_V4 */ 134#endif /* CONFIG_NFS_V4 */
@@ -150,51 +159,209 @@ nfs_opendir(struct inode *inode, struct file *filp)
150 return res; 159 return res;
151} 160}
152 161
153typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int); 162struct nfs_cache_array_entry {
163 u64 cookie;
164 u64 ino;
165 struct qstr string;
166 unsigned char d_type;
167};
168
169struct nfs_cache_array {
170 unsigned int size;
171 int eof_index;
172 u64 last_cookie;
173 struct nfs_cache_array_entry array[0];
174};
175
176typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
154typedef struct { 177typedef struct {
155 struct file *file; 178 struct file *file;
156 struct page *page; 179 struct page *page;
157 unsigned long page_index; 180 unsigned long page_index;
158 __be32 *ptr;
159 u64 *dir_cookie; 181 u64 *dir_cookie;
182 u64 last_cookie;
160 loff_t current_index; 183 loff_t current_index;
161 struct nfs_entry *entry;
162 decode_dirent_t decode; 184 decode_dirent_t decode;
163 int plus; 185
164 unsigned long timestamp; 186 unsigned long timestamp;
165 unsigned long gencount; 187 unsigned long gencount;
166 int timestamp_valid; 188 unsigned int cache_entry_index;
189 unsigned int plus:1;
190 unsigned int eof:1;
167} nfs_readdir_descriptor_t; 191} nfs_readdir_descriptor_t;
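
The rewritten readdir path stores decoded entries in an nfs_cache_array placed at the start of each page-cache page; nfs_readdir_add_to_array() below refuses an entry as soon as the end of array->array[size] would cross the page boundary and returns -ENOSPC. A user-space sketch of the layout arithmetic, assuming a 4096-byte page (the kernel's zero-length array[0] is written as a C99 flexible array member, and all names are local to the sketch):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

struct qstr { unsigned int len; const char *name; };

struct cache_entry {            /* cf. struct nfs_cache_array_entry */
	uint64_t cookie;
	uint64_t ino;
	struct qstr string;
	unsigned char d_type;
};

struct cache_array {            /* cf. struct nfs_cache_array */
	unsigned int size;
	int eof_index;
	uint64_t last_cookie;
	struct cache_entry array[];
};

int main(void)
{
	size_t header = sizeof(struct cache_array);   /* bytes before array[] */
	size_t per_entry = sizeof(struct cache_entry);
	size_t capacity = (PAGE_SIZE - header) / per_entry;

	/* The kernel's -ENOSPC test for entry number 'size' is equivalent to:
	 *   header + (size + 1) * per_entry > PAGE_SIZE */
	printf("header=%zu entry=%zu -> %zu entries per %d-byte page\n",
	       header, per_entry, capacity, PAGE_SIZE);
	return 0;
}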
168 192
169/* Now we cache directories properly, by stuffing the dirent 193/*
170 * data directly in the page cache. 194 * The caller is responsible for calling nfs_readdir_release_array(page)
171 *
172 * Inode invalidation due to refresh etc. takes care of
173 * _everything_, no sloppy entry flushing logic, no extraneous
174 * copying, network direct to page cache, the way it was meant
175 * to be.
176 *
177 * NOTE: Dirent information verification is always done by the
178 * page-in of the RPC reply, nowhere else; this simplifies
179 * things substantially.
180 */ 195 */
181static 196static
182int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) 197struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
198{
199 void *ptr;
200 if (page == NULL)
201 return ERR_PTR(-EIO);
202 ptr = kmap(page);
203 if (ptr == NULL)
204 return ERR_PTR(-ENOMEM);
205 return ptr;
206}
207
208static
209void nfs_readdir_release_array(struct page *page)
210{
211 kunmap(page);
212}
213
214/*
215 * we are freeing strings created by nfs_readdir_add_to_array()
216 */
217static
218void nfs_readdir_clear_array(struct page *page)
219{
220 struct nfs_cache_array *array;
221 int i;
222
223 array = kmap_atomic(page, KM_USER0);
224 for (i = 0; i < array->size; i++)
225 kfree(array->array[i].string.name);
226 kunmap_atomic(array, KM_USER0);
227}
228
229/*
230 * the caller is responsible for freeing qstr.name
231 * when called by nfs_readdir_add_to_array, the strings will be freed in
232 * nfs_readdir_clear_array()
233 */
234static
235int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
236{
237 string->len = len;
238 string->name = kmemdup(name, len, GFP_KERNEL);
239 if (string->name == NULL)
240 return -ENOMEM;
241 /*
242 * Avoid a kmemleak false positive. The pointer to the name is stored
243 * in a page cache page which kmemleak does not scan.
244 */
245 kmemleak_not_leak(string->name);
246 string->hash = full_name_hash(name, len);
247 return 0;
248}
249
250static
251int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
252{
253 struct nfs_cache_array *array = nfs_readdir_get_array(page);
254 struct nfs_cache_array_entry *cache_entry;
255 int ret;
256
257 if (IS_ERR(array))
258 return PTR_ERR(array);
259
260 cache_entry = &array->array[array->size];
261
262 /* Check that this entry lies within the page bounds */
263 ret = -ENOSPC;
264 if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
265 goto out;
266
267 cache_entry->cookie = entry->prev_cookie;
268 cache_entry->ino = entry->ino;
269 cache_entry->d_type = entry->d_type;
270 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
271 if (ret)
272 goto out;
273 array->last_cookie = entry->cookie;
274 array->size++;
275 if (entry->eof != 0)
276 array->eof_index = array->size;
277out:
278 nfs_readdir_release_array(page);
279 return ret;
280}
281
282static
283int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
284{
285 loff_t diff = desc->file->f_pos - desc->current_index;
286 unsigned int index;
287
288 if (diff < 0)
289 goto out_eof;
290 if (diff >= array->size) {
291 if (array->eof_index >= 0)
292 goto out_eof;
293 desc->current_index += array->size;
294 return -EAGAIN;
295 }
296
297 index = (unsigned int)diff;
298 *desc->dir_cookie = array->array[index].cookie;
299 desc->cache_entry_index = index;
300 return 0;
301out_eof:
302 desc->eof = 1;
303 return -EBADCOOKIE;
304}
305
306static
307int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
308{
309 int i;
310 int status = -EAGAIN;
311
312 for (i = 0; i < array->size; i++) {
313 if (array->array[i].cookie == *desc->dir_cookie) {
314 desc->cache_entry_index = i;
315 return 0;
316 }
317 }
318 if (array->eof_index >= 0) {
319 status = -EBADCOOKIE;
320 if (*desc->dir_cookie == array->last_cookie)
321 desc->eof = 1;
322 }
323 return status;
324}
325
326static
327int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
328{
329 struct nfs_cache_array *array;
330 int status;
331
332 array = nfs_readdir_get_array(desc->page);
333 if (IS_ERR(array)) {
334 status = PTR_ERR(array);
335 goto out;
336 }
337
338 if (*desc->dir_cookie == 0)
339 status = nfs_readdir_search_for_pos(array, desc);
340 else
341 status = nfs_readdir_search_for_cookie(array, desc);
342
343 if (status == -EAGAIN) {
344 desc->last_cookie = array->last_cookie;
345 desc->page_index++;
346 }
347 nfs_readdir_release_array(desc->page);
348out:
349 return status;
350}
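
Two different searches can run against a cached page: with *desc->dir_cookie == 0, nfs_readdir_search_for_pos() resolves file->f_pos positionally, accumulating current_index across pages and returning -EAGAIN until the page holding that offset is found; otherwise nfs_readdir_search_for_cookie() scans the page for an exact cookie match. A user-space model of both modes (cookies are bare integers here; EBADCOOKIE is the kernel-internal errno value):

#include <stdio.h>

#define EAGAIN     11
#define EBADCOOKIE 523   /* kernel-internal errno: "Cookie is stale" */

struct page_array { int size; const unsigned long *cookies; };

/* Positional search, cf. nfs_readdir_search_for_pos(). */
static int search_for_pos(const struct page_array *a, long pos,
			  long *current_index, int *index)
{
	long diff = pos - *current_index;

	if (diff < 0)
		return -EBADCOOKIE;         /* walked past the target */
	if (diff >= a->size) {
		*current_index += a->size;  /* target is on a later page */
		return -EAGAIN;
	}
	*index = (int)diff;
	return 0;
}

/* Exact-cookie search, cf. nfs_readdir_search_for_cookie(). */
static int search_for_cookie(const struct page_array *a,
			     unsigned long cookie, int *index)
{
	for (int i = 0; i < a->size; i++) {
		if (a->cookies[i] == cookie) {
			*index = i;
			return 0;
		}
	}
	return -EAGAIN;                     /* keep reading pages */
}

int main(void)
{
	const unsigned long p0[] = { 3, 7, 9 }, p1[] = { 12, 20 };
	const struct page_array pages[] = { { 3, p0 }, { 2, p1 } };
	long current_index = 0;
	int idx;

	for (int pg = 0; pg < 2; pg++) {    /* f_pos 4 -> page 1, slot 1 */
		if (search_for_pos(&pages[pg], 4, &current_index, &idx) == 0) {
			printf("pos 4 -> page %d index %d\n", pg, idx);
			break;
		}
	}
	for (int pg = 0; pg < 2; pg++) {    /* cookie 12 -> page 1, slot 0 */
		if (search_for_cookie(&pages[pg], 12, &idx) == 0) {
			printf("cookie 12 -> page %d index %d\n", pg, idx);
			break;
		}
	}
	return 0;
}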
351
352/* Fill a page with xdr information before transferring to the cache page */
353static
354int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
355 struct nfs_entry *entry, struct file *file, struct inode *inode)
183{ 356{
184 struct file *file = desc->file;
185 struct inode *inode = file->f_path.dentry->d_inode;
186 struct rpc_cred *cred = nfs_file_cred(file); 357 struct rpc_cred *cred = nfs_file_cred(file);
187 unsigned long timestamp, gencount; 358 unsigned long timestamp, gencount;
188 int error; 359 int error;
189 360
190 dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
191 __func__, (long long)desc->entry->cookie,
192 page->index);
193
194 again: 361 again:
195 timestamp = jiffies; 362 timestamp = jiffies;
196 gencount = nfs_inc_attr_generation_counter(); 363 gencount = nfs_inc_attr_generation_counter();
197 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, 364 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
198 NFS_SERVER(inode)->dtsize, desc->plus); 365 NFS_SERVER(inode)->dtsize, desc->plus);
199 if (error < 0) { 366 if (error < 0) {
200 /* We requested READDIRPLUS, but the server doesn't grok it */ 367 /* We requested READDIRPLUS, but the server doesn't grok it */
@@ -208,199 +375,312 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
208 } 375 }
209 desc->timestamp = timestamp; 376 desc->timestamp = timestamp;
210 desc->gencount = gencount; 377 desc->gencount = gencount;
211 desc->timestamp_valid = 1; 378error:
212 SetPageUptodate(page); 379 return error;
213 /* Ensure consistent page alignment of the data.
214 * Note: assumes we have exclusive access to this mapping either
215 * through inode->i_mutex or some other mechanism.
216 */
217 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
218 /* Should never happen */
219 nfs_zap_mapping(inode, inode->i_mapping);
220 }
221 unlock_page(page);
222 return 0;
223 error:
224 unlock_page(page);
225 return -EIO;
226} 380}
227 381
228static inline 382static int xdr_decode(nfs_readdir_descriptor_t *desc,
229int dir_decode(nfs_readdir_descriptor_t *desc) 383 struct nfs_entry *entry, struct xdr_stream *xdr)
230{ 384{
231 __be32 *p = desc->ptr; 385 int error;
232 p = desc->decode(p, desc->entry, desc->plus); 386
233 if (IS_ERR(p)) 387 error = desc->decode(xdr, entry, desc->plus);
234 return PTR_ERR(p); 388 if (error)
235 desc->ptr = p; 389 return error;
236 if (desc->timestamp_valid) { 390 entry->fattr->time_start = desc->timestamp;
237 desc->entry->fattr->time_start = desc->timestamp; 391 entry->fattr->gencount = desc->gencount;
238 desc->entry->fattr->gencount = desc->gencount;
239 } else
240 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
241 return 0; 392 return 0;
242} 393}
243 394
244static inline 395static
245void dir_page_release(nfs_readdir_descriptor_t *desc) 396int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
246{ 397{
247 kunmap(desc->page); 398 if (dentry->d_inode == NULL)
248 page_cache_release(desc->page); 399 goto different;
249 desc->page = NULL; 400 if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
250 desc->ptr = NULL; 401 goto different;
402 return 1;
403different:
404 return 0;
251} 405}
252 406
253/* 407static
254 * Given a pointer to a buffer that has already been filled by a call 408void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
255 * to readdir, find the next entry with cookie '*desc->dir_cookie'.
256 *
257 * If the end of the buffer has been reached, return -EAGAIN, if not,
258 * return the offset within the buffer of the next entry to be
259 * read.
260 */
261static inline
262int find_dirent(nfs_readdir_descriptor_t *desc)
263{ 409{
264 struct nfs_entry *entry = desc->entry; 410 struct qstr filename = {
265 int loop_count = 0, 411 .len = entry->len,
266 status; 412 .name = entry->name,
413 };
414 struct dentry *dentry;
415 struct dentry *alias;
416 struct inode *dir = parent->d_inode;
417 struct inode *inode;
267 418
268 while((status = dir_decode(desc)) == 0) { 419 if (filename.name[0] == '.') {
269 dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", 420 if (filename.len == 1)
270 __func__, (unsigned long long)entry->cookie); 421 return;
271 if (entry->prev_cookie == *desc->dir_cookie) 422 if (filename.len == 2 && filename.name[1] == '.')
272 break; 423 return;
273 if (loop_count++ > 200) { 424 }
274 loop_count = 0; 425 filename.hash = full_name_hash(filename.name, filename.len);
275 schedule(); 426
427 dentry = d_lookup(parent, &filename);
428 if (dentry != NULL) {
429 if (nfs_same_file(dentry, entry)) {
430 nfs_refresh_inode(dentry->d_inode, entry->fattr);
431 goto out;
432 } else {
433 d_drop(dentry);
434 dput(dentry);
276 } 435 }
277 } 436 }
278 return status; 437
438 dentry = d_alloc(parent, &filename);
439 if (dentry == NULL)
440 return;
441
442 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
443 if (IS_ERR(inode))
444 goto out;
445
446 alias = d_materialise_unique(dentry, inode);
447 if (IS_ERR(alias))
448 goto out;
449 else if (alias) {
450 nfs_set_verifier(alias, nfs_save_change_attribute(dir));
451 dput(alias);
452 } else
453 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
454
455out:
456 dput(dentry);
279} 457}
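
nfs_prime_dcache() is where readdirplus data now enters the dcache: "." and ".." are skipped, an existing dentry that still matches the filehandle just has its inode refreshed, a mismatched one is dropped, and otherwise a new dentry is allocated and bound with d_materialise_unique(). The name filter at the top is the classic two-character test, modeled here in user space:

#include <stdio.h>
#include <string.h>

/* cf. the filename check at the top of nfs_prime_dcache() */
static int is_dot_or_dotdot(const char *name, unsigned int len)
{
	if (name[0] != '.')
		return 0;
	if (len == 1)
		return 1;
	return len == 2 && name[1] == '.';
}

int main(void)
{
	const char *names[] = { ".", "..", ".hidden", "file" };

	for (int i = 0; i < 4; i++)
		printf("%-8s skipped=%d\n", names[i],
		       is_dot_or_dotdot(names[i], (unsigned int)strlen(names[i])));
	return 0;
}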
280 458
281/* 459/* Perform conversion from xdr to cache array */
282 * Given a pointer to a buffer that has already been filled by a call 460static
283 * to readdir, find the entry at offset 'desc->file->f_pos'. 461int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
284 * 462 struct page **xdr_pages, struct page *page, unsigned int buflen)
285 * If the end of the buffer has been reached, return -EAGAIN, if not,
286 * return the offset within the buffer of the next entry to be
287 * read.
288 */
289static inline
290int find_dirent_index(nfs_readdir_descriptor_t *desc)
291{ 463{
292 struct nfs_entry *entry = desc->entry; 464 struct xdr_stream stream;
293 int loop_count = 0, 465 struct xdr_buf buf = {
294 status; 466 .pages = xdr_pages,
467 .page_len = buflen,
468 .buflen = buflen,
469 .len = buflen,
470 };
471 struct page *scratch;
472 struct nfs_cache_array *array;
473 unsigned int count = 0;
474 int status;
295 475
296 for(;;) { 476 scratch = alloc_page(GFP_KERNEL);
297 status = dir_decode(desc); 477 if (scratch == NULL)
298 if (status) 478 return -ENOMEM;
299 break;
300 479
301 dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n", 480 xdr_init_decode(&stream, &buf, NULL);
302 (unsigned long long)entry->cookie, desc->current_index); 481 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
303 482
304 if (desc->file->f_pos == desc->current_index) { 483 do {
305 *desc->dir_cookie = entry->cookie; 484 status = xdr_decode(desc, entry, &stream);
485 if (status != 0) {
486 if (status == -EAGAIN)
487 status = 0;
306 break; 488 break;
307 } 489 }
308 desc->current_index++; 490
309 if (loop_count++ > 200) { 491 count++;
310 loop_count = 0; 492
311 schedule(); 493 if (desc->plus != 0)
312 } 494 nfs_prime_dcache(desc->file->f_path.dentry, entry);
495
496 status = nfs_readdir_add_to_array(entry, page);
497 if (status != 0)
498 break;
499 } while (!entry->eof);
500
501 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
502 array = nfs_readdir_get_array(page);
503 if (!IS_ERR(array)) {
504 array->eof_index = array->size;
505 status = 0;
506 nfs_readdir_release_array(page);
507 } else
508 status = PTR_ERR(array);
313 } 509 }
510
511 put_page(scratch);
314 return status; 512 return status;
315} 513}
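
nfs_readdir_page_filler() decodes one entry at a time from the XDR reply pages into the cache array and treats a full page as success rather than failure: on -ENOSPC the loop stops and the remaining entries are fetched by a later READDIR that resumes from the array's last_cookie. A compact model of that loop (the stand-in decoder hands out fixed cookies; note the simplification flagged in the comment, since the kernel keeps entry->prev_cookie per entry and entry->cookie in last_cookie):

#include <stdio.h>

#define ENOSPC   28
#define CAPACITY 4   /* pretend one cache page holds four entries */

struct array { int size; int eof_index; unsigned long last_cookie; };

/* Stand-in for xdr_decode(): emit cookies from src[], flag EOF at the end. */
static int decode(const unsigned long *src, int nsrc, int *pos,
		  unsigned long *cookie, int *eof)
{
	if (*pos >= nsrc)
		return -1;              /* stream exhausted */
	*cookie = src[(*pos)++];
	*eof = (*pos == nsrc);
	return 0;
}

static int add_to_array(struct array *a, unsigned long cookie, int eof)
{
	if (a->size == CAPACITY)
		return -ENOSPC;         /* page full: stop, but not an error */
	a->last_cookie = cookie;        /* simplified; see lead-in note */
	a->size++;
	if (eof)
		a->eof_index = a->size;
	return 0;
}

int main(void)
{
	const unsigned long reply[] = { 3, 7, 9, 12, 20, 31 };
	struct array a = { 0, -1, 0 };
	unsigned long cookie;
	int pos = 0, eof = 0;

	while (decode(reply, 6, &pos, &cookie, &eof) == 0) {
		if (add_to_array(&a, cookie, eof) != 0)
			break;          /* a later READDIR resumes from last_cookie */
		if (eof)
			break;
	}
	printf("cached %d entries, last_cookie=%lu, eof_index=%d\n",
	       a.size, a.last_cookie, a.eof_index);
	return 0;
}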
316 514
515static
516void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
517{
518 unsigned int i;
519 for (i = 0; i < npages; i++)
520 put_page(pages[i]);
521}
522
523static
524void nfs_readdir_free_large_page(void *ptr, struct page **pages,
525 unsigned int npages)
526{
527 nfs_readdir_free_pagearray(pages, npages);
528}
529
317/* 530/*
318 * Find the given page, and call find_dirent() or find_dirent_index in 531 * nfs_readdir_large_page will allocate pages that must be freed with a call
319 * order to try to return the next entry. 532 * to nfs_readdir_free_large_page
320 */ 533 */
321static inline 534static
322int find_dirent_page(nfs_readdir_descriptor_t *desc) 535int nfs_readdir_large_page(struct page **pages, unsigned int npages)
323{ 536{
324 struct inode *inode = desc->file->f_path.dentry->d_inode; 537 unsigned int i;
325 struct page *page;
326 int status;
327 538
328 dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", 539 for (i = 0; i < npages; i++) {
329 __func__, desc->page_index, 540 struct page *page = alloc_page(GFP_KERNEL);
330 (long long) *desc->dir_cookie); 541 if (page == NULL)
542 goto out_freepages;
543 pages[i] = page;
544 }
545 return 0;
331 546
332 /* If we find the page in the page_cache, we cannot be sure 547out_freepages:
333 * how fresh the data is, so we will ignore readdir_plus attributes. 548 nfs_readdir_free_pagearray(pages, i);
334 */ 549 return -ENOMEM;
335 desc->timestamp_valid = 0; 550}
336 page = read_cache_page(inode->i_mapping, desc->page_index, 551
337 (filler_t *)nfs_readdir_filler, desc); 552static
338 if (IS_ERR(page)) { 553int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
339 status = PTR_ERR(page); 554{
555 struct page *pages[NFS_MAX_READDIR_PAGES];
556 void *pages_ptr = NULL;
557 struct nfs_entry entry;
558 struct file *file = desc->file;
559 struct nfs_cache_array *array;
560 int status = -ENOMEM;
561 unsigned int array_size = ARRAY_SIZE(pages);
562
563 entry.prev_cookie = 0;
564 entry.cookie = desc->last_cookie;
565 entry.eof = 0;
566 entry.fh = nfs_alloc_fhandle();
567 entry.fattr = nfs_alloc_fattr();
568 entry.server = NFS_SERVER(inode);
569 if (entry.fh == NULL || entry.fattr == NULL)
570 goto out;
571
572 array = nfs_readdir_get_array(page);
573 if (IS_ERR(array)) {
574 status = PTR_ERR(array);
340 goto out; 575 goto out;
341 } 576 }
577 memset(array, 0, sizeof(struct nfs_cache_array));
578 array->eof_index = -1;
342 579
343 /* NOTE: Someone else may have changed the READDIRPLUS flag */ 580 status = nfs_readdir_large_page(pages, array_size);
344 desc->page = page;
345 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
346 if (*desc->dir_cookie != 0)
347 status = find_dirent(desc);
348 else
349 status = find_dirent_index(desc);
350 if (status < 0) 581 if (status < 0)
351 dir_page_release(desc); 582 goto out_release_array;
352 out: 583 do {
353 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); 584 unsigned int pglen;
585 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
586
587 if (status < 0)
588 break;
589 pglen = status;
590 status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
591 if (status < 0) {
592 if (status == -ENOSPC)
593 status = 0;
594 break;
595 }
596 } while (array->eof_index < 0);
597
598 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
599out_release_array:
600 nfs_readdir_release_array(page);
601out:
602 nfs_free_fattr(entry.fattr);
603 nfs_free_fhandle(entry.fh);
354 return status; 604 return status;
355} 605}
356 606
357/* 607/*
358 * Recurse through the page cache pages, and return a 608 * Now we cache directories properly, by converting xdr information
359 * filled nfs_entry structure of the next directory entry if possible. 609 * to an array that can be used for lookups later. This results in
360 * 610 * fewer cache pages, since we can store more information on each page.
361 * The target for the search is '*desc->dir_cookie' if non-0, 611 * We only need to convert from xdr once, so future lookups are much simpler.
362 * 'desc->file->f_pos' otherwise
363 */ 612 */
364static inline 613static
365int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 614int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
366{ 615{
367 int loop_count = 0; 616 struct inode *inode = desc->file->f_path.dentry->d_inode;
368 int res; 617 int ret;
369 618
370 /* Always search-by-index from the beginning of the cache */ 619 ret = nfs_readdir_xdr_to_array(desc, page, inode);
371 if (*desc->dir_cookie == 0) { 620 if (ret < 0)
372 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n", 621 goto error;
373 (long long)desc->file->f_pos); 622 SetPageUptodate(page);
374 desc->page_index = 0;
375 desc->entry->cookie = desc->entry->prev_cookie = 0;
376 desc->entry->eof = 0;
377 desc->current_index = 0;
378 } else
379 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
380 (unsigned long long)*desc->dir_cookie);
381 623
382 for (;;) { 624 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
383 res = find_dirent_page(desc); 625 /* Should never happen */
384 if (res != -EAGAIN) 626 nfs_zap_mapping(inode, inode->i_mapping);
385 break;
386 /* Align to beginning of next page */
387 desc->page_index ++;
388 if (loop_count++ > 200) {
389 loop_count = 0;
390 schedule();
391 }
392 } 627 }
628 unlock_page(page);
629 return 0;
630 error:
631 unlock_page(page);
632 return ret;
633}
393 634
394 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res); 635static
395 return res; 636void cache_page_release(nfs_readdir_descriptor_t *desc)
637{
638 if (!desc->page->mapping)
639 nfs_readdir_clear_array(desc->page);
640 page_cache_release(desc->page);
641 desc->page = NULL;
396} 642}
397 643
398static inline unsigned int dt_type(struct inode *inode) 644static
645struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
646{
647 return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
648 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
649}
650
651/*
652 * Returns 0 if desc->dir_cookie was found on page desc->page_index
653 */
654static
655int find_cache_page(nfs_readdir_descriptor_t *desc)
399{ 656{
400 return (inode->i_mode >> 12) & 15; 657 int res;
658
659 desc->page = get_cache_page(desc);
660 if (IS_ERR(desc->page))
661 return PTR_ERR(desc->page);
662
663 res = nfs_readdir_search_array(desc);
664 if (res != 0)
665 cache_page_release(desc);
666 return res;
401} 667}
402 668
403static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc); 669/* Search for desc->dir_cookie from the beginning of the page cache */
670static inline
671int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
672{
673 int res;
674
675 if (desc->page_index == 0) {
676 desc->current_index = 0;
677 desc->last_cookie = 0;
678 }
679 do {
680 res = find_cache_page(desc);
681 } while (res == -EAGAIN);
682 return res;
683}
404 684
405/* 685/*
406 * Once we've found the start of the dirent within a page: fill 'er up... 686 * Once we've found the start of the dirent within a page: fill 'er up...
@@ -410,51 +690,38 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
410 filldir_t filldir) 690 filldir_t filldir)
411{ 691{
412 struct file *file = desc->file; 692 struct file *file = desc->file;
413 struct nfs_entry *entry = desc->entry; 693 int i = 0;
414 struct dentry *dentry = NULL; 694 int res = 0;
415 u64 fileid; 695 struct nfs_cache_array *array = NULL;
416 int loop_count = 0,
417 res;
418
419 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
420 (unsigned long long)entry->cookie);
421
422 for(;;) {
423 unsigned d_type = DT_UNKNOWN;
424 /* Note: entry->prev_cookie contains the cookie for
425 * retrieving the current dirent on the server */
426 fileid = entry->ino;
427
428 /* Get a dentry if we have one */
429 if (dentry != NULL)
430 dput(dentry);
431 dentry = nfs_readdir_lookup(desc);
432 696
433 /* Use readdirplus info */ 697 array = nfs_readdir_get_array(desc->page);
434 if (dentry != NULL && dentry->d_inode != NULL) { 698 if (IS_ERR(array)) {
435 d_type = dt_type(dentry->d_inode); 699 res = PTR_ERR(array);
436 fileid = NFS_FILEID(dentry->d_inode); 700 goto out;
437 } 701 }
438 702
439 res = filldir(dirent, entry->name, entry->len, 703 for (i = desc->cache_entry_index; i < array->size; i++) {
440 file->f_pos, nfs_compat_user_ino64(fileid), 704 struct nfs_cache_array_entry *ent;
441 d_type); 705
442 if (res < 0) 706 ent = &array->array[i];
443 break; 707 if (filldir(dirent, ent->string.name, ent->string.len,
444 file->f_pos++; 708 file->f_pos, nfs_compat_user_ino64(ent->ino),
445 *desc->dir_cookie = entry->cookie; 709 ent->d_type) < 0) {
446 if (dir_decode(desc) != 0) { 710 desc->eof = 1;
447 desc->page_index ++;
448 break; 711 break;
449 } 712 }
450 if (loop_count++ > 200) { 713 file->f_pos++;
451 loop_count = 0; 714 if (i < (array->size-1))
452 schedule(); 715 *desc->dir_cookie = array->array[i+1].cookie;
453 } 716 else
717 *desc->dir_cookie = array->last_cookie;
454 } 718 }
455 dir_page_release(desc); 719 if (array->eof_index >= 0)
456 if (dentry != NULL) 720 desc->eof = 1;
457 dput(dentry); 721
722 nfs_readdir_release_array(desc->page);
723out:
724 cache_page_release(desc);
458 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 725 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
459 (unsigned long long)*desc->dir_cookie, res); 726 (unsigned long long)*desc->dir_cookie, res);
460 return res; 727 return res;
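
After each entry is handed to filldir(), nfs_do_filldir() advances f_pos by one and points *desc->dir_cookie at the cookie of the next array entry, or at the page's last_cookie when the emitted entry was the final one, so a later readdir() resumes exactly where the user buffer filled up. A worked example of that bookkeeping (local names only):

#include <stdio.h>

struct ent { unsigned long cookie; const char *name; };

int main(void)
{
	/* one cached page as built by the filler */
	const struct ent array[] = { { 3, "a" }, { 7, "b" }, { 9, "c" } };
	const unsigned long last_cookie = 12;  /* where the next page starts */
	const int size = 3;
	long f_pos = 0;
	unsigned long dir_cookie = 0;

	for (int i = 0; i < size; i++) {
		/* filldir() would copy array[i] to the user buffer here */
		f_pos++;
		if (i < size - 1)
			dir_cookie = array[i + 1].cookie;  /* resume mid-page */
		else
			dir_cookie = last_cookie;          /* resume on next page */
		printf("emitted %s: f_pos=%ld dir_cookie=%lu\n",
		       array[i].name, f_pos, dir_cookie);
	}
	return 0;
}
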
@@ -476,12 +743,9 @@ static inline
476int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 743int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
477 filldir_t filldir) 744 filldir_t filldir)
478{ 745{
479 struct file *file = desc->file;
480 struct inode *inode = file->f_path.dentry->d_inode;
481 struct rpc_cred *cred = nfs_file_cred(file);
482 struct page *page = NULL; 746 struct page *page = NULL;
483 int status; 747 int status;
484 unsigned long timestamp, gencount; 748 struct inode *inode = desc->file->f_path.dentry->d_inode;
485 749
486 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 750 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
487 (unsigned long long)*desc->dir_cookie); 751 (unsigned long long)*desc->dir_cookie);
@@ -491,38 +755,23 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
491 status = -ENOMEM; 755 status = -ENOMEM;
492 goto out; 756 goto out;
493 } 757 }
494 timestamp = jiffies; 758
495 gencount = nfs_inc_attr_generation_counter(); 759 desc->page_index = 0;
496 status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, 760 desc->last_cookie = *desc->dir_cookie;
497 *desc->dir_cookie, page,
498 NFS_SERVER(inode)->dtsize,
499 desc->plus);
500 desc->page = page; 761 desc->page = page;
501 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 762
502 if (status >= 0) { 763 status = nfs_readdir_xdr_to_array(desc, page, inode);
503 desc->timestamp = timestamp;
504 desc->gencount = gencount;
505 desc->timestamp_valid = 1;
506 if ((status = dir_decode(desc)) == 0)
507 desc->entry->prev_cookie = *desc->dir_cookie;
508 } else
509 status = -EIO;
510 if (status < 0) 764 if (status < 0)
511 goto out_release; 765 goto out_release;
512 766
513 status = nfs_do_filldir(desc, dirent, filldir); 767 status = nfs_do_filldir(desc, dirent, filldir);
514 768
515 /* Reset read descriptor so it searches the page cache from
516 * the start upon the next call to readdir_search_pagecache() */
517 desc->page_index = 0;
518 desc->entry->cookie = desc->entry->prev_cookie = 0;
519 desc->entry->eof = 0;
520 out: 769 out:
521 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 770 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
522 __func__, status); 771 __func__, status);
523 return status; 772 return status;
524 out_release: 773 out_release:
525 dir_page_release(desc); 774 cache_page_release(desc);
526 goto out; 775 goto out;
527} 776}
528 777
@@ -536,8 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
536 struct inode *inode = dentry->d_inode; 785 struct inode *inode = dentry->d_inode;
537 nfs_readdir_descriptor_t my_desc, 786 nfs_readdir_descriptor_t my_desc,
538 *desc = &my_desc; 787 *desc = &my_desc;
539 struct nfs_entry my_entry; 788 int res;
540 int res = -ENOMEM;
541 789
542 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 790 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
543 dentry->d_parent->d_name.name, dentry->d_name.name, 791 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -557,57 +805,44 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
557 desc->decode = NFS_PROTO(inode)->decode_dirent; 805 desc->decode = NFS_PROTO(inode)->decode_dirent;
558 desc->plus = NFS_USE_READDIRPLUS(inode); 806 desc->plus = NFS_USE_READDIRPLUS(inode);
559 807
560 my_entry.cookie = my_entry.prev_cookie = 0;
561 my_entry.eof = 0;
562 my_entry.fh = nfs_alloc_fhandle();
563 my_entry.fattr = nfs_alloc_fattr();
564 if (my_entry.fh == NULL || my_entry.fattr == NULL)
565 goto out_alloc_failed;
566
567 desc->entry = &my_entry;
568
569 nfs_block_sillyrename(dentry); 808 nfs_block_sillyrename(dentry);
570 res = nfs_revalidate_mapping(inode, filp->f_mapping); 809 res = nfs_revalidate_mapping(inode, filp->f_mapping);
571 if (res < 0) 810 if (res < 0)
572 goto out; 811 goto out;
573 812
574 while(!desc->entry->eof) { 813 do {
575 res = readdir_search_pagecache(desc); 814 res = readdir_search_pagecache(desc);
576 815
577 if (res == -EBADCOOKIE) { 816 if (res == -EBADCOOKIE) {
817 res = 0;
578 /* This means either end of directory */ 818 /* This means either end of directory */
579 if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) { 819 if (*desc->dir_cookie && desc->eof == 0) {
580 /* Or that the server has 'lost' a cookie */ 820 /* Or that the server has 'lost' a cookie */
581 res = uncached_readdir(desc, dirent, filldir); 821 res = uncached_readdir(desc, dirent, filldir);
582 if (res >= 0) 822 if (res == 0)
583 continue; 823 continue;
584 } 824 }
585 res = 0;
586 break; 825 break;
587 } 826 }
588 if (res == -ETOOSMALL && desc->plus) { 827 if (res == -ETOOSMALL && desc->plus) {
589 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 828 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
590 nfs_zap_caches(inode); 829 nfs_zap_caches(inode);
830 desc->page_index = 0;
591 desc->plus = 0; 831 desc->plus = 0;
592 desc->entry->eof = 0; 832 desc->eof = 0;
593 continue; 833 continue;
594 } 834 }
595 if (res < 0) 835 if (res < 0)
596 break; 836 break;
597 837
598 res = nfs_do_filldir(desc, dirent, filldir); 838 res = nfs_do_filldir(desc, dirent, filldir);
599 if (res < 0) { 839 if (res < 0)
600 res = 0;
601 break; 840 break;
602 } 841 } while (!desc->eof);
603 }
604out: 842out:
605 nfs_unblock_sillyrename(dentry); 843 nfs_unblock_sillyrename(dentry);
606 if (res > 0) 844 if (res > 0)
607 res = 0; 845 res = 0;
608out_alloc_failed:
609 nfs_free_fattr(my_entry.fattr);
610 nfs_free_fhandle(my_entry.fh);
611 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", 846 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
612 dentry->d_parent->d_name.name, dentry->d_name.name, 847 dentry->d_parent->d_name.name, dentry->d_name.name,
613 res); 848 res);
@@ -703,7 +938,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
703 * component of the path. 938 * component of the path.
704 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. 939 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT.
705 */ 940 */
706static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) 941static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd,
942 unsigned int mask)
707{ 943{
708 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) 944 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))
709 return 0; 945 return 0;
@@ -734,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
734{ 970{
735 struct nfs_server *server = NFS_SERVER(inode); 971 struct nfs_server *server = NFS_SERVER(inode);
736 972
737 if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) 973 if (IS_AUTOMOUNT(inode))
738 return 0; 974 return 0;
739 if (nd != NULL) { 975 if (nd != NULL) {
740 /* VFS wants an on-the-wire revalidation */ 976 /* VFS wants an on-the-wire revalidation */
@@ -783,7 +1019,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
783 * If the parent directory is seen to have changed, we throw out the 1019 * If the parent directory is seen to have changed, we throw out the
784 * cached dentry and do a new lookup. 1020 * cached dentry and do a new lookup.
785 */ 1021 */
786static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) 1022static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
787{ 1023{
788 struct inode *dir; 1024 struct inode *dir;
789 struct inode *inode; 1025 struct inode *inode;
@@ -792,6 +1028,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
792 struct nfs_fattr *fattr = NULL; 1028 struct nfs_fattr *fattr = NULL;
793 int error; 1029 int error;
794 1030
1031 if (nd->flags & LOOKUP_RCU)
1032 return -ECHILD;
1033
795 parent = dget_parent(dentry); 1034 parent = dget_parent(dentry);
796 dir = parent->d_inode; 1035 dir = parent->d_inode;
797 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 1036 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
@@ -882,7 +1121,7 @@ out_error:
882/* 1121/*
883 * This is called from dput() when d_count is going to 0. 1122 * This is called from dput() when d_count is going to 0.
884 */ 1123 */
885static int nfs_dentry_delete(struct dentry *dentry) 1124static int nfs_dentry_delete(const struct dentry *dentry)
886{ 1125{
887 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", 1126 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
888 dentry->d_parent->d_name.name, dentry->d_name.name, 1127 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -934,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = {
934 .d_revalidate = nfs_lookup_revalidate, 1173 .d_revalidate = nfs_lookup_revalidate,
935 .d_delete = nfs_dentry_delete, 1174 .d_delete = nfs_dentry_delete,
936 .d_iput = nfs_dentry_iput, 1175 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount,
937}; 1177};
938 1178
939static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -953,8 +1193,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
953 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1193 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
954 goto out; 1194 goto out;
955 1195
956 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
957
958 /* 1196 /*
959 * If we're doing an exclusive create, optimize away the lookup 1197 * If we're doing an exclusive create, optimize away the lookup
960 * but don't hash the dentry. 1198 * but don't hash the dentry.
@@ -982,7 +1220,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
982 goto out_unblock_sillyrename; 1220 goto out_unblock_sillyrename;
983 } 1221 }
984 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1222 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
985 res = (struct dentry *)inode; 1223 res = ERR_CAST(inode);
986 if (IS_ERR(res)) 1224 if (IS_ERR(res))
987 goto out_unblock_sillyrename; 1225 goto out_unblock_sillyrename;
988 1226
@@ -1009,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1009 .d_revalidate = nfs_open_revalidate, 1247 .d_revalidate = nfs_open_revalidate,
1010 .d_delete = nfs_dentry_delete, 1248 .d_delete = nfs_dentry_delete,
1011 .d_iput = nfs_dentry_iput, 1249 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount,
1012}; 1251};
1013 1252
1014/* 1253/*
@@ -1029,10 +1268,63 @@ static int is_atomic_open(struct nameidata *nd)
1029 return 1; 1268 return 1;
1030} 1269}
1031 1270
1271static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
1272{
1273 struct path path = {
1274 .mnt = nd->path.mnt,
1275 .dentry = dentry,
1276 };
1277 struct nfs_open_context *ctx;
1278 struct rpc_cred *cred;
1279 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
1280
1281 cred = rpc_lookup_cred();
1282 if (IS_ERR(cred))
1283 return ERR_CAST(cred);
1284 ctx = alloc_nfs_open_context(&path, cred, fmode);
1285 put_rpccred(cred);
1286 if (ctx == NULL)
1287 return ERR_PTR(-ENOMEM);
1288 return ctx;
1289}
1290
1291static int do_open(struct inode *inode, struct file *filp)
1292{
1293 nfs_fscache_set_inode_cookie(inode, filp);
1294 return 0;
1295}
1296
1297static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
1298{
1299 struct file *filp;
1300 int ret = 0;
1301
1302 /* If the open_intent is for execute, we have an extra check to make */
1303 if (ctx->mode & FMODE_EXEC) {
1304 ret = nfs_may_open(ctx->path.dentry->d_inode,
1305 ctx->cred,
1306 nd->intent.open.flags);
1307 if (ret < 0)
1308 goto out;
1309 }
1310 filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
1311 if (IS_ERR(filp))
1312 ret = PTR_ERR(filp);
1313 else
1314 nfs_file_set_open_context(filp, ctx);
1315out:
1316 put_nfs_open_context(ctx);
1317 return ret;
1318}
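
Everything in the NFSv4 open-intent path now flows through an nfs_open_context: it bundles the path, an RPC credential, and the fmode bits taken from the intent, and nfs_intent_set_file() always drops the lookup's reference once the new file has taken its own. A user-space model of that refcount hand-off (a bare counter stands in for the kref and credential machinery):

#include <stdio.h>
#include <stdlib.h>

struct open_context { int refcount; int fmode; };  /* stand-in for nfs_open_context */

static struct open_context *alloc_context(int fmode)
{
	struct open_context *ctx = malloc(sizeof(*ctx));

	if (ctx != NULL) {
		ctx->refcount = 1;      /* the lookup's reference */
		ctx->fmode = fmode;
	}
	return ctx;
}

static struct open_context *get_context(struct open_context *ctx)
{
	ctx->refcount++;
	return ctx;
}

static void put_context(struct open_context *ctx)
{
	if (--ctx->refcount == 0)
		free(ctx);
}

/* cf. nfs_intent_set_file(): the file takes its own reference on success,
 * and the lookup's reference is dropped on every path. */
static int intent_set_file(struct open_context *ctx, int may_open_ok,
			   struct open_context **filp_ctx)
{
	int ret = 0;

	if (may_open_ok)
		*filp_ctx = get_context(ctx);
	else
		ret = -13;              /* -EACCES from the FMODE_EXEC check */
	put_context(ctx);
	return ret;
}

int main(void)
{
	struct open_context *file_ctx = NULL;
	struct open_context *ctx = alloc_context(1 /* FMODE_READ */);

	if (ctx == NULL)
		return 1;
	if (intent_set_file(ctx, 1, &file_ctx) == 0)
		printf("file owns the context, fmode=%d\n", file_ctx->fmode);
	if (file_ctx != NULL)
		put_context(file_ctx);  /* dropped again at close time */
	return 0;
}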
1319
1032static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 1320static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1033{ 1321{
1322 struct nfs_open_context *ctx;
1323 struct iattr attr;
1034 struct dentry *res = NULL; 1324 struct dentry *res = NULL;
1035 int error; 1325 struct inode *inode;
1326 int open_flags;
1327 int err;
1036 1328
1037 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", 1329 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
1038 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1330 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1045,7 +1337,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1045 res = ERR_PTR(-ENAMETOOLONG); 1337 res = ERR_PTR(-ENAMETOOLONG);
1046 goto out; 1338 goto out;
1047 } 1339 }
1048 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1049 1340
1050 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash 1341 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
1051 * the dentry. */ 1342 * the dentry. */
@@ -1054,29 +1345,61 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1054 goto out; 1345 goto out;
1055 } 1346 }
1056 1347
1348 ctx = nameidata_to_nfs_open_context(dentry, nd);
1349 res = ERR_CAST(ctx);
1350 if (IS_ERR(ctx))
1351 goto out;
1352
1353 open_flags = nd->intent.open.flags;
1354 if (nd->flags & LOOKUP_CREATE) {
1355 attr.ia_mode = nd->intent.open.create_mode;
1356 attr.ia_valid = ATTR_MODE;
1357 attr.ia_mode &= ~current_umask();
1358 } else {
1359 open_flags &= ~(O_EXCL | O_CREAT);
1360 attr.ia_valid = 0;
1361 }
1362
1057 /* Open the file on the server */ 1363 /* Open the file on the server */
1058 res = nfs4_atomic_open(dir, dentry, nd); 1364 nfs_block_sillyrename(dentry->d_parent);
1059 if (IS_ERR(res)) { 1365 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1060 error = PTR_ERR(res); 1366 if (IS_ERR(inode)) {
1061 switch (error) { 1367 nfs_unblock_sillyrename(dentry->d_parent);
1368 put_nfs_open_context(ctx);
1369 switch (PTR_ERR(inode)) {
1062 /* Make a negative dentry */ 1370 /* Make a negative dentry */
1063 case -ENOENT: 1371 case -ENOENT:
1372 d_add(dentry, NULL);
1064 res = NULL; 1373 res = NULL;
1065 goto out; 1374 goto out;
1066 /* This turned out not to be a regular file */ 1375 /* This turned out not to be a regular file */
1067 case -EISDIR:
1068 case -ENOTDIR: 1376 case -ENOTDIR:
1069 goto no_open; 1377 goto no_open;
1070 case -ELOOP: 1378 case -ELOOP:
1071 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1379 if (!(nd->intent.open.flags & O_NOFOLLOW))
1072 goto no_open; 1380 goto no_open;
1381 /* case -EISDIR: */
1073 /* case -EINVAL: */ 1382 /* case -EINVAL: */
1074 default: 1383 default:
1384 res = ERR_CAST(inode);
1075 goto out; 1385 goto out;
1076 } 1386 }
1077 } else if (res != NULL) 1387 }
1388 res = d_add_unique(dentry, inode);
1389 nfs_unblock_sillyrename(dentry->d_parent);
1390 if (res != NULL) {
1391 dput(ctx->path.dentry);
1392 ctx->path.dentry = dget(res);
1078 dentry = res; 1393 dentry = res;
1394 }
1395 err = nfs_intent_set_file(nd, ctx);
1396 if (err < 0) {
1397 if (res != NULL)
1398 dput(res);
1399 return ERR_PTR(err);
1400 }
1079out: 1401out:
1402 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1080 return res; 1403 return res;
1081no_open: 1404no_open:
1082 return nfs_lookup(dir, dentry, nd); 1405 return nfs_lookup(dir, dentry, nd);
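
The switch on PTR_ERR(inode) above encodes the fallback policy of the new atomic open: -ENOENT instantiates a negative dentry, -ENOTDIR (and -ELOOP without O_NOFOLLOW) abandons the atomic open and retries as a plain nfs_lookup(), and everything else, now including -EISDIR, is passed back to the VFS. A small model of the dispatch (the O_NOFOLLOW value is illustrative):

#include <stdio.h>
#include <errno.h>

#define O_NOFOLLOW 0400000   /* Linux value; illustrative only */

enum action { NEGATIVE_DENTRY, FALL_BACK_TO_LOOKUP, RETURN_ERROR };

/* cf. the switch on PTR_ERR(inode) in nfs_atomic_lookup() */
static enum action dispatch(int err, int open_flags)
{
	switch (err) {
	case -ENOENT:
		return NEGATIVE_DENTRY;        /* d_add(dentry, NULL) */
	case -ENOTDIR:
		return FALL_BACK_TO_LOOKUP;    /* goto no_open */
	case -ELOOP:
		if (!(open_flags & O_NOFOLLOW))
			return FALL_BACK_TO_LOOKUP;
		/* fall through */
	default:                               /* -EISDIR, -EINVAL, ... */
		return RETURN_ERROR;
	}
}

int main(void)
{
	printf("%d %d %d\n",
	       dispatch(-ENOENT, 0),          /* 0: negative dentry */
	       dispatch(-ENOTDIR, 0),         /* 1: fall back to nfs_lookup() */
	       dispatch(-EISDIR, 0));         /* 2: error back to the VFS */
	return 0;
}
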
@@ -1085,14 +1408,21 @@ no_open:
1085static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) 1408static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1086{ 1409{
1087 struct dentry *parent = NULL; 1410 struct dentry *parent = NULL;
1088 struct inode *inode = dentry->d_inode; 1411 struct inode *inode;
1089 struct inode *dir; 1412 struct inode *dir;
1413 struct nfs_open_context *ctx;
1090 int openflags, ret = 0; 1414 int openflags, ret = 0;
1091 1415
1416 if (nd->flags & LOOKUP_RCU)
1417 return -ECHILD;
1418
1419 inode = dentry->d_inode;
1092 if (!is_atomic_open(nd) || d_mountpoint(dentry)) 1420 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1093 goto no_open; 1421 goto no_open;
1422
1094 parent = dget_parent(dentry); 1423 parent = dget_parent(dentry);
1095 dir = parent->d_inode; 1424 dir = parent->d_inode;
1425
1096 /* We can't create new files in nfs_open_revalidate(), so we 1426 /* We can't create new files in nfs_open_revalidate(), so we
1097 * optimize away revalidation of negative dentries. 1427 * optimize away revalidation of negative dentries.
1098 */ 1428 */
@@ -1112,99 +1442,96 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1112 /* We can't create new files, or truncate existing ones here */ 1442 /* We can't create new files, or truncate existing ones here */
1113 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); 1443 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
1114 1444
1445 ctx = nameidata_to_nfs_open_context(dentry, nd);
1446 ret = PTR_ERR(ctx);
1447 if (IS_ERR(ctx))
1448 goto out;
1115 /* 1449 /*
1116 * Note: we're not holding inode->i_mutex and so may be racing with 1450 * Note: we're not holding inode->i_mutex and so may be racing with
1117 * operations that change the directory. We therefore save the 1451 * operations that change the directory. We therefore save the
1118 * change attribute *before* we do the RPC call. 1452 * change attribute *before* we do the RPC call.
1119 */ 1453 */
1120 ret = nfs4_open_revalidate(dir, dentry, openflags, nd); 1454 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
1455 if (IS_ERR(inode)) {
1456 ret = PTR_ERR(inode);
1457 switch (ret) {
1458 case -EPERM:
1459 case -EACCES:
1460 case -EDQUOT:
1461 case -ENOSPC:
1462 case -EROFS:
1463 goto out_put_ctx;
1464 default:
1465 goto out_drop;
1466 }
1467 }
1468 iput(inode);
1469 if (inode != dentry->d_inode)
1470 goto out_drop;
1471
1472 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1473 ret = nfs_intent_set_file(nd, ctx);
1474 if (ret >= 0)
1475 ret = 1;
1121out: 1476out:
1122 dput(parent); 1477 dput(parent);
1123 if (!ret)
1124 d_drop(dentry);
1125 return ret; 1478 return ret;
1479out_drop:
1480 d_drop(dentry);
1481 ret = 0;
1482out_put_ctx:
1483 put_nfs_open_context(ctx);
1484 goto out;
1485
1126no_open_dput: 1486no_open_dput:
1127 dput(parent); 1487 dput(parent);
1128no_open: 1488no_open:
1129 return nfs_lookup_revalidate(dentry, nd); 1489 return nfs_lookup_revalidate(dentry, nd);
1130} 1490}
1131#endif /* CONFIG_NFS_V4 */
1132 1491
1133static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) 1492static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
1493 struct nameidata *nd)
1134{ 1494{
1135 struct dentry *parent = desc->file->f_path.dentry; 1495 struct nfs_open_context *ctx = NULL;
1136 struct inode *dir = parent->d_inode; 1496 struct iattr attr;
1137 struct nfs_entry *entry = desc->entry; 1497 int error;
1138 struct dentry *dentry, *alias; 1498 int open_flags = 0;
1139 struct qstr name = {
1140 .name = entry->name,
1141 .len = entry->len,
1142 };
1143 struct inode *inode;
1144 unsigned long verf = nfs_save_change_attribute(dir);
1145 1499
1146 switch (name.len) { 1500 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1147 case 2: 1501 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1148 if (name.name[0] == '.' && name.name[1] == '.')
1149 return dget_parent(parent);
1150 break;
1151 case 1:
1152 if (name.name[0] == '.')
1153 return dget(parent);
1154 }
1155 1502
1156 spin_lock(&dir->i_lock); 1503 attr.ia_mode = mode;
1157 if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) { 1504 attr.ia_valid = ATTR_MODE;
1158 spin_unlock(&dir->i_lock);
1159 return NULL;
1160 }
1161 spin_unlock(&dir->i_lock);
1162 1505
1163 name.hash = full_name_hash(name.name, name.len); 1506 if ((nd->flags & LOOKUP_CREATE) != 0) {
1164 dentry = d_lookup(parent, &name); 1507 open_flags = nd->intent.open.flags;
1165 if (dentry != NULL) {
1166 /* Is this a positive dentry that matches the readdir info? */
1167 if (dentry->d_inode != NULL &&
1168 (NFS_FILEID(dentry->d_inode) == entry->ino ||
1169 d_mountpoint(dentry))) {
1170 if (!desc->plus || entry->fh->size == 0)
1171 return dentry;
1172 if (nfs_compare_fh(NFS_FH(dentry->d_inode),
1173 entry->fh) == 0)
1174 goto out_renew;
1175 }
1176 /* No, so d_drop to allow one to be created */
1177 d_drop(dentry);
1178 dput(dentry);
1179 }
1180 if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
1181 return NULL;
1182 if (name.len > NFS_SERVER(dir)->namelen)
1183 return NULL;
1184 /* Note: caller is already holding the dir->i_mutex! */
1185 dentry = d_alloc(parent, &name);
1186 if (dentry == NULL)
1187 return NULL;
1188 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1189 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
1190 if (IS_ERR(inode)) {
1191 dput(dentry);
1192 return NULL;
1193 }
1194 1508
1195 alias = d_materialise_unique(dentry, inode); 1509 ctx = nameidata_to_nfs_open_context(dentry, nd);
1196 if (alias != NULL) { 1510 error = PTR_ERR(ctx);
1197 dput(dentry); 1511 if (IS_ERR(ctx))
1198 if (IS_ERR(alias)) 1512 goto out_err_drop;
1199 return NULL;
1200 dentry = alias;
1201 } 1513 }
1202 1514
1203out_renew: 1515 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
1204 nfs_set_verifier(dentry, verf); 1516 if (error != 0)
1205 return dentry; 1517 goto out_put_ctx;
1518 if (ctx != NULL) {
1519 error = nfs_intent_set_file(nd, ctx);
1520 if (error < 0)
1521 goto out_err;
1522 }
1523 return 0;
1524out_put_ctx:
1525 if (ctx != NULL)
1526 put_nfs_open_context(ctx);
1527out_err_drop:
1528 d_drop(dentry);
1529out_err:
1530 return error;
1206} 1531}
1207 1532
1533#endif /* CONFIG_NFSV4 */
1534
1208/* 1535/*
1209 * Code common to create, mkdir, and mknod. 1536 * Code common to create, mkdir, and mknod.
1210 */ 1537 */
@@ -1269,7 +1596,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1269 if ((nd->flags & LOOKUP_CREATE) != 0) 1596 if ((nd->flags & LOOKUP_CREATE) != 0)
1270 open_flags = nd->intent.open.flags; 1597 open_flags = nd->intent.open.flags;
1271 1598
1272 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); 1599 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
1273 if (error != 0) 1600 if (error != 0)
1274 goto out_err; 1601 goto out_err;
1275 return 0; 1602 return 0;
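Both the revalidate and create paths above lean on the kernel's error-pointer convention: nameidata_to_nfs_open_context() returns either a valid context or an errno encoded in the pointer itself. A minimal self-contained sketch of that convention, using a hypothetical struct foo:

#include <linux/err.h>
#include <linux/slab.h>

struct foo { int dummy; };

static struct foo *foo_alloc(void)
{
	struct foo *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (p == NULL)
		return ERR_PTR(-ENOMEM);	/* encode errno in the pointer */
	return p;
}

static int foo_use(void)
{
	struct foo *p = foo_alloc();

	if (IS_ERR(p))
		return PTR_ERR(p);		/* recover the -ENOMEM */
	kfree(p);
	return 0;
}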
@@ -1351,76 +1678,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1351 return error; 1678 return error;
1352} 1679}
1353 1680
1354static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
1355{
1356 static unsigned int sillycounter;
1357 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
1358 const int countersize = sizeof(sillycounter)*2;
1359 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
1360 char silly[slen+1];
1361 struct qstr qsilly;
1362 struct dentry *sdentry;
1363 int error = -EIO;
1364
1365 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
1366 dentry->d_parent->d_name.name, dentry->d_name.name,
1367 atomic_read(&dentry->d_count));
1368 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
1369
1370 /*
1371 * We don't allow a dentry to be silly-renamed twice.
1372 */
1373 error = -EBUSY;
1374 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1375 goto out;
1376
1377 sprintf(silly, ".nfs%*.*Lx",
1378 fileidsize, fileidsize,
1379 (unsigned long long)NFS_FILEID(dentry->d_inode));
1380
1381 /* Return delegation in anticipation of the rename */
1382 nfs_inode_return_delegation(dentry->d_inode);
1383
1384 sdentry = NULL;
1385 do {
1386 char *suffix = silly + slen - countersize;
1387
1388 dput(sdentry);
1389 sillycounter++;
1390 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
1391
1392 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
1393 dentry->d_name.name, silly);
1394
1395 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
1396 /*
1397 * N.B. Better to return EBUSY here ... it could be
1398 * dangerous to delete the file while it's in use.
1399 */
1400 if (IS_ERR(sdentry))
1401 goto out;
1402 } while(sdentry->d_inode != NULL); /* need negative lookup */
1403
1404 qsilly.name = silly;
1405 qsilly.len = strlen(silly);
1406 if (dentry->d_inode) {
1407 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1408 dir, &qsilly);
1409 nfs_mark_for_revalidate(dentry->d_inode);
1410 } else
1411 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1412 dir, &qsilly);
1413 if (!error) {
1414 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1415 d_move(dentry, sdentry);
1416 error = nfs_async_unlink(dir, dentry);
1417 /* If we return 0 we don't unlink */
1418 }
1419 dput(sdentry);
1420out:
1421 return error;
1422}
1423
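For reference, the name format the deleted helper generated (the surrounding series appears to relocate silly-rename handling rather than abandon it): with a 64-bit fileid, fileidsize is 16 hex digits and countersize is 8, so slen = sizeof(".nfs") + 16 + 8 - 1 = 28. A worked example for fileid 0x2af and counter 0x1c:

	char silly[29];		/* slen = 28, plus the NUL */

	sprintf(silly, ".nfs%*.*Lx", 16, 16, (unsigned long long)0x2af);
	/* silly == ".nfs00000000000002af" */
	sprintf(silly + 28 - 8, "%*.*x", 8, 8, 0x1c);
	/* silly == ".nfs00000000000002af0000001c", 28 characters */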
1424/* 1681/*
1425 * Remove a file after making sure there are no pending writes, 1682 * Remove a file after making sure there are no pending writes,
1426 * and after checking that the file has only one user. 1683 * and after checking that the file has only one user.
@@ -1471,11 +1728,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1471 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, 1728 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
1472 dir->i_ino, dentry->d_name.name); 1729 dir->i_ino, dentry->d_name.name);
1473 1730
1474 spin_lock(&dcache_lock);
1475 spin_lock(&dentry->d_lock); 1731 spin_lock(&dentry->d_lock);
1476 if (atomic_read(&dentry->d_count) > 1) { 1732 if (dentry->d_count > 1) {
1477 spin_unlock(&dentry->d_lock); 1733 spin_unlock(&dentry->d_lock);
1478 spin_unlock(&dcache_lock);
1479 /* Start asynchronous writeout of the inode */ 1734 /* Start asynchronous writeout of the inode */
1480 write_inode_now(dentry->d_inode, 0); 1735 write_inode_now(dentry->d_inode, 0);
1481 error = nfs_sillyrename(dir, dentry); 1736 error = nfs_sillyrename(dir, dentry);
@@ -1486,7 +1741,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1486 need_rehash = 1; 1741 need_rehash = 1;
1487 } 1742 }
1488 spin_unlock(&dentry->d_lock); 1743 spin_unlock(&dentry->d_lock);
1489 spin_unlock(&dcache_lock);
1490 error = nfs_safe_remove(dentry); 1744 error = nfs_safe_remove(dentry);
1491 if (!error || error == -ENOENT) { 1745 if (!error || error == -ENOENT) {
1492 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1746 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
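This hunk tracks the dcache-scalability work: d_count is no longer an atomic_t guarded by the global dcache_lock but a plain integer protected by the per-dentry d_lock. A minimal sketch of the new rule, assuming the post-dcache_lock dentry layout:

static int dentry_shared(struct dentry *dentry)
{
	int busy;

	spin_lock(&dentry->d_lock);
	busy = dentry->d_count > 1;	/* our ref plus at least one other */
	spin_unlock(&dentry->d_lock);
	return busy;
}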
@@ -1580,7 +1834,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1580 d_drop(dentry); 1834 d_drop(dentry);
1581 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1835 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1582 if (error == 0) { 1836 if (error == 0) {
1583 atomic_inc(&inode->i_count); 1837 ihold(inode);
1584 d_add(dentry, inode); 1838 d_add(dentry, inode);
1585 } 1839 }
1586 return error; 1840 return error;
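ihold() is the sanctioned replacement for open-coded atomic_inc(&inode->i_count); my understanding is that it warns if the inode had no references left, so it is only valid while the caller already owns one, as nfs_link() does here. Sketch of the pairing:

	ihold(inode);		/* take an extra reference for the new dentry */
	d_add(dentry, inode);	/* dentry now owns that reference;
				 * the eventual dput() will iput() it */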
@@ -1621,7 +1875,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1621 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1875 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1622 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1876 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1623 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1877 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1624 atomic_read(&new_dentry->d_count)); 1878 new_dentry->d_count);
1625 1879
1626 /* 1880 /*
1627 * For non-directories, check whether the target is busy and if so, 1881 * For non-directories, check whether the target is busy and if so,
@@ -1639,7 +1893,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1639 rehash = new_dentry; 1893 rehash = new_dentry;
1640 } 1894 }
1641 1895
1642 if (atomic_read(&new_dentry->d_count) > 2) { 1896 if (new_dentry->d_count > 2) {
1643 int err; 1897 int err;
1644 1898
1645 /* copy the target dentry's name */ 1899 /* copy the target dentry's name */
@@ -1711,14 +1965,14 @@ static void nfs_access_free_list(struct list_head *head)
1711int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 1965int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
1712{ 1966{
1713 LIST_HEAD(head); 1967 LIST_HEAD(head);
1714 struct nfs_inode *nfsi; 1968 struct nfs_inode *nfsi, *next;
1715 struct nfs_access_entry *cache; 1969 struct nfs_access_entry *cache;
1716 1970
1717 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 1971 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1718 return (nr_to_scan == 0) ? 0 : -1; 1972 return (nr_to_scan == 0) ? 0 : -1;
1719 1973
1720 spin_lock(&nfs_access_lru_lock); 1974 spin_lock(&nfs_access_lru_lock);
1721 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1975 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
1722 struct inode *inode; 1976 struct inode *inode;
1723 1977
1724 if (nr_to_scan-- == 0) 1978 if (nr_to_scan-- == 0)
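The switch to list_for_each_entry_safe() matters because the loop body can now unlink the current inode from the LRU: the _safe variant caches the next node before the body runs. A self-contained sketch with a hypothetical item type:

#include <linux/list.h>
#include <linux/slab.h>

struct item {
	struct list_head link;
};

static void drain(struct list_head *head)
{
	struct item *it, *next;

	list_for_each_entry_safe(it, next, head, link) {
		list_del(&it->link);	/* safe: "next" was already sampled */
		kfree(it);
	}
}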
@@ -1941,11 +2195,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
1941 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); 2195 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
1942} 2196}
1943 2197
1944int nfs_permission(struct inode *inode, int mask) 2198int nfs_permission(struct inode *inode, int mask, unsigned int flags)
1945{ 2199{
1946 struct rpc_cred *cred; 2200 struct rpc_cred *cred;
1947 int res = 0; 2201 int res = 0;
1948 2202
2203 if (flags & IPERM_FLAG_RCU)
2204 return -ECHILD;
2205
1949 nfs_inc_stats(inode, NFSIOS_VFSACCESS); 2206 nfs_inc_stats(inode, NFSIOS_VFSACCESS);
1950 2207
1951 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2208 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -1993,7 +2250,7 @@ out:
1993out_notsup: 2250out_notsup:
1994 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 2251 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
1995 if (res == 0) 2252 if (res == 0)
1996 res = generic_permission(inode, mask, NULL); 2253 res = generic_permission(inode, mask, flags, NULL);
1997 goto out; 2254 goto out;
1998} 2255}
1999 2256
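Both permission hunks follow the RCU-walk contract introduced with lockless path lookup: when called with IPERM_FLAG_RCU (or, for ->d_revalidate(), LOOKUP_RCU) the callee may not sleep, take references, or issue RPCs, and must return -ECHILD so the VFS retries in ref-walk mode. The shape of a compliant handler:

static int example_permission(struct inode *inode, int mask, unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;	/* cannot block here; retry in ref-walk */

	/* slow path: may sleep, take references, go to the server */
	return generic_permission(inode, mask, flags, NULL);
}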
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 064a80961677..9943a75bb6d1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
407 pos += vec->iov_len; 407 pos += vec->iov_len;
408 } 408 }
409 409
410 /*
411 * If no bytes were started, return the error, and let the
412 * generic layer handle the completion.
413 */
414 if (requested_bytes == 0) {
415 nfs_direct_req_release(dreq);
416 return result < 0 ? result : -EIO;
417 }
418
410 if (put_dreq(dreq)) 419 if (put_dreq(dreq))
411 nfs_direct_complete(dreq); 420 nfs_direct_complete(dreq);
412 421 return 0;
413 if (requested_bytes != 0)
414 return 0;
415
416 if (result < 0)
417 return result;
418 return -EIO;
419} 422}
420 423
421static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 424static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
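The restructuring above hinges on the dreq reference counting (a sketch from my reading of the surrounding code: the scheduler holds one reference, each dispatched RPC another, completions call put_dreq(), and whoever drops the count to zero runs the final completion). With zero RPCs dispatched no completion will ever fire, so the scheduler must release the request itself and report the error synchronously:

	/* hedged paraphrase of the new control flow */
	if (requested_bytes == 0) {		/* nothing in flight */
		nfs_direct_req_release(dreq);	/* drop the only reference */
		return result < 0 ? result : -EIO;
	}
	if (put_dreq(dreq))			/* drop the scheduler's ref */
		nfs_direct_complete(dreq);	/* we were the last holder */
	return 0;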
@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
841 pos += vec->iov_len; 844 pos += vec->iov_len;
842 } 845 }
843 846
847 /*
848 * If no bytes were started, return the error, and let the
849 * generic layer handle the completion.
850 */
851 if (requested_bytes == 0) {
852 nfs_direct_req_release(dreq);
853 return result < 0 ? result : -EIO;
854 }
855
844 if (put_dreq(dreq)) 856 if (put_dreq(dreq))
845 nfs_direct_write_complete(dreq, dreq->inode); 857 nfs_direct_write_complete(dreq, dreq->inode);
846 858 return 0;
847 if (requested_bytes != 0)
848 return 0;
849
850 if (result < 0)
851 return result;
852 return -EIO;
853} 859}
854 860
855static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, 861static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
@@ -867,13 +873,13 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
867 goto out; 873 goto out;
868 nfs_alloc_commit_data(dreq); 874 nfs_alloc_commit_data(dreq);
869 875
870 if (dreq->commit_data == NULL || count < wsize) 876 if (dreq->commit_data == NULL || count <= wsize)
871 sync = NFS_FILE_SYNC; 877 sync = NFS_FILE_SYNC;
872 878
873 dreq->inode = inode; 879 dreq->inode = inode;
874 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 880 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
875 dreq->l_ctx = nfs_get_lock_context(dreq->ctx); 881 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
876 if (dreq->l_ctx != NULL) 882 if (dreq->l_ctx == NULL)
877 goto out_release; 883 goto out_release;
878 if (!is_sync_kiocb(iocb)) 884 if (!is_sync_kiocb(iocb))
879 dreq->iocb = iocb; 885 dreq->iocb = iocb;
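The last hunk is a straight bug fix: nfs_get_lock_context() returns the lock context on success and NULL on allocation failure, so the old test had its sense inverted, aborting every successful call through out_release while pressing on with a NULL lock context on failure. The corrected idiom:

	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
	if (dreq->l_ctx == NULL)	/* allocation failed */
		goto out_release;	/* error path, not success path */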
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index dba50a5625db..a6e711ad130f 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -167,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
167 return 0; 167 return 0;
168 } 168 }
169 item = container_of(h, struct nfs_dns_ent, h); 169 item = container_of(h, struct nfs_dns_ent, h);
170 ttl = (long)item->h.expiry_time - (long)get_seconds(); 170 ttl = item->h.expiry_time - seconds_since_boot();
171 if (ttl < 0) 171 if (ttl < 0)
172 ttl = 0; 172 ttl = 0;
173 173
@@ -239,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
239 ttl = get_expiry(&buf); 239 ttl = get_expiry(&buf);
240 if (ttl == 0) 240 if (ttl == 0)
241 goto out; 241 goto out;
242 key.h.expiry_time = ttl + get_seconds(); 242 key.h.expiry_time = ttl + seconds_since_boot();
243 243
244 ret = -ENOMEM; 244 ret = -ENOMEM;
245 item = nfs_dns_lookup(cd, &key); 245 item = nfs_dns_lookup(cd, &key);
@@ -301,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd,
301 goto out_err; 301 goto out_err;
302 ret = -ETIMEDOUT; 302 ret = -ETIMEDOUT;
303 if (!test_bit(CACHE_VALID, &(*item)->h.flags) 303 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
304 || (*item)->h.expiry_time < get_seconds() 304 || (*item)->h.expiry_time < seconds_since_boot()
305 || cd->flush_time > (*item)->h.last_refresh) 305 || cd->flush_time > (*item)->h.last_refresh)
306 goto out_put; 306 goto out_put;
307 ret = -ENOENT; 307 ret = -ENOENT;
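The clock swap above guards cache TTLs against wall-clock steps: get_seconds() jumps under settimeofday() or NTP corrections, which could expire entries early or keep them alive indefinitely. seconds_since_boot() counts elapsed time since boot instead; a sketch of what such a helper boils down to:

static unsigned long example_seconds_since_boot(void)
{
	struct timespec boot;

	getboottime(&boot);			/* wall-clock time at boot */
	return get_seconds() - boot.tv_sec;	/* elapsed since boot */
}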
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05bf3c0dc751..7bf029ef4084 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#define NFSDBG_FACILITY NFSDBG_FILE 41#define NFSDBG_FACILITY NFSDBG_FILE
41 42
@@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
386 file->f_path.dentry->d_name.name, 387 file->f_path.dentry->d_name.name,
387 mapping->host->i_ino, len, (long long) pos); 388 mapping->host->i_ino, len, (long long) pos);
388 389
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
389start: 394start:
390 /* 395 /*
391 * Prevent starvation issues if someone is doing a consistency 396 * Prevent starvation issues if someone is doing a consistency
@@ -551,7 +556,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
551 struct file *filp = vma->vm_file; 556 struct file *filp = vma->vm_file;
552 struct dentry *dentry = filp->f_path.dentry; 557 struct dentry *dentry = filp->f_path.dentry;
553 unsigned pagelen; 558 unsigned pagelen;
554 int ret = -EINVAL; 559 int ret = VM_FAULT_NOPAGE;
555 struct address_space *mapping; 560 struct address_space *mapping;
556 561
557 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", 562 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
@@ -567,21 +572,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
567 if (mapping != dentry->d_inode->i_mapping) 572 if (mapping != dentry->d_inode->i_mapping)
568 goto out_unlock; 573 goto out_unlock;
569 574
570 ret = 0;
571 pagelen = nfs_page_length(page); 575 pagelen = nfs_page_length(page);
572 if (pagelen == 0) 576 if (pagelen == 0)
573 goto out_unlock; 577 goto out_unlock;
574 578
575 ret = nfs_flush_incompatible(filp, page); 579 ret = VM_FAULT_LOCKED;
576 if (ret != 0) 580 if (nfs_flush_incompatible(filp, page) == 0 &&
577 goto out_unlock; 581 nfs_updatepage(filp, page, 0, pagelen) == 0)
582 goto out;
578 583
579 ret = nfs_updatepage(filp, page, 0, pagelen); 584 ret = VM_FAULT_SIGBUS;
580out_unlock: 585out_unlock:
581 if (!ret)
582 return VM_FAULT_LOCKED;
583 unlock_page(page); 586 unlock_page(page);
584 return VM_FAULT_SIGBUS; 587out:
588 return ret;
585} 589}
586 590
587static const struct vm_operations_struct nfs_file_vm_ops = { 591static const struct vm_operations_struct nfs_file_vm_ops = {
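This rewrite aligns nfs_vm_page_mkwrite() with the ->page_mkwrite() calling convention: the handler returns VM_FAULT_* codes, not errnos, and VM_FAULT_LOCKED specifically means the page is handed back still locked. Skeleton of a compliant handler, assuming the era-appropriate signature:

static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	if (page->mapping == NULL) {	/* truncated while we slept */
		unlock_page(page);
		return VM_FAULT_NOPAGE;	/* let the fault be retried */
	}
	/* ... make the page writable / dirty it ... */
	return VM_FAULT_LOCKED;		/* page returned locked */
}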
@@ -684,10 +688,12 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
684 return ret; 688 return ret;
685} 689}
686 690
687static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) 691static int
692do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
688{ 693{
689 struct inode *inode = filp->f_mapping->host; 694 struct inode *inode = filp->f_mapping->host;
690 int status = 0; 695 int status = 0;
696 unsigned int saved_type = fl->fl_type;
691 697
692 /* Try local locking first */ 698 /* Try local locking first */
693 posix_test_lock(filp, fl); 699 posix_test_lock(filp, fl);
@@ -695,11 +701,12 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
695 /* found a conflict */ 701 /* found a conflict */
696 goto out; 702 goto out;
697 } 703 }
704 fl->fl_type = saved_type;
698 705
699 if (nfs_have_delegation(inode, FMODE_READ)) 706 if (nfs_have_delegation(inode, FMODE_READ))
700 goto out_noconflict; 707 goto out_noconflict;
701 708
702 if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) 709 if (is_local)
703 goto out_noconflict; 710 goto out_noconflict;
704 711
705 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 712 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
@@ -726,7 +733,8 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
726 return res; 733 return res;
727} 734}
728 735
729static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) 736static int
737do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
730{ 738{
731 struct inode *inode = filp->f_mapping->host; 739 struct inode *inode = filp->f_mapping->host;
732 int status; 740 int status;
@@ -741,15 +749,24 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
741 * If we're signalled while cleaning up locks on process exit, we 749 * If we're signalled while cleaning up locks on process exit, we
742 * still need to complete the unlock. 750 * still need to complete the unlock.
743 */ 751 */
744 /* Use local locking if mounted with "-onolock" */ 752 /*
745 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 753 * Use local locking if mounted with "-onolock" or with appropriate
754 * "-olocal_lock="
755 */
756 if (!is_local)
746 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 757 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
747 else 758 else
748 status = do_vfs_lock(filp, fl); 759 status = do_vfs_lock(filp, fl);
749 return status; 760 return status;
750} 761}
751 762
752static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) 763static int
764is_time_granular(struct timespec *ts) {
765 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
766}
767
768static int
769do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
753{ 770{
754 struct inode *inode = filp->f_mapping->host; 771 struct inode *inode = filp->f_mapping->host;
755 int status; 772 int status;
@@ -762,20 +779,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
762 if (status != 0) 779 if (status != 0)
763 goto out; 780 goto out;
764 781
765 /* Use local locking if mounted with "-onolock" */ 782 /*
766 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 783 * Use local locking if mounted with "-onolock" or with appropriate
784 * "-olocal_lock="
785 */
786 if (!is_local)
767 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 787 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
768 else 788 else
769 status = do_vfs_lock(filp, fl); 789 status = do_vfs_lock(filp, fl);
770 if (status < 0) 790 if (status < 0)
771 goto out; 791 goto out;
792
772 /* 793 /*
773 * Make sure we clear the cache whenever we try to get the lock. 794 * Revalidate the cache if the server has time stamps granular
795 * enough to detect subsecond changes. Otherwise, clear the
796 * cache to prevent missing any changes.
797 *
774 * This makes locking act as a cache coherency point. 798 * This makes locking act as a cache coherency point.
775 */ 799 */
776 nfs_sync_mapping(filp->f_mapping); 800 nfs_sync_mapping(filp->f_mapping);
777 if (!nfs_have_delegation(inode, FMODE_READ)) 801 if (!nfs_have_delegation(inode, FMODE_READ)) {
778 nfs_zap_caches(inode); 802 if (is_time_granular(&NFS_SERVER(inode)->time_delta))
803 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
804 else
805 nfs_zap_caches(inode);
806 }
779out: 807out:
780 return status; 808 return status;
781} 809}
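The new gate trades a full cache zap for a cheap attribute revalidation when the server's advertised timestamp granularity (time_delta) is one microsecond or finer, since such timestamps are precise enough to expose sub-second changes. Illustrative values:

	struct timespec usec = { .tv_sec = 0, .tv_nsec = 1000 };	/* 1 us */
	struct timespec sec  = { .tv_sec = 1, .tv_nsec = 0 };		/* 1 s  */

	is_time_granular(&usec);	/* 1 -> revalidate attributes */
	is_time_granular(&sec);		/* 0 -> zap caches as before  */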
@@ -787,6 +815,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
787{ 815{
788 struct inode *inode = filp->f_mapping->host; 816 struct inode *inode = filp->f_mapping->host;
789 int ret = -ENOLCK; 817 int ret = -ENOLCK;
818 int is_local = 0;
790 819
791 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", 820 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
792 filp->f_path.dentry->d_parent->d_name.name, 821 filp->f_path.dentry->d_parent->d_name.name,
@@ -800,6 +829,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
800 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) 829 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
801 goto out_err; 830 goto out_err;
802 831
832 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
833 is_local = 1;
834
803 if (NFS_PROTO(inode)->lock_check_bounds != NULL) { 835 if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
804 ret = NFS_PROTO(inode)->lock_check_bounds(fl); 836 ret = NFS_PROTO(inode)->lock_check_bounds(fl);
805 if (ret < 0) 837 if (ret < 0)
@@ -807,11 +839,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
807 } 839 }
808 840
809 if (IS_GETLK(cmd)) 841 if (IS_GETLK(cmd))
810 ret = do_getlk(filp, cmd, fl); 842 ret = do_getlk(filp, cmd, fl, is_local);
811 else if (fl->fl_type == F_UNLCK) 843 else if (fl->fl_type == F_UNLCK)
812 ret = do_unlk(filp, cmd, fl); 844 ret = do_unlk(filp, cmd, fl, is_local);
813 else 845 else
814 ret = do_setlk(filp, cmd, fl); 846 ret = do_setlk(filp, cmd, fl, is_local);
815out_err: 847out_err:
816 return ret; 848 return ret;
817} 849}
@@ -821,6 +853,9 @@ out_err:
821 */ 853 */
822static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) 854static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
823{ 855{
856 struct inode *inode = filp->f_mapping->host;
857 int is_local = 0;
858
824 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", 859 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
825 filp->f_path.dentry->d_parent->d_name.name, 860 filp->f_path.dentry->d_parent->d_name.name,
826 filp->f_path.dentry->d_name.name, 861 filp->f_path.dentry->d_name.name,
@@ -829,14 +864,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
829 if (!(fl->fl_flags & FL_FLOCK)) 864 if (!(fl->fl_flags & FL_FLOCK))
830 return -ENOLCK; 865 return -ENOLCK;
831 866
867 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
868 is_local = 1;
869
832 /* We're simulating flock() locks using posix locks on the server */ 870 /* We're simulating flock() locks using posix locks on the server */
833 fl->fl_owner = (fl_owner_t)filp; 871 fl->fl_owner = (fl_owner_t)filp;
834 fl->fl_start = 0; 872 fl->fl_start = 0;
835 fl->fl_end = OFFSET_MAX; 873 fl->fl_end = OFFSET_MAX;
836 874
837 if (fl->fl_type == F_UNLCK) 875 if (fl->fl_type == F_UNLCK)
838 return do_unlk(filp, cmd, fl); 876 return do_unlk(filp, cmd, fl, is_local);
839 return do_setlk(filp, cmd, fl); 877 return do_setlk(filp, cmd, fl, is_local);
840} 878}
841 879
842/* 880/*
@@ -848,6 +886,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
848 dprintk("NFS: setlease(%s/%s, arg=%ld)\n", 886 dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
849 file->f_path.dentry->d_parent->d_name.name, 887 file->f_path.dentry->d_parent->d_name.name,
850 file->f_path.dentry->d_name.name, arg); 888 file->f_path.dentry->d_name.name, arg);
851
852 return -EINVAL; 889 return -EINVAL;
853} 890}
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e1605..b5ffe8fa291f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
54 iput(inode); 54 iput(inode);
55 return -ENOMEM; 55 return -ENOMEM;
56 } 56 }
57 /* Circumvent igrab(): we know the inode is not being freed */ 57 ihold(inode);
58 atomic_inc(&inode->i_count);
59 /* 58 /*
60 * Ensure that this dentry is invisible to d_find_alias(). 59 * Ensure that this dentry is invisible to d_find_alias().
61 * Otherwise, it may be spliced into the tree by 60 * Otherwise, it may be spliced into the tree by
@@ -64,9 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
64 * This again causes shrink_dcache_for_umount_subtree() to 63 * This again causes shrink_dcache_for_umount_subtree() to
65 * Oops, since the test for IS_ROOT() will fail. 64 * Oops, since the test for IS_ROOT() will fail.
66 */ 65 */
67 spin_lock(&dcache_lock); 66 spin_lock(&sb->s_root->d_inode->i_lock);
67 spin_lock(&sb->s_root->d_lock);
68 list_del_init(&sb->s_root->d_alias); 68 list_del_init(&sb->s_root->d_alias);
69 spin_unlock(&dcache_lock); 69 spin_unlock(&sb->s_root->d_lock);
70 spin_unlock(&sb->s_root->d_inode->i_lock);
70 } 71 }
71 return 0; 72 return 0;
72} 73}
@@ -118,9 +119,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
118 } 119 }
119 120
120 security_d_instantiate(ret, inode); 121 security_d_instantiate(ret, inode);
121
122 if (ret->d_op == NULL)
123 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
124out: 122out:
125 nfs_free_fattr(fsinfo.fattr); 123 nfs_free_fattr(fsinfo.fattr);
126 return ret; 124 return ret;
@@ -226,9 +224,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
226 224
227 security_d_instantiate(ret, inode); 225 security_d_instantiate(ret, inode);
228 226
229 if (ret->d_op == NULL)
230 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
231
232out: 227out:
233 nfs_free_fattr(fattr); 228 nfs_free_fattr(fattr);
234 dprintk("<-- nfs4_get_root()\n"); 229 dprintk("<-- nfs4_get_root()\n");
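Same dcache_lock retirement as elsewhere in this series: the d_alias linkage is now protected by the owning inode's i_lock nested outside the dentry's d_lock, which is exactly the pair the new code takes. The general shape:

	spin_lock(&inode->i_lock);		/* guards the inode's alias list */
	spin_lock(&dentry->d_lock);		/* guards the dentry itself      */
	list_del_init(&dentry->d_alias);	/* hide from d_find_alias()      */
	spin_unlock(&dentry->d_lock);
	spin_unlock(&inode->i_lock);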
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 21a84d45916f..18696882f1c6 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,6 +34,212 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36 36
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38
39#include <linux/slab.h>
40#include <linux/cred.h>
41#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h>
43#include <linux/key-type.h>
44#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h>
47
48#include <keys/user-type.h>
49
50#define NFS_UINT_MAXLEN 11
51
52const struct cred *id_resolver_cache;
53
54struct key_type key_type_id_resolver = {
55 .name = "id_resolver",
56 .instantiate = user_instantiate,
57 .match = user_match,
58 .revoke = user_revoke,
59 .destroy = user_destroy,
60 .describe = user_describe,
61 .read = user_read,
62};
63
64int nfs_idmap_init(void)
65{
66 struct cred *cred;
67 struct key *keyring;
68 int ret = 0;
69
70 printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
71
72 cred = prepare_kernel_cred(NULL);
73 if (!cred)
74 return -ENOMEM;
75
76 keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
77 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
78 KEY_USR_VIEW | KEY_USR_READ,
79 KEY_ALLOC_NOT_IN_QUOTA);
80 if (IS_ERR(keyring)) {
81 ret = PTR_ERR(keyring);
82 goto failed_put_cred;
83 }
84
85 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
86 if (ret < 0)
87 goto failed_put_key;
88
89 ret = register_key_type(&key_type_id_resolver);
90 if (ret < 0)
91 goto failed_put_key;
92
93 cred->thread_keyring = keyring;
94 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
95 id_resolver_cache = cred;
96 return 0;
97
98failed_put_key:
99 key_put(keyring);
100failed_put_cred:
101 put_cred(cred);
102 return ret;
103}
104
105void nfs_idmap_quit(void)
106{
107 key_revoke(id_resolver_cache->thread_keyring);
108 unregister_key_type(&key_type_id_resolver);
109 put_cred(id_resolver_cache);
110}
111
112/*
113 * Assemble the description to pass to request_key()
114 * This function will allocate a new string and update dest to point
115 * at it. The caller is responsible for freeing dest.
116 *
117 * On error a negative errno is returned; otherwise the length of dest (including the terminating NUL) is returned.
118 */
119static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
120 const char *type, size_t typelen, char **desc)
121{
122 char *cp;
123 size_t desclen = typelen + namelen + 2;
124
125 *desc = kmalloc(desclen, GFP_KERNEL);
126 if (!*desc)
127 return -ENOMEM;
128
129 cp = *desc;
130 memcpy(cp, type, typelen);
131 cp += typelen;
132 *cp++ = ':';
133
134 memcpy(cp, name, namelen);
135 cp += namelen;
136 *cp = '\0';
137 return desclen;
138}
139
140static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
141 const char *type, void *data, size_t data_size)
142{
143 const struct cred *saved_cred;
144 struct key *rkey;
145 char *desc;
146 struct user_key_payload *payload;
147 ssize_t ret;
148
149 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
150 if (ret <= 0)
151 goto out;
152
153 saved_cred = override_creds(id_resolver_cache);
154 rkey = request_key(&key_type_id_resolver, desc, "");
155 revert_creds(saved_cred);
156 kfree(desc);
157 if (IS_ERR(rkey)) {
158 ret = PTR_ERR(rkey);
159 goto out;
160 }
161
162 rcu_read_lock();
163 rkey->perm |= KEY_USR_VIEW;
164
165 ret = key_validate(rkey);
166 if (ret < 0)
167 goto out_up;
168
169 payload = rcu_dereference(rkey->payload.data);
170 if (IS_ERR_OR_NULL(payload)) {
171 ret = PTR_ERR(payload);
172 goto out_up;
173 }
174
175 ret = payload->datalen;
176 if (ret > 0 && ret <= data_size)
177 memcpy(data, payload->data, ret);
178 else
179 ret = -EINVAL;
180
181out_up:
182 rcu_read_unlock();
183 key_put(rkey);
184out:
185 return ret;
186}
187
188
189/* ID -> Name */
190static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
191{
192 char id_str[NFS_UINT_MAXLEN];
193 int id_len;
194 ssize_t ret;
195
196 id_len = snprintf(id_str, sizeof(id_str), "%u", id);
197 ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen);
198 if (ret < 0)
199 return -EINVAL;
200 return ret;
201}
202
203/* Name -> ID */
204static int nfs_idmap_lookup_id(const char *name, size_t namelen,
205 const char *type, __u32 *id)
206{
207 char id_str[NFS_UINT_MAXLEN];
208 long id_long;
209 ssize_t data_size;
210 int ret = 0;
211
212 data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
213 if (data_size <= 0) {
214 ret = -EINVAL;
215 } else {
216 ret = strict_strtol(id_str, 10, &id_long);
217 *id = (__u32)id_long;
218 }
219 return ret;
220}
221
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
223{
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225}
226
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
228{
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230}
231
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
233{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen);
235}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
237{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen);
239}
240
241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
242
37#include <linux/module.h> 243#include <linux/module.h>
38#include <linux/mutex.h> 244#include <linux/mutex.h>
39#include <linux/init.h> 245#include <linux/init.h>
@@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
503 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
504} 710}
505 711
506int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) 712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
507{ 713{
508 struct idmap *idmap = clp->cl_idmap; 714 struct idmap *idmap = clp->cl_idmap;
509 715
510 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
511} 717}
512int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) 718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
513{ 719{
514 struct idmap *idmap = clp->cl_idmap; 720 struct idmap *idmap = clp->cl_idmap;
515 721
516 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
517} 723}
518 724
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
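A worked example of the description string the new idmapper hands to request_key(): nfs_idmap_get_desc() produces "<type>:<name>", so mapping user "bob" to a uid builds:

	char *desc;
	ssize_t len;

	len = nfs_idmap_get_desc("bob", 3, "uid", 3, &desc);
	/* len == 8: "uid:bob" is 7 characters plus the NUL,
	 * since desclen = typelen + namelen + 2 counts them all */
	kfree(desc);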
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7d2d6c72aa78..1cc600e77bb4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
48#include "internal.h" 48#include "internal.h"
49#include "fscache.h" 49#include "fscache.h"
50#include "dns_resolve.h" 50#include "dns_resolve.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_VFS 53#define NFSDBG_FACILITY NFSDBG_VFS
53 54
@@ -234,9 +235,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
234 return 0; 235 return 0;
235} 236}
236 237
237/* Don't use READDIRPLUS on directories that we believe are too large */
238#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
239
240/* 238/*
241 * This is our front-end to iget that looks up inodes by file handle 239 * This is our front-end to iget that looks up inodes by file handle
242 * instead of inode number. 240 * instead of inode number.
@@ -291,8 +289,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 } else if (S_ISDIR(inode->i_mode)) { 289 } else if (S_ISDIR(inode->i_mode)) {
292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; 290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
293 inode->i_fop = &nfs_dir_operations; 291 inode->i_fop = &nfs_dir_operations;
294 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) 292 inode->i_data.a_ops = &nfs_dir_aops;
295 && fattr->size <= NFS_LIMIT_READDIRPLUS) 293 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
296 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 294 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
297 /* Deal with crossing mountpoints */ 295 /* Deal with crossing mountpoints */
298 if ((fattr->valid & NFS_ATTR_FATTR_FSID) 296 if ((fattr->valid & NFS_ATTR_FATTR_FSID)
@@ -302,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
302 else 300 else
303 inode->i_op = &nfs_mountpoint_inode_operations; 301 inode->i_op = &nfs_mountpoint_inode_operations;
304 inode->i_fop = NULL; 302 inode->i_fop = NULL;
305 set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); 303 inode->i_flags |= S_AUTOMOUNT;
306 } 304 }
307 } else if (S_ISLNK(inode->i_mode)) 305 } else if (S_ISLNK(inode->i_mode))
308 inode->i_op = &nfs_symlink_inode_operations; 306 inode->i_op = &nfs_symlink_inode_operations;
@@ -623,7 +621,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
623 nfs_revalidate_inode(server, inode); 621 nfs_revalidate_inode(server, inode);
624} 622}
625 623
626static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred) 624struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
627{ 625{
628 struct nfs_open_context *ctx; 626 struct nfs_open_context *ctx;
629 627
@@ -633,11 +631,13 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
633 path_get(&ctx->path); 631 path_get(&ctx->path);
634 ctx->cred = get_rpccred(cred); 632 ctx->cred = get_rpccred(cred);
635 ctx->state = NULL; 633 ctx->state = NULL;
634 ctx->mode = f_mode;
636 ctx->flags = 0; 635 ctx->flags = 0;
637 ctx->error = 0; 636 ctx->error = 0;
638 ctx->dir_cookie = 0; 637 ctx->dir_cookie = 0;
639 nfs_init_lock_context(&ctx->lock_context); 638 nfs_init_lock_context(&ctx->lock_context);
640 ctx->lock_context.open_context = ctx; 639 ctx->lock_context.open_context = ctx;
640 INIT_LIST_HEAD(&ctx->list);
641 } 641 }
642 return ctx; 642 return ctx;
643} 643}
@@ -653,11 +653,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
653{ 653{
654 struct inode *inode = ctx->path.dentry->d_inode; 654 struct inode *inode = ctx->path.dentry->d_inode;
655 655
656 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) 656 if (!list_empty(&ctx->list)) {
657 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
658 return;
659 list_del(&ctx->list);
660 spin_unlock(&inode->i_lock);
661 } else if (!atomic_dec_and_test(&ctx->lock_context.count))
657 return; 662 return;
658 list_del(&ctx->list); 663 if (inode != NULL)
659 spin_unlock(&inode->i_lock); 664 NFS_PROTO(inode)->close_context(ctx, is_sync);
660 NFS_PROTO(inode)->close_context(ctx, is_sync);
661 if (ctx->cred != NULL) 665 if (ctx->cred != NULL)
662 put_rpccred(ctx->cred); 666 put_rpccred(ctx->cred);
663 path_put(&ctx->path); 667 path_put(&ctx->path);
@@ -673,7 +677,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
673 * Ensure that mmap has a recent RPC credential for use when writing out 677 * Ensure that mmap has a recent RPC credential for use when writing out
674 * shared pages 678 * shared pages
675 */ 679 */
676static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) 680void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
677{ 681{
678 struct inode *inode = filp->f_path.dentry->d_inode; 682 struct inode *inode = filp->f_path.dentry->d_inode;
679 struct nfs_inode *nfsi = NFS_I(inode); 683 struct nfs_inode *nfsi = NFS_I(inode);
@@ -730,11 +734,10 @@ int nfs_open(struct inode *inode, struct file *filp)
730 cred = rpc_lookup_cred(); 734 cred = rpc_lookup_cred();
731 if (IS_ERR(cred)) 735 if (IS_ERR(cred))
732 return PTR_ERR(cred); 736 return PTR_ERR(cred);
733 ctx = alloc_nfs_open_context(&filp->f_path, cred); 737 ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
734 put_rpccred(cred); 738 put_rpccred(cred);
735 if (ctx == NULL) 739 if (ctx == NULL)
736 return -ENOMEM; 740 return -ENOMEM;
737 ctx->mode = filp->f_mode;
738 nfs_file_set_open_context(filp, ctx); 741 nfs_file_set_open_context(filp, ctx);
739 put_nfs_open_context(ctx); 742 put_nfs_open_context(ctx);
740 nfs_fscache_set_inode_cookie(inode, filp); 743 nfs_fscache_set_inode_cookie(inode, filp);
@@ -878,9 +881,10 @@ out:
878 return ret; 881 return ret;
879} 882}
880 883
881static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 884static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
882{ 885{
883 struct nfs_inode *nfsi = NFS_I(inode); 886 struct nfs_inode *nfsi = NFS_I(inode);
887 unsigned long ret = 0;
884 888
885 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) 889 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
886 && (fattr->valid & NFS_ATTR_FATTR_CHANGE) 890 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -888,25 +892,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
888 nfsi->change_attr = fattr->change_attr; 892 nfsi->change_attr = fattr->change_attr;
889 if (S_ISDIR(inode->i_mode)) 893 if (S_ISDIR(inode->i_mode))
890 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 894 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
895 ret |= NFS_INO_INVALID_ATTR;
891 } 896 }
892 /* If we have atomic WCC data, we may update some attributes */ 897 /* If we have atomic WCC data, we may update some attributes */
893 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) 898 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
894 && (fattr->valid & NFS_ATTR_FATTR_CTIME) 899 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
895 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 900 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
896 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 901 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
902 ret |= NFS_INO_INVALID_ATTR;
903 }
897 904
898 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) 905 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
899 && (fattr->valid & NFS_ATTR_FATTR_MTIME) 906 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
900 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 907 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
901 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 908 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
902 if (S_ISDIR(inode->i_mode)) 909 if (S_ISDIR(inode->i_mode))
903 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 910 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
911 ret |= NFS_INO_INVALID_ATTR;
904 } 912 }
905 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) 913 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
906 && (fattr->valid & NFS_ATTR_FATTR_SIZE) 914 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
907 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) 915 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
908 && nfsi->npages == 0) 916 && nfsi->npages == 0) {
909 i_size_write(inode, nfs_size_to_loff_t(fattr->size)); 917 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
918 ret |= NFS_INO_INVALID_ATTR;
919 }
920 return ret;
910} 921}
911 922
912/** 923/**
@@ -1205,7 +1216,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1205 /* Update the fsid? */ 1216 /* Update the fsid? */
1206 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && 1217 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1207 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1218 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1208 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1219 !IS_AUTOMOUNT(inode))
1209 server->fsid = fattr->fsid; 1220 server->fsid = fattr->fsid;
1210 1221
1211 /* 1222 /*
@@ -1220,7 +1231,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1220 | NFS_INO_REVAL_PAGECACHE); 1231 | NFS_INO_REVAL_PAGECACHE);
1221 1232
1222 /* Do atomic weak cache consistency updates */ 1233 /* Do atomic weak cache consistency updates */
1223 nfs_wcc_update_inode(inode, fattr); 1234 invalid |= nfs_wcc_update_inode(inode, fattr);
1224 1235
1225 /* More cache consistency checks */ 1236 /* More cache consistency checks */
1226 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { 1237 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
@@ -1407,6 +1418,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1407 */ 1418 */
1408void nfs4_evict_inode(struct inode *inode) 1419void nfs4_evict_inode(struct inode *inode)
1409{ 1420{
1421 pnfs_destroy_layout(NFS_I(inode));
1410 truncate_inode_pages(&inode->i_data, 0); 1422 truncate_inode_pages(&inode->i_data, 0);
1411 end_writeback(inode); 1423 end_writeback(inode);
1412 /* If we are holding a delegation, return it! */ 1424 /* If we are holding a delegation, return it! */
@@ -1434,11 +1446,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
1434 return &nfsi->vfs_inode; 1446 return &nfsi->vfs_inode;
1435} 1447}
1436 1448
1437void nfs_destroy_inode(struct inode *inode) 1449static void nfs_i_callback(struct rcu_head *head)
1438{ 1450{
1451 struct inode *inode = container_of(head, struct inode, i_rcu);
1452 INIT_LIST_HEAD(&inode->i_dentry);
1439 kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); 1453 kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
1440} 1454}
1441 1455
1456void nfs_destroy_inode(struct inode *inode)
1457{
1458 call_rcu(&inode->i_rcu, nfs_i_callback);
1459}
1460
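Deferring the free through call_rcu() is required once RCU-walk path lookup can inspect inodes without holding references: the memory must survive until every CPU passes a quiescent state. The general pattern, on a hypothetical object:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	struct rcu_head rcu;
};

static void obj_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct obj, rcu));
}

static void obj_release(struct obj *o)
{
	call_rcu(&o->rcu, obj_free_rcu);	/* freed after a grace period */
}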
1442static inline void nfs4_init_once(struct nfs_inode *nfsi) 1461static inline void nfs4_init_once(struct nfs_inode *nfsi)
1443{ 1462{
1444#ifdef CONFIG_NFS_V4 1463#ifdef CONFIG_NFS_V4
@@ -1446,6 +1465,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
1446 nfsi->delegation = NULL; 1465 nfsi->delegation = NULL;
1447 nfsi->delegation_state = 0; 1466 nfsi->delegation_state = 0;
1448 init_rwsem(&nfsi->rwsem); 1467 init_rwsem(&nfsi->rwsem);
1468 nfsi->layout = NULL;
1449#endif 1469#endif
1450} 1470}
1451 1471
@@ -1493,7 +1513,7 @@ static int nfsiod_start(void)
1493{ 1513{
1494 struct workqueue_struct *wq; 1514 struct workqueue_struct *wq;
1495 dprintk("RPC: creating workqueue nfsiod\n"); 1515 dprintk("RPC: creating workqueue nfsiod\n");
1496 wq = create_singlethread_workqueue("nfsiod"); 1516 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
1497 if (wq == NULL) 1517 if (wq == NULL)
1498 return -ENOMEM; 1518 return -ENOMEM;
1499 nfsiod_workqueue = wq; 1519 nfsiod_workqueue = wq;
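A note on the conversion above: with the concurrency-managed workqueues, alloc_workqueue() plus WQ_RESCUER attaches a rescuer thread, preserving the forward-progress guarantee under memory pressure that the old dedicated nfsiod kthread provided, while letting the work items share the system worker pool:

	wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);	/* rescuer thread keeps
							 * work draining even when
							 * new workers cannot be
							 * spawned */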
@@ -1521,6 +1541,10 @@ static int __init init_nfs_fs(void)
1521{ 1541{
1522 int err; 1542 int err;
1523 1543
1544 err = nfs_idmap_init();
1545 if (err < 0)
1546 goto out9;
1547
1524 err = nfs_dns_resolver_init(); 1548 err = nfs_dns_resolver_init();
1525 if (err < 0) 1549 if (err < 0)
1526 goto out8; 1550 goto out8;
@@ -1585,6 +1609,8 @@ out6:
1585out7: 1609out7:
1586 nfs_dns_resolver_destroy(); 1610 nfs_dns_resolver_destroy();
1587out8: 1611out8:
1612 nfs_idmap_quit();
1613out9:
1588 return err; 1614 return err;
1589} 1615}
1590 1616
@@ -1597,9 +1623,11 @@ static void __exit exit_nfs_fs(void)
1597 nfs_destroy_nfspagecache(); 1623 nfs_destroy_nfspagecache();
1598 nfs_fscache_unregister(); 1624 nfs_fscache_unregister();
1599 nfs_dns_resolver_destroy(); 1625 nfs_dns_resolver_destroy();
1626 nfs_idmap_quit();
1600#ifdef CONFIG_PROC_FS 1627#ifdef CONFIG_PROC_FS
1601 rpc_proc_unregister("nfs"); 1628 rpc_proc_unregister("nfs");
1602#endif 1629#endif
1630 nfs_cleanup_cb_ident_idr();
1603 unregister_nfs_fs(); 1631 unregister_nfs_fs();
1604 nfs_fs_proc_exit(); 1632 nfs_fs_proc_exit();
1605 nfsiod_stop(); 1633 nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c961bc92c107..cf9fdbdabc67 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,12 @@ struct nfs_clone_mount {
63#define NFS_UNSPEC_PORT (-1) 63#define NFS_UNSPEC_PORT (-1)
64 64
65/* 65/*
66 * Maximum number of pages that readdir can use for creating
67 * a vmapped array of pages.
68 */
69#define NFS_MAX_READDIR_PAGES 8
70
71/*
66 * In-kernel mount arguments 72 * In-kernel mount arguments
67 */ 73 */
68struct nfs_parsed_mount_data { 74struct nfs_parsed_mount_data {
@@ -122,9 +128,12 @@ extern void nfs_umount(const struct nfs_mount_request *info);
122/* client.c */ 128/* client.c */
123extern struct rpc_program nfs_program; 129extern struct rpc_program nfs_program;
124 130
131extern void nfs_cleanup_cb_ident_idr(void);
125extern void nfs_put_client(struct nfs_client *); 132extern void nfs_put_client(struct nfs_client *);
126extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); 133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
127extern struct nfs_client *nfs_find_client_next(struct nfs_client *); 134extern struct nfs_client *nfs4_find_client_ident(int);
135extern struct nfs_client *
136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
128extern struct nfs_server *nfs_create_server( 137extern struct nfs_server *nfs_create_server(
129 const struct nfs_parsed_mount_data *, 138 const struct nfs_parsed_mount_data *,
130 struct nfs_fh *); 139 struct nfs_fh *);
@@ -179,17 +188,20 @@ extern int __init nfs_init_directcache(void);
179extern void nfs_destroy_directcache(void); 188extern void nfs_destroy_directcache(void);
180 189
181/* nfs2xdr.c */ 190/* nfs2xdr.c */
182extern int nfs_stat_to_errno(int); 191extern int nfs_stat_to_errno(enum nfs_stat);
183extern struct rpc_procinfo nfs_procedures[]; 192extern struct rpc_procinfo nfs_procedures[];
184extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); 193extern int nfs2_decode_dirent(struct xdr_stream *,
194 struct nfs_entry *, int);
185 195
186/* nfs3xdr.c */ 196/* nfs3xdr.c */
187extern struct rpc_procinfo nfs3_procedures[]; 197extern struct rpc_procinfo nfs3_procedures[];
188extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); 198extern int nfs3_decode_dirent(struct xdr_stream *,
199 struct nfs_entry *, int);
189 200
190/* nfs4xdr.c */ 201/* nfs4xdr.c */
191#ifdef CONFIG_NFS_V4 202#ifdef CONFIG_NFS_V4
192extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 203extern int nfs4_decode_dirent(struct xdr_stream *,
204 struct nfs_entry *, int);
193#endif 205#endif
194#ifdef CONFIG_NFS_V4_1 206#ifdef CONFIG_NFS_V4_1
195extern const u32 nfs41_maxread_overhead; 207extern const u32 nfs41_maxread_overhead;
@@ -239,6 +251,7 @@ extern char *nfs_path(const char *base,
239 const struct dentry *droot, 251 const struct dentry *droot,
240 const struct dentry *dentry, 252 const struct dentry *dentry,
241 char *buffer, ssize_t buflen); 253 char *buffer, ssize_t buflen);
254extern struct vfsmount *nfs_d_automount(struct path *path);
242 255
243/* getroot.c */ 256/* getroot.c */
244extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 257extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
@@ -356,6 +369,15 @@ unsigned int nfs_page_length(struct page *page)
356} 369}
357 370
358/* 371/*
372 * Convert a umode to a dirent->d_type
373 */
374static inline
375unsigned char nfs_umode_to_dtype(umode_t mode)
376{
377 return (mode >> 12) & 15;
378}
379
380/*
359 * Determine the number of pages in an array of length 'len' and 381 * Determine the number of pages in an array of length 'len' and
360 * with a base offset of 'base' 382 * with a base offset of 'base'
361 */ 383 */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d72..d4c2d6b7507e 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
153 .rpc_resp = &result, 153 .rpc_resp = &result,
154 }; 154 };
155 struct rpc_create_args args = { 155 struct rpc_create_args args = {
156 .net = &init_net,
156 .protocol = info->protocol, 157 .protocol = info->protocol,
157 .address = info->sap, 158 .address = info->sap,
158 .addrsize = info->salen, 159 .addrsize = info->salen,
@@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
224 .to_retries = 2, 225 .to_retries = 2,
225 }; 226 };
226 struct rpc_create_args args = { 227 struct rpc_create_args args = {
228 .net = &init_net,
227 .protocol = IPPROTO_UDP, 229 .protocol = IPPROTO_UDP,
228 .address = info->sap, 230 .address = info->sap,
229 .addrsize = info->salen, 231 .addrsize = info->salen,
@@ -234,10 +236,8 @@ void nfs_umount(const struct nfs_mount_request *info)
234 .authflavor = RPC_AUTH_UNIX, 236 .authflavor = RPC_AUTH_UNIX,
235 .flags = RPC_CLNT_CREATE_NOPING, 237 .flags = RPC_CLNT_CREATE_NOPING,
236 }; 238 };
237 struct mountres result;
238 struct rpc_message msg = { 239 struct rpc_message msg = {
239 .rpc_argp = info->dirpath, 240 .rpc_argp = info->dirpath,
240 .rpc_resp = &result,
241 }; 241 };
242 struct rpc_clnt *clnt; 242 struct rpc_clnt *clnt;
243 int status; 243 int status;
@@ -246,7 +246,7 @@ void nfs_umount(const struct nfs_mount_request *info)
246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
247 247
248 clnt = rpc_create(&args); 248 clnt = rpc_create(&args);
249 if (unlikely(IS_ERR(clnt))) 249 if (IS_ERR(clnt))
250 goto out_clnt_err; 250 goto out_clnt_err;
251 251
252 dprintk("NFS: sending UMNT request for %s:%s\n", 252 dprintk("NFS: sending UMNT request for %s:%s\n",
@@ -278,29 +278,20 @@ out_call_err:
278 * XDR encode/decode functions for MOUNT 278 * XDR encode/decode functions for MOUNT
279 */ 279 */
280 280
281static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) 281static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
282{ 282{
283 const u32 pathname_len = strlen(pathname); 283 const u32 pathname_len = strlen(pathname);
284 __be32 *p; 284 __be32 *p;
285 285
286 if (unlikely(pathname_len > MNTPATHLEN)) 286 BUG_ON(pathname_len > MNTPATHLEN);
287 return -EIO; 287 p = xdr_reserve_space(xdr, 4 + pathname_len);
288
289 p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len);
290 if (unlikely(p == NULL))
291 return -EIO;
292 xdr_encode_opaque(p, pathname, pathname_len); 288 xdr_encode_opaque(p, pathname, pathname_len);
293
294 return 0;
295} 289}
296 290
297static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p, 291static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
298 const char *dirpath) 292 const char *dirpath)
299{ 293{
300 struct xdr_stream xdr; 294 encode_mntdirpath(xdr, dirpath);
301
302 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
303 return encode_mntdirpath(&xdr, dirpath);
304} 295}
305 296
306/* 297/*
@@ -318,10 +309,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
318 u32 status; 309 u32 status;
319 __be32 *p; 310 __be32 *p;
320 311
321 p = xdr_inline_decode(xdr, sizeof(status)); 312 p = xdr_inline_decode(xdr, 4);
322 if (unlikely(p == NULL)) 313 if (unlikely(p == NULL))
323 return -EIO; 314 return -EIO;
324 status = ntohl(*p); 315 status = be32_to_cpup(p);
325 316
326 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { 317 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
327 if (mnt_errtbl[i].status == status) { 318 if (mnt_errtbl[i].status == status) {
@@ -349,18 +340,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
349 return 0; 340 return 0;
350} 341}
351 342
352static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p, 343static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
353 struct mountres *res) 344 struct xdr_stream *xdr,
345 struct mountres *res)
354{ 346{
355 struct xdr_stream xdr;
356 int status; 347 int status;
357 348
358 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 349 status = decode_status(xdr, res);
359
360 status = decode_status(&xdr, res);
361 if (unlikely(status != 0 || res->errno != 0)) 350 if (unlikely(status != 0 || res->errno != 0))
362 return status; 351 return status;
363 return decode_fhandle(&xdr, res); 352 return decode_fhandle(xdr, res);
364} 353}
365 354
366static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) 355static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
@@ -369,10 +358,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
369 u32 status; 358 u32 status;
370 __be32 *p; 359 __be32 *p;
371 360
372 p = xdr_inline_decode(xdr, sizeof(status)); 361 p = xdr_inline_decode(xdr, 4);
373 if (unlikely(p == NULL)) 362 if (unlikely(p == NULL))
374 return -EIO; 363 return -EIO;
375 status = ntohl(*p); 364 status = be32_to_cpup(p);
376 365
377 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { 366 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
378 if (mnt3_errtbl[i].status == status) { 367 if (mnt3_errtbl[i].status == status) {
@@ -392,11 +381,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
392 u32 size; 381 u32 size;
393 __be32 *p; 382 __be32 *p;
394 383
395 p = xdr_inline_decode(xdr, sizeof(size)); 384 p = xdr_inline_decode(xdr, 4);
396 if (unlikely(p == NULL)) 385 if (unlikely(p == NULL))
397 return -EIO; 386 return -EIO;
398 387
399 size = ntohl(*p++); 388 size = be32_to_cpup(p);
400 if (size > NFS3_FHSIZE || size == 0) 389 if (size > NFS3_FHSIZE || size == 0)
401 return -EIO; 390 return -EIO;
402 391
@@ -419,15 +408,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
419 if (*count == 0) 408 if (*count == 0)
420 return 0; 409 return 0;
421 410
422 p = xdr_inline_decode(xdr, sizeof(entries)); 411 p = xdr_inline_decode(xdr, 4);
423 if (unlikely(p == NULL)) 412 if (unlikely(p == NULL))
424 return -EIO; 413 return -EIO;
425 entries = ntohl(*p); 414 entries = be32_to_cpup(p);
426 dprintk("NFS: received %u auth flavors\n", entries); 415 dprintk("NFS: received %u auth flavors\n", entries);
427 if (entries > NFS_MAX_SECFLAVORS) 416 if (entries > NFS_MAX_SECFLAVORS)
428 entries = NFS_MAX_SECFLAVORS; 417 entries = NFS_MAX_SECFLAVORS;
429 418
430 p = xdr_inline_decode(xdr, sizeof(u32) * entries); 419 p = xdr_inline_decode(xdr, 4 * entries);
431 if (unlikely(p == NULL)) 420 if (unlikely(p == NULL))
432 return -EIO; 421 return -EIO;
433 422
@@ -435,38 +424,36 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
435 entries = *count; 424 entries = *count;
436 425
437 for (i = 0; i < entries; i++) { 426 for (i = 0; i < entries; i++) {
438 flavors[i] = ntohl(*p++); 427 flavors[i] = be32_to_cpup(p++);
439 dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]); 428 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
440 } 429 }
441 *count = i; 430 *count = i;
442 431
443 return 0; 432 return 0;
444} 433}
445 434
446static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p, 435static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
447 struct mountres *res) 436 struct xdr_stream *xdr,
437 struct mountres *res)
448{ 438{
449 struct xdr_stream xdr;
450 int status; 439 int status;
451 440
452 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 441 status = decode_fhs_status(xdr, res);
453
454 status = decode_fhs_status(&xdr, res);
455 if (unlikely(status != 0 || res->errno != 0)) 442 if (unlikely(status != 0 || res->errno != 0))
456 return status; 443 return status;
457 status = decode_fhandle3(&xdr, res); 444 status = decode_fhandle3(xdr, res);
458 if (unlikely(status != 0)) { 445 if (unlikely(status != 0)) {
459 res->errno = -EBADHANDLE; 446 res->errno = -EBADHANDLE;
460 return 0; 447 return 0;
461 } 448 }
462 return decode_auth_flavors(&xdr, res); 449 return decode_auth_flavors(xdr, res);
463} 450}
464 451
465static struct rpc_procinfo mnt_procedures[] = { 452static struct rpc_procinfo mnt_procedures[] = {
466 [MOUNTPROC_MNT] = { 453 [MOUNTPROC_MNT] = {
467 .p_proc = MOUNTPROC_MNT, 454 .p_proc = MOUNTPROC_MNT,
468 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 455 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
469 .p_decode = (kxdrproc_t)mnt_dec_mountres, 456 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres,
470 .p_arglen = MNT_enc_dirpath_sz, 457 .p_arglen = MNT_enc_dirpath_sz,
471 .p_replen = MNT_dec_mountres_sz, 458 .p_replen = MNT_dec_mountres_sz,
472 .p_statidx = MOUNTPROC_MNT, 459 .p_statidx = MOUNTPROC_MNT,
@@ -474,7 +461,7 @@ static struct rpc_procinfo mnt_procedures[] = {
474 }, 461 },
475 [MOUNTPROC_UMNT] = { 462 [MOUNTPROC_UMNT] = {
476 .p_proc = MOUNTPROC_UMNT, 463 .p_proc = MOUNTPROC_UMNT,
477 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 464 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
478 .p_arglen = MNT_enc_dirpath_sz, 465 .p_arglen = MNT_enc_dirpath_sz,
479 .p_statidx = MOUNTPROC_UMNT, 466 .p_statidx = MOUNTPROC_UMNT,
480 .p_name = "UMOUNT", 467 .p_name = "UMOUNT",
@@ -484,8 +471,8 @@ static struct rpc_procinfo mnt_procedures[] = {
484static struct rpc_procinfo mnt3_procedures[] = { 471static struct rpc_procinfo mnt3_procedures[] = {
485 [MOUNTPROC3_MNT] = { 472 [MOUNTPROC3_MNT] = {
486 .p_proc = MOUNTPROC3_MNT, 473 .p_proc = MOUNTPROC3_MNT,
487 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 474 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
488 .p_decode = (kxdrproc_t)mnt_dec_mountres3, 475 .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3,
489 .p_arglen = MNT_enc_dirpath_sz, 476 .p_arglen = MNT_enc_dirpath_sz,
490 .p_replen = MNT_dec_mountres3_sz, 477 .p_replen = MNT_dec_mountres3_sz,
491 .p_statidx = MOUNTPROC3_MNT, 478 .p_statidx = MOUNTPROC3_MNT,
@@ -493,7 +480,7 @@ static struct rpc_procinfo mnt3_procedures[] = {
493 }, 480 },
494 [MOUNTPROC3_UMNT] = { 481 [MOUNTPROC3_UMNT] = {
495 .p_proc = MOUNTPROC3_UMNT, 482 .p_proc = MOUNTPROC3_UMNT,
496 .p_encode = (kxdrproc_t)mnt_enc_dirpath, 483 .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
497 .p_arglen = MNT_enc_dirpath_sz, 484 .p_arglen = MNT_enc_dirpath_sz,
498 .p_statidx = MOUNTPROC3_UMNT, 485 .p_statidx = MOUNTPROC3_UMNT,
499 .p_name = "UMOUNT", 486 .p_name = "UMOUNT",
@@ -503,13 +490,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
503 490
504static struct rpc_version mnt_version1 = { 491static struct rpc_version mnt_version1 = {
505 .number = 1, 492 .number = 1,
506 .nrprocs = 2, 493 .nrprocs = ARRAY_SIZE(mnt_procedures),
507 .procs = mnt_procedures, 494 .procs = mnt_procedures,
508}; 495};
509 496
510static struct rpc_version mnt_version3 = { 497static struct rpc_version mnt_version3 = {
511 .number = 3, 498 .number = 3,
512 .nrprocs = 2, 499 .nrprocs = ARRAY_SIZE(mnt3_procedures),
513 .procs = mnt3_procedures, 500 .procs = mnt3_procedures,
514}; 501};
515 502
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index db6aa3673cf3..f32b8603dca8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -49,12 +49,17 @@ char *nfs_path(const char *base,
49 const struct dentry *dentry, 49 const struct dentry *dentry,
50 char *buffer, ssize_t buflen) 50 char *buffer, ssize_t buflen)
51{ 51{
52 char *end = buffer+buflen; 52 char *end;
53 int namelen; 53 int namelen;
54 unsigned seq;
54 55
56rename_retry:
57 end = buffer+buflen;
55 *--end = '\0'; 58 *--end = '\0';
56 buflen--; 59 buflen--;
57 spin_lock(&dcache_lock); 60
61 seq = read_seqbegin(&rename_lock);
62 rcu_read_lock();
58 while (!IS_ROOT(dentry) && dentry != droot) { 63 while (!IS_ROOT(dentry) && dentry != droot) {
59 namelen = dentry->d_name.len; 64 namelen = dentry->d_name.len;
60 buflen -= namelen + 1; 65 buflen -= namelen + 1;
@@ -65,7 +70,9 @@ char *nfs_path(const char *base,
65 *--end = '/'; 70 *--end = '/';
66 dentry = dentry->d_parent; 71 dentry = dentry->d_parent;
67 } 72 }
68 spin_unlock(&dcache_lock); 73 rcu_read_unlock();
74 if (read_seqretry(&rename_lock, seq))
75 goto rename_retry;
69 if (*end != '/') { 76 if (*end != '/') {
70 if (--buflen < 0) 77 if (--buflen < 0)
71 goto Elong; 78 goto Elong;
@@ -82,15 +89,16 @@ char *nfs_path(const char *base,
82 memcpy(end, base, namelen); 89 memcpy(end, base, namelen);
83 return end; 90 return end;
84Elong_unlock: 91Elong_unlock:
85 spin_unlock(&dcache_lock); 92 rcu_read_unlock();
93 if (read_seqretry(&rename_lock, seq))
94 goto rename_retry;
86Elong: 95Elong:
87 return ERR_PTR(-ENAMETOOLONG); 96 return ERR_PTR(-ENAMETOOLONG);
88} 97}
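
A minimal sketch of the lockless walk adopted above, using only the documented rename_lock/RCU rules (the helper name is hypothetical):

	#include <linux/dcache.h>
	#include <linux/rcupdate.h>

	/* Count path components without taking dcache_lock: sample the
	 * rename seqlock, walk d_parent under RCU, and retry the whole
	 * walk if a concurrent rename moved anything underneath us. */
	static int count_path_components(const struct dentry *dentry)
	{
		const struct dentry *d;
		unsigned seq;
		int n;

	retry:
		n = 0;
		seq = read_seqbegin(&rename_lock);
		rcu_read_lock();
		for (d = dentry; !IS_ROOT(d); d = d->d_parent)
			n++;
		rcu_read_unlock();
		if (read_seqretry(&rename_lock, seq))
			goto retry;
		return n;
	}
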
89 98
90/* 99/*
91 * nfs_follow_mountpoint - handle crossing a mountpoint on the server 100 * nfs_d_automount - Handle crossing a mountpoint on the server
92 * @dentry - dentry of mountpoint 101 * @path - The mountpoint
93 * @nd - nameidata info
94 * 102 *
95 * When we encounter a mountpoint on the server, we want to set up 103 * When we encounter a mountpoint on the server, we want to set up
96 * a mountpoint on the client too, to prevent inode numbers from 104 * a mountpoint on the client too, to prevent inode numbers from
@@ -100,87 +108,65 @@ Elong:
100 * situation, and that different filesystems may want to use 108 * situation, and that different filesystems may want to use
101 * different security flavours. 109 * different security flavours.
102 */ 110 */
103static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 111struct vfsmount *nfs_d_automount(struct path *path)
104{ 112{
105 struct vfsmount *mnt; 113 struct vfsmount *mnt;
106 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 114 struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
107 struct dentry *parent; 115 struct dentry *parent;
108 struct nfs_fh *fh = NULL; 116 struct nfs_fh *fh = NULL;
109 struct nfs_fattr *fattr = NULL; 117 struct nfs_fattr *fattr = NULL;
110 int err; 118 int err;
111 119
112 dprintk("--> nfs_follow_mountpoint()\n"); 120 dprintk("--> nfs_d_automount()\n");
113 121
114 err = -ESTALE; 122 mnt = ERR_PTR(-ESTALE);
115 if (IS_ROOT(dentry)) 123 if (IS_ROOT(path->dentry))
116 goto out_err; 124 goto out_nofree;
117 125
118 err = -ENOMEM; 126 mnt = ERR_PTR(-ENOMEM);
119 fh = nfs_alloc_fhandle(); 127 fh = nfs_alloc_fhandle();
120 fattr = nfs_alloc_fattr(); 128 fattr = nfs_alloc_fattr();
121 if (fh == NULL || fattr == NULL) 129 if (fh == NULL || fattr == NULL)
122 goto out_err; 130 goto out;
123 131
124 dprintk("%s: enter\n", __func__); 132 dprintk("%s: enter\n", __func__);
125 dput(nd->path.dentry);
126 nd->path.dentry = dget(dentry);
127 133
128 /* Look it up again */ 134 /* Look it up again to get its attributes */
129 parent = dget_parent(nd->path.dentry); 135 parent = dget_parent(path->dentry);
130 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 136 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
131 &nd->path.dentry->d_name, 137 &path->dentry->d_name,
132 fh, fattr); 138 fh, fattr);
133 dput(parent); 139 dput(parent);
134 if (err != 0) 140 if (err != 0) {
135 goto out_err; 141 mnt = ERR_PTR(err);
142 goto out;
143 }
136 144
137 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 145 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
138 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 146 mnt = nfs_do_refmount(path->mnt, path->dentry);
139 else 147 else
140 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, 148 mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
141 fattr);
142 err = PTR_ERR(mnt);
143 if (IS_ERR(mnt)) 149 if (IS_ERR(mnt))
144 goto out_err; 150 goto out;
145 151
146 mntget(mnt); 152 dprintk("%s: done, success\n", __func__);
147 err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, 153 mntget(mnt); /* prevent immediate expiration */
148 &nfs_automount_list); 154 mnt_set_expiry(mnt, &nfs_automount_list);
149 if (err < 0) {
150 mntput(mnt);
151 if (err == -EBUSY)
152 goto out_follow;
153 goto out_err;
154 }
155 path_put(&nd->path);
156 nd->path.mnt = mnt;
157 nd->path.dentry = dget(mnt->mnt_root);
158 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 155 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
156
159out: 157out:
160 nfs_free_fattr(fattr); 158 nfs_free_fattr(fattr);
161 nfs_free_fhandle(fh); 159 nfs_free_fhandle(fh);
162 dprintk("%s: done, returned %d\n", __func__, err); 160out_nofree:
 163 161 dprintk("<-- nfs_d_automount() = %p\n", mnt);
164 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 162 return mnt;
165 return ERR_PTR(err);
166out_err:
167 path_put(&nd->path);
168 goto out;
169out_follow:
170 while (d_mountpoint(nd->path.dentry) &&
171 follow_down(&nd->path))
172 ;
173 err = 0;
174 goto out;
175} 163}
176 164
177const struct inode_operations nfs_mountpoint_inode_operations = { 165const struct inode_operations nfs_mountpoint_inode_operations = {
178 .follow_link = nfs_follow_mountpoint,
179 .getattr = nfs_getattr, 166 .getattr = nfs_getattr,
180}; 167};
181 168
182const struct inode_operations nfs_referral_inode_operations = { 169const struct inode_operations nfs_referral_inode_operations = {
183 .follow_link = nfs_follow_mountpoint,
184}; 170};
185 171
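
A minimal sketch of the expiry scheme mnt_set_expiry() feeds, matching the nfs_expire_automounts() worker visible just below (list and function names here are illustrative):

	static LIST_HEAD(example_automount_list);

	/* Each submount parked on the list by mnt_set_expiry() is a
	 * candidate; mark_mounts_for_expiry() unmounts the ones that
	 * have not been touched since the previous pass. */
	static void example_expire_worker(struct work_struct *work)
	{
		mark_mounts_for_expiry(&example_automount_list);
	}
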
186static void nfs_expire_automounts(struct work_struct *work) 172static void nfs_expire_automounts(struct work_struct *work)
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index db8846a0e82e..792cb13a4304 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,609 +61,1008 @@
61#define NFS_readdirres_sz (1) 61#define NFS_readdirres_sz (1)
62#define NFS_statfsres_sz (1+NFS_info_sz) 62#define NFS_statfsres_sz (1+NFS_info_sz)
63 63
64
65/*
66 * While encoding arguments, set up the reply buffer in advance to
67 * receive reply data directly into the page cache.
68 */
69static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
70 unsigned int base, unsigned int len,
71 unsigned int bufsize)
72{
73 struct rpc_auth *auth = req->rq_cred->cr_auth;
74 unsigned int replen;
75
76 replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
77 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
78}
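
A worked version of the offset arithmetic above (the helper is hypothetical; the fields are the ones the function already reads):

	/* RPC_REPHDRSIZE, au_rslack and bufsize all count 4-byte XDR
	 * words, so the byte offset at which page data begins in the
	 * receive buffer is the word total shifted left by two. */
	static unsigned int reply_page_offset(const struct rpc_rqst *req,
					      unsigned int bufsize)
	{
		const struct rpc_auth *auth = req->rq_cred->cr_auth;

		return (RPC_REPHDRSIZE + auth->au_rslack + bufsize) << 2;
	}
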
79
80/*
81 * Handle decode buffer overflows out-of-line.
82 */
83static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
84{
85 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
86 "Remaining buffer length is %tu words.\n",
87 func, xdr->end - xdr->p);
88}
89
90
91/*
92 * Encode/decode NFSv2 basic data types
93 *
94 * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
95 * "NFS: Network File System Protocol Specification".
96 *
97 * Not all basic data types have their own encoding and decoding
98 * functions. For run-time efficiency, some data types are encoded
99 * or decoded inline.
100 */
101
102/*
103 * typedef opaque nfsdata<>;
104 */
105static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
106{
107 u32 recvd, count;
108 size_t hdrlen;
109 __be32 *p;
110
111 p = xdr_inline_decode(xdr, 4);
112 if (unlikely(p == NULL))
113 goto out_overflow;
114 count = be32_to_cpup(p);
115 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
116 recvd = xdr->buf->len - hdrlen;
117 if (unlikely(count > recvd))
118 goto out_cheating;
119out:
120 xdr_read_pages(xdr, count);
121 result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. */
122 result->count = count;
123 return count;
124out_cheating:
125 dprintk("NFS: server cheating in read result: "
126 "count %u > recvd %u\n", count, recvd);
127 count = recvd;
128 goto out;
129out_overflow:
130 print_overflow_msg(__func__, xdr);
131 return -EIO;
132}
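
A minimal sketch of the "server cheating" guard above as a standalone helper (the name is hypothetical; the pointer arithmetic mirrors the code):

	/* Never trust the server's advertised byte count beyond what
	 * actually arrived: recvd is the reply length minus the bytes
	 * already consumed from the head iovec. */
	static u32 clamp_to_received(const struct xdr_stream *xdr, u32 count)
	{
		size_t hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
		u32 recvd = xdr->buf->len - hdrlen;

		return count > recvd ? recvd : count;
	}
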
133
134/*
135 * enum stat {
136 * NFS_OK = 0,
137 * NFSERR_PERM = 1,
138 * NFSERR_NOENT = 2,
139 * NFSERR_IO = 5,
140 * NFSERR_NXIO = 6,
141 * NFSERR_ACCES = 13,
142 * NFSERR_EXIST = 17,
143 * NFSERR_NODEV = 19,
144 * NFSERR_NOTDIR = 20,
145 * NFSERR_ISDIR = 21,
146 * NFSERR_FBIG = 27,
147 * NFSERR_NOSPC = 28,
148 * NFSERR_ROFS = 30,
149 * NFSERR_NAMETOOLONG = 63,
150 * NFSERR_NOTEMPTY = 66,
151 * NFSERR_DQUOT = 69,
152 * NFSERR_STALE = 70,
153 * NFSERR_WFLUSH = 99
154 * };
155 */
156static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
157{
158 __be32 *p;
159
160 p = xdr_inline_decode(xdr, 4);
161 if (unlikely(p == NULL))
162 goto out_overflow;
163 *status = be32_to_cpup(p);
164 return 0;
165out_overflow:
166 print_overflow_msg(__func__, xdr);
167 return -EIO;
168}
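
The same three-line pattern recurs throughout this file; a minimal sketch of the xdr_inline_decode() contract it relies on (the helper name is hypothetical):

	/* Ask the stream for four contiguous bytes; NULL means the
	 * reply ended early, which every caller maps to -EIO. */
	static int decode_u32(struct xdr_stream *xdr, u32 *value)
	{
		__be32 *p = xdr_inline_decode(xdr, 4);

		if (unlikely(p == NULL))
			return -EIO;
		*value = be32_to_cpup(p);
		return 0;
	}
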
169
64/* 170/*
65 * Common NFS XDR functions as inlines 171 * 2.3.2. ftype
172 *
173 * enum ftype {
174 * NFNON = 0,
175 * NFREG = 1,
176 * NFDIR = 2,
177 * NFBLK = 3,
178 * NFCHR = 4,
179 * NFLNK = 5
180 * };
181 *
66 */ 182 */
67static inline __be32 * 183static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
68xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle)
69{ 184{
70 memcpy(p, fhandle->data, NFS2_FHSIZE); 185 *type = be32_to_cpup(p++);
71 return p + XDR_QUADLEN(NFS2_FHSIZE); 186 if (unlikely(*type > NF2FIFO))
187 *type = NFBAD;
188 return p;
72} 189}
73 190
74static inline __be32 * 191/*
75xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle) 192 * 2.3.3. fhandle
193 *
194 * typedef opaque fhandle[FHSIZE];
195 */
196static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
76{ 197{
77 /* NFSv2 handles have a fixed length */ 198 __be32 *p;
78 fhandle->size = NFS2_FHSIZE; 199
79 memcpy(fhandle->data, p, NFS2_FHSIZE); 200 BUG_ON(fh->size != NFS2_FHSIZE);
80 return p + XDR_QUADLEN(NFS2_FHSIZE); 201 p = xdr_reserve_space(xdr, NFS2_FHSIZE);
202 memcpy(p, fh->data, NFS2_FHSIZE);
81} 203}
82 204
83static inline __be32* 205static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
84xdr_encode_time(__be32 *p, struct timespec *timep)
85{ 206{
86 *p++ = htonl(timep->tv_sec); 207 __be32 *p;
87 /* Convert nanoseconds into microseconds */ 208
88 *p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0); 209 p = xdr_inline_decode(xdr, NFS2_FHSIZE);
210 if (unlikely(p == NULL))
211 goto out_overflow;
212 fh->size = NFS2_FHSIZE;
213 memcpy(fh->data, p, NFS2_FHSIZE);
214 return 0;
215out_overflow:
216 print_overflow_msg(__func__, xdr);
217 return -EIO;
218}
219
220/*
221 * 2.3.4. timeval
222 *
223 * struct timeval {
224 * unsigned int seconds;
225 * unsigned int useconds;
226 * };
227 */
228static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
229{
230 *p++ = cpu_to_be32(timep->tv_sec);
231 if (timep->tv_nsec != 0)
232 *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
233 else
234 *p++ = cpu_to_be32(0);
89 return p; 235 return p;
90} 236}
91 237
92static inline __be32* 238/*
93xdr_encode_current_server_time(__be32 *p, struct timespec *timep) 239 * Passing the invalid value useconds=1000000 is a Sun convention for
240 * "set to current server time". It's needed to make permissions checks
241 * for the "touch" program across v2 mounts to Solaris and Irix servers
242 * work correctly. See description of sattr in section 6.1 of "NFS
243 * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
244 */
245static __be32 *xdr_encode_current_server_time(__be32 *p,
246 const struct timespec *timep)
94{ 247{
95 /* 248 *p++ = cpu_to_be32(timep->tv_sec);
96 * Passing the invalid value useconds=1000000 is a 249 *p++ = cpu_to_be32(1000000);
97 * Sun convention for "set to current server time".
98 * It's needed to make permissions checks for the
99 * "touch" program across v2 mounts to Solaris and
100 * Irix boxes work correctly. See description of
101 * sattr in section 6.1 of "NFS Illustrated" by
102 * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
103 */
104 *p++ = htonl(timep->tv_sec);
105 *p++ = htonl(1000000);
106 return p; 250 return p;
107} 251}
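
A hedged example of when the magic value fires: utimes(2) with a NULL times argument, which is what touch(1) issues, sets ATTR_ATIME|ATTR_MTIME without the *_SET bits, steering encode_sattr() below into this branch (the helper is illustrative):

	static void example_touch_to_now(struct iattr *attr)
	{
		/* no ATTR_ATIME_SET/ATTR_MTIME_SET: "server, use your
		 * own clock", encoded as useconds=1000000 */
		attr->ia_valid |= ATTR_ATIME | ATTR_MTIME;
	}
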
108 252
109static inline __be32* 253static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
110xdr_decode_time(__be32 *p, struct timespec *timep)
111{ 254{
112 timep->tv_sec = ntohl(*p++); 255 timep->tv_sec = be32_to_cpup(p++);
113 /* Convert microseconds into nanoseconds */ 256 timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
114 timep->tv_nsec = ntohl(*p++) * 1000;
115 return p; 257 return p;
116} 258}
117 259
118static __be32 * 260/*
119xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 261 * 2.3.5. fattr
262 *
263 * struct fattr {
264 * ftype type;
265 * unsigned int mode;
266 * unsigned int nlink;
267 * unsigned int uid;
268 * unsigned int gid;
269 * unsigned int size;
270 * unsigned int blocksize;
271 * unsigned int rdev;
272 * unsigned int blocks;
273 * unsigned int fsid;
274 * unsigned int fileid;
275 * timeval atime;
276 * timeval mtime;
277 * timeval ctime;
278 * };
279 *
280 */
281static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
120{ 282{
121 u32 rdev, type; 283 u32 rdev, type;
122 type = ntohl(*p++); 284 __be32 *p;
123 fattr->mode = ntohl(*p++); 285
124 fattr->nlink = ntohl(*p++); 286 p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
125 fattr->uid = ntohl(*p++); 287 if (unlikely(p == NULL))
126 fattr->gid = ntohl(*p++); 288 goto out_overflow;
127 fattr->size = ntohl(*p++); 289
128 fattr->du.nfs2.blocksize = ntohl(*p++);
129 rdev = ntohl(*p++);
130 fattr->du.nfs2.blocks = ntohl(*p++);
131 fattr->fsid.major = ntohl(*p++);
132 fattr->fsid.minor = 0;
133 fattr->fileid = ntohl(*p++);
134 p = xdr_decode_time(p, &fattr->atime);
135 p = xdr_decode_time(p, &fattr->mtime);
136 p = xdr_decode_time(p, &fattr->ctime);
137 fattr->valid |= NFS_ATTR_FATTR_V2; 290 fattr->valid |= NFS_ATTR_FATTR_V2;
291
292 p = xdr_decode_ftype(p, &type);
293
294 fattr->mode = be32_to_cpup(p++);
295 fattr->nlink = be32_to_cpup(p++);
296 fattr->uid = be32_to_cpup(p++);
297 fattr->gid = be32_to_cpup(p++);
298 fattr->size = be32_to_cpup(p++);
299 fattr->du.nfs2.blocksize = be32_to_cpup(p++);
300
301 rdev = be32_to_cpup(p++);
138 fattr->rdev = new_decode_dev(rdev); 302 fattr->rdev = new_decode_dev(rdev);
139 if (type == NFCHR && rdev == NFS2_FIFO_DEV) { 303 if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
140 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; 304 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
141 fattr->rdev = 0; 305 fattr->rdev = 0;
142 } 306 }
143 return p; 307
308 fattr->du.nfs2.blocks = be32_to_cpup(p++);
309 fattr->fsid.major = be32_to_cpup(p++);
310 fattr->fsid.minor = 0;
311 fattr->fileid = be32_to_cpup(p++);
312
313 p = xdr_decode_time(p, &fattr->atime);
314 p = xdr_decode_time(p, &fattr->mtime);
315 xdr_decode_time(p, &fattr->ctime);
316 return 0;
317out_overflow:
318 print_overflow_msg(__func__, xdr);
319 return -EIO;
144} 320}
145 321
146static inline __be32 * 322/*
147xdr_encode_sattr(__be32 *p, struct iattr *attr) 323 * 2.3.6. sattr
148{ 324 *
149 const __be32 not_set = __constant_htonl(0xFFFFFFFF); 325 * struct sattr {
326 * unsigned int mode;
327 * unsigned int uid;
328 * unsigned int gid;
329 * unsigned int size;
330 * timeval atime;
331 * timeval mtime;
332 * };
333 */
150 334
151 *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; 335#define NFS2_SATTR_NOT_SET (0xffffffff)
152 *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; 336
153 *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set; 337static __be32 *xdr_time_not_set(__be32 *p)
154 *p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set; 338{
339 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
340 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
341 return p;
342}
155 343
156 if (attr->ia_valid & ATTR_ATIME_SET) { 344static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
345{
346 __be32 *p;
347
348 p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
349
350 if (attr->ia_valid & ATTR_MODE)
351 *p++ = cpu_to_be32(attr->ia_mode);
352 else
353 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
354 if (attr->ia_valid & ATTR_UID)
355 *p++ = cpu_to_be32(attr->ia_uid);
356 else
357 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
358 if (attr->ia_valid & ATTR_GID)
359 *p++ = cpu_to_be32(attr->ia_gid);
360 else
361 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
362 if (attr->ia_valid & ATTR_SIZE)
363 *p++ = cpu_to_be32((u32)attr->ia_size);
364 else
365 *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
366
367 if (attr->ia_valid & ATTR_ATIME_SET)
157 p = xdr_encode_time(p, &attr->ia_atime); 368 p = xdr_encode_time(p, &attr->ia_atime);
158 } else if (attr->ia_valid & ATTR_ATIME) { 369 else if (attr->ia_valid & ATTR_ATIME)
159 p = xdr_encode_current_server_time(p, &attr->ia_atime); 370 p = xdr_encode_current_server_time(p, &attr->ia_atime);
160 } else { 371 else
161 *p++ = not_set; 372 p = xdr_time_not_set(p);
162 *p++ = not_set; 373 if (attr->ia_valid & ATTR_MTIME_SET)
163 } 374 xdr_encode_time(p, &attr->ia_mtime);
164 375 else if (attr->ia_valid & ATTR_MTIME)
165 if (attr->ia_valid & ATTR_MTIME_SET) { 376 xdr_encode_current_server_time(p, &attr->ia_mtime);
166 p = xdr_encode_time(p, &attr->ia_mtime); 377 else
167 } else if (attr->ia_valid & ATTR_MTIME) { 378 xdr_time_not_set(p);
168 p = xdr_encode_current_server_time(p, &attr->ia_mtime);
169 } else {
170 *p++ = not_set;
171 *p++ = not_set;
172 }
173 return p;
174} 379}
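
A short usage sketch of the sentinel convention (the caller is hypothetical): a pure chmod leaves every other field untouched by sending NFS2_SATTR_NOT_SET in its place.

	static void example_encode_chmod(struct xdr_stream *xdr)
	{
		struct iattr attr = {
			.ia_valid = ATTR_MODE,
			.ia_mode  = S_IRUSR | S_IWUSR,
		};

		/* uid, gid, size and both timestamps all go out as
		 * 0xffffffff, which the server must ignore */
		encode_sattr(xdr, &attr);
	}
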
175 380
176/* 381/*
177 * NFS encode functions 382 * 2.3.7. filename
178 */ 383 *
179/* 384 * typedef string filename<MAXNAMLEN>;
180 * Encode file handle argument
181 * GETATTR, READLINK, STATFS
182 */ 385 */
183static int 386static void encode_filename(struct xdr_stream *xdr,
184nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) 387 const char *name, u32 length)
185{ 388{
186 p = xdr_encode_fhandle(p, fh); 389 __be32 *p;
187 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 390
391 BUG_ON(length > NFS2_MAXNAMLEN);
392 p = xdr_reserve_space(xdr, 4 + length);
393 xdr_encode_opaque(p, name, length);
394}
395
396static int decode_filename_inline(struct xdr_stream *xdr,
397 const char **name, u32 *length)
398{
399 __be32 *p;
400 u32 count;
401
402 p = xdr_inline_decode(xdr, 4);
403 if (unlikely(p == NULL))
404 goto out_overflow;
405 count = be32_to_cpup(p);
406 if (count > NFS3_MAXNAMLEN)
407 goto out_nametoolong;
408 p = xdr_inline_decode(xdr, count);
409 if (unlikely(p == NULL))
410 goto out_overflow;
411 *name = (const char *)p;
412 *length = count;
188 return 0; 413 return 0;
414out_nametoolong:
415 dprintk("NFS: returned filename too long: %u\n", count);
416 return -ENAMETOOLONG;
417out_overflow:
418 print_overflow_msg(__func__, xdr);
419 return -EIO;
189} 420}
190 421
191/* 422/*
192 * Encode SETATTR arguments 423 * 2.3.8. path
424 *
425 * typedef string path<MAXPATHLEN>;
193 */ 426 */
194static int 427static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
195nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args) 428{
429 __be32 *p;
430
431 BUG_ON(length > NFS2_MAXPATHLEN);
432 p = xdr_reserve_space(xdr, 4);
433 *p = cpu_to_be32(length);
434 xdr_write_pages(xdr, pages, 0, length);
435}
436
437static int decode_path(struct xdr_stream *xdr)
196{ 438{
197 p = xdr_encode_fhandle(p, args->fh); 439 u32 length, recvd;
198 p = xdr_encode_sattr(p, args->sattr); 440 size_t hdrlen;
199 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 441 __be32 *p;
442
443 p = xdr_inline_decode(xdr, 4);
444 if (unlikely(p == NULL))
445 goto out_overflow;
446 length = be32_to_cpup(p);
447 if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
448 goto out_size;
449 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
450 recvd = xdr->buf->len - hdrlen;
451 if (unlikely(length > recvd))
452 goto out_cheating;
453
454 xdr_read_pages(xdr, length);
455 xdr_terminate_string(xdr->buf, length);
200 return 0; 456 return 0;
457out_size:
458 dprintk("NFS: returned pathname too long: %u\n", length);
459 return -ENAMETOOLONG;
460out_cheating:
461 dprintk("NFS: server cheating in pathname result: "
462 "length %u > received %u\n", length, recvd);
463 return -EIO;
464out_overflow:
465 print_overflow_msg(__func__, xdr);
466 return -EIO;
201} 467}
202 468
203/* 469/*
204 * Encode directory ops argument 470 * 2.3.9. attrstat
205 * LOOKUP, RMDIR 471 *
472 * union attrstat switch (stat status) {
473 * case NFS_OK:
474 * fattr attributes;
475 * default:
476 * void;
477 * };
206 */ 478 */
207static int 479static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
208nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
209{ 480{
210 p = xdr_encode_fhandle(p, args->fh); 481 enum nfs_stat status;
211 p = xdr_encode_array(p, args->name, args->len); 482 int error;
212 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 483
213 return 0; 484 error = decode_stat(xdr, &status);
485 if (unlikely(error))
486 goto out;
487 if (status != NFS_OK)
488 goto out_default;
489 error = decode_fattr(xdr, result);
490out:
491 return error;
492out_default:
493 return nfs_stat_to_errno(status);
214} 494}
215 495
216/* 496/*
217 * Encode REMOVE argument 497 * 2.3.10. diropargs
498 *
499 * struct diropargs {
500 * fhandle dir;
501 * filename name;
502 * };
218 */ 503 */
219static int 504static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
220nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) 505 const char *name, u32 length)
221{ 506{
222 p = xdr_encode_fhandle(p, args->fh); 507 encode_fhandle(xdr, fh);
223 p = xdr_encode_array(p, args->name.name, args->name.len); 508 encode_filename(xdr, name, length);
224 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
225 return 0;
226} 509}
227 510
228/* 511/*
229 * Arguments to a READ call. Since we read data directly into the page 512 * 2.3.11. diropres
230 * cache, we also set up the reply iovec here so that iov[1] points 513 *
231 * exactly to the page we want to fetch. 514 * union diropres switch (stat status) {
515 * case NFS_OK:
516 * struct {
517 * fhandle file;
518 * fattr attributes;
519 * } diropok;
520 * default:
521 * void;
522 * };
232 */ 523 */
233static int 524static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
235{ 525{
236 struct rpc_auth *auth = req->rq_cred->cr_auth; 526 int error;
237 unsigned int replen; 527
238 u32 offset = (u32)args->offset; 528 error = decode_fhandle(xdr, result->fh);
239 u32 count = args->count; 529 if (unlikely(error))
240 530 goto out;
241 p = xdr_encode_fhandle(p, args->fh); 531 error = decode_fattr(xdr, result->fattr);
242 *p++ = htonl(offset); 532out:
243 *p++ = htonl(count); 533 return error;
244 *p++ = htonl(count); 534}
245 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
246 535
247 /* Inline the page array */ 536static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
248 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; 537{
249 xdr_inline_pages(&req->rq_rcv_buf, replen, 538 enum nfs_stat status;
250 args->pages, args->pgbase, count); 539 int error;
251 req->rq_rcv_buf.flags |= XDRBUF_READ; 540
252 return 0; 541 error = decode_stat(xdr, &status);
542 if (unlikely(error))
543 goto out;
544 if (status != NFS_OK)
545 goto out_default;
546 error = decode_diropok(xdr, result);
547out:
548 return error;
549out_default:
550 return nfs_stat_to_errno(status);
253} 551}
254 552
553
255/* 554/*
256 * Decode READ reply 555 * NFSv2 XDR encode functions
556 *
557 * NFSv2 argument types are defined in section 2.2 of RFC 1094:
558 * "NFS: Network File System Protocol Specification".
257 */ 559 */
258static int
259nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
260{
261 struct kvec *iov = req->rq_rcv_buf.head;
262 size_t hdrlen;
263 u32 count, recvd;
264 int status;
265
266 if ((status = ntohl(*p++)))
267 return nfs_stat_to_errno(status);
268 p = xdr_decode_fattr(p, res->fattr);
269
270 count = ntohl(*p++);
271 res->eof = 0;
272 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
273 if (iov->iov_len < hdrlen) {
274 dprintk("NFS: READ reply header overflowed:"
275 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
276 return -errno_NFSERR_IO;
277 } else if (iov->iov_len != hdrlen) {
278 dprintk("NFS: READ header is short. iovec will be shifted.\n");
279 xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen);
280 }
281 560
282 recvd = req->rq_rcv_buf.len - hdrlen; 561static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
283 if (count > recvd) { 562 struct xdr_stream *xdr,
284 dprintk("NFS: server cheating in read reply: " 563 const struct nfs_fh *fh)
285 "count %u > recvd %u\n", count, recvd); 564{
286 count = recvd; 565 encode_fhandle(xdr, fh);
287 } 566}
288 567
289 dprintk("RPC: readres OK count %u\n", count); 568/*
290 if (count < res->count) 569 * 2.2.3. sattrargs
291 res->count = count; 570 *
571 * struct sattrargs {
572 * fhandle file;
573 * sattr attributes;
574 * };
575 */
576static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
577 struct xdr_stream *xdr,
578 const struct nfs_sattrargs *args)
579{
580 encode_fhandle(xdr, args->fh);
581 encode_sattr(xdr, args->sattr);
582}
292 583
293 return count; 584static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
585 struct xdr_stream *xdr,
586 const struct nfs_diropargs *args)
587{
588 encode_diropargs(xdr, args->fh, args->name, args->len);
294} 589}
295 590
591static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
592 struct xdr_stream *xdr,
593 const struct nfs_readlinkargs *args)
594{
595 encode_fhandle(xdr, args->fh);
596 prepare_reply_buffer(req, args->pages, args->pgbase,
597 args->pglen, NFS_readlinkres_sz);
598}
296 599
297/* 600/*
298 * Write arguments. Splice the buffer to be written into the iovec. 601 * 2.2.7. readargs
602 *
603 * struct readargs {
604 * fhandle file;
605 * unsigned offset;
606 * unsigned count;
607 * unsigned totalcount;
608 * };
299 */ 609 */
300static int 610static void encode_readargs(struct xdr_stream *xdr,
301nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 611 const struct nfs_readargs *args)
302{ 612{
303 struct xdr_buf *sndbuf = &req->rq_snd_buf; 613 u32 offset = args->offset;
304 u32 offset = (u32)args->offset;
305 u32 count = args->count; 614 u32 count = args->count;
615 __be32 *p;
306 616
307 p = xdr_encode_fhandle(p, args->fh); 617 encode_fhandle(xdr, args->fh);
308 *p++ = htonl(offset);
309 *p++ = htonl(offset);
310 *p++ = htonl(count);
311 *p++ = htonl(count);
312 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
313 618
314 /* Copy the page array */ 619 p = xdr_reserve_space(xdr, 4 + 4 + 4);
315 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); 620 *p++ = cpu_to_be32(offset);
316 sndbuf->flags |= XDRBUF_WRITE; 621 *p++ = cpu_to_be32(count);
317 return 0; 622 *p = cpu_to_be32(count);
318} 623}
319 624
320/* 625static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
321 * Encode create arguments 626 struct xdr_stream *xdr,
322 * CREATE, MKDIR 627 const struct nfs_readargs *args)
323 */
324static int
325nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
326{ 628{
327 p = xdr_encode_fhandle(p, args->fh); 629 encode_readargs(xdr, args);
328 p = xdr_encode_array(p, args->name, args->len); 630 prepare_reply_buffer(req, args->pages, args->pgbase,
329 p = xdr_encode_sattr(p, args->sattr); 631 args->count, NFS_readres_sz);
330 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 632 req->rq_rcv_buf.flags |= XDRBUF_READ;
331 return 0;
332} 633}
333 634
334/* 635/*
335 * Encode RENAME arguments 636 * 2.2.9. writeargs
637 *
638 * struct writeargs {
639 * fhandle file;
640 * unsigned beginoffset;
641 * unsigned offset;
642 * unsigned totalcount;
643 * nfsdata data;
644 * };
336 */ 645 */
337static int 646static void encode_writeargs(struct xdr_stream *xdr,
338nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) 647 const struct nfs_writeargs *args)
339{ 648{
340 p = xdr_encode_fhandle(p, args->fromfh); 649 u32 offset = args->offset;
341 p = xdr_encode_array(p, args->fromname, args->fromlen); 650 u32 count = args->count;
342 p = xdr_encode_fhandle(p, args->tofh); 651 __be32 *p;
343 p = xdr_encode_array(p, args->toname, args->tolen); 652
344 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 653 encode_fhandle(xdr, args->fh);
345 return 0; 654
655 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
656 *p++ = cpu_to_be32(offset);
657 *p++ = cpu_to_be32(offset);
658 *p++ = cpu_to_be32(count);
659
660 /* nfsdata */
661 *p = cpu_to_be32(count);
662 xdr_write_pages(xdr, args->pages, args->pgbase, count);
346} 663}
347 664
348/* 665static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
349 * Encode LINK arguments 666 struct xdr_stream *xdr,
350 */ 667 const struct nfs_writeargs *args)
351static int
352nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
353{ 668{
354 p = xdr_encode_fhandle(p, args->fromfh); 669 encode_writeargs(xdr, args);
355 p = xdr_encode_fhandle(p, args->tofh); 670 xdr->buf->flags |= XDRBUF_WRITE;
356 p = xdr_encode_array(p, args->toname, args->tolen);
357 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
358 return 0;
359} 671}
360 672
361/* 673/*
362 * Encode SYMLINK arguments 674 * 2.2.10. createargs
675 *
676 * struct createargs {
677 * diropargs where;
678 * sattr attributes;
679 * };
363 */ 680 */
364static int 681static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
365nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args) 682 struct xdr_stream *xdr,
683 const struct nfs_createargs *args)
366{ 684{
367 struct xdr_buf *sndbuf = &req->rq_snd_buf; 685 encode_diropargs(xdr, args->fh, args->name, args->len);
368 size_t pad; 686 encode_sattr(xdr, args->sattr);
687}
369 688
370 p = xdr_encode_fhandle(p, args->fromfh); 689static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
371 p = xdr_encode_array(p, args->fromname, args->fromlen); 690 struct xdr_stream *xdr,
372 *p++ = htonl(args->pathlen); 691 const struct nfs_removeargs *args)
373 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); 692{
693 encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
694}
374 695
375 xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen); 696/*
697 * 2.2.12. renameargs
698 *
699 * struct renameargs {
700 * diropargs from;
701 * diropargs to;
702 * };
703 */
704static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
705 struct xdr_stream *xdr,
706 const struct nfs_renameargs *args)
707{
708 const struct qstr *old = args->old_name;
709 const struct qstr *new = args->new_name;
376 710
377 /* 711 encode_diropargs(xdr, args->old_dir, old->name, old->len);
378 * xdr_encode_pages may have added a few bytes to ensure the 712 encode_diropargs(xdr, args->new_dir, new->name, new->len);
379 * pathname ends on a 4-byte boundary. Start encoding the
380 * attributes after the pad bytes.
381 */
382 pad = sndbuf->tail->iov_len;
383 if (pad > 0)
384 p++;
385 p = xdr_encode_sattr(p, args->sattr);
386 sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
387 return 0;
388} 713}
389 714
390/* 715/*
391 * Encode arguments to readdir call 716 * 2.2.13. linkargs
717 *
718 * struct linkargs {
719 * fhandle from;
720 * diropargs to;
721 * };
392 */ 722 */
393static int 723static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) 724 struct xdr_stream *xdr,
725 const struct nfs_linkargs *args)
395{ 726{
396 struct rpc_auth *auth = req->rq_cred->cr_auth; 727 encode_fhandle(xdr, args->fromfh);
397 unsigned int replen; 728 encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
398 u32 count = args->count; 729}
399
400 p = xdr_encode_fhandle(p, args->fh);
401 *p++ = htonl(args->cookie);
402 *p++ = htonl(count); /* see above */
403 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
404 730
405 /* Inline the page array */ 731/*
406 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2; 732 * 2.2.14. symlinkargs
407 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); 733 *
408 return 0; 734 * struct symlinkargs {
735 * diropargs from;
736 * path to;
737 * sattr attributes;
738 * };
739 */
740static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
741 struct xdr_stream *xdr,
742 const struct nfs_symlinkargs *args)
743{
744 encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
745 encode_path(xdr, args->pages, args->pathlen);
746 encode_sattr(xdr, args->sattr);
409} 747}
410 748
411/* 749/*
412 * Decode the result of a readdir call. 750 * 2.2.17. readdirargs
413 * We're not really decoding anymore, we just leave the buffer untouched 751 *
414 * and only check that it is syntactically correct. 752 * struct readdirargs {
415 * The real decoding happens in nfs_decode_entry below, called directly 753 * fhandle dir;
416 * from nfs_readdir for each entry. 754 * nfscookie cookie;
755 * unsigned count;
756 * };
417 */ 757 */
418static int 758static void encode_readdirargs(struct xdr_stream *xdr,
419nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) 759 const struct nfs_readdirargs *args)
420{ 760{
421 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 761 __be32 *p;
422 struct kvec *iov = rcvbuf->head;
423 struct page **page;
424 size_t hdrlen;
425 unsigned int pglen, recvd;
426 u32 len;
427 int status, nr = 0;
428 __be32 *end, *entry, *kaddr;
429
430 if ((status = ntohl(*p++)))
431 return nfs_stat_to_errno(status);
432
433 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
434 if (iov->iov_len < hdrlen) {
435 dprintk("NFS: READDIR reply header overflowed:"
436 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
437 return -errno_NFSERR_IO;
438 } else if (iov->iov_len != hdrlen) {
439 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
440 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
441 }
442 762
443 pglen = rcvbuf->page_len; 763 encode_fhandle(xdr, args->fh);
444 recvd = rcvbuf->len - hdrlen;
445 if (pglen > recvd)
446 pglen = recvd;
447 page = rcvbuf->pages;
448 kaddr = p = kmap_atomic(*page, KM_USER0);
449 end = (__be32 *)((char *)p + pglen);
450 entry = p;
451
452 /* Make sure the packet actually has a value_follows and EOF entry */
453 if ((entry + 1) > end)
454 goto short_pkt;
455
456 for (; *p++; nr++) {
457 if (p + 2 > end)
458 goto short_pkt;
459 p++; /* fileid */
460 len = ntohl(*p++);
461 p += XDR_QUADLEN(len) + 1; /* name plus cookie */
462 if (len > NFS2_MAXNAMLEN) {
463 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
464 len);
465 goto err_unmap;
466 }
467 if (p + 2 > end)
468 goto short_pkt;
469 entry = p;
470 }
471 764
472 /* 765 p = xdr_reserve_space(xdr, 4 + 4);
473 * Apparently some server sends responses that are a valid size, but 766 *p++ = cpu_to_be32(args->cookie);
474 * contain no entries, and have value_follows==0 and EOF==0. For 767 *p = cpu_to_be32(args->count);
475 * those, just set the EOF marker.
476 */
477 if (!nr && entry[1] == 0) {
478 dprintk("NFS: readdir reply truncated!\n");
479 entry[1] = 1;
480 }
481 out:
482 kunmap_atomic(kaddr, KM_USER0);
483 return nr;
484 short_pkt:
485 /*
486 * When we get a short packet there are 2 possibilities. We can
487 * return an error, or fix up the response to look like a valid
488 * response and return what we have so far. If there are no
489 * entries and the packet was short, then return -EIO. If there
490 * are valid entries in the response, return them and pretend that
491 * the call was successful, but incomplete. The caller can retry the
492 * readdir starting at the last cookie.
493 */
494 entry[0] = entry[1] = 0;
495 if (!nr)
496 nr = -errno_NFSERR_IO;
497 goto out;
498err_unmap:
499 nr = -errno_NFSERR_IO;
500 goto out;
501} 768}
502 769
503__be32 * 770static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
504nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 771 struct xdr_stream *xdr,
772 const struct nfs_readdirargs *args)
505{ 773{
506 if (!*p++) { 774 encode_readdirargs(xdr, args);
507 if (!*p) 775 prepare_reply_buffer(req, args->pages, 0,
508 return ERR_PTR(-EAGAIN); 776 args->count, NFS_readdirres_sz);
509 entry->eof = 1;
510 return ERR_PTR(-EBADCOOKIE);
511 }
512
513 entry->ino = ntohl(*p++);
514 entry->len = ntohl(*p++);
515 entry->name = (const char *) p;
516 p += XDR_QUADLEN(entry->len);
517 entry->prev_cookie = entry->cookie;
518 entry->cookie = ntohl(*p++);
519 entry->eof = !p[0] && p[1];
520
521 return p;
522} 777}
523 778
524/* 779/*
525 * NFS XDR decode functions 780 * NFSv2 XDR decode functions
526 */ 781 *
527/* 782 * NFSv2 result types are defined in section 2.2 of RFC 1094:
528 * Decode simple status reply 783 * "NFS: Network File System Protocol Specification".
529 */ 784 */
530static int 785
531nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy) 786static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
787 void *__unused)
532{ 788{
533 int status; 789 enum nfs_stat status;
790 int error;
791
792 error = decode_stat(xdr, &status);
793 if (unlikely(error))
794 goto out;
795 if (status != NFS_OK)
796 goto out_default;
797out:
798 return error;
799out_default:
800 return nfs_stat_to_errno(status);
801}
534 802
535 if ((status = ntohl(*p++)) != 0) 803static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
536 status = nfs_stat_to_errno(status); 804 struct nfs_fattr *result)
537 return status; 805{
806 return decode_attrstat(xdr, result);
807}
808
809static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
810 struct nfs_diropok *result)
811{
812 return decode_diropres(xdr, result);
538} 813}
539 814
540/* 815/*
541 * Decode attrstat reply 816 * 2.2.6. readlinkres
542 * GETATTR, SETATTR, WRITE 817 *
818 * union readlinkres switch (stat status) {
819 * case NFS_OK:
820 * path data;
821 * default:
822 * void;
823 * };
543 */ 824 */
544static int 825static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
545nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 826 struct xdr_stream *xdr, void *__unused)
546{ 827{
547 int status; 828 enum nfs_stat status;
548 829 int error;
549 if ((status = ntohl(*p++))) 830
550 return nfs_stat_to_errno(status); 831 error = decode_stat(xdr, &status);
551 xdr_decode_fattr(p, fattr); 832 if (unlikely(error))
552 return 0; 833 goto out;
834 if (status != NFS_OK)
835 goto out_default;
836 error = decode_path(xdr);
837out:
838 return error;
839out_default:
840 return nfs_stat_to_errno(status);
553} 841}
554 842
555/* 843/*
556 * Decode diropres reply 844 * 2.2.7. readres
557 * LOOKUP, CREATE, MKDIR 845 *
846 * union readres switch (stat status) {
847 * case NFS_OK:
848 * fattr attributes;
849 * nfsdata data;
850 * default:
851 * void;
852 * };
558 */ 853 */
559static int 854static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
560nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) 855 struct nfs_readres *result)
561{ 856{
562 int status; 857 enum nfs_stat status;
858 int error;
859
860 error = decode_stat(xdr, &status);
861 if (unlikely(error))
862 goto out;
863 if (status != NFS_OK)
864 goto out_default;
865 error = decode_fattr(xdr, result->fattr);
866 if (unlikely(error))
867 goto out;
868 error = decode_nfsdata(xdr, result);
869out:
870 return error;
871out_default:
872 return nfs_stat_to_errno(status);
873}
563 874
564 if ((status = ntohl(*p++))) 875static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
565 return nfs_stat_to_errno(status); 876 struct nfs_writeres *result)
566 p = xdr_decode_fhandle(p, res->fh); 877{
567 xdr_decode_fattr(p, res->fattr); 878 /* All NFSv2 writes are "file sync" writes */
568 return 0; 879 result->verf->committed = NFS_FILE_SYNC;
880 return decode_attrstat(xdr, result->fattr);
569} 881}
570 882
571/* 883/**
572 * Encode READLINK args 884 * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
885 * the local page cache.
886 * @xdr: XDR stream where entry resides
887 * @entry: buffer to fill in with entry data
888 * @plus: boolean indicating whether this should be a readdirplus entry
889 *
890 * Returns zero if successful, otherwise a negative errno value is
891 * returned.
892 *
893 * This function is not invoked during READDIR reply decoding, but
894 * rather whenever an application invokes the getdents(2) system call
895 * on a directory already in our cache.
896 *
897 * 2.2.17. entry
898 *
899 * struct entry {
900 * unsigned fileid;
901 * filename name;
902 * nfscookie cookie;
903 * entry *nextentry;
904 * };
573 */ 905 */
574static int 906int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
575nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) 907 int plus)
576{ 908{
577 struct rpc_auth *auth = req->rq_cred->cr_auth; 909 __be32 *p;
578 unsigned int replen; 910 int error;
911
912 p = xdr_inline_decode(xdr, 4);
913 if (unlikely(p == NULL))
914 goto out_overflow;
915 if (*p++ == xdr_zero) {
916 p = xdr_inline_decode(xdr, 4);
917 if (unlikely(p == NULL))
918 goto out_overflow;
919 if (*p++ == xdr_zero)
920 return -EAGAIN;
921 entry->eof = 1;
922 return -EBADCOOKIE;
923 }
924
925 p = xdr_inline_decode(xdr, 4);
926 if (unlikely(p == NULL))
927 goto out_overflow;
928 entry->ino = be32_to_cpup(p);
929
930 error = decode_filename_inline(xdr, &entry->name, &entry->len);
931 if (unlikely(error))
932 return error;
933
934 /*
935 * The type (size and byte order) of nfscookie isn't defined in
936 * RFC 1094. This implementation assumes that it's an XDR uint32.
937 */
938 entry->prev_cookie = entry->cookie;
939 p = xdr_inline_decode(xdr, 4);
940 if (unlikely(p == NULL))
941 goto out_overflow;
942 entry->cookie = be32_to_cpup(p);
579 943
580 p = xdr_encode_fhandle(p, args->fh); 944 entry->d_type = DT_UNKNOWN;
581 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
582 945
583 /* Inline the page array */
584 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2;
585 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen);
586 return 0; 946 return 0;
947
948out_overflow:
949 print_overflow_msg(__func__, xdr);
950 return -EAGAIN;
587} 951}
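
A minimal sketch of how a caller drains one cached directory page with the decoder above (the loop and helper are illustrative, not the actual nfs_readdir() plumbing):

	static int example_drain_entries(struct xdr_stream *xdr,
					 struct nfs_entry *entry)
	{
		int error;

		for (;;) {
			error = nfs2_decode_dirent(xdr, entry, 0);
			if (error == -EAGAIN)
				return 0;	/* page exhausted, not EOF */
			if (error == -EBADCOOKIE && entry->eof)
				return 0;	/* true end of directory */
			if (error)
				return error;
			/* consume entry->ino / entry->name here */
		}
	}
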
588 952
589/* 953/*
590 * Decode READLINK reply 954 * 2.2.17. readdirres
955 *
956 * union readdirres switch (stat status) {
957 * case NFS_OK:
958 * struct {
959 * entry *entries;
960 * bool eof;
961 * } readdirok;
962 * default:
963 * void;
964 * };
965 *
966 * Read the directory contents into the page cache, but don't
967 * touch them. The actual decoding is done by nfs2_decode_dirent()
968 * during subsequent nfs_readdir() calls.
591 */ 969 */
592static int 970static int decode_readdirok(struct xdr_stream *xdr)
593nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
594{ 971{
595 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 972 u32 recvd, pglen;
596 struct kvec *iov = rcvbuf->head;
597 size_t hdrlen; 973 size_t hdrlen;
598 u32 len, recvd;
599 char *kaddr;
600 int status;
601
602 if ((status = ntohl(*p++)))
603 return nfs_stat_to_errno(status);
604 /* Convert length of symlink */
605 len = ntohl(*p++);
606 if (len >= rcvbuf->page_len) {
607 dprintk("nfs: server returned giant symlink!\n");
608 return -ENAMETOOLONG;
609 }
610 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
611 if (iov->iov_len < hdrlen) {
612 dprintk("NFS: READLINK reply header overflowed:"
613 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
614 return -errno_NFSERR_IO;
615 } else if (iov->iov_len != hdrlen) {
616 dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
617 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
618 }
619 recvd = req->rq_rcv_buf.len - hdrlen;
620 if (recvd < len) {
621 dprintk("NFS: server cheating in readlink reply: "
622 "count %u > recvd %u\n", len, recvd);
623 return -EIO;
624 }
625 974
626 /* NULL terminate the string we got */ 975 pglen = xdr->buf->page_len;
627 kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); 976 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
628 kaddr[len+rcvbuf->page_base] = '\0'; 977 recvd = xdr->buf->len - hdrlen;
629 kunmap_atomic(kaddr, KM_USER0); 978 if (unlikely(pglen > recvd))
630 return 0; 979 goto out_cheating;
980out:
981 xdr_read_pages(xdr, pglen);
982 return pglen;
983out_cheating:
984 dprintk("NFS: server cheating in readdir result: "
985 "pglen %u > recvd %u\n", pglen, recvd);
986 pglen = recvd;
987 goto out;
631} 988}
632 989
633/* 990static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
634 * Decode WRITE reply 991 struct xdr_stream *xdr, void *__unused)
635 */
636static int
637nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
638{ 992{
639 res->verf->committed = NFS_FILE_SYNC; 993 enum nfs_stat status;
640 return nfs_xdr_attrstat(req, p, res->fattr); 994 int error;
995
996 error = decode_stat(xdr, &status);
997 if (unlikely(error))
998 goto out;
999 if (status != NFS_OK)
1000 goto out_default;
1001 error = decode_readdirok(xdr);
1002out:
1003 return error;
1004out_default:
1005 return nfs_stat_to_errno(status);
641} 1006}
642 1007
643/* 1008/*
644 * Decode STATFS reply 1009 * 2.2.18. statfsres
1010 *
1011 * union statfsres (stat status) {
1012 * case NFS_OK:
1013 * struct {
1014 * unsigned tsize;
1015 * unsigned bsize;
1016 * unsigned blocks;
1017 * unsigned bfree;
1018 * unsigned bavail;
1019 * } info;
1020 * default:
1021 * void;
1022 * };
645 */ 1023 */
646static int 1024static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
647nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
648{ 1025{
649 int status; 1026 __be32 *p;
650 1027
651 if ((status = ntohl(*p++))) 1028 p = xdr_inline_decode(xdr, NFS_info_sz << 2);
652 return nfs_stat_to_errno(status); 1029 if (unlikely(p == NULL))
653 1030 goto out_overflow;
654 res->tsize = ntohl(*p++); 1031 result->tsize = be32_to_cpup(p++);
655 res->bsize = ntohl(*p++); 1032 result->bsize = be32_to_cpup(p++);
656 res->blocks = ntohl(*p++); 1033 result->blocks = be32_to_cpup(p++);
657 res->bfree = ntohl(*p++); 1034 result->bfree = be32_to_cpup(p++);
658 res->bavail = ntohl(*p++); 1035 result->bavail = be32_to_cpup(p);
659 return 0; 1036 return 0;
1037out_overflow:
1038 print_overflow_msg(__func__, xdr);
1039 return -EIO;
1040}
1041
1042static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
1043 struct nfs2_fsstat *result)
1044{
1045 enum nfs_stat status;
1046 int error;
1047
1048 error = decode_stat(xdr, &status);
1049 if (unlikely(error))
1050 goto out;
1051 if (status != NFS_OK)
1052 goto out_default;
1053 error = decode_info(xdr, result);
1054out:
1055 return error;
1056out_default:
1057 return nfs_stat_to_errno(status);
660} 1058}
661 1059
1060
662/* 1061/*
663 * We need to translate between nfs status return values and 1062 * We need to translate between nfs status return values and
664 * the local errno values which may not be the same. 1063 * the local errno values which may not be the same.
665 */ 1064 */
666static struct { 1065static const struct {
667 int stat; 1066 int stat;
668 int errno; 1067 int errno;
669} nfs_errtbl[] = { 1068} nfs_errtbl[] = {
@@ -703,28 +1102,30 @@ static struct {
703 { -1, -EIO } 1102 { -1, -EIO }
704}; 1103};
705 1104
706/* 1105/**
707 * Convert an NFS error code to a local one. 1106 * nfs_stat_to_errno - convert an NFS status code to a local errno
708 * This one is used jointly by NFSv2 and NFSv3. 1107 * @status: NFS status code to convert
1108 *
1109 * Returns a local errno value, or -EIO if the NFS status code is
1110 * not recognized. This function is used jointly by NFSv2 and NFSv3.
709 */ 1111 */
710int 1112int nfs_stat_to_errno(enum nfs_stat status)
711nfs_stat_to_errno(int stat)
712{ 1113{
713 int i; 1114 int i;
714 1115
715 for (i = 0; nfs_errtbl[i].stat != -1; i++) { 1116 for (i = 0; nfs_errtbl[i].stat != -1; i++) {
716 if (nfs_errtbl[i].stat == stat) 1117 if (nfs_errtbl[i].stat == (int)status)
717 return nfs_errtbl[i].errno; 1118 return nfs_errtbl[i].errno;
718 } 1119 }
719 dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); 1120 dprintk("NFS: Unrecognized nfs status value: %u\n", status);
720 return nfs_errtbl[i].errno; 1121 return nfs_errtbl[i].errno;
721} 1122}
722 1123
723#define PROC(proc, argtype, restype, timer) \ 1124#define PROC(proc, argtype, restype, timer) \
724[NFSPROC_##proc] = { \ 1125[NFSPROC_##proc] = { \
725 .p_proc = NFSPROC_##proc, \ 1126 .p_proc = NFSPROC_##proc, \
726 .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ 1127 .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \
727 .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ 1128 .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \
728 .p_arglen = NFS_##argtype##_sz, \ 1129 .p_arglen = NFS_##argtype##_sz, \
729 .p_replen = NFS_##restype##_sz, \ 1130 .p_replen = NFS_##restype##_sz, \
730 .p_timer = timer, \ 1131 .p_timer = timer, \
@@ -732,21 +1133,21 @@ nfs_stat_to_errno(int stat)
732 .p_name = #proc, \ 1133 .p_name = #proc, \
733 } 1134 }
734struct rpc_procinfo nfs_procedures[] = { 1135struct rpc_procinfo nfs_procedures[] = {
735 PROC(GETATTR, fhandle, attrstat, 1), 1136 PROC(GETATTR, fhandle, attrstat, 1),
736 PROC(SETATTR, sattrargs, attrstat, 0), 1137 PROC(SETATTR, sattrargs, attrstat, 0),
737 PROC(LOOKUP, diropargs, diropres, 2), 1138 PROC(LOOKUP, diropargs, diropres, 2),
738 PROC(READLINK, readlinkargs, readlinkres, 3), 1139 PROC(READLINK, readlinkargs, readlinkres, 3),
739 PROC(READ, readargs, readres, 3), 1140 PROC(READ, readargs, readres, 3),
740 PROC(WRITE, writeargs, writeres, 4), 1141 PROC(WRITE, writeargs, writeres, 4),
741 PROC(CREATE, createargs, diropres, 0), 1142 PROC(CREATE, createargs, diropres, 0),
742 PROC(REMOVE, removeargs, stat, 0), 1143 PROC(REMOVE, removeargs, stat, 0),
743 PROC(RENAME, renameargs, stat, 0), 1144 PROC(RENAME, renameargs, stat, 0),
744 PROC(LINK, linkargs, stat, 0), 1145 PROC(LINK, linkargs, stat, 0),
745 PROC(SYMLINK, symlinkargs, stat, 0), 1146 PROC(SYMLINK, symlinkargs, stat, 0),
746 PROC(MKDIR, createargs, diropres, 0), 1147 PROC(MKDIR, createargs, diropres, 0),
747 PROC(RMDIR, diropargs, stat, 0), 1148 PROC(RMDIR, diropargs, stat, 0),
748 PROC(READDIR, readdirargs, readdirres, 3), 1149 PROC(READDIR, readdirargs, readdirres, 3),
749 PROC(STATFS, fhandle, statfsres, 0), 1150 PROC(STATFS, fhandle, statfsres, 0),
750}; 1151};
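
A worked expansion of one table row under the PROC() macro above (fields hidden by the hunk context are elided):

	[NFSPROC_GETATTR] = {
		.p_proc   = NFSPROC_GETATTR,
		.p_encode = (kxdreproc_t)nfs2_xdr_enc_fhandle,
		.p_decode = (kxdrdproc_t)nfs2_xdr_dec_attrstat,
		.p_arglen = NFS_fhandle_sz,
		.p_replen = NFS_attrstat_sz,
		.p_timer  = 1,
		/* ... */
		.p_name   = "GETATTR",
	},
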
751 1152
752struct rpc_version nfs_version2 = { 1153struct rpc_version nfs_version2 = {
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f4c7e2..274342771655 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
311 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 311 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
312 goto out; 312 goto out;
313 313
314 /* We are doing this here, because XDR marshalling can only 314 /* We are doing this here because XDR marshalling does not
315 return -ENOMEM. */ 315 * return any results, it BUGs. */
316 status = -ENOSPC; 316 status = -ENOSPC;
317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) 317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
318 goto out; 318 goto out;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fabb4f2849a1..ce939c062a52 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
313 */ 313 */
314static int 314static int
315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
316 int flags, struct nameidata *nd) 316 int flags, struct nfs_open_context *ctx)
317{ 317{
318 struct nfs3_createdata *data; 318 struct nfs3_createdata *data;
319 mode_t mode = sattr->ia_mode; 319 mode_t mode = sattr->ia_mode;
@@ -438,19 +438,38 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
438 return 1; 438 return 1;
439} 439}
440 440
441static void
442nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
443{
444 msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
445}
446
447static int
448nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
449 struct inode *new_dir)
450{
451 struct nfs_renameres *res;
452
453 if (nfs3_async_handle_jukebox(task, old_dir))
454 return 0;
455 res = task->tk_msg.rpc_resp;
456
457 nfs_post_op_update_inode(old_dir, res->old_fattr);
458 nfs_post_op_update_inode(new_dir, res->new_fattr);
459 return 1;
460}
461
441static int 462static int
442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 463nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
443 struct inode *new_dir, struct qstr *new_name) 464 struct inode *new_dir, struct qstr *new_name)
444{ 465{
445 struct nfs3_renameargs arg = { 466 struct nfs_renameargs arg = {
446 .fromfh = NFS_FH(old_dir), 467 .old_dir = NFS_FH(old_dir),
447 .fromname = old_name->name, 468 .old_name = old_name,
448 .fromlen = old_name->len, 469 .new_dir = NFS_FH(new_dir),
449 .tofh = NFS_FH(new_dir), 470 .new_name = new_name,
450 .toname = new_name->name,
451 .tolen = new_name->len
452 }; 471 };
453 struct nfs3_renameres res; 472 struct nfs_renameres res;
454 struct rpc_message msg = { 473 struct rpc_message msg = {
455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 474 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
456 .rpc_argp = &arg, 475 .rpc_argp = &arg,
@@ -460,17 +479,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
460 479
461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 480 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
462 481
463 res.fromattr = nfs_alloc_fattr(); 482 res.old_fattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr(); 483 res.new_fattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL) 484 if (res.old_fattr == NULL || res.new_fattr == NULL)
466 goto out; 485 goto out;
467 486
468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 487 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
469 nfs_post_op_update_inode(old_dir, res.fromattr); 488 nfs_post_op_update_inode(old_dir, res.old_fattr);
470 nfs_post_op_update_inode(new_dir, res.toattr); 489 nfs_post_op_update_inode(new_dir, res.new_fattr);
471out: 490out:
472 nfs_free_fattr(res.toattr); 491 nfs_free_fattr(res.old_fattr);
473 nfs_free_fattr(res.fromattr); 492 nfs_free_fattr(res.new_fattr);
474 dprintk("NFS reply rename: %d\n", status); 493 dprintk("NFS reply rename: %d\n", status);
475 return status; 494 return status;
476} 495}
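The hunk above drops the NFSv3-private nfs3_renameargs/nfs3_renameres pair in favour of protocol-generic types shared with the other NFS versions. Reconstructed purely from the fields this function touches (the authoritative definitions live in include/linux/nfs_xdr.h and may carry extra members), the new pair looks roughly like:

    struct nfs_renameargs {
            const struct nfs_fh     *old_dir;
            const struct nfs_fh     *new_dir;
            const struct qstr       *old_name;
            const struct qstr       *new_name;
    };

    struct nfs_renameres {
            struct nfs_fattr        *old_fattr;     /* post-op attrs, source dir */
            struct nfs_fattr        *new_fattr;     /* post-op attrs, target dir */
    };

Sharing one argument type across versions is what lets the rename_setup/rename_done hooks added above plug into a version-independent async rename path.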
@@ -611,7 +630,7 @@ out:
611 */ 630 */
612static int 631static int
613nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 632nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
614 u64 cookie, struct page *page, unsigned int count, int plus) 633 u64 cookie, struct page **pages, unsigned int count, int plus)
615{ 634{
616 struct inode *dir = dentry->d_inode; 635 struct inode *dir = dentry->d_inode;
617 __be32 *verf = NFS_COOKIEVERF(dir); 636 __be32 *verf = NFS_COOKIEVERF(dir);
@@ -621,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
621 .verf = {verf[0], verf[1]}, 640 .verf = {verf[0], verf[1]},
622 .plus = plus, 641 .plus = plus,
623 .count = count, 642 .count = count,
624 .pages = &page 643 .pages = pages
625 }; 644 };
626 struct nfs3_readdirres res = { 645 struct nfs3_readdirres res = {
627 .verf = verf, 646 .verf = verf,
@@ -652,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
652 671
653 nfs_free_fattr(res.dir_attr); 672 nfs_free_fattr(res.dir_attr);
654out: 673out:
655 dprintk("NFS reply readdir: %d\n", status); 674 dprintk("NFS reply readdir%s: %d\n",
675 plus? "plus" : "", status);
656 return status; 676 return status;
657} 677}
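The signature change above (struct page *page becomes struct page **pages) lets one READDIR call fill more than a single page: the args now alias the caller's whole page array instead of wrapping one page in a local temporary. Schematically, callers go from

    /* before: */ nfs3_proc_readdir(dentry, cred, cookie, page,  count, plus);
    /* after:  */ nfs3_proc_readdir(dentry, cred, cookie, pages, count, plus);

with count still expressed in bytes of reply buffer, not in pages.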
658 678
@@ -722,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
722 dprintk("NFS call fsstat\n"); 742 dprintk("NFS call fsstat\n");
723 nfs_fattr_init(stat->fattr); 743 nfs_fattr_init(stat->fattr);
724 status = rpc_call_sync(server->client, &msg, 0); 744 status = rpc_call_sync(server->client, &msg, 0);
725 dprintk("NFS reply statfs: %d\n", status); 745 dprintk("NFS reply fsstat: %d\n", status);
726 return status; 746 return status;
727} 747}
728 748
@@ -844,6 +864,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
844 .unlink_setup = nfs3_proc_unlink_setup, 864 .unlink_setup = nfs3_proc_unlink_setup,
845 .unlink_done = nfs3_proc_unlink_done, 865 .unlink_done = nfs3_proc_unlink_done,
846 .rename = nfs3_proc_rename, 866 .rename = nfs3_proc_rename,
867 .rename_setup = nfs3_proc_rename_setup,
868 .rename_done = nfs3_proc_rename_done,
847 .link = nfs3_proc_link, 869 .link = nfs3_proc_link,
848 .symlink = nfs3_proc_symlink, 870 .symlink = nfs3_proc_symlink,
849 .mkdir = nfs3_proc_mkdir, 871 .mkdir = nfs3_proc_mkdir,
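With .rename_setup and .rename_done wired into nfs_v3_clientops, generic client code can drive a rename without knowing which protocol version backs the inode. A minimal control-flow sketch, assuming the usual NFS_PROTO() accessor and eliding the actual rpc_task submission:

    static int sketch_async_rename(struct inode *old_dir, struct inode *new_dir,
                                   struct rpc_message *msg, struct rpc_task *task)
    {
            const struct nfs_rpc_ops *ops = NFS_PROTO(old_dir);

            ops->rename_setup(msg, old_dir);   /* v3: selects NFS3PROC_RENAME */
            /* ... run the task to completion ... */
            if (ops->rename_done(task, old_dir, new_dir) == 0)
                    return -EAGAIN;            /* v3: jukebox, call was requeued */
            return 0;
    }

This mirrors how the unlink_setup/unlink_done pair directly above it is already consumed.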
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9769704f8ce6..183c6b123d0f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -37,18 +37,16 @@
37#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) 37#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2))
38#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) 38#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2))
39#define NFS3_fattr_sz (21) 39#define NFS3_fattr_sz (21)
40#define NFS3_wcc_attr_sz (6) 40#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2)
41#define NFS3_wcc_attr_sz (6)
41#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) 42#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz)
42#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) 43#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz)
43#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) 44#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
44#define NFS3_fsstat_sz
45#define NFS3_fsinfo_sz
46#define NFS3_pathconf_sz
47#define NFS3_entry_sz (NFS3_filename_sz+3)
48
49#define NFS3_sattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3)
50#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) 45#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz)
51#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) 46
47#define NFS3_getattrargs_sz (NFS3_fh_sz)
48#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3)
49#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz)
52#define NFS3_accessargs_sz (NFS3_fh_sz+1) 50#define NFS3_accessargs_sz (NFS3_fh_sz+1)
53#define NFS3_readlinkargs_sz (NFS3_fh_sz) 51#define NFS3_readlinkargs_sz (NFS3_fh_sz)
54#define NFS3_readargs_sz (NFS3_fh_sz+3) 52#define NFS3_readargs_sz (NFS3_fh_sz+3)
@@ -57,14 +55,16 @@
57#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) 55#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz)
58#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) 56#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz)
59#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) 57#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz)
58#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz)
60#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) 59#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz)
61#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) 60#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz)
62#define NFS3_readdirargs_sz (NFS3_fh_sz+2) 61#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3)
62#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4)
63#define NFS3_commitargs_sz (NFS3_fh_sz+3) 63#define NFS3_commitargs_sz (NFS3_fh_sz+3)
64 64
65#define NFS3_attrstat_sz (1+NFS3_fattr_sz) 65#define NFS3_getattrres_sz (1+NFS3_fattr_sz)
66#define NFS3_wccstat_sz (1+NFS3_wcc_data_sz) 66#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz)
67#define NFS3_removeres_sz (NFS3_wccstat_sz) 67#define NFS3_removeres_sz (NFS3_setattrres_sz)
68#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) 68#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
69#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) 69#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1)
70#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) 70#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1)
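All of these _sz macros count 32-bit XDR words, not bytes. A standalone arithmetic check for the two readdir request sizes introduced above (constants per RFC 1813: NFS3_FHSIZE is 64 bytes, NFS3_COOKIEVERFSIZE is 8, and the file defines NFS3_fh_sz as one length word plus 16 data words):

    #include <stdio.h>

    #define XDR_QUADLEN(n)          (((n) + 3) >> 2)
    #define NFS3_fh_sz              (1 + XDR_QUADLEN(64))        /* 17 words */
    #define NFS3_cookieverf_sz      XDR_QUADLEN(8)               /*  2 words */
    /* +3 = cookie3 (2 words) + count (1); readdirplus adds a dircount word */
    #define NFS3_readdirargs_sz     (NFS3_fh_sz + NFS3_cookieverf_sz + 3)
    #define NFS3_readdirplusargs_sz (NFS3_fh_sz + NFS3_cookieverf_sz + 4)

    int main(void)
    {
            printf("READDIR3args:     %d words, %d bytes\n",
                   NFS3_readdirargs_sz, NFS3_readdirargs_sz << 2);         /* 22, 88 */
            printf("READDIRPLUS3args: %d words, %d bytes\n",
                   NFS3_readdirplusargs_sz, NFS3_readdirplusargs_sz << 2); /* 23, 92 */
            return 0;
    }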
@@ -101,1074 +101,2364 @@ static const umode_t nfs_type2fmt[] = {
101}; 101};
102 102
103/* 103/*
104 * Common NFS XDR functions as inlines 104 * While encoding arguments, set up the reply buffer in advance to
105 * receive reply data directly into the page cache.
105 */ 106 */
106static inline __be32 * 107static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
107xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh) 108 unsigned int base, unsigned int len,
109 unsigned int bufsize)
108{ 110{
109 return xdr_encode_array(p, fh->data, fh->size); 111 struct rpc_auth *auth = req->rq_cred->cr_auth;
112 unsigned int replen;
113
114 replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize;
115 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);
110} 116}
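Note the units in prepare_reply_buffer(): bufsize and the _sz macros are in 32-bit words, so replen << 2 converts the word count into the byte offset at which xdr_inline_pages() splices the caller's pages into the receive buffer. Schematically (au_rslack varies with the RPC auth flavor in use):

    /*
     * page data starts at byte offset
     *     4 * (RPC_REPHDRSIZE       RPC reply header
     *          + auth->au_rslack    credential/verifier slack
     *          + bufsize)           fixed-length part of the NFS reply
     *
     * so the fixed reply fields land in the head iovec, and the bulk
     * payload (READ data, READDIR entries, symlink text) is received
     * directly into the page cache.
     */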
111 117
112static inline __be32 * 118/*
113xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh) 119 * Handle decode buffer overflows out-of-line.
120 */
121static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
114{ 122{
115 if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) { 123 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
116 memcpy(fh->data, p, fh->size); 124 "Remaining buffer length is %tu words.\n",
117 return p + XDR_QUADLEN(fh->size); 125 func, xdr->end - xdr->p);
118 }
119 return NULL;
120} 126}
121 127
128
122/* 129/*
123 * Encode/decode time. 130 * Encode/decode NFSv3 basic data types
131 *
132 * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
133 * "NFS Version 3 Protocol Specification".
134 *
135 * Not all basic data types have their own encoding and decoding
136 * functions. For run-time efficiency, some data types are encoded
137 * or decoded inline.
124 */ 138 */
125static inline __be32 * 139
126xdr_encode_time3(__be32 *p, struct timespec *timep) 140static void encode_uint32(struct xdr_stream *xdr, u32 value)
127{ 141{
128 *p++ = htonl(timep->tv_sec); 142 __be32 *p = xdr_reserve_space(xdr, 4);
129 *p++ = htonl(timep->tv_nsec); 143 *p = cpu_to_be32(value);
130 return p;
131} 144}
132 145
133static inline __be32 * 146static int decode_uint32(struct xdr_stream *xdr, u32 *value)
134xdr_decode_time3(__be32 *p, struct timespec *timep)
135{ 147{
136 timep->tv_sec = ntohl(*p++); 148 __be32 *p;
137 timep->tv_nsec = ntohl(*p++); 149
138 return p; 150 p = xdr_inline_decode(xdr, 4);
151 if (unlikely(p == NULL))
152 goto out_overflow;
153 *value = be32_to_cpup(p);
154 return 0;
155out_overflow:
156 print_overflow_msg(__func__, xdr);
157 return -EIO;
139} 158}
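Every decoder in this file follows the same defensive pattern: ask xdr_inline_decode() for exactly the bytes it needs, treat NULL as a truncated reply, then convert from big-endian wire order. A self-contained user-space analogue of decode_uint32() (demo types only, not the kernel API):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>

    /* Toy stream with the same failure contract as the kernel helper:
     * short data yields an error instead of reading past the end. */
    struct demo_stream {
            const unsigned char *p;
            const unsigned char *end;
    };

    static int demo_decode_uint32(struct demo_stream *xdr, uint32_t *value)
    {
            uint32_t raw;

            if (xdr->end - xdr->p < 4)
                    return -1;              /* stands in for -EIO */
            memcpy(&raw, xdr->p, 4);        /* may be unaligned */
            xdr->p += 4;
            *value = ntohl(raw);            /* XDR is big-endian */
            return 0;
    }

    int main(void)
    {
            const unsigned char wire[] = { 0x00, 0x00, 0x00, 0x2a };
            struct demo_stream xdr = { wire, wire + sizeof(wire) };
            uint32_t v;

            if (demo_decode_uint32(&xdr, &v) == 0)
                    printf("decoded %u\n", v);      /* prints 42 */
            return 0;
    }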
140 159
141static __be32 * 160static int decode_uint64(struct xdr_stream *xdr, u64 *value)
142xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
143{ 161{
144 unsigned int type, major, minor; 162 __be32 *p;
145 umode_t fmode;
146 163
147 type = ntohl(*p++); 164 p = xdr_inline_decode(xdr, 8);
148 if (type > NF3FIFO) 165 if (unlikely(p == NULL))
149 type = NF3NON; 166 goto out_overflow;
150 fmode = nfs_type2fmt[type]; 167 xdr_decode_hyper(p, value);
151 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; 168 return 0;
152 fattr->nlink = ntohl(*p++); 169out_overflow:
153 fattr->uid = ntohl(*p++); 170 print_overflow_msg(__func__, xdr);
154 fattr->gid = ntohl(*p++); 171 return -EIO;
155 p = xdr_decode_hyper(p, &fattr->size); 172}
156 p = xdr_decode_hyper(p, &fattr->du.nfs3.used);
157
158 /* Turn remote device info into Linux-specific dev_t */
159 major = ntohl(*p++);
160 minor = ntohl(*p++);
161 fattr->rdev = MKDEV(major, minor);
162 if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
163 fattr->rdev = 0;
164 173
165 p = xdr_decode_hyper(p, &fattr->fsid.major); 174/*
166 fattr->fsid.minor = 0; 175 * fileid3
167 p = xdr_decode_hyper(p, &fattr->fileid); 176 *
168 p = xdr_decode_time3(p, &fattr->atime); 177 * typedef uint64 fileid3;
169 p = xdr_decode_time3(p, &fattr->mtime); 178 */
170 p = xdr_decode_time3(p, &fattr->ctime); 179static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
180{
181 return xdr_decode_hyper(p, fileid);
182}
171 183
172 /* Update the mode bits */ 184static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
173 fattr->valid |= NFS_ATTR_FATTR_V3; 185{
174 return p; 186 return decode_uint64(xdr, fileid);
175} 187}
176 188
177static inline __be32 * 189/*
178xdr_encode_sattr(__be32 *p, struct iattr *attr) 190 * filename3
191 *
192 * typedef string filename3<>;
193 */
194static void encode_filename3(struct xdr_stream *xdr,
195 const char *name, u32 length)
179{ 196{
180 if (attr->ia_valid & ATTR_MODE) { 197 __be32 *p;
181 *p++ = xdr_one; 198
182 *p++ = htonl(attr->ia_mode & S_IALLUGO); 199 BUG_ON(length > NFS3_MAXNAMLEN);
183 } else { 200 p = xdr_reserve_space(xdr, 4 + length);
184 *p++ = xdr_zero; 201 xdr_encode_opaque(p, name, length);
185 }
186 if (attr->ia_valid & ATTR_UID) {
187 *p++ = xdr_one;
188 *p++ = htonl(attr->ia_uid);
189 } else {
190 *p++ = xdr_zero;
191 }
192 if (attr->ia_valid & ATTR_GID) {
193 *p++ = xdr_one;
194 *p++ = htonl(attr->ia_gid);
195 } else {
196 *p++ = xdr_zero;
197 }
198 if (attr->ia_valid & ATTR_SIZE) {
199 *p++ = xdr_one;
200 p = xdr_encode_hyper(p, (__u64) attr->ia_size);
201 } else {
202 *p++ = xdr_zero;
203 }
204 if (attr->ia_valid & ATTR_ATIME_SET) {
205 *p++ = xdr_two;
206 p = xdr_encode_time3(p, &attr->ia_atime);
207 } else if (attr->ia_valid & ATTR_ATIME) {
208 *p++ = xdr_one;
209 } else {
210 *p++ = xdr_zero;
211 }
212 if (attr->ia_valid & ATTR_MTIME_SET) {
213 *p++ = xdr_two;
214 p = xdr_encode_time3(p, &attr->ia_mtime);
215 } else if (attr->ia_valid & ATTR_MTIME) {
216 *p++ = xdr_one;
217 } else {
218 *p++ = xdr_zero;
219 }
220 return p;
221} 202}
222 203
223static inline __be32 * 204static int decode_inline_filename3(struct xdr_stream *xdr,
224xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) 205 const char **name, u32 *length)
225{ 206{
226 p = xdr_decode_hyper(p, &fattr->pre_size); 207 __be32 *p;
227 p = xdr_decode_time3(p, &fattr->pre_mtime); 208 u32 count;
228 p = xdr_decode_time3(p, &fattr->pre_ctime); 209
229 fattr->valid |= NFS_ATTR_FATTR_PRESIZE 210 p = xdr_inline_decode(xdr, 4);
230 | NFS_ATTR_FATTR_PREMTIME 211 if (unlikely(p == NULL))
231 | NFS_ATTR_FATTR_PRECTIME; 212 goto out_overflow;
232 return p; 213 count = be32_to_cpup(p);
214 if (count > NFS3_MAXNAMLEN)
215 goto out_nametoolong;
216 p = xdr_inline_decode(xdr, count);
217 if (unlikely(p == NULL))
218 goto out_overflow;
219 *name = (const char *)p;
220 *length = count;
221 return 0;
222
223out_nametoolong:
224 dprintk("NFS: returned filename too long: %u\n", count);
225 return -ENAMETOOLONG;
226out_overflow:
227 print_overflow_msg(__func__, xdr);
228 return -EIO;
233} 229}
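Both helpers rely on the standard XDR opaque/string layout (RFC 4506, section 4.10): a 4-byte big-endian length, the bytes themselves, then zero padding to the next 4-byte boundary. The name "abc", for instance, occupies 8 bytes on the wire:

    /*
     *     00 00 00 03  61 62 63 00
     *     \_ length _/ \_ 'a' 'b' 'c' + one pad byte _/
     */

encode_filename3() can reserve only 4 + length bytes because xdr_reserve_space() rounds every reservation up to a word boundary, which also covers the padding that xdr_encode_opaque() writes.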
234 230
235static inline __be32 * 231/*
236xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr) 232 * nfspath3
233 *
234 * typedef string nfspath3<>;
235 */
236static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
237 const u32 length)
237{ 238{
238 if (*p++) 239 BUG_ON(length > NFS3_MAXPATHLEN);
239 p = xdr_decode_fattr(p, fattr); 240 encode_uint32(xdr, length);
240 return p; 241 xdr_write_pages(xdr, pages, 0, length);
241} 242}
242 243
243static inline __be32 * 244static int decode_nfspath3(struct xdr_stream *xdr)
244xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
245{ 245{
246 if (*p++) 246 u32 recvd, count;
247 return xdr_decode_wcc_attr(p, fattr); 247 size_t hdrlen;
248 return p; 248 __be32 *p;
249
250 p = xdr_inline_decode(xdr, 4);
251 if (unlikely(p == NULL))
252 goto out_overflow;
253 count = be32_to_cpup(p);
254 if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
255 goto out_nametoolong;
256 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
257 recvd = xdr->buf->len - hdrlen;
258 if (unlikely(count > recvd))
259 goto out_cheating;
260
261 xdr_read_pages(xdr, count);
262 xdr_terminate_string(xdr->buf, count);
263 return 0;
264
265out_nametoolong:
266 dprintk("NFS: returned pathname too long: %u\n", count);
267 return -ENAMETOOLONG;
268out_cheating:
269 dprintk("NFS: server cheating in pathname result: "
270 "count %u > recvd %u\n", count, recvd);
271 return -EIO;
272out_overflow:
273 print_overflow_msg(__func__, xdr);
274 return -EIO;
249} 275}
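decode_nfspath3() distinguishes three failure modes that are worth keeping apart when debugging, all visible in its labels above:

    /*
     *     out_overflow:    the length word itself ran past the reply
     *     out_nametoolong: count > NFS3_MAXPATHLEN (or beyond page space)
     *     out_cheating:    server claimed more bytes than it actually sent
     *
     * Only the first prints via print_overflow_msg(); the three cases
     * surface to the caller as -EIO, -ENAMETOOLONG, and -EIO.
     */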
250 276
277/*
278 * cookie3
279 *
280 * typedef uint64 cookie3
281 */
282static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
283{
284 return xdr_encode_hyper(p, cookie);
285}
251 286
252static inline __be32 * 287static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
253xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
254{ 288{
255 p = xdr_decode_pre_op_attr(p, fattr); 289 return decode_uint64(xdr, cookie);
256 return xdr_decode_post_op_attr(p, fattr);
257} 290}
258 291
259/* 292/*
260 * NFS encode functions 293 * cookieverf3
294 *
295 * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
261 */ 296 */
297static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
298{
299 memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
300 return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
301}
302
303static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
304{
305 __be32 *p;
306
307 p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
308 if (unlikely(p == NULL))
309 goto out_overflow;
310 memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
311 return 0;
312out_overflow:
313 print_overflow_msg(__func__, xdr);
314 return -EIO;
315}
262 316
263/* 317/*
264 * Encode file handle argument 318 * createverf3
319 *
320 * typedef opaque createverf3[NFS3_CREATEVERFSIZE];
265 */ 321 */
266static int 322static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
267nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
268{ 323{
269 p = xdr_encode_fhandle(p, fh); 324 __be32 *p;
270 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 325
326 p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
327 memcpy(p, verifier, NFS3_CREATEVERFSIZE);
328}
329
330static int decode_writeverf3(struct xdr_stream *xdr, __be32 *verifier)
331{
332 __be32 *p;
333
334 p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
335 if (unlikely(p == NULL))
336 goto out_overflow;
337 memcpy(verifier, p, NFS3_WRITEVERFSIZE);
271 return 0; 338 return 0;
339out_overflow:
340 print_overflow_msg(__func__, xdr);
341 return -EIO;
272} 342}
273 343
274/* 344/*
275 * Encode SETATTR arguments 345 * size3
346 *
347 * typedef uint64 size3;
276 */ 348 */
277static int 349static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
278nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args) 350{
279{ 351 return xdr_decode_hyper(p, size);
280 p = xdr_encode_fhandle(p, args->fh);
281 p = xdr_encode_sattr(p, args->sattr);
282 *p++ = htonl(args->guard);
283 if (args->guard)
284 p = xdr_encode_time3(p, &args->guardtime);
285 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
286 return 0;
287} 352}
288 353
289/* 354/*
290 * Encode directory ops argument 355 * nfsstat3
356 *
357 * enum nfsstat3 {
358 * NFS3_OK = 0,
359 * ...
360 * }
291 */ 361 */
292static int 362#define NFS3_OK NFS_OK
293nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args) 363
364static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
294{ 365{
295 p = xdr_encode_fhandle(p, args->fh); 366 __be32 *p;
296 p = xdr_encode_array(p, args->name, args->len); 367
297 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 368 p = xdr_inline_decode(xdr, 4);
369 if (unlikely(p == NULL))
370 goto out_overflow;
371 *status = be32_to_cpup(p);
298 return 0; 372 return 0;
373out_overflow:
374 print_overflow_msg(__func__, xdr);
375 return -EIO;
299} 376}
300 377
301/* 378/*
302 * Encode REMOVE argument 379 * ftype3
380 *
381 * enum ftype3 {
382 * NF3REG = 1,
383 * NF3DIR = 2,
384 * NF3BLK = 3,
385 * NF3CHR = 4,
386 * NF3LNK = 5,
387 * NF3SOCK = 6,
388 * NF3FIFO = 7
389 * };
303 */ 390 */
304static int 391static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
305nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
306{ 392{
307 p = xdr_encode_fhandle(p, args->fh); 393 BUG_ON(type > NF3FIFO);
308 p = xdr_encode_array(p, args->name.name, args->name.len); 394 encode_uint32(xdr, type);
309 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 395}
310 return 0; 396
397static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
398{
399 u32 type;
400
401 type = be32_to_cpup(p++);
402 if (type > NF3FIFO)
403 type = NF3NON;
404 *mode = nfs_type2fmt[type];
405 return p;
311} 406}
312 407
313/* 408/*
314 * Encode access() argument 409 * specdata3
410 *
411 * struct specdata3 {
412 * uint32 specdata1;
413 * uint32 specdata2;
414 * };
315 */ 415 */
316static int 416static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
317nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
318{ 417{
319 p = xdr_encode_fhandle(p, args->fh); 418 __be32 *p;
320 *p++ = htonl(args->access); 419
321 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 420 p = xdr_reserve_space(xdr, 8);
322 return 0; 421 *p++ = cpu_to_be32(MAJOR(rdev));
422 *p = cpu_to_be32(MINOR(rdev));
423}
424
425static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
426{
427 unsigned int major, minor;
428
429 major = be32_to_cpup(p++);
430 minor = be32_to_cpup(p++);
431 *rdev = MKDEV(major, minor);
432 if (MAJOR(*rdev) != major || MINOR(*rdev) != minor)
433 *rdev = 0;
434 return p;
323} 435}
324 436
325/* 437/*
326 * Arguments to a READ call. Since we read data directly into the page 438 * nfs_fh3
327 * cache, we also set up the reply iovec here so that iov[1] points 439 *
328 * exactly to the page we want to fetch. 440 * struct nfs_fh3 {
441 * opaque data<NFS3_FHSIZE>;
442 * };
329 */ 443 */
330static int 444static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
331nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
332{ 445{
333 struct rpc_auth *auth = req->rq_cred->cr_auth; 446 __be32 *p;
334 unsigned int replen;
335 u32 count = args->count;
336 447
337 p = xdr_encode_fhandle(p, args->fh); 448 BUG_ON(fh->size > NFS3_FHSIZE);
338 p = xdr_encode_hyper(p, args->offset); 449 p = xdr_reserve_space(xdr, 4 + fh->size);
339 *p++ = htonl(count); 450 xdr_encode_opaque(p, fh->data, fh->size);
340 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 451}
341 452
342 /* Inline the page array */ 453static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
343 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; 454{
344 xdr_inline_pages(&req->rq_rcv_buf, replen, 455 u32 length;
345 args->pages, args->pgbase, count); 456 __be32 *p;
346 req->rq_rcv_buf.flags |= XDRBUF_READ; 457
458 p = xdr_inline_decode(xdr, 4);
459 if (unlikely(p == NULL))
460 goto out_overflow;
461 length = be32_to_cpup(p++);
462 if (unlikely(length > NFS3_FHSIZE))
463 goto out_toobig;
464 p = xdr_inline_decode(xdr, length);
465 if (unlikely(p == NULL))
466 goto out_overflow;
467 fh->size = length;
468 memcpy(fh->data, p, length);
347 return 0; 469 return 0;
470out_toobig:
471 dprintk("NFS: file handle size (%u) too big\n", length);
472 return -E2BIG;
473out_overflow:
474 print_overflow_msg(__func__, xdr);
475 return -EIO;
476}
477
478static void zero_nfs_fh3(struct nfs_fh *fh)
479{
480 memset(fh, 0, sizeof(*fh));
348} 481}
349 482
350/* 483/*
351 * Write arguments. Splice the buffer to be written into the iovec. 484 * nfstime3
485 *
486 * struct nfstime3 {
487 * uint32 seconds;
488 * uint32 nseconds;
489 * };
352 */ 490 */
353static int 491static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep)
354nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
355{ 492{
356 struct xdr_buf *sndbuf = &req->rq_snd_buf; 493 *p++ = cpu_to_be32(timep->tv_sec);
357 u32 count = args->count; 494 *p++ = cpu_to_be32(timep->tv_nsec);
495 return p;
496}
358 497
359 p = xdr_encode_fhandle(p, args->fh); 498static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep)
360 p = xdr_encode_hyper(p, args->offset); 499{
361 *p++ = htonl(count); 500 timep->tv_sec = be32_to_cpup(p++);
362 *p++ = htonl(args->stable); 501 timep->tv_nsec = be32_to_cpup(p++);
363 *p++ = htonl(count); 502 return p;
364 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
365
366 /* Copy the page array */
367 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count);
368 sndbuf->flags |= XDRBUF_WRITE;
369 return 0;
370} 503}
371 504
372/* 505/*
373 * Encode CREATE arguments 506 * sattr3
507 *
508 * enum time_how {
509 * DONT_CHANGE = 0,
510 * SET_TO_SERVER_TIME = 1,
511 * SET_TO_CLIENT_TIME = 2
512 * };
513 *
514 * union set_mode3 switch (bool set_it) {
515 * case TRUE:
516 * mode3 mode;
517 * default:
518 * void;
519 * };
520 *
521 * union set_uid3 switch (bool set_it) {
522 * case TRUE:
523 * uid3 uid;
524 * default:
525 * void;
526 * };
527 *
528 * union set_gid3 switch (bool set_it) {
529 * case TRUE:
530 * gid3 gid;
531 * default:
532 * void;
533 * };
534 *
535 * union set_size3 switch (bool set_it) {
536 * case TRUE:
537 * size3 size;
538 * default:
539 * void;
540 * };
541 *
542 * union set_atime switch (time_how set_it) {
543 * case SET_TO_CLIENT_TIME:
544 * nfstime3 atime;
545 * default:
546 * void;
547 * };
548 *
549 * union set_mtime switch (time_how set_it) {
550 * case SET_TO_CLIENT_TIME:
551 * nfstime3 mtime;
552 * default:
553 * void;
554 * };
555 *
556 * struct sattr3 {
557 * set_mode3 mode;
558 * set_uid3 uid;
559 * set_gid3 gid;
560 * set_size3 size;
561 * set_atime atime;
562 * set_mtime mtime;
563 * };
374 */ 564 */
375static int 565static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
376nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
377{ 566{
378 p = xdr_encode_fhandle(p, args->fh); 567 u32 nbytes;
379 p = xdr_encode_array(p, args->name, args->len); 568 __be32 *p;
380 569
381 *p++ = htonl(args->createmode); 570 /*
382 if (args->createmode == NFS3_CREATE_EXCLUSIVE) { 571 * In order to make only a single xdr_reserve_space() call,
383 *p++ = args->verifier[0]; 572 * pre-compute the total number of bytes to be reserved.
384 *p++ = args->verifier[1]; 573 * Six boolean values, one for each set_foo field, are always
574 * present in the encoded result, so start there.
575 */
576 nbytes = 6 * 4;
577 if (attr->ia_valid & ATTR_MODE)
578 nbytes += 4;
579 if (attr->ia_valid & ATTR_UID)
580 nbytes += 4;
581 if (attr->ia_valid & ATTR_GID)
582 nbytes += 4;
583 if (attr->ia_valid & ATTR_SIZE)
584 nbytes += 8;
585 if (attr->ia_valid & ATTR_ATIME_SET)
586 nbytes += 8;
587 if (attr->ia_valid & ATTR_MTIME_SET)
588 nbytes += 8;
589 p = xdr_reserve_space(xdr, nbytes);
590
591 if (attr->ia_valid & ATTR_MODE) {
592 *p++ = xdr_one;
593 *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
385 } else 594 } else
386 p = xdr_encode_sattr(p, args->sattr); 595 *p++ = xdr_zero;
387 596
388 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 597 if (attr->ia_valid & ATTR_UID) {
389 return 0; 598 *p++ = xdr_one;
599 *p++ = cpu_to_be32(attr->ia_uid);
600 } else
601 *p++ = xdr_zero;
602
603 if (attr->ia_valid & ATTR_GID) {
604 *p++ = xdr_one;
605 *p++ = cpu_to_be32(attr->ia_gid);
606 } else
607 *p++ = xdr_zero;
608
609 if (attr->ia_valid & ATTR_SIZE) {
610 *p++ = xdr_one;
611 p = xdr_encode_hyper(p, (u64)attr->ia_size);
612 } else
613 *p++ = xdr_zero;
614
615 if (attr->ia_valid & ATTR_ATIME_SET) {
616 *p++ = xdr_two;
617 p = xdr_encode_nfstime3(p, &attr->ia_atime);
618 } else if (attr->ia_valid & ATTR_ATIME) {
619 *p++ = xdr_one;
620 } else
621 *p++ = xdr_zero;
622
623 if (attr->ia_valid & ATTR_MTIME_SET) {
624 *p++ = xdr_two;
625 xdr_encode_nfstime3(p, &attr->ia_mtime);
626 } else if (attr->ia_valid & ATTR_MTIME) {
627 *p = xdr_one;
628 } else
629 *p = xdr_zero;
390} 630}
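A worked example of the reservation arithmetic above: a plain chmod sets only ATTR_MODE, so nbytes = 6 * 4 (the six discriminant words) + 4 (the mode word) = 28 bytes, i.e. 7 XDR words, and the encoded sattr3 comes out as

    /*
     *     1 mode 0 0 0 0 0
     *     ^ TRUE + mode3, then five FALSE arms
     *       (uid, gid, size, atime, mtime)
     */

A truncate would instead add 8 bytes for the size3 hyper, and setting explicit timestamps adds 8 bytes per nfstime3.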
391 631
392/* 632/*
393 * Encode MKDIR arguments 633 * fattr3
634 *
635 * struct fattr3 {
636 * ftype3 type;
637 * mode3 mode;
638 * uint32 nlink;
639 * uid3 uid;
640 * gid3 gid;
641 * size3 size;
642 * size3 used;
643 * specdata3 rdev;
644 * uint64 fsid;
645 * fileid3 fileid;
646 * nfstime3 atime;
647 * nfstime3 mtime;
648 * nfstime3 ctime;
649 * };
394 */ 650 */
395static int 651static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
396nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
397{ 652{
398 p = xdr_encode_fhandle(p, args->fh); 653 umode_t fmode;
399 p = xdr_encode_array(p, args->name, args->len); 654 __be32 *p;
400 p = xdr_encode_sattr(p, args->sattr); 655
401 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 656 p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
657 if (unlikely(p == NULL))
658 goto out_overflow;
659
660 p = xdr_decode_ftype3(p, &fmode);
661
662 fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
663 fattr->nlink = be32_to_cpup(p++);
664 fattr->uid = be32_to_cpup(p++);
665 fattr->gid = be32_to_cpup(p++);
666
667 p = xdr_decode_size3(p, &fattr->size);
668 p = xdr_decode_size3(p, &fattr->du.nfs3.used);
669 p = xdr_decode_specdata3(p, &fattr->rdev);
670
671 p = xdr_decode_hyper(p, &fattr->fsid.major);
672 fattr->fsid.minor = 0;
673
674 p = xdr_decode_fileid3(p, &fattr->fileid);
675 p = xdr_decode_nfstime3(p, &fattr->atime);
676 p = xdr_decode_nfstime3(p, &fattr->mtime);
677 xdr_decode_nfstime3(p, &fattr->ctime);
678
679 fattr->valid |= NFS_ATTR_FATTR_V3;
402 return 0; 680 return 0;
681out_overflow:
682 print_overflow_msg(__func__, xdr);
683 return -EIO;
403} 684}
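fattr3 is a fixed-size structure, which is why a single 84-byte xdr_inline_decode() (NFS3_fattr_sz = 21 words, matching the macro at the top of the file) can cover the whole thing:

    /*
     *     type + mode + nlink + uid + gid        5 words
     *     size + used (two uint64s)              4 words
     *     rdev (specdata3)                       2 words
     *     fsid (uint64)                          2 words
     *     fileid                                 2 words
     *     atime + mtime + ctime (3 x nfstime3)   6 words
     *                                           21 words = 84 bytes
     */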
404 685
405/* 686/*
406 * Encode SYMLINK arguments 687 * post_op_attr
688 *
689 * union post_op_attr switch (bool attributes_follow) {
690 * case TRUE:
691 * fattr3 attributes;
692 * case FALSE:
693 * void;
694 * };
407 */ 695 */
408static int 696static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
409nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
410{ 697{
411 p = xdr_encode_fhandle(p, args->fromfh); 698 __be32 *p;
412 p = xdr_encode_array(p, args->fromname, args->fromlen);
413 p = xdr_encode_sattr(p, args->sattr);
414 *p++ = htonl(args->pathlen);
415 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
416 699
417 /* Copy the page */ 700 p = xdr_inline_decode(xdr, 4);
418 xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen); 701 if (unlikely(p == NULL))
702 goto out_overflow;
703 if (*p != xdr_zero)
704 return decode_fattr3(xdr, fattr);
419 return 0; 705 return 0;
706out_overflow:
707 print_overflow_msg(__func__, xdr);
708 return -EIO;
420} 709}
421 710
422/* 711/*
423 * Encode MKNOD arguments 712 * wcc_attr
713 * struct wcc_attr {
714 * size3 size;
715 * nfstime3 mtime;
716 * nfstime3 ctime;
717 * };
424 */ 718 */
425static int 719static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
426nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) 720{
427{ 721 __be32 *p;
428 p = xdr_encode_fhandle(p, args->fh); 722
429 p = xdr_encode_array(p, args->name, args->len); 723 p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
430 *p++ = htonl(args->type); 724 if (unlikely(p == NULL))
431 p = xdr_encode_sattr(p, args->sattr); 725 goto out_overflow;
432 if (args->type == NF3CHR || args->type == NF3BLK) { 726
433 *p++ = htonl(MAJOR(args->rdev)); 727 fattr->valid |= NFS_ATTR_FATTR_PRESIZE
434 *p++ = htonl(MINOR(args->rdev)); 728 | NFS_ATTR_FATTR_PREMTIME
435 } 729 | NFS_ATTR_FATTR_PRECTIME;
730
731 p = xdr_decode_size3(p, &fattr->pre_size);
732 p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
733 xdr_decode_nfstime3(p, &fattr->pre_ctime);
436 734
437 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
438 return 0; 735 return 0;
736out_overflow:
737 print_overflow_msg(__func__, xdr);
738 return -EIO;
439} 739}
440 740
441/* 741/*
442 * Encode RENAME arguments 742 * pre_op_attr
743 * union pre_op_attr switch (bool attributes_follow) {
744 * case TRUE:
745 * wcc_attr attributes;
746 * case FALSE:
747 * void;
748 * };
749 *
750 * wcc_data
751 *
752 * struct wcc_data {
753 * pre_op_attr before;
754 * post_op_attr after;
755 * };
443 */ 756 */
444static int 757static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
445nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args) 758{
446{ 759 __be32 *p;
447 p = xdr_encode_fhandle(p, args->fromfh); 760
448 p = xdr_encode_array(p, args->fromname, args->fromlen); 761 p = xdr_inline_decode(xdr, 4);
449 p = xdr_encode_fhandle(p, args->tofh); 762 if (unlikely(p == NULL))
450 p = xdr_encode_array(p, args->toname, args->tolen); 763 goto out_overflow;
451 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 764 if (*p != xdr_zero)
765 return decode_wcc_attr(xdr, fattr);
452 return 0; 766 return 0;
767out_overflow:
768 print_overflow_msg(__func__, xdr);
769 return -EIO;
453} 770}
454 771
455/* 772static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)
456 * Encode LINK arguments
457 */
458static int
459nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
460{ 773{
461 p = xdr_encode_fhandle(p, args->fromfh); 774 int error;
462 p = xdr_encode_fhandle(p, args->tofh); 775
463 p = xdr_encode_array(p, args->toname, args->tolen); 776 error = decode_pre_op_attr(xdr, fattr);
464 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 777 if (unlikely(error))
465 return 0; 778 goto out;
779 error = decode_post_op_attr(xdr, fattr);
780out:
781 return error;
466} 782}
467 783
468/* 784/*
469 * Encode arguments to readdir call 785 * post_op_fh3
786 *
787 * union post_op_fh3 switch (bool handle_follows) {
788 * case TRUE:
789 * nfs_fh3 handle;
790 * case FALSE:
791 * void;
792 * };
470 */ 793 */
471static int 794static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
472nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
473{ 795{
474 struct rpc_auth *auth = req->rq_cred->cr_auth; 796 __be32 *p = xdr_inline_decode(xdr, 4);
475 unsigned int replen; 797 if (unlikely(p == NULL))
476 u32 count = args->count; 798 goto out_overflow;
477 799 if (*p != xdr_zero)
478 p = xdr_encode_fhandle(p, args->fh); 800 return decode_nfs_fh3(xdr, fh);
479 p = xdr_encode_hyper(p, args->cookie); 801 zero_nfs_fh3(fh);
480 *p++ = args->verf[0];
481 *p++ = args->verf[1];
482 if (args->plus) {
483 /* readdirplus: need dircount + buffer size.
484 * We just make sure we make dircount big enough */
485 *p++ = htonl(count >> 3);
486 }
487 *p++ = htonl(count);
488 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
489
490 /* Inline the page array */
491 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2;
492 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count);
493 return 0; 802 return 0;
803out_overflow:
804 print_overflow_msg(__func__, xdr);
805 return -EIO;
494} 806}
495 807
496/* 808/*
497 * Decode the result of a readdir call. 809 * diropargs3
498 * We just check for syntactical correctness. 810 *
811 * struct diropargs3 {
812 * nfs_fh3 dir;
813 * filename3 name;
814 * };
499 */ 815 */
500static int 816static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
501nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res) 817 const char *name, u32 length)
502{ 818{
503 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 819 encode_nfs_fh3(xdr, fh);
504 struct kvec *iov = rcvbuf->head; 820 encode_filename3(xdr, name, length);
505 struct page **page; 821}
506 size_t hdrlen;
507 u32 len, recvd, pglen;
508 int status, nr = 0;
509 __be32 *entry, *end, *kaddr;
510
511 status = ntohl(*p++);
512 /* Decode post_op_attrs */
513 p = xdr_decode_post_op_attr(p, res->dir_attr);
514 if (status)
515 return nfs_stat_to_errno(status);
516 /* Decode verifier cookie */
517 if (res->verf) {
518 res->verf[0] = *p++;
519 res->verf[1] = *p++;
520 } else {
521 p += 2;
522 }
523
524 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
525 if (iov->iov_len < hdrlen) {
526 dprintk("NFS: READDIR reply header overflowed:"
527 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
528 return -errno_NFSERR_IO;
529 } else if (iov->iov_len != hdrlen) {
530 dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
531 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
532 }
533 822
534 pglen = rcvbuf->page_len;
535 recvd = rcvbuf->len - hdrlen;
536 if (pglen > recvd)
537 pglen = recvd;
538 page = rcvbuf->pages;
539 kaddr = p = kmap_atomic(*page, KM_USER0);
540 end = (__be32 *)((char *)p + pglen);
541 entry = p;
542
543 /* Make sure the packet actually has a value_follows and EOF entry */
544 if ((entry + 1) > end)
545 goto short_pkt;
546
547 for (; *p++; nr++) {
548 if (p + 3 > end)
549 goto short_pkt;
550 p += 2; /* inode # */
551 len = ntohl(*p++); /* string length */
552 p += XDR_QUADLEN(len) + 2; /* name + cookie */
553 if (len > NFS3_MAXNAMLEN) {
554 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
555 len);
556 goto err_unmap;
557 }
558
559 if (res->plus) {
560 /* post_op_attr */
561 if (p + 2 > end)
562 goto short_pkt;
563 if (*p++) {
564 p += 21;
565 if (p + 1 > end)
566 goto short_pkt;
567 }
568 /* post_op_fh3 */
569 if (*p++) {
570 if (p + 1 > end)
571 goto short_pkt;
572 len = ntohl(*p++);
573 if (len > NFS3_FHSIZE) {
574 dprintk("NFS: giant filehandle in "
575 "readdir (len 0x%x)!\n", len);
576 goto err_unmap;
577 }
578 p += XDR_QUADLEN(len);
579 }
580 }
581 823
582 if (p + 2 > end) 824/*
583 goto short_pkt; 825 * NFSv3 XDR encode functions
584 entry = p; 826 *
585 } 827 * NFSv3 argument types are defined in section 3.3 of RFC 1813:
828 * "NFS Version 3 Protocol Specification".
829 */
586 830
587 /* 831/*
588 * Apparently some server sends responses that are a valid size, but 832 * 3.3.1 GETATTR3args
589 * contain no entries, and have value_follows==0 and EOF==0. For 833 *
590 * those, just set the EOF marker. 834 * struct GETATTR3args {
591 */ 835 * nfs_fh3 object;
592 if (!nr && entry[1] == 0) { 836 * };
593 dprintk("NFS: readdir reply truncated!\n"); 837 */
594 entry[1] = 1; 838static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
595 } 839 struct xdr_stream *xdr,
596 out: 840 const struct nfs_fh *fh)
597 kunmap_atomic(kaddr, KM_USER0); 841{
598 return nr; 842 encode_nfs_fh3(xdr, fh);
599 short_pkt:
600 /*
601 * When we get a short packet there are 2 possibilities. We can
602 * return an error, or fix up the response to look like a valid
603 * response and return what we have so far. If there are no
604 * entries and the packet was short, then return -EIO. If there
605 * are valid entries in the response, return them and pretend that
606 * the call was successful, but incomplete. The caller can retry the
607 * readdir starting at the last cookie.
608 */
609 entry[0] = entry[1] = 0;
610 if (!nr)
611 nr = -errno_NFSERR_IO;
612 goto out;
613err_unmap:
614 nr = -errno_NFSERR_IO;
615 goto out;
616} 843}
617 844
618__be32 * 845/*
619nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 846 * 3.3.2 SETATTR3args
847 *
848 * union sattrguard3 switch (bool check) {
849 * case TRUE:
850 * nfstime3 obj_ctime;
851 * case FALSE:
852 * void;
853 * };
854 *
855 * struct SETATTR3args {
856 * nfs_fh3 object;
857 * sattr3 new_attributes;
858 * sattrguard3 guard;
859 * };
860 */
861static void encode_sattrguard3(struct xdr_stream *xdr,
862 const struct nfs3_sattrargs *args)
620{ 863{
621 struct nfs_entry old = *entry; 864 __be32 *p;
622
623 if (!*p++) {
624 if (!*p)
625 return ERR_PTR(-EAGAIN);
626 entry->eof = 1;
627 return ERR_PTR(-EBADCOOKIE);
628 }
629
630 p = xdr_decode_hyper(p, &entry->ino);
631 entry->len = ntohl(*p++);
632 entry->name = (const char *) p;
633 p += XDR_QUADLEN(entry->len);
634 entry->prev_cookie = entry->cookie;
635 p = xdr_decode_hyper(p, &entry->cookie);
636 865
637 if (plus) { 866 if (args->guard) {
638 entry->fattr->valid = 0; 867 p = xdr_reserve_space(xdr, 4 + 8);
639 p = xdr_decode_post_op_attr(p, entry->fattr); 868 *p++ = xdr_one;
640 /* In fact, a post_op_fh3: */ 869 xdr_encode_nfstime3(p, &args->guardtime);
641 if (*p++) { 870 } else {
642 p = xdr_decode_fhandle(p, entry->fh); 871 p = xdr_reserve_space(xdr, 4);
643 /* Ugh -- server reply was truncated */ 872 *p = xdr_zero;
644 if (p == NULL) {
645 dprintk("NFS: FH truncated\n");
646 *entry = old;
647 return ERR_PTR(-EAGAIN);
648 }
649 } else
650 memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
651 } 873 }
874}
652 875
653 entry->eof = !p[0] && p[1]; 876static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
654 return p; 877 struct xdr_stream *xdr,
878 const struct nfs3_sattrargs *args)
879{
880 encode_nfs_fh3(xdr, args->fh);
881 encode_sattr3(xdr, args->sattr);
882 encode_sattrguard3(xdr, args);
655} 883}
656 884
657/* 885/*
658 * Encode COMMIT arguments 886 * 3.3.3 LOOKUP3args
887 *
888 * struct LOOKUP3args {
889 * diropargs3 what;
890 * };
659 */ 891 */
660static int 892static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
661nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 893 struct xdr_stream *xdr,
894 const struct nfs3_diropargs *args)
662{ 895{
663 p = xdr_encode_fhandle(p, args->fh); 896 encode_diropargs3(xdr, args->fh, args->name, args->len);
664 p = xdr_encode_hyper(p, args->offset);
665 *p++ = htonl(args->count);
666 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
667 return 0;
668} 897}
669 898
670#ifdef CONFIG_NFS_V3_ACL
671/* 899/*
672 * Encode GETACL arguments 900 * 3.3.4 ACCESS3args
901 *
902 * struct ACCESS3args {
903 * nfs_fh3 object;
904 * uint32 access;
905 * };
673 */ 906 */
674static int 907static void encode_access3args(struct xdr_stream *xdr,
675nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, 908 const struct nfs3_accessargs *args)
676 struct nfs3_getaclargs *args)
677{ 909{
678 struct rpc_auth *auth = req->rq_cred->cr_auth; 910 encode_nfs_fh3(xdr, args->fh);
679 unsigned int replen; 911 encode_uint32(xdr, args->access);
912}
680 913
681 p = xdr_encode_fhandle(p, args->fh); 914static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
682 *p++ = htonl(args->mask); 915 struct xdr_stream *xdr,
683 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 916 const struct nfs3_accessargs *args)
917{
918 encode_access3args(xdr, args);
919}
684 920
685 if (args->mask & (NFS_ACL | NFS_DFACL)) { 921/*
686 /* Inline the page array */ 922 * 3.3.5 READLINK3args
687 replen = (RPC_REPHDRSIZE + auth->au_rslack + 923 *
688 ACL3_getaclres_sz) << 2; 924 * struct READLINK3args {
689 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, 925 * nfs_fh3 symlink;
690 NFSACL_MAXPAGES << PAGE_SHIFT); 926 * };
691 } 927 */
692 return 0; 928static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
929 struct xdr_stream *xdr,
930 const struct nfs3_readlinkargs *args)
931{
932 encode_nfs_fh3(xdr, args->fh);
933 prepare_reply_buffer(req, args->pages, args->pgbase,
934 args->pglen, NFS3_readlinkres_sz);
693} 935}
694 936
695/* 937/*
696 * Encode SETACL arguments 938 * 3.3.6 READ3args
939 *
940 * struct READ3args {
941 * nfs_fh3 file;
942 * offset3 offset;
943 * count3 count;
944 * };
697 */ 945 */
698static int 946static void encode_read3args(struct xdr_stream *xdr,
699nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, 947 const struct nfs_readargs *args)
700 struct nfs3_setaclargs *args)
701{ 948{
702 struct xdr_buf *buf = &req->rq_snd_buf; 949 __be32 *p;
703 unsigned int base;
704 int err;
705 950
706 p = xdr_encode_fhandle(p, NFS_FH(args->inode)); 951 encode_nfs_fh3(xdr, args->fh);
707 *p++ = htonl(args->mask);
708 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
709 base = req->rq_slen;
710 952
711 if (args->npages != 0) 953 p = xdr_reserve_space(xdr, 8 + 4);
712 xdr_encode_pages(buf, args->pages, 0, args->len); 954 p = xdr_encode_hyper(p, args->offset);
713 else 955 *p = cpu_to_be32(args->count);
714 req->rq_slen = xdr_adjust_iovec(req->rq_svec, 956}
715 p + XDR_QUADLEN(args->len));
716 957
717 err = nfsacl_encode(buf, base, args->inode, 958static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
718 (args->mask & NFS_ACL) ? 959 struct xdr_stream *xdr,
719 args->acl_access : NULL, 1, 0); 960 const struct nfs_readargs *args)
720 if (err > 0) 961{
721 err = nfsacl_encode(buf, base + err, args->inode, 962 encode_read3args(xdr, args);
722 (args->mask & NFS_DFACL) ? 963 prepare_reply_buffer(req, args->pages, args->pgbase,
723 args->acl_default : NULL, 1, 964 args->count, NFS3_readres_sz);
724 NFS_ACL_DEFAULT); 965 req->rq_rcv_buf.flags |= XDRBUF_READ;
725 return (err > 0) ? 0 : err;
726} 966}
727#endif /* CONFIG_NFS_V3_ACL */
728 967
729/* 968/*
730 * NFS XDR decode functions 969 * 3.3.7 WRITE3args
970 *
971 * enum stable_how {
972 * UNSTABLE = 0,
973 * DATA_SYNC = 1,
974 * FILE_SYNC = 2
975 * };
976 *
977 * struct WRITE3args {
978 * nfs_fh3 file;
979 * offset3 offset;
980 * count3 count;
981 * stable_how stable;
982 * opaque data<>;
983 * };
731 */ 984 */
985static void encode_write3args(struct xdr_stream *xdr,
986 const struct nfs_writeargs *args)
987{
988 __be32 *p;
989
990 encode_nfs_fh3(xdr, args->fh);
991
992 p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
993 p = xdr_encode_hyper(p, args->offset);
994 *p++ = cpu_to_be32(args->count);
995 *p++ = cpu_to_be32(args->stable);
996 *p = cpu_to_be32(args->count);
997 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
998}
999
1000static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
1001 struct xdr_stream *xdr,
1002 const struct nfs_writeargs *args)
1003{
1004 encode_write3args(xdr, args);
1005 xdr->buf->flags |= XDRBUF_WRITE;
1006}
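As RFC 1813 requires, a WRITE3 request carries the byte count twice: once as WRITE3args.count and once as the length word of the opaque data<> array. The payload itself is spliced from args->pages by xdr_write_pages() without copying:

    /*
     *     offset   8 bytes
     *     count    4 bytes   <- WRITE3args.count
     *     stable   4 bytes
     *     length   4 bytes   <- opaque data<> length, same value
     *     data     count bytes, straight from the page cache
     */

The XDRBUF_WRITE flag then tells the transport that the send buffer carries page data, mirroring XDRBUF_READ on the receive side in nfs3_xdr_enc_read3args().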
732 1007
733/* 1008/*
734 * Decode attrstat reply. 1009 * 3.3.8 CREATE3args
1010 *
1011 * enum createmode3 {
1012 * UNCHECKED = 0,
1013 * GUARDED = 1,
1014 * EXCLUSIVE = 2
1015 * };
1016 *
1017 * union createhow3 switch (createmode3 mode) {
1018 * case UNCHECKED:
1019 * case GUARDED:
1020 * sattr3 obj_attributes;
1021 * case EXCLUSIVE:
1022 * createverf3 verf;
1023 * };
1024 *
1025 * struct CREATE3args {
1026 * diropargs3 where;
1027 * createhow3 how;
1028 * };
735 */ 1029 */
736static int 1030static void encode_createhow3(struct xdr_stream *xdr,
737nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1031 const struct nfs3_createargs *args)
738{ 1032{
739 int status; 1033 encode_uint32(xdr, args->createmode);
1034 switch (args->createmode) {
1035 case NFS3_CREATE_UNCHECKED:
1036 case NFS3_CREATE_GUARDED:
1037 encode_sattr3(xdr, args->sattr);
1038 break;
1039 case NFS3_CREATE_EXCLUSIVE:
1040 encode_createverf3(xdr, args->verifier);
1041 break;
1042 default:
1043 BUG();
1044 }
1045}
740 1046
741 if ((status = ntohl(*p++))) 1047static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
742 return nfs_stat_to_errno(status); 1048 struct xdr_stream *xdr,
743 xdr_decode_fattr(p, fattr); 1049 const struct nfs3_createargs *args)
744 return 0; 1050{
1051 encode_diropargs3(xdr, args->fh, args->name, args->len);
1052 encode_createhow3(xdr, args);
745} 1053}
746 1054
747/* 1055/*
748 * Decode status+wcc_data reply 1056 * 3.3.9 MKDIR3args
749 * SATTR, REMOVE, RMDIR 1057 *
1058 * struct MKDIR3args {
1059 * diropargs3 where;
1060 * sattr3 attributes;
1061 * };
750 */ 1062 */
751static int 1063static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
752nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1064 struct xdr_stream *xdr,
1065 const struct nfs3_mkdirargs *args)
753{ 1066{
754 int status; 1067 encode_diropargs3(xdr, args->fh, args->name, args->len);
1068 encode_sattr3(xdr, args->sattr);
1069}
755 1070
756 if ((status = ntohl(*p++))) 1071/*
757 status = nfs_stat_to_errno(status); 1072 * 3.3.10 SYMLINK3args
758 xdr_decode_wcc_data(p, fattr); 1073 *
759 return status; 1074 * struct symlinkdata3 {
1075 * sattr3 symlink_attributes;
1076 * nfspath3 symlink_data;
1077 * };
1078 *
1079 * struct SYMLINK3args {
1080 * diropargs3 where;
1081 * symlinkdata3 symlink;
1082 * };
1083 */
1084static void encode_symlinkdata3(struct xdr_stream *xdr,
1085 const struct nfs3_symlinkargs *args)
1086{
1087 encode_sattr3(xdr, args->sattr);
1088 encode_nfspath3(xdr, args->pages, args->pathlen);
760} 1089}
761 1090
762static int 1091static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 1092 struct xdr_stream *xdr,
1093 const struct nfs3_symlinkargs *args)
764{ 1094{
765 return nfs3_xdr_wccstat(req, p, res->dir_attr); 1095 encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
1096 encode_symlinkdata3(xdr, args);
766} 1097}
767 1098
768/* 1099/*
769 * Decode LOOKUP reply 1100 * 3.3.11 MKNOD3args
1101 *
1102 * struct devicedata3 {
1103 * sattr3 dev_attributes;
1104 * specdata3 spec;
1105 * };
1106 *
1107 * union mknoddata3 switch (ftype3 type) {
1108 * case NF3CHR:
1109 * case NF3BLK:
1110 * devicedata3 device;
1111 * case NF3SOCK:
1112 * case NF3FIFO:
1113 * sattr3 pipe_attributes;
1114 * default:
1115 * void;
1116 * };
1117 *
1118 * struct MKNOD3args {
1119 * diropargs3 where;
1120 * mknoddata3 what;
1121 * };
770 */ 1122 */
771static int 1123static void encode_devicedata3(struct xdr_stream *xdr,
772nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1124 const struct nfs3_mknodargs *args)
773{ 1125{
774 int status; 1126 encode_sattr3(xdr, args->sattr);
1127 encode_specdata3(xdr, args->rdev);
1128}
775 1129
776 if ((status = ntohl(*p++))) { 1130static void encode_mknoddata3(struct xdr_stream *xdr,
777 status = nfs_stat_to_errno(status); 1131 const struct nfs3_mknodargs *args)
778 } else { 1132{
779 if (!(p = xdr_decode_fhandle(p, res->fh))) 1133 encode_ftype3(xdr, args->type);
780 return -errno_NFSERR_IO; 1134 switch (args->type) {
781 p = xdr_decode_post_op_attr(p, res->fattr); 1135 case NF3CHR:
1136 case NF3BLK:
1137 encode_devicedata3(xdr, args);
1138 break;
1139 case NF3SOCK:
1140 case NF3FIFO:
1141 encode_sattr3(xdr, args->sattr);
1142 break;
1143 case NF3REG:
1144 case NF3DIR:
1145 break;
1146 default:
1147 BUG();
782 } 1148 }
783 xdr_decode_post_op_attr(p, res->dir_attr); 1149}
784 return status; 1150
1151static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
1152 struct xdr_stream *xdr,
1153 const struct nfs3_mknodargs *args)
1154{
1155 encode_diropargs3(xdr, args->fh, args->name, args->len);
1156 encode_mknoddata3(xdr, args);
785} 1157}
786 1158
787/* 1159/*
788 * Decode ACCESS reply 1160 * 3.3.12 REMOVE3args
1161 *
1162 * struct REMOVE3args {
1163 * diropargs3 object;
1164 * };
789 */ 1165 */
790static int 1166static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
791nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) 1167 struct xdr_stream *xdr,
1168 const struct nfs_removeargs *args)
792{ 1169{
793 int status = ntohl(*p++); 1170 encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
1171}
794 1172
795 p = xdr_decode_post_op_attr(p, res->fattr); 1173/*
796 if (status) 1174 * 3.3.14 RENAME3args
797 return nfs_stat_to_errno(status); 1175 *
798 res->access = ntohl(*p++); 1176 * struct RENAME3args {
799 return 0; 1177 * diropargs3 from;
1178 * diropargs3 to;
1179 * };
1180 */
1181static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
1182 struct xdr_stream *xdr,
1183 const struct nfs_renameargs *args)
1184{
1185 const struct qstr *old = args->old_name;
1186 const struct qstr *new = args->new_name;
1187
1188 encode_diropargs3(xdr, args->old_dir, old->name, old->len);
1189 encode_diropargs3(xdr, args->new_dir, new->name, new->len);
800} 1190}
801 1191
802static int 1192/*
803nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 1193 * 3.3.15 LINK3args
1194 *
1195 * struct LINK3args {
1196 * nfs_fh3 file;
1197 * diropargs3 link;
1198 * };
1199 */
1200static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
1201 struct xdr_stream *xdr,
1202 const struct nfs3_linkargs *args)
804{ 1203{
805 struct rpc_auth *auth = req->rq_cred->cr_auth; 1204 encode_nfs_fh3(xdr, args->fromfh);
806 unsigned int replen; 1205 encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
1206}
807 1207
808 p = xdr_encode_fhandle(p, args->fh); 1208/*
809 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 1209 * 3.3.16 READDIR3args
1210 *
1211 * struct READDIR3args {
1212 * nfs_fh3 dir;
1213 * cookie3 cookie;
1214 * cookieverf3 cookieverf;
1215 * count3 count;
1216 * };
1217 */
1218static void encode_readdir3args(struct xdr_stream *xdr,
1219 const struct nfs3_readdirargs *args)
1220{
1221 __be32 *p;
810 1222
811 /* Inline the page array */ 1223 encode_nfs_fh3(xdr, args->fh);
812 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; 1224
813 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); 1225 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
814 return 0; 1226 p = xdr_encode_cookie3(p, args->cookie);
1227 p = xdr_encode_cookieverf3(p, args->verf);
1228 *p = cpu_to_be32(args->count);
1229}
1230
1231static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
1232 struct xdr_stream *xdr,
1233 const struct nfs3_readdirargs *args)
1234{
1235 encode_readdir3args(xdr, args);
1236 prepare_reply_buffer(req, args->pages, 0,
1237 args->count, NFS3_readdirres_sz);
815} 1238}
816 1239
817/* 1240/*
818 * Decode READLINK reply 1241 * 3.3.17 READDIRPLUS3args
1242 *
1243 * struct READDIRPLUS3args {
1244 * nfs_fh3 dir;
1245 * cookie3 cookie;
1246 * cookieverf3 cookieverf;
1247 * count3 dircount;
1248 * count3 maxcount;
1249 * };
819 */ 1250 */
820static int 1251static void encode_readdirplus3args(struct xdr_stream *xdr,
821nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 1252 const struct nfs3_readdirargs *args)
822{ 1253{
823 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 1254 __be32 *p;
824 struct kvec *iov = rcvbuf->head;
825 size_t hdrlen;
826 u32 len, recvd;
827 char *kaddr;
828 int status;
829 1255
830 status = ntohl(*p++); 1256 encode_nfs_fh3(xdr, args->fh);
831 p = xdr_decode_post_op_attr(p, fattr);
832 1257
833 if (status != 0) 1258 p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
834 return nfs_stat_to_errno(status); 1259 p = xdr_encode_cookie3(p, args->cookie);
1260 p = xdr_encode_cookieverf3(p, args->verf);
835 1261
836 /* Convert length of symlink */ 1262 /*
837 len = ntohl(*p++); 1263 * readdirplus: need dircount + buffer size.
838 if (len >= rcvbuf->page_len) { 1264 * We just make sure we make dircount big enough
839 dprintk("nfs: server returned giant symlink!\n"); 1265 */
840 return -ENAMETOOLONG; 1266 *p++ = cpu_to_be32(args->count >> 3);
841 }
842 1267
843 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1268 *p = cpu_to_be32(args->count);
844 if (iov->iov_len < hdrlen) { 1269}
845 dprintk("NFS: READLINK reply header overflowed:"
846 "length %Zu > %Zu\n", hdrlen, iov->iov_len);
847 return -errno_NFSERR_IO;
848 } else if (iov->iov_len != hdrlen) {
849 dprintk("NFS: READLINK header is short. "
850 "iovec will be shifted.\n");
851 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen);
852 }
853 recvd = req->rq_rcv_buf.len - hdrlen;
854 if (recvd < len) {
855 dprintk("NFS: server cheating in readlink reply: "
856 "count %u > recvd %u\n", len, recvd);
857 return -EIO;
858 }
859 1270
860 /* NULL terminate the string we got */ 1271static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
861 kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0); 1272 struct xdr_stream *xdr,
862 kaddr[len+rcvbuf->page_base] = '\0'; 1273 const struct nfs3_readdirargs *args)
863 kunmap_atomic(kaddr, KM_USER0); 1274{
864 return 0; 1275 encode_readdirplus3args(xdr, args);
1276 prepare_reply_buffer(req, args->pages, 0,
1277 args->count, NFS3_readdirres_sz);
865} 1278}
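Putting the dircount >> 3 heuristic in numbers: with a hypothetical 32 KiB readdir buffer, maxcount bounds the entire READDIRPLUS3 reply while dircount budgets just the name/cookie portion, so the client simply assumes attributes and file handles dominate roughly 8:1 rather than tracking the exact split:

    /*
     *     args->count = 32768
     *     dircount    = 32768 >> 3 = 4096   (directory information only)
     *     maxcount    = 32768               (whole reply, attrs included)
     */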
866 1279
867/* 1280/*
868 * Decode READ reply 1281 * 3.3.21 COMMIT3args
1282 *
1283 * struct COMMIT3args {
1284 * nfs_fh3 file;
1285 * offset3 offset;
1286 * count3 count;
1287 * };
869 */ 1288 */
870static int 1289static void encode_commit3args(struct xdr_stream *xdr,
871nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) 1290 const struct nfs_writeargs *args)
872{ 1291{
873 struct kvec *iov = req->rq_rcv_buf.head; 1292 __be32 *p;
874 size_t hdrlen;
875 u32 count, ocount, recvd;
876 int status;
877 1293
878 status = ntohl(*p++); 1294 encode_nfs_fh3(xdr, args->fh);
879 p = xdr_decode_post_op_attr(p, res->fattr);
880 1295
881 if (status != 0) 1296 p = xdr_reserve_space(xdr, 8 + 4);
882 return nfs_stat_to_errno(status); 1297 p = xdr_encode_hyper(p, args->offset);
1298 *p = cpu_to_be32(args->count);
1299}
883 1300
884 /* Decode reply count and EOF flag. NFSv3 is somewhat redundant 1301static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
885 * in that it puts the count both in the res struct and in the 1302 struct xdr_stream *xdr,
886 * opaque data count. */ 1303 const struct nfs_writeargs *args)
887 count = ntohl(*p++); 1304{
888 res->eof = ntohl(*p++); 1305 encode_commit3args(xdr, args);
889 ocount = ntohl(*p++); 1306}
890 1307
891 if (ocount != count) { 1308#ifdef CONFIG_NFS_V3_ACL
892 dprintk("NFS: READ count doesn't match RPC opaque count.\n");
893 return -errno_NFSERR_IO;
894 }
895 1309
896 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 1310static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
897 if (iov->iov_len < hdrlen) { 1311 struct xdr_stream *xdr,
898 dprintk("NFS: READ reply header overflowed:" 1312 const struct nfs3_getaclargs *args)
899 "length %Zu > %Zu\n", hdrlen, iov->iov_len); 1313{
900 return -errno_NFSERR_IO; 1314 encode_nfs_fh3(xdr, args->fh);
901 } else if (iov->iov_len != hdrlen) { 1315 encode_uint32(xdr, args->mask);
902 dprintk("NFS: READ header is short. iovec will be shifted.\n"); 1316 if (args->mask & (NFS_ACL | NFS_DFACL))
903 xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); 1317 prepare_reply_buffer(req, args->pages, 0,
904 } 1318 NFSACL_MAXPAGES << PAGE_SHIFT,
1319 ACL3_getaclres_sz);
1320}
905 1321
906 recvd = req->rq_rcv_buf.len - hdrlen; 1322static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
907 if (count > recvd) { 1323 struct xdr_stream *xdr,
908 dprintk("NFS: server cheating in read reply: " 1324 const struct nfs3_setaclargs *args)
909 "count %u > recvd %u\n", count, recvd); 1325{
910 count = recvd; 1326 unsigned int base;
911 res->eof = 0; 1327 int error;
912 }
913 1328
914 if (count < res->count) 1329 encode_nfs_fh3(xdr, NFS_FH(args->inode));
915 res->count = count; 1330 encode_uint32(xdr, args->mask);
916 1331
917 return count; 1332 base = req->rq_slen;
1333 if (args->npages != 0)
1334 xdr_write_pages(xdr, args->pages, 0, args->len);
1335 else
1336 xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
1337
1338 error = nfsacl_encode(xdr->buf, base, args->inode,
1339 (args->mask & NFS_ACL) ?
1340 args->acl_access : NULL, 1, 0);
1341 BUG_ON(error < 0);
1342 error = nfsacl_encode(xdr->buf, base + error, args->inode,
1343 (args->mask & NFS_DFACL) ?
1344 args->acl_default : NULL, 1,
1345 NFS_ACL_DEFAULT);
1346 BUG_ON(error < 0);
918} 1347}
919 1348
1349#endif /* CONFIG_NFS_V3_ACL */
1350
920/* 1351/*
921 * Decode WRITE response 1352 * NFSv3 XDR decode functions
1353 *
1354 * NFSv3 result types are defined in section 3.3 of RFC 1813:
1355 * "NFS Version 3 Protocol Specification".
922 */ 1356 */
923static int
924nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
925{
926 int status;
927 1357
928 status = ntohl(*p++); 1358/*
929 p = xdr_decode_wcc_data(p, res->fattr); 1359 * 3.3.1 GETATTR3res
1360 *
1361 * struct GETATTR3resok {
1362 * fattr3 obj_attributes;
1363 * };
1364 *
1365 * union GETATTR3res switch (nfsstat3 status) {
1366 * case NFS3_OK:
1367 * GETATTR3resok resok;
1368 * default:
1369 * void;
1370 * };
1371 */
1372static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
1373 struct xdr_stream *xdr,
1374 struct nfs_fattr *result)
1375{
1376 enum nfs_stat status;
1377 int error;
1378
1379 error = decode_nfsstat3(xdr, &status);
1380 if (unlikely(error))
1381 goto out;
1382 if (status != NFS3_OK)
1383 goto out_default;
1384 error = decode_fattr3(xdr, result);
1385out:
1386 return error;
1387out_default:
1388 return nfs_stat_to_errno(status);
1389}
930 1390
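
Every nfs3_xdr_dec_*3res routine that follows has the same skeleton: decode the nfsstat3 discriminant, decode any fields present in both arms of the result union, then either continue into the resok body or map the status through nfs_stat_to_errno(). A stand-alone sketch of that control flow; decode_u32() is a stand-in for the kernel's decode_nfsstat3()/decode_uint32() helpers, and -5 stands in for -EIO:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    struct stream { const uint8_t *p, *end; };

    /* Fails like the kernel helpers do when the buffer is exhausted. */
    static int decode_u32(struct stream *s, uint32_t *out)
    {
        uint32_t be;
        if (s->end - s->p < 4)
            return -5;                    /* -EIO */
        memcpy(&be, s->p, 4);
        s->p += 4;
        *out = ntohl(be);
        return 0;
    }

    int main(void)
    {
        uint8_t buf[8] = { 0, 0, 0, 0,  0, 0, 0, 42 };  /* status=NFS3_OK, payload=42 */
        struct stream s = { buf, buf + sizeof(buf) };
        uint32_t status, payload;
        int error = decode_u32(&s, &status);
        if (error)
            return 1;
        if (status != 0)                  /* != NFS3_OK: map to errno instead */
            return 1;
        error = decode_u32(&s, &payload);
        printf("error=%d payload=%u\n", error, payload);
        return error ? 1 : 0;
    }
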
931 if (status != 0) 1391/*
932 return nfs_stat_to_errno(status); 1392 * 3.3.2 SETATTR3res
1393 *
1394 * struct SETATTR3resok {
1395 * wcc_data obj_wcc;
1396 * };
1397 *
1398 * struct SETATTR3resfail {
1399 * wcc_data obj_wcc;
1400 * };
1401 *
1402 * union SETATTR3res switch (nfsstat3 status) {
1403 * case NFS3_OK:
1404 * SETATTR3resok resok;
1405 * default:
1406 * SETATTR3resfail resfail;
1407 * };
1408 */
1409static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
1410 struct xdr_stream *xdr,
1411 struct nfs_fattr *result)
1412{
1413 enum nfs_stat status;
1414 int error;
1415
1416 error = decode_nfsstat3(xdr, &status);
1417 if (unlikely(error))
1418 goto out;
1419 error = decode_wcc_data(xdr, result);
1420 if (unlikely(error))
1421 goto out;
1422 if (status != NFS3_OK)
1423 goto out_status;
1424out:
1425 return error;
1426out_status:
1427 return nfs_stat_to_errno(status);
1428}
933 1429
934 res->count = ntohl(*p++); 1430/*
935 res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); 1431 * 3.3.3 LOOKUP3res
936 res->verf->verifier[0] = *p++; 1432 *
937 res->verf->verifier[1] = *p++; 1433 * struct LOOKUP3resok {
1434 * nfs_fh3 object;
1435 * post_op_attr obj_attributes;
1436 * post_op_attr dir_attributes;
1437 * };
1438 *
1439 * struct LOOKUP3resfail {
1440 * post_op_attr dir_attributes;
1441 * };
1442 *
1443 * union LOOKUP3res switch (nfsstat3 status) {
1444 * case NFS3_OK:
1445 * LOOKUP3resok resok;
1446 * default:
1447 * LOOKUP3resfail resfail;
1448 * };
1449 */
1450static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
1451 struct xdr_stream *xdr,
1452 struct nfs3_diropres *result)
1453{
1454 enum nfs_stat status;
1455 int error;
1456
1457 error = decode_nfsstat3(xdr, &status);
1458 if (unlikely(error))
1459 goto out;
1460 if (status != NFS3_OK)
1461 goto out_default;
1462 error = decode_nfs_fh3(xdr, result->fh);
1463 if (unlikely(error))
1464 goto out;
1465 error = decode_post_op_attr(xdr, result->fattr);
1466 if (unlikely(error))
1467 goto out;
1468 error = decode_post_op_attr(xdr, result->dir_attr);
1469out:
1470 return error;
1471out_default:
1472 error = decode_post_op_attr(xdr, result->dir_attr);
1473 if (unlikely(error))
1474 goto out;
1475 return nfs_stat_to_errno(status);
1476}
938 1477
939 return res->count; 1478/*
1479 * 3.3.4 ACCESS3res
1480 *
1481 * struct ACCESS3resok {
1482 * post_op_attr obj_attributes;
1483 * uint32 access;
1484 * };
1485 *
1486 * struct ACCESS3resfail {
1487 * post_op_attr obj_attributes;
1488 * };
1489 *
1490 * union ACCESS3res switch (nfsstat3 status) {
1491 * case NFS3_OK:
1492 * ACCESS3resok resok;
1493 * default:
1494 * ACCESS3resfail resfail;
1495 * };
1496 */
1497static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
1498 struct xdr_stream *xdr,
1499 struct nfs3_accessres *result)
1500{
1501 enum nfs_stat status;
1502 int error;
1503
1504 error = decode_nfsstat3(xdr, &status);
1505 if (unlikely(error))
1506 goto out;
1507 error = decode_post_op_attr(xdr, result->fattr);
1508 if (unlikely(error))
1509 goto out;
1510 if (status != NFS3_OK)
1511 goto out_default;
1512 error = decode_uint32(xdr, &result->access);
1513out:
1514 return error;
1515out_default:
1516 return nfs_stat_to_errno(status);
940} 1517}
941 1518
942/* 1519/*
943 * Decode a CREATE response 1520 * 3.3.5 READLINK3res
1521 *
1522 * struct READLINK3resok {
1523 * post_op_attr symlink_attributes;
1524 * nfspath3 data;
1525 * };
1526 *
1527 * struct READLINK3resfail {
1528 * post_op_attr symlink_attributes;
1529 * };
1530 *
1531 * union READLINK3res switch (nfsstat3 status) {
1532 * case NFS3_OK:
1533 * READLINK3resok resok;
1534 * default:
1535 * READLINK3resfail resfail;
1536 * };
944 */ 1537 */
945static int 1538static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
946nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) 1539 struct xdr_stream *xdr,
947{ 1540 struct nfs_fattr *result)
948 int status; 1541{
949 1542 enum nfs_stat status;
950 status = ntohl(*p++); 1543 int error;
951 if (status == 0) { 1544
952 if (*p++) { 1545 error = decode_nfsstat3(xdr, &status);
953 if (!(p = xdr_decode_fhandle(p, res->fh))) 1546 if (unlikely(error))
954 return -errno_NFSERR_IO; 1547 goto out;
955 p = xdr_decode_post_op_attr(p, res->fattr); 1548 error = decode_post_op_attr(xdr, result);
956 } else { 1549 if (unlikely(error))
957 memset(res->fh, 0, sizeof(*res->fh)); 1550 goto out;
958 /* Do decode post_op_attr but set it to NULL */ 1551 if (status != NFS3_OK)
959 p = xdr_decode_post_op_attr(p, res->fattr); 1552 goto out_default;
960 res->fattr->valid = 0; 1553 error = decode_nfspath3(xdr);
961 } 1554out:
962 } else { 1555 return error;
963 status = nfs_stat_to_errno(status); 1556out_default:
964 } 1557 return nfs_stat_to_errno(status);
965 p = xdr_decode_wcc_data(p, res->dir_attr);
966 return status;
967} 1558}
968 1559
969/* 1560/*
970 * Decode RENAME reply 1561 * 3.3.6 READ3res
1562 *
1563 * struct READ3resok {
1564 * post_op_attr file_attributes;
1565 * count3 count;
1566 * bool eof;
1567 * opaque data<>;
1568 * };
1569 *
1570 * struct READ3resfail {
1571 * post_op_attr file_attributes;
1572 * };
1573 *
1574 * union READ3res switch (nfsstat3 status) {
1575 * case NFS3_OK:
1576 * READ3resok resok;
1577 * default:
1578 * READ3resfail resfail;
1579 * };
971 */ 1580 */
972static int 1581static int decode_read3resok(struct xdr_stream *xdr,
973nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) 1582 struct nfs_readres *result)
974{ 1583{
975 int status; 1584 u32 eof, count, ocount, recvd;
1585 size_t hdrlen;
1586 __be32 *p;
1587
1588 p = xdr_inline_decode(xdr, 4 + 4 + 4);
1589 if (unlikely(p == NULL))
1590 goto out_overflow;
1591 count = be32_to_cpup(p++);
1592 eof = be32_to_cpup(p++);
1593 ocount = be32_to_cpup(p++);
1594 if (unlikely(ocount != count))
1595 goto out_mismatch;
1596 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
1597 recvd = xdr->buf->len - hdrlen;
1598 if (unlikely(count > recvd))
1599 goto out_cheating;
1600
1601out:
1602 xdr_read_pages(xdr, count);
1603 result->eof = eof;
1604 result->count = count;
1605 return count;
1606out_mismatch:
1607 dprintk("NFS: READ count doesn't match length of opaque: "
1608 "count %u != ocount %u\n", count, ocount);
1609 return -EIO;
1610out_cheating:
1611 dprintk("NFS: server cheating in read result: "
1612 "count %u > recvd %u\n", count, recvd);
1613 count = recvd;
1614 eof = 0;
1615 goto out;
1616out_overflow:
1617 print_overflow_msg(__func__, xdr);
1618 return -EIO;
1619}
976 1620
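
decode_read3resok() distinguishes two server bugs: an opaque length that disagrees with the count field is a hard -EIO, while a count larger than what was actually received is clamped (with EOF cleared) so a truncated reply still yields a short read. The clamp in isolation, as a sketch; recvd stands for xdr->buf->len minus the header already consumed:

    #include <stdio.h>

    static int check_read(unsigned int count, unsigned int ocount,
                          unsigned int recvd, unsigned int *eof)
    {
        if (ocount != count)
            return -5;        /* -EIO: opaque length mismatch */
        if (count > recvd) {  /* server claimed more than it sent */
            count = recvd;
            *eof = 0;         /* can't trust EOF on a truncated reply */
        }
        return (int)count;
    }

    int main(void)
    {
        unsigned int eof = 1;
        printf("clamped count=%d eof=%u\n",
               check_read(4096, 4096, 1024, &eof), eof);
        return 0;
    }
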
977 if ((status = ntohl(*p++)) != 0) 1621static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
978 status = nfs_stat_to_errno(status); 1622 struct nfs_readres *result)
979 p = xdr_decode_wcc_data(p, res->fromattr); 1623{
980 p = xdr_decode_wcc_data(p, res->toattr); 1624 enum nfs_stat status;
981 return status; 1625 int error;
1626
1627 error = decode_nfsstat3(xdr, &status);
1628 if (unlikely(error))
1629 goto out;
1630 error = decode_post_op_attr(xdr, result->fattr);
1631 if (unlikely(error))
1632 goto out;
1633 if (status != NFS3_OK)
1634 goto out_status;
1635 error = decode_read3resok(xdr, result);
1636out:
1637 return error;
1638out_status:
1639 return nfs_stat_to_errno(status);
982} 1640}
983 1641
984/* 1642/*
985 * Decode LINK reply 1643 * 3.3.7 WRITE3res
1644 *
1645 * enum stable_how {
1646 * UNSTABLE = 0,
1647 * DATA_SYNC = 1,
1648 * FILE_SYNC = 2
1649 * };
1650 *
1651 * struct WRITE3resok {
1652 * wcc_data file_wcc;
1653 * count3 count;
1654 * stable_how committed;
1655 * writeverf3 verf;
1656 * };
1657 *
1658 * struct WRITE3resfail {
1659 * wcc_data file_wcc;
1660 * };
1661 *
1662 * union WRITE3res switch (nfsstat3 status) {
1663 * case NFS3_OK:
1664 * WRITE3resok resok;
1665 * default:
1666 * WRITE3resfail resfail;
1667 * };
986 */ 1668 */
987static int 1669static int decode_write3resok(struct xdr_stream *xdr,
988nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res) 1670 struct nfs_writeres *result)
989{ 1671{
990 int status; 1672 __be32 *p;
1673
1674 p = xdr_inline_decode(xdr, 4 + 4 + NFS3_WRITEVERFSIZE);
1675 if (unlikely(p == NULL))
1676 goto out_overflow;
1677 result->count = be32_to_cpup(p++);
1678 result->verf->committed = be32_to_cpup(p++);
1679 if (unlikely(result->verf->committed > NFS_FILE_SYNC))
1680 goto out_badvalue;
1681 memcpy(result->verf->verifier, p, NFS3_WRITEVERFSIZE);
1682 return result->count;
1683out_badvalue:
1684 dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
1685 return -EIO;
1686out_overflow:
1687 print_overflow_msg(__func__, xdr);
1688 return -EIO;
1689}
991 1690
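
decode_write3resok() also range-checks the committed field against the stable_how enum, so later COMMIT logic never sees an out-of-range value. The same check reduced to a sketch:

    #include <stdio.h>

    enum stable_how { UNSTABLE = 0, DATA_SYNC = 1, FILE_SYNC = 2 };

    /* Reject any committed value the protocol doesn't define. */
    static int check_committed(unsigned int committed)
    {
        if (committed > FILE_SYNC)
            return -5;  /* -EIO, as in the out_badvalue path above */
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n", check_committed(FILE_SYNC), check_committed(7));
        return 0;
    }
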
992 if ((status = ntohl(*p++)) != 0) 1691static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
993 status = nfs_stat_to_errno(status); 1692 struct nfs_writeres *result)
994 p = xdr_decode_post_op_attr(p, res->fattr); 1693{
995 p = xdr_decode_wcc_data(p, res->dir_attr); 1694 enum nfs_stat status;
996 return status; 1695 int error;
1696
1697 error = decode_nfsstat3(xdr, &status);
1698 if (unlikely(error))
1699 goto out;
1700 error = decode_wcc_data(xdr, result->fattr);
1701 if (unlikely(error))
1702 goto out;
1703 if (status != NFS3_OK)
1704 goto out_status;
1705 error = decode_write3resok(xdr, result);
1706out:
1707 return error;
1708out_status:
1709 return nfs_stat_to_errno(status);
997} 1710}
998 1711
999/* 1712/*
1000 * Decode FSSTAT reply 1713 * 3.3.8 CREATE3res
1714 *
1715 * struct CREATE3resok {
1716 * post_op_fh3 obj;
1717 * post_op_attr obj_attributes;
1718 * wcc_data dir_wcc;
1719 * };
1720 *
1721 * struct CREATE3resfail {
1722 * wcc_data dir_wcc;
1723 * };
1724 *
1725 * union CREATE3res switch (nfsstat3 status) {
1726 * case NFS3_OK:
1727 * CREATE3resok resok;
1728 * default:
1729 * CREATE3resfail resfail;
1730 * };
1001 */ 1731 */
1002static int 1732static int decode_create3resok(struct xdr_stream *xdr,
1003nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) 1733 struct nfs3_diropres *result)
1004{ 1734{
1005 int status; 1735 int error;
1006 1736
1007 status = ntohl(*p++); 1737 error = decode_post_op_fh3(xdr, result->fh);
1738 if (unlikely(error))
1739 goto out;
1740 error = decode_post_op_attr(xdr, result->fattr);
1741 if (unlikely(error))
1742 goto out;
1743 /* The server isn't required to return a file handle.
1744 * If it didn't, force the client to perform a LOOKUP
1745 * to determine the correct file handle and attribute
1746 * values for the new object. */
1747 if (result->fh->size == 0)
1748 result->fattr->valid = 0;
1749 error = decode_wcc_data(xdr, result->dir_attr);
1750out:
1751 return error;
1752}
1008 1753
1009 p = xdr_decode_post_op_attr(p, res->fattr); 1754static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
1010 if (status != 0) 1755 struct xdr_stream *xdr,
1011 return nfs_stat_to_errno(status); 1756 struct nfs3_diropres *result)
1757{
1758 enum nfs_stat status;
1759 int error;
1760
1761 error = decode_nfsstat3(xdr, &status);
1762 if (unlikely(error))
1763 goto out;
1764 if (status != NFS3_OK)
1765 goto out_default;
1766 error = decode_create3resok(xdr, result);
1767out:
1768 return error;
1769out_default:
1770 error = decode_wcc_data(xdr, result->dir_attr);
1771 if (unlikely(error))
1772 goto out;
1773 return nfs_stat_to_errno(status);
1774}
1012 1775
1013 p = xdr_decode_hyper(p, &res->tbytes); 1776/*
1014 p = xdr_decode_hyper(p, &res->fbytes); 1777 * 3.3.12 REMOVE3res
1015 p = xdr_decode_hyper(p, &res->abytes); 1778 *
1016 p = xdr_decode_hyper(p, &res->tfiles); 1779 * struct REMOVE3resok {
1017 p = xdr_decode_hyper(p, &res->ffiles); 1780 * wcc_data dir_wcc;
1018 p = xdr_decode_hyper(p, &res->afiles); 1781 * };
1782 *
1783 * struct REMOVE3resfail {
1784 * wcc_data dir_wcc;
1785 * };
1786 *
1787 * union REMOVE3res switch (nfsstat3 status) {
1788 * case NFS3_OK:
1789 * REMOVE3resok resok;
1790 * default:
1791 * REMOVE3resfail resfail;
1792 * };
1793 */
1794static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
1795 struct xdr_stream *xdr,
1796 struct nfs_removeres *result)
1797{
1798 enum nfs_stat status;
1799 int error;
1800
1801 error = decode_nfsstat3(xdr, &status);
1802 if (unlikely(error))
1803 goto out;
1804 error = decode_wcc_data(xdr, result->dir_attr);
1805 if (unlikely(error))
1806 goto out;
1807 if (status != NFS3_OK)
1808 goto out_status;
1809out:
1810 return error;
1811out_status:
1812 return nfs_stat_to_errno(status);
1813}
1019 1814
1020 /* ignore invarsec */ 1815/*
1021 return 0; 1816 * 3.3.14 RENAME3res
1817 *
1818 * struct RENAME3resok {
1819 * wcc_data fromdir_wcc;
1820 * wcc_data todir_wcc;
1821 * };
1822 *
1823 * struct RENAME3resfail {
1824 * wcc_data fromdir_wcc;
1825 * wcc_data todir_wcc;
1826 * };
1827 *
1828 * union RENAME3res switch (nfsstat3 status) {
1829 * case NFS3_OK:
1830 * RENAME3resok resok;
1831 * default:
1832 * RENAME3resfail resfail;
1833 * };
1834 */
1835static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
1836 struct xdr_stream *xdr,
1837 struct nfs_renameres *result)
1838{
1839 enum nfs_stat status;
1840 int error;
1841
1842 error = decode_nfsstat3(xdr, &status);
1843 if (unlikely(error))
1844 goto out;
1845 error = decode_wcc_data(xdr, result->old_fattr);
1846 if (unlikely(error))
1847 goto out;
1848 error = decode_wcc_data(xdr, result->new_fattr);
1849 if (unlikely(error))
1850 goto out;
1851 if (status != NFS3_OK)
1852 goto out_status;
1853out:
1854 return error;
1855out_status:
1856 return nfs_stat_to_errno(status);
1022} 1857}
1023 1858
1024/* 1859/*
1025 * Decode FSINFO reply 1860 * 3.3.15 LINK3res
1861 *
1862 * struct LINK3resok {
1863 * post_op_attr file_attributes;
1864 * wcc_data linkdir_wcc;
1865 * };
1866 *
1867 * struct LINK3resfail {
1868 * post_op_attr file_attributes;
1869 * wcc_data linkdir_wcc;
1870 * };
1871 *
1872 * union LINK3res switch (nfsstat3 status) {
1873 * case NFS3_OK:
1874 * LINK3resok resok;
1875 * default:
1876 * LINK3resfail resfail;
1877 * };
1026 */ 1878 */
1027static int 1879static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
1028nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) 1880 struct nfs3_linkres *result)
1029{ 1881{
1030 int status; 1882 enum nfs_stat status;
1883 int error;
1884
1885 error = decode_nfsstat3(xdr, &status);
1886 if (unlikely(error))
1887 goto out;
1888 error = decode_post_op_attr(xdr, result->fattr);
1889 if (unlikely(error))
1890 goto out;
1891 error = decode_wcc_data(xdr, result->dir_attr);
1892 if (unlikely(error))
1893 goto out;
1894 if (status != NFS3_OK)
1895 goto out_status;
1896out:
1897 return error;
1898out_status:
1899 return nfs_stat_to_errno(status);
1900}
1901
1902/**
1903 * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
1904 * the local page cache
1905 * @xdr: XDR stream where entry resides
1906 * @entry: buffer to fill in with entry data
1907 * @plus: boolean indicating whether this should be a readdirplus entry
1908 *
1909 * Returns zero if successful, otherwise a negative errno value is
1910 * returned.
1911 *
1912 * This function is not invoked during READDIR reply decoding, but
1913 * rather whenever an application invokes the getdents(2) system call
1914 * on a directory already in our cache.
1915 *
1916 * 3.3.16 entry3
1917 *
1918 * struct entry3 {
1919 * fileid3 fileid;
1920 * filename3 name;
1921 * cookie3 cookie;
1922 * fhandle3 filehandle;
1923 * post_op_attr3 attributes;
1924 * entry3 *nextentry;
1925 * };
1926 *
1927 * 3.3.17 entryplus3
1928 * struct entryplus3 {
1929 * fileid3 fileid;
1930 * filename3 name;
1931 * cookie3 cookie;
1932 * post_op_attr name_attributes;
1933 * post_op_fh3 name_handle;
1934 * entryplus3 *nextentry;
1935 * };
1936 */
1937int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
1938 int plus)
1939{
1940 struct nfs_entry old = *entry;
1941 __be32 *p;
1942 int error;
1943
1944 p = xdr_inline_decode(xdr, 4);
1945 if (unlikely(p == NULL))
1946 goto out_overflow;
1947 if (*p == xdr_zero) {
1948 p = xdr_inline_decode(xdr, 4);
1949 if (unlikely(p == NULL))
1950 goto out_overflow;
1951 if (*p == xdr_zero)
1952 return -EAGAIN;
1953 entry->eof = 1;
1954 return -EBADCOOKIE;
1955 }
1031 1956
1032 status = ntohl(*p++); 1957 error = decode_fileid3(xdr, &entry->ino);
1958 if (unlikely(error))
1959 return error;
1033 1960
1034 p = xdr_decode_post_op_attr(p, res->fattr); 1961 error = decode_inline_filename3(xdr, &entry->name, &entry->len);
1035 if (status != 0) 1962 if (unlikely(error))
1036 return nfs_stat_to_errno(status); 1963 return error;
1037 1964
1038 res->rtmax = ntohl(*p++); 1965 entry->prev_cookie = entry->cookie;
1039 res->rtpref = ntohl(*p++); 1966 error = decode_cookie3(xdr, &entry->cookie);
1040 res->rtmult = ntohl(*p++); 1967 if (unlikely(error))
1041 res->wtmax = ntohl(*p++); 1968 return error;
1042 res->wtpref = ntohl(*p++); 1969
1043 res->wtmult = ntohl(*p++); 1970 entry->d_type = DT_UNKNOWN;
1044 res->dtpref = ntohl(*p++); 1971
1045 p = xdr_decode_hyper(p, &res->maxfilesize); 1972 if (plus) {
1973 entry->fattr->valid = 0;
1974 error = decode_post_op_attr(xdr, entry->fattr);
1975 if (unlikely(error))
1976 return error;
1977 if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
1978 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
1979
1980 /* In fact, a post_op_fh3: */
1981 p = xdr_inline_decode(xdr, 4);
1982 if (unlikely(p == NULL))
1983 goto out_overflow;
1984 if (*p != xdr_zero) {
1985 error = decode_nfs_fh3(xdr, entry->fh);
1986 if (unlikely(error)) {
1987 if (error == -E2BIG)
1988 goto out_truncated;
1989 return error;
1990 }
1991 } else
1992 zero_nfs_fh3(entry->fh);
1993 }
1046 1994
1047 /* ignore time_delta and properties */
1048 res->lease_time = 0;
1049 return 0; 1995 return 0;
1996
1997out_overflow:
1998 print_overflow_msg(__func__, xdr);
1999 return -EAGAIN;
2000out_truncated:
2001 dprintk("NFS: directory entry contains invalid file handle\n");
2002 *entry = old;
2003 return -EAGAIN;
1050} 2004}
1051 2005
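
The entry decoder above walks an XDR linked list: every entry is preceded by a one-word "value follows" boolean, and once that word reads zero the dirlist's eof boolean comes next. A user-space sketch of that framing, with host-endian words and bare fileids standing in for full entries:

    #include <stdint.h>
    #include <stdio.h>

    /* Each element: [1][fileid] ... ; terminator: [0][eof]. */
    int main(void)
    {
        uint32_t stream[] = { 1, 101, 1, 102, 0, 1 };  /* two entries, then eof=1 */
        size_t i = 0;

        while (stream[i] != 0) {            /* "value follows" discriminant */
            uint32_t fileid = stream[i + 1];
            printf("entry fileid=%u\n", fileid);
            i += 2;
        }
        printf("eof=%u\n", stream[i + 1]);  /* boolean after the terminator */
        return 0;
    }
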
1052/* 2006/*
1053 * Decode PATHCONF reply 2007 * 3.3.16 READDIR3res
2008 *
2009 * struct dirlist3 {
2010 * entry3 *entries;
2011 * bool eof;
2012 * };
2013 *
2014 * struct READDIR3resok {
2015 * post_op_attr dir_attributes;
2016 * cookieverf3 cookieverf;
2017 * dirlist3 reply;
2018 * };
2019 *
2020 * struct READDIR3resfail {
2021 * post_op_attr dir_attributes;
2022 * };
2023 *
2024 * union READDIR3res switch (nfsstat3 status) {
2025 * case NFS3_OK:
2026 * READDIR3resok resok;
2027 * default:
2028 * READDIR3resfail resfail;
2029 * };
2030 *
2031 * Read the directory contents into the page cache, but otherwise
 2032 * don't touch them. The actual decoding is done by nfs3_decode_dirent()
2033 * during subsequent nfs_readdir() calls.
1054 */ 2034 */
1055static int 2035static int decode_dirlist3(struct xdr_stream *xdr)
1056nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res)
1057{ 2036{
1058 int status; 2037 u32 recvd, pglen;
2038 size_t hdrlen;
1059 2039
1060 status = ntohl(*p++); 2040 pglen = xdr->buf->page_len;
2041 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
2042 recvd = xdr->buf->len - hdrlen;
2043 if (unlikely(pglen > recvd))
2044 goto out_cheating;
2045out:
2046 xdr_read_pages(xdr, pglen);
2047 return pglen;
2048out_cheating:
2049 dprintk("NFS: server cheating in readdir result: "
2050 "pglen %u > recvd %u\n", pglen, recvd);
2051 pglen = recvd;
2052 goto out;
2053}
1061 2054
1062 p = xdr_decode_post_op_attr(p, res->fattr); 2055static int decode_readdir3resok(struct xdr_stream *xdr,
1063 if (status != 0) 2056 struct nfs3_readdirres *result)
1064 return nfs_stat_to_errno(status); 2057{
1065 res->max_link = ntohl(*p++); 2058 int error;
1066 res->max_namelen = ntohl(*p++); 2059
2060 error = decode_post_op_attr(xdr, result->dir_attr);
2061 if (unlikely(error))
2062 goto out;
2063 /* XXX: do we need to check if result->verf != NULL ? */
2064 error = decode_cookieverf3(xdr, result->verf);
2065 if (unlikely(error))
2066 goto out;
2067 error = decode_dirlist3(xdr);
2068out:
2069 return error;
2070}
1067 2071
1068 /* ignore remaining fields */ 2072static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
1069 return 0; 2073 struct xdr_stream *xdr,
2074 struct nfs3_readdirres *result)
2075{
2076 enum nfs_stat status;
2077 int error;
2078
2079 error = decode_nfsstat3(xdr, &status);
2080 if (unlikely(error))
2081 goto out;
2082 if (status != NFS3_OK)
2083 goto out_default;
2084 error = decode_readdir3resok(xdr, result);
2085out:
2086 return error;
2087out_default:
2088 error = decode_post_op_attr(xdr, result->dir_attr);
2089 if (unlikely(error))
2090 goto out;
2091 return nfs_stat_to_errno(status);
1070} 2092}
1071 2093
1072/* 2094/*
1073 * Decode COMMIT reply 2095 * 3.3.18 FSSTAT3res
2096 *
2097 * struct FSSTAT3resok {
2098 * post_op_attr obj_attributes;
2099 * size3 tbytes;
2100 * size3 fbytes;
2101 * size3 abytes;
2102 * size3 tfiles;
2103 * size3 ffiles;
2104 * size3 afiles;
2105 * uint32 invarsec;
2106 * };
2107 *
2108 * struct FSSTAT3resfail {
2109 * post_op_attr obj_attributes;
2110 * };
2111 *
2112 * union FSSTAT3res switch (nfsstat3 status) {
2113 * case NFS3_OK:
2114 * FSSTAT3resok resok;
2115 * default:
2116 * FSSTAT3resfail resfail;
2117 * };
1074 */ 2118 */
1075static int 2119static int decode_fsstat3resok(struct xdr_stream *xdr,
1076nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) 2120 struct nfs_fsstat *result)
1077{ 2121{
1078 int status; 2122 __be32 *p;
2123
2124 p = xdr_inline_decode(xdr, 8 * 6 + 4);
2125 if (unlikely(p == NULL))
2126 goto out_overflow;
2127 p = xdr_decode_size3(p, &result->tbytes);
2128 p = xdr_decode_size3(p, &result->fbytes);
2129 p = xdr_decode_size3(p, &result->abytes);
2130 p = xdr_decode_size3(p, &result->tfiles);
2131 p = xdr_decode_size3(p, &result->ffiles);
2132 xdr_decode_size3(p, &result->afiles);
2133 /* ignore invarsec */
2134 return 0;
2135out_overflow:
2136 print_overflow_msg(__func__, xdr);
2137 return -EIO;
2138}
1079 2139
1080 status = ntohl(*p++); 2140static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
1081 p = xdr_decode_wcc_data(p, res->fattr); 2141 struct xdr_stream *xdr,
1082 if (status != 0) 2142 struct nfs_fsstat *result)
1083 return nfs_stat_to_errno(status); 2143{
2144 enum nfs_stat status;
2145 int error;
2146
2147 error = decode_nfsstat3(xdr, &status);
2148 if (unlikely(error))
2149 goto out;
2150 error = decode_post_op_attr(xdr, result->fattr);
2151 if (unlikely(error))
2152 goto out;
2153 if (status != NFS3_OK)
2154 goto out_status;
2155 error = decode_fsstat3resok(xdr, result);
2156out:
2157 return error;
2158out_status:
2159 return nfs_stat_to_errno(status);
2160}
1084 2161
1085 res->verf->verifier[0] = *p++; 2162/*
1086 res->verf->verifier[1] = *p++; 2163 * 3.3.19 FSINFO3res
2164 *
2165 * struct FSINFO3resok {
2166 * post_op_attr obj_attributes;
2167 * uint32 rtmax;
2168 * uint32 rtpref;
2169 * uint32 rtmult;
2170 * uint32 wtmax;
2171 * uint32 wtpref;
2172 * uint32 wtmult;
2173 * uint32 dtpref;
2174 * size3 maxfilesize;
2175 * nfstime3 time_delta;
2176 * uint32 properties;
2177 * };
2178 *
2179 * struct FSINFO3resfail {
2180 * post_op_attr obj_attributes;
2181 * };
2182 *
2183 * union FSINFO3res switch (nfsstat3 status) {
2184 * case NFS3_OK:
2185 * FSINFO3resok resok;
2186 * default:
2187 * FSINFO3resfail resfail;
2188 * };
2189 */
2190static int decode_fsinfo3resok(struct xdr_stream *xdr,
2191 struct nfs_fsinfo *result)
2192{
2193 __be32 *p;
2194
2195 p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
2196 if (unlikely(p == NULL))
2197 goto out_overflow;
2198 result->rtmax = be32_to_cpup(p++);
2199 result->rtpref = be32_to_cpup(p++);
2200 result->rtmult = be32_to_cpup(p++);
2201 result->wtmax = be32_to_cpup(p++);
2202 result->wtpref = be32_to_cpup(p++);
2203 result->wtmult = be32_to_cpup(p++);
2204 result->dtpref = be32_to_cpup(p++);
2205 p = xdr_decode_size3(p, &result->maxfilesize);
2206 xdr_decode_nfstime3(p, &result->time_delta);
2207
2208 /* ignore properties */
2209 result->lease_time = 0;
1087 return 0; 2210 return 0;
2211out_overflow:
2212 print_overflow_msg(__func__, xdr);
2213 return -EIO;
2214}
2215
2216static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
2217 struct xdr_stream *xdr,
2218 struct nfs_fsinfo *result)
2219{
2220 enum nfs_stat status;
2221 int error;
2222
2223 error = decode_nfsstat3(xdr, &status);
2224 if (unlikely(error))
2225 goto out;
2226 error = decode_post_op_attr(xdr, result->fattr);
2227 if (unlikely(error))
2228 goto out;
2229 if (status != NFS3_OK)
2230 goto out_status;
2231 error = decode_fsinfo3resok(xdr, result);
2232out:
2233 return error;
2234out_status:
2235 return nfs_stat_to_errno(status);
1088} 2236}
1089 2237
1090#ifdef CONFIG_NFS_V3_ACL
1091/* 2238/*
1092 * Decode GETACL reply 2239 * 3.3.20 PATHCONF3res
2240 *
2241 * struct PATHCONF3resok {
2242 * post_op_attr obj_attributes;
2243 * uint32 linkmax;
2244 * uint32 name_max;
2245 * bool no_trunc;
2246 * bool chown_restricted;
2247 * bool case_insensitive;
2248 * bool case_preserving;
2249 * };
2250 *
2251 * struct PATHCONF3resfail {
2252 * post_op_attr obj_attributes;
2253 * };
2254 *
2255 * union PATHCONF3res switch (nfsstat3 status) {
2256 * case NFS3_OK:
2257 * PATHCONF3resok resok;
2258 * default:
2259 * PATHCONF3resfail resfail;
2260 * };
1093 */ 2261 */
1094static int 2262static int decode_pathconf3resok(struct xdr_stream *xdr,
1095nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p, 2263 struct nfs_pathconf *result)
1096 struct nfs3_getaclres *res)
1097{ 2264{
1098 struct xdr_buf *buf = &req->rq_rcv_buf; 2265 __be32 *p;
1099 int status = ntohl(*p++);
1100 struct posix_acl **acl;
1101 unsigned int *aclcnt;
1102 int err, base;
1103
1104 if (status != 0)
1105 return nfs_stat_to_errno(status);
1106 p = xdr_decode_post_op_attr(p, res->fattr);
1107 res->mask = ntohl(*p++);
1108 if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
1109 return -EINVAL;
1110 base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base;
1111 2266
1112 acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; 2267 p = xdr_inline_decode(xdr, 4 * 6);
1113 aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; 2268 if (unlikely(p == NULL))
1114 err = nfsacl_decode(buf, base, aclcnt, acl); 2269 goto out_overflow;
2270 result->max_link = be32_to_cpup(p++);
2271 result->max_namelen = be32_to_cpup(p);
2272 /* ignore remaining fields */
2273 return 0;
2274out_overflow:
2275 print_overflow_msg(__func__, xdr);
2276 return -EIO;
2277}
1115 2278
1116 acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; 2279static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
1117 aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; 2280 struct xdr_stream *xdr,
1118 if (err > 0) 2281 struct nfs_pathconf *result)
1119 err = nfsacl_decode(buf, base + err, aclcnt, acl); 2282{
1120 return (err > 0) ? 0 : err; 2283 enum nfs_stat status;
2284 int error;
2285
2286 error = decode_nfsstat3(xdr, &status);
2287 if (unlikely(error))
2288 goto out;
2289 error = decode_post_op_attr(xdr, result->fattr);
2290 if (unlikely(error))
2291 goto out;
2292 if (status != NFS3_OK)
2293 goto out_status;
2294 error = decode_pathconf3resok(xdr, result);
2295out:
2296 return error;
2297out_status:
2298 return nfs_stat_to_errno(status);
1121} 2299}
1122 2300
1123/* 2301/*
1124 * Decode setacl reply. 2302 * 3.3.21 COMMIT3res
2303 *
2304 * struct COMMIT3resok {
2305 * wcc_data file_wcc;
2306 * writeverf3 verf;
2307 * };
2308 *
2309 * struct COMMIT3resfail {
2310 * wcc_data file_wcc;
2311 * };
2312 *
2313 * union COMMIT3res switch (nfsstat3 status) {
2314 * case NFS3_OK:
2315 * COMMIT3resok resok;
2316 * default:
2317 * COMMIT3resfail resfail;
2318 * };
1125 */ 2319 */
1126static int 2320static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
1127nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) 2321 struct xdr_stream *xdr,
2322 struct nfs_writeres *result)
1128{ 2323{
1129 int status = ntohl(*p++); 2324 enum nfs_stat status;
2325 int error;
2326
2327 error = decode_nfsstat3(xdr, &status);
2328 if (unlikely(error))
2329 goto out;
2330 error = decode_wcc_data(xdr, result->fattr);
2331 if (unlikely(error))
2332 goto out;
2333 if (status != NFS3_OK)
2334 goto out_status;
2335 error = decode_writeverf3(xdr, result->verf->verifier);
2336out:
2337 return error;
2338out_status:
2339 return nfs_stat_to_errno(status);
2340}
1130 2341
1131 if (status) 2342#ifdef CONFIG_NFS_V3_ACL
1132 return nfs_stat_to_errno(status); 2343
1133 xdr_decode_post_op_attr(p, fattr); 2344static inline int decode_getacl3resok(struct xdr_stream *xdr,
1134 return 0; 2345 struct nfs3_getaclres *result)
2346{
2347 struct posix_acl **acl;
2348 unsigned int *aclcnt;
2349 size_t hdrlen;
2350 int error;
2351
2352 error = decode_post_op_attr(xdr, result->fattr);
2353 if (unlikely(error))
2354 goto out;
2355 error = decode_uint32(xdr, &result->mask);
2356 if (unlikely(error))
2357 goto out;
2358 error = -EINVAL;
2359 if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
2360 goto out;
2361
2362 hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
2363
2364 acl = NULL;
2365 if (result->mask & NFS_ACL)
2366 acl = &result->acl_access;
2367 aclcnt = NULL;
2368 if (result->mask & NFS_ACLCNT)
2369 aclcnt = &result->acl_access_count;
2370 error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
2371 if (unlikely(error <= 0))
2372 goto out;
2373
2374 acl = NULL;
2375 if (result->mask & NFS_DFACL)
2376 acl = &result->acl_default;
2377 aclcnt = NULL;
2378 if (result->mask & NFS_DFACLCNT)
2379 aclcnt = &result->acl_default_count;
2380 error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
2381 if (unlikely(error <= 0))
2382 return error;
2383 error = 0;
2384out:
2385 return error;
2386}
2387
2388static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
2389 struct xdr_stream *xdr,
2390 struct nfs3_getaclres *result)
2391{
2392 enum nfs_stat status;
2393 int error;
2394
2395 error = decode_nfsstat3(xdr, &status);
2396 if (unlikely(error))
2397 goto out;
2398 if (status != NFS3_OK)
2399 goto out_default;
2400 error = decode_getacl3resok(xdr, result);
2401out:
2402 return error;
2403out_default:
2404 return nfs_stat_to_errno(status);
1135} 2405}
2406
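
decode_getacl3resok() chains two nfsacl_decode() passes: the first returns the number of bytes the access ACL occupied, and that length becomes the offset of the default ACL. A sketch of the offset chaining; fake_acl_decode() and its fixed lengths are stand-ins for the real nfsacl_decode():

    #include <stdio.h>

    /* Stand-in: pretends an ACL occupies acl_len bytes and returns
     * the length consumed (> 0) on success. */
    static int fake_acl_decode(unsigned int base, unsigned int acl_len)
    {
        printf("decoding ACL at offset %u\n", base);
        return (int)acl_len;
    }

    int main(void)
    {
        unsigned int hdrlen = 84;             /* bytes already consumed */
        int n = fake_acl_decode(hdrlen, 40);  /* access ACL */
        if (n <= 0)
            return 1;
        n = fake_acl_decode(hdrlen + n, 28);  /* default ACL starts right after */
        return n <= 0;
    }
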
2407static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
2408 struct xdr_stream *xdr,
2409 struct nfs_fattr *result)
2410{
2411 enum nfs_stat status;
2412 int error;
2413
2414 error = decode_nfsstat3(xdr, &status);
2415 if (unlikely(error))
2416 goto out;
2417 if (status != NFS3_OK)
2418 goto out_default;
2419 error = decode_post_op_attr(xdr, result);
2420out:
2421 return error;
2422out_default:
2423 return nfs_stat_to_errno(status);
2424}
2425
1136#endif /* CONFIG_NFS_V3_ACL */ 2426#endif /* CONFIG_NFS_V3_ACL */
1137 2427
1138#define PROC(proc, argtype, restype, timer) \ 2428#define PROC(proc, argtype, restype, timer) \
1139[NFS3PROC_##proc] = { \ 2429[NFS3PROC_##proc] = { \
1140 .p_proc = NFS3PROC_##proc, \ 2430 .p_proc = NFS3PROC_##proc, \
1141 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ 2431 .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \
1142 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ 2432 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \
1143 .p_arglen = NFS3_##argtype##_sz, \ 2433 .p_arglen = NFS3_##argtype##args_sz, \
1144 .p_replen = NFS3_##restype##_sz, \ 2434 .p_replen = NFS3_##restype##res_sz, \
1145 .p_timer = timer, \ 2435 .p_timer = timer, \
1146 .p_statidx = NFS3PROC_##proc, \ 2436 .p_statidx = NFS3PROC_##proc, \
1147 .p_name = #proc, \ 2437 .p_name = #proc, \
1148 } 2438 }
1149 2439
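
The rewritten PROC() macro token-pastes the argtype/restype names into both the handler symbols and the size constants, so PROC(GETATTR, getattr, getattr, 1) now picks up nfs3_xdr_enc_getattr3args and nfs3_xdr_dec_getattr3res. The pasting reduced to a compilable sketch with two stub handlers:

    #include <stdio.h>

    static void enc_getattr3args(void) { puts("encode GETATTR args"); }
    static void dec_getattr3res(void)  { puts("decode GETATTR result"); }

    /* Same ##-pasting idea as the kernel's PROC() macro, reduced to
     * two function pointers. */
    #define PROC(argtype, restype) { enc_##argtype##3args, dec_##restype##3res }

    struct procinfo { void (*enc)(void); void (*dec)(void); };

    int main(void)
    {
        struct procinfo p = PROC(getattr, getattr);
        p.enc();
        p.dec();
        return 0;
    }
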
1150struct rpc_procinfo nfs3_procedures[] = { 2440struct rpc_procinfo nfs3_procedures[] = {
1151 PROC(GETATTR, fhandle, attrstat, 1), 2441 PROC(GETATTR, getattr, getattr, 1),
1152 PROC(SETATTR, sattrargs, wccstat, 0), 2442 PROC(SETATTR, setattr, setattr, 0),
1153 PROC(LOOKUP, diropargs, lookupres, 2), 2443 PROC(LOOKUP, lookup, lookup, 2),
1154 PROC(ACCESS, accessargs, accessres, 1), 2444 PROC(ACCESS, access, access, 1),
1155 PROC(READLINK, readlinkargs, readlinkres, 3), 2445 PROC(READLINK, readlink, readlink, 3),
1156 PROC(READ, readargs, readres, 3), 2446 PROC(READ, read, read, 3),
1157 PROC(WRITE, writeargs, writeres, 4), 2447 PROC(WRITE, write, write, 4),
1158 PROC(CREATE, createargs, createres, 0), 2448 PROC(CREATE, create, create, 0),
1159 PROC(MKDIR, mkdirargs, createres, 0), 2449 PROC(MKDIR, mkdir, create, 0),
1160 PROC(SYMLINK, symlinkargs, createres, 0), 2450 PROC(SYMLINK, symlink, create, 0),
1161 PROC(MKNOD, mknodargs, createres, 0), 2451 PROC(MKNOD, mknod, create, 0),
1162 PROC(REMOVE, removeargs, removeres, 0), 2452 PROC(REMOVE, remove, remove, 0),
1163 PROC(RMDIR, diropargs, wccstat, 0), 2453 PROC(RMDIR, lookup, setattr, 0),
1164 PROC(RENAME, renameargs, renameres, 0), 2454 PROC(RENAME, rename, rename, 0),
1165 PROC(LINK, linkargs, linkres, 0), 2455 PROC(LINK, link, link, 0),
1166 PROC(READDIR, readdirargs, readdirres, 3), 2456 PROC(READDIR, readdir, readdir, 3),
1167 PROC(READDIRPLUS, readdirargs, readdirres, 3), 2457 PROC(READDIRPLUS, readdirplus, readdir, 3),
1168 PROC(FSSTAT, fhandle, fsstatres, 0), 2458 PROC(FSSTAT, getattr, fsstat, 0),
1169 PROC(FSINFO, fhandle, fsinfores, 0), 2459 PROC(FSINFO, getattr, fsinfo, 0),
1170 PROC(PATHCONF, fhandle, pathconfres, 0), 2460 PROC(PATHCONF, getattr, pathconf, 0),
1171 PROC(COMMIT, commitargs, commitres, 5), 2461 PROC(COMMIT, commit, commit, 5),
1172}; 2462};
1173 2463
1174struct rpc_version nfs_version3 = { 2464struct rpc_version nfs_version3 = {
@@ -1181,8 +2471,8 @@ struct rpc_version nfs_version3 = {
1181static struct rpc_procinfo nfs3_acl_procedures[] = { 2471static struct rpc_procinfo nfs3_acl_procedures[] = {
1182 [ACLPROC3_GETACL] = { 2472 [ACLPROC3_GETACL] = {
1183 .p_proc = ACLPROC3_GETACL, 2473 .p_proc = ACLPROC3_GETACL,
1184 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, 2474 .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args,
1185 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, 2475 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,
1186 .p_arglen = ACL3_getaclargs_sz, 2476 .p_arglen = ACL3_getaclargs_sz,
1187 .p_replen = ACL3_getaclres_sz, 2477 .p_replen = ACL3_getaclres_sz,
1188 .p_timer = 1, 2478 .p_timer = 1,
@@ -1190,8 +2480,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1190 }, 2480 },
1191 [ACLPROC3_SETACL] = { 2481 [ACLPROC3_SETACL] = {
1192 .p_proc = ACLPROC3_SETACL, 2482 .p_proc = ACLPROC3_SETACL,
1193 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, 2483 .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args,
1194 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, 2484 .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,
1195 .p_arglen = ACL3_setaclargs_sz, 2485 .p_arglen = ACL3_setaclargs_sz,
1196 .p_replen = ACL3_setaclres_sz, 2486 .p_replen = ACL3_setaclres_sz,
1197 .p_timer = 0, 2487 .p_timer = 0,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 311e15cc8af0..7a7474073148 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
44 NFS4CLNT_RECLAIM_REBOOT, 44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_LAYOUTRECALL,
47 NFS4CLNT_SESSION_RESET, 48 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_RECALL_SLOT, 49 NFS4CLNT_RECALL_SLOT,
49}; 50};
@@ -109,7 +110,7 @@ struct nfs_unique_id {
109struct nfs4_state_owner { 110struct nfs4_state_owner {
110 struct nfs_unique_id so_owner_id; 111 struct nfs_unique_id so_owner_id;
111 struct nfs_server *so_server; 112 struct nfs_server *so_server;
112 struct rb_node so_client_node; 113 struct rb_node so_server_node;
113 114
114 struct rpc_cred *so_cred; /* Associated cred */ 115 struct rpc_cred *so_cred; /* Associated cred */
115 116
@@ -227,12 +228,6 @@ struct nfs4_state_maintenance_ops {
227extern const struct dentry_operations nfs4_dentry_operations; 228extern const struct dentry_operations nfs4_dentry_operations;
228extern const struct inode_operations nfs4_dir_inode_operations; 229extern const struct inode_operations nfs4_dir_inode_operations;
229 230
230/* inode.c */
231extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t);
232extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int);
233extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
234
235
236/* nfs4proc.c */ 231/* nfs4proc.c */
237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 232extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 233extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
@@ -241,13 +236,12 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 236extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 237extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 238extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); 239extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
245extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
246extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
247extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 240extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
248extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 241extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
249 struct nfs4_fs_locations *fs_locations, struct page *page); 242 struct nfs4_fs_locations *fs_locations, struct page *page);
250extern void nfs4_release_lockowner(const struct nfs4_lock_state *); 243extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
244extern const struct xattr_handler *nfs4_xattr_handlers[];
251 245
252#if defined(CONFIG_NFS_V4_1) 246#if defined(CONFIG_NFS_V4_1)
253static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 247static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -333,7 +327,6 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
333extern const nfs4_stateid zero_stateid; 327extern const nfs4_stateid zero_stateid;
334 328
335/* nfs4xdr.c */ 329/* nfs4xdr.c */
336extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
337extern struct rpc_procinfo nfs4_procedures[]; 330extern struct rpc_procinfo nfs4_procedures[];
338 331
339struct nfs4_mount_data; 332struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 000000000000..23f930caf1e2
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,280 @@
1/*
2 * Module for the pnfs nfs4 file layout driver.
3 * Defines all I/O and Policy interface operations, plus code
4 * to register itself with the pNFS client.
5 *
6 * Copyright (c) 2002
7 * The Regents of the University of Michigan
8 * All Rights Reserved
9 *
10 * Dean Hildebrand <dhildebz@umich.edu>
11 *
12 * Permission is granted to use, copy, create derivative works, and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the University of Michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. If
17 * the above copyright notice or any other identification of the
18 * University of Michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * This software is provided as is, without representation or warranty
22 * of any kind either express or implied, including without limitation
23 * the implied warranties of merchantability, fitness for a particular
24 * purpose, or noninfringement. The Regents of the University of
25 * Michigan shall not be liable for any damages, including special,
26 * indirect, incidental, or consequential damages, with respect to any
27 * claim arising out of or in connection with the use of the software,
28 * even if it has been or is hereafter advised of the possibility of
29 * such damages.
30 */
31
32#include <linux/nfs_fs.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42
43static int
44filelayout_set_layoutdriver(struct nfs_server *nfss)
45{
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
47 nfs4_fl_free_deviceid_callback);
48 if (status) {
49 printk(KERN_WARNING "%s: deviceid cache could not be "
50 "initialized\n", __func__);
51 return status;
52 }
53 dprintk("%s: deviceid cache has been initialized successfully\n",
54 __func__);
55 return 0;
56}
57
58/* Clear out the layout by destroying its device list */
59static int
60filelayout_clear_layoutdriver(struct nfs_server *nfss)
61{
62 dprintk("--> %s\n", __func__);
63
64 if (nfss->nfs_client->cl_devid_cache)
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0;
67}
68
69/*
70 * filelayout_check_layout()
71 *
72 * Make sure layout segment parameters are sane WRT the device.
73 * At this point no generic layer initialization of the lseg has occurred,
74 * and nothing has been added to the layout_hdr cache.
75 *
76 */
77static int
78filelayout_check_layout(struct pnfs_layout_hdr *lo,
79 struct nfs4_filelayout_segment *fl,
80 struct nfs4_layoutget_res *lgr,
81 struct nfs4_deviceid *id)
82{
83 struct nfs4_file_layout_dsaddr *dsaddr;
84 int status = -EINVAL;
85 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
86
87 dprintk("--> %s\n", __func__);
88
89 if (fl->pattern_offset > lgr->range.offset) {
90 dprintk("%s pattern_offset %lld to large\n",
91 __func__, fl->pattern_offset);
92 goto out;
93 }
94
95 if (fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n",
97 __func__, fl->stripe_unit);
98 goto out;
99 }
100
101 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
103 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL)
106 goto out;
107 }
108 fl->dsaddr = dsaddr;
109
110 if (fl->first_stripe_index < 0 ||
111 fl->first_stripe_index >= dsaddr->stripe_count) {
112 dprintk("%s Bad first_stripe_index %d\n",
113 __func__, fl->first_stripe_index);
114 goto out_put;
115 }
116
117 if ((fl->stripe_type == STRIPE_SPARSE &&
118 fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
119 (fl->stripe_type == STRIPE_DENSE &&
120 fl->num_fh != dsaddr->stripe_count)) {
121 dprintk("%s num_fh %u not valid for given packing\n",
122 __func__, fl->num_fh);
123 goto out_put;
124 }
125
126 if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
127 dprintk("%s Stripe unit (%u) not aligned with rsize %u "
128 "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
129 nfss->wsize);
130 }
131
132 status = 0;
133out:
134 dprintk("--> %s returns %d\n", __func__, status);
135 return status;
136out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
138 goto out;
139}
140
141static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
142{
143 int i;
144
145 for (i = 0; i < fl->num_fh; i++) {
146 if (!fl->fh_array[i])
147 break;
148 kfree(fl->fh_array[i]);
149 }
150 kfree(fl->fh_array);
151 fl->fh_array = NULL;
152}
153
154static void
155_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
156{
157 filelayout_free_fh_array(fl);
158 kfree(fl);
159}
160
161static int
162filelayout_decode_layout(struct pnfs_layout_hdr *flo,
163 struct nfs4_filelayout_segment *fl,
164 struct nfs4_layoutget_res *lgr,
165 struct nfs4_deviceid *id)
166{
167 uint32_t *p = (uint32_t *)lgr->layout.buf;
168 uint32_t nfl_util;
169 int i;
170
171 dprintk("%s: set_layout_map Begin\n", __func__);
172
173 memcpy(id, p, sizeof(*id));
174 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
175 print_deviceid(id);
176
177 nfl_util = be32_to_cpup(p++);
178 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
179 fl->commit_through_mds = 1;
180 if (nfl_util & NFL4_UFLG_DENSE)
181 fl->stripe_type = STRIPE_DENSE;
182 else
183 fl->stripe_type = STRIPE_SPARSE;
184 fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
185
186 fl->first_stripe_index = be32_to_cpup(p++);
187 p = xdr_decode_hyper(p, &fl->pattern_offset);
188 fl->num_fh = be32_to_cpup(p++);
189
190 dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
191 __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
192 fl->pattern_offset);
193
194 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
195 GFP_KERNEL);
196 if (!fl->fh_array)
197 return -ENOMEM;
198
199 for (i = 0; i < fl->num_fh; i++) {
200 /* Do we want to use a mempool here? */
201 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
202 if (!fl->fh_array[i]) {
203 filelayout_free_fh_array(fl);
204 return -ENOMEM;
205 }
206 fl->fh_array[i]->size = be32_to_cpup(p++);
207 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
208 printk(KERN_ERR "Too big fh %d received %d\n",
209 i, fl->fh_array[i]->size);
210 filelayout_free_fh_array(fl);
211 return -EIO;
212 }
213 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
214 p += XDR_QUADLEN(fl->fh_array[i]->size);
215 dprintk("DEBUG: %s: fh len %d\n", __func__,
216 fl->fh_array[i]->size);
217 }
218
219 return 0;
220}
221
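
filelayout_decode_layout() above unpacks three fields from the single nfl_util word: the commit-through-MDS and dense/sparse flag bits, plus the stripe unit held in the bits above NFL4_UFLG_MASK. A sketch of the unpacking; the flag values follow RFC 5661's layout utilization word:

    #include <stdio.h>
    #include <stdint.h>

    #define NFL4_UFLG_MASK            0x0000003F
    #define NFL4_UFLG_DENSE           0x00000001
    #define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002

    int main(void)
    {
        uint32_t nfl_util = 0x00010002;  /* 64 KB stripe unit, commit-through-MDS */

        printf("commit_through_mds=%d dense=%d stripe_unit=%u\n",
               !!(nfl_util & NFL4_UFLG_COMMIT_THRU_MDS),
               !!(nfl_util & NFL4_UFLG_DENSE),
               nfl_util & ~NFL4_UFLG_MASK);
        return 0;
    }
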
222static struct pnfs_layout_segment *
223filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
224 struct nfs4_layoutget_res *lgr)
225{
226 struct nfs4_filelayout_segment *fl;
227 int rc;
228 struct nfs4_deviceid id;
229
230 dprintk("--> %s\n", __func__);
231 fl = kzalloc(sizeof(*fl), GFP_KERNEL);
232 if (!fl)
233 return NULL;
234
235 rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
236 if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
237 _filelayout_free_lseg(fl);
238 return NULL;
239 }
240 return &fl->generic_hdr;
241}
242
243static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{
246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248
249 dprintk("--> %s\n", __func__);
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
251 &fl->dsaddr->deviceid);
252 _filelayout_free_lseg(fl);
253}
254
255static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver,
260 .clear_layoutdriver = filelayout_clear_layoutdriver,
261 .alloc_lseg = filelayout_alloc_lseg,
262 .free_lseg = filelayout_free_lseg,
263};
264
265static int __init nfs4filelayout_init(void)
266{
267 printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
268 __func__);
269 return pnfs_register_layoutdriver(&filelayout_type);
270}
271
272static void __exit nfs4filelayout_exit(void)
273{
274 printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
275 __func__);
276 pnfs_unregister_layoutdriver(&filelayout_type);
277}
278
279module_init(nfs4filelayout_init);
280module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 000000000000..bbf60dd2ab9d
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
1/*
2 * NFSv4 file layout driver data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_NFS4FILELAYOUT_H
31#define FS_NFS_NFS4FILELAYOUT_H
32
33#include "pnfs.h"
34
35/*
 36 * Field testing shows we need to support up to 4096 stripe indices.
37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
38 * reasonable. This in turn means we support a maximum of 256
39 * RFC 5661 multipath_list4 structures.
40 */
41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
43
44enum stripetype4 {
45 STRIPE_SPARSE = 1,
46 STRIPE_DENSE = 2
47};
48
49/* Individual ip address */
50struct nfs4_pnfs_ds {
51 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
52 u32 ds_ip_addr;
53 u32 ds_port;
54 struct nfs_client *ds_clp;
55 atomic_t ds_count;
56};
57
58struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid;
60 u32 stripe_count;
61 u8 *stripe_indices;
62 u32 ds_num;
63 struct nfs4_pnfs_ds *ds_list[1];
64};
65
66struct nfs4_filelayout_segment {
67 struct pnfs_layout_segment generic_hdr;
68 u32 stripe_type;
69 u32 commit_through_mds;
70 u32 stripe_unit;
71 u32 first_stripe_index;
72 u64 pattern_offset;
73 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
74 unsigned int num_fh;
75 struct nfs_fh **fh_array;
76};
77
78static inline struct nfs4_filelayout_segment *
79FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_filelayout_segment,
83 generic_hdr);
84}
85
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
87extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id);
89extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
91struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93
94#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 000000000000..f5c9b125e8cc
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,453 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39/*
40 * Data server cache
41 *
42 * Data servers can be mapped to different device ids.
43 * nfs4_pnfs_ds reference counting
44 * - set to 1 on allocation
45 * - incremented when a device id maps a data server already in the cache.
46 * - decremented when deviceid is removed from the cache.
47 */
48DEFINE_SPINLOCK(nfs4_ds_cache_lock);
49static LIST_HEAD(nfs4_data_server_cache);
50
51/* Debug routines */
52void
53print_ds(struct nfs4_pnfs_ds *ds)
54{
55 if (ds == NULL) {
56 printk("%s NULL device\n", __func__);
57 return;
58 }
59 printk(" ip_addr %x port %hu\n"
60 " ref count %d\n"
61 " client %p\n"
62 " cl_exchange_flags %x\n",
63 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
64 atomic_read(&ds->ds_count), ds->ds_clp,
65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
66}
67
68void
69print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
70{
71 int i;
72
73 ifdebug(FACILITY) {
74 printk("%s dsaddr->ds_num %d\n", __func__,
75 dsaddr->ds_num);
76 for (i = 0; i < dsaddr->ds_num; i++)
77 print_ds(dsaddr->ds_list[i]);
78 }
79}
80
81void print_deviceid(struct nfs4_deviceid *id)
82{
83 u32 *p = (u32 *)id;
84
85 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
86 p[0], p[1], p[2], p[3]);
87}
88
89/* nfs4_ds_cache_lock is held */
90static struct nfs4_pnfs_ds *
91_data_server_lookup_locked(u32 ip_addr, u32 port)
92{
93 struct nfs4_pnfs_ds *ds;
94
95 dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
96 ntohl(ip_addr), ntohs(port));
97
98 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
99 if (ds->ds_ip_addr == ip_addr &&
100 ds->ds_port == port) {
101 return ds;
102 }
103 }
104 return NULL;
105}
106
107static void
108destroy_ds(struct nfs4_pnfs_ds *ds)
109{
110 dprintk("--> %s\n", __func__);
111 ifdebug(FACILITY)
112 print_ds(ds);
113
114 if (ds->ds_clp)
115 nfs_put_client(ds->ds_clp);
116 kfree(ds);
117}
118
119static void
120nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
121{
122 struct nfs4_pnfs_ds *ds;
123 int i;
124
125 print_deviceid(&dsaddr->deviceid.de_id);
126
127 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i];
129 if (ds != NULL) {
130 if (atomic_dec_and_lock(&ds->ds_count,
131 &nfs4_ds_cache_lock)) {
132 list_del_init(&ds->ds_node);
133 spin_unlock(&nfs4_ds_cache_lock);
134 destroy_ds(ds);
135 }
136 }
137 }
138 kfree(dsaddr->stripe_indices);
139 kfree(dsaddr);
140}
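
The atomic_dec_and_lock() call above drops one reference and returns with the cache lock held only when the count reaches zero, so the entry can be unlinked and freed without racing a concurrent lookup. A userspace stand-in for that contract (pthread/stdatomic sketch, not a kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
	int old = atomic_load(cnt);

	/* fast path: the count stays above zero, no lock needed */
	while (old > 1)
		if (atomic_compare_exchange_weak(cnt, &old, old - 1))
			return false;

	/* slow path: we may hit zero, so decide while holding the lock */
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(cnt, 1) == 1)
		return true;            /* reached zero: lock stays held */
	pthread_mutex_unlock(lock);
	return false;
}
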
141
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{
154 struct nfs4_pnfs_ds *tmp_ds, *ds;
155
156 ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
157 if (!ds)
158 goto out;
159
160 spin_lock(&nfs4_ds_cache_lock);
161 tmp_ds = _data_server_lookup_locked(ip_addr, port);
162 if (tmp_ds == NULL) {
163 ds->ds_ip_addr = ip_addr;
164 ds->ds_port = port;
165 atomic_set(&ds->ds_count, 1);
166 INIT_LIST_HEAD(&ds->ds_node);
167 ds->ds_clp = NULL;
168 list_add(&ds->ds_node, &nfs4_data_server_cache);
169 dprintk("%s add new data server ip 0x%x\n", __func__,
170 ds->ds_ip_addr);
171 } else {
172 kfree(ds);
173 atomic_inc(&tmp_ds->ds_count);
174 dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
175 __func__, tmp_ds->ds_ip_addr,
176 atomic_read(&tmp_ds->ds_count));
177 ds = tmp_ds;
178 }
179 spin_unlock(&nfs4_ds_cache_lock);
180out:
181 return ds;
182}
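
nfs4_pnfs_ds_add() allocates its candidate entry before taking nfs4_ds_cache_lock because kzalloc(GFP_KERNEL) may sleep while a spinlock is held only atomically. A userspace sketch of the same allocate-then-check-under-the-lock shape, with made-up names:

#include <pthread.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	unsigned int key;
	unsigned int refcount;
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache_head;

static struct entry *cache_add(unsigned int key)
{
	struct entry *e, *cand = calloc(1, sizeof(*cand)); /* before the lock */

	if (!cand)
		return NULL;
	pthread_mutex_lock(&cache_lock);
	for (e = cache_head; e; e = e->next)
		if (e->key == key)
			break;
	if (e) {
		e->refcount++;          /* hit: share the cached entry */
		free(cand);             /* our candidate lost the race */
	} else {
		cand->key = key;
		cand->refcount = 1;
		cand->next = cache_head;
		cache_head = cand;
		e = cand;
	}
	pthread_mutex_unlock(&cache_lock);
	return e;
}
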
183
184/*
185 * Currently we support only IPv4 and a single multipath address.
186 */
187static struct nfs4_pnfs_ds *
188decode_and_add_ds(__be32 **pp, struct inode *inode)
189{
190 struct nfs4_pnfs_ds *ds = NULL;
191 char *buf;
192 const char *ipend, *pstr;
193 u32 ip_addr, port;
194 int nlen, rlen, i;
195 int tmp[2];
196 __be32 *r_netid, *r_addr, *p = *pp;
197
198 /* r_netid */
199 nlen = be32_to_cpup(p++);
200 r_netid = p;
201 p += XDR_QUADLEN(nlen);
202
203 /* r_addr */
204 rlen = be32_to_cpup(p++);
205 r_addr = p;
206 p += XDR_QUADLEN(rlen);
207 *pp = p;
208
209 /* Check that netid is "tcp" */
210 if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) {
211 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
212 goto out_err;
213 }
214
215 /* allow up to an ipv6 address length plus the port suffix */
216 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s: Invalid address, length %d\n", __func__,
218 rlen);
219 goto out_err;
220 }
221 buf = kmalloc(rlen + 1, GFP_KERNEL);
    if (!buf)	/* kmalloc can fail; don't dereference NULL */
        goto out_err;
222 buf[rlen] = '\0';
223 memcpy(buf, r_addr, rlen);
224
225 /* replace the port dots with dashes for the in4_pton() delimiter */
226 for (i = 0; i < 2; i++) {
227 char *res = strrchr(buf, '.');
228 if (!res) {
229 dprintk("%s: Failed finding expected dots in port\n",
230 __func__);
231 goto out_free;
232 }
233 *res = '-';
234 }
235
236 /* Currently only support ipv4 address */
237 if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
238 dprintk("%s: Only ipv4 addresses supported\n", __func__);
239 goto out_free;
240 }
241
242 /* port */
243 pstr = ipend;
244 sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
245 port = htons((tmp[0] << 8) | (tmp[1]));
246
247 ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
248 dprintk("%s: Decoded address and port %s\n", __func__, buf);
249out_free:
250 kfree(buf);
251out_err:
252 return ds;
253}
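
For reference, the r_addr string decoded above follows the RFC 5661 universal-address convention "h1.h2.h3.h4.p1.p2", where the port is p1 * 256 + p2; the code turns the last two dots into dashes only so in4_pton() stops at the end of the IPv4 part. A self-contained userspace sketch of the same parse (parse_uaddr is my name, not a kernel function):

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static int parse_uaddr(const char *uaddr, struct in_addr *ip,
		       unsigned short *port)
{
	char buf[64];
	char *dot;
	unsigned int p1, p2;

	if (strlen(uaddr) >= sizeof(buf))
		return -1;
	strcpy(buf, uaddr);

	/* peel off ".p2" then ".p1" from the tail */
	dot = strrchr(buf, '.');
	if (!dot || sscanf(dot, ".%u", &p2) != 1 || p2 > 255)
		return -1;
	*dot = '\0';
	dot = strrchr(buf, '.');
	if (!dot || sscanf(dot, ".%u", &p1) != 1 || p1 > 255)
		return -1;
	*dot = '\0';

	if (inet_pton(AF_INET, buf, ip) != 1)
		return -1;
	*port = (unsigned short)((p1 << 8) | p2);   /* host byte order */
	return 0;
}

/* parse_uaddr("192.168.1.10.8.1", &ip, &port) yields port 2049 (NFS) */
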
254
255/* Decode opaque device data and return the result */
256static struct nfs4_file_layout_dsaddr*
257decode_device(struct inode *ino, struct pnfs_device *pdev)
258{
259 int i, dummy;
260 u32 cnt, num;
261 u8 *indexp;
262 __be32 *p = (__be32 *)pdev->area, *indicesp;
263 struct nfs4_file_layout_dsaddr *dsaddr;
264
265 /* Get the stripe count (number of stripe index) */
266 cnt = be32_to_cpup(p++);
267 dprintk("%s stripe count %d\n", __func__, cnt);
268 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
269 printk(KERN_WARNING "%s: stripe count %d greater than "
270 "supported maximum %d\n", __func__,
271 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
272 goto out_err;
273 }
274
275 /* Check the multipath list count */
276 indicesp = p;
277 p += XDR_QUADLEN(cnt << 2);
278 num = be32_to_cpup(p++);
279 dprintk("%s ds_num %u\n", __func__, num);
280 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
281 printk(KERN_WARNING "%s: multipath count %d greater than "
282 "supported maximum %d\n", __func__,
283 num, NFS4_PNFS_MAX_MULTI_CNT);
284 goto out_err;
285 }
286 dsaddr = kzalloc(sizeof(*dsaddr) +
287 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
288 GFP_KERNEL);
289 if (!dsaddr)
290 goto out_err;
291
292 dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
293 if (!dsaddr->stripe_indices)
294 goto out_err_free;
295
296 dsaddr->stripe_count = cnt;
297 dsaddr->ds_num = num;
298
299 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
300
301 /* Go back and read stripe indices */
302 p = indicesp;
303 indexp = &dsaddr->stripe_indices[0];
304 for (i = 0; i < dsaddr->stripe_count; i++) {
305 *indexp = be32_to_cpup(p++);
306 if (*indexp >= num)
307 goto out_err_free;
308 indexp++;
309 }
310 /* Skip already read multipath list count */
311 p++;
312
313 for (i = 0; i < dsaddr->ds_num; i++) {
314 int j;
315
316 dummy = be32_to_cpup(p++); /* multipath count */
317 if (dummy > 1) {
318 printk(KERN_WARNING
319 "%s: Multipath count %d not supported, "
320 "skipping all greater than 1\n", __func__,
321 dummy);
322 }
323 for (j = 0; j < dummy; j++) {
324 if (j == 0) {
325 dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
326 if (dsaddr->ds_list[i] == NULL)
327 goto out_err_free;
328 } else {
329 u32 len;
330 /* skip extra multipath */
331 len = be32_to_cpup(p++);
332 p += XDR_QUADLEN(len);
333 len = be32_to_cpup(p++);
334 p += XDR_QUADLEN(len);
335 continue;
336 }
337 }
338 }
339 return dsaddr;
340
341out_err_free:
342 nfs4_fl_free_deviceid(dsaddr);
343out_err:
344 dprintk("%s ERROR: returning NULL\n", __func__);
345 return NULL;
346}
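
My reading of the opaque GETDEVICEINFO blob that decode_device() walks, plus the XDR padding rule behind the XDR_QUADLEN() skips (a sketch, not the authoritative XDR definition):

/*
 * u32 stripe_count
 * u32 stripe_indices[stripe_count]       each value must be < ds_num
 * u32 ds_num
 * repeated ds_num times:
 *     u32 multipath_count
 *     repeated multipath_count times:
 *         opaque r_netid<>, opaque r_addr<>   (XDR counted strings)
 *
 * XDR pads opaque data to a 4-byte boundary, so skipping n bytes means
 * advancing (n + 3) / 4 32-bit words -- what XDR_QUADLEN() computes:
 */
#include <stdint.h>

static inline uint32_t xdr_quadlen(uint32_t nbytes)
{
	return (nbytes + 3) >> 2;  /* xdr_quadlen(3) == 1, xdr_quadlen(5) == 2 */
}
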
347
348/*
349 * Decode the opaque device specified in 'dev'
350 * and add it to the list of available devices.
351 * If the deviceid is already cached, pnfs_add_deviceid will return
352 * a pointer to the cached struct and throw away the new one.
353 */
354static struct nfs4_file_layout_dsaddr*
355decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
356{
357 struct nfs4_file_layout_dsaddr *dsaddr;
358 struct pnfs_deviceid_node *d;
359
360 dsaddr = decode_device(inode, dev);
361 if (!dsaddr) {
362 printk(KERN_WARNING "%s: Could not decode or add device\n",
363 __func__);
364 return NULL;
365 }
366
367 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
368 &dsaddr->deviceid);
369
370 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
371}
372
373/*
374 * Retrieve the information for dev_id, add it to the list
375 * of available devices, and return it.
376 */
377struct nfs4_file_layout_dsaddr *
378get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
379{
380 struct pnfs_device *pdev = NULL;
381 u32 max_resp_sz;
382 int max_pages;
383 struct page **pages = NULL;
384 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
385 int rc, i;
386 struct nfs_server *server = NFS_SERVER(inode);
387
388 /*
389 * Use the session max response size as the basis for setting
390 * GETDEVICEINFO's maxcount
391 */
392 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
393 max_pages = max_resp_sz >> PAGE_SHIFT;
394 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
395 __func__, inode, max_resp_sz, max_pages);
396
397 pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
398 if (pdev == NULL)
399 return NULL;
400
401 pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
402 if (pages == NULL) {
403 kfree(pdev);
404 return NULL;
405 }
406 for (i = 0; i < max_pages; i++) {
407 pages[i] = alloc_page(GFP_KERNEL);
408 if (!pages[i])
409 goto out_free;
410 }
411
412 /* set pdev->area */
413 pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
414 if (!pdev->area)
415 goto out_free;
416
417 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
418 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
419 pdev->pages = pages;
420 pdev->pgbase = 0;
421 pdev->pglen = PAGE_SIZE * max_pages;
422 pdev->mincount = 0;
423
424 rc = nfs4_proc_getdeviceinfo(server, pdev);
425 dprintk("%s getdeviceinfo returns %d\n", __func__, rc);
426 if (rc)
427 goto out_free;
428
429 /*
430 * Found new device, need to decode it and then add it to the
431 * list of known devices for this mountpoint.
432 */
433 dsaddr = decode_and_add_device(inode, pdev);
434out_free:
435 if (pdev->area != NULL)
436 vunmap(pdev->area);
437 for (i = 0; i < max_pages; i++)
    if (pages[i])	/* alloc_page() may have failed partway through */
438 __free_page(pages[i]);
439 kfree(pages);
440 kfree(pdev);
441 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
442 return dsaddr;
443}
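
The sizing logic at the top of get_device_info() just rounds the session's maximum response size down to whole pages. A worked example with assumed numbers (4 KiB pages and a made-up max_resp_sz):

#include <stdio.h>

int main(void)
{
	unsigned int max_resp_sz = 1049620; /* assumed CREATE_SESSION value */
	unsigned int page_shift = 12;       /* assumed 4 KiB pages */
	unsigned int max_pages = max_resp_sz >> page_shift;

	/* prints: max_pages=256 pglen=1048576 */
	printf("max_pages=%u pglen=%u\n", max_pages, max_pages << page_shift);
	return 0;
}
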
444
445struct nfs4_file_layout_dsaddr *
446nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
447{
448 struct pnfs_deviceid_node *d;
449
450 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
451 return (d == NULL) ? NULL :
452 container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
453}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 089da5b5d20a..78936a8f40ab 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -49,12 +49,15 @@
49#include <linux/mount.h> 49#include <linux/mount.h>
50#include <linux/module.h> 50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h> 51#include <linux/sunrpc/bc_xprt.h>
52#include <linux/xattr.h>
53#include <linux/utsname.h>
52 54
53#include "nfs4_fs.h" 55#include "nfs4_fs.h"
54#include "delegation.h" 56#include "delegation.h"
55#include "internal.h" 57#include "internal.h"
56#include "iostat.h" 58#include "iostat.h"
57#include "callback.h" 59#include "callback.h"
60#include "pnfs.h"
58 61
59#define NFSDBG_FACILITY NFSDBG_PROC 62#define NFSDBG_FACILITY NFSDBG_PROC
60 63
@@ -129,7 +132,8 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
129 | FATTR4_WORD0_MAXREAD 132 | FATTR4_WORD0_MAXREAD
130 | FATTR4_WORD0_MAXWRITE 133 | FATTR4_WORD0_MAXWRITE
131 | FATTR4_WORD0_LEASE_TIME, 134 | FATTR4_WORD0_LEASE_TIME,
132 0 135 FATTR4_WORD1_TIME_DELTA
136 | FATTR4_WORD1_FS_LAYOUT_TYPES
133}; 137};
134 138
135const u32 nfs4_fs_locations_bitmap[2] = { 139const u32 nfs4_fs_locations_bitmap[2] = {
@@ -255,9 +259,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
255 nfs4_state_mark_reclaim_nograce(clp, state); 259 nfs4_state_mark_reclaim_nograce(clp, state);
256 goto do_state_recovery; 260 goto do_state_recovery;
257 case -NFS4ERR_STALE_STATEID: 261 case -NFS4ERR_STALE_STATEID:
258 if (state == NULL)
259 break;
260 nfs4_state_mark_reclaim_reboot(clp, state);
261 case -NFS4ERR_STALE_CLIENTID: 262 case -NFS4ERR_STALE_CLIENTID:
262 case -NFS4ERR_EXPIRED: 263 case -NFS4ERR_EXPIRED:
263 goto do_state_recovery; 264 goto do_state_recovery;
@@ -334,10 +335,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
334 * Must be called while holding tbl->slot_tbl_lock 335 * Must be called while holding tbl->slot_tbl_lock
335 */ 336 */
336static void 337static void
337nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) 338nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
338{ 339{
340 int free_slotid = free_slot - tbl->slots;
339 int slotid = free_slotid; 341 int slotid = free_slotid;
340 342
343 BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
341 /* clear used bit in bitmap */ 344 /* clear used bit in bitmap */
342 __clear_bit(slotid, tbl->used_slots); 345 __clear_bit(slotid, tbl->used_slots);
343 346
@@ -354,9 +357,9 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
354} 357}
355 358
356/* 359/*
357 * Signal state manager thread if session is drained 360 * Signal state manager thread if session fore channel is drained
358 */ 361 */
359static void nfs41_check_drain_session_complete(struct nfs4_session *ses) 362static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
360{ 363{
361 struct rpc_task *task; 364 struct rpc_task *task;
362 365
@@ -370,8 +373,20 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
370 if (ses->fc_slot_table.highest_used_slotid != -1) 373 if (ses->fc_slot_table.highest_used_slotid != -1)
371 return; 374 return;
372 375
373 dprintk("%s COMPLETE: Session Drained\n", __func__); 376 dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
374 complete(&ses->complete); 377 complete(&ses->fc_slot_table.complete);
378}
379
380/*
381 * Signal state manager thread if session back channel is drained
382 */
383void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
384{
385 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
386 ses->bc_slot_table.highest_used_slotid != -1)
387 return;
388 dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
389 complete(&ses->bc_slot_table.complete);
375} 390}
376 391
377static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) 392static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
@@ -379,7 +394,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
379 struct nfs4_slot_table *tbl; 394 struct nfs4_slot_table *tbl;
380 395
381 tbl = &res->sr_session->fc_slot_table; 396 tbl = &res->sr_session->fc_slot_table;
382 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { 397 if (!res->sr_slot) {
383 /* just wake up the next guy waiting since 398 /* just wake up the next guy waiting since
384 * we may have not consumed a slot after all */ 399 * we may have not consumed a slot after all */
385 dprintk("%s: No slot\n", __func__); 400 dprintk("%s: No slot\n", __func__);
@@ -387,17 +402,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
387 } 402 }
388 403
389 spin_lock(&tbl->slot_tbl_lock); 404 spin_lock(&tbl->slot_tbl_lock);
390 nfs4_free_slot(tbl, res->sr_slotid); 405 nfs4_free_slot(tbl, res->sr_slot);
391 nfs41_check_drain_session_complete(res->sr_session); 406 nfs4_check_drain_fc_complete(res->sr_session);
392 spin_unlock(&tbl->slot_tbl_lock); 407 spin_unlock(&tbl->slot_tbl_lock);
393 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 408 res->sr_slot = NULL;
394} 409}
395 410
396static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 411static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
397{ 412{
398 unsigned long timestamp; 413 unsigned long timestamp;
399 struct nfs4_slot_table *tbl;
400 struct nfs4_slot *slot;
401 struct nfs_client *clp; 414 struct nfs_client *clp;
402 415
403 /* 416 /*
@@ -410,17 +423,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
410 res->sr_status = NFS_OK; 423 res->sr_status = NFS_OK;
411 424
412 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */ 425 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
413 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) 426 if (!res->sr_slot)
414 goto out; 427 goto out;
415 428
416 tbl = &res->sr_session->fc_slot_table;
417 slot = tbl->slots + res->sr_slotid;
418
419 /* Check the SEQUENCE operation status */ 429 /* Check the SEQUENCE operation status */
420 switch (res->sr_status) { 430 switch (res->sr_status) {
421 case 0: 431 case 0:
422 /* Update the slot's sequence and clientid lease timer */ 432 /* Update the slot's sequence and clientid lease timer */
423 ++slot->seq_nr; 433 ++res->sr_slot->seq_nr;
424 timestamp = res->sr_renewal_time; 434 timestamp = res->sr_renewal_time;
425 clp = res->sr_session->clp; 435 clp = res->sr_session->clp;
426 do_renew_lease(clp, timestamp); 436 do_renew_lease(clp, timestamp);
@@ -433,12 +443,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
433 * returned NFS4ERR_DELAY as per Section 2.10.6.2 443 * returned NFS4ERR_DELAY as per Section 2.10.6.2
434 * of RFC5661. 444 * of RFC5661.
435 */ 445 */
436 dprintk("%s: slot=%d seq=%d: Operation in progress\n", 446 dprintk("%s: slot=%td seq=%d: Operation in progress\n",
437 __func__, res->sr_slotid, slot->seq_nr); 447 __func__,
448 res->sr_slot - res->sr_session->fc_slot_table.slots,
449 res->sr_slot->seq_nr);
438 goto out_retry; 450 goto out_retry;
439 default: 451 default:
440 /* Just update the slot sequence no. */ 452 /* Just update the slot sequence no. */
441 ++slot->seq_nr; 453 ++res->sr_slot->seq_nr;
442 } 454 }
443out: 455out:
444 /* The session may be reset by one of the error handlers. */ 456 /* The session may be reset by one of the error handlers. */
@@ -505,10 +517,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
505 517
506 dprintk("--> %s\n", __func__); 518 dprintk("--> %s\n", __func__);
507 /* slot already allocated? */ 519 /* slot already allocated? */
508 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) 520 if (res->sr_slot != NULL)
509 return 0; 521 return 0;
510 522
511 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
512 tbl = &session->fc_slot_table; 523 tbl = &session->fc_slot_table;
513 524
514 spin_lock(&tbl->slot_tbl_lock); 525 spin_lock(&tbl->slot_tbl_lock);
@@ -550,7 +561,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
550 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); 561 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
551 562
552 res->sr_session = session; 563 res->sr_session = session;
553 res->sr_slotid = slotid; 564 res->sr_slot = slot;
554 res->sr_renewal_time = jiffies; 565 res->sr_renewal_time = jiffies;
555 res->sr_status_flags = 0; 566 res->sr_status_flags = 0;
556 /* 567 /*
@@ -576,8 +587,9 @@ int nfs4_setup_sequence(const struct nfs_server *server,
576 goto out; 587 goto out;
577 } 588 }
578 589
579 dprintk("--> %s clp %p session %p sr_slotid %d\n", 590 dprintk("--> %s clp %p session %p sr_slot %td\n",
580 __func__, session->clp, session, res->sr_slotid); 591 __func__, session->clp, session, res->sr_slot ?
592 res->sr_slot - session->fc_slot_table.slots : -1);
581 593
582 ret = nfs41_setup_sequence(session, args, res, cache_reply, 594 ret = nfs41_setup_sequence(session, args, res, cache_reply,
583 task); 595 task);
@@ -650,7 +662,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
650 .callback_data = &data 662 .callback_data = &data
651 }; 663 };
652 664
653 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 665 res->sr_slot = NULL;
654 if (privileged) 666 if (privileged)
655 task_setup.callback_ops = &nfs41_call_priv_sync_ops; 667 task_setup.callback_ops = &nfs41_call_priv_sync_ops;
656 task = rpc_run_task(&task_setup); 668 task = rpc_run_task(&task_setup);
@@ -735,7 +747,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
735 p->o_res.server = p->o_arg.server; 747 p->o_res.server = p->o_arg.server;
736 nfs_fattr_init(&p->f_attr); 748 nfs_fattr_init(&p->f_attr);
737 nfs_fattr_init(&p->dir_attr); 749 nfs_fattr_init(&p->dir_attr);
738 p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
739} 750}
740 751
741static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 752static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -1120,6 +1131,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1120 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1131 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1121 smp_rmb(); 1132 smp_rmb();
1122 if (state->n_rdwr != 0) { 1133 if (state->n_rdwr != 0) {
1134 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1123 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); 1135 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
1124 if (ret != 0) 1136 if (ret != 0)
1125 return ret; 1137 return ret;
@@ -1127,6 +1139,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1127 return -ESTALE; 1139 return -ESTALE;
1128 } 1140 }
1129 if (state->n_wronly != 0) { 1141 if (state->n_wronly != 0) {
1142 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1130 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); 1143 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
1131 if (ret != 0) 1144 if (ret != 0)
1132 return ret; 1145 return ret;
@@ -1134,6 +1147,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1134 return -ESTALE; 1147 return -ESTALE;
1135 } 1148 }
1136 if (state->n_rdonly != 0) { 1149 if (state->n_rdonly != 0) {
1150 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1137 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); 1151 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
1138 if (ret != 0) 1152 if (ret != 0)
1139 return ret; 1153 return ret;
@@ -1188,7 +1202,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
1188 int err; 1202 int err;
1189 do { 1203 do {
1190 err = _nfs4_do_open_reclaim(ctx, state); 1204 err = _nfs4_do_open_reclaim(ctx, state);
1191 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 1205 if (err != -NFS4ERR_DELAY)
1192 break; 1206 break;
1193 nfs4_handle_exception(server, err, &exception); 1207 nfs4_handle_exception(server, err, &exception);
1194 } while (exception.retry); 1208 } while (exception.retry);
@@ -1258,6 +1272,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1258 case -NFS4ERR_ADMIN_REVOKED: 1272 case -NFS4ERR_ADMIN_REVOKED:
1259 case -NFS4ERR_BAD_STATEID: 1273 case -NFS4ERR_BAD_STATEID:
1260 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1274 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
1275 case -EKEYEXPIRED:
1276 /*
1277 * User RPCSEC_GSS context has expired.
1278 * We cannot recover this stateid now, so
1279 * skip it and allow recovery thread to
1280 * proceed.
1281 */
1261 case -ENOMEM: 1282 case -ENOMEM:
1262 err = 0; 1283 err = 0;
1263 goto out; 1284 goto out;
@@ -1605,7 +1626,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
1605 goto out; 1626 goto out;
1606 case -NFS4ERR_GRACE: 1627 case -NFS4ERR_GRACE:
1607 case -NFS4ERR_DELAY: 1628 case -NFS4ERR_DELAY:
1608 case -EKEYEXPIRED:
1609 nfs4_handle_exception(server, err, &exception); 1629 nfs4_handle_exception(server, err, &exception);
1610 err = 0; 1630 err = 0;
1611 } 1631 }
@@ -1820,6 +1840,8 @@ struct nfs4_closedata {
1820 struct nfs_closeres res; 1840 struct nfs_closeres res;
1821 struct nfs_fattr fattr; 1841 struct nfs_fattr fattr;
1822 unsigned long timestamp; 1842 unsigned long timestamp;
1843 bool roc;
1844 u32 roc_barrier;
1823}; 1845};
1824 1846
1825static void nfs4_free_closedata(void *data) 1847static void nfs4_free_closedata(void *data)
@@ -1827,6 +1849,8 @@ static void nfs4_free_closedata(void *data)
1827 struct nfs4_closedata *calldata = data; 1849 struct nfs4_closedata *calldata = data;
1828 struct nfs4_state_owner *sp = calldata->state->owner; 1850 struct nfs4_state_owner *sp = calldata->state->owner;
1829 1851
1852 if (calldata->roc)
1853 pnfs_roc_release(calldata->state->inode);
1830 nfs4_put_open_state(calldata->state); 1854 nfs4_put_open_state(calldata->state);
1831 nfs_free_seqid(calldata->arg.seqid); 1855 nfs_free_seqid(calldata->arg.seqid);
1832 nfs4_put_state_owner(sp); 1856 nfs4_put_state_owner(sp);
@@ -1859,6 +1883,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1859 */ 1883 */
1860 switch (task->tk_status) { 1884 switch (task->tk_status) {
1861 case 0: 1885 case 0:
1886 if (calldata->roc)
1887 pnfs_roc_set_barrier(state->inode,
1888 calldata->roc_barrier);
1862 nfs_set_open_stateid(state, &calldata->res.stateid, 0); 1889 nfs_set_open_stateid(state, &calldata->res.stateid, 0);
1863 renew_lease(server, calldata->timestamp); 1890 renew_lease(server, calldata->timestamp);
1864 nfs4_close_clear_stateid_flags(state, 1891 nfs4_close_clear_stateid_flags(state,
@@ -1911,8 +1938,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1911 return; 1938 return;
1912 } 1939 }
1913 1940
1914 if (calldata->arg.fmode == 0) 1941 if (calldata->arg.fmode == 0) {
1915 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; 1942 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
1943 if (calldata->roc &&
1944 pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
1945 rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
1946 task, NULL);
1947 return;
1948 }
1949 }
1916 1950
1917 nfs_fattr_init(calldata->res.fattr); 1951 nfs_fattr_init(calldata->res.fattr);
1918 calldata->timestamp = jiffies; 1952 calldata->timestamp = jiffies;
@@ -1940,7 +1974,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1940 * 1974 *
1941 * NOTE: Caller must be holding the sp->so_owner semaphore! 1975 * NOTE: Caller must be holding the sp->so_owner semaphore!
1942 */ 1976 */
1943int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) 1977int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
1944{ 1978{
1945 struct nfs_server *server = NFS_SERVER(state->inode); 1979 struct nfs_server *server = NFS_SERVER(state->inode);
1946 struct nfs4_closedata *calldata; 1980 struct nfs4_closedata *calldata;
@@ -1975,12 +2009,12 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1975 calldata->res.fattr = &calldata->fattr; 2009 calldata->res.fattr = &calldata->fattr;
1976 calldata->res.seqid = calldata->arg.seqid; 2010 calldata->res.seqid = calldata->arg.seqid;
1977 calldata->res.server = server; 2011 calldata->res.server = server;
1978 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; 2012 calldata->roc = roc;
1979 path_get(path); 2013 path_get(path);
1980 calldata->path = *path; 2014 calldata->path = *path;
1981 2015
1982 msg.rpc_argp = &calldata->arg, 2016 msg.rpc_argp = &calldata->arg;
1983 msg.rpc_resp = &calldata->res, 2017 msg.rpc_resp = &calldata->res;
1984 task_setup_data.callback_data = calldata; 2018 task_setup_data.callback_data = calldata;
1985 task = rpc_run_task(&task_setup_data); 2019 task = rpc_run_task(&task_setup_data);
1986 if (IS_ERR(task)) 2020 if (IS_ERR(task))
@@ -1993,125 +2027,24 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1993out_free_calldata: 2027out_free_calldata:
1994 kfree(calldata); 2028 kfree(calldata);
1995out: 2029out:
2030 if (roc)
2031 pnfs_roc_release(state->inode);
1996 nfs4_put_open_state(state); 2032 nfs4_put_open_state(state);
1997 nfs4_put_state_owner(sp); 2033 nfs4_put_state_owner(sp);
1998 return status; 2034 return status;
1999} 2035}
2000 2036
2001static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode) 2037static struct inode *
2002{ 2038nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
2003 struct file *filp;
2004 int ret;
2005
2006 /* If the open_intent is for execute, we have an extra check to make */
2007 if (fmode & FMODE_EXEC) {
2008 ret = nfs_may_open(state->inode,
2009 state->owner->so_cred,
2010 nd->intent.open.flags);
2011 if (ret < 0)
2012 goto out_close;
2013 }
2014 filp = lookup_instantiate_filp(nd, path->dentry, NULL);
2015 if (!IS_ERR(filp)) {
2016 struct nfs_open_context *ctx;
2017 ctx = nfs_file_open_context(filp);
2018 ctx->state = state;
2019 return 0;
2020 }
2021 ret = PTR_ERR(filp);
2022out_close:
2023 nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
2024 return ret;
2025}
2026
2027struct dentry *
2028nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2029{ 2039{
2030 struct path path = {
2031 .mnt = nd->path.mnt,
2032 .dentry = dentry,
2033 };
2034 struct dentry *parent;
2035 struct iattr attr;
2036 struct rpc_cred *cred;
2037 struct nfs4_state *state; 2040 struct nfs4_state *state;
2038 struct dentry *res;
2039 int open_flags = nd->intent.open.flags;
2040 fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
2041
2042 if (nd->flags & LOOKUP_CREATE) {
2043 attr.ia_mode = nd->intent.open.create_mode;
2044 attr.ia_valid = ATTR_MODE;
2045 if (!IS_POSIXACL(dir))
2046 attr.ia_mode &= ~current_umask();
2047 } else {
2048 open_flags &= ~O_EXCL;
2049 attr.ia_valid = 0;
2050 BUG_ON(open_flags & O_CREAT);
2051 }
2052 2041
2053 cred = rpc_lookup_cred();
2054 if (IS_ERR(cred))
2055 return (struct dentry *)cred;
2056 parent = dentry->d_parent;
2057 /* Protect against concurrent sillydeletes */ 2042 /* Protect against concurrent sillydeletes */
2058 nfs_block_sillyrename(parent); 2043 state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
2059 state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred); 2044 if (IS_ERR(state))
2060 put_rpccred(cred); 2045 return ERR_CAST(state);
2061 if (IS_ERR(state)) { 2046 ctx->state = state;
2062 if (PTR_ERR(state) == -ENOENT) { 2047 return igrab(state->inode);
2063 d_add(dentry, NULL);
2064 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2065 }
2066 nfs_unblock_sillyrename(parent);
2067 return (struct dentry *)state;
2068 }
2069 res = d_add_unique(dentry, igrab(state->inode));
2070 if (res != NULL)
2071 path.dentry = res;
2072 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
2073 nfs_unblock_sillyrename(parent);
2074 nfs4_intent_set_file(nd, &path, state, fmode);
2075 return res;
2076}
2077
2078int
2079nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
2080{
2081 struct path path = {
2082 .mnt = nd->path.mnt,
2083 .dentry = dentry,
2084 };
2085 struct rpc_cred *cred;
2086 struct nfs4_state *state;
2087 fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
2088
2089 cred = rpc_lookup_cred();
2090 if (IS_ERR(cred))
2091 return PTR_ERR(cred);
2092 state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
2093 put_rpccred(cred);
2094 if (IS_ERR(state)) {
2095 switch (PTR_ERR(state)) {
2096 case -EPERM:
2097 case -EACCES:
2098 case -EDQUOT:
2099 case -ENOSPC:
2100 case -EROFS:
2101 return PTR_ERR(state);
2102 default:
2103 goto out_drop;
2104 }
2105 }
2106 if (state->inode == dentry->d_inode) {
2107 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2108 nfs4_intent_set_file(nd, &path, state, fmode);
2109 return 1;
2110 }
2111 nfs4_close_sync(&path, state, fmode);
2112out_drop:
2113 d_drop(dentry);
2114 return 0;
2115} 2048}
2116 2049
2117static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) 2050static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2568,36 +2501,35 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
2568 2501
2569static int 2502static int
2570nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 2503nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2571 int flags, struct nameidata *nd) 2504 int flags, struct nfs_open_context *ctx)
2572{ 2505{
2573 struct path path = { 2506 struct path my_path = {
2574 .mnt = nd->path.mnt,
2575 .dentry = dentry, 2507 .dentry = dentry,
2576 }; 2508 };
2509 struct path *path = &my_path;
2577 struct nfs4_state *state; 2510 struct nfs4_state *state;
2578 struct rpc_cred *cred; 2511 struct rpc_cred *cred = NULL;
2579 fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE); 2512 fmode_t fmode = 0;
2580 int status = 0; 2513 int status = 0;
2581 2514
2582 cred = rpc_lookup_cred(); 2515 if (ctx != NULL) {
2583 if (IS_ERR(cred)) { 2516 cred = ctx->cred;
2584 status = PTR_ERR(cred); 2517 path = &ctx->path;
2585 goto out; 2518 fmode = ctx->mode;
2586 } 2519 }
2587 state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred); 2520 sattr->ia_mode &= ~current_umask();
2521 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
2588 d_drop(dentry); 2522 d_drop(dentry);
2589 if (IS_ERR(state)) { 2523 if (IS_ERR(state)) {
2590 status = PTR_ERR(state); 2524 status = PTR_ERR(state);
2591 goto out_putcred; 2525 goto out;
2592 } 2526 }
2593 d_add(dentry, igrab(state->inode)); 2527 d_add(dentry, igrab(state->inode));
2594 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 2528 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2595 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2529 if (ctx != NULL)
2596 status = nfs4_intent_set_file(nd, &path, state, fmode); 2530 ctx->state = state;
2597 else 2531 else
2598 nfs4_close_sync(&path, state, fmode); 2532 nfs4_close_sync(path, state, fmode);
2599out_putcred:
2600 put_rpccred(cred);
2601out: 2533out:
2602 return status; 2534 return status;
2603} 2535}
@@ -2655,6 +2587,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
2655 2587
2656 args->bitmask = server->cache_consistency_bitmask; 2588 args->bitmask = server->cache_consistency_bitmask;
2657 res->server = server; 2589 res->server = server;
2590 res->seq_res.sr_slot = NULL;
2658 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 2591 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
2659} 2592}
2660 2593
@@ -2671,18 +2604,46 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2671 return 1; 2604 return 1;
2672} 2605}
2673 2606
2607static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
2608{
2609 struct nfs_server *server = NFS_SERVER(dir);
2610 struct nfs_renameargs *arg = msg->rpc_argp;
2611 struct nfs_renameres *res = msg->rpc_resp;
2612
2613 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
2614 arg->bitmask = server->attr_bitmask;
2615 res->server = server;
2616}
2617
2618static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
2619 struct inode *new_dir)
2620{
2621 struct nfs_renameres *res = task->tk_msg.rpc_resp;
2622
2623 if (!nfs4_sequence_done(task, &res->seq_res))
2624 return 0;
2625 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2626 return 0;
2627
2628 update_changeattr(old_dir, &res->old_cinfo);
2629 nfs_post_op_update_inode(old_dir, res->old_fattr);
2630 update_changeattr(new_dir, &res->new_cinfo);
2631 nfs_post_op_update_inode(new_dir, res->new_fattr);
2632 return 1;
2633}
2634
2674static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, 2635static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2675 struct inode *new_dir, struct qstr *new_name) 2636 struct inode *new_dir, struct qstr *new_name)
2676{ 2637{
2677 struct nfs_server *server = NFS_SERVER(old_dir); 2638 struct nfs_server *server = NFS_SERVER(old_dir);
2678 struct nfs4_rename_arg arg = { 2639 struct nfs_renameargs arg = {
2679 .old_dir = NFS_FH(old_dir), 2640 .old_dir = NFS_FH(old_dir),
2680 .new_dir = NFS_FH(new_dir), 2641 .new_dir = NFS_FH(new_dir),
2681 .old_name = old_name, 2642 .old_name = old_name,
2682 .new_name = new_name, 2643 .new_name = new_name,
2683 .bitmask = server->attr_bitmask, 2644 .bitmask = server->attr_bitmask,
2684 }; 2645 };
2685 struct nfs4_rename_res res = { 2646 struct nfs_renameres res = {
2686 .server = server, 2647 .server = server,
2687 }; 2648 };
2688 struct rpc_message msg = { 2649 struct rpc_message msg = {
@@ -2887,6 +2848,8 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2887{ 2848{
2888 struct nfs4_exception exception = { }; 2849 struct nfs4_exception exception = { };
2889 int err; 2850 int err;
2851
2852 sattr->ia_mode &= ~current_umask();
2890 do { 2853 do {
2891 err = nfs4_handle_exception(NFS_SERVER(dir), 2854 err = nfs4_handle_exception(NFS_SERVER(dir),
2892 _nfs4_proc_mkdir(dir, dentry, sattr), 2855 _nfs4_proc_mkdir(dir, dentry, sattr),
@@ -2896,15 +2859,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2896} 2859}
2897 2860
2898static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2861static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2899 u64 cookie, struct page *page, unsigned int count, int plus) 2862 u64 cookie, struct page **pages, unsigned int count, int plus)
2900{ 2863{
2901 struct inode *dir = dentry->d_inode; 2864 struct inode *dir = dentry->d_inode;
2902 struct nfs4_readdir_arg args = { 2865 struct nfs4_readdir_arg args = {
2903 .fh = NFS_FH(dir), 2866 .fh = NFS_FH(dir),
2904 .pages = &page, 2867 .pages = pages,
2905 .pgbase = 0, 2868 .pgbase = 0,
2906 .count = count, 2869 .count = count,
2907 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, 2870 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
2871 .plus = plus,
2908 }; 2872 };
2909 struct nfs4_readdir_res res; 2873 struct nfs4_readdir_res res;
2910 struct rpc_message msg = { 2874 struct rpc_message msg = {
@@ -2922,8 +2886,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2922 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2886 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2923 res.pgbase = args.pgbase; 2887 res.pgbase = args.pgbase;
2924 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); 2888 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
2925 if (status == 0) 2889 if (status >= 0) {
2926 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2890 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2891 status += args.pgbase;
2892 }
2927 2893
2928 nfs_invalidate_atime(dir); 2894 nfs_invalidate_atime(dir);
2929 2895
@@ -2932,14 +2898,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2932} 2898}
2933 2899
2934static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2900static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2935 u64 cookie, struct page *page, unsigned int count, int plus) 2901 u64 cookie, struct page **pages, unsigned int count, int plus)
2936{ 2902{
2937 struct nfs4_exception exception = { }; 2903 struct nfs4_exception exception = { };
2938 int err; 2904 int err;
2939 do { 2905 do {
2940 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), 2906 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
2941 _nfs4_proc_readdir(dentry, cred, cookie, 2907 _nfs4_proc_readdir(dentry, cred, cookie,
2942 page, count, plus), 2908 pages, count, plus),
2943 &exception); 2909 &exception);
2944 } while (exception.retry); 2910 } while (exception.retry);
2945 return err; 2911 return err;
@@ -2984,6 +2950,8 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
2984{ 2950{
2985 struct nfs4_exception exception = { }; 2951 struct nfs4_exception exception = { };
2986 int err; 2952 int err;
2953
2954 sattr->ia_mode &= ~current_umask();
2987 do { 2955 do {
2988 err = nfs4_handle_exception(NFS_SERVER(dir), 2956 err = nfs4_handle_exception(NFS_SERVER(dir),
2989 _nfs4_proc_mknod(dir, dentry, sattr, rdev), 2957 _nfs4_proc_mknod(dir, dentry, sattr, rdev),
@@ -3429,6 +3397,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
3429 ret = nfs_revalidate_inode(server, inode); 3397 ret = nfs_revalidate_inode(server, inode);
3430 if (ret < 0) 3398 if (ret < 0)
3431 return ret; 3399 return ret;
3400 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
3401 nfs_zap_acl_cache(inode);
3432 ret = nfs4_read_cached_acl(inode, buf, buflen); 3402 ret = nfs4_read_cached_acl(inode, buf, buflen);
3433 if (ret != -ENOENT) 3403 if (ret != -ENOENT)
3434 return ret; 3404 return ret;
@@ -3457,6 +3427,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3457 nfs_inode_return_delegation(inode); 3427 nfs_inode_return_delegation(inode);
3458 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 3428 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3459 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3429 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
3430 /*
3431 * Acl update can result in inode attribute update.
3432 * so mark the attribute cache invalid.
3433 */
3434 spin_lock(&inode->i_lock);
3435 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
3436 spin_unlock(&inode->i_lock);
3460 nfs_access_zap_cache(inode); 3437 nfs_access_zap_cache(inode);
3461 nfs_zap_acl_cache(inode); 3438 nfs_zap_acl_cache(inode);
3462 return ret; 3439 return ret;
@@ -3490,9 +3467,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3490 nfs4_state_mark_reclaim_nograce(clp, state); 3467 nfs4_state_mark_reclaim_nograce(clp, state);
3491 goto do_state_recovery; 3468 goto do_state_recovery;
3492 case -NFS4ERR_STALE_STATEID: 3469 case -NFS4ERR_STALE_STATEID:
3493 if (state == NULL)
3494 break;
3495 nfs4_state_mark_reclaim_reboot(clp, state);
3496 case -NFS4ERR_STALE_CLIENTID: 3470 case -NFS4ERR_STALE_CLIENTID:
3497 case -NFS4ERR_EXPIRED: 3471 case -NFS4ERR_EXPIRED:
3498 goto do_state_recovery; 3472 goto do_state_recovery;
@@ -3540,6 +3514,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3540 struct nfs4_setclientid setclientid = { 3514 struct nfs4_setclientid setclientid = {
3541 .sc_verifier = &sc_verifier, 3515 .sc_verifier = &sc_verifier,
3542 .sc_prog = program, 3516 .sc_prog = program,
3517 .sc_cb_ident = clp->cl_cb_ident,
3543 }; 3518 };
3544 struct rpc_message msg = { 3519 struct rpc_message msg = {
3545 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3520 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -3579,7 +3554,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3579 if (signalled()) 3554 if (signalled())
3580 break; 3555 break;
3581 if (loop++ & 1) 3556 if (loop++ & 1)
3582 ssleep(clp->cl_lease_time + 1); 3557 ssleep(clp->cl_lease_time / HZ + 1);
3583 else 3558 else
3584 if (++clp->cl_id_uniquifier == 0) 3559 if (++clp->cl_id_uniquifier == 0)
3585 break; 3560 break;
@@ -3626,7 +3601,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3626 case -NFS4ERR_RESOURCE: 3601 case -NFS4ERR_RESOURCE:
3627 /* The IBM lawyers misread another document! */ 3602 /* The IBM lawyers misread another document! */
3628 case -NFS4ERR_DELAY: 3603 case -NFS4ERR_DELAY:
3629 case -EKEYEXPIRED:
3630 err = nfs4_delay(clp->cl_rpcclient, &timeout); 3604 err = nfs4_delay(clp->cl_rpcclient, &timeout);
3631 } 3605 }
3632 } while (err == 0); 3606 } while (err == 0);
@@ -3721,14 +3695,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3721 memcpy(&data->stateid, stateid, sizeof(data->stateid)); 3695 memcpy(&data->stateid, stateid, sizeof(data->stateid));
3722 data->res.fattr = &data->fattr; 3696 data->res.fattr = &data->fattr;
3723 data->res.server = server; 3697 data->res.server = server;
3724 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3725 nfs_fattr_init(data->res.fattr); 3698 nfs_fattr_init(data->res.fattr);
3726 data->timestamp = jiffies; 3699 data->timestamp = jiffies;
3727 data->rpc_status = 0; 3700 data->rpc_status = 0;
3728 3701
3729 task_setup_data.callback_data = data; 3702 task_setup_data.callback_data = data;
3730 msg.rpc_argp = &data->args, 3703 msg.rpc_argp = &data->args;
3731 msg.rpc_resp = &data->res, 3704 msg.rpc_resp = &data->res;
3732 task = rpc_run_task(&task_setup_data); 3705 task = rpc_run_task(&task_setup_data);
3733 if (IS_ERR(task)) 3706 if (IS_ERR(task))
3734 return PTR_ERR(task); 3707 return PTR_ERR(task);
@@ -3807,6 +3780,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3807 goto out; 3780 goto out;
3808 lsp = request->fl_u.nfs4_fl.owner; 3781 lsp = request->fl_u.nfs4_fl.owner;
3809 arg.lock_owner.id = lsp->ls_id.id; 3782 arg.lock_owner.id = lsp->ls_id.id;
3783 arg.lock_owner.s_dev = server->s_dev;
3810 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 3784 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
3811 switch (status) { 3785 switch (status) {
3812 case 0: 3786 case 0:
@@ -3874,7 +3848,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3874 p->arg.fl = &p->fl; 3848 p->arg.fl = &p->fl;
3875 p->arg.seqid = seqid; 3849 p->arg.seqid = seqid;
3876 p->res.seqid = seqid; 3850 p->res.seqid = seqid;
3877 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3878 p->arg.stateid = &lsp->ls_stateid; 3851 p->arg.stateid = &lsp->ls_stateid;
3879 p->lsp = lsp; 3852 p->lsp = lsp;
3880 atomic_inc(&lsp->ls_count); 3853 atomic_inc(&lsp->ls_count);
@@ -3973,8 +3946,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3973 return ERR_PTR(-ENOMEM); 3946 return ERR_PTR(-ENOMEM);
3974 } 3947 }
3975 3948
3976 msg.rpc_argp = &data->arg, 3949 msg.rpc_argp = &data->arg;
3977 msg.rpc_resp = &data->res, 3950 msg.rpc_resp = &data->res;
3978 task_setup_data.callback_data = data; 3951 task_setup_data.callback_data = data;
3979 return rpc_run_task(&task_setup_data); 3952 return rpc_run_task(&task_setup_data);
3980} 3953}
@@ -4053,8 +4026,8 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
4053 p->arg.lock_stateid = &lsp->ls_stateid; 4026 p->arg.lock_stateid = &lsp->ls_stateid;
4054 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 4027 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
4055 p->arg.lock_owner.id = lsp->ls_id.id; 4028 p->arg.lock_owner.id = lsp->ls_id.id;
4029 p->arg.lock_owner.s_dev = server->s_dev;
4056 p->res.lock_seqid = p->arg.lock_seqid; 4030 p->res.lock_seqid = p->arg.lock_seqid;
4057 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4058 p->lsp = lsp; 4031 p->lsp = lsp;
4059 p->server = server; 4032 p->server = server;
4060 atomic_inc(&lsp->ls_count); 4033 atomic_inc(&lsp->ls_count);
@@ -4211,8 +4184,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4211 data->arg.reclaim = NFS_LOCK_RECLAIM; 4184 data->arg.reclaim = NFS_LOCK_RECLAIM;
4212 task_setup_data.callback_ops = &nfs4_recover_lock_ops; 4185 task_setup_data.callback_ops = &nfs4_recover_lock_ops;
4213 } 4186 }
4214 msg.rpc_argp = &data->arg, 4187 msg.rpc_argp = &data->arg;
4215 msg.rpc_resp = &data->res, 4188 msg.rpc_resp = &data->res;
4216 task_setup_data.callback_data = data; 4189 task_setup_data.callback_data = data;
4217 task = rpc_run_task(&task_setup_data); 4190 task = rpc_run_task(&task_setup_data);
4218 if (IS_ERR(task)) 4191 if (IS_ERR(task))
@@ -4241,7 +4214,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
4241 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) 4214 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
4242 return 0; 4215 return 0;
4243 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); 4216 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
4244 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 4217 if (err != -NFS4ERR_DELAY)
4245 break; 4218 break;
4246 nfs4_handle_exception(server, err, &exception); 4219 nfs4_handle_exception(server, err, &exception);
4247 } while (exception.retry); 4220 } while (exception.retry);
@@ -4266,7 +4239,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
4266 goto out; 4239 goto out;
4267 case -NFS4ERR_GRACE: 4240 case -NFS4ERR_GRACE:
4268 case -NFS4ERR_DELAY: 4241 case -NFS4ERR_DELAY:
4269 case -EKEYEXPIRED:
4270 nfs4_handle_exception(server, err, &exception); 4242 nfs4_handle_exception(server, err, &exception);
4271 err = 0; 4243 err = 0;
4272 } 4244 }
@@ -4412,13 +4384,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4412 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4384 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
4413 err = 0; 4385 err = 0;
4414 goto out; 4386 goto out;
4387 case -EKEYEXPIRED:
4388 /*
4389 * User RPCSEC_GSS context has expired.
4390 * We cannot recover this stateid now, so
4391 * skip it and allow recovery thread to
4392 * proceed.
4393 */
4394 err = 0;
4395 goto out;
4415 case -ENOMEM: 4396 case -ENOMEM:
4416 case -NFS4ERR_DENIED: 4397 case -NFS4ERR_DENIED:
4417 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 4398 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
4418 err = 0; 4399 err = 0;
4419 goto out; 4400 goto out;
4420 case -NFS4ERR_DELAY: 4401 case -NFS4ERR_DELAY:
4421 case -EKEYEXPIRED:
4422 break; 4402 break;
4423 } 4403 }
4424 err = nfs4_handle_exception(server, err, &exception); 4404 err = nfs4_handle_exception(server, err, &exception);
@@ -4451,48 +4431,43 @@ void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
4451 return; 4431 return;
4452 args->lock_owner.clientid = server->nfs_client->cl_clientid; 4432 args->lock_owner.clientid = server->nfs_client->cl_clientid;
4453 args->lock_owner.id = lsp->ls_id.id; 4433 args->lock_owner.id = lsp->ls_id.id;
4434 args->lock_owner.s_dev = server->s_dev;
4454 msg.rpc_argp = args; 4435 msg.rpc_argp = args;
4455 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); 4436 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
4456} 4437}
4457 4438
4458#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 4439#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
4459 4440
4460int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, 4441static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
4461 size_t buflen, int flags) 4442 const void *buf, size_t buflen,
4443 int flags, int type)
4462{ 4444{
4463 struct inode *inode = dentry->d_inode; 4445 if (strcmp(key, "") != 0)
4464 4446 return -EINVAL;
4465 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4466 return -EOPNOTSUPP;
4467 4447
4468 return nfs4_proc_set_acl(inode, buf, buflen); 4448 return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
4469} 4449}
4470 4450
4471/* The getxattr man page suggests returning -ENODATA for unknown attributes, 4451static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
4472 * and that's what we'll do for e.g. user attributes that haven't been set. 4452 void *buf, size_t buflen, int type)
4473 * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
4474 * attributes in kernel-managed attribute namespaces. */
4475ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf,
4476 size_t buflen)
4477{ 4453{
4478 struct inode *inode = dentry->d_inode; 4454 if (strcmp(key, "") != 0)
4479 4455 return -EINVAL;
4480 if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
4481 return -EOPNOTSUPP;
4482 4456
4483 return nfs4_proc_get_acl(inode, buf, buflen); 4457 return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
4484} 4458}
4485 4459
4486ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) 4460static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
4461 size_t list_len, const char *name,
4462 size_t name_len, int type)
4487{ 4463{
4488 size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; 4464 size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
4489 4465
4490 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) 4466 if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
4491 return 0; 4467 return 0;
4492 if (buf && buflen < len) 4468
4493 return -ERANGE; 4469 if (list && len <= list_len)
4494 if (buf) 4470 memcpy(list, XATTR_NAME_NFSV4_ACL, len);
4495 memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
4496 return len; 4471 return len;
4497} 4472}
4498 4473
@@ -4545,6 +4520,25 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4545 4520
4546#ifdef CONFIG_NFS_V4_1 4521#ifdef CONFIG_NFS_V4_1
4547/* 4522/*
4523 * Validate the exchange flags returned by the server: reject unknown
4524 * flags, reject having both the PNFS and NON_PNFS flags set, and reject
4525 * having none of the NON_PNFS, PNFS, or DS flags set.
4526 */
4527static int nfs4_check_cl_exchange_flags(u32 flags)
4528{
4529 if (flags & ~EXCHGID4_FLAG_MASK_R)
4530 goto out_inval;
4531 if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
4532 (flags & EXCHGID4_FLAG_USE_NON_PNFS))
4533 goto out_inval;
4534 if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
4535 goto out_inval;
4536 return NFS_OK;
4537out_inval:
4538 return -NFS4ERR_INVAL;
4539}
4540
4541/*
4548 * nfs4_proc_exchange_id() 4542 * nfs4_proc_exchange_id()
4549 * 4543 *
4550 * Since the clientid has expired, all compounds using sessions 4544 * Since the clientid has expired, all compounds using sessions
@@ -4557,7 +4551,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4557 nfs4_verifier verifier; 4551 nfs4_verifier verifier;
4558 struct nfs41_exchange_id_args args = { 4552 struct nfs41_exchange_id_args args = {
4559 .client = clp, 4553 .client = clp,
4560 .flags = clp->cl_exchange_flags, 4554 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
4561 }; 4555 };
4562 struct nfs41_exchange_id_res res = { 4556 struct nfs41_exchange_id_res res = {
4563 .client = clp, 4557 .client = clp,
@@ -4574,34 +4568,21 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4574 dprintk("--> %s\n", __func__); 4568 dprintk("--> %s\n", __func__);
4575 BUG_ON(clp == NULL); 4569 BUG_ON(clp == NULL);
4576 4570
4577 /* Remove server-only flags */
4578 args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R;
4579
4580 p = (u32 *)verifier.data; 4571 p = (u32 *)verifier.data;
4581 *p++ = htonl((u32)clp->cl_boot_time.tv_sec); 4572 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
4582 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4573 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
4583 args.verifier = &verifier; 4574 args.verifier = &verifier;
4584 4575
4585 while (1) { 4576 args.id_len = scnprintf(args.id, sizeof(args.id),
4586 args.id_len = scnprintf(args.id, sizeof(args.id), 4577 "%s/%s.%s/%u",
4587 "%s/%s %u", 4578 clp->cl_ipaddr,
4588 clp->cl_ipaddr, 4579 init_utsname()->nodename,
4589 rpc_peeraddr2str(clp->cl_rpcclient, 4580 init_utsname()->domainname,
4590 RPC_DISPLAY_ADDR), 4581 clp->cl_rpcclient->cl_auth->au_flavor);
4591 clp->cl_id_uniquifier);
4592
4593 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4594
4595 if (status != -NFS4ERR_CLID_INUSE)
4596 break;
4597
4598 if (signalled())
4599 break;
4600
4601 if (++clp->cl_id_uniquifier == 0)
4602 break;
4603 }
4604 4582
4583 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4584 if (!status)
4585 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4605 dprintk("<-- %s status= %d\n", __func__, status); 4586 dprintk("<-- %s status= %d\n", __func__, status);
4606 return status; 4587 return status;
4607} 4588}
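
The rewritten EXCHANGE_ID hunk above builds a stable client-owner string from the client's IP address, utsname, and auth flavor instead of the old retry-with-uniquifier loop. A tiny sketch of the resulting string, with invented values:

#include <stdio.h>

int main(void)
{
	char id[128];
	int len = snprintf(id, sizeof(id), "%s/%s.%s/%u",
			   "192.0.2.7",     /* cl_ipaddr (invented) */
			   "client1",       /* nodename (invented) */
			   "example.com",   /* domainname (invented) */
			   1u);             /* au_flavor, e.g. AUTH_UNIX */

	printf("%d %s\n", len, id); /* 31 192.0.2.7/client1.example.com/1 */
	return 0;
}
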
@@ -4647,7 +4628,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4647 switch (task->tk_status) { 4628 switch (task->tk_status) {
4648 case -NFS4ERR_DELAY: 4629 case -NFS4ERR_DELAY:
4649 case -NFS4ERR_GRACE: 4630 case -NFS4ERR_GRACE:
4650 case -EKEYEXPIRED:
4651 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); 4631 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4652 rpc_delay(task, NFS4_POLL_RETRY_MIN); 4632 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4653 task->tk_status = 0; 4633 task->tk_status = 0;
@@ -4687,7 +4667,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4687 }; 4667 };
4688 int status; 4668 int status;
4689 4669
4690 res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4691 dprintk("--> %s\n", __func__); 4670 dprintk("--> %s\n", __func__);
4692 task = rpc_run_task(&task_setup); 4671 task = rpc_run_task(&task_setup);
4693 4672
@@ -4837,17 +4816,17 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4837 if (!session) 4816 if (!session)
4838 return NULL; 4817 return NULL;
4839 4818
4840 init_completion(&session->complete);
4841
4842 tbl = &session->fc_slot_table; 4819 tbl = &session->fc_slot_table;
4843 tbl->highest_used_slotid = -1; 4820 tbl->highest_used_slotid = -1;
4844 spin_lock_init(&tbl->slot_tbl_lock); 4821 spin_lock_init(&tbl->slot_tbl_lock);
4845 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); 4822 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
4823 init_completion(&tbl->complete);
4846 4824
4847 tbl = &session->bc_slot_table; 4825 tbl = &session->bc_slot_table;
4848 tbl->highest_used_slotid = -1; 4826 tbl->highest_used_slotid = -1;
4849 spin_lock_init(&tbl->slot_tbl_lock); 4827 spin_lock_init(&tbl->slot_tbl_lock);
4850 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); 4828 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4829 init_completion(&tbl->complete);
4851 4830
4852 session->session_state = 1<<NFS4_SESSION_INITING; 4831 session->session_state = 1<<NFS4_SESSION_INITING;
4853 4832
@@ -4914,49 +4893,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4914 args->bc_attrs.max_reqs); 4893 args->bc_attrs.max_reqs);
4915} 4894}
4916 4895
4917static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd) 4896static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4918{ 4897{
4919 if (rcvd <= sent) 4898 struct nfs4_channel_attrs *sent = &args->fc_attrs;
4920 return 0; 4899 struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
4921 printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. " 4900
4922 "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd); 4901 if (rcvd->headerpadsz > sent->headerpadsz)
4923 return -EINVAL; 4902 return -EINVAL;
4903 if (rcvd->max_resp_sz > sent->max_resp_sz)
4904 return -EINVAL;
4905 /*
4906 * Our requested max_ops is the minimum we need; we're not
4907 * prepared to break up compounds into smaller pieces than that.
4908 * So, no point even trying to continue if the server won't
4909 * cooperate:
4910 */
4911 if (rcvd->max_ops < sent->max_ops)
4912 return -EINVAL;
4913 if (rcvd->max_reqs == 0)
4914 return -EINVAL;
4915 return 0;
4924} 4916}
4925 4917
4926#define _verify_fore_channel_attr(_name_) \ 4918static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4927 _verify_channel_attr("fore", #_name_, \ 4919{
4928 args->fc_attrs._name_, \ 4920 struct nfs4_channel_attrs *sent = &args->bc_attrs;
4929 session->fc_attrs._name_) 4921 struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
4930 4922
4931#define _verify_back_channel_attr(_name_) \ 4923 if (rcvd->max_rqst_sz > sent->max_rqst_sz)
4932 _verify_channel_attr("back", #_name_, \ 4924 return -EINVAL;
4933 args->bc_attrs._name_, \ 4925 if (rcvd->max_resp_sz < sent->max_resp_sz)
4934 session->bc_attrs._name_) 4926 return -EINVAL;
4927 if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
4928 return -EINVAL;
4929 /* These would render the backchannel useless: */
4930 if (rcvd->max_ops == 0)
4931 return -EINVAL;
4932 if (rcvd->max_reqs == 0)
4933 return -EINVAL;
4934 return 0;
4935}
4935 4936
4936/*
4937 * The server is not allowed to increase the fore channel header pad size,
4938 * maximum response size, or maximum number of operations.
4939 *
4940 * The back channel attributes are only negotiated down: we send what the
4941 * (back channel) server insists upon.
4942 */
4943static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, 4937static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
4944 struct nfs4_session *session) 4938 struct nfs4_session *session)
4945{ 4939{
4946 int ret = 0; 4940 int ret;
4947
4948 ret |= _verify_fore_channel_attr(headerpadsz);
4949 ret |= _verify_fore_channel_attr(max_resp_sz);
4950 ret |= _verify_fore_channel_attr(max_ops);
4951
4952 ret |= _verify_back_channel_attr(headerpadsz);
4953 ret |= _verify_back_channel_attr(max_rqst_sz);
4954 ret |= _verify_back_channel_attr(max_resp_sz);
4955 ret |= _verify_back_channel_attr(max_resp_sz_cached);
4956 ret |= _verify_back_channel_attr(max_ops);
4957 ret |= _verify_back_channel_attr(max_reqs);
4958 4941
4959 return ret; 4942 ret = nfs4_verify_fore_channel_attrs(args, session);
4943 if (ret)
4944 return ret;
4945 return nfs4_verify_back_channel_attrs(args, session);
4960} 4946}
4961 4947
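/*
 * Editor's sketch (illustration only, not part of this patch): the
 * negotiation rule the two helpers above enforce. The server may trim
 * most limits but must never raise what the client offered, and a few
 * limits have hard floors. A plain struct stands in for
 * nfs4_channel_attrs.
 */
#include <stdio.h>

struct chan_attrs {
	unsigned int headerpadsz, max_resp_sz, max_ops, max_reqs;
};

static int verify_fore(const struct chan_attrs *sent,
		       const struct chan_attrs *rcvd)
{
	if (rcvd->headerpadsz > sent->headerpadsz)
		return -1;      /* server may not increase padding */
	if (rcvd->max_resp_sz > sent->max_resp_sz)
		return -1;      /* ...nor the reply size ceiling */
	if (rcvd->max_ops < sent->max_ops)
		return -1;      /* we cannot split compounds further */
	if (rcvd->max_reqs == 0)
		return -1;      /* a session needs at least one slot */
	return 0;
}

int main(void)
{
	struct chan_attrs sent = { 0, 4096, 8, 16 };
	struct chan_attrs rcvd = { 0, 4096, 8, 1 };

	printf("%d\n", verify_fore(&sent, &rcvd)); /* 0: acceptable */
	return 0;
}
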
4962static int _nfs4_proc_create_session(struct nfs_client *clp) 4948static int _nfs4_proc_create_session(struct nfs_client *clp)
@@ -5111,7 +5097,6 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
5111{ 5097{
5112 switch(task->tk_status) { 5098 switch(task->tk_status) {
5113 case -NFS4ERR_DELAY: 5099 case -NFS4ERR_DELAY:
5114 case -EKEYEXPIRED:
5115 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5100 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5116 return -EAGAIN; 5101 return -EAGAIN;
5117 default: 5102 default:
@@ -5180,12 +5165,11 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
5180 5165
5181 if (!atomic_inc_not_zero(&clp->cl_count)) 5166 if (!atomic_inc_not_zero(&clp->cl_count))
5182 return ERR_PTR(-EIO); 5167 return ERR_PTR(-EIO);
5183 calldata = kmalloc(sizeof(*calldata), GFP_NOFS); 5168 calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
5184 if (calldata == NULL) { 5169 if (calldata == NULL) {
5185 nfs_put_client(clp); 5170 nfs_put_client(clp);
5186 return ERR_PTR(-ENOMEM); 5171 return ERR_PTR(-ENOMEM);
5187 } 5172 }
5188 calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5189 msg.rpc_argp = &calldata->args; 5173 msg.rpc_argp = &calldata->args;
5190 msg.rpc_resp = &calldata->res; 5174 msg.rpc_resp = &calldata->res;
5191 calldata->clp = clp; 5175 calldata->clp = clp;
@@ -5254,7 +5238,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
5254 case -NFS4ERR_WRONG_CRED: /* What to do here? */ 5238 case -NFS4ERR_WRONG_CRED: /* What to do here? */
5255 break; 5239 break;
5256 case -NFS4ERR_DELAY: 5240 case -NFS4ERR_DELAY:
5257 case -EKEYEXPIRED:
5258 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5241 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5259 return -EAGAIN; 5242 return -EAGAIN;
5260 default: 5243 default:
@@ -5317,7 +5300,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5317 goto out; 5300 goto out;
5318 calldata->clp = clp; 5301 calldata->clp = clp;
5319 calldata->arg.one_fs = 0; 5302 calldata->arg.one_fs = 0;
5320 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5321 5303
5322 msg.rpc_argp = &calldata->arg; 5304 msg.rpc_argp = &calldata->arg;
5323 msg.rpc_resp = &calldata->res; 5305 msg.rpc_resp = &calldata->res;
@@ -5333,6 +5315,152 @@ out:
5333 dprintk("<-- %s status=%d\n", __func__, status); 5315 dprintk("<-- %s status=%d\n", __func__, status);
5334 return status; 5316 return status;
5335} 5317}
5318
5319static void
5320nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
5321{
5322 struct nfs4_layoutget *lgp = calldata;
5323 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5324
5325 dprintk("--> %s\n", __func__);
5326 /* Note there is a race here, where a CB_LAYOUTRECALL can come in
5327 * right now covering the LAYOUTGET we are about to send.
5328 * However, that is not so catastrophic, and there seems
5329 * to be no way to prevent it completely.
5330 */
5331 if (nfs4_setup_sequence(server, &lgp->args.seq_args,
5332 &lgp->res.seq_res, 0, task))
5333 return;
5334 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
5335 NFS_I(lgp->args.inode)->layout,
5336 lgp->args.ctx->state)) {
5337 rpc_exit(task, NFS4_OK);
5338 return;
5339 }
5340 rpc_call_start(task);
5341}
5342
5343static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
5344{
5345 struct nfs4_layoutget *lgp = calldata;
5346 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5347
5348 dprintk("--> %s\n", __func__);
5349
5350 if (!nfs4_sequence_done(task, &lgp->res.seq_res))
5351 return;
5352
5353 switch (task->tk_status) {
5354 case 0:
5355 break;
5356 case -NFS4ERR_LAYOUTTRYLATER:
5357 case -NFS4ERR_RECALLCONFLICT:
5358 task->tk_status = -NFS4ERR_DELAY;
5359 /* Fall through */
5360 default:
5361 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5362 rpc_restart_call_prepare(task);
5363 return;
5364 }
5365 }
5366 dprintk("<-- %s\n", __func__);
5367}
5368
5369static void nfs4_layoutget_release(void *calldata)
5370{
5371 struct nfs4_layoutget *lgp = calldata;
5372
5373 dprintk("--> %s\n", __func__);
5374 if (lgp->res.layout.buf != NULL)
5375 free_page((unsigned long) lgp->res.layout.buf);
5376 put_nfs_open_context(lgp->args.ctx);
5377 kfree(calldata);
5378 dprintk("<-- %s\n", __func__);
5379}
5380
5381static const struct rpc_call_ops nfs4_layoutget_call_ops = {
5382 .rpc_call_prepare = nfs4_layoutget_prepare,
5383 .rpc_call_done = nfs4_layoutget_done,
5384 .rpc_release = nfs4_layoutget_release,
5385};
5386
5387int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5388{
5389 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5390 struct rpc_task *task;
5391 struct rpc_message msg = {
5392 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
5393 .rpc_argp = &lgp->args,
5394 .rpc_resp = &lgp->res,
5395 };
5396 struct rpc_task_setup task_setup_data = {
5397 .rpc_client = server->client,
5398 .rpc_message = &msg,
5399 .callback_ops = &nfs4_layoutget_call_ops,
5400 .callback_data = lgp,
5401 .flags = RPC_TASK_ASYNC,
5402 };
5403 int status = 0;
5404
5405 dprintk("--> %s\n", __func__);
5406
5407 lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
5408 if (lgp->res.layout.buf == NULL) {
5409 nfs4_layoutget_release(lgp);
5410 return -ENOMEM;
5411 }
5412
5413 lgp->res.seq_res.sr_slot = NULL;
5414 task = rpc_run_task(&task_setup_data);
5415 if (IS_ERR(task))
5416 return PTR_ERR(task);
5417 status = nfs4_wait_for_completion_rpc_task(task);
5418 if (status == 0)
5419 status = task->tk_status;
5420 if (status == 0)
5421 status = pnfs_layout_process(lgp);
5422 rpc_put_task(task);
5423 dprintk("<-- %s status=%d\n", __func__, status);
5424 return status;
5425}
5426
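/*
 * Editor's sketch (illustration only, not part of this patch): the
 * synchronous-over-async shape of nfs4_proc_layoutget() above. An
 * asynchronous task is started, the caller blocks until it completes,
 * and only then is the reply post-processed on the caller's stack.
 * pthreads stand in for the RPC task machinery. Build: cc -pthread
 */
#include <pthread.h>
#include <stdio.h>

static void *task_body(void *arg)
{
	*(int *)arg = 0;          /* pretend the LAYOUTGET RPC succeeded */
	return NULL;
}

int main(void)
{
	pthread_t task;
	int status = -1;

	if (pthread_create(&task, NULL, task_body, &status))
		return 1;         /* rpc_run_task() failed */
	pthread_join(task, NULL); /* nfs4_wait_for_completion_rpc_task() */
	if (status == 0)
		puts("process layout reply"); /* pnfs_layout_process() */
	return status;
}
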
5427static int
5428_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5429{
5430 struct nfs4_getdeviceinfo_args args = {
5431 .pdev = pdev,
5432 };
5433 struct nfs4_getdeviceinfo_res res = {
5434 .pdev = pdev,
5435 };
5436 struct rpc_message msg = {
5437 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
5438 .rpc_argp = &args,
5439 .rpc_resp = &res,
5440 };
5441 int status;
5442
5443 dprintk("--> %s\n", __func__);
5444 status = nfs4_call_sync(server, &msg, &args, &res, 0);
5445 dprintk("<-- %s status=%d\n", __func__, status);
5446
5447 return status;
5448}
5449
5450int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5451{
5452 struct nfs4_exception exception = { };
5453 int err;
5454
5455 do {
5456 err = nfs4_handle_exception(server,
5457 _nfs4_proc_getdeviceinfo(server, pdev),
5458 &exception);
5459 } while (exception.retry);
5460 return err;
5461}
5462EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
5463
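/*
 * Editor's sketch (illustration only, not part of this patch): the
 * nfs4_handle_exception() retry idiom that nfs4_proc_getdeviceinfo()
 * wraps around the one-shot _nfs4_proc_getdeviceinfo(). The handler
 * decides whether an error is transient and the caller loops while it
 * says "retry". The back-off below is illustrative.
 */
#include <stdio.h>
#include <unistd.h>

#define ERR_DELAY (-10008)      /* NFS4ERR_DELAY, negated kernel-style */

struct exception { int retry; };

static int handle_exception(int err, struct exception *exc)
{
	exc->retry = 0;
	if (err == ERR_DELAY) {
		sleep(1);       /* back off, then try again */
		exc->retry = 1;
		return 0;
	}
	return err;
}

static int do_getdeviceinfo(void)
{
	static int calls;
	return ++calls < 3 ? ERR_DELAY : 0; /* fail twice, then succeed */
}

int main(void)
{
	struct exception exc = { 0 };
	int err;

	do {
		err = handle_exception(do_getdeviceinfo(), &exc);
	} while (exc.retry);
	printf("final status %d\n", err);   /* 0 */
	return 0;
}
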
5336#endif /* CONFIG_NFS_V4_1 */ 5464#endif /* CONFIG_NFS_V4_1 */
5337 5465
5338struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 5466struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5421,9 +5549,10 @@ static const struct inode_operations nfs4_file_inode_operations = {
5421 .permission = nfs_permission, 5549 .permission = nfs_permission,
5422 .getattr = nfs_getattr, 5550 .getattr = nfs_getattr,
5423 .setattr = nfs_setattr, 5551 .setattr = nfs_setattr,
5424 .getxattr = nfs4_getxattr, 5552 .getxattr = generic_getxattr,
5425 .setxattr = nfs4_setxattr, 5553 .setxattr = generic_setxattr,
5426 .listxattr = nfs4_listxattr, 5554 .listxattr = generic_listxattr,
5555 .removexattr = generic_removexattr,
5427}; 5556};
5428 5557
5429const struct nfs_rpc_ops nfs_v4_clientops = { 5558const struct nfs_rpc_ops nfs_v4_clientops = {
@@ -5443,6 +5572,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5443 .unlink_setup = nfs4_proc_unlink_setup, 5572 .unlink_setup = nfs4_proc_unlink_setup,
5444 .unlink_done = nfs4_proc_unlink_done, 5573 .unlink_done = nfs4_proc_unlink_done,
5445 .rename = nfs4_proc_rename, 5574 .rename = nfs4_proc_rename,
5575 .rename_setup = nfs4_proc_rename_setup,
5576 .rename_done = nfs4_proc_rename_done,
5446 .link = nfs4_proc_link, 5577 .link = nfs4_proc_link,
5447 .symlink = nfs4_proc_symlink, 5578 .symlink = nfs4_proc_symlink,
5448 .mkdir = nfs4_proc_mkdir, 5579 .mkdir = nfs4_proc_mkdir,
@@ -5463,6 +5594,19 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5463 .lock = nfs4_proc_lock, 5594 .lock = nfs4_proc_lock,
5464 .clear_acl_cache = nfs4_zap_acl_attr, 5595 .clear_acl_cache = nfs4_zap_acl_attr,
5465 .close_context = nfs4_close_context, 5596 .close_context = nfs4_close_context,
5597 .open_context = nfs4_atomic_open,
5598};
5599
5600static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
5601 .prefix = XATTR_NAME_NFSV4_ACL,
5602 .list = nfs4_xattr_list_nfs4_acl,
5603 .get = nfs4_xattr_get_nfs4_acl,
5604 .set = nfs4_xattr_set_nfs4_acl,
5605};
5606
5607const struct xattr_handler *nfs4_xattr_handlers[] = {
5608 &nfs4_xattr_nfs4_acl_handler,
5609 NULL
5466}; 5610};
5467 5611
5468/* 5612/*
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 72b6c580af13..402143d75fc5 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -63,9 +63,14 @@ nfs4_renew_state(struct work_struct *work)
63 63
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 /* Are there any active superblocks? */ 66
67 if (list_empty(&clp->cl_superblocks)) 67 rcu_read_lock();
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
68 goto out; 70 goto out;
71 }
72 rcu_read_unlock();
73
69 spin_lock(&clp->cl_lock); 74 spin_lock(&clp->cl_lock);
70 lease = clp->cl_lease_time; 75 lease = clp->cl_lease_time;
71 last = clp->cl_last_renewal; 76 last = clp->cl_last_renewal;
@@ -75,7 +80,7 @@ nfs4_renew_state(struct work_struct *work)
75 cred = ops->get_state_renewal_cred_locked(clp); 80 cred = ops->get_state_renewal_cred_locked(clp);
76 spin_unlock(&clp->cl_lock); 81 spin_unlock(&clp->cl_lock);
77 if (cred == NULL) { 82 if (cred == NULL) {
78 if (list_empty(&clp->cl_delegations)) { 83 if (!nfs_delegations_present(clp)) {
79 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 84 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
80 goto out; 85 goto out;
81 } 86 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3e2f19b04c06..e6742b57a04c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -40,12 +40,13 @@
40 40
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/smp_lock.h> 43#include <linux/fs.h>
44#include <linux/nfs_fs.h> 44#include <linux/nfs_fs.h>
45#include <linux/nfs_idmap.h> 45#include <linux/nfs_idmap.h>
46#include <linux/kthread.h> 46#include <linux/kthread.h>
47#include <linux/module.h> 47#include <linux/module.h>
48#include <linux/random.h> 48#include <linux/random.h>
49#include <linux/ratelimit.h>
49#include <linux/workqueue.h> 50#include <linux/workqueue.h>
50#include <linux/bitops.h> 51#include <linux/bitops.h>
51 52
@@ -53,6 +54,7 @@
53#include "callback.h" 54#include "callback.h"
54#include "delegation.h" 55#include "delegation.h"
55#include "internal.h" 56#include "internal.h"
57#include "pnfs.h"
56 58
57#define OPENOWNER_POOL_SIZE 8 59#define OPENOWNER_POOL_SIZE 8
58 60
@@ -103,14 +105,17 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
103 put_rpccred(cred); 105 put_rpccred(cred);
104} 106}
105 107
106struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) 108static struct rpc_cred *
109nfs4_get_renew_cred_server_locked(struct nfs_server *server)
107{ 110{
111 struct rpc_cred *cred = NULL;
108 struct nfs4_state_owner *sp; 112 struct nfs4_state_owner *sp;
109 struct rb_node *pos; 113 struct rb_node *pos;
110 struct rpc_cred *cred = NULL;
111 114
112 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 115 for (pos = rb_first(&server->state_owners);
113 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 116 pos != NULL;
117 pos = rb_next(pos)) {
118 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
114 if (list_empty(&sp->so_states)) 119 if (list_empty(&sp->so_states))
115 continue; 120 continue;
116 cred = get_rpccred(sp->so_cred); 121 cred = get_rpccred(sp->so_cred);
@@ -119,6 +124,28 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
119 return cred; 124 return cred;
120} 125}
121 126
127/**
128 * nfs4_get_renew_cred_locked - Acquire credential for a renew operation
129 * @clp: client state handle
130 *
131 * Returns an rpc_cred with reference count bumped, or NULL.
132 * Caller must hold clp->cl_lock.
133 */
134struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
135{
136 struct rpc_cred *cred = NULL;
137 struct nfs_server *server;
138
139 rcu_read_lock();
140 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
141 cred = nfs4_get_renew_cred_server_locked(server);
142 if (cred != NULL)
143 break;
144 }
145 rcu_read_unlock();
146 return cred;
147}
148
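/*
 * Editor's sketch (illustration only, not part of this patch): the new
 * per-server walk in nfs4_get_renew_cred_locked(). State owners now live
 * in one tree per nfs_server, so the client iterates its servers (under
 * RCU in the kernel) and takes the first owner that still has open
 * state. Plain arrays stand in for the RCU list and the rb-tree.
 */
#include <stddef.h>
#include <stdio.h>

struct owner { const char *cred; int has_open_state; };
struct server { struct owner owners[4]; size_t nr; };

static const char *renew_cred_for(const struct server *s)
{
	for (size_t i = 0; i < s->nr; i++)
		if (s->owners[i].has_open_state)
			return s->owners[i].cred;
	return NULL;
}

int main(void)
{
	struct server servers[2] = {
		{ { { "credA", 0 } }, 1 },
		{ { { "credB", 1 } }, 1 },
	};

	for (size_t i = 0; i < 2; i++) {
		const char *cred = renew_cred_for(&servers[i]);
		if (cred) {
			printf("renew with %s\n", cred); /* credB */
			break;
		}
	}
	return 0;
}
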
122#if defined(CONFIG_NFS_V4_1) 149#if defined(CONFIG_NFS_V4_1)
123 150
124static int nfs41_setup_state_renewal(struct nfs_client *clp) 151static int nfs41_setup_state_renewal(struct nfs_client *clp)
@@ -140,6 +167,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
140 return status; 167 return status;
141} 168}
142 169
170/*
171 * The back channel returns NFS4ERR_DELAY for new requests while
172 * NFS4_SESSION_DRAINING is set, so there is no work to be done once
173 * draining ends.
174 */
143static void nfs4_end_drain_session(struct nfs_client *clp) 175static void nfs4_end_drain_session(struct nfs_client *clp)
144{ 176{
145 struct nfs4_session *ses = clp->cl_session; 177 struct nfs4_session *ses = clp->cl_session;
@@ -163,22 +195,32 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
163 } 195 }
164} 196}
165 197
166static int nfs4_begin_drain_session(struct nfs_client *clp) 198static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
167{ 199{
168 struct nfs4_session *ses = clp->cl_session;
169 struct nfs4_slot_table *tbl = &ses->fc_slot_table;
170
171 spin_lock(&tbl->slot_tbl_lock); 200 spin_lock(&tbl->slot_tbl_lock);
172 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
173 if (tbl->highest_used_slotid != -1) { 201 if (tbl->highest_used_slotid != -1) {
174 INIT_COMPLETION(ses->complete); 202 INIT_COMPLETION(tbl->complete);
175 spin_unlock(&tbl->slot_tbl_lock); 203 spin_unlock(&tbl->slot_tbl_lock);
176 return wait_for_completion_interruptible(&ses->complete); 204 return wait_for_completion_interruptible(&tbl->complete);
177 } 205 }
178 spin_unlock(&tbl->slot_tbl_lock); 206 spin_unlock(&tbl->slot_tbl_lock);
179 return 0; 207 return 0;
180} 208}
181 209
210static int nfs4_begin_drain_session(struct nfs_client *clp)
211{
212 struct nfs4_session *ses = clp->cl_session;
213 int ret = 0;
214
215 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
216 /* back channel */
217 ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
218 if (ret)
219 return ret;
220 /* fore channel */
221 return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
222}
223
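/*
 * Editor's sketch (illustration only, not part of this patch): the
 * per-slot-table draining introduced above. The drainer waits on a
 * completion that the last slot user signals once highest_used_slotid
 * falls back to -1, draining the back channel before the fore channel.
 * A condition variable plays the completion's role. Build: cc -pthread
 */
#include <pthread.h>
#include <stdio.h>

struct slot_table {
	pthread_mutex_t lock;
	pthread_cond_t complete;
	int highest_used_slotid;
};

static void drain(struct slot_table *tbl)
{
	pthread_mutex_lock(&tbl->lock);
	while (tbl->highest_used_slotid != -1)
		pthread_cond_wait(&tbl->complete, &tbl->lock);
	pthread_mutex_unlock(&tbl->lock);
}

static void release_last_slot(struct slot_table *tbl)
{
	pthread_mutex_lock(&tbl->lock);
	tbl->highest_used_slotid = -1;
	pthread_cond_signal(&tbl->complete); /* complete(&tbl->complete) */
	pthread_mutex_unlock(&tbl->lock);
}

int main(void)
{
	struct slot_table bc = { PTHREAD_MUTEX_INITIALIZER,
				 PTHREAD_COND_INITIALIZER, -1 };
	struct slot_table fc = { PTHREAD_MUTEX_INITIALIZER,
				 PTHREAD_COND_INITIALIZER, -1 };

	release_last_slot(&bc);
	release_last_slot(&fc);
	drain(&bc);                /* back channel first, as above */
	drain(&fc);
	puts("session drained");
	return 0;
}
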
182int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 224int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
183{ 225{
184 int status; 226 int status;
@@ -208,28 +250,56 @@ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
208 250
209#endif /* CONFIG_NFS_V4_1 */ 251#endif /* CONFIG_NFS_V4_1 */
210 252
211struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 253static struct rpc_cred *
254nfs4_get_setclientid_cred_server(struct nfs_server *server)
212{ 255{
256 struct nfs_client *clp = server->nfs_client;
257 struct rpc_cred *cred = NULL;
213 struct nfs4_state_owner *sp; 258 struct nfs4_state_owner *sp;
214 struct rb_node *pos; 259 struct rb_node *pos;
260
261 spin_lock(&clp->cl_lock);
262 pos = rb_first(&server->state_owners);
263 if (pos != NULL) {
264 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
265 cred = get_rpccred(sp->so_cred);
266 }
267 spin_unlock(&clp->cl_lock);
268 return cred;
269}
270
271/**
272 * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
273 * @clp: client state handle
274 *
275 * Returns an rpc_cred with reference count bumped, or NULL.
276 */
277struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
278{
279 struct nfs_server *server;
215 struct rpc_cred *cred; 280 struct rpc_cred *cred;
216 281
217 spin_lock(&clp->cl_lock); 282 spin_lock(&clp->cl_lock);
218 cred = nfs4_get_machine_cred_locked(clp); 283 cred = nfs4_get_machine_cred_locked(clp);
284 spin_unlock(&clp->cl_lock);
219 if (cred != NULL) 285 if (cred != NULL)
220 goto out; 286 goto out;
221 pos = rb_first(&clp->cl_state_owners); 287
222 if (pos != NULL) { 288 rcu_read_lock();
223 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 289 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
224 cred = get_rpccred(sp->so_cred); 290 cred = nfs4_get_setclientid_cred_server(server);
291 if (cred != NULL)
292 break;
225 } 293 }
294 rcu_read_unlock();
295
226out: 296out:
227 spin_unlock(&clp->cl_lock);
228 return cred; 297 return cred;
229} 298}
230 299
231static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, 300static void nfs_alloc_unique_id_locked(struct rb_root *root,
232 __u64 minval, int maxbits) 301 struct nfs_unique_id *new,
302 __u64 minval, int maxbits)
233{ 303{
234 struct rb_node **p, *parent; 304 struct rb_node **p, *parent;
235 struct nfs_unique_id *pos; 305 struct nfs_unique_id *pos;
@@ -284,16 +354,15 @@ static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
284} 354}
285 355
286static struct nfs4_state_owner * 356static struct nfs4_state_owner *
287nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) 357nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
288{ 358{
289 struct nfs_client *clp = server->nfs_client; 359 struct rb_node **p = &server->state_owners.rb_node,
290 struct rb_node **p = &clp->cl_state_owners.rb_node,
291 *parent = NULL; 360 *parent = NULL;
292 struct nfs4_state_owner *sp, *res = NULL; 361 struct nfs4_state_owner *sp, *res = NULL;
293 362
294 while (*p != NULL) { 363 while (*p != NULL) {
295 parent = *p; 364 parent = *p;
296 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 365 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
297 366
298 if (server < sp->so_server) { 367 if (server < sp->so_server) {
299 p = &parent->rb_left; 368 p = &parent->rb_left;
@@ -317,24 +386,17 @@ nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
317} 386}
318 387
319static struct nfs4_state_owner * 388static struct nfs4_state_owner *
320nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) 389nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
321{ 390{
322 struct rb_node **p = &clp->cl_state_owners.rb_node, 391 struct nfs_server *server = new->so_server;
392 struct rb_node **p = &server->state_owners.rb_node,
323 *parent = NULL; 393 *parent = NULL;
324 struct nfs4_state_owner *sp; 394 struct nfs4_state_owner *sp;
325 395
326 while (*p != NULL) { 396 while (*p != NULL) {
327 parent = *p; 397 parent = *p;
328 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); 398 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
329 399
330 if (new->so_server < sp->so_server) {
331 p = &parent->rb_left;
332 continue;
333 }
334 if (new->so_server > sp->so_server) {
335 p = &parent->rb_right;
336 continue;
337 }
338 if (new->so_cred < sp->so_cred) 400 if (new->so_cred < sp->so_cred)
339 p = &parent->rb_left; 401 p = &parent->rb_left;
340 else if (new->so_cred > sp->so_cred) 402 else if (new->so_cred > sp->so_cred)
@@ -344,18 +406,21 @@ nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
344 return sp; 406 return sp;
345 } 407 }
346 } 408 }
347 nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); 409 nfs_alloc_unique_id_locked(&server->openowner_id,
348 rb_link_node(&new->so_client_node, parent, p); 410 &new->so_owner_id, 1, 64);
349 rb_insert_color(&new->so_client_node, &clp->cl_state_owners); 411 rb_link_node(&new->so_server_node, parent, p);
412 rb_insert_color(&new->so_server_node, &server->state_owners);
350 return new; 413 return new;
351} 414}
352 415
353static void 416static void
354nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) 417nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
355{ 418{
356 if (!RB_EMPTY_NODE(&sp->so_client_node)) 419 struct nfs_server *server = sp->so_server;
357 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 420
358 nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); 421 if (!RB_EMPTY_NODE(&sp->so_server_node))
422 rb_erase(&sp->so_server_node, &server->state_owners);
423 nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
359} 424}
360 425
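/*
 * Editor's sketch (illustration only, not part of this patch): the
 * insert-or-return-existing walk of nfs4_insert_state_owner_locked().
 * Descend the tree comparing the key (here the credential's identity),
 * return the existing node on a match, otherwise link the new node where
 * the walk fell off the tree. A plain unbalanced BST stands in for the
 * kernel rb-tree (rebalancing omitted).
 */
#include <stdint.h>
#include <stdio.h>

struct node {
	uintptr_t cred;            /* key: credential identity */
	struct node *left, *right;
};

static struct node *insert(struct node **root, struct node *new)
{
	struct node **p = root;

	while (*p) {
		if (new->cred < (*p)->cred)
			p = &(*p)->left;
		else if (new->cred > (*p)->cred)
			p = &(*p)->right;
		else
			return *p; /* already present: reuse it */
	}
	*p = new;                  /* rb_link_node() analogue */
	return new;
}

int main(void)
{
	struct node *root = NULL;
	struct node a = { 1, NULL, NULL }, b = { 1, NULL, NULL };

	printf("%s\n", insert(&root, &a) == &a ? "inserted" : "existing");
	printf("%s\n", insert(&root, &b) == &b ? "inserted" : "existing");
	return 0;
}
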
361/* 426/*
@@ -384,23 +449,32 @@ nfs4_alloc_state_owner(void)
384static void 449static void
385nfs4_drop_state_owner(struct nfs4_state_owner *sp) 450nfs4_drop_state_owner(struct nfs4_state_owner *sp)
386{ 451{
387 if (!RB_EMPTY_NODE(&sp->so_client_node)) { 452 if (!RB_EMPTY_NODE(&sp->so_server_node)) {
388 struct nfs_client *clp = sp->so_server->nfs_client; 453 struct nfs_server *server = sp->so_server;
454 struct nfs_client *clp = server->nfs_client;
389 455
390 spin_lock(&clp->cl_lock); 456 spin_lock(&clp->cl_lock);
391 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 457 rb_erase(&sp->so_server_node, &server->state_owners);
392 RB_CLEAR_NODE(&sp->so_client_node); 458 RB_CLEAR_NODE(&sp->so_server_node);
393 spin_unlock(&clp->cl_lock); 459 spin_unlock(&clp->cl_lock);
394 } 460 }
395} 461}
396 462
397struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) 463/**
464 * nfs4_get_state_owner - Look up a state owner given a credential
465 * @server: nfs_server to search
466 * @cred: RPC credential to match
467 *
468 * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
469 */
470struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
471 struct rpc_cred *cred)
398{ 472{
399 struct nfs_client *clp = server->nfs_client; 473 struct nfs_client *clp = server->nfs_client;
400 struct nfs4_state_owner *sp, *new; 474 struct nfs4_state_owner *sp, *new;
401 475
402 spin_lock(&clp->cl_lock); 476 spin_lock(&clp->cl_lock);
403 sp = nfs4_find_state_owner(server, cred); 477 sp = nfs4_find_state_owner_locked(server, cred);
404 spin_unlock(&clp->cl_lock); 478 spin_unlock(&clp->cl_lock);
405 if (sp != NULL) 479 if (sp != NULL)
406 return sp; 480 return sp;
@@ -410,7 +484,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
410 new->so_server = server; 484 new->so_server = server;
411 new->so_cred = cred; 485 new->so_cred = cred;
412 spin_lock(&clp->cl_lock); 486 spin_lock(&clp->cl_lock);
413 sp = nfs4_insert_state_owner(clp, new); 487 sp = nfs4_insert_state_owner_locked(new);
414 spin_unlock(&clp->cl_lock); 488 spin_unlock(&clp->cl_lock);
415 if (sp == new) 489 if (sp == new)
416 get_rpccred(cred); 490 get_rpccred(cred);
@@ -421,6 +495,11 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
421 return sp; 495 return sp;
422} 496}
423 497
498/**
499 * nfs4_put_state_owner - Release a nfs4_state_owner
500 * @sp: state owner data to release
501 *
502 */
424void nfs4_put_state_owner(struct nfs4_state_owner *sp) 503void nfs4_put_state_owner(struct nfs4_state_owner *sp)
425{ 504{
426 struct nfs_client *clp = sp->so_server->nfs_client; 505 struct nfs_client *clp = sp->so_server->nfs_client;
@@ -428,7 +507,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
428 507
429 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 508 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
430 return; 509 return;
431 nfs4_remove_state_owner(clp, sp); 510 nfs4_remove_state_owner_locked(sp);
432 spin_unlock(&clp->cl_lock); 511 spin_unlock(&clp->cl_lock);
433 rpc_destroy_wait_queue(&sp->so_sequence.wait); 512 rpc_destroy_wait_queue(&sp->so_sequence.wait);
434 put_rpccred(cred); 513 put_rpccred(cred);
@@ -583,8 +662,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
583 if (!call_close) { 662 if (!call_close) {
584 nfs4_put_open_state(state); 663 nfs4_put_open_state(state);
585 nfs4_put_state_owner(owner); 664 nfs4_put_state_owner(owner);
586 } else 665 } else {
587 nfs4_do_close(path, state, gfp_mask, wait); 666 bool roc = pnfs_roc(state->inode);
667
668 nfs4_do_close(path, state, gfp_mask, wait, roc);
669 }
588} 670}
589 671
590void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 672void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
@@ -631,7 +713,8 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
631static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 713static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
632{ 714{
633 struct nfs4_lock_state *lsp; 715 struct nfs4_lock_state *lsp;
634 struct nfs_client *clp = state->owner->so_server->nfs_client; 716 struct nfs_server *server = state->owner->so_server;
717 struct nfs_client *clp = server->nfs_client;
635 718
636 lsp = kzalloc(sizeof(*lsp), GFP_NOFS); 719 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
637 if (lsp == NULL) 720 if (lsp == NULL)
@@ -655,7 +738,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
655 return NULL; 738 return NULL;
656 } 739 }
657 spin_lock(&clp->cl_lock); 740 spin_lock(&clp->cl_lock);
658 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 741 nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64);
659 spin_unlock(&clp->cl_lock); 742 spin_unlock(&clp->cl_lock);
660 INIT_LIST_HEAD(&lsp->ls_locks); 743 INIT_LIST_HEAD(&lsp->ls_locks);
661 return lsp; 744 return lsp;
@@ -663,10 +746,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
663 746
664static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) 747static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
665{ 748{
666 struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; 749 struct nfs_server *server = lsp->ls_state->owner->so_server;
750 struct nfs_client *clp = server->nfs_client;
667 751
668 spin_lock(&clp->cl_lock); 752 spin_lock(&clp->cl_lock);
669 nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); 753 nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
670 spin_unlock(&clp->cl_lock); 754 spin_unlock(&clp->cl_lock);
671 rpc_destroy_wait_queue(&lsp->ls_sequence.wait); 755 rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
672 kfree(lsp); 756 kfree(lsp);
@@ -970,13 +1054,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
970 /* Guard against delegation returns and new lock/unlock calls */ 1054 /* Guard against delegation returns and new lock/unlock calls */
971 down_write(&nfsi->rwsem); 1055 down_write(&nfsi->rwsem);
972 /* Protect inode->i_flock using the BKL */ 1056 /* Protect inode->i_flock using the BKL */
973 lock_kernel(); 1057 lock_flocks();
974 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1058 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
975 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 1059 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
976 continue; 1060 continue;
977 if (nfs_file_open_context(fl->fl_file)->state != state) 1061 if (nfs_file_open_context(fl->fl_file)->state != state)
978 continue; 1062 continue;
979 unlock_kernel(); 1063 unlock_flocks();
980 status = ops->recover_lock(state, fl); 1064 status = ops->recover_lock(state, fl);
981 switch (status) { 1065 switch (status) {
982 case 0: 1066 case 0:
@@ -1003,9 +1087,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1003 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 1087 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
1004 status = 0; 1088 status = 0;
1005 } 1089 }
1006 lock_kernel(); 1090 lock_flocks();
1007 } 1091 }
1008 unlock_kernel(); 1092 unlock_flocks();
1009out: 1093out:
1010 up_write(&nfsi->rwsem); 1094 up_write(&nfsi->rwsem);
1011 return status; 1095 return status;
@@ -1063,6 +1147,14 @@ restart:
1063 /* Mark the file as being 'closed' */ 1147 /* Mark the file as being 'closed' */
1064 state->state = 0; 1148 state->state = 0;
1065 break; 1149 break;
1150 case -EKEYEXPIRED:
1151 /*
1152 * User RPCSEC_GSS context has expired.
1153 * We cannot recover this stateid now, so
1154 * skip it and allow the recovery thread to
1155 * proceed.
1156 */
1157 break;
1066 case -NFS4ERR_ADMIN_REVOKED: 1158 case -NFS4ERR_ADMIN_REVOKED:
1067 case -NFS4ERR_STALE_STATEID: 1159 case -NFS4ERR_STALE_STATEID:
1068 case -NFS4ERR_BAD_STATEID: 1160 case -NFS4ERR_BAD_STATEID:
@@ -1104,15 +1196,19 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
1104 } 1196 }
1105} 1197}
1106 1198
1107static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) 1199static void nfs4_reset_seqids(struct nfs_server *server,
1200 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1108{ 1201{
1202 struct nfs_client *clp = server->nfs_client;
1109 struct nfs4_state_owner *sp; 1203 struct nfs4_state_owner *sp;
1110 struct rb_node *pos; 1204 struct rb_node *pos;
1111 struct nfs4_state *state; 1205 struct nfs4_state *state;
1112 1206
1113 /* Reset all sequence ids to zero */ 1207 spin_lock(&clp->cl_lock);
1114 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1208 for (pos = rb_first(&server->state_owners);
1115 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1209 pos != NULL;
1210 pos = rb_next(pos)) {
1211 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1116 sp->so_seqid.flags = 0; 1212 sp->so_seqid.flags = 0;
1117 spin_lock(&sp->so_lock); 1213 spin_lock(&sp->so_lock);
1118 list_for_each_entry(state, &sp->so_states, open_states) { 1214 list_for_each_entry(state, &sp->so_states, open_states) {
@@ -1121,6 +1217,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re
1121 } 1217 }
1122 spin_unlock(&sp->so_lock); 1218 spin_unlock(&sp->so_lock);
1123 } 1219 }
1220 spin_unlock(&clp->cl_lock);
1221}
1222
1223static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
1224 int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
1225{
1226 struct nfs_server *server;
1227
1228 rcu_read_lock();
1229 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1230 nfs4_reset_seqids(server, mark_reclaim);
1231 rcu_read_unlock();
1124} 1232}
1125 1233
1126static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) 1234static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
@@ -1138,29 +1246,51 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
1138 (void)ops->reclaim_complete(clp); 1246 (void)ops->reclaim_complete(clp);
1139} 1247}
1140 1248
1141static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) 1249static void nfs4_clear_reclaim_server(struct nfs_server *server)
1142{ 1250{
1251 struct nfs_client *clp = server->nfs_client;
1143 struct nfs4_state_owner *sp; 1252 struct nfs4_state_owner *sp;
1144 struct rb_node *pos; 1253 struct rb_node *pos;
1145 struct nfs4_state *state; 1254 struct nfs4_state *state;
1146 1255
1147 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1256 spin_lock(&clp->cl_lock);
1148 return; 1257 for (pos = rb_first(&server->state_owners);
1149 1258 pos != NULL;
1150 nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); 1259 pos = rb_next(pos)) {
1151 1260 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
1152 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
1153 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
1154 spin_lock(&sp->so_lock); 1261 spin_lock(&sp->so_lock);
1155 list_for_each_entry(state, &sp->so_states, open_states) { 1262 list_for_each_entry(state, &sp->so_states, open_states) {
1156 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) 1263 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
1264 &state->flags))
1157 continue; 1265 continue;
1158 nfs4_state_mark_reclaim_nograce(clp, state); 1266 nfs4_state_mark_reclaim_nograce(clp, state);
1159 } 1267 }
1160 spin_unlock(&sp->so_lock); 1268 spin_unlock(&sp->so_lock);
1161 } 1269 }
1270 spin_unlock(&clp->cl_lock);
1271}
1272
1273static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
1274{
1275 struct nfs_server *server;
1276
1277 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1278 return 0;
1279
1280 rcu_read_lock();
1281 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
1282 nfs4_clear_reclaim_server(server);
1283 rcu_read_unlock();
1162 1284
1163 nfs_delegation_reap_unclaimed(clp); 1285 nfs_delegation_reap_unclaimed(clp);
1286 return 1;
1287}
1288
1289static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1290{
1291 if (!nfs4_state_clear_reclaim_reboot(clp))
1292 return;
1293 nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
1164} 1294}
1165 1295
1166static void nfs_delegation_clear_all(struct nfs_client *clp) 1296static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1175,6 +1305,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1175 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); 1305 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1176} 1306}
1177 1307
1308static void nfs4_warn_keyexpired(const char *s)
1309{
1310 printk_ratelimited(KERN_WARNING "Error: state manager"
1311 " encountered RPCSEC_GSS session"
1312 " expired against NFSv4 server %s.\n",
1313 s);
1314}
1315
1178static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) 1316static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1179{ 1317{
1180 switch (error) { 1318 switch (error) {
@@ -1187,7 +1325,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1187 case -NFS4ERR_STALE_CLIENTID: 1325 case -NFS4ERR_STALE_CLIENTID:
1188 case -NFS4ERR_LEASE_MOVED: 1326 case -NFS4ERR_LEASE_MOVED:
1189 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1327 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1190 nfs4_state_end_reclaim_reboot(clp); 1328 nfs4_state_clear_reclaim_reboot(clp);
1191 nfs4_state_start_reclaim_reboot(clp); 1329 nfs4_state_start_reclaim_reboot(clp);
1192 break; 1330 break;
1193 case -NFS4ERR_EXPIRED: 1331 case -NFS4ERR_EXPIRED:
@@ -1204,33 +1342,50 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1204 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); 1342 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
1205 /* Zero session reset errors */ 1343 /* Zero session reset errors */
1206 return 0; 1344 return 0;
1345 case -EKEYEXPIRED:
1346 /* Nothing we can do */
1347 nfs4_warn_keyexpired(clp->cl_hostname);
1348 return 0;
1207 } 1349 }
1208 return error; 1350 return error;
1209} 1351}
1210 1352
1211static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) 1353static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
1212{ 1354{
1355 struct nfs4_state_owner *sp;
1356 struct nfs_server *server;
1213 struct rb_node *pos; 1357 struct rb_node *pos;
1214 int status = 0; 1358 int status = 0;
1215 1359
1216restart: 1360restart:
1217 spin_lock(&clp->cl_lock); 1361 rcu_read_lock();
1218 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1362 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
1219 struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1363 spin_lock(&clp->cl_lock);
1220 if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) 1364 for (pos = rb_first(&server->state_owners);
1221 continue; 1365 pos != NULL;
1222 atomic_inc(&sp->so_count); 1366 pos = rb_next(pos)) {
1223 spin_unlock(&clp->cl_lock); 1367 sp = rb_entry(pos,
1224 status = nfs4_reclaim_open_state(sp, ops); 1368 struct nfs4_state_owner, so_server_node);
1225 if (status < 0) { 1369 if (!test_and_clear_bit(ops->owner_flag_bit,
1226 set_bit(ops->owner_flag_bit, &sp->so_flags); 1370 &sp->so_flags))
1371 continue;
1372 atomic_inc(&sp->so_count);
1373 spin_unlock(&clp->cl_lock);
1374 rcu_read_unlock();
1375
1376 status = nfs4_reclaim_open_state(sp, ops);
1377 if (status < 0) {
1378 set_bit(ops->owner_flag_bit, &sp->so_flags);
1379 nfs4_put_state_owner(sp);
1380 return nfs4_recovery_handle_error(clp, status);
1381 }
1382
1227 nfs4_put_state_owner(sp); 1383 nfs4_put_state_owner(sp);
1228 return nfs4_recovery_handle_error(clp, status); 1384 goto restart;
1229 } 1385 }
1230 nfs4_put_state_owner(sp); 1386 spin_unlock(&clp->cl_lock);
1231 goto restart;
1232 } 1387 }
1233 spin_unlock(&clp->cl_lock); 1388 rcu_read_unlock();
1234 return status; 1389 return status;
1235} 1390}
1236 1391
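/*
 * Editor's sketch (illustration only, not part of this patch): the
 * lock-drop-and-restart shape of nfs4_do_reclaim(). Each owner carries a
 * "needs reclaim" flag that is tested and cleared while the lock is
 * held; the lock is then dropped for the slow reclaim work and the scan
 * restarts from the top, so already-serviced owners are skipped.
 * Build: cc -pthread
 */
#include <pthread.h>
#include <stdio.h>

#define NR_OWNERS 3

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int needs_reclaim[NR_OWNERS] = { 1, 1, 1 };

static void reclaim(int i)
{
	printf("reclaiming owner %d\n", i); /* nfs4_reclaim_open_state() */
}

int main(void)
{
restart:
	pthread_mutex_lock(&lock);
	for (int i = 0; i < NR_OWNERS; i++) {
		if (!needs_reclaim[i])
			continue;
		needs_reclaim[i] = 0;      /* test_and_clear under lock */
		pthread_mutex_unlock(&lock);
		reclaim(i);                /* slow work, lock dropped */
		goto restart;              /* rescan: tree may have changed */
	}
	pthread_mutex_unlock(&lock);
	return 0;
}
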
@@ -1414,9 +1569,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
1414 case -NFS4ERR_DELAY: 1569 case -NFS4ERR_DELAY:
1415 case -NFS4ERR_CLID_INUSE: 1570 case -NFS4ERR_CLID_INUSE:
1416 case -EAGAIN: 1571 case -EAGAIN:
1417 case -EKEYEXPIRED:
1418 break; 1572 break;
1419 1573
1574 case -EKEYEXPIRED:
1575 nfs4_warn_keyexpired(clp->cl_hostname);
1420 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1576 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1421 * in nfs4_exchange_id */ 1577 * in nfs4_exchange_id */
1422 default: 1578 default:
@@ -1447,6 +1603,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1447 } 1603 }
1448 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1604 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1449 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); 1605 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
1606 pnfs_destroy_all_layouts(clp);
1450 } 1607 }
1451 1608
1452 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { 1609 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 08ef91291132..4e2c168b6ee9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
52#include <linux/nfs_idmap.h> 52#include <linux/nfs_idmap.h>
53#include "nfs4_fs.h" 53#include "nfs4_fs.h"
54#include "internal.h" 54#include "internal.h"
55#include "pnfs.h"
55 56
56#define NFSDBG_FACILITY NFSDBG_XDR 57#define NFSDBG_FACILITY NFSDBG_XDR
57 58
@@ -70,8 +71,8 @@ static int nfs4_stat_to_errno(int);
70/* lock,open owner id: 71/* lock,open owner id:
71 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) 72 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
72 */ 73 */
73#define open_owner_id_maxsz (1 + 4) 74#define open_owner_id_maxsz (1 + 1 + 4)
74#define lock_owner_id_maxsz (1 + 4) 75#define lock_owner_id_maxsz (1 + 1 + 4)
75#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) 76#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
76#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 77#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
77#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 78#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
@@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int);
310 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 311 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
311#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 312#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
312#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 313#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
314#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
315 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
316#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
317 1 /* layout type */ + \
318 1 /* opaque devaddr4 length */ + \
319 /* devaddr4 payload is read into page */ \
320 1 /* notification bitmap length */ + \
321 1 /* notification bitmap */)
322#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
323 encode_stateid_maxsz)
324#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
325 decode_stateid_maxsz + \
326 XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
313#else /* CONFIG_NFS_V4_1 */ 327#else /* CONFIG_NFS_V4_1 */
314#define encode_sequence_maxsz 0 328#define encode_sequence_maxsz 0
315#define decode_sequence_maxsz 0 329#define decode_sequence_maxsz 0
@@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int);
699#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 713#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
700 decode_sequence_maxsz + \ 714 decode_sequence_maxsz + \
701 decode_reclaim_complete_maxsz) 715 decode_reclaim_complete_maxsz)
716#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
717 encode_sequence_maxsz +\
718 encode_getdeviceinfo_maxsz)
719#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
720 decode_sequence_maxsz + \
721 decode_getdeviceinfo_maxsz)
722#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \
723 encode_sequence_maxsz + \
724 encode_putfh_maxsz + \
725 encode_layoutget_maxsz)
726#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \
727 decode_sequence_maxsz + \
728 decode_putfh_maxsz + \
729 decode_layoutget_maxsz)
702 730
703const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 731const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
704 compound_encode_hdr_maxsz + 732 compound_encode_hdr_maxsz +
@@ -816,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
816 if (iap->ia_valid & ATTR_MODE) 844 if (iap->ia_valid & ATTR_MODE)
817 len += 4; 845 len += 4;
818 if (iap->ia_valid & ATTR_UID) { 846 if (iap->ia_valid & ATTR_UID) {
819 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); 847 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
820 if (owner_namelen < 0) { 848 if (owner_namelen < 0) {
821 dprintk("nfs: couldn't resolve uid %d to string\n", 849 dprintk("nfs: couldn't resolve uid %d to string\n",
822 iap->ia_uid); 850 iap->ia_uid);
@@ -828,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
828 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
829 } 857 }
830 if (iap->ia_valid & ATTR_GID) { 858 if (iap->ia_valid & ATTR_GID) {
831 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); 859 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
832 if (owner_grouplen < 0) { 860 if (owner_grouplen < 0) {
833 dprintk("nfs: couldn't resolve gid %d to string\n", 861 dprintk("nfs: couldn't resolve gid %d to string\n",
834 iap->ia_gid); 862 iap->ia_gid);
@@ -1060,10 +1088,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo
1060{ 1088{
1061 __be32 *p; 1089 __be32 *p;
1062 1090
1063 p = reserve_space(xdr, 28); 1091 p = reserve_space(xdr, 32);
1064 p = xdr_encode_hyper(p, lowner->clientid); 1092 p = xdr_encode_hyper(p, lowner->clientid);
1065 *p++ = cpu_to_be32(16); 1093 *p++ = cpu_to_be32(20);
1066 p = xdr_encode_opaque_fixed(p, "lock id:", 8); 1094 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1095 *p++ = cpu_to_be32(lowner->s_dev);
1067 xdr_encode_hyper(p, lowner->id); 1096 xdr_encode_hyper(p, lowner->id);
1068} 1097}
1069 1098
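/*
 * Editor's sketch (illustration only, not part of this patch): why the
 * lock-owner encoding above grows from 28 to 32 reserved bytes. The
 * opaque owner expands from 16 to 20 bytes because a 32-bit device
 * number now precedes the 64-bit id. htonl()/htobe64() stand in for
 * cpu_to_be32() and xdr_encode_hyper(); all values are placeholders.
 */
#include <arpa/inet.h>
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char buf[32], *p = buf;
	uint64_t clientid = htobe64(0x1122334455667788ULL);
	uint32_t len = htonl(20), s_dev = htonl(42);
	uint64_t id = htobe64(7);

	memcpy(p, &clientid, 8);  p += 8;  /* 8-byte clientid */
	memcpy(p, &len, 4);       p += 4;  /* opaque length: now 20 */
	memcpy(p, "lock id:", 8); p += 8;  /* fixed tag */
	memcpy(p, &s_dev, 4);     p += 4;  /* new: device number */
	memcpy(p, &id, 8);        p += 8;  /* 64-bit owner id */
	printf("encoded %td bytes\n", p - buf); /* 32 */
	return 0;
}
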
@@ -1182,10 +1211,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1182 *p++ = cpu_to_be32(OP_OPEN); 1211 *p++ = cpu_to_be32(OP_OPEN);
1183 *p = cpu_to_be32(arg->seqid->sequence->counter); 1212 *p = cpu_to_be32(arg->seqid->sequence->counter);
1184 encode_share_access(xdr, arg->fmode); 1213 encode_share_access(xdr, arg->fmode);
1185 p = reserve_space(xdr, 28); 1214 p = reserve_space(xdr, 32);
1186 p = xdr_encode_hyper(p, arg->clientid); 1215 p = xdr_encode_hyper(p, arg->clientid);
1187 *p++ = cpu_to_be32(16); 1216 *p++ = cpu_to_be32(20);
1188 p = xdr_encode_opaque_fixed(p, "open id:", 8); 1217 p = xdr_encode_opaque_fixed(p, "open id:", 8);
1218 *p++ = cpu_to_be32(arg->server->s_dev);
1189 xdr_encode_hyper(p, arg->id); 1219 xdr_encode_hyper(p, arg->id);
1190} 1220}
1191 1221
@@ -1385,24 +1415,35 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1385 1415
1386static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) 1416static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
1387{ 1417{
1388 uint32_t attrs[2] = { 1418 uint32_t attrs[2] = {0, 0};
1389 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, 1419 uint32_t dircount = readdir->count >> 1;
1390 FATTR4_WORD1_MOUNTED_ON_FILEID,
1391 };
1392 __be32 *p; 1420 __be32 *p;
1393 1421
1422 if (readdir->plus) {
1423 attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
1424 FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
1425 attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
1426 FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
1427 FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
1428 FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
1429 dircount >>= 1;
1430 }
1431 attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
1432 attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
1433 /* Switch to mounted_on_fileid if the server supports it */
1434 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
1435 attrs[0] &= ~FATTR4_WORD0_FILEID;
1436 else
1437 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1438
1394 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); 1439 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
1395 *p++ = cpu_to_be32(OP_READDIR); 1440 *p++ = cpu_to_be32(OP_READDIR);
1396 p = xdr_encode_hyper(p, readdir->cookie); 1441 p = xdr_encode_hyper(p, readdir->cookie);
1397 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); 1442 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
1398 *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ 1443 *p++ = cpu_to_be32(dircount);
1399 *p++ = cpu_to_be32(readdir->count); 1444 *p++ = cpu_to_be32(readdir->count);
1400 *p++ = cpu_to_be32(2); 1445 *p++ = cpu_to_be32(2);
1401 /* Switch to mounted_on_fileid if the server supports it */ 1446
1402 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
1403 attrs[0] &= ~FATTR4_WORD0_FILEID;
1404 else
1405 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1406 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); 1447 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1407 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); 1448 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1408 hdr->nops++; 1449 hdr->nops++;
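
/*
 * Editor's sketch (illustration only, not part of this patch): the
 * readdirplus attribute selection above. Start from the base bitmap, OR
 * in the extra attributes only for a "plus" readdir, then AND against
 * what the server advertised so nothing unsupported is requested. Bit
 * values here are illustrative, not the FATTR4 constants.
 */
#include <stdint.h>
#include <stdio.h>

#define A_FILEID 0x01u  /* assumed bit */
#define A_TYPE   0x02u  /* assumed bit */
#define A_SIZE   0x04u  /* assumed bit */

int main(void)
{
	uint32_t attrs = A_FILEID;        /* always request the fileid */
	uint32_t server_mask = A_FILEID | A_TYPE;
	int plus = 1;

	if (plus)
		attrs |= A_TYPE | A_SIZE; /* extra readdirplus attrs */
	attrs &= server_mask;             /* drop unsupported bits */
	printf("request bitmap: 0x%x\n", attrs); /* 0x3 */
	return 0;
}
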
@@ -1471,7 +1512,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1471 hdr->replen += decode_restorefh_maxsz; 1512 hdr->replen += decode_restorefh_maxsz;
1472} 1513}
1473 1514
1474static int 1515static void
1475encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) 1516encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
1476{ 1517{
1477 __be32 *p; 1518 __be32 *p;
@@ -1482,14 +1523,12 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1482 p = reserve_space(xdr, 2*4); 1523 p = reserve_space(xdr, 2*4);
1483 *p++ = cpu_to_be32(1); 1524 *p++ = cpu_to_be32(1);
1484 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1525 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1485 if (arg->acl_len % 4) 1526 BUG_ON(arg->acl_len % 4);
1486 return -EINVAL;
1487 p = reserve_space(xdr, 4); 1527 p = reserve_space(xdr, 4);
1488 *p = cpu_to_be32(arg->acl_len); 1528 *p = cpu_to_be32(arg->acl_len);
1489 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1529 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1490 hdr->nops++; 1530 hdr->nops++;
1491 hdr->replen += decode_setacl_maxsz; 1531 hdr->replen += decode_setacl_maxsz;
1492 return 0;
1493} 1532}
1494 1533
1495static void 1534static void
@@ -1726,6 +1765,55 @@ static void encode_sequence(struct xdr_stream *xdr,
1726#endif /* CONFIG_NFS_V4_1 */ 1765#endif /* CONFIG_NFS_V4_1 */
1727} 1766}
1728 1767
1768#ifdef CONFIG_NFS_V4_1
1769static void
1770encode_getdeviceinfo(struct xdr_stream *xdr,
1771 const struct nfs4_getdeviceinfo_args *args,
1772 struct compound_hdr *hdr)
1773{
1774 __be32 *p;
1775
1776 p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
1777 *p++ = cpu_to_be32(OP_GETDEVICEINFO);
1778 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1779 NFS4_DEVICEID4_SIZE);
1780 *p++ = cpu_to_be32(args->pdev->layout_type);
1781 *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
1782 *p++ = cpu_to_be32(0); /* bitmap length 0 */
1783 hdr->nops++;
1784 hdr->replen += decode_getdeviceinfo_maxsz;
1785}
1786
1787static void
1788encode_layoutget(struct xdr_stream *xdr,
1789 const struct nfs4_layoutget_args *args,
1790 struct compound_hdr *hdr)
1791{
1792 __be32 *p;
1793
1794 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
1795 *p++ = cpu_to_be32(OP_LAYOUTGET);
1796 *p++ = cpu_to_be32(0); /* Signal layout available */
1797 *p++ = cpu_to_be32(args->type);
1798 *p++ = cpu_to_be32(args->range.iomode);
1799 p = xdr_encode_hyper(p, args->range.offset);
1800 p = xdr_encode_hyper(p, args->range.length);
1801 p = xdr_encode_hyper(p, args->minlength);
1802 p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
1803 *p = cpu_to_be32(args->maxcount);
1804
1805 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
1806 __func__,
1807 args->type,
1808 args->range.iomode,
1809 (unsigned long)args->range.offset,
1810 (unsigned long)args->range.length,
1811 args->maxcount);
1812 hdr->nops++;
1813 hdr->replen += decode_layoutget_maxsz;
1814}
1815#endif /* CONFIG_NFS_V4_1 */
1816
1729/* 1817/*
1730 * END OF "GENERIC" ENCODE ROUTINES. 1818 * END OF "GENERIC" ENCODE ROUTINES.
1731 */ 1819 */
@@ -1742,393 +1830,362 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
1742/* 1830/*
1743 * Encode an ACCESS request 1831 * Encode an ACCESS request
1744 */ 1832 */
1745static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args) 1833static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
1834 const struct nfs4_accessargs *args)
1746{ 1835{
1747 struct xdr_stream xdr;
1748 struct compound_hdr hdr = { 1836 struct compound_hdr hdr = {
1749 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1837 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1750 }; 1838 };
1751 1839
1752 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1840 encode_compound_hdr(xdr, req, &hdr);
1753 encode_compound_hdr(&xdr, req, &hdr); 1841 encode_sequence(xdr, &args->seq_args, &hdr);
1754 encode_sequence(&xdr, &args->seq_args, &hdr); 1842 encode_putfh(xdr, args->fh, &hdr);
1755 encode_putfh(&xdr, args->fh, &hdr); 1843 encode_access(xdr, args->access, &hdr);
1756 encode_access(&xdr, args->access, &hdr); 1844 encode_getfattr(xdr, args->bitmask, &hdr);
1757 encode_getfattr(&xdr, args->bitmask, &hdr);
1758 encode_nops(&hdr); 1845 encode_nops(&hdr);
1759 return 0;
1760} 1846}
1761 1847
1762/* 1848/*
1763 * Encode LOOKUP request 1849 * Encode LOOKUP request
1764 */ 1850 */
1765static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args) 1851static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
1852 const struct nfs4_lookup_arg *args)
1766{ 1853{
1767 struct xdr_stream xdr;
1768 struct compound_hdr hdr = { 1854 struct compound_hdr hdr = {
1769 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1855 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1770 }; 1856 };
1771 1857
1772 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1858 encode_compound_hdr(xdr, req, &hdr);
1773 encode_compound_hdr(&xdr, req, &hdr); 1859 encode_sequence(xdr, &args->seq_args, &hdr);
1774 encode_sequence(&xdr, &args->seq_args, &hdr); 1860 encode_putfh(xdr, args->dir_fh, &hdr);
1775 encode_putfh(&xdr, args->dir_fh, &hdr); 1861 encode_lookup(xdr, args->name, &hdr);
1776 encode_lookup(&xdr, args->name, &hdr); 1862 encode_getfh(xdr, &hdr);
1777 encode_getfh(&xdr, &hdr); 1863 encode_getfattr(xdr, args->bitmask, &hdr);
1778 encode_getfattr(&xdr, args->bitmask, &hdr);
1779 encode_nops(&hdr); 1864 encode_nops(&hdr);
1780 return 0;
1781} 1865}
1782 1866
1783/* 1867/*
1784 * Encode LOOKUP_ROOT request 1868 * Encode LOOKUP_ROOT request
1785 */ 1869 */
1786static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args) 1870static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
1871 struct xdr_stream *xdr,
1872 const struct nfs4_lookup_root_arg *args)
1787{ 1873{
1788 struct xdr_stream xdr;
1789 struct compound_hdr hdr = { 1874 struct compound_hdr hdr = {
1790 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1875 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1791 }; 1876 };
1792 1877
1793 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1878 encode_compound_hdr(xdr, req, &hdr);
1794 encode_compound_hdr(&xdr, req, &hdr); 1879 encode_sequence(xdr, &args->seq_args, &hdr);
1795 encode_sequence(&xdr, &args->seq_args, &hdr); 1880 encode_putrootfh(xdr, &hdr);
1796 encode_putrootfh(&xdr, &hdr); 1881 encode_getfh(xdr, &hdr);
1797 encode_getfh(&xdr, &hdr); 1882 encode_getfattr(xdr, args->bitmask, &hdr);
1798 encode_getfattr(&xdr, args->bitmask, &hdr);
1799 encode_nops(&hdr); 1883 encode_nops(&hdr);
1800 return 0;
1801} 1884}
1802 1885
1803/* 1886/*
1804 * Encode REMOVE request 1887 * Encode REMOVE request
1805 */ 1888 */
1806static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) 1889static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
1890 const struct nfs_removeargs *args)
1807{ 1891{
1808 struct xdr_stream xdr;
1809 struct compound_hdr hdr = { 1892 struct compound_hdr hdr = {
1810 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1893 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1811 }; 1894 };
1812 1895
1813 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1896 encode_compound_hdr(xdr, req, &hdr);
1814 encode_compound_hdr(&xdr, req, &hdr); 1897 encode_sequence(xdr, &args->seq_args, &hdr);
1815 encode_sequence(&xdr, &args->seq_args, &hdr); 1898 encode_putfh(xdr, args->fh, &hdr);
1816 encode_putfh(&xdr, args->fh, &hdr); 1899 encode_remove(xdr, &args->name, &hdr);
1817 encode_remove(&xdr, &args->name, &hdr); 1900 encode_getfattr(xdr, args->bitmask, &hdr);
1818 encode_getfattr(&xdr, args->bitmask, &hdr);
1819 encode_nops(&hdr); 1901 encode_nops(&hdr);
1820 return 0;
1821} 1902}
1822 1903
1823/* 1904/*
1824 * Encode RENAME request 1905 * Encode RENAME request
1825 */ 1906 */
1826static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args) 1907static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
1908 const struct nfs_renameargs *args)
1827{ 1909{
1828 struct xdr_stream xdr;
1829 struct compound_hdr hdr = { 1910 struct compound_hdr hdr = {
1830 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1911 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1831 }; 1912 };
1832 1913
1833 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1914 encode_compound_hdr(xdr, req, &hdr);
1834 encode_compound_hdr(&xdr, req, &hdr); 1915 encode_sequence(xdr, &args->seq_args, &hdr);
1835 encode_sequence(&xdr, &args->seq_args, &hdr); 1916 encode_putfh(xdr, args->old_dir, &hdr);
1836 encode_putfh(&xdr, args->old_dir, &hdr); 1917 encode_savefh(xdr, &hdr);
1837 encode_savefh(&xdr, &hdr); 1918 encode_putfh(xdr, args->new_dir, &hdr);
1838 encode_putfh(&xdr, args->new_dir, &hdr); 1919 encode_rename(xdr, args->old_name, args->new_name, &hdr);
1839 encode_rename(&xdr, args->old_name, args->new_name, &hdr); 1920 encode_getfattr(xdr, args->bitmask, &hdr);
1840 encode_getfattr(&xdr, args->bitmask, &hdr); 1921 encode_restorefh(xdr, &hdr);
1841 encode_restorefh(&xdr, &hdr); 1922 encode_getfattr(xdr, args->bitmask, &hdr);
1842 encode_getfattr(&xdr, args->bitmask, &hdr);
1843 encode_nops(&hdr); 1923 encode_nops(&hdr);
1844 return 0;
1845} 1924}
1846 1925
1847/* 1926/*
1848 * Encode LINK request 1927 * Encode LINK request
1849 */ 1928 */
1850static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args) 1929static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
1930 const struct nfs4_link_arg *args)
1851{ 1931{
1852 struct xdr_stream xdr;
1853 struct compound_hdr hdr = { 1932 struct compound_hdr hdr = {
1854 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1933 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1855 }; 1934 };
1856 1935
1857 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1936 encode_compound_hdr(xdr, req, &hdr);
1858 encode_compound_hdr(&xdr, req, &hdr); 1937 encode_sequence(xdr, &args->seq_args, &hdr);
1859 encode_sequence(&xdr, &args->seq_args, &hdr); 1938 encode_putfh(xdr, args->fh, &hdr);
1860 encode_putfh(&xdr, args->fh, &hdr); 1939 encode_savefh(xdr, &hdr);
1861 encode_savefh(&xdr, &hdr); 1940 encode_putfh(xdr, args->dir_fh, &hdr);
1862 encode_putfh(&xdr, args->dir_fh, &hdr); 1941 encode_link(xdr, args->name, &hdr);
1863 encode_link(&xdr, args->name, &hdr); 1942 encode_getfattr(xdr, args->bitmask, &hdr);
1864 encode_getfattr(&xdr, args->bitmask, &hdr); 1943 encode_restorefh(xdr, &hdr);
1865 encode_restorefh(&xdr, &hdr); 1944 encode_getfattr(xdr, args->bitmask, &hdr);
1866 encode_getfattr(&xdr, args->bitmask, &hdr);
1867 encode_nops(&hdr); 1945 encode_nops(&hdr);
1868 return 0;
1869} 1946}
1870 1947
1871/* 1948/*
1872 * Encode CREATE request 1949 * Encode CREATE request
1873 */ 1950 */
1874static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) 1951static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
1952 const struct nfs4_create_arg *args)
1875{ 1953{
1876 struct xdr_stream xdr;
1877 struct compound_hdr hdr = { 1954 struct compound_hdr hdr = {
1878 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1955 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1879 }; 1956 };
1880 1957
1881 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1958 encode_compound_hdr(xdr, req, &hdr);
1882 encode_compound_hdr(&xdr, req, &hdr); 1959 encode_sequence(xdr, &args->seq_args, &hdr);
1883 encode_sequence(&xdr, &args->seq_args, &hdr); 1960 encode_putfh(xdr, args->dir_fh, &hdr);
1884 encode_putfh(&xdr, args->dir_fh, &hdr); 1961 encode_savefh(xdr, &hdr);
1885 encode_savefh(&xdr, &hdr); 1962 encode_create(xdr, args, &hdr);
1886 encode_create(&xdr, args, &hdr); 1963 encode_getfh(xdr, &hdr);
1887 encode_getfh(&xdr, &hdr); 1964 encode_getfattr(xdr, args->bitmask, &hdr);
1888 encode_getfattr(&xdr, args->bitmask, &hdr); 1965 encode_restorefh(xdr, &hdr);
1889 encode_restorefh(&xdr, &hdr); 1966 encode_getfattr(xdr, args->bitmask, &hdr);
1890 encode_getfattr(&xdr, args->bitmask, &hdr);
1891 encode_nops(&hdr); 1967 encode_nops(&hdr);
1892 return 0;
1893} 1968}
1894 1969
1895/* 1970/*
1896 * Encode SYMLINK request 1971 * Encode SYMLINK request
1897 */ 1972 */
1898static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) 1973static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
1974 const struct nfs4_create_arg *args)
1899{ 1975{
1900 return nfs4_xdr_enc_create(req, p, args); 1976 nfs4_xdr_enc_create(req, xdr, args);
1901} 1977}
1902 1978
1903/* 1979/*
1904 * Encode GETATTR request 1980 * Encode GETATTR request
1905 */ 1981 */
1906static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args) 1982static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
1983 const struct nfs4_getattr_arg *args)
1907{ 1984{
1908 struct xdr_stream xdr;
1909 struct compound_hdr hdr = { 1985 struct compound_hdr hdr = {
1910 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 1986 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1911 }; 1987 };
1912 1988
1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1989 encode_compound_hdr(xdr, req, &hdr);
1914 encode_compound_hdr(&xdr, req, &hdr); 1990 encode_sequence(xdr, &args->seq_args, &hdr);
1915 encode_sequence(&xdr, &args->seq_args, &hdr); 1991 encode_putfh(xdr, args->fh, &hdr);
1916 encode_putfh(&xdr, args->fh, &hdr); 1992 encode_getfattr(xdr, args->bitmask, &hdr);
1917 encode_getfattr(&xdr, args->bitmask, &hdr);
1918 encode_nops(&hdr); 1993 encode_nops(&hdr);
1919 return 0;
1920} 1994}
1921 1995
1922/* 1996/*
1923 * Encode a CLOSE request 1997 * Encode a CLOSE request
1924 */ 1998 */
1925static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 1999static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
2000 struct nfs_closeargs *args)
1926{ 2001{
1927 struct xdr_stream xdr;
1928 struct compound_hdr hdr = { 2002 struct compound_hdr hdr = {
1929 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2003 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1930 }; 2004 };
1931 2005
1932 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2006 encode_compound_hdr(xdr, req, &hdr);
1933 encode_compound_hdr(&xdr, req, &hdr); 2007 encode_sequence(xdr, &args->seq_args, &hdr);
1934 encode_sequence(&xdr, &args->seq_args, &hdr); 2008 encode_putfh(xdr, args->fh, &hdr);
1935 encode_putfh(&xdr, args->fh, &hdr); 2009 encode_close(xdr, args, &hdr);
1936 encode_close(&xdr, args, &hdr); 2010 encode_getfattr(xdr, args->bitmask, &hdr);
1937 encode_getfattr(&xdr, args->bitmask, &hdr);
1938 encode_nops(&hdr); 2011 encode_nops(&hdr);
1939 return 0;
1940} 2012}
1941 2013
1942/* 2014/*
1943 * Encode an OPEN request 2015 * Encode an OPEN request
1944 */ 2016 */
1945static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) 2017static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
2018 struct nfs_openargs *args)
1946{ 2019{
1947 struct xdr_stream xdr;
1948 struct compound_hdr hdr = { 2020 struct compound_hdr hdr = {
1949 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2021 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1950 }; 2022 };
1951 2023
1952 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2024 encode_compound_hdr(xdr, req, &hdr);
1953 encode_compound_hdr(&xdr, req, &hdr); 2025 encode_sequence(xdr, &args->seq_args, &hdr);
1954 encode_sequence(&xdr, &args->seq_args, &hdr); 2026 encode_putfh(xdr, args->fh, &hdr);
1955 encode_putfh(&xdr, args->fh, &hdr); 2027 encode_savefh(xdr, &hdr);
1956 encode_savefh(&xdr, &hdr); 2028 encode_open(xdr, args, &hdr);
1957 encode_open(&xdr, args, &hdr); 2029 encode_getfh(xdr, &hdr);
1958 encode_getfh(&xdr, &hdr); 2030 encode_getfattr(xdr, args->bitmask, &hdr);
1959 encode_getfattr(&xdr, args->bitmask, &hdr); 2031 encode_restorefh(xdr, &hdr);
1960 encode_restorefh(&xdr, &hdr); 2032 encode_getfattr(xdr, args->bitmask, &hdr);
1961 encode_getfattr(&xdr, args->bitmask, &hdr);
1962 encode_nops(&hdr); 2033 encode_nops(&hdr);
1963 return 0;
1964} 2034}
1965 2035
1966/* 2036/*
1967 * Encode an OPEN_CONFIRM request 2037 * Encode an OPEN_CONFIRM request
1968 */ 2038 */
1969static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args) 2039static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
2040 struct xdr_stream *xdr,
2041 struct nfs_open_confirmargs *args)
1970{ 2042{
1971 struct xdr_stream xdr;
1972 struct compound_hdr hdr = { 2043 struct compound_hdr hdr = {
1973 .nops = 0, 2044 .nops = 0,
1974 }; 2045 };
1975 2046
1976 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2047 encode_compound_hdr(xdr, req, &hdr);
1977 encode_compound_hdr(&xdr, req, &hdr); 2048 encode_putfh(xdr, args->fh, &hdr);
1978 encode_putfh(&xdr, args->fh, &hdr); 2049 encode_open_confirm(xdr, args, &hdr);
1979 encode_open_confirm(&xdr, args, &hdr);
1980 encode_nops(&hdr); 2050 encode_nops(&hdr);
1981 return 0;
1982} 2051}
1983 2052
1984/* 2053/*
1985 * Encode an OPEN request with no attributes. 2054 * Encode an OPEN request with no attributes.
1986 */ 2055 */
1987static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) 2056static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
2057 struct xdr_stream *xdr,
2058 struct nfs_openargs *args)
1988{ 2059{
1989 struct xdr_stream xdr;
1990 struct compound_hdr hdr = { 2060 struct compound_hdr hdr = {
1991 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2061 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
1992 }; 2062 };
1993 2063
1994 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2064 encode_compound_hdr(xdr, req, &hdr);
1995 encode_compound_hdr(&xdr, req, &hdr); 2065 encode_sequence(xdr, &args->seq_args, &hdr);
1996 encode_sequence(&xdr, &args->seq_args, &hdr); 2066 encode_putfh(xdr, args->fh, &hdr);
1997 encode_putfh(&xdr, args->fh, &hdr); 2067 encode_open(xdr, args, &hdr);
1998 encode_open(&xdr, args, &hdr); 2068 encode_getfattr(xdr, args->bitmask, &hdr);
1999 encode_getfattr(&xdr, args->bitmask, &hdr);
2000 encode_nops(&hdr); 2069 encode_nops(&hdr);
2001 return 0;
2002} 2070}
2003 2071
2004/* 2072/*
2005 * Encode an OPEN_DOWNGRADE request 2073 * Encode an OPEN_DOWNGRADE request
2006 */ 2074 */
2007static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 2075static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
2076 struct xdr_stream *xdr,
2077 struct nfs_closeargs *args)
2008{ 2078{
2009 struct xdr_stream xdr;
2010 struct compound_hdr hdr = { 2079 struct compound_hdr hdr = {
2011 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2080 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2012 }; 2081 };
2013 2082
2014 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2083 encode_compound_hdr(xdr, req, &hdr);
2015 encode_compound_hdr(&xdr, req, &hdr); 2084 encode_sequence(xdr, &args->seq_args, &hdr);
2016 encode_sequence(&xdr, &args->seq_args, &hdr); 2085 encode_putfh(xdr, args->fh, &hdr);
2017 encode_putfh(&xdr, args->fh, &hdr); 2086 encode_open_downgrade(xdr, args, &hdr);
2018 encode_open_downgrade(&xdr, args, &hdr); 2087 encode_getfattr(xdr, args->bitmask, &hdr);
2019 encode_getfattr(&xdr, args->bitmask, &hdr);
2020 encode_nops(&hdr); 2088 encode_nops(&hdr);
2021 return 0;
2022} 2089}
2023 2090
2024/* 2091/*
2025 * Encode a LOCK request 2092 * Encode a LOCK request
2026 */ 2093 */
2027static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args) 2094static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
2095 struct nfs_lock_args *args)
2028{ 2096{
2029 struct xdr_stream xdr;
2030 struct compound_hdr hdr = { 2097 struct compound_hdr hdr = {
2031 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2098 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2032 }; 2099 };
2033 2100
2034 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2101 encode_compound_hdr(xdr, req, &hdr);
2035 encode_compound_hdr(&xdr, req, &hdr); 2102 encode_sequence(xdr, &args->seq_args, &hdr);
2036 encode_sequence(&xdr, &args->seq_args, &hdr); 2103 encode_putfh(xdr, args->fh, &hdr);
2037 encode_putfh(&xdr, args->fh, &hdr); 2104 encode_lock(xdr, args, &hdr);
2038 encode_lock(&xdr, args, &hdr);
2039 encode_nops(&hdr); 2105 encode_nops(&hdr);
2040 return 0;
2041} 2106}
2042 2107
2043/* 2108/*
2044 * Encode a LOCKT request 2109 * Encode a LOCKT request
2045 */ 2110 */
2046static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args) 2111static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
2112 struct nfs_lockt_args *args)
2047{ 2113{
2048 struct xdr_stream xdr;
2049 struct compound_hdr hdr = { 2114 struct compound_hdr hdr = {
2050 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2115 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2051 }; 2116 };
2052 2117
2053 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2118 encode_compound_hdr(xdr, req, &hdr);
2054 encode_compound_hdr(&xdr, req, &hdr); 2119 encode_sequence(xdr, &args->seq_args, &hdr);
2055 encode_sequence(&xdr, &args->seq_args, &hdr); 2120 encode_putfh(xdr, args->fh, &hdr);
2056 encode_putfh(&xdr, args->fh, &hdr); 2121 encode_lockt(xdr, args, &hdr);
2057 encode_lockt(&xdr, args, &hdr);
2058 encode_nops(&hdr); 2122 encode_nops(&hdr);
2059 return 0;
2060} 2123}
2061 2124
2062/* 2125/*
2063 * Encode a LOCKU request 2126 * Encode a LOCKU request
2064 */ 2127 */
2065static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args) 2128static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
2129 struct nfs_locku_args *args)
2066{ 2130{
2067 struct xdr_stream xdr;
2068 struct compound_hdr hdr = { 2131 struct compound_hdr hdr = {
2069 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2132 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2070 }; 2133 };
2071 2134
2072 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2135 encode_compound_hdr(xdr, req, &hdr);
2073 encode_compound_hdr(&xdr, req, &hdr); 2136 encode_sequence(xdr, &args->seq_args, &hdr);
2074 encode_sequence(&xdr, &args->seq_args, &hdr); 2137 encode_putfh(xdr, args->fh, &hdr);
2075 encode_putfh(&xdr, args->fh, &hdr); 2138 encode_locku(xdr, args, &hdr);
2076 encode_locku(&xdr, args, &hdr);
2077 encode_nops(&hdr); 2139 encode_nops(&hdr);
2078 return 0;
2079} 2140}
2080 2141
2081static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) 2142static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
2143 struct xdr_stream *xdr,
2144 struct nfs_release_lockowner_args *args)
2082{ 2145{
2083 struct xdr_stream xdr;
2084 struct compound_hdr hdr = { 2146 struct compound_hdr hdr = {
2085 .minorversion = 0, 2147 .minorversion = 0,
2086 }; 2148 };
2087 2149
2088 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2150 encode_compound_hdr(xdr, req, &hdr);
2089 encode_compound_hdr(&xdr, req, &hdr); 2151 encode_release_lockowner(xdr, &args->lock_owner, &hdr);
2090 encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
2091 encode_nops(&hdr); 2152 encode_nops(&hdr);
2092 return 0;
2093} 2153}
2094 2154
2095/* 2155/*
2096 * Encode a READLINK request 2156 * Encode a READLINK request
2097 */ 2157 */
2098static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args) 2158static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
2159 const struct nfs4_readlink *args)
2099{ 2160{
2100 struct xdr_stream xdr;
2101 struct compound_hdr hdr = { 2161 struct compound_hdr hdr = {
2102 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2162 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2103 }; 2163 };
2104 2164
2105 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2165 encode_compound_hdr(xdr, req, &hdr);
2106 encode_compound_hdr(&xdr, req, &hdr); 2166 encode_sequence(xdr, &args->seq_args, &hdr);
2107 encode_sequence(&xdr, &args->seq_args, &hdr); 2167 encode_putfh(xdr, args->fh, &hdr);
2108 encode_putfh(&xdr, args->fh, &hdr); 2168 encode_readlink(xdr, args, req, &hdr);
2109 encode_readlink(&xdr, args, req, &hdr);
2110 2169
2111 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, 2170 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
2112 args->pgbase, args->pglen); 2171 args->pgbase, args->pglen);
2113 encode_nops(&hdr); 2172 encode_nops(&hdr);
2114 return 0;
2115} 2173}
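One detail worth spelling out for the READLINK (and the following READDIR and READ) encoders: hdr.replen is a running total in 32-bit XDR words, each encode_*() helper adding its decode_*_maxsz, so shifting left by 2 converts it to the byte offset of the opaque data in the reply:

/* Sketch of the reply layout arranged by xdr_inline_pages():
 *
 *   rq_rcv_buf.head : first (hdr.replen << 2) bytes -- compound
 *                     status plus SEQUENCE, PUTFH and READLINK headers
 *   rq_rcv_buf.pages: the symlink text itself (args->pages)
 *   rq_rcv_buf.tail : whatever trails the opaque data
 */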
2116 2174
2117/* 2175/*
2118 * Encode a READDIR request 2176 * Encode a READDIR request
2119 */ 2177 */
2120static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args) 2178static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
2179 const struct nfs4_readdir_arg *args)
2121{ 2180{
2122 struct xdr_stream xdr;
2123 struct compound_hdr hdr = { 2181 struct compound_hdr hdr = {
2124 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2182 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2125 }; 2183 };
2126 2184
2127 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2185 encode_compound_hdr(xdr, req, &hdr);
2128 encode_compound_hdr(&xdr, req, &hdr); 2186 encode_sequence(xdr, &args->seq_args, &hdr);
2129 encode_sequence(&xdr, &args->seq_args, &hdr); 2187 encode_putfh(xdr, args->fh, &hdr);
2130 encode_putfh(&xdr, args->fh, &hdr); 2188 encode_readdir(xdr, args, req, &hdr);
2131 encode_readdir(&xdr, args, req, &hdr);
2132 2189
2133 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, 2190 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,
2134 args->pgbase, args->count); 2191 args->pgbase, args->count);
@@ -2136,413 +2193,414 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
2136 __func__, hdr.replen << 2, args->pages, 2193 __func__, hdr.replen << 2, args->pages,
2137 args->pgbase, args->count); 2194 args->pgbase, args->count);
2138 encode_nops(&hdr); 2195 encode_nops(&hdr);
2139 return 0;
2140} 2196}
2141 2197
2142/* 2198/*
2143 * Encode a READ request 2199 * Encode a READ request
2144 */ 2200 */
2145static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 2201static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
2202 struct nfs_readargs *args)
2146{ 2203{
2147 struct xdr_stream xdr;
2148 struct compound_hdr hdr = { 2204 struct compound_hdr hdr = {
2149 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2205 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2150 }; 2206 };
2151 2207
2152 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2208 encode_compound_hdr(xdr, req, &hdr);
2153 encode_compound_hdr(&xdr, req, &hdr); 2209 encode_sequence(xdr, &args->seq_args, &hdr);
2154 encode_sequence(&xdr, &args->seq_args, &hdr); 2210 encode_putfh(xdr, args->fh, &hdr);
2155 encode_putfh(&xdr, args->fh, &hdr); 2211 encode_read(xdr, args, &hdr);
2156 encode_read(&xdr, args, &hdr);
2157 2212
2158 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, 2213 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
2159 args->pages, args->pgbase, args->count); 2214 args->pages, args->pgbase, args->count);
2160 req->rq_rcv_buf.flags |= XDRBUF_READ; 2215 req->rq_rcv_buf.flags |= XDRBUF_READ;
2161 encode_nops(&hdr); 2216 encode_nops(&hdr);
2162 return 0;
2163} 2217}
2164 2218
2165/* 2219/*
2166 * Encode a SETATTR request 2220 * Encode a SETATTR request
2167 */ 2221 */
2168static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) 2222static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
2223 struct nfs_setattrargs *args)
2169{ 2224{
2170 struct xdr_stream xdr;
2171 struct compound_hdr hdr = { 2225 struct compound_hdr hdr = {
2172 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2226 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2173 }; 2227 };
2174 2228
2175 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2229 encode_compound_hdr(xdr, req, &hdr);
2176 encode_compound_hdr(&xdr, req, &hdr); 2230 encode_sequence(xdr, &args->seq_args, &hdr);
2177 encode_sequence(&xdr, &args->seq_args, &hdr); 2231 encode_putfh(xdr, args->fh, &hdr);
2178 encode_putfh(&xdr, args->fh, &hdr); 2232 encode_setattr(xdr, args, args->server, &hdr);
2179 encode_setattr(&xdr, args, args->server, &hdr); 2233 encode_getfattr(xdr, args->bitmask, &hdr);
2180 encode_getfattr(&xdr, args->bitmask, &hdr);
2181 encode_nops(&hdr); 2234 encode_nops(&hdr);
2182 return 0;
2183} 2235}
2184 2236
2185/* 2237/*
2186 * Encode a GETACL request 2238 * Encode a GETACL request
2187 */ 2239 */
2188static int 2240static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
2189nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, 2241 struct nfs_getaclargs *args)
2190 struct nfs_getaclargs *args)
2191{ 2242{
2192 struct xdr_stream xdr;
2193 struct compound_hdr hdr = { 2243 struct compound_hdr hdr = {
2194 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2244 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2195 }; 2245 };
2196 uint32_t replen; 2246 uint32_t replen;
2197 2247
2198 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2248 encode_compound_hdr(xdr, req, &hdr);
2199 encode_compound_hdr(&xdr, req, &hdr); 2249 encode_sequence(xdr, &args->seq_args, &hdr);
2200 encode_sequence(&xdr, &args->seq_args, &hdr); 2250 encode_putfh(xdr, args->fh, &hdr);
2201 encode_putfh(&xdr, args->fh, &hdr);
2202 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; 2251 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
2203 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); 2252 encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
2204 2253
2205 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, 2254 xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
2206 args->acl_pages, args->acl_pgbase, args->acl_len); 2255 args->acl_pages, args->acl_pgbase, args->acl_len);
2207 encode_nops(&hdr); 2256 encode_nops(&hdr);
2208 return 0;
2209} 2257}
2210 2258
2211/* 2259/*
2212 * Encode a WRITE request 2260 * Encode a WRITE request
2213 */ 2261 */
2214static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 2262static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
2263 struct nfs_writeargs *args)
2215{ 2264{
2216 struct xdr_stream xdr;
2217 struct compound_hdr hdr = { 2265 struct compound_hdr hdr = {
2218 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2266 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2219 }; 2267 };
2220 2268
2221 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2269 encode_compound_hdr(xdr, req, &hdr);
2222 encode_compound_hdr(&xdr, req, &hdr); 2270 encode_sequence(xdr, &args->seq_args, &hdr);
2223 encode_sequence(&xdr, &args->seq_args, &hdr); 2271 encode_putfh(xdr, args->fh, &hdr);
2224 encode_putfh(&xdr, args->fh, &hdr); 2272 encode_write(xdr, args, &hdr);
2225 encode_write(&xdr, args, &hdr);
2226 req->rq_snd_buf.flags |= XDRBUF_WRITE; 2273 req->rq_snd_buf.flags |= XDRBUF_WRITE;
2227 encode_getfattr(&xdr, args->bitmask, &hdr); 2274 encode_getfattr(xdr, args->bitmask, &hdr);
2228 encode_nops(&hdr); 2275 encode_nops(&hdr);
2229 return 0;
2230} 2276}
2231 2277
2232/* 2278/*
2233 * a COMMIT request 2279 * a COMMIT request
2234 */ 2280 */
2235static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) 2281static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
2282 struct nfs_writeargs *args)
2236{ 2283{
2237 struct xdr_stream xdr;
2238 struct compound_hdr hdr = { 2284 struct compound_hdr hdr = {
2239 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2285 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2240 }; 2286 };
2241 2287
2242 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2288 encode_compound_hdr(xdr, req, &hdr);
2243 encode_compound_hdr(&xdr, req, &hdr); 2289 encode_sequence(xdr, &args->seq_args, &hdr);
2244 encode_sequence(&xdr, &args->seq_args, &hdr); 2290 encode_putfh(xdr, args->fh, &hdr);
2245 encode_putfh(&xdr, args->fh, &hdr); 2291 encode_commit(xdr, args, &hdr);
2246 encode_commit(&xdr, args, &hdr); 2292 encode_getfattr(xdr, args->bitmask, &hdr);
2247 encode_getfattr(&xdr, args->bitmask, &hdr);
2248 encode_nops(&hdr); 2293 encode_nops(&hdr);
2249 return 0;
2250} 2294}
2251 2295
2252/* 2296/*
2253 * FSINFO request 2297 * FSINFO request
2254 */ 2298 */
2255static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args) 2299static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
2300 struct nfs4_fsinfo_arg *args)
2256{ 2301{
2257 struct xdr_stream xdr;
2258 struct compound_hdr hdr = { 2302 struct compound_hdr hdr = {
2259 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2303 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2260 }; 2304 };
2261 2305
2262 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2306 encode_compound_hdr(xdr, req, &hdr);
2263 encode_compound_hdr(&xdr, req, &hdr); 2307 encode_sequence(xdr, &args->seq_args, &hdr);
2264 encode_sequence(&xdr, &args->seq_args, &hdr); 2308 encode_putfh(xdr, args->fh, &hdr);
2265 encode_putfh(&xdr, args->fh, &hdr); 2309 encode_fsinfo(xdr, args->bitmask, &hdr);
2266 encode_fsinfo(&xdr, args->bitmask, &hdr);
2267 encode_nops(&hdr); 2310 encode_nops(&hdr);
2268 return 0;
2269} 2311}
2270 2312
2271/* 2313/*
2272 * a PATHCONF request 2314 * a PATHCONF request
2273 */ 2315 */
2274static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args) 2316static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
2317 const struct nfs4_pathconf_arg *args)
2275{ 2318{
2276 struct xdr_stream xdr;
2277 struct compound_hdr hdr = { 2319 struct compound_hdr hdr = {
2278 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2320 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2279 }; 2321 };
2280 2322
2281 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2323 encode_compound_hdr(xdr, req, &hdr);
2282 encode_compound_hdr(&xdr, req, &hdr); 2324 encode_sequence(xdr, &args->seq_args, &hdr);
2283 encode_sequence(&xdr, &args->seq_args, &hdr); 2325 encode_putfh(xdr, args->fh, &hdr);
2284 encode_putfh(&xdr, args->fh, &hdr); 2326 encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2285 encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2286 &hdr); 2327 &hdr);
2287 encode_nops(&hdr); 2328 encode_nops(&hdr);
2288 return 0;
2289} 2329}
2290 2330
2291/* 2331/*
2292 * a STATFS request 2332 * a STATFS request
2293 */ 2333 */
2294static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args) 2334static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
2335 const struct nfs4_statfs_arg *args)
2295{ 2336{
2296 struct xdr_stream xdr;
2297 struct compound_hdr hdr = { 2337 struct compound_hdr hdr = {
2298 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2338 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2299 }; 2339 };
2300 2340
2301 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2341 encode_compound_hdr(xdr, req, &hdr);
2302 encode_compound_hdr(&xdr, req, &hdr); 2342 encode_sequence(xdr, &args->seq_args, &hdr);
2303 encode_sequence(&xdr, &args->seq_args, &hdr); 2343 encode_putfh(xdr, args->fh, &hdr);
2304 encode_putfh(&xdr, args->fh, &hdr); 2344 encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2305 encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2306 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); 2345 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
2307 encode_nops(&hdr); 2346 encode_nops(&hdr);
2308 return 0;
2309} 2347}
2310 2348
2311/* 2349/*
2312 * GETATTR_BITMAP request 2350 * GETATTR_BITMAP request
2313 */ 2351 */
2314static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, 2352static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
2315 struct nfs4_server_caps_arg *args) 2353 struct xdr_stream *xdr,
2354 struct nfs4_server_caps_arg *args)
2316{ 2355{
2317 struct xdr_stream xdr;
2318 struct compound_hdr hdr = { 2356 struct compound_hdr hdr = {
2319 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2357 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2320 }; 2358 };
2321 2359
2322 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2360 encode_compound_hdr(xdr, req, &hdr);
2323 encode_compound_hdr(&xdr, req, &hdr); 2361 encode_sequence(xdr, &args->seq_args, &hdr);
2324 encode_sequence(&xdr, &args->seq_args, &hdr); 2362 encode_putfh(xdr, args->fhandle, &hdr);
2325 encode_putfh(&xdr, args->fhandle, &hdr); 2363 encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2326 encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2327 FATTR4_WORD0_LINK_SUPPORT| 2364 FATTR4_WORD0_LINK_SUPPORT|
2328 FATTR4_WORD0_SYMLINK_SUPPORT| 2365 FATTR4_WORD0_SYMLINK_SUPPORT|
2329 FATTR4_WORD0_ACLSUPPORT, &hdr); 2366 FATTR4_WORD0_ACLSUPPORT, &hdr);
2330 encode_nops(&hdr); 2367 encode_nops(&hdr);
2331 return 0;
2332} 2368}
2333 2369
2334/* 2370/*
2335 * a RENEW request 2371 * a RENEW request
2336 */ 2372 */
2337static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) 2373static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
2374 struct nfs_client *clp)
2338{ 2375{
2339 struct xdr_stream xdr;
2340 struct compound_hdr hdr = { 2376 struct compound_hdr hdr = {
2341 .nops = 0, 2377 .nops = 0,
2342 }; 2378 };
2343 2379
2344 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2380 encode_compound_hdr(xdr, req, &hdr);
2345 encode_compound_hdr(&xdr, req, &hdr); 2381 encode_renew(xdr, clp, &hdr);
2346 encode_renew(&xdr, clp, &hdr);
2347 encode_nops(&hdr); 2382 encode_nops(&hdr);
2348 return 0;
2349} 2383}
2350 2384
2351/* 2385/*
2352 * a SETCLIENTID request 2386 * a SETCLIENTID request
2353 */ 2387 */
2354static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc) 2388static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
2389 struct xdr_stream *xdr,
2390 struct nfs4_setclientid *sc)
2355{ 2391{
2356 struct xdr_stream xdr;
2357 struct compound_hdr hdr = { 2392 struct compound_hdr hdr = {
2358 .nops = 0, 2393 .nops = 0,
2359 }; 2394 };
2360 2395
2361 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2396 encode_compound_hdr(xdr, req, &hdr);
2362 encode_compound_hdr(&xdr, req, &hdr); 2397 encode_setclientid(xdr, sc, &hdr);
2363 encode_setclientid(&xdr, sc, &hdr);
2364 encode_nops(&hdr); 2398 encode_nops(&hdr);
2365 return 0;
2366} 2399}
2367 2400
2368/* 2401/*
2369 * a SETCLIENTID_CONFIRM request 2402 * a SETCLIENTID_CONFIRM request
2370 */ 2403 */
2371static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) 2404static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
2405 struct xdr_stream *xdr,
2406 struct nfs4_setclientid_res *arg)
2372{ 2407{
2373 struct xdr_stream xdr;
2374 struct compound_hdr hdr = { 2408 struct compound_hdr hdr = {
2375 .nops = 0, 2409 .nops = 0,
2376 }; 2410 };
2377 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2411 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2378 2412
2379 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2413 encode_compound_hdr(xdr, req, &hdr);
2380 encode_compound_hdr(&xdr, req, &hdr); 2414 encode_setclientid_confirm(xdr, arg, &hdr);
2381 encode_setclientid_confirm(&xdr, arg, &hdr); 2415 encode_putrootfh(xdr, &hdr);
2382 encode_putrootfh(&xdr, &hdr); 2416 encode_fsinfo(xdr, lease_bitmap, &hdr);
2383 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2384 encode_nops(&hdr); 2417 encode_nops(&hdr);
2385 return 0;
2386} 2418}
2387 2419
2388/* 2420/*
2389 * DELEGRETURN request 2421 * DELEGRETURN request
2390 */ 2422 */
2391static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args) 2423static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
2424 struct xdr_stream *xdr,
2425 const struct nfs4_delegreturnargs *args)
2392{ 2426{
2393 struct xdr_stream xdr;
2394 struct compound_hdr hdr = { 2427 struct compound_hdr hdr = {
2395 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2428 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2396 }; 2429 };
2397 2430
2398 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2431 encode_compound_hdr(xdr, req, &hdr);
2399 encode_compound_hdr(&xdr, req, &hdr); 2432 encode_sequence(xdr, &args->seq_args, &hdr);
2400 encode_sequence(&xdr, &args->seq_args, &hdr); 2433 encode_putfh(xdr, args->fhandle, &hdr);
2401 encode_putfh(&xdr, args->fhandle, &hdr); 2434 encode_delegreturn(xdr, args->stateid, &hdr);
2402 encode_delegreturn(&xdr, args->stateid, &hdr); 2435 encode_getfattr(xdr, args->bitmask, &hdr);
2403 encode_getfattr(&xdr, args->bitmask, &hdr);
2404 encode_nops(&hdr); 2436 encode_nops(&hdr);
2405 return 0;
2406} 2437}
2407 2438
2408/* 2439/*
2409 * Encode FS_LOCATIONS request 2440 * Encode FS_LOCATIONS request
2410 */ 2441 */
2411static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args) 2442static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
2443 struct xdr_stream *xdr,
2444 struct nfs4_fs_locations_arg *args)
2412{ 2445{
2413 struct xdr_stream xdr;
2414 struct compound_hdr hdr = { 2446 struct compound_hdr hdr = {
2415 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2447 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2416 }; 2448 };
2417 uint32_t replen; 2449 uint32_t replen;
2418 2450
2419 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2451 encode_compound_hdr(xdr, req, &hdr);
2420 encode_compound_hdr(&xdr, req, &hdr); 2452 encode_sequence(xdr, &args->seq_args, &hdr);
2421 encode_sequence(&xdr, &args->seq_args, &hdr); 2453 encode_putfh(xdr, args->dir_fh, &hdr);
2422 encode_putfh(&xdr, args->dir_fh, &hdr); 2454 encode_lookup(xdr, args->name, &hdr);
2423 encode_lookup(&xdr, args->name, &hdr);
2424 replen = hdr.replen; /* get the attribute into args->page */ 2455 replen = hdr.replen; /* get the attribute into args->page */
2425 encode_fs_locations(&xdr, args->bitmask, &hdr); 2456 encode_fs_locations(xdr, args->bitmask, &hdr);
2426 2457
2427 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 2458 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
2428 0, PAGE_SIZE); 2459 0, PAGE_SIZE);
2429 encode_nops(&hdr); 2460 encode_nops(&hdr);
2430 return 0;
2431} 2461}
2432 2462
2433#if defined(CONFIG_NFS_V4_1) 2463#if defined(CONFIG_NFS_V4_1)
2434/* 2464/*
2435 * EXCHANGE_ID request 2465 * EXCHANGE_ID request
2436 */ 2466 */
2437static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, 2467static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
2438 struct nfs41_exchange_id_args *args) 2468 struct xdr_stream *xdr,
2469 struct nfs41_exchange_id_args *args)
2439{ 2470{
2440 struct xdr_stream xdr;
2441 struct compound_hdr hdr = { 2471 struct compound_hdr hdr = {
2442 .minorversion = args->client->cl_mvops->minor_version, 2472 .minorversion = args->client->cl_mvops->minor_version,
2443 }; 2473 };
2444 2474
2445 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2475 encode_compound_hdr(xdr, req, &hdr);
2446 encode_compound_hdr(&xdr, req, &hdr); 2476 encode_exchange_id(xdr, args, &hdr);
2447 encode_exchange_id(&xdr, args, &hdr);
2448 encode_nops(&hdr); 2477 encode_nops(&hdr);
2449 return 0;
2450} 2478}
2451 2479
2452/* 2480/*
2453 * a CREATE_SESSION request 2481 * a CREATE_SESSION request
2454 */ 2482 */
2455static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p, 2483static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
2456 struct nfs41_create_session_args *args) 2484 struct xdr_stream *xdr,
2485 struct nfs41_create_session_args *args)
2457{ 2486{
2458 struct xdr_stream xdr;
2459 struct compound_hdr hdr = { 2487 struct compound_hdr hdr = {
2460 .minorversion = args->client->cl_mvops->minor_version, 2488 .minorversion = args->client->cl_mvops->minor_version,
2461 }; 2489 };
2462 2490
2463 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2491 encode_compound_hdr(xdr, req, &hdr);
2464 encode_compound_hdr(&xdr, req, &hdr); 2492 encode_create_session(xdr, args, &hdr);
2465 encode_create_session(&xdr, args, &hdr);
2466 encode_nops(&hdr); 2493 encode_nops(&hdr);
2467 return 0;
2468} 2494}
2469 2495
2470/* 2496/*
2471 * a DESTROY_SESSION request 2497 * a DESTROY_SESSION request
2472 */ 2498 */
2473static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p, 2499static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
2474 struct nfs4_session *session) 2500 struct xdr_stream *xdr,
2501 struct nfs4_session *session)
2475{ 2502{
2476 struct xdr_stream xdr;
2477 struct compound_hdr hdr = { 2503 struct compound_hdr hdr = {
2478 .minorversion = session->clp->cl_mvops->minor_version, 2504 .minorversion = session->clp->cl_mvops->minor_version,
2479 }; 2505 };
2480 2506
2481 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2507 encode_compound_hdr(xdr, req, &hdr);
2482 encode_compound_hdr(&xdr, req, &hdr); 2508 encode_destroy_session(xdr, session, &hdr);
2483 encode_destroy_session(&xdr, session, &hdr);
2484 encode_nops(&hdr); 2509 encode_nops(&hdr);
2485 return 0;
2486} 2510}
2487 2511
2488/* 2512/*
2489 * a SEQUENCE request 2513 * a SEQUENCE request
2490 */ 2514 */
2491static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p, 2515static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
2492 struct nfs4_sequence_args *args) 2516 struct nfs4_sequence_args *args)
2493{ 2517{
2494 struct xdr_stream xdr;
2495 struct compound_hdr hdr = { 2518 struct compound_hdr hdr = {
2496 .minorversion = nfs4_xdr_minorversion(args), 2519 .minorversion = nfs4_xdr_minorversion(args),
2497 }; 2520 };
2498 2521
2499 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2522 encode_compound_hdr(xdr, req, &hdr);
2500 encode_compound_hdr(&xdr, req, &hdr); 2523 encode_sequence(xdr, args, &hdr);
2501 encode_sequence(&xdr, args, &hdr);
2502 encode_nops(&hdr); 2524 encode_nops(&hdr);
2503 return 0;
2504} 2525}
2505 2526
2506/* 2527/*
2507 * a GET_LEASE_TIME request 2528 * a GET_LEASE_TIME request
2508 */ 2529 */
2509static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, 2530static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
2510 struct nfs4_get_lease_time_args *args) 2531 struct xdr_stream *xdr,
2532 struct nfs4_get_lease_time_args *args)
2511{ 2533{
2512 struct xdr_stream xdr;
2513 struct compound_hdr hdr = { 2534 struct compound_hdr hdr = {
2514 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), 2535 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
2515 }; 2536 };
2516 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2537 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2517 2538
2518 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2539 encode_compound_hdr(xdr, req, &hdr);
2519 encode_compound_hdr(&xdr, req, &hdr); 2540 encode_sequence(xdr, &args->la_seq_args, &hdr);
2520 encode_sequence(&xdr, &args->la_seq_args, &hdr); 2541 encode_putrootfh(xdr, &hdr);
2521 encode_putrootfh(&xdr, &hdr); 2542 encode_fsinfo(xdr, lease_bitmap, &hdr);
2522 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2523 encode_nops(&hdr); 2543 encode_nops(&hdr);
2524 return 0;
2525} 2544}
2526 2545
2527/* 2546/*
2528 * a RECLAIM_COMPLETE request 2547 * a RECLAIM_COMPLETE request
2529 */ 2548 */
2530static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, 2549static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2531 struct nfs41_reclaim_complete_args *args) 2550 struct xdr_stream *xdr,
2551 struct nfs41_reclaim_complete_args *args)
2532{ 2552{
2533 struct xdr_stream xdr;
2534 struct compound_hdr hdr = { 2553 struct compound_hdr hdr = {
2535 .minorversion = nfs4_xdr_minorversion(&args->seq_args) 2554 .minorversion = nfs4_xdr_minorversion(&args->seq_args)
2536 }; 2555 };
2537 2556
2538 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2557 encode_compound_hdr(xdr, req, &hdr);
2539 encode_compound_hdr(&xdr, req, &hdr); 2558 encode_sequence(xdr, &args->seq_args, &hdr);
2540 encode_sequence(&xdr, &args->seq_args, &hdr); 2559 encode_reclaim_complete(xdr, args, &hdr);
2541 encode_reclaim_complete(&xdr, args, &hdr);
2542 encode_nops(&hdr); 2560 encode_nops(&hdr);
2543 return 0;
2544} 2561}
2545 2562
2563/*
2564 * Encode GETDEVICEINFO request
2565 */
2566static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
2567 struct xdr_stream *xdr,
2568 struct nfs4_getdeviceinfo_args *args)
2569{
2570 struct compound_hdr hdr = {
2571 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2572 };
2573
2574 encode_compound_hdr(xdr, req, &hdr);
2575 encode_sequence(xdr, &args->seq_args, &hdr);
2576 encode_getdeviceinfo(xdr, args, &hdr);
2577
2578 /* set up reply kvec. Subtract notification bitmap max size (2)
2579 * so that notification bitmap is put in xdr_buf tail */
2580 xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
2581 args->pdev->pages, args->pdev->pgbase,
2582 args->pdev->pglen);
2583
2584 encode_nops(&hdr);
2585}
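The (hdr.replen - 2) adjustment above deserves a worked example. Assuming decode_getdeviceinfo_maxsz budgets its final two 32-bit words for the notification bitmap, then with hdr.replen == R after encode_getdeviceinfo():

/*
 * byte offset of the spliced pages = (R - 2) << 2
 *
 * head : the reply headers up to the device-address body, minus the
 *        last two budgeted words
 * pages: the opaque device address body (args->pdev->pages)
 * tail : the trailing notification bitmap words, which
 *        decode_getdeviceinfo() can then read inline instead of
 *        having them land mid-page
 */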
2586
2587/*
2588 * Encode LAYOUTGET request
2589 */
2590static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
2591 struct xdr_stream *xdr,
2592 struct nfs4_layoutget_args *args)
2593{
2594 struct compound_hdr hdr = {
2595 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2596 };
2597
2598 encode_compound_hdr(xdr, req, &hdr);
2599 encode_sequence(xdr, &args->seq_args, &hdr);
2600 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2601 encode_layoutget(xdr, args, &hdr);
2602 encode_nops(&hdr);
2603}
2546#endif /* CONFIG_NFS_V4_1 */ 2604#endif /* CONFIG_NFS_V4_1 */
2547 2605
2548static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 2606static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2676,7 +2734,10 @@ out_overflow:
2676static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) 2734static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
2677{ 2735{
2678 if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) { 2736 if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
2679 decode_attr_bitmap(xdr, bitmask); 2737 int ret;
2738 ret = decode_attr_bitmap(xdr, bitmask);
2739 if (unlikely(ret < 0))
2740 return ret;
2680 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; 2741 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
2681 } else 2742 } else
2682 bitmask[0] = bitmask[1] = 0; 2743 bitmask[0] = bitmask[1] = 0;
@@ -2848,6 +2909,56 @@ out_overflow:
2848 return -EIO; 2909 return -EIO;
2849} 2910}
2850 2911
2912static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
2913{
2914 __be32 *p;
2915
2916 if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
2917 return -EIO;
2918 if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
2919 p = xdr_inline_decode(xdr, 4);
2920 if (unlikely(!p))
2921 goto out_overflow;
2922 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
2923 }
2924 return 0;
2925out_overflow:
2926 print_overflow_msg(__func__, xdr);
2927 return -EIO;
2928}
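decode_attr_error() introduces the guard idiom reused by decode_attr_filehandle(), decode_attr_time_delta() and decode_attr_pnfstype() below: GETATTR attribute values appear in ascending bit order, so subtracting 1 from an attribute's bit yields a mask of every lower-numbered attribute. Taking FATTR4_WORD0_RDATTR_ERROR (bit 11 of word 0 in the NFSv4 attribute numbering) as the example:

/*
 * FATTR4_WORD0_RDATTR_ERROR      = 0x00000800   (1 << 11)
 * FATTR4_WORD0_RDATTR_ERROR - 1U = 0x000007ff   (bits 0..10)
 *
 * Each decoder clears its bit from bitmap[0] once the value is
 * consumed, so any surviving lower bit means an earlier attribute was
 * never decoded and the stream position is unknown -- hence the
 * unconditional -EIO.
 */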
2929
2930static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
2931{
2932 __be32 *p;
2933 int len;
2934
2935 if (fh != NULL)
2936 memset(fh, 0, sizeof(*fh));
2937
2938 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
2939 return -EIO;
2940 if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
2941 p = xdr_inline_decode(xdr, 4);
2942 if (unlikely(!p))
2943 goto out_overflow;
2944 len = be32_to_cpup(p);
2945 if (len > NFS4_FHSIZE)
2946 return -EIO;
2947 p = xdr_inline_decode(xdr, len);
2948 if (unlikely(!p))
2949 goto out_overflow;
2950 if (fh != NULL) {
2951 memcpy(fh->data, p, len);
2952 fh->size = len;
2953 }
2954 bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
2955 }
2956 return 0;
2957out_overflow:
2958 print_overflow_msg(__func__, xdr);
2959 return -EIO;
2960}
2961
2851static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2962static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2852{ 2963{
2853 __be32 *p; 2964 __be32 *p;
@@ -3521,6 +3632,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
3521 return status; 3632 return status;
3522} 3633}
3523 3634
3635static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
3636 struct timespec *time)
3637{
3638 int status = 0;
3639
3640 time->tv_sec = 0;
3641 time->tv_nsec = 0;
3642 if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
3643 return -EIO;
3644 if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
3645 status = decode_attr_time(xdr, time);
3646 bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
3647 }
3648 dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
3649 (long)time->tv_nsec);
3650 return status;
3651}
3652
3524static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) 3653static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
3525{ 3654{
3526 int status = 0; 3655 int status = 0;
@@ -3744,29 +3873,14 @@ xdr_error:
3744 return status; 3873 return status;
3745} 3874}
3746 3875
3747static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, 3876static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3877 struct nfs_fattr *fattr, struct nfs_fh *fh,
3748 const struct nfs_server *server, int may_sleep) 3878 const struct nfs_server *server, int may_sleep)
3749{ 3879{
3750 __be32 *savep;
3751 uint32_t attrlen,
3752 bitmap[2] = {0},
3753 type;
3754 int status; 3880 int status;
3755 umode_t fmode = 0; 3881 umode_t fmode = 0;
3756 uint64_t fileid; 3882 uint64_t fileid;
3757 3883 uint32_t type;
3758 status = decode_op_hdr(xdr, OP_GETATTR);
3759 if (status < 0)
3760 goto xdr_error;
3761
3762 status = decode_attr_bitmap(xdr, bitmap);
3763 if (status < 0)
3764 goto xdr_error;
3765
3766 status = decode_attr_length(xdr, &attrlen, &savep);
3767 if (status < 0)
3768 goto xdr_error;
3769
3770 3884
3771 status = decode_attr_type(xdr, bitmap, &type); 3885 status = decode_attr_type(xdr, bitmap, &type);
3772 if (status < 0) 3886 if (status < 0)
@@ -3792,6 +3906,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3792 goto xdr_error; 3906 goto xdr_error;
3793 fattr->valid |= status; 3907 fattr->valid |= status;
3794 3908
3909 status = decode_attr_error(xdr, bitmap);
3910 if (status < 0)
3911 goto xdr_error;
3912
3913 status = decode_attr_filehandle(xdr, bitmap, fh);
3914 if (status < 0)
3915 goto xdr_error;
3916
3795 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); 3917 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
3796 if (status < 0) 3918 if (status < 0)
3797 goto xdr_error; 3919 goto xdr_error;
@@ -3862,12 +3984,101 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3862 fattr->valid |= status; 3984 fattr->valid |= status;
3863 } 3985 }
3864 3986
3987xdr_error:
3988 dprintk("%s: xdr returned %d\n", __func__, -status);
3989 return status;
3990}
3991
3992static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3993 struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
3994{
3995 __be32 *savep;
3996 uint32_t attrlen,
3997 bitmap[2] = {0};
3998 int status;
3999
4000 status = decode_op_hdr(xdr, OP_GETATTR);
4001 if (status < 0)
4002 goto xdr_error;
4003
4004 status = decode_attr_bitmap(xdr, bitmap);
4005 if (status < 0)
4006 goto xdr_error;
4007
4008 status = decode_attr_length(xdr, &attrlen, &savep);
4009 if (status < 0)
4010 goto xdr_error;
4011
4012 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
4013 if (status < 0)
4014 goto xdr_error;
4015
3865 status = verify_attr_len(xdr, savep, attrlen); 4016 status = verify_attr_len(xdr, savep, attrlen);
3866xdr_error: 4017xdr_error:
3867 dprintk("%s: xdr returned %d\n", __func__, -status); 4018 dprintk("%s: xdr returned %d\n", __func__, -status);
3868 return status; 4019 return status;
3869} 4020}
3870 4021
4022static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4023 const struct nfs_server *server, int may_sleep)
4024{
4025 return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
4026}
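The net effect of this refactoring is a three-layer decode path: the wrapper keeps the old decode_getfattr() signature for existing callers, while callers that also want the filehandle attribute can use decode_getfattr_generic() with a non-NULL fh. Roughly:

/*
 * decode_getfattr(xdr, fattr, server, may_sleep)
 *   -> decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep)
 *        decode_op_hdr(OP_GETATTR)
 *        decode_attr_bitmap()      -- which attributes are present
 *        decode_attr_length()      -- total attribute value bytes
 *        decode_getfattr_attrs()   -- per-attribute decoders, now
 *                                     including rdattr_error and
 *                                     the filehandle
 *        verify_attr_len()         -- consumed == advertised
 */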
4027
4028/*
4029 * Decode potentially multiple layout types. Currently we only support
4030 * one layout driver per file system.
4031 */
4032static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
4033 uint32_t *layouttype)
4034{
4035 uint32_t *p;
4036 int num;
4037
4038 p = xdr_inline_decode(xdr, 4);
4039 if (unlikely(!p))
4040 goto out_overflow;
4041 num = be32_to_cpup(p);
4042
4043 /* pNFS is not supported by the underlying file system */
4044 if (num == 0) {
4045 *layouttype = 0;
4046 return 0;
4047 }
4048 if (num > 1)
4049 printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
4050 "per filesystem not supported\n", __func__);
4051
4052 /* Decode and set first layout type, move xdr->p past unused types */
4053 p = xdr_inline_decode(xdr, num * 4);
4054 if (unlikely(!p))
4055 goto out_overflow;
4056 *layouttype = be32_to_cpup(p);
4057 return 0;
4058out_overflow:
4059 print_overflow_msg(__func__, xdr);
4060 return -EIO;
4061}
4062
4063/*
4064 * The type of file system exported.
4065 * Note we must ensure that layouttype is set in any non-error case.
4066 */
4067static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
4068 uint32_t *layouttype)
4069{
4070 int status = 0;
4071
4072 dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
4073 if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
4074 return -EIO;
4075 if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
4076 status = decode_first_pnfs_layout_type(xdr, layouttype);
4077 bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
4078 } else
4079 *layouttype = 0;
4080 return status;
4081}
3871 4082
3872static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) 4083static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
3873{ 4084{
@@ -3894,6 +4105,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
3894 if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) 4105 if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
3895 goto xdr_error; 4106 goto xdr_error;
3896 fsinfo->wtpref = fsinfo->wtmax; 4107 fsinfo->wtpref = fsinfo->wtmax;
4108 status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
4109 if (status != 0)
4110 goto xdr_error;
4111 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
4112 if (status != 0)
4113 goto xdr_error;
3897 4114
3898 status = verify_attr_len(xdr, savep, attrlen); 4115 status = verify_attr_len(xdr, savep, attrlen);
3899xdr_error: 4116xdr_error:
@@ -3950,13 +4167,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3950 __be32 *p; 4167 __be32 *p;
3951 uint32_t namelen, type; 4168 uint32_t namelen, type;
3952 4169
3953 p = xdr_inline_decode(xdr, 32); 4170 p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
3954 if (unlikely(!p)) 4171 if (unlikely(!p))
3955 goto out_overflow; 4172 goto out_overflow;
3956 p = xdr_decode_hyper(p, &offset); 4173 p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
3957 p = xdr_decode_hyper(p, &length); 4174 p = xdr_decode_hyper(p, &length);
3958 type = be32_to_cpup(p++); 4175 type = be32_to_cpup(p++); /* 4 byte read */
3959 if (fl != NULL) { 4176 if (fl != NULL) { /* manipulate file lock */
3960 fl->fl_start = (loff_t)offset; 4177 fl->fl_start = (loff_t)offset;
3961 fl->fl_end = fl->fl_start + (loff_t)length - 1; 4178 fl->fl_end = fl->fl_start + (loff_t)length - 1;
3962 if (length == ~(uint64_t)0) 4179 if (length == ~(uint64_t)0)
@@ -3966,9 +4183,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3966 fl->fl_type = F_RDLCK; 4183 fl->fl_type = F_RDLCK;
3967 fl->fl_pid = 0; 4184 fl->fl_pid = 0;
3968 } 4185 }
3969 p = xdr_decode_hyper(p, &clientid); 4186 p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
3970 namelen = be32_to_cpup(p); 4187 namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
3971 p = xdr_inline_decode(xdr, namelen); 4188 p = xdr_inline_decode(xdr, namelen); /* variable size field */
3972 if (likely(p)) 4189 if (likely(p))
3973 return -NFS4ERR_DENIED; 4190 return -NFS4ERR_DENIED;
3974out_overflow: 4191out_overflow:
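The byte-accounting comments added in this hunk check out against the fixed part of the LOCK4denied result:

/*
 *    8  offset          (xdr_decode_hyper)
 *  + 8  length          (xdr_decode_hyper)
 *  + 4  lock type       (be32_to_cpup)
 *  + 8  owner clientid  (xdr_decode_hyper)
 *  + 4  owner namelen   (be32_to_cpup)
 *  ----
 *   32  bytes, matching the xdr_inline_decode(xdr, 32) above; only
 *       the variable-length owner string remains after that.
 */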
@@ -4180,7 +4397,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
4180 goto out_overflow; 4397 goto out_overflow;
4181 eof = be32_to_cpup(p++); 4398 eof = be32_to_cpup(p++);
4182 count = be32_to_cpup(p); 4399 count = be32_to_cpup(p);
4183 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 4400 hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
4184 recvd = req->rq_rcv_buf.len - hdrlen; 4401 recvd = req->rq_rcv_buf.len - hdrlen;
4185 if (count > recvd) { 4402 if (count > recvd) {
4186 dprintk("NFS: server cheating in read reply: " 4403 dprintk("NFS: server cheating in read reply: "
@@ -4200,12 +4417,9 @@ out_overflow:
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
 {
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-	struct page *page = *rcvbuf->pages;
 	struct kvec *iov = rcvbuf->head;
 	size_t hdrlen;
 	u32 recvd, pglen = rcvbuf->page_len;
-	__be32 *end, *entry, *p, *kaddr;
-	unsigned int nr = 0;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
@@ -4225,71 +4439,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	pglen = recvd;
 	xdr_read_pages(xdr, pglen);
 
-	BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-	kaddr = p = kmap_atomic(page, KM_USER0);
-	end = p + ((pglen + readdir->pgbase) >> 2);
-	entry = p;
-
-	/* Make sure the packet actually has a value_follows and EOF entry */
-	if ((entry + 1) > end)
-		goto short_pkt;
-
-	for (; *p++; nr++) {
-		u32 len, attrlen, xlen;
-		if (end - p < 3)
-			goto short_pkt;
-		dprintk("cookie = %Lu, ", *((unsigned long long *)p));
-		p += 2;			/* cookie */
-		len = ntohl(*p++);	/* filename length */
-		if (len > NFS4_MAXNAMLEN) {
-			dprintk("NFS: giant filename in readdir (len 0x%x)\n",
-					len);
-			goto err_unmap;
-		}
-		xlen = XDR_QUADLEN(len);
-		if (end - p < xlen + 1)
-			goto short_pkt;
-		dprintk("filename = %*s\n", len, (char *)p);
-		p += xlen;
-		len = ntohl(*p++);	/* bitmap length */
-		if (end - p < len + 1)
-			goto short_pkt;
-		p += len;
-		attrlen = XDR_QUADLEN(ntohl(*p++));
-		if (end - p < attrlen + 2)
-			goto short_pkt;
-		p += attrlen;		/* attributes */
-		entry = p;
-	}
-	/*
-	 * Apparently some server sends responses that are a valid size, but
-	 * contain no entries, and have value_follows==0 and EOF==0. For
-	 * those, just set the EOF marker.
-	 */
-	if (!nr && entry[1] == 0) {
-		dprintk("NFS: readdir reply truncated!\n");
-		entry[1] = 1;
-	}
-out:
-	kunmap_atomic(kaddr, KM_USER0);
-	return 0;
-short_pkt:
-	/*
-	 * When we get a short packet there are 2 possibilities. We can
-	 * return an error, or fix up the response to look like a valid
-	 * response and return what we have so far. If there are no
-	 * entries and the packet was short, then return -EIO. If there
-	 * are valid entries in the response, return them and pretend that
-	 * the call was successful, but incomplete. The caller can retry the
-	 * readdir starting at the last cookie.
-	 */
-	dprintk("%s: short packet at entry %d\n", __func__, nr);
-	entry[0] = entry[1] = 0;
-	if (nr)
-		goto out;
-err_unmap:
-	kunmap_atomic(kaddr, KM_USER0);
-	return -errno_NFSERR_IO;
+
+	return pglen;
 }
 
 static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
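With the entry walk deleted, decode_readdir() no longer kmaps the reply pages to pre-validate each entry in the XDR layer; it positions the page data with xdr_read_pages() and reports how many bytes of entry data arrived, leaving entry parsing and the short-reply heuristics to the readdir caller. The new contract, roughly (a sketch of a hypothetical caller, not code from this patch):

	int pglen = decode_readdir(xdr, rqstp, res);

	if (pglen < 0)
		return pglen;	/* XDR-level decode error */
	/* on success the caller parses up to pglen bytes of entry4
	 * records straight out of rq_rcv_buf.pages */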
@@ -4299,7 +4450,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	size_t hdrlen;
 	u32 len, recvd;
 	__be32 *p;
-	char *kaddr;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_READLINK);
@@ -4330,9 +4480,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	 * and and null-terminate the text (the VFS expects
 	 * null-termination).
 	 */
-	kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-	kaddr[len+rcvbuf->page_base] = '\0';
-	kunmap_atomic(kaddr, KM_USER0);
+	xdr_terminate_string(rcvbuf, len);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
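The three open-coded lines collapse into the xdr_terminate_string() helper, which writes the terminating NUL into the page data on the caller's behalf. Its body is approximately the code it replaces (a sketch; see net/sunrpc/xdr.c for the real definition):

void xdr_terminate_string(struct xdr_buf *buf, const u32 len)
{
	char *kaddr;

	kaddr = kmap_atomic(buf->pages[0], KM_USER0);
	kaddr[buf->page_base + len] = '\0';	/* VFS wants NUL-termination */
	kunmap_atomic(kaddr, KM_USER0);
}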
@@ -4668,7 +4816,6 @@ static int decode_sequence(struct xdr_stream *xdr,
 			   struct rpc_rqst *rqstp)
 {
 #if defined(CONFIG_NFS_V4_1)
-	struct nfs4_slot *slot;
 	struct nfs4_sessionid id;
 	u32 dummy;
 	int status;
@@ -4700,15 +4847,14 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_overflow;
 
 	/* seqid */
-	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
 	dummy = be32_to_cpup(p++);
-	if (dummy != slot->seq_nr) {
+	if (dummy != res->sr_slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out_err;
 	}
 	/* slot id */
 	dummy = be32_to_cpup(p++);
-	if (dummy != res->sr_slotid) {
+	if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) {
 		dprintk("%s Invalid slot id\n", __func__);
 		goto out_err;
 	}
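Instead of carrying a separate sr_slotid, the sequence result now holds a pointer to its slot, and the slot id is recovered by pointer subtraction against the slot-table base. In outline (a sketch using the fields referenced above):

	struct nfs4_slot *slots = res->sr_session->fc_slot_table.slots;

	/* element-wise pointer difference yields the slot index */
	u32 slotid = res->sr_slot - slots;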
@@ -4731,6 +4877,134 @@ out_overflow:
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+				struct pnfs_device *pdev)
+{
+	__be32 *p;
+	uint32_t len, type;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+	if (status) {
+		if (status == -ETOOSMALL) {
+			p = xdr_inline_decode(xdr, 4);
+			if (unlikely(!p))
+				goto out_overflow;
+			pdev->mincount = be32_to_cpup(p);
+			dprintk("%s: Min count too small. mincnt = %u\n",
+				__func__, pdev->mincount);
+		}
+		return status;
+	}
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+	type = be32_to_cpup(p++);
+	if (type != pdev->layout_type) {
+		dprintk("%s: layout mismatch req: %u pdev: %u\n",
+			__func__, pdev->layout_type, type);
+		return -EINVAL;
+	}
+	/*
+	 * Get the length of the opaque device_addr4. xdr_read_pages places
+	 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+	 * and places the remaining xdr data in xdr_buf->tail
+	 */
+	pdev->mincount = be32_to_cpup(p);
+	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+
+	/* Parse notification bitmap, verifying that it is zero. */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
+	if (len) {
+		uint32_t i;
+
+		p = xdr_inline_decode(xdr, 4 * len);
+		if (unlikely(!p))
+			goto out_overflow;
+		for (i = 0; i < len; i++, p++) {
+			if (be32_to_cpup(p)) {
+				dprintk("%s: notifications not supported\n",
+					__func__);
+				return -EIO;
+			}
+		}
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+			    struct nfs4_layoutget_res *res)
+{
+	__be32 *p;
+	int status;
+	u32 layout_count;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTGET);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->return_on_close = be32_to_cpup(p++);
+	p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
+	layout_count = be32_to_cpup(p);
+	if (!layout_count) {
+		dprintk("%s: server responded with empty layout array\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	p = xdr_inline_decode(xdr, 24);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->range.offset);
+	p = xdr_decode_hyper(p, &res->range.length);
+	res->range.iomode = be32_to_cpup(p++);
+	res->type = be32_to_cpup(p++);
+
+	status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
+	if (unlikely(status))
+		return status;
+
+	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+		__func__,
+		(unsigned long)res->range.offset,
+		(unsigned long)res->range.length,
+		res->range.iomode,
+		res->type,
+		res->layout.len);
+
+	/* nfs4_proc_layoutget allocated a single page */
+	if (res->layout.len > PAGE_SIZE)
+		return -ENOMEM;
+	memcpy(res->layout.buf, p, res->layout.len);
+
+	if (layout_count > 1) {
+		/* We only handle a length one array at the moment.  Any
+		 * further entries are just ignored.  Note that this means
+		 * the client may see a response that is less than the
+		 * minimum it requested.
+		 */
+		dprintk("%s: server responded with %d layouts, dropping tail\n",
+			__func__, layout_count);
+	}
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" DECODE ROUTINES.
  */
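Two points worth noting about the new pNFS decoders. decode_getdeviceinfo() turns the server's "too small" error into a retry hint: it stashes the required size in pdev->mincount so the caller can grow its buffer and reissue the call. And decode_layoutget() copies only the first element of the layout array, dropping any tail. A retry loop built on the first behaviour might look like this (illustrative; both helper names are assumptions, not code from this patch):

	for (;;) {
		status = nfs4_proc_getdeviceinfo(server, pdev);	/* hypothetical */
		if (status != -ETOOSMALL)
			break;
		/* pdev->mincount now holds the size the server demands */
		if (pnfs_grow_device_buffer(pdev) != 0)		/* hypothetical */
			break;
	}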
@@ -4738,26 +5012,26 @@ out_overflow:
 /*
  * Decode OPEN_DOWNGRADE response
  */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_downgrade(&xdr, res);
+	status = decode_open_downgrade(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
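From here down the patch is one mechanical conversion, applied to every nfs4_xdr_dec_*() routine (and the odd encoder): the transport now hands each routine a ready-made xdr_stream, so the per-procedure local and its setup call disappear. Schematically (a sketch of the before/after shape; "foo" is a placeholder, not literal kernel code):

/* before: each decoder built its own stream */
static int nfs4_xdr_dec_foo(struct rpc_rqst *rqstp, __be32 *p,
			    struct foo_res *res)
{
	struct xdr_stream xdr;

	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
	return decode_foo(&xdr, res);
}

/* after: the caller owns the stream, decoders just consume it */
static int nfs4_xdr_dec_foo(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
			    struct foo_res *res)
{
	return decode_foo(xdr, res);
}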
@@ -4766,26 +5040,25 @@ out:
 /*
  * Decode ACCESS response
  */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_accessres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_access(&xdr, res);
+	status = decode_access(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4794,26 +5067,28 @@ out:
 /*
  * Decode LOOKUP response
  */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_lookup(&xdr)) != 0)
+	status = decode_lookup(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server
+	status = decode_getfattr(xdr, res->fattr, res->server
 			,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4822,23 +5097,25 @@ out:
 /*
  * Decode LOOKUP_ROOT response
  */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_lookup_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putrootfh(&xdr)) != 0)
+	status = decode_putrootfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) == 0)
-		status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfh(xdr, res->fh);
+	if (status == 0)
+		status = decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4847,24 +5124,25 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_removeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
+	status = decode_remove(xdr, &res->cinfo);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4873,34 +5151,38 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_renameres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
+	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+	if (status)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(&xdr, res->new_fattr, res->server,
+	if (decode_getfattr(xdr, res->new_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->old_fattr, res->server,
+	decode_getfattr(xdr, res->old_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4909,37 +5191,41 @@ out:
 /*
  * Decode LINK response
  */
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs4_link_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_link(&xdr, &res->cinfo)) != 0)
+	status = decode_link(xdr, &res->cinfo);
+	if (status)
 		goto out;
 	/*
 	 * Note order: OP_LINK leaves the directory as the current
 	 * filehandle.
 	 */
-	if (decode_getfattr(&xdr, res->dir_attr, res->server,
+	if (decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4948,33 +5234,37 @@ out:
 /*
  * Decode CREATE response
  */
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs4_create_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_savefh(&xdr)) != 0)
+	status = decode_savefh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0)
+	status = decode_create(xdr, &res->dir_cinfo);
+	if (status)
 		goto out;
-	if ((status = decode_getfh(&xdr, res->fh)) != 0)
+	status = decode_getfh(xdr, res->fh);
+	if (status)
 		goto out;
-	if (decode_getfattr(&xdr, res->fattr, res->server,
+	if (decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if ((status = decode_restorefh(&xdr)) != 0)
+	status = decode_restorefh(xdr);
+	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->dir_fattr, res->server,
+	decode_getfattr(xdr, res->dir_fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -4983,31 +5273,31 @@ out:
 /*
  * Decode SYMLINK response
  */
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_create_res *res)
 {
-	return nfs4_xdr_dec_create(rqstp, p, res);
+	return nfs4_xdr_dec_create(rqstp, xdr, res);
 }
 
 /*
  * Decode GETATTR response
  */
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_getattr_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server,
+	status = decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5016,46 +5306,40 @@ out:
 /*
  * Encode an SETACL request
  */
-static int
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
+static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_setaclargs *args)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
-	int status;
 
-	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-	encode_compound_hdr(&xdr, req, &hdr);
-	encode_sequence(&xdr, &args->seq_args, &hdr);
-	encode_putfh(&xdr, args->fh, &hdr);
-	status = encode_setacl(&xdr, args, &hdr);
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setacl(xdr, args, &hdr);
 	encode_nops(&hdr);
-	return status;
 }
 
 /*
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		    struct nfs_setaclres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr);
+	status = decode_setattr(xdr);
 out:
 	return status;
 }
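Note the asymmetry introduced above: the encode path loses its return value entirely. nfs4_xdr_enc_setacl() becomes void and drops its status local, reflecting the convention suggested by this change that encoders cannot fail once transmit-buffer space has been reserved, while decoders keep returning errors for malformed replies.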
@@ -5064,24 +5348,22 @@ out:
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p,
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		    struct nfs_getaclres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getacl(&xdr, rqstp, &res->acl_len);
+	status = decode_getacl(xdr, rqstp, &res->acl_len);
 
 out:
 	return status;
@@ -5090,23 +5372,22 @@ out:
 /*
  * Decode CLOSE response
  */
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_closeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_close(&xdr, res);
+	status = decode_close(xdr, res);
 	if (status != 0)
 		goto out;
 	/*
@@ -5115,7 +5396,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
 	 * an ESTALE error. Shouldn't be a problem,
 	 * though, since fattr->valid will remain unset.
 	 */
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5124,36 +5405,35 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_openres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_savefh(&xdr);
+	status = decode_savefh(xdr);
 	if (status)
 		goto out;
-	status = decode_open(&xdr, res);
+	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	if (decode_getfh(&xdr, &res->fh) != 0)
+	if (decode_getfh(xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->f_attr, res->server,
+	if (decode_getfattr(xdr, res->f_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
-	if (decode_restorefh(&xdr) != 0)
+	if (decode_restorefh(xdr) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server,
+	decode_getfattr(xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5162,20 +5442,20 @@ out:
 /*
  * Decode OPEN_CONFIRM response
  */
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs_open_confirmres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open_confirm(&xdr, res);
+	status = decode_open_confirm(xdr, res);
 out:
 	return status;
 }
@@ -5183,26 +5463,26 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs_openres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_open(&xdr, res);
+	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->f_attr, res->server,
+	decode_getfattr(xdr, res->f_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5211,26 +5491,26 @@ out:
 /*
  * Decode SETATTR response
  */
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct nfs_setattrres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_setattr(&xdr);
+	status = decode_setattr(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5239,23 +5519,22 @@ out:
 /*
  * Decode LOCK response
  */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_lock_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lock(&xdr, res);
+	status = decode_lock(xdr, res);
 out:
 	return status;
 }
@@ -5263,23 +5542,22 @@ out:
 /*
  * Decode LOCKT response
  */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_lockt_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lockt(&xdr, res);
+	status = decode_lockt(xdr, res);
 out:
 	return status;
 }
@@ -5287,61 +5565,58 @@ out:
 /*
  * Decode LOCKU response
  */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_locku_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_locku(&xdr, res);
+	status = decode_locku(xdr, res);
 out:
 	return status;
 }
 
-static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
+					  struct xdr_stream *xdr, void *dummy)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_release_lockowner(&xdr);
+		status = decode_release_lockowner(xdr);
 	return status;
 }
 
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p,
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
 				 struct nfs4_readlink_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_readlink(&xdr, rqstp);
+	status = decode_readlink(xdr, rqstp);
 out:
 	return status;
 }
@@ -5349,23 +5624,22 @@ out:
 /*
  * Decode READDIR response
  */
-static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res)
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+				struct nfs4_readdir_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_readdir(&xdr, rqstp, res);
+	status = decode_readdir(xdr, rqstp, res);
 out:
 	return status;
 }
@@ -5373,23 +5647,22 @@ out:
 /*
  * Decode Read response
  */
-static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res)
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			     struct nfs_readres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_read(&xdr, rqstp, res);
+	status = decode_read(xdr, rqstp, res);
 	if (!status)
 		status = res->count;
 out:
@@ -5399,26 +5672,25 @@ out:
 /*
  * Decode WRITE response
  */
-static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      struct nfs_writeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_write(&xdr, res);
+	status = decode_write(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 	if (!status)
 		status = res->count;
@@ -5429,26 +5701,25 @@ out:
 /*
  * Decode COMMIT response
  */
-static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			       struct nfs_writeres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_commit(&xdr, res);
+	status = decode_commit(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5457,85 +5728,80 @@ out:
 /*
  * Decode FSINFO response
  */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
 			       struct nfs4_fsinfo_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, req);
+		status = decode_sequence(xdr, &res->seq_res, req);
 	if (!status)
-		status = decode_putfh(&xdr);
+		status = decode_putfh(xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, res->fsinfo);
+		status = decode_fsinfo(xdr, res->fsinfo);
 	return status;
 }
 
 /*
  * Decode PATHCONF response
  */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
 				 struct nfs4_pathconf_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, req);
+		status = decode_sequence(xdr, &res->seq_res, req);
 	if (!status)
-		status = decode_putfh(&xdr);
+		status = decode_putfh(xdr);
 	if (!status)
-		status = decode_pathconf(&xdr, res->pathconf);
+		status = decode_pathconf(xdr, res->pathconf);
 	return status;
 }
 
 /*
  * Decode STATFS response
  */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
 			       struct nfs4_statfs_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, req);
+		status = decode_sequence(xdr, &res->seq_res, req);
 	if (!status)
-		status = decode_putfh(&xdr);
+		status = decode_putfh(xdr);
 	if (!status)
-		status = decode_statfs(&xdr, res->fsstat);
+		status = decode_statfs(xdr, res->fsstat);
 	return status;
 }
 
 /*
  * Decode GETATTR_BITMAP response
  */
-static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs4_server_caps_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, req);
+	status = decode_sequence(xdr, &res->seq_res, req);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	status = decode_server_caps(&xdr, res);
+	status = decode_server_caps(xdr, res);
 out:
 	return status;
 }
@@ -5543,79 +5809,77 @@ out:
 /*
  * Decode RENEW response
  */
-static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+			      void *__unused)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_renew(&xdr);
+		status = decode_renew(xdr);
 	return status;
 }
 
 /*
  * Decode SETCLIENTID response
  */
-static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
-				    struct nfs4_setclientid_res *res)
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    struct nfs4_setclientid_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_setclientid(&xdr, res);
+		status = decode_setclientid(xdr, res);
 	return status;
 }
 
 /*
  * Decode SETCLIENTID_CONFIRM response
  */
-static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
+					    struct xdr_stream *xdr,
+					    struct nfs_fsinfo *fsinfo)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_setclientid_confirm(&xdr);
+		status = decode_setclientid_confirm(xdr);
 	if (!status)
-		status = decode_putrootfh(&xdr);
+		status = decode_putrootfh(xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, fsinfo);
+		status = decode_fsinfo(xdr, fsinfo);
 	return status;
 }
 
 /*
  * Decode DELEGRETURN response
  */
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs4_delegreturnres *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, rqstp);
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (status)
 		goto out;
-	status = decode_putfh(&xdr);
+	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_delegreturn(&xdr);
+	status = decode_delegreturn(xdr);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server,
+	decode_getfattr(xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
@@ -5624,26 +5888,27 @@ out:
 /*
  * Decode FS_LOCATIONS response
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
 				     struct nfs4_fs_locations_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (status)
 		goto out;
-	status = decode_sequence(&xdr, &res->seq_res, req);
+	status = decode_sequence(xdr, &res->seq_res, req);
 	if (status)
 		goto out;
-	if ((status = decode_putfh(&xdr)) != 0)
+	status = decode_putfh(xdr);
+	if (status)
 		goto out;
-	if ((status = decode_lookup(&xdr)) != 0)
+	status = decode_lookup(xdr);
+	if (status)
 		goto out;
-	xdr_enter_page(&xdr, PAGE_SIZE);
-	status = decode_getfattr(&xdr, &res->fs_locations->fattr,
+	xdr_enter_page(xdr, PAGE_SIZE);
+	status = decode_getfattr(xdr, &res->fs_locations->fattr,
 				 res->fs_locations->server,
 				 !RPC_IS_ASYNC(req->rq_task));
 out:
@@ -5654,129 +5919,194 @@ out:
 /*
  * Decode EXCHANGE_ID response
  */
-static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
 				    void *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_exchange_id(&xdr, res);
+		status = decode_exchange_id(xdr, res);
 	return status;
 }
 
 /*
  * Decode CREATE_SESSION response
  */
-static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
 				       struct nfs41_create_session_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_create_session(&xdr, res);
+		status = decode_create_session(xdr, res);
 	return status;
 }
 
 /*
  * Decode DESTROY_SESSION response
  */
-static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p,
-					void *dummy)
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_destroy_session(&xdr, dummy);
+		status = decode_destroy_session(xdr, res);
 	return status;
 }
 
 /*
  * Decode SEQUENCE response
  */
-static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
 				 struct nfs4_sequence_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, res, rqstp);
+		status = decode_sequence(xdr, res, rqstp);
 	return status;
 }
 
 /*
  * Decode GET_LEASE_TIME response
  */
-static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
 				       struct nfs4_get_lease_time_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->lr_seq_res, rqstp);
+		status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
 	if (!status)
-		status = decode_putrootfh(&xdr);
+		status = decode_putrootfh(xdr);
 	if (!status)
-		status = decode_fsinfo(&xdr, res->lr_fsinfo);
+		status = decode_fsinfo(xdr, res->lr_fsinfo);
 	return status;
 }
 
 /*
  * Decode RECLAIM_COMPLETE response
  */
-static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
+static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
+					 struct xdr_stream *xdr,
 					 struct nfs41_reclaim_complete_res *res)
 {
-	struct xdr_stream xdr;
 	struct compound_hdr hdr;
 	int status;
 
-	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-	status = decode_compound_hdr(&xdr, &hdr);
+	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
-		status = decode_sequence(&xdr, &res->seq_res, rqstp);
5757 if (!status) 6016 if (!status)
5758 status = decode_reclaim_complete(&xdr, (void *)NULL); 6017 status = decode_reclaim_complete(xdr, (void *)NULL);
6018 return status;
6019}
6020
6021/*
6022 * Decode GETDEVICEINFO response
6023 */
6024static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
6025 struct xdr_stream *xdr,
6026 struct nfs4_getdeviceinfo_res *res)
6027{
6028 struct compound_hdr hdr;
6029 int status;
6030
6031 status = decode_compound_hdr(xdr, &hdr);
6032 if (status != 0)
6033 goto out;
6034 status = decode_sequence(xdr, &res->seq_res, rqstp);
6035 if (status != 0)
6036 goto out;
6037 status = decode_getdeviceinfo(xdr, res->pdev);
6038out:
6039 return status;
6040}
6041
6042/*
6043 * Decode LAYOUTGET response
6044 */
6045static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
6046 struct xdr_stream *xdr,
6047 struct nfs4_layoutget_res *res)
6048{
6049 struct compound_hdr hdr;
6050 int status;
6051
6052 status = decode_compound_hdr(xdr, &hdr);
6053 if (status)
6054 goto out;
6055 status = decode_sequence(xdr, &res->seq_res, rqstp);
6056 if (status)
6057 goto out;
6058 status = decode_putfh(xdr);
6059 if (status)
6060 goto out;
6061 status = decode_layoutget(xdr, rqstp, res);
6062out:
5759 return status; 6063 return status;
5760} 6064}
5761#endif /* CONFIG_NFS_V4_1 */ 6065#endif /* CONFIG_NFS_V4_1 */
5762 6066
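Every COMPOUND reply decoder in this hunk follows the same conversion: the raw __be32 *p argument and the per-function xdr_init_decode() call are gone, and each decoder now receives a struct xdr_stream that the RPC layer has already set up. A minimal sketch of the converted shape, using an invented FOO operation (nfs4_foo_res and decode_foo() are placeholders, not part of this patch):

	/* Sketch only: a COMPOUND reply decoder in the new calling
	 * convention. The caller owns xdr stream setup; the decoder
	 * just consumes operations in order. */
	static int nfs4_xdr_dec_foo(struct rpc_rqst *rqstp,
				    struct xdr_stream *xdr,
				    struct nfs4_foo_res *res)
	{
		struct compound_hdr hdr;
		int status;

		status = decode_compound_hdr(xdr, &hdr);
		if (status)
			goto out;
		status = decode_sequence(xdr, &res->seq_res, rqstp);
		if (status)
			goto out;
		status = decode_putfh(xdr);
		if (status)
			goto out;
		status = decode_foo(xdr, res);
	out:
		return status;
	}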
5763__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 6067/**
6068 * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
6069 * the local page cache.
6070 * @xdr: XDR stream where entry resides
6071 * @entry: buffer to fill in with entry data
6072 * @plus: boolean indicating whether this should be a readdirplus entry
6073 *
6074 * Returns zero if successful, otherwise a negative errno value is
6075 * returned.
6076 *
6077 * This function is not invoked during READDIR reply decoding, but
6078 * rather whenever an application invokes the getdents(2) system call
6079 * on a directory already in our cache.
6080 */
6081int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6082 int plus)
5764{ 6083{
5765 uint32_t bitmap[2] = {0}; 6084 uint32_t bitmap[2] = {0};
5766 uint32_t len; 6085 uint32_t len;
5767 6086 __be32 *p = xdr_inline_decode(xdr, 4);
5768 if (!*p++) { 6087 if (unlikely(!p))
5769 if (!*p) 6088 goto out_overflow;
5770 return ERR_PTR(-EAGAIN); 6089 if (*p == xdr_zero) {
6090 p = xdr_inline_decode(xdr, 4);
6091 if (unlikely(!p))
6092 goto out_overflow;
6093 if (*p == xdr_zero)
6094 return -EAGAIN;
5771 entry->eof = 1; 6095 entry->eof = 1;
5772 return ERR_PTR(-EBADCOOKIE); 6096 return -EBADCOOKIE;
5773 } 6097 }
5774 6098
6099 p = xdr_inline_decode(xdr, 12);
6100 if (unlikely(!p))
6101 goto out_overflow;
5775 entry->prev_cookie = entry->cookie; 6102 entry->prev_cookie = entry->cookie;
5776 p = xdr_decode_hyper(p, &entry->cookie); 6103 p = xdr_decode_hyper(p, &entry->cookie);
5777 entry->len = ntohl(*p++); 6104 entry->len = be32_to_cpup(p);
6105
6106 p = xdr_inline_decode(xdr, entry->len);
6107 if (unlikely(!p))
6108 goto out_overflow;
5778 entry->name = (const char *) p; 6109 entry->name = (const char *) p;
5779 p += XDR_QUADLEN(entry->len);
5780 6110
5781 /* 6111 /*
5782 * In case the server doesn't return an inode number, 6112 * In case the server doesn't return an inode number,
@@ -5784,32 +6114,29 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
5784 * since glibc seems to choke on it...) 6114 * since glibc seems to choke on it...)
5785 */ 6115 */
5786 entry->ino = 1; 6116 entry->ino = 1;
6117 entry->fattr->valid = 0;
5787 6118
5788 len = ntohl(*p++); /* bitmap length */ 6119 if (decode_attr_bitmap(xdr, bitmap) < 0)
5789 if (len-- > 0) { 6120 goto out_overflow;
5790 bitmap[0] = ntohl(*p++);
5791 if (len-- > 0) {
5792 bitmap[1] = ntohl(*p++);
5793 p += len;
5794 }
5795 }
5796 len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */
5797 if (len > 0) {
5798 if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) {
5799 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
5800 /* Ignore the return value of rdattr_error for now */
5801 p++;
5802 len--;
5803 }
5804 if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID)
5805 xdr_decode_hyper(p, &entry->ino);
5806 else if (bitmap[0] == FATTR4_WORD0_FILEID)
5807 xdr_decode_hyper(p, &entry->ino);
5808 p += len;
5809 }
5810 6121
5811 entry->eof = !p[0] && p[1]; 6122 if (decode_attr_length(xdr, &len, &p) < 0)
5812 return p; 6123 goto out_overflow;
6124
6125 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6126 entry->server, 1) < 0)
6127 goto out_overflow;
6128 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
6129 entry->ino = entry->fattr->fileid;
6130
6131 entry->d_type = DT_UNKNOWN;
6132 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
6133 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
6134
6135 return 0;
6136
6137out_overflow:
6138 print_overflow_msg(__func__, xdr);
6139 return -EAGAIN;
5813} 6140}
5814 6141
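The rewritten nfs4_decode_dirent() above replaces blind pointer arithmetic with xdr_inline_decode(), which returns NULL instead of reading past the end of the page data, so every fixed-size read is bounds-checked before it is dereferenced. The idiom in isolation, as a hedged sketch (the helper name is invented):

	/* Sketch: bounds-checked XDR reads. xdr_inline_decode() reserves
	 * nbytes from the stream and returns NULL on overflow, replacing
	 * the old unchecked "*p++" style. */
	static int read_cookie_and_namelen(struct xdr_stream *xdr,
					   u64 *cookie, u32 *namelen)
	{
		__be32 *p = xdr_inline_decode(xdr, 12); /* 8-byte cookie + length */

		if (unlikely(!p))
			return -EAGAIN;		/* ran off the end of the buffer */
		p = xdr_decode_hyper(p, cookie);
		*namelen = be32_to_cpup(p);
		return 0;
	}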
5815/* 6142/*
@@ -5885,8 +6212,8 @@ nfs4_stat_to_errno(int stat)
5885#define PROC(proc, argtype, restype) \ 6212#define PROC(proc, argtype, restype) \
5886[NFSPROC4_CLNT_##proc] = { \ 6213[NFSPROC4_CLNT_##proc] = { \
5887 .p_proc = NFSPROC4_COMPOUND, \ 6214 .p_proc = NFSPROC4_COMPOUND, \
5888 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 6215 .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \
5889 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 6216 .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \
5890 .p_arglen = NFS4_##argtype##_sz, \ 6217 .p_arglen = NFS4_##argtype##_sz, \
5891 .p_replen = NFS4_##restype##_sz, \ 6218 .p_replen = NFS4_##restype##_sz, \
5892 .p_statidx = NFSPROC4_CLNT_##proc, \ 6219 .p_statidx = NFSPROC4_CLNT_##proc, \
@@ -5894,48 +6221,50 @@ nfs4_stat_to_errno(int stat)
5894} 6221}
5895 6222
5896struct rpc_procinfo nfs4_procedures[] = { 6223struct rpc_procinfo nfs4_procedures[] = {
5897 PROC(READ, enc_read, dec_read), 6224 PROC(READ, enc_read, dec_read),
5898 PROC(WRITE, enc_write, dec_write), 6225 PROC(WRITE, enc_write, dec_write),
5899 PROC(COMMIT, enc_commit, dec_commit), 6226 PROC(COMMIT, enc_commit, dec_commit),
5900 PROC(OPEN, enc_open, dec_open), 6227 PROC(OPEN, enc_open, dec_open),
5901 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), 6228 PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm),
5902 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), 6229 PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr),
5903 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), 6230 PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade),
5904 PROC(CLOSE, enc_close, dec_close), 6231 PROC(CLOSE, enc_close, dec_close),
5905 PROC(SETATTR, enc_setattr, dec_setattr), 6232 PROC(SETATTR, enc_setattr, dec_setattr),
5906 PROC(FSINFO, enc_fsinfo, dec_fsinfo), 6233 PROC(FSINFO, enc_fsinfo, dec_fsinfo),
5907 PROC(RENEW, enc_renew, dec_renew), 6234 PROC(RENEW, enc_renew, dec_renew),
5908 PROC(SETCLIENTID, enc_setclientid, dec_setclientid), 6235 PROC(SETCLIENTID, enc_setclientid, dec_setclientid),
5909 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), 6236 PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
5910 PROC(LOCK, enc_lock, dec_lock), 6237 PROC(LOCK, enc_lock, dec_lock),
5911 PROC(LOCKT, enc_lockt, dec_lockt), 6238 PROC(LOCKT, enc_lockt, dec_lockt),
5912 PROC(LOCKU, enc_locku, dec_locku), 6239 PROC(LOCKU, enc_locku, dec_locku),
5913 PROC(ACCESS, enc_access, dec_access), 6240 PROC(ACCESS, enc_access, dec_access),
5914 PROC(GETATTR, enc_getattr, dec_getattr), 6241 PROC(GETATTR, enc_getattr, dec_getattr),
5915 PROC(LOOKUP, enc_lookup, dec_lookup), 6242 PROC(LOOKUP, enc_lookup, dec_lookup),
5916 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), 6243 PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root),
5917 PROC(REMOVE, enc_remove, dec_remove), 6244 PROC(REMOVE, enc_remove, dec_remove),
5918 PROC(RENAME, enc_rename, dec_rename), 6245 PROC(RENAME, enc_rename, dec_rename),
5919 PROC(LINK, enc_link, dec_link), 6246 PROC(LINK, enc_link, dec_link),
5920 PROC(SYMLINK, enc_symlink, dec_symlink), 6247 PROC(SYMLINK, enc_symlink, dec_symlink),
5921 PROC(CREATE, enc_create, dec_create), 6248 PROC(CREATE, enc_create, dec_create),
5922 PROC(PATHCONF, enc_pathconf, dec_pathconf), 6249 PROC(PATHCONF, enc_pathconf, dec_pathconf),
5923 PROC(STATFS, enc_statfs, dec_statfs), 6250 PROC(STATFS, enc_statfs, dec_statfs),
5924 PROC(READLINK, enc_readlink, dec_readlink), 6251 PROC(READLINK, enc_readlink, dec_readlink),
5925 PROC(READDIR, enc_readdir, dec_readdir), 6252 PROC(READDIR, enc_readdir, dec_readdir),
5926 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), 6253 PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
5927 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), 6254 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
5928 PROC(GETACL, enc_getacl, dec_getacl), 6255 PROC(GETACL, enc_getacl, dec_getacl),
5929 PROC(SETACL, enc_setacl, dec_setacl), 6256 PROC(SETACL, enc_setacl, dec_setacl),
5930 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6257 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
5931 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), 6258 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
5932#if defined(CONFIG_NFS_V4_1) 6259#if defined(CONFIG_NFS_V4_1)
5933 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6260 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
5934 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6261 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
5935 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), 6262 PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
5936 PROC(SEQUENCE, enc_sequence, dec_sequence), 6263 PROC(SEQUENCE, enc_sequence, dec_sequence),
5937 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), 6264 PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
5938 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6265 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6266 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6267 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
5939#endif /* CONFIG_NFS_V4_1 */ 6268#endif /* CONFIG_NFS_V4_1 */
5940}; 6269};
5941 6270
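For reference, expanding PROC(SEQUENCE, enc_sequence, dec_sequence) by hand gives the entry below. The point of the kxdreproc_t/kxdrdproc_t casts is that encode and decode functions now have distinct prototypes, taking a struct xdr_stream, rather than sharing the old kxdrproc_t type:

	/* Sketch: one PROC() entry after macro expansion. */
	[NFSPROC4_CLNT_SEQUENCE] = {
		.p_proc    = NFSPROC4_COMPOUND,
		.p_encode  = (kxdreproc_t)nfs4_xdr_enc_sequence,
		.p_decode  = (kxdrdproc_t)nfs4_xdr_dec_sequence,
		.p_arglen  = NFS4_enc_sequence_sz,
		.p_replen  = NFS4_dec_sequence_sz,
		.p_statidx = NFSPROC4_CLNT_SEQUENCE,
	},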
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index df101d9f546a..903908a20023 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -3,9 +3,10 @@
3 * 3 *
4 * Allow an NFS filesystem to be mounted as root. The way this works is: 4 * Allow an NFS filesystem to be mounted as root. The way this works is:
5 * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. 5 * (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
6 * (2) Handle RPC negotiation with the system which replied to RARP or 6 * (2) Construct the device string and the options string using DHCP
7 * was reported as a boot server by BOOTP or manually. 7 * option 17 and/or kernel command line options.
8 * (3) The actual mounting is done later, when init() is running. 8 * (3) When mount_root() sets up the root file system, pass these strings
9 * to the NFS client's regular mount interface via sys_mount().
9 * 10 *
10 * 11 *
11 * Changes: 12 * Changes:
@@ -65,470 +66,245 @@
65 * Hua Qin : Support for mounting root file system via 66 * Hua Qin : Support for mounting root file system via
66 * NFS over TCP. 67 * NFS over TCP.
67 * Fabian Frederick: Option parser rebuilt (using parser lib) 68 * Fabian Frederick: Option parser rebuilt (using parser lib)
68*/ 69 * Chuck Lever : Use super.c's text-based mount option parsing
70 * Chuck Lever : Add "nfsrootdebug".
71 */
69 72
70#include <linux/types.h> 73#include <linux/types.h>
71#include <linux/string.h> 74#include <linux/string.h>
72#include <linux/kernel.h>
73#include <linux/time.h>
74#include <linux/fs.h>
75#include <linux/init.h> 75#include <linux/init.h>
76#include <linux/sunrpc/clnt.h>
77#include <linux/sunrpc/xprtsock.h>
78#include <linux/nfs.h> 76#include <linux/nfs.h>
79#include <linux/nfs_fs.h> 77#include <linux/nfs_fs.h>
80#include <linux/nfs_mount.h>
81#include <linux/in.h>
82#include <linux/major.h>
83#include <linux/utsname.h> 78#include <linux/utsname.h>
84#include <linux/inet.h>
85#include <linux/root_dev.h> 79#include <linux/root_dev.h>
86#include <net/ipconfig.h> 80#include <net/ipconfig.h>
87#include <linux/parser.h>
88 81
89#include "internal.h" 82#include "internal.h"
90 83
91/* Define this to allow debugging output */
92#undef NFSROOT_DEBUG
93#define NFSDBG_FACILITY NFSDBG_ROOT 84#define NFSDBG_FACILITY NFSDBG_ROOT
94 85
95/* Default port to use if server is not running a portmapper */
96#define NFS_MNT_PORT 627
97
98/* Default path we try to mount. "%s" gets replaced by our IP address */ 86/* Default path we try to mount. "%s" gets replaced by our IP address */
99#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
100 88
101/* Parameters passed from the kernel command line */ 89/* Parameters passed from the kernel command line */
102static char nfs_root_name[256] __initdata = ""; 90static char nfs_root_parms[256] __initdata = "";
91
92/* Text-based mount options passed to super.c */
93static char nfs_root_options[256] __initdata = "";
103 94
104/* Address of NFS server */ 95/* Address of NFS server */
105static __be32 servaddr __initdata = 0; 96static __be32 servaddr __initdata = htonl(INADDR_NONE);
106 97
107/* Name of directory to mount */ 98/* Name of directory to mount */
108static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, }; 99static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
109
110/* NFS-related data */
111static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
112static int nfs_port __initdata = 0; /* Port to connect to for NFS */
113static int mount_port __initdata = 0; /* Mount daemon port number */
114
115
116/***************************************************************************
117
118 Parsing of options
119
120 ***************************************************************************/
121
122enum {
123 /* Options that take integer arguments */
124 Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
125 Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
126 /* Options that take no arguments */
127 Opt_soft, Opt_hard, Opt_intr,
128 Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac,
129 Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
130 Opt_acl, Opt_noacl,
131 /* Error token */
132 Opt_err
133};
134
135static const match_table_t tokens __initconst = {
136 {Opt_port, "port=%u"},
137 {Opt_rsize, "rsize=%u"},
138 {Opt_wsize, "wsize=%u"},
139 {Opt_timeo, "timeo=%u"},
140 {Opt_retrans, "retrans=%u"},
141 {Opt_acregmin, "acregmin=%u"},
142 {Opt_acregmax, "acregmax=%u"},
143 {Opt_acdirmin, "acdirmin=%u"},
144 {Opt_acdirmax, "acdirmax=%u"},
145 {Opt_soft, "soft"},
146 {Opt_hard, "hard"},
147 {Opt_intr, "intr"},
148 {Opt_nointr, "nointr"},
149 {Opt_posix, "posix"},
150 {Opt_noposix, "noposix"},
151 {Opt_cto, "cto"},
152 {Opt_nocto, "nocto"},
153 {Opt_ac, "ac"},
154 {Opt_noac, "noac"},
155 {Opt_lock, "lock"},
156 {Opt_nolock, "nolock"},
157 {Opt_v2, "nfsvers=2"},
158 {Opt_v2, "v2"},
159 {Opt_v3, "nfsvers=3"},
160 {Opt_v3, "v3"},
161 {Opt_udp, "proto=udp"},
162 {Opt_udp, "udp"},
163 {Opt_tcp, "proto=tcp"},
164 {Opt_tcp, "tcp"},
165 {Opt_acl, "acl"},
166 {Opt_noacl, "noacl"},
167 {Opt_err, NULL}
168
169};
170 100
101/* server:export path string passed to super.c */
102static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
103
104#ifdef RPC_DEBUG
171/* 105/*
172 * Parse option string. 106 * When the "nfsrootdebug" kernel command line option is specified,
107 * enable debugging messages for NFSROOT.
173 */ 108 */
174 109static int __init nfs_root_debug(char *__unused)
175static int __init root_nfs_parse(char *name, char *buf)
176{ 110{
177 111 nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
178 char *p;
179 substring_t args[MAX_OPT_ARGS];
180 int option;
181
182 if (!name)
183 return 1;
184
185 /* Set the NFS remote path */
186 p = strsep(&name, ",");
187 if (p[0] != '\0' && strcmp(p, "default") != 0)
188 strlcpy(buf, p, NFS_MAXPATHLEN);
189
190 while ((p = strsep (&name, ",")) != NULL) {
191 int token;
192 if (!*p)
193 continue;
194 token = match_token(p, tokens, args);
195
196 /* %u tokens only. Beware if you add new tokens! */
197 if (token < Opt_soft && match_int(&args[0], &option))
198 return 0;
199 switch (token) {
200 case Opt_port:
201 nfs_port = option;
202 break;
203 case Opt_rsize:
204 nfs_data.rsize = option;
205 break;
206 case Opt_wsize:
207 nfs_data.wsize = option;
208 break;
209 case Opt_timeo:
210 nfs_data.timeo = option;
211 break;
212 case Opt_retrans:
213 nfs_data.retrans = option;
214 break;
215 case Opt_acregmin:
216 nfs_data.acregmin = option;
217 break;
218 case Opt_acregmax:
219 nfs_data.acregmax = option;
220 break;
221 case Opt_acdirmin:
222 nfs_data.acdirmin = option;
223 break;
224 case Opt_acdirmax:
225 nfs_data.acdirmax = option;
226 break;
227 case Opt_soft:
228 nfs_data.flags |= NFS_MOUNT_SOFT;
229 break;
230 case Opt_hard:
231 nfs_data.flags &= ~NFS_MOUNT_SOFT;
232 break;
233 case Opt_intr:
234 case Opt_nointr:
235 break;
236 case Opt_posix:
237 nfs_data.flags |= NFS_MOUNT_POSIX;
238 break;
239 case Opt_noposix:
240 nfs_data.flags &= ~NFS_MOUNT_POSIX;
241 break;
242 case Opt_cto:
243 nfs_data.flags &= ~NFS_MOUNT_NOCTO;
244 break;
245 case Opt_nocto:
246 nfs_data.flags |= NFS_MOUNT_NOCTO;
247 break;
248 case Opt_ac:
249 nfs_data.flags &= ~NFS_MOUNT_NOAC;
250 break;
251 case Opt_noac:
252 nfs_data.flags |= NFS_MOUNT_NOAC;
253 break;
254 case Opt_lock:
255 nfs_data.flags &= ~NFS_MOUNT_NONLM;
256 break;
257 case Opt_nolock:
258 nfs_data.flags |= NFS_MOUNT_NONLM;
259 break;
260 case Opt_v2:
261 nfs_data.flags &= ~NFS_MOUNT_VER3;
262 break;
263 case Opt_v3:
264 nfs_data.flags |= NFS_MOUNT_VER3;
265 break;
266 case Opt_udp:
267 nfs_data.flags &= ~NFS_MOUNT_TCP;
268 break;
269 case Opt_tcp:
270 nfs_data.flags |= NFS_MOUNT_TCP;
271 break;
272 case Opt_acl:
273 nfs_data.flags &= ~NFS_MOUNT_NOACL;
274 break;
275 case Opt_noacl:
276 nfs_data.flags |= NFS_MOUNT_NOACL;
277 break;
278 default:
279 printk(KERN_WARNING "Root-NFS: unknown "
280 "option: %s\n", p);
281 return 0;
282 }
283 }
284
285 return 1; 112 return 1;
286} 113}
287 114
115__setup("nfsrootdebug", nfs_root_debug);
116#endif
117
288/* 118/*
289 * Prepare the NFS data structure and parse all options. 119 * Parse NFS server and directory information passed on the kernel
120 * command line.
121 *
122 * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
123 *
124 * If there is a "%s" token in the <root-dir> string, it is replaced
125 * by the ASCII-representation of the client's IP address.
290 */ 126 */
291static int __init root_nfs_name(char *name) 127static int __init nfs_root_setup(char *line)
292{ 128{
293 static char buf[NFS_MAXPATHLEN] __initdata; 129 ROOT_DEV = Root_NFS;
294 char *cp; 130
295 131 if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
296 /* Set some default values */ 132 strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
297 memset(&nfs_data, 0, sizeof(nfs_data)); 133 } else {
298 nfs_port = -1; 134 size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
299 nfs_data.version = NFS_MOUNT_VERSION; 135 if (n >= sizeof(nfs_root_parms))
300 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ 136 line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
301 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; 137 sprintf(nfs_root_parms, NFS_ROOT, line);
302 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
303 nfs_data.acregmin = NFS_DEF_ACREGMIN;
304 nfs_data.acregmax = NFS_DEF_ACREGMAX;
305 nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
306 nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
307 strcpy(buf, NFS_ROOT);
308
309 /* Process options received from the remote server */
310 root_nfs_parse(root_server_path, buf);
311
312 /* Override them by options set on kernel command-line */
313 root_nfs_parse(name, buf);
314
315 cp = utsname()->nodename;
316 if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
317 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
318 return -1;
319 } 138 }
320 sprintf(nfs_export_path, buf, cp); 139
140 /*
141 * Extract the IP address of the NFS server containing our
142 * root file system, if one was specified.
143 *
144 * Note: root_nfs_parse_addr() removes the server-ip from
145 * nfs_root_parms, if it exists.
146 */
147 root_server_addr = root_nfs_parse_addr(nfs_root_parms);
321 148
322 return 1; 149 return 1;
323} 150}
324 151
152__setup("nfsroot=", nfs_root_setup);
325 153
326/* 154static int __init root_nfs_copy(char *dest, const char *src,
327 * Get NFS server address. 155 const size_t destlen)
328 */
329static int __init root_nfs_addr(void)
330{ 156{
331 if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) { 157 if (strlcpy(dest, src, destlen) > destlen)
332 printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n");
333 return -1; 158 return -1;
334 } 159 return 0;
160}
335 161
336 snprintf(nfs_data.hostname, sizeof(nfs_data.hostname), 162static int __init root_nfs_cat(char *dest, const char *src,
337 "%pI4", &servaddr); 163 const size_t destlen)
164{
165 if (strlcat(dest, src, destlen) > destlen)
166 return -1;
338 return 0; 167 return 0;
339} 168}
340 169
341/* 170/*
342 * Tell the user what's going on. 171 * Parse out root export path and mount options from
172 * passed-in string @incoming.
173 *
174 * Copy the export path into @exppath.
343 */ 175 */
344#ifdef NFSROOT_DEBUG 176static int __init root_nfs_parse_options(char *incoming, char *exppath,
345static void __init root_nfs_print(void) 177 const size_t exppathlen)
346{ 178{
347 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", 179 char *p;
348 nfs_export_path, nfs_data.hostname);
349 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
350 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
351 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
352 nfs_data.acregmin, nfs_data.acregmax,
353 nfs_data.acdirmin, nfs_data.acdirmax);
354 printk(KERN_NOTICE "Root-NFS: nfsd port = %d, mountd port = %d, flags = %08x\n",
355 nfs_port, mount_port, nfs_data.flags);
356}
357#endif
358
359 180
360static int __init root_nfs_init(void) 181 /*
361{ 182 * Set the NFS remote path
362#ifdef NFSROOT_DEBUG 183 */
363 nfs_debug |= NFSDBG_ROOT; 184 p = strsep(&incoming, ",");
364#endif 185 if (*p != '\0' && strcmp(p, "default") != 0)
186 if (root_nfs_copy(exppath, p, exppathlen))
187 return -1;
365 188
366 /* 189 /*
367 * Decode the root directory path name and NFS options from 190 * @incoming now points to the rest of the string; if it
368 * the kernel command line. This has to go here in order to 191 * contains something, append it to our root options buffer
369 * be able to use the client IP address for the remote root
370 * directory (necessary for pure RARP booting).
371 */ 192 */
372 if (root_nfs_name(nfs_root_name) < 0 || 193 if (incoming != NULL && *incoming != '\0')
373 root_nfs_addr() < 0) 194 if (root_nfs_cat(nfs_root_options, incoming,
374 return -1; 195 sizeof(nfs_root_options)))
196 return -1;
375 197
376#ifdef NFSROOT_DEBUG 198 /*
377 root_nfs_print(); 199 * Possibly prepare for more options to be appended
378#endif 200 */
201 if (nfs_root_options[0] != '\0' &&
202 nfs_root_options[strlen(nfs_root_options)] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
379 206
380 return 0; 207 return 0;
381} 208}
382 209
383
384/* 210/*
385 * Parse NFS server and directory information passed on the kernel 211 * Decode the export directory path name and NFS options from
386 * command line. 212 * the kernel command line. This has to be done late in order to
213 * use a dynamically acquired client IP address for the remote
214 * root directory path.
215 *
216 * Returns zero if successful; otherwise -1 is returned.
387 */ 217 */
388static int __init nfs_root_setup(char *line) 218static int __init root_nfs_data(char *cmdline)
389{ 219{
390 ROOT_DEV = Root_NFS; 220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
391 if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { 221 int len, retval = -1;
392 strlcpy(nfs_root_name, line, sizeof(nfs_root_name)); 222 char *tmp = NULL;
393 } else { 223 const size_t tmplen = sizeof(nfs_export_path);
394 int n = strlen(line) + sizeof(NFS_ROOT) - 1; 224
395 if (n >= sizeof(nfs_root_name)) 225 tmp = kzalloc(tmplen, GFP_KERNEL);
396 line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0'; 226 if (tmp == NULL)
397 sprintf(nfs_root_name, NFS_ROOT, line); 227 goto out_nomem;
228 strcpy(tmp, NFS_ROOT);
229
230 if (root_server_path[0] != '\0') {
231 dprintk("Root-NFS: DHCPv4 option 17: %s\n",
232 root_server_path);
233 if (root_nfs_parse_options(root_server_path, tmp, tmplen))
234 goto out_optionstoolong;
398 } 235 }
399 root_server_addr = root_nfs_parse_addr(nfs_root_name);
400 return 1;
401}
402
403__setup("nfsroot=", nfs_root_setup);
404
405/***************************************************************************
406 236
407 Routines to actually mount the root directory 237 if (cmdline[0] != '\0') {
238 dprintk("Root-NFS: nfsroot=%s\n", cmdline);
239 if (root_nfs_parse_options(cmdline, tmp, tmplen))
240 goto out_optionstoolong;
241 }
408 242
409 ***************************************************************************/ 243 /*
244 * Append mandatory options for nfsroot so they override
245 * what has come before
246 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
248 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option,
250 sizeof(nfs_root_options)))
251 goto out_optionstoolong;
410 252
411/* 253 /*
412 * Construct sockaddr_in from address and port number. 254 * Set up nfs_root_device. For NFS mounts, this looks like
413 */ 255 *
414static inline void 256 * server:/path
415set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) 257 *
416{ 258 * At this point, utsname()->nodename contains our local
417 sin->sin_family = AF_INET; 259 * IP address or hostname, set by ipconfig. If "%s" exists
418 sin->sin_addr.s_addr = addr; 260 * in tmp, substitute the nodename, then shovel the whole
419 sin->sin_port = port; 261 * mess into nfs_root_device.
420} 262 */
263 len = snprintf(nfs_export_path, sizeof(nfs_export_path),
264 tmp, utsname()->nodename);
265 if (len > (int)sizeof(nfs_export_path))
266 goto out_devnametoolong;
267 len = snprintf(nfs_root_device, sizeof(nfs_root_device),
268 "%pI4:%s", &servaddr, nfs_export_path);
269 if (len > (int)sizeof(nfs_root_device))
270 goto out_devnametoolong;
421 271
422/* 272 retval = 0;
423 * Query server portmapper for the port of a daemon program.
424 */
425static int __init root_nfs_getport(int program, int version, int proto)
426{
427 struct sockaddr_in sin;
428 273
429 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n", 274out:
430 program, version, &servaddr); 275 kfree(tmp);
431 set_sockaddr(&sin, servaddr, 0); 276 return retval;
432 return rpcb_getport_sync(&sin, program, version, proto); 277out_nomem:
278 printk(KERN_ERR "Root-NFS: could not allocate memory\n");
279 goto out;
280out_optionstoolong:
281 printk(KERN_ERR "Root-NFS: mount options string too long\n");
282 goto out;
283out_devnametoolong:
284 printk(KERN_ERR "Root-NFS: root device name too long.\n");
285 goto out;
433} 286}
434 287
435 288/**
436/* 289 * nfs_root_data - Return prepared 'data' for NFSROOT mount
437 * Use portmapper to find mountd and nfsd port numbers if not overriden 290 * @root_device: OUT: address of string containing NFSROOT device
438 * by the user. Use defaults if portmapper is not available. 291 * @root_data: OUT: address of string containing NFSROOT mount options
439 * XXX: Is there any nfs server with no portmapper? 292 *
293 * Returns zero and sets @root_device and @root_data if successful,
294 * otherwise -1 is returned.
440 */ 295 */
441static int __init root_nfs_ports(void) 296int __init nfs_root_data(char **root_device, char **root_data)
442{ 297{
443 int port; 298 servaddr = root_server_addr;
444 int nfsd_ver, mountd_ver; 299 if (servaddr == htonl(INADDR_NONE)) {
445 int nfsd_port, mountd_port; 300 printk(KERN_ERR "Root-NFS: no NFS server address\n");
446 int proto; 301 return -1;
447
448 if (nfs_data.flags & NFS_MOUNT_VER3) {
449 nfsd_ver = NFS3_VERSION;
450 mountd_ver = NFS_MNT3_VERSION;
451 nfsd_port = NFS_PORT;
452 mountd_port = NFS_MNT_PORT;
453 } else {
454 nfsd_ver = NFS2_VERSION;
455 mountd_ver = NFS_MNT_VERSION;
456 nfsd_port = NFS_PORT;
457 mountd_port = NFS_MNT_PORT;
458 }
459
460 proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
461
462 if (nfs_port < 0) {
463 if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) {
464 printk(KERN_ERR "Root-NFS: Unable to get nfsd port "
465 "number from server, using default\n");
466 port = nfsd_port;
467 }
468 nfs_port = port;
469 dprintk("Root-NFS: Portmapper on server returned %d "
470 "as nfsd port\n", port);
471 } 302 }
472 303
473 if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { 304 if (root_nfs_data(nfs_root_parms) < 0)
474 printk(KERN_ERR "Root-NFS: Unable to get mountd port " 305 return -1;
475 "number from server, using default\n");
476 port = mountd_port;
477 }
478 mount_port = port;
479 dprintk("Root-NFS: mountd port is %d\n", port);
480 306
307 *root_device = nfs_root_device;
308 *root_data = nfs_root_options;
481 return 0; 309 return 0;
482} 310}
483
484
485/*
486 * Get a file handle from the server for the directory which is to be
487 * mounted.
488 */
489static int __init root_nfs_get_handle(void)
490{
491 struct sockaddr_in sin;
492 unsigned int auth_flav_len = 0;
493 struct nfs_mount_request request = {
494 .sap = (struct sockaddr *)&sin,
495 .salen = sizeof(sin),
496 .dirpath = nfs_export_path,
497 .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
498 NFS_MNT3_VERSION : NFS_MNT_VERSION,
499 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
500 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
501 .auth_flav_len = &auth_flav_len,
502 };
503 int status = -ENOMEM;
504
505 request.fh = nfs_alloc_fhandle();
506 if (!request.fh)
507 goto out;
508 set_sockaddr(&sin, servaddr, htons(mount_port));
509 status = nfs_mount(&request);
510 if (status < 0)
511 printk(KERN_ERR "Root-NFS: Server returned error %d "
512 "while mounting %s\n", status, nfs_export_path);
513 else {
514 nfs_data.root.size = request.fh->size;
515 memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
516 }
517 nfs_free_fhandle(request.fh);
518out:
519 return status;
520}
521
522/*
523 * Get the NFS port numbers and file handle, and return the prepared 'data'
524 * argument for mount() if everything went OK. Return NULL otherwise.
525 */
526void * __init nfs_root_data(void)
527{
528 if (root_nfs_init() < 0
529 || root_nfs_ports() < 0
530 || root_nfs_get_handle() < 0)
531 return NULL;
532 set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
533 return (void*)&nfs_data;
534}
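To make the new nfsroot flow concrete, here is a hedged walk-through with an invented configuration. Given a command line of

	nfsroot=192.168.17.5:/export/%s,tcp,nfsvers=3

and a nodename of "client1" assigned by ipconfig, root_nfs_parse_addr() peels the server address off nfs_root_parms, root_nfs_parse_options() splits the export path from the option list, and root_nfs_data() ends up with approximately:

	nfs_export_path  = "/export/client1"
	nfs_root_device  = "192.168.17.5:/export/client1"
	nfs_root_options = "tcp,nfsvers=3,nolock,addr=192.168.17.5"

mount_root() then passes these two strings to the NFS client's regular text-based mount interface via sys_mount(), instead of filling in the old binary struct nfs_mount_data and probing portmapper and mountd by hand.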
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 919490232e17..e1164e3f9e69 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,12 +26,9 @@ static struct kmem_cache *nfs_page_cachep;
26static inline struct nfs_page * 26static inline struct nfs_page *
27nfs_page_alloc(void) 27nfs_page_alloc(void)
28{ 28{
29 struct nfs_page *p; 29 struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
30 p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); 30 if (p)
31 if (p) {
32 memset(p, 0, sizeof(*p));
33 INIT_LIST_HEAD(&p->wb_list); 31 INIT_LIST_HEAD(&p->wb_list);
34 }
35 return p; 32 return p;
36} 33}
37 34
@@ -65,6 +62,13 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
65 if (req == NULL) 62 if (req == NULL)
66 return ERR_PTR(-ENOMEM); 63 return ERR_PTR(-ENOMEM);
67 64
65 /* get lock context early so we can deal with alloc failures */
66 req->wb_lock_context = nfs_get_lock_context(ctx);
67 if (req->wb_lock_context == NULL) {
68 nfs_page_free(req);
69 return ERR_PTR(-ENOMEM);
70 }
71
68 /* Initialize the request struct. Initially, we assume a 72 /* Initialize the request struct. Initially, we assume a
69 * long write-back delay. This will be adjusted in 73 * long write-back delay. This will be adjusted in
70 * update_nfs_request below if the region is not locked. */ 74 * update_nfs_request below if the region is not locked. */
@@ -79,7 +83,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
79 req->wb_pgbase = offset; 83 req->wb_pgbase = offset;
80 req->wb_bytes = count; 84 req->wb_bytes = count;
81 req->wb_context = get_nfs_open_context(ctx); 85 req->wb_context = get_nfs_open_context(ctx);
82 req->wb_lock_context = nfs_get_lock_context(ctx);
83 kref_init(&req->wb_kref); 86 kref_init(&req->wb_kref);
84 return req; 87 return req;
85} 88}
@@ -109,7 +112,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
109{ 112{
110 if (!nfs_lock_request_dontget(req)) 113 if (!nfs_lock_request_dontget(req))
111 return 0; 114 return 0;
112 if (req->wb_page != NULL) 115 if (test_bit(PG_MAPPED, &req->wb_flags))
113 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 116 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
114 return 1; 117 return 1;
115} 118}
@@ -119,7 +122,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
119 */ 122 */
120void nfs_clear_page_tag_locked(struct nfs_page *req) 123void nfs_clear_page_tag_locked(struct nfs_page *req)
121{ 124{
122 if (req->wb_page != NULL) { 125 if (test_bit(PG_MAPPED, &req->wb_flags)) {
123 struct inode *inode = req->wb_context->path.dentry->d_inode; 126 struct inode *inode = req->wb_context->path.dentry->d_inode;
124 struct nfs_inode *nfsi = NFS_I(inode); 127 struct nfs_inode *nfsi = NFS_I(inode);
125 128
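The nfs_create_request() change above is a standard fail-early reordering: the allocation that can fail, nfs_get_lock_context(), is taken before any other references, so the error path is a plain nfs_page_free() instead of a partial teardown. The shape of the pattern, with invented names:

	/* Sketch: acquire the fallible resource first, while the new
	 * object holds no other references, so cleanup on failure is
	 * a single free. */
	struct request *request_alloc(struct context *ctx)
	{
		struct request *req = kzalloc(sizeof(*req), GFP_KERNEL);

		if (!req)
			return ERR_PTR(-ENOMEM);
		req->res = get_resource(ctx);	/* the call that may fail */
		if (!req->res) {
			kfree(req);
			return ERR_PTR(-ENOMEM);
		}
		/* further reference counts are taken only after this point */
		return req;
	}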
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 000000000000..1b1bc1a0fb0a
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,965 @@
1/*
2 * pNFS functions to call and manage layout drivers.
3 *
4 * Copyright (c) 2002 [year of first publication]
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#include <linux/nfs_fs.h>
31#include "internal.h"
32#include "pnfs.h"
33
34#define NFSDBG_FACILITY NFSDBG_PNFS
35
36/* Locking:
37 *
38 * pnfs_spinlock:
39 * protects pnfs_modules_tbl.
40 */
41static DEFINE_SPINLOCK(pnfs_spinlock);
42
43/*
44 * pnfs_modules_tbl holds all pnfs modules
45 */
46static LIST_HEAD(pnfs_modules_tbl);
47
48/* Return the registered pnfs layout driver module matching given id */
49static struct pnfs_layoutdriver_type *
50find_pnfs_driver_locked(u32 id)
51{
52 struct pnfs_layoutdriver_type *local;
53
54 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
55 if (local->id == id)
56 goto out;
57 local = NULL;
58out:
59 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
60 return local;
61}
62
63static struct pnfs_layoutdriver_type *
64find_pnfs_driver(u32 id)
65{
66 struct pnfs_layoutdriver_type *local;
67
68 spin_lock(&pnfs_spinlock);
69 local = find_pnfs_driver_locked(id);
70 spin_unlock(&pnfs_spinlock);
71 return local;
72}
73
74void
75unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{
77 if (nfss->pnfs_curr_ld) {
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL;
82}
83
84/*
85 * Try to set the server's pnfs module to the pnfs layout type specified by id.
86 * Currently only one pNFS layout driver per filesystem is supported.
87 *
88 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
89 */
90void
91set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
92{
93 struct pnfs_layoutdriver_type *ld_type = NULL;
94
95 if (id == 0)
96 goto out_no_driver;
97 if (!(server->nfs_client->cl_exchange_flags &
98 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
99 printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
100 id, server->nfs_client->cl_exchange_flags);
101 goto out_no_driver;
102 }
103 ld_type = find_pnfs_driver(id);
104 if (!ld_type) {
105 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
106 ld_type = find_pnfs_driver(id);
107 if (!ld_type) {
108 dprintk("%s: No pNFS module found for %u.\n",
109 __func__, id);
110 goto out_no_driver;
111 }
112 }
113 if (!try_module_get(ld_type->owner)) {
114 dprintk("%s: Could not grab reference on module\n", __func__);
115 goto out_no_driver;
116 }
117 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) {
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return;
127
128out_no_driver:
129 dprintk("%s: Using NFSv4 I/O\n", __func__);
130 server->pnfs_curr_ld = NULL;
131}
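set_pnfs_layoutdriver() uses the usual lookup, request_module(), re-lookup sequence for on-demand driver loading. A condensed sketch of just that idiom (the helper name is invented; the real function also records the driver in the nfs_server and calls its set_layoutdriver hook):

	/* Sketch: demand-load a layout driver module. The second lookup
	 * is needed because request_module() only loads the module; the
	 * module's own init code performs the registration. */
	static struct pnfs_layoutdriver_type *pnfs_get_driver(u32 id)
	{
		struct pnfs_layoutdriver_type *ld = find_pnfs_driver(id);

		if (!ld) {
			request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
			ld = find_pnfs_driver(id);	/* retry after load */
		}
		if (ld && !try_module_get(ld->owner))
			ld = NULL;			/* module is on its way out */
		return ld;
	}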
132
133int
134pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
135{
136 int status = -EINVAL;
137 struct pnfs_layoutdriver_type *tmp;
138
139 if (ld_type->id == 0) {
140 printk(KERN_ERR "%s id 0 is reserved\n", __func__);
141 return status;
142 }
143 if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
144 printk(KERN_ERR "%s Layout driver must provide "
145 "alloc_lseg and free_lseg.\n", __func__);
146 return status;
147 }
148
149 spin_lock(&pnfs_spinlock);
150 tmp = find_pnfs_driver_locked(ld_type->id);
151 if (!tmp) {
152 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
153 status = 0;
154 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
155 ld_type->name);
156 } else {
157 printk(KERN_ERR "%s Module with id %d already loaded!\n",
158 __func__, ld_type->id);
159 }
160 spin_unlock(&pnfs_spinlock);
161
162 return status;
163}
164EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
165
166void
167pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
168{
169 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
170 spin_lock(&pnfs_spinlock);
171 list_del(&ld_type->pnfs_tblid);
172 spin_unlock(&pnfs_spinlock);
173}
174EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
175
176/*
177 * pNFS client layout cache
178 */
179
180/* Need to hold i_lock if caller does not already hold reference */
181void
182get_layout_hdr(struct pnfs_layout_hdr *lo)
183{
184 atomic_inc(&lo->plh_refcount);
185}
186
187static void
188destroy_layout_hdr(struct pnfs_layout_hdr *lo)
189{
190 dprintk("%s: freeing layout cache %p\n", __func__, lo);
191 BUG_ON(!list_empty(&lo->plh_layouts));
192 NFS_I(lo->plh_inode)->layout = NULL;
193 kfree(lo);
194}
195
196static void
197put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
198{
199 if (atomic_dec_and_test(&lo->plh_refcount))
200 destroy_layout_hdr(lo);
201}
202
203void
204put_layout_hdr(struct pnfs_layout_hdr *lo)
205{
206 struct inode *inode = lo->plh_inode;
207
208 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
209 destroy_layout_hdr(lo);
210 spin_unlock(&inode->i_lock);
211 }
212}
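put_layout_hdr() leans on atomic_dec_and_lock(), which acquires the spinlock only when the reference count is about to reach zero; the common put therefore stays lock-free while teardown still runs under i_lock. In sketch form, with invented names:

	/* Sketch: refcount drop with locked teardown.
	 * atomic_dec_and_lock() returns nonzero, with the lock held,
	 * only when the counter hit zero; otherwise no lock is taken. */
	static void obj_put(struct obj *o)
	{
		if (atomic_dec_and_lock(&o->refcount, &o->owner_lock)) {
			unhook_from_owner(o);	/* object was visible until now */
			spin_unlock(&o->owner_lock);
			kfree(o);
		}
	}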
213
214static void
215init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
216{
217 INIT_LIST_HEAD(&lseg->pls_list);
218 atomic_set(&lseg->pls_refcount, 1);
219 smp_mb();
220 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
221 lseg->pls_layout = lo;
222}
223
224static void free_lseg(struct pnfs_layout_segment *lseg)
225{
226 struct inode *ino = lseg->pls_layout->plh_inode;
227
228 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
229 /* Matched by get_layout_hdr in pnfs_insert_layout */
230 put_layout_hdr(NFS_I(ino)->layout);
231}
232
233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
234 * could sleep, so must be called outside of the lock.
235 * Returns 1 if object was removed, otherwise returns 0.
236 */
237static int
238put_lseg_locked(struct pnfs_layout_segment *lseg,
239 struct list_head *tmp_list)
240{
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) {
245 struct inode *ino = lseg->pls_layout->plh_inode;
246
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
248 list_del(&lseg->pls_list);
249 if (list_empty(&lseg->pls_layout->plh_segs)) {
250 struct nfs_client *clp;
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 }
263 return 0;
264}
265
266static bool
267should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
268{
269 return (recall_iomode == IOMODE_ANY ||
270 lseg_iomode == recall_iomode);
271}
272
273/* Returns 1 if lseg is removed from list, 0 otherwise */
274static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
275 struct list_head *tmp_list)
276{
277 int rv = 0;
278
279 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
280 /* Remove the reference keeping the lseg in the
281 * list. It will now be removed when all
282 * outstanding io is finished.
283 */
284 rv = put_lseg_locked(lseg, tmp_list);
285 }
286 return rv;
287}
288
289/* Returns the number of matching invalid lsegs remaining in the list
290 * after the call.
291 */
292int
293mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
294 struct list_head *tmp_list,
295 u32 iomode)
296{
297 struct pnfs_layout_segment *lseg, *next;
298 int invalid = 0, removed = 0;
299
300 dprintk("%s:Begin lo %p\n", __func__, lo);
301
302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
304 dprintk("%s: freeing lseg %p iomode %d "
305 "offset %llu length %llu\n", __func__,
306 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
307 lseg->pls_range.length);
308 invalid++;
309 removed += mark_lseg_invalid(lseg, tmp_list);
310 }
311 dprintk("%s:Return %i\n", __func__, invalid - removed);
312 return invalid - removed;
313}
314
315void
316pnfs_free_lseg_list(struct list_head *free_me)
317{
318 struct pnfs_layout_segment *lseg, *tmp;
319
320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
321 list_del(&lseg->pls_list);
322 free_lseg(lseg);
323 }
324}
325
326void
327pnfs_destroy_layout(struct nfs_inode *nfsi)
328{
329 struct pnfs_layout_hdr *lo;
330 LIST_HEAD(tmp_list);
331
332 spin_lock(&nfsi->vfs_inode.i_lock);
333 lo = nfsi->layout;
334 if (lo) {
335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
338 put_layout_hdr_locked(lo);
339 }
340 spin_unlock(&nfsi->vfs_inode.i_lock);
341 pnfs_free_lseg_list(&tmp_list);
342}
343
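pnfs_destroy_layout() is the collect-then-free idiom that the tmp_list comment above describes: segments are unhooked onto a private list under i_lock and destroyed only after the lock is dropped, because the layout driver's free_lseg callback may sleep. A simplified sketch that deliberately ignores the refcounting the real code layers on top:

	/* Sketch: defer sleeping destructors past a spinlock. */
	static void destroy_all_segs(struct inode *inode)
	{
		struct pnfs_layout_segment *lseg, *next;
		LIST_HEAD(tmp_list);

		spin_lock(&inode->i_lock);
		list_splice_init(&NFS_I(inode)->layout->plh_segs, &tmp_list);
		spin_unlock(&inode->i_lock);

		list_for_each_entry_safe(lseg, next, &tmp_list, pls_list)
			free_lseg(lseg);	/* may sleep */
	}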
344/*
346 * Called by the state manager to remove all layouts established under an
346 * expired lease.
347 */
348void
349pnfs_destroy_all_layouts(struct nfs_client *clp)
350{
351 struct pnfs_layout_hdr *lo;
352 LIST_HEAD(tmp_list);
353
354 spin_lock(&clp->cl_lock);
355 list_splice_init(&clp->cl_layouts, &tmp_list);
356 spin_unlock(&clp->cl_lock);
357
358 while (!list_empty(&tmp_list)) {
359 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
360 plh_layouts);
361 dprintk("%s freeing layout for inode %lu\n", __func__,
362 lo->plh_inode->i_ino);
363 pnfs_destroy_layout(NFS_I(lo->plh_inode));
364 }
365}
366
367/* update lo->plh_stateid with new if is more recent */
368void
369pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
370 bool update_barrier)
371{
372 u32 oldseq, newseq;
373
374 oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
375 newseq = be32_to_cpu(new->stateid.seqid);
376 if ((int)(newseq - oldseq) > 0) {
377 memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
378 if (update_barrier) {
379 u32 new_barrier = be32_to_cpu(new->stateid.seqid);
380
381 if ((int)(new_barrier - lo->plh_barrier))
382 lo->plh_barrier = new_barrier;
383 } else {
384 /* Because of wraparound, we want to keep the barrier
385 * "close" to the current seqids. It needs to be
386 * within 2**31 to count as "behind", so if it
387 * gets too near that limit, give us a little leeway
388 * and bring it to within 2**30.
389 * NOTE - and yes, this is all unsigned arithmetic.
390 */
391 if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
392 lo->plh_barrier = newseq - (1 << 30);
393 }
394 }
395}
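The (int)(newseq - oldseq) > 0 test is serial-number arithmetic: unsigned subtraction followed by a signed cast treats any value less than 2^31 ahead as newer, which is what lets seqids wrap without special cases. A self-contained check (the helper is invented):

	/* Sketch: serial-number comparison as used for layout seqids.
	 * True when a is "after" b modulo 2^32, assuming the two are
	 * within 2^31 of each other. */
	static inline bool seqid_after(u32 a, u32 b)
	{
		return (int)(a - b) > 0;
	}

	/* Example: seqid_after(0x00000002, 0xfffffffe) is true; the
	 * counter wrapped, yet 2 is still seen as 4 steps ahead. */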
396
397/* lget is set to 1 if called from inside send_layoutget call chain */
398static bool
399pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
400 int lget)
401{
402 if ((stateid) &&
403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
404 return true;
405 return lo->plh_block_lgets ||
406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
407 (list_empty(&lo->plh_segs) &&
408 (atomic_read(&lo->plh_outstanding) > lget));
409}
410
411int
412pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
413 struct nfs4_state *open_state)
414{
415 int status = 0;
416
417 dprintk("--> %s\n", __func__);
418 spin_lock(&lo->plh_inode->i_lock);
419 if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
420 status = -EAGAIN;
421 } else if (list_empty(&lo->plh_segs)) {
422 int seq;
423
424 do {
425 seq = read_seqbegin(&open_state->seqlock);
426 memcpy(dst->data, open_state->stateid.data,
427 sizeof(open_state->stateid.data));
428 } while (read_seqretry(&open_state->seqlock, seq));
429 } else
430 memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
431 spin_unlock(&lo->plh_inode->i_lock);
432 dprintk("<-- %s\n", __func__);
433 return status;
434}
435
436/*
437 * Get layout from server.
438 * For now, assume that whole file layouts are requested.
439 * arg->offset: 0
440 * arg->length: all ones
441 */
442static struct pnfs_layout_segment *
443send_layoutget(struct pnfs_layout_hdr *lo,
444 struct nfs_open_context *ctx,
445 u32 iomode)
446{
447 struct inode *ino = lo->plh_inode;
448 struct nfs_server *server = NFS_SERVER(ino);
449 struct nfs4_layoutget *lgp;
450 struct pnfs_layout_segment *lseg = NULL;
451
452 dprintk("--> %s\n", __func__);
453
454 BUG_ON(ctx == NULL);
455 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
456 if (lgp == NULL)
457 return NULL;
458 lgp->args.minlength = NFS4_MAX_UINT64;
459 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
460 lgp->args.range.iomode = iomode;
461 lgp->args.range.offset = 0;
462 lgp->args.range.length = NFS4_MAX_UINT64;
463 lgp->args.type = server->pnfs_curr_ld->id;
464 lgp->args.inode = ino;
465 lgp->args.ctx = get_nfs_open_context(ctx);
466 lgp->lsegpp = &lseg;
467
468 /* Synchronously retrieve layout information from server and
469 * store in lseg.
470 */
471 nfs4_proc_layoutget(lgp);
472 if (!lseg) {
473 /* remember that LAYOUTGET failed and suspend trying */
474 set_bit(lo_fail_bit(iomode), &lo->plh_flags);
475 }
476 return lseg;
477}
478
479bool pnfs_roc(struct inode *ino)
480{
481 struct pnfs_layout_hdr *lo;
482 struct pnfs_layout_segment *lseg, *tmp;
483 LIST_HEAD(tmp_list);
484 bool found = false;
485
486 spin_lock(&ino->i_lock);
487 lo = NFS_I(ino)->layout;
488 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
489 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
490 goto out_nolayout;
491 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
492 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
493 mark_lseg_invalid(lseg, &tmp_list);
494 found = true;
495 }
496 if (!found)
497 goto out_nolayout;
498 lo->plh_block_lgets++;
499 get_layout_hdr(lo); /* matched in pnfs_roc_release */
500 spin_unlock(&ino->i_lock);
501 pnfs_free_lseg_list(&tmp_list);
502 return true;
503
504out_nolayout:
505 spin_unlock(&ino->i_lock);
506 return false;
507}
508
509void pnfs_roc_release(struct inode *ino)
510{
511 struct pnfs_layout_hdr *lo;
512
513 spin_lock(&ino->i_lock);
514 lo = NFS_I(ino)->layout;
515 lo->plh_block_lgets--;
516 put_layout_hdr_locked(lo);
517 spin_unlock(&ino->i_lock);
518}
519
520void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
521{
522 struct pnfs_layout_hdr *lo;
523
524 spin_lock(&ino->i_lock);
525 lo = NFS_I(ino)->layout;
526 if ((int)(barrier - lo->plh_barrier) > 0)
527 lo->plh_barrier = barrier;
528 spin_unlock(&ino->i_lock);
529}
530
531bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
532{
533 struct nfs_inode *nfsi = NFS_I(ino);
534 struct pnfs_layout_segment *lseg;
535 bool found = false;
536
537 spin_lock(&ino->i_lock);
538 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
539 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
540 found = true;
541 break;
542 }
543 if (!found) {
544 struct pnfs_layout_hdr *lo = nfsi->layout;
545 u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
546
547 /* Since close does not return a layout stateid for use as
548 * a barrier, we choose the worst-case barrier.
549 */
550 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
551 }
552 spin_unlock(&ino->i_lock);
553 return found;
554}
555
556/*
557 * Compare two layout segments for sorting into layout cache.
558 * We want to preferentially return RW over RO layouts, so ensure those
559 * are seen first.
560 */
561static s64
562cmp_layout(u32 iomode1, u32 iomode2)
563{
564 /* read > read/write */
565 return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
566}
567
568static void
569pnfs_insert_layout(struct pnfs_layout_hdr *lo,
570 struct pnfs_layout_segment *lseg)
571{
572 struct pnfs_layout_segment *lp;
573 int found = 0;
574
575 dprintk("%s:Begin\n", __func__);
576
577 assert_spin_locked(&lo->plh_inode->i_lock);
578 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
579 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
580 continue;
581 list_add_tail(&lseg->pls_list, &lp->pls_list);
582 dprintk("%s: inserted lseg %p "
583 "iomode %d offset %llu length %llu before "
584 "lp %p iomode %d offset %llu length %llu\n",
585 __func__, lseg, lseg->pls_range.iomode,
586 lseg->pls_range.offset, lseg->pls_range.length,
587 lp, lp->pls_range.iomode, lp->pls_range.offset,
588 lp->pls_range.length);
589 found = 1;
590 break;
591 }
592 if (!found) {
593 list_add_tail(&lseg->pls_list, &lo->plh_segs);
594 dprintk("%s: inserted lseg %p "
595 "iomode %d offset %llu length %llu at tail\n",
596 __func__, lseg, lseg->pls_range.iomode,
597 lseg->pls_range.offset, lseg->pls_range.length);
598 }
599 get_layout_hdr(lo);
600
601 dprintk("%s:Return\n", __func__);
602}
603
604static struct pnfs_layout_hdr *
605alloc_init_layout_hdr(struct inode *ino)
606{
607 struct pnfs_layout_hdr *lo;
608
609 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
610 if (!lo)
611 return NULL;
612 atomic_set(&lo->plh_refcount, 1);
613 INIT_LIST_HEAD(&lo->plh_layouts);
614 INIT_LIST_HEAD(&lo->plh_segs);
615 INIT_LIST_HEAD(&lo->plh_bulk_recall);
616 lo->plh_inode = ino;
617 return lo;
618}
619
620static struct pnfs_layout_hdr *
621pnfs_find_alloc_layout(struct inode *ino)
622{
623 struct nfs_inode *nfsi = NFS_I(ino);
624 struct pnfs_layout_hdr *new = NULL;
625
626 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
627
628 assert_spin_locked(&ino->i_lock);
629 if (nfsi->layout) {
630 if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
631 return NULL;
632 else
633 return nfsi->layout;
634 }
635 spin_unlock(&ino->i_lock);
636 new = alloc_init_layout_hdr(ino);
637 spin_lock(&ino->i_lock);
638
639 if (likely(nfsi->layout == NULL)) /* Won the race? */
640 nfsi->layout = new;
641 else
642 kfree(new);
643 return nfsi->layout;
644}
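pnfs_find_alloc_layout() drops i_lock around the GFP_KERNEL allocation and re-checks nfsi->layout once the lock is retaken, since another task may have installed a header in the window; the loser just frees its copy. The idiom in isolation, with invented names:

	/* Sketch: optimistic allocation outside a spinlock, followed by
	 * a re-check for a racing installer once the lock is retaken. */
	static struct obj *find_or_alloc(struct parent *p)
	{
		struct obj *new;

		assert_spin_locked(&p->lock);
		if (p->obj)
			return p->obj;

		spin_unlock(&p->lock);
		new = kzalloc(sizeof(*new), GFP_KERNEL);	/* may sleep */
		spin_lock(&p->lock);

		if (p->obj == NULL)	/* won the race? */
			p->obj = new;
		else
			kfree(new);	/* a racer beat us; discard ours */
		return p->obj;
	}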
645
646/*
647 * iomode matching rules:
648 * iomode lseg match
649 * ----- ----- -----
650 * ANY READ true
651 * ANY RW true
652 * RW READ false
653 * RW RW true
654 * READ READ true
655 * READ RW true
656 */
657static int
658is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
659{
660 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
661}
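/*
 * Editorial self-check of the table above (sketch, not part of this
 * patch; assumes IOMODE_ANY from enum pnfs_iomode). The expression in
 * is_matching_lseg() rejects exactly one row: an RW request against a
 * READ-only segment.
 */
static inline void is_matching_lseg_demo(void)
{
	struct pnfs_layout_segment ro = { .pls_range.iomode = IOMODE_READ };
	struct pnfs_layout_segment rw = { .pls_range.iomode = IOMODE_RW };

	WARN_ON(!is_matching_lseg(&ro, IOMODE_ANY));	/* ANY/READ: true */
	WARN_ON(!is_matching_lseg(&rw, IOMODE_ANY));	/* ANY/RW: true */
	WARN_ON(is_matching_lseg(&ro, IOMODE_RW));	/* RW/READ: false */
	WARN_ON(!is_matching_lseg(&rw, IOMODE_RW));	/* RW/RW: true */
	WARN_ON(!is_matching_lseg(&ro, IOMODE_READ));	/* READ/READ: true */
	WARN_ON(!is_matching_lseg(&rw, IOMODE_READ));	/* READ/RW: true */
}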
662
663/*
664 * lookup range in layout
665 */
666static struct pnfs_layout_segment *
667pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
668{
669 struct pnfs_layout_segment *lseg, *ret = NULL;
670
671 dprintk("%s:Begin\n", __func__);
672
673 assert_spin_locked(&lo->plh_inode->i_lock);
674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) {
677 ret = lseg;
678 break;
679 }
680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
681 break;
682 }
683
684 dprintk("%s:Return lseg %p ref %d\n",
685 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
686 return ret;
687}
688
689/*
690 * Layout segment is retrieved from the server if not cached.
691 * The appropriate layout segment is referenced and returned to the caller.
692 */
693struct pnfs_layout_segment *
694pnfs_update_layout(struct inode *ino,
695 struct nfs_open_context *ctx,
696 enum pnfs_iomode iomode)
697{
698 struct nfs_inode *nfsi = NFS_I(ino);
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
700 struct pnfs_layout_hdr *lo;
701 struct pnfs_layout_segment *lseg = NULL;
702
703 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
704 return NULL;
705 spin_lock(&ino->i_lock);
706 lo = pnfs_find_alloc_layout(ino);
707 if (lo == NULL) {
708 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
709 goto out_unlock;
710 }
711
712 /* Do we even need to bother with this? */
713 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
714 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
715 dprintk("%s matches recall, use MDS\n", __func__);
716 goto out_unlock;
717 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
722
723 /* if LAYOUTGET already failed once we don't try again */
724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock;
726
727 if (pnfs_layoutgets_blocked(lo, NULL, 0))
728 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding);
730
731 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) {
733 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */
736 spin_lock(&clp->cl_lock);
737 BUG_ON(!list_empty(&lo->plh_layouts));
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock);
740 }
741 spin_unlock(&ino->i_lock);
742
743 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) {
745 spin_lock(&ino->i_lock);
746 if (list_empty(&lo->plh_segs)) {
747 spin_lock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 }
754 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo);
756out:
757 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
758 nfsi->layout->plh_flags, lseg);
759 return lseg;
760out_unlock:
761 spin_unlock(&ino->i_lock);
762 goto out;
763}
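/*
 * Editorial usage note (not part of this patch): the callers added in
 * fs/nfs/read.c below simply prime the layout cache and ignore the
 * result for now, e.g.:
 *
 *	pnfs_update_layout(inode, ctx, IOMODE_READ);
 *	new = nfs_create_request(ctx, inode, page, 0, len);
 *
 * A NULL return just means I/O goes through the MDS as usual.
 */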
764
765int
766pnfs_layout_process(struct nfs4_layoutget *lgp)
767{
768 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
769 struct nfs4_layoutget_res *res = &lgp->res;
770 struct pnfs_layout_segment *lseg;
771 struct inode *ino = lo->plh_inode;
772 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
773 int status = 0;
774
775 /* Verify we got what we asked for.
776 * Note that because the xdr parsing only accepts a single
777 * element array, this can fail even if the server is behaving
778 * correctly.
779 */
780 if (lgp->args.range.iomode > res->range.iomode ||
781 res->range.offset != 0 ||
782 res->range.length != NFS4_MAX_UINT64) {
783 status = -EINVAL;
784 goto out;
785 }
786 /* Inject layout blob into I/O device driver */
787 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
788 if (!lseg || IS_ERR(lseg)) {
789 if (!lseg)
790 status = -ENOMEM;
791 else
792 status = PTR_ERR(lseg);
793 dprintk("%s: Could not allocate layout: error %d\n",
794 __func__, status);
795 goto out;
796 }
797
798 spin_lock(&ino->i_lock);
799 if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
800 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
801 dprintk("%s forget reply due to recall\n", __func__);
802 goto out_forget_reply;
803 }
804
805 if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
806 dprintk("%s forget reply due to state\n", __func__);
807 goto out_forget_reply;
808 }
809 init_lseg(lo, lseg);
810 lseg->pls_range = res->range;
811 *lgp->lsegpp = lseg;
812 pnfs_insert_layout(lo, lseg);
813
814 if (res->return_on_close) {
815 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
816 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
817 }
818
819 /* Done processing layoutget. Set the layout stateid */
820 pnfs_set_layout_stateid(lo, &res->stateid, false);
821 spin_unlock(&ino->i_lock);
822out:
823 return status;
824
825out_forget_reply:
826 spin_unlock(&ino->i_lock);
827 lseg->pls_layout = lo;
828 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
829 goto out;
830}
831
832/*
833 * Device ID cache. Currently supports one layout type per struct nfs_client.
834 * Add layout type to the lookup key to expand to support multiple types.
835 */
836int
837pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
838 void (*free_callback)(struct pnfs_deviceid_node *))
839{
840 struct pnfs_deviceid_cache *c;
841
842 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
843 if (!c)
844 return -ENOMEM;
845 spin_lock(&clp->cl_lock);
846 if (clp->cl_devid_cache != NULL) {
847 atomic_inc(&clp->cl_devid_cache->dc_ref);
848 dprintk("%s [kref [%d]]\n", __func__,
849 atomic_read(&clp->cl_devid_cache->dc_ref));
850 kfree(c);
851 } else {
852 /* kzalloc initializes hlists */
853 spin_lock_init(&c->dc_lock);
854 atomic_set(&c->dc_ref, 1);
855 c->dc_free_callback = free_callback;
856 clp->cl_devid_cache = c;
857 dprintk("%s [new]\n", __func__);
858 }
859 spin_unlock(&clp->cl_lock);
860 return 0;
861}
862EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
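/*
 * Editorial usage sketch (not part of this patch): a layout driver would
 * typically embed pnfs_deviceid_node in its own structure and set the
 * cache up from its ->set_layoutdriver hook. The "foo" names are
 * hypothetical.
 */
struct foo_deviceid {
	struct pnfs_deviceid_node fd_node;	/* must be embedded */
	/* driver-private device mapping would follow */
};

static void foo_free_deviceid_node(struct pnfs_deviceid_node *d)
{
	kfree(container_of(d, struct foo_deviceid, fd_node));
}

static int foo_set_layoutdriver(struct nfs_server *server)
{
	return pnfs_alloc_init_deviceid_cache(server->nfs_client,
					      foo_free_deviceid_node);
}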
863
864/*
865 * Called from pnfs_layoutdriver_type->free_lseg;
866 * the last layout segment reference frees the deviceid.
867 */
868void
869pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
870 struct pnfs_deviceid_node *devid)
871{
872 struct nfs4_deviceid *id = &devid->de_id;
873 struct pnfs_deviceid_node *d;
874 struct hlist_node *n;
875 long h = nfs4_deviceid_hash(id);
876
877 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
878 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
879 return;
880
881 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
882 if (!memcmp(&d->de_id, id, sizeof(*id))) {
883 hlist_del_rcu(&d->de_node);
884 spin_unlock(&c->dc_lock);
885 synchronize_rcu();
886 c->dc_free_callback(devid);
887 return;
888 }
889 spin_unlock(&c->dc_lock);
890 /* Why wasn't it found in the list? */
891 BUG();
892}
893EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
894
895/* Find and reference a deviceid */
896struct pnfs_deviceid_node *
897pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
898{
899 struct pnfs_deviceid_node *d;
900 struct hlist_node *n;
901 long hash = nfs4_deviceid_hash(id);
902
903 dprintk("--> %s hash %ld\n", __func__, hash);
904 rcu_read_lock();
905 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
906 if (!memcmp(&d->de_id, id, sizeof(*id))) {
907 if (!atomic_inc_not_zero(&d->de_ref)) {
908 goto fail;
909 } else {
910 rcu_read_unlock();
911 return d;
912 }
913 }
914 }
915fail:
916 rcu_read_unlock();
917 return NULL;
918}
919EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
920
921/*
922 * Add a deviceid to the cache.
923 * GETDEVICEINFOs for the same deviceid can race; if the deviceid is already cached, the new node is discarded.
924 */
925struct pnfs_deviceid_node *
926pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
927{
928 struct pnfs_deviceid_node *d;
929 long hash = nfs4_deviceid_hash(&new->de_id);
930
931 dprintk("--> %s hash %ld\n", __func__, hash);
932 spin_lock(&c->dc_lock);
933 d = pnfs_find_get_deviceid(c, &new->de_id);
934 if (d) {
935 spin_unlock(&c->dc_lock);
936 dprintk("%s [discard]\n", __func__);
937 c->dc_free_callback(new);
938 return d;
939 }
940 INIT_HLIST_NODE(&new->de_node);
941 atomic_set(&new->de_ref, 1);
942 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
943 spin_unlock(&c->dc_lock);
944 dprintk("%s [new]\n", __func__);
945 return new;
946}
947EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
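/*
 * Editorial sketch of the race pattern described above (hypothetical
 * driver code, not part of this patch; struct foo_deviceid as in the
 * earlier sketch). The caller must always continue with the returned
 * pointer: if a concurrent GETDEVICEINFO won the race, "new" has already
 * been freed through the cache's free callback.
 */
static struct pnfs_deviceid_node *
foo_insert_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
{
	struct foo_deviceid *fd;

	fd = kzalloc(sizeof(*fd), GFP_KERNEL);
	if (!fd)
		return NULL;
	fd->fd_node.de_id = *id;
	/* may return &fd->fd_node or an already-cached node */
	return pnfs_add_deviceid(c, &fd->fd_node);
}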
948
949void
950pnfs_put_deviceid_cache(struct nfs_client *clp)
951{
952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
953
954 dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
956 int i;
957 /* Verify cache is empty */
958 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
959 BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
960 clp->cl_devid_cache = NULL;
961 spin_unlock(&clp->cl_lock);
962 kfree(local);
963 }
964}
965EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 000000000000..e2612ea0cbed
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,235 @@
1/*
2 * pNFS client data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H
32
33enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */
36};
37
38struct pnfs_layout_segment {
39 struct list_head pls_list;
40 struct pnfs_layout_range pls_range;
41 atomic_t pls_refcount;
42 unsigned long pls_flags;
43 struct pnfs_layout_hdr *pls_layout;
44};
45
46#ifdef CONFIG_NFS_V4_1
47
48#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
49
50enum {
51 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed, stop trying */
52 NFS_LAYOUT_RW_FAILED, /* get rw layout failed, stop trying */
53 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
54 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
55 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
56};
57
58/* Per-layout driver specific registration structure */
59struct pnfs_layoutdriver_type {
60 struct list_head pnfs_tblid;
61 const u32 id;
62 const char *name;
63 struct module *owner;
64 int (*set_layoutdriver) (struct nfs_server *);
65 int (*clear_layoutdriver) (struct nfs_server *);
66 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
67 void (*free_lseg) (struct pnfs_layout_segment *lseg);
68};
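/*
 * Editorial sketch, not part of this patch: what a minimal driver
 * registration against the structure above might look like. All
 * "foolayout" names are hypothetical; LAYOUT4_NFSV4_1_FILES is the
 * RFC 5661 layout type a files-layout driver would use.
 */
static int foolayout_set_layoutdriver(struct nfs_server *server);
static int foolayout_clear_layoutdriver(struct nfs_server *server);
static struct pnfs_layout_segment *foolayout_alloc_lseg(
		struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr);
static void foolayout_free_lseg(struct pnfs_layout_segment *lseg);

static struct pnfs_layoutdriver_type foolayout_type = {
	.id			= LAYOUT4_NFSV4_1_FILES,
	.name			= "foolayout",
	.owner			= THIS_MODULE,
	.set_layoutdriver	= foolayout_set_layoutdriver,
	.clear_layoutdriver	= foolayout_clear_layoutdriver,
	.alloc_lseg		= foolayout_alloc_lseg,
	.free_lseg		= foolayout_free_lseg,
};

static int __init foolayout_init(void)
{
	return pnfs_register_layoutdriver(&foolayout_type);
}

static void __exit foolayout_exit(void)
{
	pnfs_unregister_layoutdriver(&foolayout_type);
}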
69
70struct pnfs_layout_hdr {
71 atomic_t plh_refcount;
72 struct list_head plh_layouts; /* other client layouts */
73 struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
74 struct list_head plh_segs; /* layout segments list */
75 nfs4_stateid plh_stateid;
76 atomic_t plh_outstanding; /* number of RPCs out */
77 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
78 u32 plh_barrier; /* ignore lower seqids */
79 unsigned long plh_flags;
80 struct inode *plh_inode;
81};
82
83struct pnfs_device {
84 struct nfs4_deviceid dev_id;
85 unsigned int layout_type;
86 unsigned int mincount;
87 struct page **pages;
88 void *area;
89 unsigned int pgbase;
90 unsigned int pglen;
91};
92
93/*
94 * Device ID RCU cache. A device ID is unique per client ID and layout type.
95 */
96#define NFS4_DEVICE_ID_HASH_BITS 5
97#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
98#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
99
100static inline u32
101nfs4_deviceid_hash(struct nfs4_deviceid *id)
102{
103 unsigned char *cptr = (unsigned char *)id->data;
104 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
105 u32 x = 0;
106
107 while (nbytes--) {
108 x *= 37;
109 x += *cptr++;
110 }
111 return x & NFS4_DEVICE_ID_HASH_MASK;
112}
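/*
 * Editorial standalone demo of the hash above (userspace sketch, not
 * part of this patch); the constants mirror NFS4_DEVICEID4_SIZE (16)
 * and NFS4_DEVICE_ID_HASH_MASK (0x1f).
 */
#include <stdio.h>

static unsigned int demo_deviceid_hash(const unsigned char *id)
{
	unsigned int nbytes = 16;	/* NFS4_DEVICEID4_SIZE */
	unsigned int x = 0;

	while (nbytes--) {
		x *= 37;
		x += *id++;
	}
	return x & 0x1f;		/* NFS4_DEVICE_ID_HASH_MASK */
}

int main(void)
{
	unsigned char id[16] = { 0x01, 0x02 };	/* remaining bytes zero */

	printf("bucket %u of 32\n", demo_deviceid_hash(id));
	return 0;
}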
113
114struct pnfs_deviceid_node {
115 struct hlist_node de_node;
116 struct nfs4_deviceid de_id;
117 atomic_t de_ref;
118};
119
120struct pnfs_deviceid_cache {
121 spinlock_t dc_lock;
122 atomic_t dc_ref;
123 void (*dc_free_callback)(struct pnfs_deviceid_node *);
124 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
125};
126
127extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
128 void (*free_callback)(struct pnfs_deviceid_node *));
129extern void pnfs_put_deviceid_cache(struct nfs_client *);
130extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
131 struct pnfs_deviceid_cache *,
132 struct nfs4_deviceid *);
133extern struct pnfs_deviceid_node *pnfs_add_deviceid(
134 struct pnfs_deviceid_cache *,
135 struct pnfs_deviceid_node *);
136extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
137 struct pnfs_deviceid_node *devid);
138
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141
142/* nfs4proc.c */
143extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
144 struct pnfs_device *dev);
145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146
147/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo);
149struct pnfs_layout_segment *
150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
151 enum pnfs_iomode access_type);
152void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
153void unset_pnfs_layoutdriver(struct nfs_server *);
154int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list);
156void pnfs_destroy_layout(struct nfs_inode *);
157void pnfs_destroy_all_layouts(struct nfs_client *);
158void put_layout_hdr(struct pnfs_layout_hdr *lo);
159void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
160 const nfs4_stateid *new,
161 bool update_barrier);
162int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
163 struct pnfs_layout_hdr *lo,
164 struct nfs4_state *open_state);
165int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
166 struct list_head *tmp_list,
167 u32 iomode);
168bool pnfs_roc(struct inode *ino);
169void pnfs_roc_release(struct inode *ino);
170void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
171bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
172
173
174static inline int lo_fail_bit(u32 iomode)
175{
176 return iomode == IOMODE_RW ?
177 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
178}
179
180/* Return true if a layout driver is being used for this mountpoint */
181static inline int pnfs_enabled_sb(struct nfs_server *nfss)
182{
183 return nfss->pnfs_curr_ld != NULL;
184}
185
186#else /* CONFIG_NFS_V4_1 */
187
188static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
189{
190}
191
192static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
193{
194}
195
196static inline struct pnfs_layout_segment *
197pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
198 enum pnfs_iomode access_type)
199{
200 return NULL;
201}
202
203static inline bool
204pnfs_roc(struct inode *ino)
205{
206 return false;
207}
208
209static inline void
210pnfs_roc_release(struct inode *ino)
211{
212}
213
214static inline void
215pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
216{
217}
218
219static inline bool
220pnfs_roc_drain(struct inode *ino, u32 *barrier)
221{
222 return false;
223}
224
225static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
226{
227}
228
229static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
230{
231}
232
233#endif /* CONFIG_NFS_V4_1 */
234
235#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 611bec22f552..77d5e21c4ad6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)

 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-		int flags, struct nameidata *nd)
+		int flags, struct nfs_open_context *ctx)
 {
	struct nfs_createdata *data;
	struct rpc_message msg = {
@@ -365,17 +365,32 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
	return 1;
 }

+static void
+nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
+}
+
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		     struct inode *new_dir)
+{
+	if (nfs_async_handle_expired_key(task))
+		return 0;
+	nfs_mark_for_revalidate(old_dir);
+	nfs_mark_for_revalidate(new_dir);
+	return 1;
+}
+
 static int
 nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
		struct inode *new_dir, struct qstr *new_name)
 {
	struct nfs_renameargs arg = {
-		.fromfh		= NFS_FH(old_dir),
-		.fromname	= old_name->name,
-		.fromlen	= old_name->len,
-		.tofh		= NFS_FH(new_dir),
-		.toname		= new_name->name,
-		.tolen		= new_name->len
+		.old_dir	= NFS_FH(old_dir),
+		.old_name	= old_name,
+		.new_dir	= NFS_FH(new_dir),
+		.new_name	= new_name,
	};
	struct rpc_message msg = {
		.rpc_proc	= &nfs_procedures[NFSPROC_RENAME],
@@ -443,7 +458,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
	fattr = nfs_alloc_fattr();
	status = -ENOMEM;
	if (fh == NULL || fattr == NULL)
-		goto out;
+		goto out_free;

	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
	nfs_mark_for_revalidate(dir);
@@ -456,6 +471,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
	if (status == 0)
		status = nfs_instantiate(dentry, fh, fattr);

+out_free:
	nfs_free_fattr(fattr);
	nfs_free_fhandle(fh);
 out:
@@ -519,14 +535,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
  */
 static int
 nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-		 u64 cookie, struct page *page, unsigned int count, int plus)
+		 u64 cookie, struct page **pages, unsigned int count, int plus)
 {
	struct inode *dir = dentry->d_inode;
	struct nfs_readdirargs arg = {
		.fh		= NFS_FH(dir),
		.cookie		= cookie,
		.count		= count,
-		.pages		= &page,
+		.pages		= pages,
	};
	struct rpc_message msg = {
		.rpc_proc	= &nfs_procedures[NFSPROC_READDIR],
@@ -705,6 +721,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
	.unlink_setup	= nfs_proc_unlink_setup,
	.unlink_done	= nfs_proc_unlink_done,
	.rename		= nfs_proc_rename,
+	.rename_setup	= nfs_proc_rename_setup,
+	.rename_done	= nfs_proc_rename_done,
	.link		= nfs_proc_link,
	.symlink	= nfs_proc_symlink,
	.mkdir		= nfs_proc_mkdir,
@@ -714,7 +732,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
	.statfs		= nfs_proc_statfs,
	.fsinfo		= nfs_proc_fsinfo,
	.pathconf	= nfs_proc_pathconf,
-	.decode_dirent	= nfs_decode_dirent,
+	.decode_dirent	= nfs2_decode_dirent,
	.read_setup	= nfs_proc_read_setup,
	.read_done	= nfs_read_done,
	.write_setup	= nfs_proc_write_setup,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 87adc2744246..aedcaa7f291f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -25,6 +25,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"

 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE

@@ -46,7 +47,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
		memset(p, 0, sizeof(*p));
		INIT_LIST_HEAD(&p->pages);
		p->npages = pagecount;
-		p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
		if (pagecount <= ARRAY_SIZE(p->page_array))
			p->pagevec = p->page_array;
		else {
@@ -121,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
	len = nfs_page_length(page);
	if (len == 0)
		return nfs_return_empty_page(page);
+	pnfs_update_layout(inode, ctx, IOMODE_READ);
	new = nfs_create_request(ctx, inode, page, 0, len);
	if (IS_ERR(new)) {
		unlock_page(page);
@@ -151,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req)
		(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
		req->wb_bytes,
		(long long)req_offset(req));
-	nfs_clear_request(req);
	nfs_release_request(req);
 }

@@ -625,6 +625,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
	if (ret == 0)
		goto read_complete; /* all pages were read */

+	pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
	if (rsize < PAGE_CACHE_SIZE)
		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
	else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f4cbf0c306c6..b68c8607770f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -39,7 +39,6 @@
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/mnt_namespace.h>
@@ -67,6 +66,12 @@

 #define NFSDBG_FACILITY		NFSDBG_VFS

+#ifdef CONFIG_NFS_V3
+#define NFS_DEFAULT_VERSION 3
+#else
+#define NFS_DEFAULT_VERSION 2
+#endif
+
 enum {
	/* Mount options that take no arguments */
	Opt_soft, Opt_hard,
@@ -100,6 +105,7 @@ enum {
	Opt_addr, Opt_mountaddr, Opt_clientaddr,
	Opt_lookupcache,
	Opt_fscache_uniq,
+	Opt_local_lock,

	/* Special mount options */
	Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -171,6 +177,7 @@ static const match_table_t nfs_mount_option_tokens = {

	{ Opt_lookupcache, "lookupcache=%s" },
	{ Opt_fscache_uniq, "fsc=%s" },
+	{ Opt_local_lock, "local_lock=%s" },

	{ Opt_err, NULL }
 };
@@ -236,14 +243,30 @@ static match_table_t nfs_lookupcache_tokens = {
	{ Opt_lookupcache_err, NULL }
 };

+enum {
+	Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix,
+	Opt_local_lock_none,
+
+	Opt_local_lock_err
+};
+
+static match_table_t nfs_local_lock_tokens = {
+	{ Opt_local_lock_all, "all" },
+	{ Opt_local_lock_flock, "flock" },
+	{ Opt_local_lock_posix, "posix" },
+	{ Opt_local_lock_none, "none" },
+
+	{ Opt_local_lock_err, NULL }
+};
+

 static void nfs_umount_begin(struct super_block *);
 static int nfs_statfs(struct dentry *, struct kstatfs *);
 static int nfs_show_options(struct seq_file *, struct vfsmount *);
 static int nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
-static int nfs_xdev_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data);
 static void nfs_put_super(struct super_block *);
 static void nfs_kill_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
@@ -259,7 +282,7 @@ static struct file_system_type nfs_fs_type = {
 struct file_system_type nfs_xdev_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "nfs",
-	.get_sb		= nfs_xdev_get_sb,
+	.mount		= nfs_xdev_mount,
	.kill_sb	= nfs_kill_super,
	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -284,14 +307,14 @@ static int nfs4_try_mount(int flags, const char *dev_name,
	struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
 static int nfs4_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
+static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static void nfs4_kill_super(struct super_block *sb);

 static struct file_system_type nfs4_fs_type = {
@@ -305,7 +328,7 @@ static struct file_system_type nfs4_fs_type = {
 static struct file_system_type nfs4_remote_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "nfs4",
-	.get_sb		= nfs4_remote_get_sb,
+	.mount		= nfs4_remote_mount,
	.kill_sb	= nfs4_kill_super,
	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -313,7 +336,7 @@ static struct file_system_type nfs4_remote_fs_type = {
 struct file_system_type nfs4_xdev_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "nfs4",
-	.get_sb		= nfs4_xdev_get_sb,
+	.mount		= nfs4_xdev_mount,
	.kill_sb	= nfs4_kill_super,
	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -321,7 +344,7 @@ struct file_system_type nfs4_xdev_fs_type = {
 static struct file_system_type nfs4_remote_referral_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "nfs4",
-	.get_sb		= nfs4_remote_referral_get_sb,
+	.mount		= nfs4_remote_referral_mount,
	.kill_sb	= nfs4_kill_super,
	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -575,7 +598,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,

	if (nfss->mountd_version || showdefaults)
		seq_printf(m, ",mountvers=%u", nfss->mountd_version);
-	if (nfss->mountd_port || showdefaults)
+	if ((nfss->mountd_port &&
+		nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
+			showdefaults)
		seq_printf(m, ",mountport=%u", nfss->mountd_port);

	nfs_show_mountd_netid(m, nfss, showdefaults);
@@ -622,6 +647,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
	const struct proc_nfs_info *nfs_infop;
	struct nfs_client *clp = nfss->nfs_client;
	u32 version = clp->rpc_ops->version;
+	int local_flock, local_fcntl;

	seq_printf(m, ",vers=%u", version);
	seq_printf(m, ",rsize=%u", nfss->rsize);
@@ -670,6 +696,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
		else
			seq_printf(m, ",lookupcache=pos");
	}
+
+	local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+	local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+
+	if (!local_flock && !local_fcntl)
+		seq_printf(m, ",local_lock=none");
+	else if (local_flock && local_fcntl)
+		seq_printf(m, ",local_lock=all");
+	else if (local_flock)
+		seq_printf(m, ",local_lock=flock");
+	else
+		seq_printf(m, ",local_lock=posix");
 }

 /*
@@ -1017,9 +1055,13 @@ static int nfs_parse_mount_options(char *raw,
			break;
		case Opt_lock:
			mnt->flags &= ~NFS_MOUNT_NONLM;
+			mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+					NFS_MOUNT_LOCAL_FCNTL);
			break;
		case Opt_nolock:
			mnt->flags |= NFS_MOUNT_NONLM;
+			mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+				       NFS_MOUNT_LOCAL_FCNTL);
			break;
		case Opt_v2:
			mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1029,12 +1071,10 @@ static int nfs_parse_mount_options(char *raw,
			mnt->flags |= NFS_MOUNT_VER3;
			mnt->version = 3;
			break;
-#ifdef CONFIG_NFS_V4
		case Opt_v4:
			mnt->flags &= ~NFS_MOUNT_VER3;
			mnt->version = 4;
			break;
-#endif
		case Opt_udp:
			mnt->flags &= ~NFS_MOUNT_TCP;
			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1246,12 +1286,10 @@ static int nfs_parse_mount_options(char *raw,
			mnt->flags |= NFS_MOUNT_VER3;
			mnt->version = 3;
			break;
-#ifdef CONFIG_NFS_V4
		case NFS4_VERSION:
			mnt->flags &= ~NFS_MOUNT_VER3;
			mnt->version = 4;
			break;
-#endif
		default:
			goto out_invalid_value;
		}
@@ -1420,6 +1458,34 @@ static int nfs_parse_mount_options(char *raw,
			mnt->fscache_uniq = string;
			mnt->options |= NFS_OPTION_FSCACHE;
			break;
+		case Opt_local_lock:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string, nfs_local_lock_tokens,
+					args);
+			kfree(string);
+			switch (token) {
+			case Opt_local_lock_all:
+				mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+					       NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			case Opt_local_lock_flock:
+				mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
+				break;
+			case Opt_local_lock_posix:
+				mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
+				break;
+			case Opt_local_lock_none:
+				mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+						NFS_MOUNT_LOCAL_FCNTL);
+				break;
+			default:
+				dfprintk(MOUNT, "NFS: invalid "
+						"local_lock argument\n");
+				return 0;
+			};
+			break;

		/*
		 * Special options
@@ -1825,6 +1891,12 @@ static int nfs_validate_mount_data(void *options,
		if (!args->nfs_server.hostname)
			goto out_nomem;

+		if (!(data->flags & NFS_MOUNT_NONLM))
+			args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
+					 NFS_MOUNT_LOCAL_FCNTL);
+		else
+			args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
+					NFS_MOUNT_LOCAL_FCNTL);
		/*
		 * The legacy version 6 binary mount data from userspace has a
		 * field used only to transport selinux information into the
@@ -2130,6 +2202,7 @@ static int nfs_set_super(struct super_block *s, void *data)

	s->s_flags = sb_mntdata->mntflags;
	s->s_fs_info = server;
+	s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
	ret = set_anon_super(s, server);
	if (ret == 0)
		server->s_dev = s->s_dev;
@@ -2208,7 +2281,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
	};
	int error = -ENOMEM;

-	data = nfs_alloc_parsed_mount_data(3);
+	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
	mntfh = nfs_alloc_fhandle();
	if (data == NULL || mntfh == NULL)
		goto out_free_fh;
@@ -2328,9 +2401,9 @@ static void nfs_kill_super(struct super_block *s)
 /*
  * Clone an NFS2/3 server record on xdev traversal (FSID-change)
  */
-static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			   const char *dev_name, void *raw_data,
-			   struct vfsmount *mnt)
+static struct dentry *
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
+	       const char *dev_name, void *raw_data)
 {
	struct nfs_clone_mount *data = raw_data;
	struct super_block *s;
@@ -2342,7 +2415,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
	};
	int error;

-	dprintk("--> nfs_xdev_get_sb()\n");
+	dprintk("--> nfs_xdev_mount()\n");

	/* create a new volume representation */
	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2389,28 +2462,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
	}

	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;

	/* clone any lsm security options from the parent to the new sb */
	security_sb_clone_mnt_opts(data->sb, s);

-	dprintk("<-- nfs_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs_xdev_mount() = 0\n");
+	return mntroot;

 out_err_nosb:
	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);

 error_splat_super:
	if (server && !s->s_root)
		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
	deactivate_locked_super(s);
-	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }

 #ifdef CONFIG_NFS_V4
@@ -2426,7 +2497,13 @@ static void nfs4_clone_super(struct super_block *sb,
	sb->s_maxbytes = old_sb->s_maxbytes;
	sb->s_time_gran = 1;
	sb->s_op = old_sb->s_op;
-	nfs_initialise_sb(sb);
+	/*
+	 * The VFS shouldn't apply the umask to mode bits. We will do
+	 * so ourselves when necessary.
+	 */
+	sb->s_flags |= MS_POSIXACL;
+	sb->s_xattr = old_sb->s_xattr;
+	nfs_initialise_sb(sb);
 }

 /*
@@ -2436,12 +2513,19 @@ static void nfs4_fill_super(struct super_block *sb)
 {
	sb->s_time_gran = 1;
	sb->s_op = &nfs4_sops;
+	/*
+	 * The VFS shouldn't apply the umask to mode bits. We will do
+	 * so ourselves when necessary.
+	 */
+	sb->s_flags |= MS_POSIXACL;
+	sb->s_xattr = nfs4_xattr_handlers;
	nfs_initialise_sb(sb);
 }

 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 {
-	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
+	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
+			 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
 }

 static int nfs4_validate_text_mount_data(void *options,
@@ -2579,8 +2663,9 @@ out_no_address:
 /*
  * Get the superblock for the NFS4 root partition
  */
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_mount(struct file_system_type *fs_type, int flags,
+		  const char *dev_name, void *raw_data)
 {
	struct nfs_parsed_mount_data *data = raw_data;
	struct super_block *s;
@@ -2644,15 +2729,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
		goto error_splat_root;

	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
-	error = 0;
+
+	security_free_mnt_opts(&data->lsm_opts);
+	nfs_free_fhandle(mntfh);
+	return mntroot;

 out:
	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
	nfs_free_fhandle(mntfh);
-	return error;
+	return ERR_PTR(error);

 out_free:
	nfs_free_server(server);
@@ -2898,9 +2984,9 @@ static void nfs4_kill_super(struct super_block *sb)
 /*
  * Clone an NFS4 server record on xdev traversal (FSID-change)
  */
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
-			    const char *dev_name, void *raw_data,
-			    struct vfsmount *mnt)
+static struct dentry *
+nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *raw_data)
 {
	struct nfs_clone_mount *data = raw_data;
	struct super_block *s;
@@ -2912,7 +2998,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
	};
	int error;

-	dprintk("--> nfs4_xdev_get_sb()\n");
+	dprintk("--> nfs4_xdev_mount()\n");

	/* create a new volume representation */
	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2959,32 +3045,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
	}

	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;

	security_sb_clone_mnt_opts(data->sb, s);

-	dprintk("<-- nfs4_xdev_get_sb() = 0\n");
-	return 0;
+	dprintk("<-- nfs4_xdev_mount() = 0\n");
+	return mntroot;

 out_err_nosb:
	nfs_free_server(server);
 out_err_noserver:
-	dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
+	return ERR_PTR(error);

 error_splat_super:
	if (server && !s->s_root)
		bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
	deactivate_locked_super(s);
-	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
-	return error;
+	dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
+	return ERR_PTR(error);
 }

-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data,
-	struct vfsmount *mnt)
+static struct dentry *
+nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
+			   const char *dev_name, void *raw_data)
 {
	struct nfs_clone_mount *data = raw_data;
	struct super_block *s;
@@ -3048,14 +3132,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
	}

	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;

	security_sb_clone_mnt_opts(data->sb, s);

	nfs_free_fhandle(mntfh);
	dprintk("<-- nfs4_referral_get_sb() = 0\n");
-	return 0;
+	return mntroot;

 out_err_nosb:
	nfs_free_server(server);
@@ -3063,7 +3145,7 @@ out_err_noserver:
	nfs_free_fhandle(mntfh);
 out_err_nofh:
	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
-	return error;
+	return ERR_PTR(error);

 error_splat_super:
	if (server && !s->s_root)
@@ -3072,7 +3154,7 @@ error_splat_bdi:
	deactivate_locked_super(s);
	nfs_free_fhandle(mntfh);
	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
-	return error;
+	return ERR_PTR(error);

 }

 /*
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index ad4d2e787b20..978aaeb8a093 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = {
		.extra1 = (int *)&nfs_set_port_min,
		.extra2 = (int *)&nfs_set_port_max,
	},
+#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
	{
		.procname = "idmap_cache_timeout",
		.data = &nfs_idmap_cache_timeout,
@@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = {
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
	{
		.procname = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 2f84adaad427..e313a51acdd1 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,9 +13,12 @@
 #include <linux/nfs_fs.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/namei.h>

 #include "internal.h"
 #include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"

 struct nfs_unlinkdata {
	struct hlist_node list;
@@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry)
  * @dir: parent directory of dentry
  * @dentry: dentry to unlink
  */
-int
+static int
 nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 {
	struct nfs_unlinkdata *data;
@@ -259,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
		status = PTR_ERR(data->cred);
		goto out_free;
	}
-	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
	data->res.dir_attr = &data->dir_attr;

	status = -EBUSY;
@@ -303,3 +305,256 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
	if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
		nfs_free_unlinkdata(data);
 }
308
309/* Cancel a queued async unlink. Called when a sillyrename run fails. */
310static void
311nfs_cancel_async_unlink(struct dentry *dentry)
312{
313 spin_lock(&dentry->d_lock);
314 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
315 struct nfs_unlinkdata *data = dentry->d_fsdata;
316
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
318 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data);
320 return;
321 }
322 spin_unlock(&dentry->d_lock);
323}
324
325struct nfs_renamedata {
326 struct nfs_renameargs args;
327 struct nfs_renameres res;
328 struct rpc_cred *cred;
329 struct inode *old_dir;
330 struct dentry *old_dentry;
331 struct nfs_fattr old_fattr;
332 struct inode *new_dir;
333 struct dentry *new_dentry;
334 struct nfs_fattr new_fattr;
335};
336
337/**
338 * nfs_async_rename_done - Sillyrename post-processing
339 * @task: rpc_task of the sillyrename
340 * @calldata: nfs_renamedata for the sillyrename
341 *
342 * Do the directory attribute updates and the d_move
343 */
344static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
345{
346 struct nfs_renamedata *data = calldata;
347 struct inode *old_dir = data->old_dir;
348 struct inode *new_dir = data->new_dir;
349
350 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
351 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
352 return;
353 }
354
355 if (task->tk_status != 0) {
356 nfs_cancel_async_unlink(data->old_dentry);
357 return;
358 }
359
360 nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir));
361 d_move(data->old_dentry, data->new_dentry);
362}
363
364/**
365 * nfs_async_rename_release - Release the sillyrename data.
366 * @calldata: the struct nfs_renamedata to be released
367 */
368static void nfs_async_rename_release(void *calldata)
369{
370 struct nfs_renamedata *data = calldata;
371 struct super_block *sb = data->old_dir->i_sb;
372
373 if (data->old_dentry->d_inode)
374 nfs_mark_for_revalidate(data->old_dentry->d_inode);
375
376 dput(data->old_dentry);
377 dput(data->new_dentry);
378 iput(data->old_dir);
379 iput(data->new_dir);
380 nfs_sb_deactive(sb);
381 put_rpccred(data->cred);
382 kfree(data);
383}
384
385#if defined(CONFIG_NFS_V4_1)
386static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
387{
388 struct nfs_renamedata *data = calldata;
389 struct nfs_server *server = NFS_SERVER(data->old_dir);
390
391 if (nfs4_setup_sequence(server, &data->args.seq_args,
392 &data->res.seq_res, 1, task))
393 return;
394 rpc_call_start(task);
395}
396#endif /* CONFIG_NFS_V4_1 */
397
398static const struct rpc_call_ops nfs_rename_ops = {
399 .rpc_call_done = nfs_async_rename_done,
400 .rpc_release = nfs_async_rename_release,
401#if defined(CONFIG_NFS_V4_1)
402 .rpc_call_prepare = nfs_rename_prepare,
403#endif /* CONFIG_NFS_V4_1 */
404};
405
406/**
407 * nfs_async_rename - perform an asynchronous rename operation
408 * @old_dir: directory that currently holds the dentry to be renamed
409 * @new_dir: target directory for the rename
410 * @old_dentry: original dentry to be renamed
411 * @new_dentry: dentry to which the old_dentry should be renamed
412 *
413 * It's expected that valid references to the dentries and inodes are held
414 */
415static struct rpc_task *
416nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
417 struct dentry *old_dentry, struct dentry *new_dentry)
418{
419 struct nfs_renamedata *data;
420 struct rpc_message msg = { };
421 struct rpc_task_setup task_setup_data = {
422 .rpc_message = &msg,
423 .callback_ops = &nfs_rename_ops,
424 .workqueue = nfsiod_workqueue,
425 .rpc_client = NFS_CLIENT(old_dir),
426 .flags = RPC_TASK_ASYNC,
427 };
428
429 data = kzalloc(sizeof(*data), GFP_KERNEL);
430 if (data == NULL)
431 return ERR_PTR(-ENOMEM);
432 task_setup_data.callback_data = data;
433
434 data->cred = rpc_lookup_cred();
435 if (IS_ERR(data->cred)) {
436 struct rpc_task *task = ERR_CAST(data->cred);
437 kfree(data);
438 return task;
439 }
440
441 msg.rpc_argp = &data->args;
442 msg.rpc_resp = &data->res;
443 msg.rpc_cred = data->cred;
444
445 /* set up nfs_renamedata */
446 data->old_dir = old_dir;
447 ihold(old_dir);
448 data->new_dir = new_dir;
449 ihold(new_dir);
450 data->old_dentry = dget(old_dentry);
451 data->new_dentry = dget(new_dentry);
452 nfs_fattr_init(&data->old_fattr);
453 nfs_fattr_init(&data->new_fattr);
454
455 /* set up nfs_renameargs */
456 data->args.old_dir = NFS_FH(old_dir);
457 data->args.old_name = &old_dentry->d_name;
458 data->args.new_dir = NFS_FH(new_dir);
459 data->args.new_name = &new_dentry->d_name;
460
461 /* set up nfs_renameres */
462 data->res.old_fattr = &data->old_fattr;
463 data->res.new_fattr = &data->new_fattr;
464
465 nfs_sb_active(old_dir->i_sb);
466
467 NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
468
469 return rpc_run_task(&task_setup_data);
470}
471
472/**
473 * nfs_sillyrename - Perform a silly-rename of a dentry
474 * @dir: inode of directory that contains dentry
475 * @dentry: dentry to be sillyrenamed
476 *
477 * NFSv2/3 is stateless and the server doesn't know when the client is
478 * holding a file open. To prevent application problems when a file is
479 * unlinked while it's still open, the client performs a "silly-rename".
480 * That is, it renames the file to a hidden file in the same directory,
481 * and only performs the unlink once the last reference to it is put.
482 *
483 * The final cleanup is done during dentry_iput.
484 */
485int
486nfs_sillyrename(struct inode *dir, struct dentry *dentry)
487{
488 static unsigned int sillycounter;
489 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
490 const int countersize = sizeof(sillycounter)*2;
491 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
492 char silly[slen+1];
493 struct dentry *sdentry;
494 struct rpc_task *task;
495 int error = -EIO;
496
497 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
498 dentry->d_parent->d_name.name, dentry->d_name.name,
499 dentry->d_count);
500 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
501
502 /*
503 * We don't allow a dentry to be silly-renamed twice.
504 */
505 error = -EBUSY;
506 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
507 goto out;
508
509 sprintf(silly, ".nfs%*.*Lx",
510 fileidsize, fileidsize,
511 (unsigned long long)NFS_FILEID(dentry->d_inode));
512
513 /* Return delegation in anticipation of the rename */
514 nfs_inode_return_delegation(dentry->d_inode);
515
516 sdentry = NULL;
517 do {
518 char *suffix = silly + slen - countersize;
519
520 dput(sdentry);
521 sillycounter++;
522 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
523
524 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
525 dentry->d_name.name, silly);
526
527 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
528 /*
529 * N.B. Better to return EBUSY here ... it could be
530 * dangerous to delete the file while it's in use.
531 */
532 if (IS_ERR(sdentry))
533 goto out;
534 } while (sdentry->d_inode != NULL); /* need negative lookup */
535
536 /* queue unlink first. Can't do this from rpc_release as it
537 * has to allocate memory
538 */
539 error = nfs_async_unlink(dir, dentry);
540 if (error)
541 goto out_dput;
542
543 /* run the rename task, undo unlink if it fails */
544 task = nfs_async_rename(dir, dir, dentry, sdentry);
545 if (IS_ERR(task)) {
546 error = -EBUSY;
547 nfs_cancel_async_unlink(dentry);
548 goto out_dput;
549 }
550
551 /* wait for the RPC task to complete, unless a SIGKILL intervenes */
552 error = rpc_wait_for_completion_task(task);
553 if (error == 0)
554 error = task->tk_status;
555 rpc_put_task(task);
556out_dput:
557 dput(sdentry);
558out:
559 return error;
560}
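
The generated name is deterministic: ".nfs" followed by the file ID and a retry counter, each rendered as fixed-width hex. A user-space sketch of the same construction (illustrative only; make_silly_name is not the kernel code, just a restatement of the format used above):

#include <stdio.h>

/* Sketch of the silly-name format: ".nfs" + fileid + counter, each
 * zero-padded to twice its byte size in hex digits. */
static void make_silly_name(char *buf, size_t buflen,
			    unsigned long long fileid, unsigned int counter)
{
	const int fileidsize = sizeof(fileid) * 2;	/* 16 hex digits */
	const int countersize = sizeof(counter) * 2;	/* 8 hex digits */

	snprintf(buf, buflen, ".nfs%*.*llx%*.*x",
		 fileidsize, fileidsize, fileid,
		 countersize, countersize, counter);
}

/* make_silly_name(buf, sizeof(buf), 0x1234, 7) yields
 * ".nfs000000000000123400000007". */
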
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 874972d9427c..c8278f4046cb 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
55 if (p) { 55 if (p) {
56 memset(p, 0, sizeof(*p)); 56 memset(p, 0, sizeof(*p));
57 INIT_LIST_HEAD(&p->pages); 57 INIT_LIST_HEAD(&p->pages);
58 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
59 } 58 }
60 return p; 59 return p;
61} 60}
@@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
75 memset(p, 0, sizeof(*p)); 74 memset(p, 0, sizeof(*p));
76 INIT_LIST_HEAD(&p->pages); 75 INIT_LIST_HEAD(&p->pages);
77 p->npages = pagecount; 76 p->npages = pagecount;
78 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
79 if (pagecount <= ARRAY_SIZE(p->page_array)) 77 if (pagecount <= ARRAY_SIZE(p->page_array))
80 p->pagevec = p->page_array; 78 p->pagevec = p->page_array;
81 else { 79 else {
@@ -292,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
292 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 290 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
293 291
294 nfs_pageio_cond_complete(pgio, page->index); 292 nfs_pageio_cond_complete(pgio, page->index);
295 ret = nfs_page_async_flush(pgio, page, 293 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
296 wbc->sync_mode == WB_SYNC_NONE ||
297 wbc->nonblocking != 0);
298 if (ret == -EAGAIN) { 294 if (ret == -EAGAIN) {
299 redirty_page_for_writepage(wbc, page); 295 redirty_page_for_writepage(wbc, page);
300 ret = 0; 296 ret = 0;
@@ -394,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
394 if (nfs_have_delegation(inode, FMODE_WRITE)) 390 if (nfs_have_delegation(inode, FMODE_WRITE))
395 nfsi->change_attr++; 391 nfsi->change_attr++;
396 } 392 }
393 set_bit(PG_MAPPED, &req->wb_flags);
397 SetPagePrivate(req->wb_page); 394 SetPagePrivate(req->wb_page);
398 set_page_private(req->wb_page, (unsigned long)req); 395 set_page_private(req->wb_page, (unsigned long)req);
399 nfsi->npages++; 396 nfsi->npages++;
@@ -419,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
419 spin_lock(&inode->i_lock); 416 spin_lock(&inode->i_lock);
420 set_page_private(req->wb_page, 0); 417 set_page_private(req->wb_page, 0);
421 ClearPagePrivate(req->wb_page); 418 ClearPagePrivate(req->wb_page);
419 clear_bit(PG_MAPPED, &req->wb_flags);
422 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 420 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
423 nfsi->npages--; 421 nfsi->npages--;
424 if (!nfsi->npages) { 422 if (!nfsi->npages) {
@@ -426,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
426 iput(inode); 424 iput(inode);
427 } else 425 } else
428 spin_unlock(&inode->i_lock); 426 spin_unlock(&inode->i_lock);
429 nfs_clear_request(req);
430 nfs_release_request(req); 427 nfs_release_request(req);
431} 428}
432 429
@@ -935,7 +932,7 @@ out_bad:
935 while (!list_empty(&list)) { 932 while (!list_empty(&list)) {
936 data = list_entry(list.next, struct nfs_write_data, pages); 933 data = list_entry(list.next, struct nfs_write_data, pages);
937 list_del(&data->pages); 934 list_del(&data->pages);
938 nfs_writedata_release(data); 935 nfs_writedata_free(data);
939 } 936 }
940 nfs_redirty_request(req); 937 nfs_redirty_request(req);
941 return -ENOMEM; 938 return -ENOMEM;
@@ -1433,15 +1430,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
1433 int flags = FLUSH_SYNC; 1430 int flags = FLUSH_SYNC;
1434 int ret = 0; 1431 int ret = 0;
1435 1432
1436 /* Don't commit yet if this is a non-blocking flush and there are 1433 if (wbc->sync_mode == WB_SYNC_NONE) {
1437 * lots of outstanding writes for this mapping. 1434 /* Don't commit yet if this is a non-blocking flush and there
1438 */ 1435 * are a lot of outstanding writes for this mapping.
1439 if (wbc->sync_mode == WB_SYNC_NONE && 1436 */
1440 nfsi->ncommit <= (nfsi->npages >> 1)) 1437 if (nfsi->ncommit <= (nfsi->npages >> 1))
1441 goto out_mark_dirty; 1438 goto out_mark_dirty;
1442 1439
1443 if (wbc->nonblocking || wbc->for_background) 1440 /* don't wait for the COMMIT response */
1444 flags = 0; 1441 flags = 0;
1442 }
1443
1445 ret = nfs_commit_inode(inode, flags); 1444 ret = nfs_commit_inode(inode, flags);
1446 if (ret >= 0) { 1445 if (ret >= 0) {
1447 if (wbc->sync_mode == WB_SYNC_NONE) { 1446 if (wbc->sync_mode == WB_SYNC_NONE) {
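
The net effect of the rewritten block is a three-way policy: synchronous flushes always COMMIT and wait for the reply, while non-blocking flushes COMMIT without waiting, and only when at least half of the inode's cached pages are awaiting commit. A standalone sketch of that decision (the counter parameters stand in for the nfs_inode fields used above):

/* Decision sketch only; not the kernel function. */
enum commit_action { COMMIT_SKIP, COMMIT_ASYNC, COMMIT_SYNC };

static enum commit_action commit_policy(int nonblocking,
					unsigned long ncommit,
					unsigned long npages)
{
	if (!nonblocking)
		return COMMIT_SYNC;	/* FLUSH_SYNC: wait for the reply */
	if (ncommit <= (npages >> 1))
		return COMMIT_SKIP;	/* too few; just mark dirty again */
	return COMMIT_ASYNC;		/* send COMMIT, don't wait */
}
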
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c52571c03..84c27d69d421 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
42 gid_t gid; 42 gid_t gid;
43}; 43};
44 44
45struct nfsacl_simple_acl {
46 struct posix_acl acl;
47 struct posix_acl_entry ace[4];
48};
49
45static int 50static int
46xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) 51xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
47{ 52{
@@ -72,9 +77,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
72 return 0; 77 return 0;
73} 78}
74 79
75unsigned int 80/**
76nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, 81 * nfsacl_encode - Encode an NFSv3 ACL
77 struct posix_acl *acl, int encode_entries, int typeflag) 82 *
83 * @buf: destination xdr_buf to contain XDR encoded ACL
84 * @base: byte offset in xdr_buf where XDR'd ACL begins
85 * @inode: inode of file whose ACL this is
86 * @acl: posix_acl to encode
87 * @encode_entries: whether to encode ACEs as well
88 * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
89 *
90 * Returns size of encoded ACL in bytes or a negative errno value.
91 */
92int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
93 struct posix_acl *acl, int encode_entries, int typeflag)
78{ 94{
79 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; 95 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
80 struct nfsacl_encode_desc nfsacl_desc = { 96 struct nfsacl_encode_desc nfsacl_desc = {
@@ -88,17 +104,22 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
88 .uid = inode->i_uid, 104 .uid = inode->i_uid,
89 .gid = inode->i_gid, 105 .gid = inode->i_gid,
90 }; 106 };
107 struct nfsacl_simple_acl aclbuf;
91 int err; 108 int err;
92 struct posix_acl *acl2 = NULL;
93 109
94 if (entries > NFS_ACL_MAX_ENTRIES || 110 if (entries > NFS_ACL_MAX_ENTRIES ||
95 xdr_encode_word(buf, base, entries)) 111 xdr_encode_word(buf, base, entries))
96 return -EINVAL; 112 return -EINVAL;
97 if (encode_entries && acl && acl->a_count == 3) { 113 if (encode_entries && acl && acl->a_count == 3) {
98 /* Fake up an ACL_MASK entry. */ 114 struct posix_acl *acl2 = &aclbuf.acl;
99 acl2 = posix_acl_alloc(4, GFP_KERNEL); 115
100 if (!acl2) 116 /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
101 return -ENOMEM; 117 * invoked in contexts where a memory allocation failure is
118 * fatal. Fortunately this fake ACL is small enough to
119 * construct on the stack. */
 120 memset(acl2, 0, sizeof(*acl2));
121 posix_acl_init(acl2, 4);
122
102 /* Insert entries in canonical order: other orders seem 123 /* Insert entries in canonical order: other orders seem
103 to confuse Solaris VxFS. */ 124 to confuse Solaris VxFS. */
104 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ 125 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
@@ -109,8 +130,6 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
109 nfsacl_desc.acl = acl2; 130 nfsacl_desc.acl = acl2;
110 } 131 }
111 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); 132 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
112 if (acl2)
113 posix_acl_release(acl2);
114 if (!err) 133 if (!err)
115 err = 8 + nfsacl_desc.desc.elem_size * 134 err = 8 + nfsacl_desc.desc.elem_size *
116 nfsacl_desc.desc.array_len; 135 nfsacl_desc.desc.array_len;
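
The hunk elides the remaining assignments, but the canonical ordering the comment refers to is USER_OBJ, GROUP_OBJ, MASK, OTHER, with the faked mask inheriting the group entry's permissions. A sketch of that four-entry fill (a restatement of the code above; assumes the source ACL holds exactly USER_OBJ, GROUP_OBJ, OTHER in that order):

#include <linux/posix_acl.h>

/* Sketch: build the four-entry ACL in canonical order, faking the
 * ACL_MASK entry from the group entry as nfsacl_encode() does. */
static void fake_mask_entry(struct posix_acl *dst,
			    const struct posix_acl *src)
{
	dst->a_entries[0] = src->a_entries[0];	/* ACL_USER_OBJ */
	dst->a_entries[1] = src->a_entries[1];	/* ACL_GROUP_OBJ */
	dst->a_entries[2] = src->a_entries[1];	/* ACL_MASK inherits */
	dst->a_entries[2].e_tag = ACL_MASK;	/*   the group perms */
	dst->a_entries[3] = src->a_entries[2];	/* ACL_OTHER */
}
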
@@ -224,9 +243,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
224 return 0; 243 return 0;
225} 244}
226 245
227unsigned int 246/**
228nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, 247 * nfsacl_decode - Decode an NFSv3 ACL
229 struct posix_acl **pacl) 248 *
249 * @buf: xdr_buf containing XDR'd ACL data to decode
250 * @base: byte offset in xdr_buf where XDR'd ACL begins
251 * @aclcnt: count of ACEs in decoded posix_acl
252 * @pacl: buffer in which to place decoded posix_acl
253 *
254 * Returns the length of the decoded ACL in bytes, or a negative errno value.
255 */
256int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
257 struct posix_acl **pacl)
230{ 258{
231 struct nfsacl_decode_desc nfsacl_desc = { 259 struct nfsacl_decode_desc nfsacl_desc = {
232 .desc = { 260 .desc = {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 4264377552e2..18b3e8975fe0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -28,6 +28,18 @@ config NFSD
28 28
29 If unsure, say N. 29 If unsure, say N.
30 30
31config NFSD_DEPRECATED
32 bool "Include support for deprecated syscall interface to NFSD"
33 depends on NFSD
34 default y
35 help
36 The syscall interface to nfsd was obsoleted in 2.6.0 by a new
 37 filesystem-based interface. The old interface is due for removal
 38 in 2.6.40. If you wish to remove the interface before then,
 39 say N.
 40
 41 If unsure, say Y.
42
31config NFSD_V2_ACL 43config NFSD_V2_ACL
32 bool 44 bool
33 depends on NFSD 45 depends on NFSD
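
With the option above merged, a server build that keeps the legacy syscall interface selects it in .config, for example:

CONFIG_NFSD=m
CONFIG_NFSD_DEPRECATED=y
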
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 000000000000..34e5c40af5ef
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,59 @@
1/*
2 * Common NFSv4 ACL handling definitions.
3 *
4 * Copyright (c) 2002 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFS4_ACL_H
36#define LINUX_NFS4_ACL_H
37
38#include <linux/posix_acl.h>
39
40/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
41 * fit in a page: */
42#define NFS4_ACL_MAX 170
43
44struct nfs4_acl *nfs4_acl_new(int);
45int nfs4_acl_get_whotype(char *, u32);
46int nfs4_acl_write_who(int who, char *p);
47int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
48 uid_t who, u32 mask);
49
50#define NFS4_ACL_TYPE_DEFAULT 0x01
51#define NFS4_ACL_DIR 0x02
52#define NFS4_ACL_OWNER 0x04
53
54struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
55 struct posix_acl *, unsigned int flags);
56int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
57 struct posix_acl **, unsigned int flags);
58
59#endif /* LINUX_NFS4_ACL_H */
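
A hypothetical caller sketch for the helpers declared above; NFS4_ACE_READ_DATA comes from linux/nfs4.h, and the usual convention assumed here is that nfs4_acl_permission() returns zero when access is allowed:

#include <linux/nfs4.h>
#include "acl.h"

/* Hypothetical sketch, not nfsd code: may 'who' read this object? */
static int example_can_read(struct nfs4_acl *acl, uid_t owner,
			    gid_t group, uid_t who)
{
	return nfs4_acl_permission(acl, owner, group, who,
				   NFS4_ACE_READ_DATA);
}
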
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c2a4f71d87dd..8b31e5f8795d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * NFS exporting and validation. 2 * NFS exporting and validation.
4 * 3 *
@@ -28,9 +27,6 @@
28typedef struct auth_domain svc_client; 27typedef struct auth_domain svc_client;
29typedef struct svc_export svc_export; 28typedef struct svc_export svc_export;
30 29
31static void exp_do_unexport(svc_export *unexp);
32static int exp_verify_string(char *cp, int max);
33
34/* 30/*
35 * We have two caches. 31 * We have two caches.
36 * One maps client+vfsmnt+dentry to export options - the export map 32 * One maps client+vfsmnt+dentry to export options - the export map
@@ -802,6 +798,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
802 return ek; 798 return ek;
803} 799}
804 800
801#ifdef CONFIG_NFSD_DEPRECATED
805static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, 802static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
806 struct svc_export *exp) 803 struct svc_export *exp)
807{ 804{
@@ -852,6 +849,7 @@ exp_get_fsid_key(svc_client *clp, int fsid)
852 849
853 return exp_find_key(clp, FSID_NUM, fsidv, NULL); 850 return exp_find_key(clp, FSID_NUM, fsidv, NULL);
854} 851}
852#endif
855 853
856static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, 854static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
857 struct cache_req *reqp) 855 struct cache_req *reqp)
@@ -893,6 +891,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
893 return exp; 891 return exp;
894} 892}
895 893
894#ifdef CONFIG_NFSD_DEPRECATED
896/* 895/*
897 * Hashtable locking. Write locks are placed only by user processes 896 * Hashtable locking. Write locks are placed only by user processes
898 * wanting to modify export information. 897 * wanting to modify export information.
@@ -925,6 +924,19 @@ exp_writeunlock(void)
925{ 924{
926 up_write(&hash_sem); 925 up_write(&hash_sem);
927} 926}
927#else
928
929/* hash_sem not needed once deprecated interface is removed */
930void exp_readlock(void) {}
 931static inline void exp_writelock(void) {}
 932void exp_readunlock(void) {}
 933static inline void exp_writeunlock(void) {}
934
935#endif
936
937#ifdef CONFIG_NFSD_DEPRECATED
938static void exp_do_unexport(svc_export *unexp);
939static int exp_verify_string(char *cp, int max);
928 940
929static void exp_fsid_unhash(struct svc_export *exp) 941static void exp_fsid_unhash(struct svc_export *exp)
930{ 942{
@@ -935,10 +947,9 @@ static void exp_fsid_unhash(struct svc_export *exp)
935 947
936 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); 948 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
937 if (!IS_ERR(ek)) { 949 if (!IS_ERR(ek)) {
938 ek->h.expiry_time = get_seconds()-1; 950 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
939 cache_put(&ek->h, &svc_expkey_cache); 951 cache_put(&ek->h, &svc_expkey_cache);
940 } 952 }
941 svc_expkey_cache.nextcheck = get_seconds();
942} 953}
943 954
944static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) 955static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
@@ -973,10 +984,9 @@ static void exp_unhash(struct svc_export *exp)
973 984
974 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); 985 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
975 if (!IS_ERR(ek)) { 986 if (!IS_ERR(ek)) {
976 ek->h.expiry_time = get_seconds()-1; 987 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
977 cache_put(&ek->h, &svc_expkey_cache); 988 cache_put(&ek->h, &svc_expkey_cache);
978 } 989 }
979 svc_expkey_cache.nextcheck = get_seconds();
980} 990}
981 991
982/* 992/*
@@ -1097,8 +1107,7 @@ out:
1097static void 1107static void
1098exp_do_unexport(svc_export *unexp) 1108exp_do_unexport(svc_export *unexp)
1099{ 1109{
1100 unexp->h.expiry_time = get_seconds()-1; 1110 sunrpc_invalidate(&unexp->h, &svc_export_cache);
1101 svc_export_cache.nextcheck = get_seconds();
1102 exp_unhash(unexp); 1111 exp_unhash(unexp);
1103 exp_fsid_unhash(unexp); 1112 exp_fsid_unhash(unexp);
1104} 1113}
@@ -1150,6 +1159,7 @@ out_unlock:
1150 exp_writeunlock(); 1159 exp_writeunlock();
1151 return err; 1160 return err;
1152} 1161}
1162#endif /* CONFIG_NFSD_DEPRECATED */
1153 1163
1154/* 1164/*
1155 * Obtain the root fh on behalf of a client. 1165 * Obtain the root fh on behalf of a client.
@@ -1433,9 +1443,6 @@ static struct flags {
1433 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1443 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1434 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1444 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1435 { NFSEXP_V4ROOT, {"v4root", ""}}, 1445 { NFSEXP_V4ROOT, {"v4root", ""}},
1436#ifdef MSNFS
1437 { NFSEXP_MSNFS, {"msnfs", ""}},
1438#endif
1439 { 0, {"", ""}} 1446 { 0, {"", ""}}
1440}; 1447};
1441 1448
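
For orientation, the strings in this table are the export option names shown in /proc/fs/nfs/exports and written in /etc/exports; a typical line using two of them might be:

/srv/export    *(rw,no_subtree_check,insecure_locks)
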
@@ -1459,25 +1466,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags)
1459 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); 1466 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
1460} 1467}
1461 1468
1469static bool secinfo_flags_equal(int f, int g)
1470{
1471 f &= NFSEXP_SECINFO_FLAGS;
1472 g &= NFSEXP_SECINFO_FLAGS;
1473 return f == g;
1474}
1475
1476static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
1477{
1478 int flags;
1479
1480 flags = (*fp)->flags;
1481 seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
1482 (*fp)++;
1483 while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
1484 seq_printf(m, ":%d", (*fp)->pseudoflavor);
1485 (*fp)++;
1486 }
1487 return flags;
1488}
1489
1462static void show_secinfo(struct seq_file *m, struct svc_export *exp) 1490static void show_secinfo(struct seq_file *m, struct svc_export *exp)
1463{ 1491{
1464 struct exp_flavor_info *f; 1492 struct exp_flavor_info *f;
1465 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; 1493 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
1466 int lastflags = 0, first = 0; 1494 int flags;
1467 1495
1468 if (exp->ex_nflavors == 0) 1496 if (exp->ex_nflavors == 0)
1469 return; 1497 return;
1470 for (f = exp->ex_flavors; f < end; f++) { 1498 f = exp->ex_flavors;
1471 if (first || f->flags != lastflags) { 1499 flags = show_secinfo_run(m, &f, end);
1472 if (!first) 1500 if (!secinfo_flags_equal(flags, exp->ex_flags))
1473 show_secinfo_flags(m, lastflags); 1501 show_secinfo_flags(m, flags);
1474 seq_printf(m, ",sec=%d", f->pseudoflavor); 1502 while (f != end) {
1475 lastflags = f->flags; 1503 flags = show_secinfo_run(m, &f, end);
1476 } else { 1504 show_secinfo_flags(m, flags);
1477 seq_printf(m, ":%d", f->pseudoflavor);
1478 }
1479 } 1505 }
1480 show_secinfo_flags(m, lastflags);
1481} 1506}
1482 1507
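
The refactored show_secinfo() now prints one "sec=" group per run of consecutive flavors that share the same flag bits. A simplified user-space model of that run-grouping idiom (types and names are illustrative):

#include <stdio.h>

/* Model of the grouping in show_secinfo(): consecutive items with
 * equal flags print as one colon-separated "sec=" group. */
struct flavor { int flags; int pseudoflavor; };

static void show_runs(const struct flavor *f, const struct flavor *end)
{
	while (f != end) {
		int flags = f->flags;

		printf(",sec=%d", f->pseudoflavor);
		for (f++; f != end && f->flags == flags; f++)
			printf(":%d", f->pseudoflavor);
		/* show_secinfo_flags() would print 'flags' here */
	}
}
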
1483static void exp_flags(struct seq_file *m, int flag, int fsid, 1508static void exp_flags(struct seq_file *m, int flag, int fsid,
@@ -1532,6 +1557,7 @@ const struct seq_operations nfs_exports_op = {
1532 .show = e_show, 1557 .show = e_show,
1533}; 1558};
1534 1559
1560#ifdef CONFIG_NFSD_DEPRECATED
1535/* 1561/*
1536 * Add or modify a client. 1562 * Add or modify a client.
1537 * Change requests may involve the list of host addresses. The list of 1563 * Change requests may involve the list of host addresses. The list of
@@ -1563,7 +1589,7 @@ exp_addclient(struct nfsctl_client *ncp)
1563 /* Insert client into hashtable. */ 1589 /* Insert client into hashtable. */
1564 for (i = 0; i < ncp->cl_naddr; i++) { 1590 for (i = 0; i < ncp->cl_naddr; i++) {
1565 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); 1591 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
1566 auth_unix_add_addr(&addr6, dom); 1592 auth_unix_add_addr(&init_net, &addr6, dom);
1567 } 1593 }
1568 auth_unix_forget_old(dom); 1594 auth_unix_forget_old(dom);
1569 auth_domain_put(dom); 1595 auth_domain_put(dom);
@@ -1621,6 +1647,7 @@ exp_verify_string(char *cp, int max)
1621 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); 1647 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
1622 return 0; 1648 return 0;
1623} 1649}
1650#endif /* CONFIG_NFSD_DEPRECATED */
1624 1651
1625/* 1652/*
1626 * Initialize the exports module. 1653 * Initialize the exports module.
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 000000000000..2f3be1321534
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,62 @@
1/*
2 * Mapping of UID to name and vice versa.
3 *
4 * Copyright (c) 2002, 2003 The Regents of the University of
5 * Michigan. All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFSD_IDMAP_H
36#define LINUX_NFSD_IDMAP_H
37
38#include <linux/in.h>
39#include <linux/sunrpc/svc.h>
40
41/* XXX from linux/nfs_idmap.h */
42#define IDMAP_NAMESZ 128
43
44#ifdef CONFIG_NFSD_V4
45int nfsd_idmap_init(void);
46void nfsd_idmap_shutdown(void);
47#else
48static inline int nfsd_idmap_init(void)
49{
50 return 0;
51}
52static inline void nfsd_idmap_shutdown(void)
53{
54}
55#endif
56
57__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
58__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
59int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
60int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
61
62#endif /* LINUX_NFSD_IDMAP_H */
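
A hypothetical caller sketch for the helpers above; IDMAP_NAMESZ bounds the buffer, and the "length or negative errno" return convention for nfsd_map_uid_to_name() is an assumption, not something this header documents:

#include "idmap.h"

/* Hypothetical sketch: map a uid to its "user@domain" name for an
 * NFSv4 reply. Not nfsd code. */
static int example_uid_to_name(struct svc_rqst *rqstp, __u32 uid)
{
	char name[IDMAP_NAMESZ];
	int len;

	len = nfsd_map_uid_to_name(rqstp, uid, name);
	if (len < 0)
		return len;	/* mapping failed */
	/* XDR-encode 'len' bytes of 'name' into the reply here */
	return 0;
}
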
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b7e3021e06b..2247fc91d5e9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
151 __be32 nfserr; 151 __be32 nfserr;
152 u32 max_blocksize = svc_max_payload(rqstp); 152 u32 max_blocksize = svc_max_payload(rqstp);
153 153
154 dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", 154 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
155 SVCFH_fmt(&argp->fh), 155 SVCFH_fmt(&argp->fh),
156 (unsigned long) argp->count, 156 (unsigned long) argp->count,
157 (unsigned long) argp->offset); 157 (unsigned long long) argp->offset);
158 158
159 /* Obtain buffer pointer for payload. 159 /* Obtain buffer pointer for payload.
160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
191 __be32 nfserr; 191 __be32 nfserr;
192 unsigned long cnt = argp->len; 192 unsigned long cnt = argp->len;
193 193
194 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 194 dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
195 SVCFH_fmt(&argp->fh), 195 SVCFH_fmt(&argp->fh),
196 argp->len, 196 argp->len,
197 (unsigned long) argp->offset, 197 (unsigned long long) argp->offset,
198 argp->stable? " stable" : ""); 198 argp->stable? " stable" : "");
199 199
200 fh_copy(&resp->fh, &argp->fh); 200 fh_copy(&resp->fh, &argp->fh);
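
The dprintk changes above are the standard fix for printing a 64-bit offset: cast to unsigned long long and use a matching conversion, since the value truncates when pushed through unsigned long on 32-bit kernels. A minimal illustration (show_offset is illustrative, not a kernel helper):

#include <linux/kernel.h>

/* For offset = 1ULL << 32, the first line prints 0 on a 32-bit
 * build while the second prints 4294967296. */
static void show_offset(u64 offset)
{
	printk(KERN_DEBUG "truncated: %lu\n", (unsigned long) offset);
	printk(KERN_DEBUG "correct:   %llu\n", (unsigned long long) offset);
}
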
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0af2a9..7e84a852cdae 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, 260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
261 &fhp->fh_post_attr); 261 &fhp->fh_post_attr);
262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; 262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
263 if (err) 263 if (err) {
264 fhp->fh_post_saved = 0; 264 fhp->fh_post_saved = 0;
265 else 265 /* Grab the ctime anyway - set_change_info might use it */
266 fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
267 } else
266 fhp->fh_post_saved = 1; 268 fhp->fh_post_saved = 1;
267} 269}
268 270
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index e48052615159..ad88f1c0a4c3 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/nfs4_acl.h> 39#include "acl.h"
40 40
41 41
42/* mode bit translations: */ 42/* mode bit translations: */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 988cbb3a19b6..3be975e18919 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
41 41
42#define NFSPROC4_CB_NULL 0 42#define NFSPROC4_CB_NULL 0
43#define NFSPROC4_CB_COMPOUND 1 43#define NFSPROC4_CB_COMPOUND 1
44#define NFS4_STATEID_SIZE 16
45 44
46/* Index of predefined Linux callback client operations */ 45/* Index of predefined Linux callback client operations */
47 46
@@ -51,11 +50,6 @@ enum {
51 NFSPROC4_CLNT_CB_SEQUENCE, 50 NFSPROC4_CLNT_CB_SEQUENCE,
52}; 51};
53 52
54enum nfs_cb_opnum4 {
55 OP_CB_RECALL = 4,
56 OP_CB_SEQUENCE = 11,
57};
58
59#define NFS4_MAXTAGLEN 20 53#define NFS4_MAXTAGLEN 20
60 54
61#define NFS4_enc_cb_null_sz 0 55#define NFS4_enc_cb_null_sz 0
@@ -80,61 +74,6 @@ enum nfs_cb_opnum4 {
80 cb_sequence_dec_sz + \ 74 cb_sequence_dec_sz + \
81 op_dec_sz) 75 op_dec_sz)
82 76
83/*
84* Generic encode routines from fs/nfs/nfs4xdr.c
85*/
86static inline __be32 *
87xdr_writemem(__be32 *p, const void *ptr, int nbytes)
88{
89 int tmp = XDR_QUADLEN(nbytes);
90 if (!tmp)
91 return p;
92 p[tmp-1] = 0;
93 memcpy(p, ptr, nbytes);
94 return p + tmp;
95}
96
97#define WRITE32(n) *p++ = htonl(n)
98#define WRITEMEM(ptr,nbytes) do { \
99 p = xdr_writemem(p, ptr, nbytes); \
100} while (0)
101#define RESERVE_SPACE(nbytes) do { \
102 p = xdr_reserve_space(xdr, nbytes); \
103 if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \
104 BUG_ON(!p); \
105} while (0)
106
107/*
108 * Generic decode routines from fs/nfs/nfs4xdr.c
109 */
110#define DECODE_TAIL \
111 status = 0; \
112out: \
113 return status; \
114xdr_error: \
115 dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
116 status = -EIO; \
117 goto out
118
119#define READ32(x) (x) = ntohl(*p++)
120#define READ64(x) do { \
121 (x) = (u64)ntohl(*p++) << 32; \
122 (x) |= ntohl(*p++); \
123} while (0)
124#define READTIME(x) do { \
125 p++; \
126 (x.tv_sec) = ntohl(*p++); \
127 (x.tv_nsec) = ntohl(*p++); \
128} while (0)
129#define READ_BUF(nbytes) do { \
130 p = xdr_inline_decode(xdr, nbytes); \
131 if (!p) { \
132 dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \
133 __func__, __LINE__); \
134 return -EIO; \
135 } \
136} while (0)
137
138struct nfs4_cb_compound_hdr { 77struct nfs4_cb_compound_hdr {
139 /* args */ 78 /* args */
140 u32 ident; /* minorversion 0 only */ 79 u32 ident; /* minorversion 0 only */
@@ -145,294 +84,513 @@ struct nfs4_cb_compound_hdr {
145 int status; 84 int status;
146}; 85};
147 86
148static struct { 87/*
149 int stat; 88 * Handle decode buffer overflows out-of-line.
150 int errno; 89 */
151} nfs_cb_errtbl[] = { 90static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
152 { NFS4_OK, 0 }, 91{
153 { NFS4ERR_PERM, EPERM }, 92 dprintk("NFS: %s prematurely hit the end of our receive buffer. "
154 { NFS4ERR_NOENT, ENOENT }, 93 "Remaining buffer length is %tu words.\n",
155 { NFS4ERR_IO, EIO }, 94 func, xdr->end - xdr->p);
156 { NFS4ERR_NXIO, ENXIO }, 95}
157 { NFS4ERR_ACCESS, EACCES },
158 { NFS4ERR_EXIST, EEXIST },
159 { NFS4ERR_XDEV, EXDEV },
160 { NFS4ERR_NOTDIR, ENOTDIR },
161 { NFS4ERR_ISDIR, EISDIR },
162 { NFS4ERR_INVAL, EINVAL },
163 { NFS4ERR_FBIG, EFBIG },
164 { NFS4ERR_NOSPC, ENOSPC },
165 { NFS4ERR_ROFS, EROFS },
166 { NFS4ERR_MLINK, EMLINK },
167 { NFS4ERR_NAMETOOLONG, ENAMETOOLONG },
168 { NFS4ERR_NOTEMPTY, ENOTEMPTY },
169 { NFS4ERR_DQUOT, EDQUOT },
170 { NFS4ERR_STALE, ESTALE },
171 { NFS4ERR_BADHANDLE, EBADHANDLE },
172 { NFS4ERR_BAD_COOKIE, EBADCOOKIE },
173 { NFS4ERR_NOTSUPP, ENOTSUPP },
174 { NFS4ERR_TOOSMALL, ETOOSMALL },
175 { NFS4ERR_SERVERFAULT, ESERVERFAULT },
176 { NFS4ERR_BADTYPE, EBADTYPE },
177 { NFS4ERR_LOCKED, EAGAIN },
178 { NFS4ERR_RESOURCE, EREMOTEIO },
179 { NFS4ERR_SYMLINK, ELOOP },
180 { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP },
181 { NFS4ERR_DEADLOCK, EDEADLK },
182 { -1, EIO }
183};
184 96
185static int 97static __be32 *xdr_encode_empty_array(__be32 *p)
186nfs_cb_stat_to_errno(int stat)
187{ 98{
188 int i; 99 *p++ = xdr_zero;
189 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { 100 return p;
190 if (nfs_cb_errtbl[i].stat == stat)
191 return nfs_cb_errtbl[i].errno;
192 }
193 /* If we cannot translate the error, the recovery routines should
194 * handle it.
195 * Note: remaining NFSv4 error codes have values > 10000, so should
196 * not conflict with native Linux error codes.
197 */
198 return stat;
199} 101}
200 102
201/* 103/*
202 * XDR encode 104 * Encode/decode NFSv4 CB basic data types
105 *
106 * Basic NFSv4 callback data types are defined in section 15 of RFC
107 * 3530: "Network File System (NFS) version 4 Protocol" and section
108 * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
109 * 1 Protocol"
203 */ 110 */
204 111
205static void 112/*
206encode_stateid(struct xdr_stream *xdr, stateid_t *sid) 113 * nfs_cb_opnum4
114 *
115 * enum nfs_cb_opnum4 {
116 * OP_CB_GETATTR = 3,
117 * ...
118 * };
119 */
120enum nfs_cb_opnum4 {
121 OP_CB_GETATTR = 3,
122 OP_CB_RECALL = 4,
123 OP_CB_LAYOUTRECALL = 5,
124 OP_CB_NOTIFY = 6,
125 OP_CB_PUSH_DELEG = 7,
126 OP_CB_RECALL_ANY = 8,
127 OP_CB_RECALLABLE_OBJ_AVAIL = 9,
128 OP_CB_RECALL_SLOT = 10,
129 OP_CB_SEQUENCE = 11,
130 OP_CB_WANTS_CANCELLED = 12,
131 OP_CB_NOTIFY_LOCK = 13,
132 OP_CB_NOTIFY_DEVICEID = 14,
133 OP_CB_ILLEGAL = 10044
134};
135
136static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
207{ 137{
208 __be32 *p; 138 __be32 *p;
209 139
210 RESERVE_SPACE(sizeof(stateid_t)); 140 p = xdr_reserve_space(xdr, 4);
211 WRITE32(sid->si_generation); 141 *p = cpu_to_be32(op);
212 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
213} 142}
214 143
215static void 144/*
216encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 145 * nfs_fh4
146 *
147 * typedef opaque nfs_fh4<NFS4_FHSIZE>;
148 */
149static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
217{ 150{
218 __be32 * p; 151 u32 length = fh->fh_size;
152 __be32 *p;
219 153
220 RESERVE_SPACE(16); 154 BUG_ON(length > NFS4_FHSIZE);
221 WRITE32(0); /* tag length is always 0 */ 155 p = xdr_reserve_space(xdr, 4 + length);
222 WRITE32(hdr->minorversion); 156 xdr_encode_opaque(p, &fh->fh_base, length);
223 WRITE32(hdr->ident);
224 hdr->nops_p = p;
225 WRITE32(hdr->nops);
226} 157}
227 158
228static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) 159/*
160 * stateid4
161 *
162 * struct stateid4 {
163 * uint32_t seqid;
164 * opaque other[12];
165 * };
166 */
167static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
229{ 168{
230 *hdr->nops_p = htonl(hdr->nops); 169 __be32 *p;
170
171 p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
172 *p++ = cpu_to_be32(sid->si_generation);
173 xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE);
231} 174}
232 175
233static void 176/*
234encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, 177 * sessionid4
235 struct nfs4_cb_compound_hdr *hdr) 178 *
179 * typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
180 */
181static void encode_sessionid4(struct xdr_stream *xdr,
182 const struct nfsd4_session *session)
236{ 183{
237 __be32 *p; 184 __be32 *p;
238 int len = dp->dl_fh.fh_size; 185
239 186 p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
240 RESERVE_SPACE(4); 187 xdr_encode_opaque_fixed(p, session->se_sessionid.data,
241 WRITE32(OP_CB_RECALL); 188 NFS4_MAX_SESSIONID_LEN);
242 encode_stateid(xdr, &dp->dl_stateid);
243 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
244 WRITE32(0); /* truncate optimization not implemented */
245 WRITE32(len);
246 WRITEMEM(&dp->dl_fh.fh_base, len);
247 hdr->nops++;
248} 189}
249 190
250static void 191/*
251encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, 192 * nfsstat4
252 struct nfs4_cb_compound_hdr *hdr) 193 */
253{ 194static const struct {
254 __be32 *p; 195 int stat;
196 int errno;
197} nfs_cb_errtbl[] = {
198 { NFS4_OK, 0 },
199 { NFS4ERR_PERM, -EPERM },
200 { NFS4ERR_NOENT, -ENOENT },
201 { NFS4ERR_IO, -EIO },
202 { NFS4ERR_NXIO, -ENXIO },
203 { NFS4ERR_ACCESS, -EACCES },
204 { NFS4ERR_EXIST, -EEXIST },
205 { NFS4ERR_XDEV, -EXDEV },
206 { NFS4ERR_NOTDIR, -ENOTDIR },
207 { NFS4ERR_ISDIR, -EISDIR },
208 { NFS4ERR_INVAL, -EINVAL },
209 { NFS4ERR_FBIG, -EFBIG },
210 { NFS4ERR_NOSPC, -ENOSPC },
211 { NFS4ERR_ROFS, -EROFS },
212 { NFS4ERR_MLINK, -EMLINK },
213 { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
214 { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
215 { NFS4ERR_DQUOT, -EDQUOT },
216 { NFS4ERR_STALE, -ESTALE },
217 { NFS4ERR_BADHANDLE, -EBADHANDLE },
218 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
219 { NFS4ERR_NOTSUPP, -ENOTSUPP },
220 { NFS4ERR_TOOSMALL, -ETOOSMALL },
221 { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
222 { NFS4ERR_BADTYPE, -EBADTYPE },
223 { NFS4ERR_LOCKED, -EAGAIN },
224 { NFS4ERR_RESOURCE, -EREMOTEIO },
225 { NFS4ERR_SYMLINK, -ELOOP },
226 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
227 { NFS4ERR_DEADLOCK, -EDEADLK },
228 { -1, -EIO }
229};
255 230
256 if (hdr->minorversion == 0) 231/*
257 return; 232 * If we cannot translate the error, the recovery routines should
233 * handle it.
234 *
235 * Note: remaining NFSv4 error codes have values > 10000, so should
236 * not conflict with native Linux error codes.
237 */
238static int nfs_cb_stat_to_errno(int status)
239{
240 int i;
258 241
259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); 242 for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
243 if (nfs_cb_errtbl[i].stat == status)
244 return nfs_cb_errtbl[i].errno;
245 }
260 246
261 WRITE32(OP_CB_SEQUENCE); 247 dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
262 WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); 248 return -status;
263 WRITE32(args->cbs_clp->cl_cb_seq_nr);
264 WRITE32(0); /* slotid, always 0 */
265 WRITE32(0); /* highest slotid always 0 */
266 WRITE32(0); /* cachethis always 0 */
267 WRITE32(0); /* FIXME: support referring_call_lists */
268 hdr->nops++;
269} 249}
270 250
271static int 251static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
272nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) 252 enum nfsstat4 *status)
273{ 253{
274 struct xdr_stream xdrs, *xdr = &xdrs; 254 __be32 *p;
255 u32 op;
275 256
276 xdr_init_encode(&xdrs, &req->rq_snd_buf, p); 257 p = xdr_inline_decode(xdr, 4 + 4);
277 RESERVE_SPACE(0); 258 if (unlikely(p == NULL))
259 goto out_overflow;
260 op = be32_to_cpup(p++);
261 if (unlikely(op != expected))
262 goto out_unexpected;
263 *status = be32_to_cpup(p);
278 return 0; 264 return 0;
265out_overflow:
266 print_overflow_msg(__func__, xdr);
267 return -EIO;
268out_unexpected:
269 dprintk("NFSD: Callback server returned operation %d but "
270 "we issued a request for %d\n", op, expected);
271 return -EIO;
279} 272}
280 273
281static int 274/*
282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, 275 * CB_COMPOUND4args
283 struct nfs4_rpc_args *rpc_args) 276 *
277 * struct CB_COMPOUND4args {
278 * utf8str_cs tag;
279 * uint32_t minorversion;
280 * uint32_t callback_ident;
281 * nfs_cb_argop4 argarray<>;
282 * };
283*/
284static void encode_cb_compound4args(struct xdr_stream *xdr,
285 struct nfs4_cb_compound_hdr *hdr)
284{ 286{
285 struct xdr_stream xdr; 287 __be32 * p;
286 struct nfs4_delegation *args = rpc_args->args_op;
287 struct nfs4_cb_compound_hdr hdr = {
288 .ident = args->dl_ident,
289 .minorversion = rpc_args->args_seq.cbs_minorversion,
290 };
291 288
292 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 289 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
293 encode_cb_compound_hdr(&xdr, &hdr); 290 p = xdr_encode_empty_array(p); /* empty tag */
294 encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); 291 *p++ = cpu_to_be32(hdr->minorversion);
295 encode_cb_recall(&xdr, args, &hdr); 292 *p++ = cpu_to_be32(hdr->ident);
296 encode_cb_nops(&hdr); 293
294 hdr->nops_p = p;
295 *p = cpu_to_be32(hdr->nops); /* argarray element count */
296}
297
298/*
299 * Update argarray element count
300 */
301static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
302{
303 BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS);
304 *hdr->nops_p = cpu_to_be32(hdr->nops);
305}
306
307/*
308 * CB_COMPOUND4res
309 *
310 * struct CB_COMPOUND4res {
311 * nfsstat4 status;
312 * utf8str_cs tag;
313 * nfs_cb_resop4 resarray<>;
314 * };
315 */
316static int decode_cb_compound4res(struct xdr_stream *xdr,
317 struct nfs4_cb_compound_hdr *hdr)
318{
319 u32 length;
320 __be32 *p;
321
322 p = xdr_inline_decode(xdr, 4 + 4);
323 if (unlikely(p == NULL))
324 goto out_overflow;
325 hdr->status = be32_to_cpup(p++);
326 /* Ignore the tag */
327 length = be32_to_cpup(p++);
328 p = xdr_inline_decode(xdr, length + 4);
329 if (unlikely(p == NULL))
330 goto out_overflow;
331 hdr->nops = be32_to_cpup(p);
297 return 0; 332 return 0;
333out_overflow:
334 print_overflow_msg(__func__, xdr);
335 return -EIO;
298} 336}
299 337
338/*
339 * CB_RECALL4args
340 *
341 * struct CB_RECALL4args {
342 * stateid4 stateid;
343 * bool truncate;
344 * nfs_fh4 fh;
345 * };
346 */
347static void encode_cb_recall4args(struct xdr_stream *xdr,
348 const struct nfs4_delegation *dp,
349 struct nfs4_cb_compound_hdr *hdr)
350{
351 __be32 *p;
352
353 encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
354 encode_stateid4(xdr, &dp->dl_stateid);
300 355
301static int 356 p = xdr_reserve_space(xdr, 4);
302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ 357 *p++ = xdr_zero; /* truncate */
303 __be32 *p;
304 u32 taglen;
305 358
306 READ_BUF(8); 359 encode_nfs_fh4(xdr, &dp->dl_fh);
307 READ32(hdr->status); 360
308 /* We've got no use for the tag; ignore it: */ 361 hdr->nops++;
309 READ32(taglen);
310 READ_BUF(taglen + 4);
311 p += XDR_QUADLEN(taglen);
312 READ32(hdr->nops);
313 return 0;
314} 362}
315 363
316static int 364/*
317decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 365 * CB_SEQUENCE4args
366 *
367 * struct CB_SEQUENCE4args {
368 * sessionid4 csa_sessionid;
369 * sequenceid4 csa_sequenceid;
370 * slotid4 csa_slotid;
371 * slotid4 csa_highest_slotid;
372 * bool csa_cachethis;
373 * referring_call_list4 csa_referring_call_lists<>;
374 * };
375 */
376static void encode_cb_sequence4args(struct xdr_stream *xdr,
377 const struct nfsd4_callback *cb,
378 struct nfs4_cb_compound_hdr *hdr)
318{ 379{
380 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
319 __be32 *p; 381 __be32 *p;
320 u32 op; 382
321 int32_t nfserr; 383 if (hdr->minorversion == 0)
322 384 return;
323 READ_BUF(8); 385
324 READ32(op); 386 encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
325 if (op != expected) { 387 encode_sessionid4(xdr, session);
326 dprintk("NFSD: decode_cb_op_hdr: Callback server returned " 388
327 " operation %d but we issued a request for %d\n", 389 p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
328 op, expected); 390 *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */
329 return -EIO; 391 *p++ = xdr_zero; /* csa_slotid */
330 } 392 *p++ = xdr_zero; /* csa_highest_slotid */
331 READ32(nfserr); 393 *p++ = xdr_zero; /* csa_cachethis */
332 if (nfserr != NFS_OK) 394 xdr_encode_empty_array(p); /* csa_referring_call_lists */
333 return -nfs_cb_stat_to_errno(nfserr); 395
334 return 0; 396 hdr->nops++;
335} 397}
336 398
337/* 399/*
400 * CB_SEQUENCE4resok
401 *
402 * struct CB_SEQUENCE4resok {
403 * sessionid4 csr_sessionid;
404 * sequenceid4 csr_sequenceid;
405 * slotid4 csr_slotid;
406 * slotid4 csr_highest_slotid;
407 * slotid4 csr_target_highest_slotid;
408 * };
409 *
410 * union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
411 * case NFS4_OK:
412 * CB_SEQUENCE4resok csr_resok4;
413 * default:
414 * void;
415 * };
416 *
338 * Our current back channel implementation supports a single backchannel 417 * Our current back channel implementation supports a single backchannel
339 * with a single slot. 418 * with a single slot.
340 */ 419 */
341static int 420static int decode_cb_sequence4resok(struct xdr_stream *xdr,
342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res, 421 struct nfsd4_callback *cb)
343 struct rpc_rqst *rqstp)
344{ 422{
423 struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
345 struct nfs4_sessionid id; 424 struct nfs4_sessionid id;
346 int status; 425 int status;
347 u32 dummy;
348 __be32 *p; 426 __be32 *p;
427 u32 dummy;
349 428
350 if (res->cbs_minorversion == 0) 429 status = -ESERVERFAULT;
351 return 0;
352
353 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
354 if (status)
355 return status;
356 430
357 /* 431 /*
358 * If the server returns different values for sessionID, slotID or 432 * If the server returns different values for sessionID, slotID or
359 * sequence number, the server is looney tunes. 433 * sequence number, the server is looney tunes.
360 */ 434 */
361 status = -ESERVERFAULT; 435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
362 436 if (unlikely(p == NULL))
363 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); 437 goto out_overflow;
364 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
365 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); 439 if (memcmp(id.data, session->se_sessionid.data,
366 if (memcmp(id.data, res->cbs_clp->cl_sessionid.data, 440 NFS4_MAX_SESSIONID_LEN) != 0) {
367 NFS4_MAX_SESSIONID_LEN)) { 441 dprintk("NFS: %s Invalid session id\n", __func__);
368 dprintk("%s Invalid session id\n", __func__);
369 goto out; 442 goto out;
370 } 443 }
371 READ32(dummy); 444 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
372 if (dummy != res->cbs_clp->cl_cb_seq_nr) { 445
373 dprintk("%s Invalid sequence number\n", __func__); 446 dummy = be32_to_cpup(p++);
447 if (dummy != session->se_cb_seq_nr) {
448 dprintk("NFS: %s Invalid sequence number\n", __func__);
374 goto out; 449 goto out;
375 } 450 }
376 READ32(dummy); /* slotid must be 0 */ 451
452 dummy = be32_to_cpup(p++);
377 if (dummy != 0) { 453 if (dummy != 0) {
378 dprintk("%s Invalid slotid\n", __func__); 454 dprintk("NFS: %s Invalid slotid\n", __func__);
379 goto out; 455 goto out;
380 } 456 }
381 /* FIXME: process highest slotid and target highest slotid */ 457
458 /*
459 * FIXME: process highest slotid and target highest slotid
460 */
382 status = 0; 461 status = 0;
383out: 462out:
384 return status; 463 return status;
464out_overflow:
465 print_overflow_msg(__func__, xdr);
466 return -EIO;
467}
468
469static int decode_cb_sequence4res(struct xdr_stream *xdr,
470 struct nfsd4_callback *cb)
471{
472 enum nfsstat4 nfserr;
473 int status;
474
475 if (cb->cb_minorversion == 0)
476 return 0;
477
478 status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
479 if (unlikely(status))
480 goto out;
481 if (unlikely(nfserr != NFS4_OK))
482 goto out_default;
483 status = decode_cb_sequence4resok(xdr, cb);
484out:
485 return status;
486out_default:
 487 return nfs_cb_stat_to_errno(nfserr);
488}
489
490/*
491 * NFSv4.0 and NFSv4.1 XDR encode functions
492 *
493 * NFSv4.0 callback argument types are defined in section 15 of RFC
494 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
495 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
496 * Protocol".
497 */
498
499/*
500 * NB: Without this zero space reservation, callbacks over krb5p fail
501 */
502static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
503 void *__unused)
504{
505 xdr_reserve_space(xdr, 0);
506}
507
508/*
509 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
510 */
511static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
512 const struct nfsd4_callback *cb)
513{
514 const struct nfs4_delegation *args = cb->cb_op;
515 struct nfs4_cb_compound_hdr hdr = {
516 .ident = cb->cb_clp->cl_cb_ident,
517 .minorversion = cb->cb_minorversion,
518 };
519
520 encode_cb_compound4args(xdr, &hdr);
521 encode_cb_sequence4args(xdr, cb, &hdr);
522 encode_cb_recall4args(xdr, args, &hdr);
523 encode_cb_nops(&hdr);
385} 524}
386 525
387 526
388static int 527/*
389nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) 528 * NFSv4.0 and NFSv4.1 XDR decode functions
529 *
530 * NFSv4.0 callback result types are defined in section 15 of RFC
531 * 3530: "Network File System (NFS) version 4 Protocol" and section 20
532 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
533 * Protocol".
534 */
535
536static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
537 void *__unused)
390{ 538{
391 return 0; 539 return 0;
392} 540}
393 541
394static int 542/*
395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, 543 * 20.2. Operation 4: CB_RECALL - Recall a Delegation
396 struct nfsd4_cb_sequence *seq) 544 */
545static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
546 struct xdr_stream *xdr,
547 struct nfsd4_callback *cb)
397{ 548{
398 struct xdr_stream xdr;
399 struct nfs4_cb_compound_hdr hdr; 549 struct nfs4_cb_compound_hdr hdr;
550 enum nfsstat4 nfserr;
400 int status; 551 int status;
401 552
402 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 553 status = decode_cb_compound4res(xdr, &hdr);
403 status = decode_cb_compound_hdr(&xdr, &hdr); 554 if (unlikely(status))
404 if (status)
405 goto out; 555 goto out;
406 if (seq) { 556
407 status = decode_cb_sequence(&xdr, seq, rqstp); 557 if (cb != NULL) {
408 if (status) 558 status = decode_cb_sequence4res(xdr, cb);
559 if (unlikely(status))
409 goto out; 560 goto out;
410 } 561 }
411 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); 562
563 status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
564 if (unlikely(status))
565 goto out;
566 if (unlikely(nfserr != NFS4_OK))
567 goto out_default;
412out: 568out:
413 return status; 569 return status;
570out_default:
 571 return nfs_cb_stat_to_errno(nfserr);
414} 572}
415 573
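
All of the encoders above follow the same xdr_stream discipline: reserve buffer space first, then store big-endian words through the returned pointer. A distilled sketch (encode_u32 is an illustrative name, and the check for a failed reservation is omitted, as in most of the callers above):

#include <linux/sunrpc/xdr.h>

/* Distilled form of the encode pattern used throughout this file. */
static void encode_u32(struct xdr_stream *xdr, u32 value)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 4);	/* one 4-byte XDR word */
	*p = cpu_to_be32(value);	/* host to wire byte order */
}
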
416/* 574/*
417 * RPC procedure tables 575 * RPC procedure tables
418 */ 576 */
419#define PROC(proc, call, argtype, restype) \ 577#define PROC(proc, call, argtype, restype) \
420[NFSPROC4_CLNT_##proc] = { \ 578[NFSPROC4_CLNT_##proc] = { \
421 .p_proc = NFSPROC4_CB_##call, \ 579 .p_proc = NFSPROC4_CB_##call, \
422 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 580 .p_encode = (kxdreproc_t)nfs4_xdr_enc_##argtype, \
423 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 581 .p_decode = (kxdrdproc_t)nfs4_xdr_dec_##restype, \
424 .p_arglen = NFS4_##argtype##_sz, \ 582 .p_arglen = NFS4_enc_##argtype##_sz, \
425 .p_replen = NFS4_##restype##_sz, \ 583 .p_replen = NFS4_dec_##restype##_sz, \
426 .p_statidx = NFSPROC4_CB_##call, \ 584 .p_statidx = NFSPROC4_CB_##call, \
427 .p_name = #proc, \ 585 .p_name = #proc, \
428} 586}
429 587
430static struct rpc_procinfo nfs4_cb_procedures[] = { 588static struct rpc_procinfo nfs4_cb_procedures[] = {
431 PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), 589 PROC(CB_NULL, NULL, cb_null, cb_null),
432 PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), 590 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
433}; 591};
434 592
435static struct rpc_version nfs_cb_version4 = { 593static struct rpc_version nfs_cb_version4 = {
436/* 594/*
437 * Note on the callback rpc program version number: despite language in rfc 595 * Note on the callback rpc program version number: despite language in rfc
438 * 5661 section 18.36.3 requiring servers to use 4 in this field, the 596 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
@@ -440,29 +598,29 @@ static struct rpc_version nfs_cb_version4 = {
440 * in practice that appears to be what implementations use. The section 598 * in practice that appears to be what implementations use. The section
441 * 18.36.3 language is expected to be fixed in an erratum. 599 * 18.36.3 language is expected to be fixed in an erratum.
442 */ 600 */
443 .number = 1, 601 .number = 1,
444 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 602 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
445 .procs = nfs4_cb_procedures 603 .procs = nfs4_cb_procedures
446}; 604};
447 605
448static struct rpc_version * nfs_cb_version[] = { 606static struct rpc_version *nfs_cb_version[] = {
449 &nfs_cb_version4, 607 &nfs_cb_version4,
450}; 608};
451 609
452static struct rpc_program cb_program; 610static struct rpc_program cb_program;
453 611
454static struct rpc_stat cb_stats = { 612static struct rpc_stat cb_stats = {
455 .program = &cb_program 613 .program = &cb_program
456}; 614};
457 615
458#define NFS4_CALLBACK 0x40000000 616#define NFS4_CALLBACK 0x40000000
459static struct rpc_program cb_program = { 617static struct rpc_program cb_program = {
460 .name = "nfs4_cb", 618 .name = "nfs4_cb",
461 .number = NFS4_CALLBACK, 619 .number = NFS4_CALLBACK,
462 .nrvers = ARRAY_SIZE(nfs_cb_version), 620 .nrvers = ARRAY_SIZE(nfs_cb_version),
463 .version = nfs_cb_version, 621 .version = nfs_cb_version,
464 .stats = &cb_stats, 622 .stats = &cb_stats,
465 .pipe_dir_name = "/nfsd4_cb", 623 .pipe_dir_name = "/nfsd4_cb",
466}; 624};
467 625
468static int max_cb_time(void) 626static int max_cb_time(void)
@@ -470,33 +628,40 @@ static int max_cb_time(void)
470 return max(nfsd4_lease/10, (time_t)1) * HZ; 628 return max(nfsd4_lease/10, (time_t)1) * HZ;
471} 629}
472 630
473/* Reference counting, callback cleanup, etc., all look racy as heck.
474 * And why is cl_cb_set an atomic? */
475 631
476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 632static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
477{ 633{
478 struct rpc_timeout timeparms = { 634 struct rpc_timeout timeparms = {
479 .to_initval = max_cb_time(), 635 .to_initval = max_cb_time(),
480 .to_retries = 0, 636 .to_retries = 0,
481 }; 637 };
482 struct rpc_create_args args = { 638 struct rpc_create_args args = {
483 .protocol = XPRT_TRANSPORT_TCP, 639 .net = &init_net,
484 .address = (struct sockaddr *) &cb->cb_addr, 640 .address = (struct sockaddr *) &conn->cb_addr,
485 .addrsize = cb->cb_addrlen, 641 .addrsize = conn->cb_addrlen,
642 .saddress = (struct sockaddr *) &conn->cb_saddr,
486 .timeout = &timeparms, 643 .timeout = &timeparms,
487 .program = &cb_program, 644 .program = &cb_program,
488 .prognumber = cb->cb_prog,
489 .version = 0, 645 .version = 0,
490 .authflavor = clp->cl_flavor, 646 .authflavor = clp->cl_flavor,
491 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 647 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
492 .client_name = clp->cl_principal,
493 }; 648 };
494 struct rpc_clnt *client; 649 struct rpc_clnt *client;
495 650
496 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 651 if (clp->cl_minorversion == 0) {
497 return -EINVAL; 652 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
498 if (cb->cb_minorversion) { 653 return -EINVAL;
499 args.bc_xprt = cb->cb_xprt; 654 args.client_name = clp->cl_principal;
 655 args.prognumber = conn->cb_prog;
656 args.protocol = XPRT_TRANSPORT_TCP;
657 clp->cl_cb_ident = conn->cb_ident;
658 } else {
659 if (!conn->cb_xprt)
660 return -EINVAL;
661 clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
662 clp->cl_cb_session = ses;
663 args.bc_xprt = conn->cb_xprt;
664 args.prognumber = clp->cl_cb_session->se_cb_prog;
500 args.protocol = XPRT_TRANSPORT_BC_TCP; 665 args.protocol = XPRT_TRANSPORT_BC_TCP;
501 } 666 }
502 /* Create RPC client */ 667 /* Create RPC client */
@@ -506,7 +671,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
506 PTR_ERR(client)); 671 PTR_ERR(client));
507 return PTR_ERR(client); 672 return PTR_ERR(client);
508 } 673 }
509 nfsd4_set_callback_client(clp, client); 674 clp->cl_cb_client = client;
510 return 0; 675 return 0;
511 676
512} 677}
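
The branch above picks the transport per minor version: for NFSv4.0 the server dials a fresh TCP connection to the client's advertised callback address, while for NFSv4.1 it reuses the client's existing connection as a backchannel. A decision sketch (illustrative helper, not kernel code):

#include <linux/types.h>

/* Sketch of the transport choice made in setup_callback_client(). */
static const char *cb_transport_choice(u32 minorversion, bool have_bc_xprt)
{
	if (minorversion == 0)
		return "XPRT_TRANSPORT_TCP";	/* dial the client back */
	if (!have_bc_xprt)
		return "-EINVAL";		/* no backchannel bound */
	return "XPRT_TRANSPORT_BC_TCP";		/* reuse the fore channel */
}
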
@@ -517,17 +682,25 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
517 (int)clp->cl_name.len, clp->cl_name.data, reason); 682 (int)clp->cl_name.len, clp->cl_name.data, reason);
518} 683}
519 684
685static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
686{
687 clp->cl_cb_state = NFSD4_CB_DOWN;
688 warn_no_callback_path(clp, reason);
689}
690
520static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 691static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
521{ 692{
522 struct nfs4_client *clp = calldata; 693 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
523 694
524 if (task->tk_status) 695 if (task->tk_status)
525 warn_no_callback_path(clp, task->tk_status); 696 nfsd4_mark_cb_down(clp, task->tk_status);
526 else 697 else
527 atomic_set(&clp->cl_cb_set, 1); 698 clp->cl_cb_state = NFSD4_CB_UP;
528} 699}
529 700
530static const struct rpc_call_ops nfsd4_cb_probe_ops = { 701static const struct rpc_call_ops nfsd4_cb_probe_ops = {
702 /* XXX: release method to ensure we set the cb channel down if
703 * necessary on early failure? */
531 .rpc_call_done = nfsd4_cb_probe_done, 704 .rpc_call_done = nfsd4_cb_probe_done,
532}; 705};
533 706
@@ -543,38 +716,54 @@ int set_callback_cred(void)
543 return 0; 716 return 0;
544} 717}
545 718
719static struct workqueue_struct *callback_wq;
546 720
547void do_probe_callback(struct nfs4_client *clp) 721static void run_nfsd4_cb(struct nfsd4_callback *cb)
548{ 722{
549 struct rpc_message msg = { 723 queue_work(callback_wq, &cb->cb_work);
550 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 724}
551 .rpc_argp = clp, 725
552 .rpc_cred = callback_cred 726static void do_probe_callback(struct nfs4_client *clp)
553 }; 727{
554 int status; 728 struct nfsd4_callback *cb = &clp->cl_cb_null;
729
730 cb->cb_op = NULL;
731 cb->cb_clp = clp;
732
733 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
734 cb->cb_msg.rpc_argp = NULL;
735 cb->cb_msg.rpc_resp = NULL;
736 cb->cb_msg.rpc_cred = callback_cred;
555 737
556 status = rpc_call_async(clp->cl_cb_client, &msg, 738 cb->cb_ops = &nfsd4_cb_probe_ops;
557 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 739
558 &nfsd4_cb_probe_ops, (void *)clp); 740 run_nfsd4_cb(cb);
559 if (status)
560 warn_no_callback_path(clp, status);
561} 741}
562 742
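run_nfsd4_cb() above just queues the callback's work item; a single-threaded workqueue then serializes all callback processing, which is what lets later code touch per-client callback state without extra locking. A minimal pthread model of that enqueue/serialize split (userspace sketch, not the kernel workqueue API):

#include <pthread.h>
#include <stdio.h>

struct work {
	void (*fn)(struct work *);
	struct work *next;
};

static struct work *queue_head;
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t qcond = PTHREAD_COND_INITIALIZER;

/* Like queue_work(): hand off and return immediately. */
static void queue_work_item(struct work *w)
{
	pthread_mutex_lock(&qlock);
	w->next = queue_head;
	queue_head = w;
	pthread_cond_signal(&qcond);
	pthread_mutex_unlock(&qlock);
}

/* The lone worker thread: items never run concurrently. */
static void *worker(void *unused)
{
	struct work *w;

	(void)unused;
	pthread_mutex_lock(&qlock);
	while (!queue_head)
		pthread_cond_wait(&qcond, &qlock);
	w = queue_head;
	queue_head = w->next;
	pthread_mutex_unlock(&qlock);
	w->fn(w);			/* runs outside the lock */
	return NULL;
}

static void probe(struct work *w) { (void)w; puts("CB_NULL probe sent"); }

int main(void)
{
	pthread_t t;
	struct work probe_work = { .fn = probe };

	pthread_create(&t, NULL, worker, NULL);
	queue_work_item(&probe_work);	/* caller does not wait */
	pthread_join(t, NULL);		/* crude stand-in for flush_workqueue() */
	return 0;
}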
563/* 743/*
564 * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 744 * Poke the callback thread to process any updates to the callback
745 * parameters, and send a null probe.
565 */ 746 */
566void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 747void nfsd4_probe_callback(struct nfs4_client *clp)
567{ 748{
568 int status; 749 /* XXX: atomicity? Also, should we be using cl_cb_flags? */
750 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
751 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
752 do_probe_callback(clp);
753}
569 754
570 BUG_ON(atomic_read(&clp->cl_cb_set)); 755void nfsd4_probe_callback_sync(struct nfs4_client *clp)
756{
757 nfsd4_probe_callback(clp);
758 flush_workqueue(callback_wq);
759}
571 760
572 status = setup_callback_client(clp, cb); 761void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
573 if (status) { 762{
574 warn_no_callback_path(clp, status); 763 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
575 return; 764 spin_lock(&clp->cl_lock);
576 } 765 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
577 do_probe_callback(clp); 766 spin_unlock(&clp->cl_lock);
578} 767}
579 768
580/* 769/*
@@ -582,33 +771,14 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
582 * If the slot is available, then mark it busy. Otherwise, set the 771 * If the slot is available, then mark it busy. Otherwise, set the
583 * thread for sleeping on the callback RPC wait queue. 772 * thread for sleeping on the callback RPC wait queue.
584 */ 773 */
585static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, 774static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
586 struct rpc_task *task)
587{ 775{
588 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
589 u32 *ptr = (u32 *)clp->cl_sessionid.data;
590 int status = 0;
591
592 dprintk("%s: %u:%u:%u:%u\n", __func__,
593 ptr[0], ptr[1], ptr[2], ptr[3]);
594
595 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { 776 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
596 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); 777 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
597 dprintk("%s slot is busy\n", __func__); 778 dprintk("%s slot is busy\n", __func__);
598 status = -EAGAIN; 779 return false;
599 goto out;
600 } 780 }
601 781 return true;
602 /*
603 * We'll need the clp during XDR encoding and decoding,
604 * and the sequence during decoding to verify the reply
605 */
606 args->args_seq.cbs_clp = clp;
607 task->tk_msg.rpc_resp = &args->args_seq;
608
609out:
610 dprintk("%s status=%d\n", __func__, status);
611 return status;
612} 782}
613 783
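With the sequence bookkeeping moved into the XDR layer, nfsd41_cb_get_slot() is nothing but a test_and_set_bit() gate on the single backchannel slot: either claim it, or park the task to be woken when it frees. The same gate expressed with C11 atomics (illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag cb_slot_busy = ATOMIC_FLAG_INIT;

static bool cb_get_slot(void)
{
	if (atomic_flag_test_and_set(&cb_slot_busy)) {
		/* busy: the real code sleeps on cl_cb_waitq here */
		return false;
	}
	return true;
}

/* Mirrors clear_bit() + rpc_wake_up_next() in nfsd4_cb_done(). */
static void cb_put_slot(void)
{
	atomic_flag_clear(&cb_slot_busy);
}

int main(void)
{
	printf("first claim: %s\n", cb_get_slot() ? "got slot" : "busy");
	printf("second claim: %s\n", cb_get_slot() ? "got slot" : "busy");
	cb_put_slot();
	printf("after release: %s\n", cb_get_slot() ? "got slot" : "busy");
	return 0;
}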
614/* 784/*
@@ -617,42 +787,42 @@ out:
617 */ 787 */
618static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) 788static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
619{ 789{
620 struct nfs4_delegation *dp = calldata; 790 struct nfsd4_callback *cb = calldata;
791 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
621 struct nfs4_client *clp = dp->dl_client; 792 struct nfs4_client *clp = dp->dl_client;
622 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; 793 u32 minorversion = clp->cl_minorversion;
623 u32 minorversion = clp->cl_cb_conn.cb_minorversion;
624 int status = 0;
625 794
626 args->args_seq.cbs_minorversion = minorversion; 795 cb->cb_minorversion = minorversion;
627 if (minorversion) { 796 if (minorversion) {
628 status = nfsd41_cb_setup_sequence(clp, task); 797 if (!nfsd41_cb_get_slot(clp, task))
629 if (status) {
630 if (status != -EAGAIN) {
631 /* terminate rpc task */
632 task->tk_status = status;
633 task->tk_action = NULL;
634 }
635 return; 798 return;
636 }
637 } 799 }
800 spin_lock(&clp->cl_lock);
801 if (list_empty(&cb->cb_per_client)) {
802 /* This is the first call, not a restart */
803 cb->cb_done = false;
804 list_add(&cb->cb_per_client, &clp->cl_callbacks);
805 }
806 spin_unlock(&clp->cl_lock);
638 rpc_call_start(task); 807 rpc_call_start(task);
639} 808}
640 809
641static void nfsd4_cb_done(struct rpc_task *task, void *calldata) 810static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
642{ 811{
643 struct nfs4_delegation *dp = calldata; 812 struct nfsd4_callback *cb = calldata;
813 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
644 struct nfs4_client *clp = dp->dl_client; 814 struct nfs4_client *clp = dp->dl_client;
645 815
646 dprintk("%s: minorversion=%d\n", __func__, 816 dprintk("%s: minorversion=%d\n", __func__,
647 clp->cl_cb_conn.cb_minorversion); 817 clp->cl_minorversion);
648 818
649 if (clp->cl_cb_conn.cb_minorversion) { 819 if (clp->cl_minorversion) {
650 /* No need for lock, access serialized in nfsd4_cb_prepare */ 820 /* No need for lock, access serialized in nfsd4_cb_prepare */
651 ++clp->cl_cb_seq_nr; 821 ++clp->cl_cb_session->se_cb_seq_nr;
652 clear_bit(0, &clp->cl_cb_slot_busy); 822 clear_bit(0, &clp->cl_cb_slot_busy);
653 rpc_wake_up_next(&clp->cl_cb_waitq); 823 rpc_wake_up_next(&clp->cl_cb_waitq);
654 dprintk("%s: freed slot, new seqid=%d\n", __func__, 824 dprintk("%s: freed slot, new seqid=%d\n", __func__,
655 clp->cl_cb_seq_nr); 825 clp->cl_cb_session->se_cb_seq_nr);
656 826
657 /* We're done looking into the sequence information */ 827 /* We're done looking into the sequence information */
658 task->tk_msg.rpc_resp = NULL; 828 task->tk_msg.rpc_resp = NULL;
@@ -662,21 +832,25 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
662 832
663static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 833static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
664{ 834{
665 struct nfs4_delegation *dp = calldata; 835 struct nfsd4_callback *cb = calldata;
836 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
666 struct nfs4_client *clp = dp->dl_client; 837 struct nfs4_client *clp = dp->dl_client;
667 struct rpc_clnt *current_rpc_client = clp->cl_cb_client; 838 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
668 839
669 nfsd4_cb_done(task, calldata); 840 nfsd4_cb_done(task, calldata);
670 841
671 if (current_rpc_client == NULL) { 842 if (current_rpc_client != task->tk_client) {
672 /* We're shutting down; give up. */ 843 /* We're shutting down or changing cl_cb_client; leave
673 /* XXX: err, or is it ok just to fall through 844 * it to nfsd4_process_cb_update to restart the call if
674 * and rpc_restart_call? */ 845 * necessary. */
675 return; 846 return;
676 } 847 }
677 848
849 if (cb->cb_done)
850 return;
678 switch (task->tk_status) { 851 switch (task->tk_status) {
679 case 0: 852 case 0:
853 cb->cb_done = true;
680 return; 854 return;
681 case -EBADHANDLE: 855 case -EBADHANDLE:
682 case -NFS4ERR_BAD_STATEID: 856 case -NFS4ERR_BAD_STATEID:
@@ -685,31 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
685 break; 859 break;
686 default: 860 default:
687 /* Network partition? */ 861 /* Network partition? */
688 atomic_set(&clp->cl_cb_set, 0); 862 nfsd4_mark_cb_down(clp, task->tk_status);
689 warn_no_callback_path(clp, task->tk_status);
690 if (current_rpc_client != task->tk_client) {
691 /* queue a callback on the new connection: */
692 atomic_inc(&dp->dl_count);
693 nfsd4_cb_recall(dp);
694 return;
695 }
696 } 863 }
697 if (dp->dl_retries--) { 864 if (dp->dl_retries--) {
698 rpc_delay(task, 2*HZ); 865 rpc_delay(task, 2*HZ);
699 task->tk_status = 0; 866 task->tk_status = 0;
700 rpc_restart_call_prepare(task); 867 rpc_restart_call_prepare(task);
701 return; 868 return;
702 } else {
703 atomic_set(&clp->cl_cb_set, 0);
704 warn_no_callback_path(clp, task->tk_status);
705 } 869 }
870 nfsd4_mark_cb_down(clp, task->tk_status);
871 cb->cb_done = true;
706} 872}
707 873
708static void nfsd4_cb_recall_release(void *calldata) 874static void nfsd4_cb_recall_release(void *calldata)
709{ 875{
710 struct nfs4_delegation *dp = calldata; 876 struct nfsd4_callback *cb = calldata;
711 877 struct nfs4_client *clp = cb->cb_clp;
712 nfs4_put_delegation(dp); 878 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
879
880 if (cb->cb_done) {
881 spin_lock(&clp->cl_lock);
882 list_del(&cb->cb_per_client);
883 spin_unlock(&clp->cl_lock);
884 nfs4_put_delegation(dp);
885 }
713} 886}
714 887
715static const struct rpc_call_ops nfsd4_cb_recall_ops = { 888static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -718,8 +891,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
718 .rpc_release = nfsd4_cb_recall_release, 891 .rpc_release = nfsd4_cb_recall_release,
719}; 892};
720 893
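nfsd4_cb_recall_ops is the usual ops-table idiom: a generic engine (here the sunrpc task machinery) drives prepare/done/release hooks it knows nothing about. A self-contained sketch of that dispatch; the engine and hook names below are invented, not the sunrpc API:

#include <stdio.h>

struct call;

struct call_ops {
	void (*prepare)(struct call *);
	void (*done)(struct call *);
	void (*release)(struct call *);
};

struct call {
	const struct call_ops *ops;
	int status;
};

/* Generic engine: invokes whatever hooks the table provides. */
static void run_call(struct call *c)
{
	if (c->ops->prepare)
		c->ops->prepare(c);
	if (c->ops->done)
		c->ops->done(c);
	if (c->ops->release)
		c->ops->release(c);
}

static void recall_prepare(struct call *c) { printf("prepare (status %d)\n", c->status); }
static void recall_done(struct call *c)    { printf("done (status %d)\n", c->status); }
static void recall_release(struct call *c) { printf("release (status %d)\n", c->status); }

static const struct call_ops recall_ops = {
	.prepare = recall_prepare,
	.done    = recall_done,
	.release = recall_release,
};

int main(void)
{
	struct call c = { .ops = &recall_ops, .status = 0 };

	run_call(&c);
	return 0;
}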
721static struct workqueue_struct *callback_wq;
722
723int nfsd4_create_callback_queue(void) 894int nfsd4_create_callback_queue(void)
724{ 895{
725 callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); 896 callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
@@ -734,57 +905,123 @@ void nfsd4_destroy_callback_queue(void)
734} 905}
735 906
736/* must be called under the state lock */ 907/* must be called under the state lock */
737void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) 908void nfsd4_shutdown_callback(struct nfs4_client *clp)
738{ 909{
739 struct rpc_clnt *old = clp->cl_cb_client; 910 set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
740
741 clp->cl_cb_client = new;
742 /* 911 /*
743 * After this, any work that saw the old value of cl_cb_client will 912 * Note this won't actually result in a null callback;
744 * be gone: 913 * instead, nfsd4_do_callback_rpc() will detect the killed
914 * client, destroy the rpc client, and stop:
745 */ 915 */
916 do_probe_callback(clp);
746 flush_workqueue(callback_wq); 917 flush_workqueue(callback_wq);
747 /* So we can safely shut it down: */
748 if (old)
749 rpc_shutdown_client(old);
750} 918}
751 919
752/* 920static void nfsd4_release_cb(struct nfsd4_callback *cb)
753 * called with dp->dl_count inc'ed.
754 */
755static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
756{ 921{
757 struct nfs4_client *clp = dp->dl_client; 922 if (cb->cb_ops->rpc_release)
758 struct rpc_clnt *clnt = clp->cl_cb_client; 923 cb->cb_ops->rpc_release(cb);
759 struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; 924}
760 struct rpc_message msg = {
761 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
762 .rpc_cred = callback_cred
763 };
764 925
765 if (clnt == NULL) { 926/* requires cl_lock: */
766 nfs4_put_delegation(dp); 927static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
767 return; /* Client is shutting down; give up. */ 928{
929 struct nfsd4_session *s;
930 struct nfsd4_conn *c;
931
932 list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
933 list_for_each_entry(c, &s->se_conns, cn_persession) {
934 if (c->cn_flags & NFS4_CDFC4_BACK)
935 return c;
936 }
768 } 937 }
938 return NULL;
939}
769 940
770 args->args_op = dp; 941static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
771 msg.rpc_argp = args; 942{
772 dp->dl_retries = 1; 943 struct nfs4_cb_conn conn;
773 rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp); 944 struct nfs4_client *clp = cb->cb_clp;
945 struct nfsd4_session *ses = NULL;
946 struct nfsd4_conn *c;
947 int err;
948
949 /*
950 * This is either an update, or the client dying; in either case,
951 * kill the old client:
952 */
953 if (clp->cl_cb_client) {
954 rpc_shutdown_client(clp->cl_cb_client);
955 clp->cl_cb_client = NULL;
956 }
957 if (clp->cl_cb_conn.cb_xprt) {
958 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
959 clp->cl_cb_conn.cb_xprt = NULL;
960 }
961 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
962 return;
963 spin_lock(&clp->cl_lock);
964 /*
965 * Only serialized callback code is allowed to clear these
966 * flags; main nfsd code can only set them:
967 */
968 BUG_ON(!clp->cl_cb_flags);
969 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
970 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
971 c = __nfsd4_find_backchannel(clp);
972 if (c) {
973 svc_xprt_get(c->cn_xprt);
974 conn.cb_xprt = c->cn_xprt;
975 ses = c->cn_session;
976 }
977 spin_unlock(&clp->cl_lock);
978
979 err = setup_callback_client(clp, &conn, ses);
980 if (err) {
981 warn_no_callback_path(clp, err);
982 return;
983 }
984 /* Yay, the callback channel's back! Restart any callbacks: */
985 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
986 run_nfsd4_cb(cb);
774} 987}
775 988
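The shape of nfsd4_process_cb_update() is worth noting: tear down the old client unconditionally, snapshot cl_cb_conn into a stack copy while holding cl_lock, then do the slow client setup with the lock dropped. A userspace model of that snapshot-then-work pattern (every name below is a stand-in):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct conn { char addr[32]; };

static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
static struct conn cl_cb_conn = { .addr = "192.0.2.7" };

static int setup_client(const struct conn *c)
{
	/* slow work (socket setup etc.) done without the lock held */
	printf("connecting callback client to %s\n", c->addr);
	return 0;
}

static void process_update(void)
{
	struct conn snapshot;

	pthread_mutex_lock(&cl_lock);
	memcpy(&snapshot, &cl_cb_conn, sizeof(snapshot));	/* cheap, locked */
	pthread_mutex_unlock(&cl_lock);

	if (setup_client(&snapshot))			/* slow, unlocked */
		fprintf(stderr, "no callback path\n");
}

int main(void)
{
	process_update();
	return 0;
}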
776void nfsd4_do_callback_rpc(struct work_struct *w) 989void nfsd4_do_callback_rpc(struct work_struct *w)
777{ 990{
778 /* XXX: for now, just send off delegation recall. */ 991 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
779 /* In future, generalize to handle any sort of callback. */ 992 struct nfs4_client *clp = cb->cb_clp;
780 struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); 993 struct rpc_clnt *clnt;
781 struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
782 994
783 _nfsd4_cb_recall(dp); 995 if (clp->cl_cb_flags)
784} 996 nfsd4_process_cb_update(cb);
785 997
998 clnt = clp->cl_cb_client;
999 if (!clnt) {
1000 /* Callback channel broken, or client killed; give up: */
1001 nfsd4_release_cb(cb);
1002 return;
1003 }
1004 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
1005 cb->cb_ops, cb);
1006}
786 1007
787void nfsd4_cb_recall(struct nfs4_delegation *dp) 1008void nfsd4_cb_recall(struct nfs4_delegation *dp)
788{ 1009{
789 queue_work(callback_wq, &dp->dl_recall.cb_work); 1010 struct nfsd4_callback *cb = &dp->dl_recall;
1011 struct nfs4_client *clp = dp->dl_client;
1012
1013 dp->dl_retries = 1;
1014 cb->cb_op = dp;
1015 cb->cb_clp = clp;
1016 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
1017 cb->cb_msg.rpc_argp = cb;
1018 cb->cb_msg.rpc_resp = cb;
1019 cb->cb_msg.rpc_cred = callback_cred;
1020
1021 cb->cb_ops = &nfsd4_cb_recall_ops;
 1022
 1023 INIT_LIST_HEAD(&cb->cb_per_client);
 1024 cb->cb_done = true;
 1025
 1026 run_nfsd4_cb(&dp->dl_recall);
 790} 1027}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c78dbf493424..6d2c397d458b 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -33,10 +33,11 @@
33 */ 33 */
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 36#include <linux/seq_file.h>
38#include <linux/sched.h> 37#include <linux/sched.h>
39#include <linux/slab.h> 38#include <linux/slab.h>
39#include "idmap.h"
40#include "nfsd.h"
40 41
41/* 42/*
42 * Cache entry 43 * Cache entry
@@ -482,109 +483,26 @@ nfsd_idmap_shutdown(void)
482 cache_unregister(&nametoid_cache); 483 cache_unregister(&nametoid_cache);
483} 484}
484 485
485/*
486 * Deferred request handling
487 */
488
489struct idmap_defer_req {
490 struct cache_req req;
491 struct cache_deferred_req deferred_req;
492 wait_queue_head_t waitq;
493 atomic_t count;
494};
495
496static inline void
497put_mdr(struct idmap_defer_req *mdr)
498{
499 if (atomic_dec_and_test(&mdr->count))
500 kfree(mdr);
501}
502
503static inline void
504get_mdr(struct idmap_defer_req *mdr)
505{
506 atomic_inc(&mdr->count);
507}
508
509static void
510idmap_revisit(struct cache_deferred_req *dreq, int toomany)
511{
512 struct idmap_defer_req *mdr =
513 container_of(dreq, struct idmap_defer_req, deferred_req);
514
515 wake_up(&mdr->waitq);
516 put_mdr(mdr);
517}
518
519static struct cache_deferred_req *
520idmap_defer(struct cache_req *req)
521{
522 struct idmap_defer_req *mdr =
523 container_of(req, struct idmap_defer_req, req);
524
525 mdr->deferred_req.revisit = idmap_revisit;
526 get_mdr(mdr);
527 return (&mdr->deferred_req);
528}
529
530static inline int
531do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key,
532 struct cache_detail *detail, struct ent **item,
533 struct idmap_defer_req *mdr)
534{
535 *item = lookup_fn(key);
536 if (!*item)
537 return -ENOMEM;
538 return cache_check(detail, &(*item)->h, &mdr->req);
539}
540
541static inline int
542do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
543 struct ent *key, struct cache_detail *detail,
544 struct ent **item)
545{
546 int ret = -ENOMEM;
547
548 *item = lookup_fn(key);
549 if (!*item)
550 goto out_err;
551 ret = -ETIMEDOUT;
552 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
553 || (*item)->h.expiry_time < get_seconds()
554 || detail->flush_time > (*item)->h.last_refresh)
555 goto out_put;
556 ret = -ENOENT;
557 if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
558 goto out_put;
559 return 0;
560out_put:
561 cache_put(&(*item)->h, detail);
562out_err:
563 *item = NULL;
564 return ret;
565}
566
567static int 486static int
568idmap_lookup(struct svc_rqst *rqstp, 487idmap_lookup(struct svc_rqst *rqstp,
569 struct ent *(*lookup_fn)(struct ent *), struct ent *key, 488 struct ent *(*lookup_fn)(struct ent *), struct ent *key,
570 struct cache_detail *detail, struct ent **item) 489 struct cache_detail *detail, struct ent **item)
571{ 490{
572 struct idmap_defer_req *mdr;
573 int ret; 491 int ret;
574 492
575 mdr = kzalloc(sizeof(*mdr), GFP_KERNEL); 493 *item = lookup_fn(key);
576 if (!mdr) 494 if (!*item)
577 return -ENOMEM; 495 return -ENOMEM;
578 atomic_set(&mdr->count, 1); 496 retry:
579 init_waitqueue_head(&mdr->waitq); 497 ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
580 mdr->req.defer = idmap_defer; 498
581 ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr); 499 if (ret == -ETIMEDOUT) {
582 if (ret == -EAGAIN) { 500 struct ent *prev_item = *item;
583 wait_event_interruptible_timeout(mdr->waitq, 501 *item = lookup_fn(key);
584 test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ); 502 if (*item != prev_item)
585 ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item); 503 goto retry;
504 cache_put(&(*item)->h, detail);
586 } 505 }
587 put_mdr(mdr);
588 return ret; 506 return ret;
589} 507}
590 508
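The rewritten idmap_lookup() drops the private deferral machinery and leans on cache_check() plus one rule: on -ETIMEDOUT, look the key up again and retry the check only if the lookup produced a different (fresher) entry, so the loop terminates. A toy model of that retry rule (all helpers invented for illustration):

#include <stdio.h>

#define ERR_TIMEDOUT (-110)	/* stand-in for -ETIMEDOUT */

struct ent { int valid; };

static struct ent stale = { 0 }, fresh = { 1 };
static int refreshed;

/* Pretend an upcall replaced the cache entry between calls. */
static struct ent *lookup_fn(void)
{
	return refreshed ? &fresh : &stale;
}

static int cache_check(struct ent *e)
{
	return e->valid ? 0 : ERR_TIMEDOUT;
}

int main(void)
{
	struct ent *item = lookup_fn();
	int ret;
retry:
	ret = cache_check(item);
	if (ret == ERR_TIMEDOUT) {
		struct ent *prev_item = item;

		refreshed = 1;
		item = lookup_fn();
		if (item != prev_item)	/* only retry on a fresher entry */
			goto retry;
	}
	printf("final ret=%d\n", ret);
	return 0;
}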
@@ -597,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp)
597 return clp->name; 515 return clp->name;
598} 516}
599 517
600static int 518static __be32
601idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, 519idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
602 uid_t *id) 520 uid_t *id)
603{ 521{
@@ -607,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
607 int ret; 525 int ret;
608 526
609 if (namelen + 1 > sizeof(key.name)) 527 if (namelen + 1 > sizeof(key.name))
610 return -EINVAL; 528 return nfserr_badowner;
611 memcpy(key.name, name, namelen); 529 memcpy(key.name, name, namelen);
612 key.name[namelen] = '\0'; 530 key.name[namelen] = '\0';
613 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); 531 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
614 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); 532 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
615 if (ret == -ENOENT) 533 if (ret == -ENOENT)
616 ret = -ESRCH; /* nfserr_badname */ 534 return nfserr_badowner;
617 if (ret) 535 if (ret)
618 return ret; 536 return nfserrno(ret);
619 *id = item->id; 537 *id = item->id;
620 cache_put(&item->h, &nametoid_cache); 538 cache_put(&item->h, &nametoid_cache);
621 return 0; 539 return 0;
@@ -643,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
643 return ret; 561 return ret;
644} 562}
645 563
646int 564__be32
647nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, 565nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
648 __u32 *id) 566 __u32 *id)
649{ 567{
650 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); 568 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
651} 569}
652 570
653int 571__be32
654nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, 572nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
655 __u32 *id) 573 __u32 *id)
656{ 574{
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 59ec449b0c7f..db52546143d1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
604 return status; 604 return status;
605} 605}
606 606
607static __be32 607static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
608nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
609 void *arg)
610{ 608{
611 struct svc_fh tmp_fh; 609 struct svc_fh tmp_fh;
612 __be32 ret; 610 __be32 ret;
@@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
615 ret = exp_pseudoroot(rqstp, &tmp_fh); 613 ret = exp_pseudoroot(rqstp, &tmp_fh);
616 if (ret) 614 if (ret)
617 return ret; 615 return ret;
618 if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { 616 if (tmp_fh.fh_dentry == fh->fh_dentry) {
619 fh_put(&tmp_fh); 617 fh_put(&tmp_fh);
620 return nfserr_noent; 618 return nfserr_noent;
621 } 619 }
622 fh_put(&tmp_fh); 620 fh_put(&tmp_fh);
623 return nfsd_lookup(rqstp, &cstate->current_fh, 621 return nfsd_lookup(rqstp, fh, "..", 2, fh);
624 "..", 2, &cstate->current_fh); 622}
623
624static __be32
625nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
626 void *arg)
627{
628 return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
625} 629}
626 630
627static __be32 631static __be32
@@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
769 } else 773 } else
770 secinfo->si_exp = exp; 774 secinfo->si_exp = exp;
771 dput(dentry); 775 dput(dentry);
776 if (cstate->minorversion)
777 /* See rfc 5661 section 2.6.3.1.1.8 */
778 fh_put(&cstate->current_fh);
772 return err; 779 return err;
773} 780}
774 781
775static __be32 782static __be32
783nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
784 struct nfsd4_secinfo_no_name *sin)
785{
786 __be32 err;
787
788 switch (sin->sin_style) {
789 case NFS4_SECINFO_STYLE4_CURRENT_FH:
790 break;
791 case NFS4_SECINFO_STYLE4_PARENT:
792 err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
793 if (err)
794 return err;
795 break;
796 default:
797 return nfserr_inval;
798 }
799 exp_get(cstate->current_fh.fh_export);
800 sin->sin_exp = cstate->current_fh.fh_export;
801 fh_put(&cstate->current_fh);
802 return nfs_ok;
803}
804
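SECINFO_NO_NAME only differs in how the target filehandle is chosen: style CURRENT_FH uses it as-is, style PARENT first walks to ".." via the shared nfsd4_do_lookupp() helper, and anything else is invalid. A compact sketch of that dispatch (constants and helpers are stand-ins, not the nfsd symbols):

#include <stdio.h>

enum { STYLE_CURRENT_FH, STYLE_PARENT };

static int lookup_parent(int *fh)
{
	*fh -= 1;		/* "walk to .." stand-in */
	return 0;
}

static int secinfo_no_name(int style, int *fh)
{
	switch (style) {
	case STYLE_CURRENT_FH:
		break;
	case STYLE_PARENT:
		if (lookup_parent(fh))
			return -1;
		break;
	default:
		return -22;	/* like nfserr_inval */
	}
	printf("report flavors for fh %d\n", *fh);
	return 0;
}

int main(void)
{
	int fh = 5;

	secinfo_no_name(STYLE_PARENT, &fh);
	secinfo_no_name(99, &fh);	/* rejected */
	return 0;
}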
805static __be32
776nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 806nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
777 struct nfsd4_setattr *setattr) 807 struct nfsd4_setattr *setattr)
778{ 808{
@@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum);
974 * Also note, enforced elsewhere: 1004 * Also note, enforced elsewhere:
975 * - SEQUENCE other than as first op results in 1005 * - SEQUENCE other than as first op results in
976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) 1006 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound 1007 * - BIND_CONN_TO_SESSION must be the only op in its compound.
978 * (Will be enforced in nfsd4_bind_conn_to_session().) 1008 * (Enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if 1009 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same. 1010 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().) 1011 * (Enforced in nfsd4_destroy_session().)
@@ -1031,8 +1061,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1031 resp->cstate.session = NULL; 1061 resp->cstate.session = NULL;
1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); 1062 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); 1063 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1034 /* Use the deferral mechanism only for NFSv4.0 compounds */ 1064 /*
1035 rqstp->rq_usedeferral = (args->minorversion == 0); 1065 * Don't use the deferral mechanism for NFSv4; compounds make it
1066 * too hard to avoid non-idempotency problems.
1067 */
1068 rqstp->rq_usedeferral = 0;
1036 1069
1037 /* 1070 /*
1038 * According to RFC3010, this takes precedence over all other errors. 1071 * According to RFC3010, this takes precedence over all other errors.
@@ -1123,10 +1156,6 @@ encode_op:
1123 1156
1124 nfsd4_increment_op_stats(op->opnum); 1157 nfsd4_increment_op_stats(op->opnum);
1125 } 1158 }
1126 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1127 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1128 status = nfserr_jukebox;
1129 }
1130 1159
1131 resp->cstate.status = status; 1160 resp->cstate.status = status;
1132 fh_put(&resp->cstate.current_fh); 1161 fh_put(&resp->cstate.current_fh);
@@ -1297,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1297 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1326 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1298 .op_name = "OP_EXCHANGE_ID", 1327 .op_name = "OP_EXCHANGE_ID",
1299 }, 1328 },
1329 [OP_BIND_CONN_TO_SESSION] = {
1330 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
1331 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1332 .op_name = "OP_BIND_CONN_TO_SESSION",
1333 },
1300 [OP_CREATE_SESSION] = { 1334 [OP_CREATE_SESSION] = {
1301 .op_func = (nfsd4op_func)nfsd4_create_session, 1335 .op_func = (nfsd4op_func)nfsd4_create_session,
1302 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1336 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
@@ -1317,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
1317 .op_flags = ALLOWED_WITHOUT_FH, 1351 .op_flags = ALLOWED_WITHOUT_FH,
1318 .op_name = "OP_RECLAIM_COMPLETE", 1352 .op_name = "OP_RECLAIM_COMPLETE",
1319 }, 1353 },
1354 [OP_SECINFO_NO_NAME] = {
1355 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
1356 .op_name = "OP_SECINFO_NO_NAME",
1357 },
1320}; 1358};
1321 1359
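New operations slot into nfsd4_ops[] with a single designated initializer keyed by opcode; unset opcodes simply have a NULL op_func. The pattern in miniature (opcodes and handlers invented):

#include <stdio.h>

enum { OP_FOO = 3, OP_BAR = 7, OP_MAX = 8 };

struct op_desc {
	int (*op_func)(void);
	const char *op_name;
};

static int do_foo(void) { return 0; }
static int do_bar(void) { return 0; }

static const struct op_desc ops[OP_MAX] = {
	[OP_FOO] = { .op_func = do_foo, .op_name = "OP_FOO" },
	[OP_BAR] = { .op_func = do_bar, .op_name = "OP_BAR" },
};

int main(void)
{
	for (int i = 0; i < OP_MAX; i++)
		if (ops[i].op_func)		/* gaps stay NULL */
			printf("%d -> %s\n", i, ops[i].op_name);
	return 0;
}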
1322static const char *nfsd4_op_name(unsigned opnum) 1360static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7e26caab2a26..ffb59ef6f82f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child)
302{ 302{
303 int status; 303 int status;
304 304
305 /* note: we currently use this path only for minorversion 0 */
306 if (nfs4_has_reclaimed_state(child->d_name.name, false)) 305 if (nfs4_has_reclaimed_state(child->d_name.name, false))
307 return 0; 306 return 0;
308 307
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cf0d2ffb3c84..d98d0213285d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -33,7 +33,7 @@
33*/ 33*/
34 34
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/smp_lock.h> 36#include <linux/fs.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/swap.h> 39#include <linux/swap.h>
@@ -207,7 +207,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
207{ 207{
208 struct nfs4_delegation *dp; 208 struct nfs4_delegation *dp;
209 struct nfs4_file *fp = stp->st_file; 209 struct nfs4_file *fp = stp->st_file;
210 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
211 210
212 dprintk("NFSD alloc_init_deleg\n"); 211 dprintk("NFSD alloc_init_deleg\n");
213 /* 212 /*
@@ -231,10 +230,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
231 dp->dl_client = clp; 230 dp->dl_client = clp;
232 get_nfs4_file(fp); 231 get_nfs4_file(fp);
233 dp->dl_file = fp; 232 dp->dl_file = fp;
234 nfs4_file_get_access(fp, O_RDONLY); 233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
235 dp->dl_flock = NULL; 235 dp->dl_flock = NULL;
236 dp->dl_type = type; 236 dp->dl_type = type;
237 dp->dl_ident = cb->cb_ident;
238 dp->dl_stateid.si_boot = boot_time; 237 dp->dl_stateid.si_boot = boot_time;
239 dp->dl_stateid.si_stateownerid = current_delegid++; 238 dp->dl_stateid.si_stateownerid = current_delegid++;
240 dp->dl_stateid.si_fileid = 0; 239 dp->dl_stateid.si_fileid = 0;
@@ -254,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
254 if (atomic_dec_and_test(&dp->dl_count)) { 253 if (atomic_dec_and_test(&dp->dl_count)) {
255 dprintk("NFSD: freeing dp %p\n",dp); 254 dprintk("NFSD: freeing dp %p\n",dp);
256 put_nfs4_file(dp->dl_file); 255 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
257 kmem_cache_free(deleg_slab, dp); 257 kmem_cache_free(deleg_slab, dp);
258 num_delegations--; 258 num_delegations--;
259 } 259 }
@@ -267,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
267static void 267static void
268nfs4_close_delegation(struct nfs4_delegation *dp) 268nfs4_close_delegation(struct nfs4_delegation *dp)
269{ 269{
270 struct file *filp = find_readable_file(dp->dl_file);
271
272 dprintk("NFSD: close_delegation dp %p\n",dp); 270 dprintk("NFSD: close_delegation dp %p\n",dp);
271 /* XXX: do we even need this check?: */
273 if (dp->dl_flock) 272 if (dp->dl_flock)
274 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
275 nfs4_file_put_access(dp->dl_file, O_RDONLY);
276} 274}
277 275
278/* Called under the state lock. */ 276/* Called under the state lock. */
@@ -535,171 +533,278 @@ gen_sessionid(struct nfsd4_session *ses)
535 */ 533 */
536#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) 534#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
537 535
536static void
537free_session_slots(struct nfsd4_session *ses)
538{
539 int i;
540
541 for (i = 0; i < ses->se_fchannel.maxreqs; i++)
542 kfree(ses->se_slots[i]);
543}
544
538/* 545/*
539 * Give the client the number of ca_maxresponsesize_cached slots it 546 * We don't actually need to cache the rpc and session headers, so we
540 * requests, of size bounded by NFSD_SLOT_CACHE_SIZE, 547 * can allocate a little less for each slot:
541 * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more 548 */
542 * than NFSD_MAX_SLOTS_PER_SESSION. 549static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
543 * 550{
544 * If we run out of reserved DRC memory we should (up to a point) 551 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
552}
553
554static int nfsd4_sanitize_slot_size(u32 size)
555{
556 size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
557 size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
558
559 return size;
560}
561
562/*
563 * XXX: If we run out of reserved DRC memory we could (up to a point)
545 * re-negotiate active sessions and reduce their slot usage to make 564 * re-negotiate active sessions and reduce their slot usage to make
 546 room for new connections. For now we just fail the create session. 565 room for new connections. For now we just fail the create session.
547 */ 566 */
548static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) 567static int nfsd4_get_drc_mem(int slotsize, u32 num)
549{ 568{
550 int mem, size = fchan->maxresp_cached; 569 int avail;
551 570
552 if (fchan->maxreqs < 1) 571 num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
553 return nfserr_inval;
554 572
555 if (size < NFSD_MIN_HDR_SEQ_SZ) 573 spin_lock(&nfsd_drc_lock);
556 size = NFSD_MIN_HDR_SEQ_SZ; 574 avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
557 size -= NFSD_MIN_HDR_SEQ_SZ; 575 nfsd_drc_max_mem - nfsd_drc_mem_used);
558 if (size > NFSD_SLOT_CACHE_SIZE) 576 num = min_t(int, num, avail / slotsize);
559 size = NFSD_SLOT_CACHE_SIZE; 577 nfsd_drc_mem_used += num * slotsize;
560 578 spin_unlock(&nfsd_drc_lock);
561 /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */ 579
562 mem = fchan->maxreqs * size; 580 return num;
563 if (mem > NFSD_MAX_MEM_PER_SESSION) { 581}
564 fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
565 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
566 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
567 mem = fchan->maxreqs * size;
568 }
569 582
583static void nfsd4_put_drc_mem(int slotsize, int num)
584{
570 spin_lock(&nfsd_drc_lock); 585 spin_lock(&nfsd_drc_lock);
 571 /* bound the total session drc memory usage */ 586 nfsd_drc_mem_used -= slotsize * num;
572 if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
573 fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
574 mem = fchan->maxreqs * size;
575 }
576 nfsd_drc_mem_used += mem;
577 spin_unlock(&nfsd_drc_lock); 587 spin_unlock(&nfsd_drc_lock);
588}
578 589
579 if (fchan->maxreqs == 0) 590static struct nfsd4_session *alloc_session(int slotsize, int numslots)
580 return nfserr_jukebox; 591{
592 struct nfsd4_session *new;
593 int mem, i;
581 594
582 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; 595 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
583 return 0; 596 + sizeof(struct nfsd4_session) > PAGE_SIZE);
597 mem = numslots * sizeof(struct nfsd4_slot *);
598
599 new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
600 if (!new)
601 return NULL;
602 /* allocate each struct nfsd4_slot and data cache in one piece */
603 for (i = 0; i < numslots; i++) {
604 mem = sizeof(struct nfsd4_slot) + slotsize;
605 new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
606 if (!new->se_slots[i])
607 goto out_free;
608 }
609 return new;
610out_free:
611 while (i--)
612 kfree(new->se_slots[i]);
613 kfree(new);
614 return NULL;
584} 615}
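alloc_session()'s failure path is the classic partial-allocation unwind: on a mid-loop failure, walk the index back down and free exactly the slots already allocated. The idiom standalone:

#include <stdio.h>
#include <stdlib.h>

struct session {
	int nslots;
	void *slots[16];
};

static struct session *alloc_session_sketch(int numslots, int slotsize)
{
	struct session *s = calloc(1, sizeof(*s));
	int i;

	if (!s)
		return NULL;
	s->nslots = numslots;
	for (i = 0; i < numslots; i++) {
		s->slots[i] = calloc(1, slotsize);
		if (!s->slots[i])
			goto out_free;
	}
	return s;
out_free:
	while (i--)		/* frees only what was allocated */
		free(s->slots[i]);
	free(s);
	return NULL;
}

int main(void)
{
	struct session *s = alloc_session_sketch(4, 128);

	printf("%s\n", s ? "allocated" : "failed");
	if (s) {
		while (s->nslots--)
			free(s->slots[s->nslots]);
		free(s);
	}
	return 0;
}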
585 616
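nfsd4_get_drc_mem()/nfsd4_put_drc_mem() reduce the old inline arithmetic to a global budget: clamp the request against the per-session cap and whatever is left in the pool, charge it under the spinlock, refund it on teardown. A compact userspace equivalent (constants illustrative):

#include <pthread.h>
#include <stdio.h>

#define MAX_MEM_PER_SESSION 4096

static pthread_mutex_t drc_lock = PTHREAD_MUTEX_INITIALIZER;
static int drc_max_mem = 8192;
static int drc_mem_used;

static int get_drc_mem(int slotsize, int num)
{
	int avail;

	pthread_mutex_lock(&drc_lock);
	avail = drc_max_mem - drc_mem_used;
	if (avail > MAX_MEM_PER_SESSION)
		avail = MAX_MEM_PER_SESSION;
	if (num > avail / slotsize)
		num = avail / slotsize;		/* may grant fewer slots */
	drc_mem_used += num * slotsize;
	pthread_mutex_unlock(&drc_lock);
	return num;
}

static void put_drc_mem(int slotsize, int num)
{
	pthread_mutex_lock(&drc_lock);
	drc_mem_used -= slotsize * num;
	pthread_mutex_unlock(&drc_lock);
}

int main(void)
{
	int got = get_drc_mem(512, 100);

	printf("granted %d slots\n", got);
	put_drc_mem(512, got);
	return 0;
}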
586/* 617static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
587 * fchan holds the client values on input, and the server values on output
588 * sv_max_mesg is the maximum payload plus one page for overhead.
589 */
590static int init_forechannel_attrs(struct svc_rqst *rqstp,
591 struct nfsd4_channel_attrs *session_fchan,
592 struct nfsd4_channel_attrs *fchan)
593{ 618{
594 int status = 0; 619 u32 maxrpc = nfsd_serv->sv_max_mesg;
595 __u32 maxcount = nfsd_serv->sv_max_mesg;
596 620
597 /* headerpadsz set to zero in encode routine */ 621 new->maxreqs = numslots;
622 new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
623 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
624 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
625 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
626}
598 627
599 /* Use the client's max request and max response size if possible */ 628static void free_conn(struct nfsd4_conn *c)
600 if (fchan->maxreq_sz > maxcount) 629{
601 fchan->maxreq_sz = maxcount; 630 svc_xprt_put(c->cn_xprt);
602 session_fchan->maxreq_sz = fchan->maxreq_sz; 631 kfree(c);
632}
603 633
604 if (fchan->maxresp_sz > maxcount) 634static void nfsd4_conn_lost(struct svc_xpt_user *u)
605 fchan->maxresp_sz = maxcount; 635{
606 session_fchan->maxresp_sz = fchan->maxresp_sz; 636 struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
637 struct nfs4_client *clp = c->cn_session->se_client;
607 638
608 /* Use the client's maxops if possible */ 639 spin_lock(&clp->cl_lock);
609 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) 640 if (!list_empty(&c->cn_persession)) {
610 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; 641 list_del(&c->cn_persession);
611 session_fchan->maxops = fchan->maxops; 642 free_conn(c);
643 }
644 spin_unlock(&clp->cl_lock);
645 nfsd4_probe_callback(clp);
646}
612 647
613 /* FIXME: Error means no more DRC pages so the server should 648static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
614 * recover pages from existing sessions. For now fail session 649{
615 * creation. 650 struct nfsd4_conn *conn;
616 */
617 status = set_forechannel_drc_size(fchan);
618 651
619 session_fchan->maxresp_cached = fchan->maxresp_cached; 652 conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
620 session_fchan->maxreqs = fchan->maxreqs; 653 if (!conn)
654 return NULL;
655 svc_xprt_get(rqstp->rq_xprt);
656 conn->cn_xprt = rqstp->rq_xprt;
657 conn->cn_flags = flags;
658 INIT_LIST_HEAD(&conn->cn_xpt_user.list);
659 return conn;
660}
621 661
622 dprintk("%s status %d\n", __func__, status); 662static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
623 return status; 663{
664 conn->cn_session = ses;
665 list_add(&conn->cn_persession, &ses->se_conns);
624} 666}
625 667
626static void 668static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
627free_session_slots(struct nfsd4_session *ses)
628{ 669{
629 int i; 670 struct nfs4_client *clp = ses->se_client;
630 671
631 for (i = 0; i < ses->se_fchannel.maxreqs; i++) 672 spin_lock(&clp->cl_lock);
632 kfree(ses->se_slots[i]); 673 __nfsd4_hash_conn(conn, ses);
674 spin_unlock(&clp->cl_lock);
633} 675}
634 676
635/* 677static int nfsd4_register_conn(struct nfsd4_conn *conn)
636 * We don't actually need to cache the rpc and session headers, so we
637 * can allocate a little less for each slot:
638 */
639static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
640{ 678{
641 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; 679 conn->cn_xpt_user.callback = nfsd4_conn_lost;
680 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
642} 681}
643 682
644static int 683static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir)
645alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
646 struct nfsd4_create_session *cses)
647{ 684{
648 struct nfsd4_session *new, tmp; 685 struct nfsd4_conn *conn;
649 struct nfsd4_slot *sp; 686 int ret;
650 int idx, slotsize, cachesize, i;
651 int status;
652 687
653 memset(&tmp, 0, sizeof(tmp)); 688 conn = alloc_conn(rqstp, dir);
689 if (!conn)
690 return nfserr_jukebox;
691 nfsd4_hash_conn(conn, ses);
692 ret = nfsd4_register_conn(conn);
693 if (ret)
694 /* oops; xprt is already down: */
695 nfsd4_conn_lost(&conn->cn_xpt_user);
696 return nfs_ok;
697}
654 698
655 /* FIXME: For now, we just accept the client back channel attributes. */ 699static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses)
656 tmp.se_bchannel = cses->back_channel; 700{
657 status = init_forechannel_attrs(rqstp, &tmp.se_fchannel, 701 u32 dir = NFS4_CDFC4_FORE;
658 &cses->fore_channel);
659 if (status)
660 goto out;
661 702
662 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) 703 if (ses->se_flags & SESSION4_BACK_CHAN)
663 + sizeof(struct nfsd4_session) > PAGE_SIZE); 704 dir |= NFS4_CDFC4_BACK;
664 705
665 status = nfserr_jukebox; 706 return nfsd4_new_conn(rqstp, ses, dir);
666 /* allocate struct nfsd4_session and slot table pointers in one piece */ 707}
667 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
668 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
669 if (!new)
670 goto out;
671 708
672 memcpy(new, &tmp, sizeof(*new)); 709/* must be called under client_lock */
710static void nfsd4_del_conns(struct nfsd4_session *s)
711{
712 struct nfs4_client *clp = s->se_client;
713 struct nfsd4_conn *c;
673 714
674 /* allocate each struct nfsd4_slot and data cache in one piece */ 715 spin_lock(&clp->cl_lock);
675 cachesize = slot_bytes(&new->se_fchannel); 716 while (!list_empty(&s->se_conns)) {
676 for (i = 0; i < new->se_fchannel.maxreqs; i++) { 717 c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
677 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); 718 list_del_init(&c->cn_persession);
678 if (!sp) 719 spin_unlock(&clp->cl_lock);
679 goto out_free; 720
680 new->se_slots[i] = sp; 721 unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
722 free_conn(c);
723
724 spin_lock(&clp->cl_lock);
681 } 725 }
726 spin_unlock(&clp->cl_lock);
727}
728
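nfsd4_del_conns() drains the connection list with the standard lock-drop loop: unlink one entry while holding cl_lock, release the lock for the slow teardown (the entry is now private), then retake it for the next. Sketch with userspace stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct conn {
	int id;
	struct conn *next;
};

static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
static struct conn *conns;

static void slow_teardown(struct conn *c)
{
	/* must not hold cl_lock, like unregister_xpt_user()/free_conn() */
	printf("tearing down conn %d\n", c->id);
	free(c);
}

static void del_conns(void)
{
	pthread_mutex_lock(&cl_lock);
	while (conns) {
		struct conn *c = conns;

		conns = c->next;	/* unlink while locked */
		pthread_mutex_unlock(&cl_lock);
		slow_teardown(c);	/* c is private now */
		pthread_mutex_lock(&cl_lock);
	}
	pthread_mutex_unlock(&cl_lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct conn *c = malloc(sizeof(*c));

		c->id = i;
		c->next = conns;
		conns = c;
	}
	del_conns();
	return 0;
}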
729void free_session(struct kref *kref)
730{
731 struct nfsd4_session *ses;
732 int mem;
733
734 ses = container_of(kref, struct nfsd4_session, se_ref);
735 nfsd4_del_conns(ses);
736 spin_lock(&nfsd_drc_lock);
737 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
738 nfsd_drc_mem_used -= mem;
739 spin_unlock(&nfsd_drc_lock);
740 free_session_slots(ses);
741 kfree(ses);
742}
743
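free_session() is a kref release function: it never runs directly, only when the last nfsd4_put_session() drops the embedded refcount to zero. The shape of that contract in a single-threaded toy (the real kref uses atomics):

#include <stdio.h>

struct kref { int count; };

static void kref_init(struct kref *k)
{
	k->count = 1;
}

static void kref_put(struct kref *k, void (*release)(struct kref *))
{
	if (--k->count == 0)	/* last put triggers the release */
		release(k);
}

struct session { struct kref se_ref; };

static void release_session(struct kref *k)
{
	/* real code does container_of(k, struct session, se_ref) */
	puts("tearing down session");
}

int main(void)
{
	struct session s;

	kref_init(&s.se_ref);
	kref_put(&s.se_ref, release_session);
	return 0;
}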
744static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
745{
746 struct nfsd4_session *new;
747 struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
748 int numslots, slotsize;
749 int status;
750 int idx;
751
752 /*
753 * Note decreasing slot size below client's request may
754 * make it difficult for client to function correctly, whereas
755 * decreasing the number of slots will (just?) affect
756 * performance. When short on memory we therefore prefer to
757 * decrease number of slots instead of their size.
758 */
759 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
760 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
761 if (numslots < 1)
762 return NULL;
763
764 new = alloc_session(slotsize, numslots);
765 if (!new) {
766 nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
767 return NULL;
768 }
769 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
682 770
683 new->se_client = clp; 771 new->se_client = clp;
684 gen_sessionid(new); 772 gen_sessionid(new);
685 idx = hash_sessionid(&new->se_sessionid);
686 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
687 NFS4_MAX_SESSIONID_LEN);
688 773
774 INIT_LIST_HEAD(&new->se_conns);
775
776 new->se_cb_seq_nr = 1;
689 new->se_flags = cses->flags; 777 new->se_flags = cses->flags;
778 new->se_cb_prog = cses->callback_prog;
690 kref_init(&new->se_ref); 779 kref_init(&new->se_ref);
780 idx = hash_sessionid(&new->se_sessionid);
691 spin_lock(&client_lock); 781 spin_lock(&client_lock);
692 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 782 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
783 spin_lock(&clp->cl_lock);
693 list_add(&new->se_perclnt, &clp->cl_sessions); 784 list_add(&new->se_perclnt, &clp->cl_sessions);
785 spin_unlock(&clp->cl_lock);
694 spin_unlock(&client_lock); 786 spin_unlock(&client_lock);
695 787
696 status = nfs_ok; 788 status = nfsd4_new_conn_from_crses(rqstp, new);
697out: 789 /* whoops: benny points out, status is ignored! (err, or bogus) */
698 return status; 790 if (status) {
699out_free: 791 free_session(&new->se_ref);
700 free_session_slots(new); 792 return NULL;
701 kfree(new); 793 }
702 goto out; 794 if (cses->flags & SESSION4_BACK_CHAN) {
795 struct sockaddr *sa = svc_addr(rqstp);
796 /*
797 * This is a little silly; with sessions there's no real
798 * use for the callback address. Use the peer address
799 * as a reasonable default for now, but consider fixing
800 * the rpc client not to require an address in the
801 * future:
802 */
803 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
804 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
805 }
806 nfsd4_probe_callback(clp);
807 return new;
703} 808}
704 809
705/* caller must hold client_lock */ 810/* caller must hold client_lock */
@@ -728,22 +833,9 @@ static void
728unhash_session(struct nfsd4_session *ses) 833unhash_session(struct nfsd4_session *ses)
729{ 834{
730 list_del(&ses->se_hash); 835 list_del(&ses->se_hash);
836 spin_lock(&ses->se_client->cl_lock);
731 list_del(&ses->se_perclnt); 837 list_del(&ses->se_perclnt);
732} 838 spin_unlock(&ses->se_client->cl_lock);
733
734void
735free_session(struct kref *kref)
736{
737 struct nfsd4_session *ses;
738 int mem;
739
740 ses = container_of(kref, struct nfsd4_session, se_ref);
741 spin_lock(&nfsd_drc_lock);
742 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
743 nfsd_drc_mem_used -= mem;
744 spin_unlock(&nfsd_drc_lock);
745 free_session_slots(ses);
746 kfree(ses);
747} 839}
748 840
749/* must be called under the client_lock */ 841/* must be called under the client_lock */
@@ -812,6 +904,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
812static inline void 904static inline void
813free_client(struct nfs4_client *clp) 905free_client(struct nfs4_client *clp)
814{ 906{
907 while (!list_empty(&clp->cl_sessions)) {
908 struct nfsd4_session *ses;
909 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
910 se_perclnt);
911 list_del(&ses->se_perclnt);
912 nfsd4_put_session(ses);
913 }
815 if (clp->cl_cred.cr_group_info) 914 if (clp->cl_cred.cr_group_info)
816 put_group_info(clp->cl_cred.cr_group_info); 915 put_group_info(clp->cl_cred.cr_group_info);
817 kfree(clp->cl_principal); 916 kfree(clp->cl_principal);
@@ -838,15 +937,14 @@ release_session_client(struct nfsd4_session *session)
838static inline void 937static inline void
839unhash_client_locked(struct nfs4_client *clp) 938unhash_client_locked(struct nfs4_client *clp)
840{ 939{
940 struct nfsd4_session *ses;
941
841 mark_client_expired(clp); 942 mark_client_expired(clp);
842 list_del(&clp->cl_lru); 943 list_del(&clp->cl_lru);
843 while (!list_empty(&clp->cl_sessions)) { 944 spin_lock(&clp->cl_lock);
844 struct nfsd4_session *ses; 945 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
845 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, 946 list_del_init(&ses->se_hash);
846 se_perclnt); 947 spin_unlock(&clp->cl_lock);
847 unhash_session(ses);
848 nfsd4_put_session(ses);
849 }
850} 948}
851 949
852static void 950static void
@@ -875,7 +973,7 @@ expire_client(struct nfs4_client *clp)
875 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 973 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
876 release_openowner(sop); 974 release_openowner(sop);
877 } 975 }
878 nfsd4_set_callback_client(clp, NULL); 976 nfsd4_shutdown_callback(clp);
879 if (clp->cl_cb_conn.cb_xprt) 977 if (clp->cl_cb_conn.cb_xprt)
880 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 978 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
881 list_del(&clp->cl_idhash); 979 list_del(&clp->cl_idhash);
@@ -960,6 +1058,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
960 if (clp == NULL) 1058 if (clp == NULL)
961 return NULL; 1059 return NULL;
962 1060
1061 INIT_LIST_HEAD(&clp->cl_sessions);
1062
963 princ = svc_gss_principal(rqstp); 1063 princ = svc_gss_principal(rqstp);
964 if (princ) { 1064 if (princ) {
965 clp->cl_principal = kstrdup(princ, GFP_KERNEL); 1065 clp->cl_principal = kstrdup(princ, GFP_KERNEL);
@@ -971,13 +1071,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
971 1071
972 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 1072 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
973 atomic_set(&clp->cl_refcount, 0); 1073 atomic_set(&clp->cl_refcount, 0);
974 atomic_set(&clp->cl_cb_set, 0); 1074 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
975 INIT_LIST_HEAD(&clp->cl_idhash); 1075 INIT_LIST_HEAD(&clp->cl_idhash);
976 INIT_LIST_HEAD(&clp->cl_strhash); 1076 INIT_LIST_HEAD(&clp->cl_strhash);
977 INIT_LIST_HEAD(&clp->cl_openowners); 1077 INIT_LIST_HEAD(&clp->cl_openowners);
978 INIT_LIST_HEAD(&clp->cl_delegations); 1078 INIT_LIST_HEAD(&clp->cl_delegations);
979 INIT_LIST_HEAD(&clp->cl_sessions);
980 INIT_LIST_HEAD(&clp->cl_lru); 1079 INIT_LIST_HEAD(&clp->cl_lru);
1080 INIT_LIST_HEAD(&clp->cl_callbacks);
1081 spin_lock_init(&clp->cl_lock);
1082 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
981 clp->cl_time = get_seconds(); 1083 clp->cl_time = get_seconds();
982 clear_bit(0, &clp->cl_cb_slot_busy); 1084 clear_bit(0, &clp->cl_cb_slot_busy);
983 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1085 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -986,7 +1088,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
986 clp->cl_flavor = rqstp->rq_flavor; 1088 clp->cl_flavor = rqstp->rq_flavor;
987 copy_cred(&clp->cl_cred, &rqstp->rq_cred); 1089 copy_cred(&clp->cl_cred, &rqstp->rq_cred);
988 gen_confirm(clp); 1090 gen_confirm(clp);
989 1091 clp->cl_cb_session = NULL;
990 return clp; 1092 return clp;
991} 1093}
992 1094
@@ -1051,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid)
1051 return NULL; 1153 return NULL;
1052} 1154}
1053 1155
1054/* 1156static bool clp_used_exchangeid(struct nfs4_client *clp)
1055 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
 1056 * parameter. Matching is based on the fact that at least one of the
1057 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
1058 *
1059 * FIXME: we need to unify the clientid namespaces for nfsv4.x
1060 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
1061 * and SET_CLIENTID{,_CONFIRM}
1062 */
1063static inline int
1064match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
1065{ 1157{
1066 bool has_exchange_flags = (clp->cl_exchange_flags != 0); 1158 return clp->cl_exchange_flags != 0;
1067 return use_exchange_id == has_exchange_flags; 1159}
1068}
1069 1160
1070static struct nfs4_client * 1161static struct nfs4_client *
1071find_confirmed_client_by_str(const char *dname, unsigned int hashval, 1162find_confirmed_client_by_str(const char *dname, unsigned int hashval)
1072 bool use_exchange_id)
1073{ 1163{
1074 struct nfs4_client *clp; 1164 struct nfs4_client *clp;
1075 1165
1076 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 1166 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
1077 if (same_name(clp->cl_recdir, dname) && 1167 if (same_name(clp->cl_recdir, dname))
1078 match_clientid_establishment(clp, use_exchange_id))
1079 return clp; 1168 return clp;
1080 } 1169 }
1081 return NULL; 1170 return NULL;
1082} 1171}
1083 1172
1084static struct nfs4_client * 1173static struct nfs4_client *
1085find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, 1174find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
1086 bool use_exchange_id)
1087{ 1175{
1088 struct nfs4_client *clp; 1176 struct nfs4_client *clp;
1089 1177
1090 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 1178 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
1091 if (same_name(clp->cl_recdir, dname) && 1179 if (same_name(clp->cl_recdir, dname))
1092 match_clientid_establishment(clp, use_exchange_id))
1093 return clp; 1180 return clp;
1094 } 1181 }
1095 return NULL; 1182 return NULL;
1096} 1183}
1097 1184
1185static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
1186{
1187 switch (family) {
1188 case AF_INET:
1189 ((struct sockaddr_in *)sa)->sin_family = AF_INET;
1190 ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
1191 return;
1192 case AF_INET6:
1193 ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
1194 ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
1195 return;
1196 }
1197}
1198
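rpc_svcaddr2sockaddr() is a plain family switch that casts the generic sockaddr to the right concrete type before filling it in. The same cast-by-family idiom as a standalone program (standard socket types only):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

static void fill_sockaddr(struct sockaddr *sa, unsigned short family,
			  const void *addr)
{
	switch (family) {
	case AF_INET:
		((struct sockaddr_in *)sa)->sin_family = AF_INET;
		memcpy(&((struct sockaddr_in *)sa)->sin_addr, addr,
		       sizeof(struct in_addr));
		return;
	case AF_INET6:
		((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
		memcpy(&((struct sockaddr_in6 *)sa)->sin6_addr, addr,
		       sizeof(struct in6_addr));
		return;
	}
}

int main(void)
{
	struct sockaddr_storage ss;
	struct in_addr v4;
	char buf[INET_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &v4);
	fill_sockaddr((struct sockaddr *)&ss, AF_INET, &v4);
	inet_ntop(AF_INET, &((struct sockaddr_in *)&ss)->sin_addr, buf, sizeof(buf));
	printf("filled: %s\n", buf);
	return 0;
}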
1098static void 1199static void
1099gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) 1200gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
1100{ 1201{
1101 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 1202 struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
1203 struct sockaddr *sa = svc_addr(rqstp);
1204 u32 scopeid = rpc_get_scope_id(sa);
1102 unsigned short expected_family; 1205 unsigned short expected_family;
1103 1206
1104 /* Currently, we only support tcp and tcp6 for the callback channel */ 1207 /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1111,24 +1214,24 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1111 else 1214 else
1112 goto out_err; 1215 goto out_err;
1113 1216
1114 cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, 1217 conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
1115 se->se_callback_addr_len, 1218 se->se_callback_addr_len,
1116 (struct sockaddr *) &cb->cb_addr, 1219 (struct sockaddr *)&conn->cb_addr,
1117 sizeof(cb->cb_addr)); 1220 sizeof(conn->cb_addr));
1118 1221
1119 if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family) 1222 if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
1120 goto out_err; 1223 goto out_err;
1121 1224
1122 if (cb->cb_addr.ss_family == AF_INET6) 1225 if (conn->cb_addr.ss_family == AF_INET6)
1123 ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid; 1226 ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
1124 1227
1125 cb->cb_minorversion = 0; 1228 conn->cb_prog = se->se_callback_prog;
1126 cb->cb_prog = se->se_callback_prog; 1229 conn->cb_ident = se->se_callback_ident;
1127 cb->cb_ident = se->se_callback_ident; 1230 rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
1128 return; 1231 return;
1129out_err: 1232out_err:
1130 cb->cb_addr.ss_family = AF_UNSPEC; 1233 conn->cb_addr.ss_family = AF_UNSPEC;
1131 cb->cb_addrlen = 0; 1234 conn->cb_addrlen = 0;
1132 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " 1235 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
1133 "will not receive delegations\n", 1236 "will not receive delegations\n",
1134 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); 1237 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -1264,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1264 case SP4_NONE: 1367 case SP4_NONE:
1265 break; 1368 break;
1266 case SP4_SSV: 1369 case SP4_SSV:
1267 return nfserr_encr_alg_unsupp; 1370 return nfserr_serverfault;
1268 default: 1371 default:
1269 BUG(); /* checked by xdr code */ 1372 BUG(); /* checked by xdr code */
1270 case SP4_MACH_CRED: 1373 case SP4_MACH_CRED:
@@ -1281,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1281 nfs4_lock_state(); 1384 nfs4_lock_state();
1282 status = nfs_ok; 1385 status = nfs_ok;
1283 1386
1284 conf = find_confirmed_client_by_str(dname, strhashval, true); 1387 conf = find_confirmed_client_by_str(dname, strhashval);
1285 if (conf) { 1388 if (conf) {
1389 if (!clp_used_exchangeid(conf)) {
1390 status = nfserr_clid_inuse; /* XXX: ? */
1391 goto out;
1392 }
1286 if (!same_verf(&verf, &conf->cl_verifier)) { 1393 if (!same_verf(&verf, &conf->cl_verifier)) {
1287 /* 18.35.4 case 8 */ 1394 /* 18.35.4 case 8 */
1288 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1395 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
@@ -1323,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1323 goto out; 1430 goto out;
1324 } 1431 }
1325 1432
1326 unconf = find_unconfirmed_client_by_str(dname, strhashval, true); 1433 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1327 if (unconf) { 1434 if (unconf) {
1328 /* 1435 /*
1329 * Possible retry or client restart. Per 18.35.4 case 4, 1436 * Possible retry or client restart. Per 18.35.4 case 4,
@@ -1415,7 +1522,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1415{ 1522{
1416 struct sockaddr *sa = svc_addr(rqstp); 1523 struct sockaddr *sa = svc_addr(rqstp);
1417 struct nfs4_client *conf, *unconf; 1524 struct nfs4_client *conf, *unconf;
1525 struct nfsd4_session *new;
1418 struct nfsd4_clid_slot *cs_slot = NULL; 1526 struct nfsd4_clid_slot *cs_slot = NULL;
1527 bool confirm_me = false;
1419 int status = 0; 1528 int status = 0;
1420 1529
1421 nfs4_lock_state(); 1530 nfs4_lock_state();
@@ -1438,7 +1547,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1438 cs_slot->sl_seqid, cr_ses->seqid); 1547 cs_slot->sl_seqid, cr_ses->seqid);
1439 goto out; 1548 goto out;
1440 } 1549 }
1441 cs_slot->sl_seqid++;
1442 } else if (unconf) { 1550 } else if (unconf) {
1443 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1551 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1444 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 1552 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1451,25 +1559,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1451 if (status) { 1559 if (status) {
1452 /* an unconfirmed replay returns misordered */ 1560 /* an unconfirmed replay returns misordered */
1453 status = nfserr_seq_misordered; 1561 status = nfserr_seq_misordered;
1454 goto out_cache; 1562 goto out;
1455 } 1563 }
1456 1564
1457 cs_slot->sl_seqid++; /* from 0 to 1 */ 1565 confirm_me = true;
1458 move_to_confirmed(unconf);
1459
1460 if (cr_ses->flags & SESSION4_BACK_CHAN) {
1461 unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
1462 svc_xprt_get(rqstp->rq_xprt);
1463 rpc_copy_addr(
1464 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
1465 sa);
1466 unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
1467 unconf->cl_cb_conn.cb_minorversion =
1468 cstate->minorversion;
1469 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
1470 unconf->cl_cb_seq_nr = 1;
1471 nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
1472 }
1473 conf = unconf; 1566 conf = unconf;
1474 } else { 1567 } else {
1475 status = nfserr_stale_clientid; 1568 status = nfserr_stale_clientid;
@@ -1477,22 +1570,32 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1477 } 1570 }
1478 1571
1479 /* 1572 /*
1573 * XXX: we should probably set this at creation time, and check
1574 * for consistent minorversion use throughout:
1575 */
1576 conf->cl_minorversion = 1;
1577 /*
1480 * We do not support RDMA or persistent sessions 1578 * We do not support RDMA or persistent sessions
1481 */ 1579 */
1482 cr_ses->flags &= ~SESSION4_PERSIST; 1580 cr_ses->flags &= ~SESSION4_PERSIST;
1483 cr_ses->flags &= ~SESSION4_RDMA; 1581 cr_ses->flags &= ~SESSION4_RDMA;
1484 1582
1485 status = alloc_init_session(rqstp, conf, cr_ses); 1583 status = nfserr_jukebox;
1486 if (status) 1584 new = alloc_init_session(rqstp, conf, cr_ses);
1585 if (!new)
1487 goto out; 1586 goto out;
1488 1587 status = nfs_ok;
1489 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, 1588 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1490 NFS4_MAX_SESSIONID_LEN); 1589 NFS4_MAX_SESSIONID_LEN);
1590 memcpy(&cr_ses->fore_channel, &new->se_fchannel,
1591 sizeof(struct nfsd4_channel_attrs));
1592 cs_slot->sl_seqid++;
1491 cr_ses->seqid = cs_slot->sl_seqid; 1593 cr_ses->seqid = cs_slot->sl_seqid;
1492 1594
1493out_cache:
1494 /* cache solo and embedded create sessions under the state lock */ 1595 /* cache solo and embedded create sessions under the state lock */
1495 nfsd4_cache_create_session(cr_ses, cs_slot, status); 1596 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1597 if (confirm_me)
1598 move_to_confirmed(conf);
1496out: 1599out:
1497 nfs4_unlock_state(); 1600 nfs4_unlock_state();
1498 dprintk("%s returns %d\n", __func__, ntohl(status)); 1601 dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1507,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
1507 return argp->opcnt == resp->opcnt; 1610 return argp->opcnt == resp->opcnt;
1508} 1611}
1509 1612
1613static __be32 nfsd4_map_bcts_dir(u32 *dir)
1614{
1615 switch (*dir) {
1616 case NFS4_CDFC4_FORE:
1617 case NFS4_CDFC4_BACK:
1618 return nfs_ok;
1619 case NFS4_CDFC4_FORE_OR_BOTH:
1620 case NFS4_CDFC4_BACK_OR_BOTH:
1621 *dir = NFS4_CDFC4_BOTH;
1622 return nfs_ok;
1623 };
1624 return nfserr_inval;
1625}
1626
1627__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1628 struct nfsd4_compound_state *cstate,
1629 struct nfsd4_bind_conn_to_session *bcts)
1630{
1631 __be32 status;
1632
1633 if (!nfsd4_last_compound_op(rqstp))
1634 return nfserr_not_only_op;
1635 spin_lock(&client_lock);
1636 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
1637 /* Sorta weird: we only need the refcnt'ing because new_conn acquires
 1639	 * client_lock itself: */
1639 if (cstate->session) {
1640 nfsd4_get_session(cstate->session);
1641 atomic_inc(&cstate->session->se_client->cl_refcount);
1642 }
1643 spin_unlock(&client_lock);
1644 if (!cstate->session)
1645 return nfserr_badsession;
1646
1647 status = nfsd4_map_bcts_dir(&bcts->dir);
1648 nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
1649 return nfs_ok;
1650}
1651
1510static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1652static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
1511{ 1653{
1512 if (!session) 1654 if (!session)
@@ -1545,9 +1687,11 @@ nfsd4_destroy_session(struct svc_rqst *r,
1545 spin_unlock(&client_lock); 1687 spin_unlock(&client_lock);
1546 1688
1547 nfs4_lock_state(); 1689 nfs4_lock_state();
1548 /* wait for callbacks */ 1690 nfsd4_probe_callback_sync(ses->se_client);
1549 nfsd4_set_callback_client(ses->se_client, NULL);
1550 nfs4_unlock_state(); 1691 nfs4_unlock_state();
1692
1693 nfsd4_del_conns(ses);
1694
1551 nfsd4_put_session(ses); 1695 nfsd4_put_session(ses);
1552 status = nfs_ok; 1696 status = nfs_ok;
1553out: 1697out:
@@ -1555,6 +1699,40 @@ out:
1555 return status; 1699 return status;
1556} 1700}
1557 1701
1702static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
1703{
1704 struct nfsd4_conn *c;
1705
1706 list_for_each_entry(c, &s->se_conns, cn_persession) {
1707 if (c->cn_xprt == xpt) {
1708 return c;
1709 }
1710 }
1711 return NULL;
1712}
1713
1714static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
1715{
1716 struct nfs4_client *clp = ses->se_client;
1717 struct nfsd4_conn *c;
1718 int ret;
1719
1720 spin_lock(&clp->cl_lock);
1721 c = __nfsd4_find_conn(new->cn_xprt, ses);
1722 if (c) {
1723 spin_unlock(&clp->cl_lock);
1724 free_conn(new);
1725 return;
1726 }
1727 __nfsd4_hash_conn(new, ses);
1728 spin_unlock(&clp->cl_lock);
1729 ret = nfsd4_register_conn(new);
1730 if (ret)
1731 /* oops; xprt is already down: */
1732 nfsd4_conn_lost(&new->cn_xpt_user);
1733 return;
1734}
1735
1558__be32 1736__be32
1559nfsd4_sequence(struct svc_rqst *rqstp, 1737nfsd4_sequence(struct svc_rqst *rqstp,
1560 struct nfsd4_compound_state *cstate, 1738 struct nfsd4_compound_state *cstate,
@@ -1563,11 +1741,20 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1563 struct nfsd4_compoundres *resp = rqstp->rq_resp; 1741 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1564 struct nfsd4_session *session; 1742 struct nfsd4_session *session;
1565 struct nfsd4_slot *slot; 1743 struct nfsd4_slot *slot;
1744 struct nfsd4_conn *conn;
1566 int status; 1745 int status;
1567 1746
1568 if (resp->opcnt != 1) 1747 if (resp->opcnt != 1)
1569 return nfserr_sequence_pos; 1748 return nfserr_sequence_pos;
1570 1749
1750 /*
1751 * Will be either used or freed by nfsd4_sequence_check_conn
1752 * below.
1753 */
1754 conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
1755 if (!conn)
1756 return nfserr_jukebox;
1757
1571 spin_lock(&client_lock); 1758 spin_lock(&client_lock);
1572 status = nfserr_badsession; 1759 status = nfserr_badsession;
1573 session = find_in_sessionid_hashtbl(&seq->sessionid); 1760 session = find_in_sessionid_hashtbl(&seq->sessionid);
@@ -1599,6 +1786,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1599 if (status) 1786 if (status)
1600 goto out; 1787 goto out;
1601 1788
1789 nfsd4_sequence_check_conn(conn, session);
1790 conn = NULL;
1791
1602 /* Success! bump slot seqid */ 1792 /* Success! bump slot seqid */
1603 slot->sl_inuse = true; 1793 slot->sl_inuse = true;
1604 slot->sl_seqid = seq->seqid; 1794 slot->sl_seqid = seq->seqid;
@@ -1610,9 +1800,14 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1610out: 1800out:
1611 /* Hold a session reference until done processing the compound. */ 1801 /* Hold a session reference until done processing the compound. */
1612 if (cstate->session) { 1802 if (cstate->session) {
1803 struct nfs4_client *clp = session->se_client;
1804
1613 nfsd4_get_session(cstate->session); 1805 nfsd4_get_session(cstate->session);
1614 atomic_inc(&session->se_client->cl_refcount); 1806 atomic_inc(&clp->cl_refcount);
1807 if (clp->cl_cb_state == NFSD4_CB_DOWN)
1808 seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
1615 } 1809 }
1810 kfree(conn);
1616 spin_unlock(&client_lock); 1811 spin_unlock(&client_lock);
1617 dprintk("%s: return %d\n", __func__, ntohl(status)); 1812 dprintk("%s: return %d\n", __func__, ntohl(status));
1618 return status; 1813 return status;
@@ -1651,7 +1846,6 @@ __be32
1651nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1846nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1652 struct nfsd4_setclientid *setclid) 1847 struct nfsd4_setclientid *setclid)
1653{ 1848{
1654 struct sockaddr *sa = svc_addr(rqstp);
1655 struct xdr_netobj clname = { 1849 struct xdr_netobj clname = {
1656 .len = setclid->se_namelen, 1850 .len = setclid->se_namelen,
1657 .data = setclid->se_name, 1851 .data = setclid->se_name,
@@ -1677,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1677 strhashval = clientstr_hashval(dname); 1871 strhashval = clientstr_hashval(dname);
1678 1872
1679 nfs4_lock_state(); 1873 nfs4_lock_state();
1680 conf = find_confirmed_client_by_str(dname, strhashval, false); 1874 conf = find_confirmed_client_by_str(dname, strhashval);
1681 if (conf) { 1875 if (conf) {
1682 /* RFC 3530 14.2.33 CASE 0: */ 1876 /* RFC 3530 14.2.33 CASE 0: */
1683 status = nfserr_clid_inuse; 1877 status = nfserr_clid_inuse;
1878 if (clp_used_exchangeid(conf))
1879 goto out;
1684 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { 1880 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1685 char addr_str[INET6_ADDRSTRLEN]; 1881 char addr_str[INET6_ADDRSTRLEN];
1686 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, 1882 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
@@ -1695,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1695 * has a description of SETCLIENTID request processing consisting 1891 * has a description of SETCLIENTID request processing consisting
1696 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1892 * of 5 bullet points, labeled as CASE0 - CASE4 below.
1697 */ 1893 */
1698 unconf = find_unconfirmed_client_by_str(dname, strhashval, false); 1894 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1699 status = nfserr_resource; 1895 status = nfserr_resource;
1700 if (!conf) { 1896 if (!conf) {
1701 /* 1897 /*
@@ -1747,7 +1943,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1747 goto out; 1943 goto out;
1748 gen_clid(new); 1944 gen_clid(new);
1749 } 1945 }
1750 gen_callback(new, setclid, rpc_get_scope_id(sa)); 1946 /*
1947 * XXX: we should probably set this at creation time, and check
1948 * for consistent minorversion use throughout:
1949 */
1950 new->cl_minorversion = 0;
1951 gen_callback(new, setclid, rqstp);
1751 add_to_unconfirmed(new, strhashval); 1952 add_to_unconfirmed(new, strhashval);
1752 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1953 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
1753 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 1954 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1806,8 +2007,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1806 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 2007 if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
1807 status = nfserr_clid_inuse; 2008 status = nfserr_clid_inuse;
1808 else { 2009 else {
1809 atomic_set(&conf->cl_cb_set, 0); 2010 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
1810 nfsd4_probe_callback(conf, &unconf->cl_cb_conn); 2011 nfsd4_probe_callback(conf);
1811 expire_client(unconf); 2012 expire_client(unconf);
1812 status = nfs_ok; 2013 status = nfs_ok;
1813 2014
@@ -1834,14 +2035,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1834 unsigned int hash = 2035 unsigned int hash =
1835 clientstr_hashval(unconf->cl_recdir); 2036 clientstr_hashval(unconf->cl_recdir);
1836 conf = find_confirmed_client_by_str(unconf->cl_recdir, 2037 conf = find_confirmed_client_by_str(unconf->cl_recdir,
1837 hash, false); 2038 hash);
1838 if (conf) { 2039 if (conf) {
1839 nfsd4_remove_clid_dir(conf); 2040 nfsd4_remove_clid_dir(conf);
1840 expire_client(conf); 2041 expire_client(conf);
1841 } 2042 }
1842 move_to_confirmed(unconf); 2043 move_to_confirmed(unconf);
1843 conf = unconf; 2044 conf = unconf;
1844 nfsd4_probe_callback(conf, &conf->cl_cb_conn); 2045 nfsd4_probe_callback(conf);
1845 status = nfs_ok; 2046 status = nfs_ok;
1846 } 2047 }
1847 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) 2048 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -2132,7 +2333,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2132 * Spawn a thread to perform a recall on the delegation represented 2333 * Spawn a thread to perform a recall on the delegation represented
2133 * by the lease (file_lock) 2334 * by the lease (file_lock)
2134 * 2335 *
2135 * Called from break_lease() with lock_kernel() held. 2336 * Called from break_lease() with lock_flocks() held.
2136 * Note: we assume break_lease will only call this *once* for any given 2337 * Note: we assume break_lease will only call this *once* for any given
2137 * lease. 2338 * lease.
2138 */ 2339 */
@@ -2156,7 +2357,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2156 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2357 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
2157 spin_unlock(&recall_lock); 2358 spin_unlock(&recall_lock);
2158 2359
2159 /* only place dl_time is set. protected by lock_kernel*/ 2360 /* only place dl_time is set. protected by lock_flocks*/
2160 dp->dl_time = get_seconds(); 2361 dp->dl_time = get_seconds();
2161 2362
2162 /* 2363 /*
@@ -2170,57 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2170 nfsd4_cb_recall(dp); 2371 nfsd4_cb_recall(dp);
2171} 2372}
2172 2373
2173/*
2174 * The file_lock is being reaped.
2175 *
2176 * Called by locks_free_lock() with lock_kernel() held.
2177 */
2178static
2179void nfsd_release_deleg_cb(struct file_lock *fl)
2180{
2181 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2182
2183 dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count));
2184
2185 if (!(fl->fl_flags & FL_LEASE) || !dp)
2186 return;
2187 dp->dl_flock = NULL;
2188}
2189
2190/*
2191 * Set the delegation file_lock back pointer.
2192 *
2193 * Called from setlease() with lock_kernel() held.
2194 */
2195static
2196void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
2197{
2198 struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner;
2199
2200 dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp);
2201 if (!dp)
2202 return;
2203 dp->dl_flock = new;
2204}
2205
2206/*
2207 * Called from setlease() with lock_kernel() held
2208 */
2209static
2210int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
2211{
2212 struct nfs4_delegation *onlistd =
2213 (struct nfs4_delegation *)onlist->fl_owner;
2214 struct nfs4_delegation *tryd =
2215 (struct nfs4_delegation *)try->fl_owner;
2216
2217 if (onlist->fl_lmops != try->fl_lmops)
2218 return 0;
2219
2220 return onlistd->dl_client == tryd->dl_client;
2221}
2222
2223
2224static 2374static
2225int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) 2375int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2226{ 2376{
@@ -2232,9 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2232 2382
2233static const struct lock_manager_operations nfsd_lease_mng_ops = { 2383static const struct lock_manager_operations nfsd_lease_mng_ops = {
2234 .fl_break = nfsd_break_deleg_cb, 2384 .fl_break = nfsd_break_deleg_cb,
2235 .fl_release_private = nfsd_release_deleg_cb,
2236 .fl_copy_lock = nfsd_copy_lock_deleg_cb,
2237 .fl_mylease = nfsd_same_client_deleg_cb,
2238 .fl_change = nfsd_change_deleg_cb, 2385 .fl_change = nfsd_change_deleg_cb,
2239}; 2386};
2240 2387
@@ -2401,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
2401 if (!fp->fi_fds[oflag]) { 2548 if (!fp->fi_fds[oflag]) {
2402 status = nfsd_open(rqstp, cur_fh, S_IFREG, access, 2549 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
2403 &fp->fi_fds[oflag]); 2550 &fp->fi_fds[oflag]);
2404 if (status == nfserr_dropit)
2405 status = nfserr_jukebox;
2406 if (status) 2551 if (status)
2407 return status; 2552 return status;
2408 } 2553 }
@@ -2483,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open)
2483 open->op_stateowner->so_client->cl_firststate = 1; 2628 open->op_stateowner->so_client->cl_firststate = 1;
2484} 2629}
2485 2630
2631/* Should we give out recallable state?: */
2632static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2633{
2634 if (clp->cl_cb_state == NFSD4_CB_UP)
2635 return true;
2636 /*
2637 * In the sessions case, since we don't have to establish a
2638 * separate connection for callbacks, we assume it's OK
2639 * until we hear otherwise:
2640 */
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642}
2643
2486/* 2644/*
2487 * Attempt to hand out a delegation. 2645 * Attempt to hand out a delegation.
2488 */ 2646 */
@@ -2491,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2491{ 2649{
2492 struct nfs4_delegation *dp; 2650 struct nfs4_delegation *dp;
2493 struct nfs4_stateowner *sop = stp->st_stateowner; 2651 struct nfs4_stateowner *sop = stp->st_stateowner;
2494 int cb_up = atomic_read(&sop->so_client->cl_cb_set); 2652 int cb_up;
2495 struct file_lock fl, *flp = &fl; 2653 struct file_lock *fl;
2496 int status, flag = 0; 2654 int status, flag = 0;
2497 2655
2656 cb_up = nfsd4_cb_channel_good(sop->so_client);
2498 flag = NFS4_OPEN_DELEGATE_NONE; 2657 flag = NFS4_OPEN_DELEGATE_NONE;
2499 open->op_recall = 0; 2658 open->op_recall = 0;
2500 switch (open->op_claim_type) { 2659 switch (open->op_claim_type) {
@@ -2526,21 +2685,28 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2526 flag = NFS4_OPEN_DELEGATE_NONE; 2685 flag = NFS4_OPEN_DELEGATE_NONE;
2527 goto out; 2686 goto out;
2528 } 2687 }
2529 locks_init_lock(&fl); 2688 status = -ENOMEM;
2530 fl.fl_lmops = &nfsd_lease_mng_ops; 2689 fl = locks_alloc_lock();
2531 fl.fl_flags = FL_LEASE; 2690 if (!fl)
2532 fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; 2691 goto out;
2533 fl.fl_end = OFFSET_MAX; 2692 locks_init_lock(fl);
2534 fl.fl_owner = (fl_owner_t)dp; 2693 fl->fl_lmops = &nfsd_lease_mng_ops;
2535 fl.fl_file = find_readable_file(stp->st_file); 2694 fl->fl_flags = FL_LEASE;
2536 BUG_ON(!fl.fl_file); 2695 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2537 fl.fl_pid = current->tgid; 2696 fl->fl_end = OFFSET_MAX;
2697 fl->fl_owner = (fl_owner_t)dp;
2698 fl->fl_file = find_readable_file(stp->st_file);
2699 BUG_ON(!fl->fl_file);
2700 fl->fl_pid = current->tgid;
2701 dp->dl_flock = fl;
2538 2702
2539 /* vfs_setlease checks to see if delegation should be handed out. 2703 /* vfs_setlease checks to see if delegation should be handed out.
2540 * the lock_manager callbacks fl_mylease and fl_change are used 2704 * the lock_manager callback fl_change is used
2541 */ 2705 */
2542 if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) { 2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2543 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2708 dp->dl_flock = NULL;
2709 locks_free_lock(fl);
2544 unhash_delegation(dp); 2710 unhash_delegation(dp);
2545 flag = NFS4_OPEN_DELEGATE_NONE; 2711 flag = NFS4_OPEN_DELEGATE_NONE;
2546 goto out; 2712 goto out;
@@ -2674,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2674 renew_client(clp); 2840 renew_client(clp);
2675 status = nfserr_cb_path_down; 2841 status = nfserr_cb_path_down;
2676 if (!list_empty(&clp->cl_delegations) 2842 if (!list_empty(&clp->cl_delegations)
2677 && !atomic_read(&clp->cl_cb_set)) 2843 && clp->cl_cb_state != NFSD4_CB_UP)
2678 goto out; 2844 goto out;
2679 status = nfs_ok; 2845 status = nfs_ok;
2680out: 2846out:
@@ -2944,7 +3110,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2944 if (STALE_STATEID(stateid)) 3110 if (STALE_STATEID(stateid))
2945 goto out; 3111 goto out;
2946 3112
2947 status = nfserr_bad_stateid; 3113 /*
3114 * We assume that any stateid that has the current boot time,
3115 * but that we can't find, is expired:
3116 */
3117 status = nfserr_expired;
2948 if (is_delegation_stateid(stateid)) { 3118 if (is_delegation_stateid(stateid)) {
2949 dp = find_delegation_stateid(ino, stateid); 3119 dp = find_delegation_stateid(ino, stateid);
2950 if (!dp) 3120 if (!dp)
@@ -2957,13 +3127,15 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2957 if (status) 3127 if (status)
2958 goto out; 3128 goto out;
2959 renew_client(dp->dl_client); 3129 renew_client(dp->dl_client);
2960 if (filpp) 3130 if (filpp) {
2961 *filpp = find_readable_file(dp->dl_file); 3131 *filpp = find_readable_file(dp->dl_file);
2962 BUG_ON(!*filpp); 3132 BUG_ON(!*filpp);
3133 }
2963 } else { /* open or lock stateid */ 3134 } else { /* open or lock stateid */
2964 stp = find_stateid(stateid, flags); 3135 stp = find_stateid(stateid, flags);
2965 if (!stp) 3136 if (!stp)
2966 goto out; 3137 goto out;
3138 status = nfserr_bad_stateid;
2967 if (nfs4_check_fh(current_fh, stp)) 3139 if (nfs4_check_fh(current_fh, stp))
2968 goto out; 3140 goto out;
2969 if (!stp->st_stateowner->so_confirmed) 3141 if (!stp->st_stateowner->so_confirmed)
@@ -3038,8 +3210,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3038 * a replayed close: 3210 * a replayed close:
3039 */ 3211 */
3040 sop = search_close_lru(stateid->si_stateownerid, flags); 3212 sop = search_close_lru(stateid->si_stateownerid, flags);
3213 /* It's not stale; let's assume it's expired: */
3041 if (sop == NULL) 3214 if (sop == NULL)
3042 return nfserr_bad_stateid; 3215 return nfserr_expired;
3043 *sopp = sop; 3216 *sopp = sop;
3044 goto check_replay; 3217 goto check_replay;
3045 } 3218 }
@@ -3304,6 +3477,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3304 status = nfserr_bad_stateid; 3477 status = nfserr_bad_stateid;
3305 if (!is_delegation_stateid(stateid)) 3478 if (!is_delegation_stateid(stateid))
3306 goto out; 3479 goto out;
3480 status = nfserr_expired;
3307 dp = find_delegation_stateid(inode, stateid); 3481 dp = find_delegation_stateid(inode, stateid);
3308 if (!dp) 3482 if (!dp)
3309 goto out; 3483 goto out;
@@ -3895,7 +4069,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
3895 struct inode *inode = filp->fi_inode; 4069 struct inode *inode = filp->fi_inode;
3896 int status = 0; 4070 int status = 0;
3897 4071
3898 lock_kernel(); 4072 lock_flocks();
3899 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 4073 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
3900 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 4074 if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
3901 status = 1; 4075 status = 1;
@@ -3903,7 +4077,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
3903 } 4077 }
3904 } 4078 }
3905out: 4079out:
3906 unlock_kernel(); 4080 unlock_flocks();
3907 return status; 4081 return status;
3908} 4082}
3909 4083
@@ -3980,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
3980 unsigned int strhashval = clientstr_hashval(name); 4154 unsigned int strhashval = clientstr_hashval(name);
3981 struct nfs4_client *clp; 4155 struct nfs4_client *clp;
3982 4156
3983 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); 4157 clp = find_confirmed_client_by_str(name, strhashval);
3984 return clp ? 1 : 0; 4158 return clp ? 1 : 0;
3985} 4159}
3986 4160
@@ -4209,7 +4383,7 @@ __nfs4_state_shutdown(void)
4209void 4383void
4210nfs4_state_shutdown(void) 4384nfs4_state_shutdown(void)
4211{ 4385{
4212 cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); 4386 cancel_delayed_work_sync(&laundromat_work);
4213 destroy_workqueue(laundry_wq); 4387 destroy_workqueue(laundry_wq);
4214 locks_end_grace(&nfsd4_manager); 4388 locks_end_grace(&nfsd4_manager);
4215 nfs4_lock_state(); 4389 nfs4_lock_state();
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1a468bbd330f..956629b9cdc9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,14 @@
44#include <linux/namei.h> 44#include <linux/namei.h>
45#include <linux/statfs.h> 45#include <linux/statfs.h>
46#include <linux/utsname.h> 46#include <linux/utsname.h>
47#include <linux/nfsd_idmap.h>
48#include <linux/nfs4_acl.h>
49#include <linux/sunrpc/svcauth_gss.h> 47#include <linux/sunrpc/svcauth_gss.h>
50 48
49#include "idmap.h"
50#include "acl.h"
51#include "xdr4.h" 51#include "xdr4.h"
52#include "vfs.h" 52#include "vfs.h"
53 53
54
54#define NFSDDBG_FACILITY NFSDDBG_XDR 55#define NFSDDBG_FACILITY NFSDDBG_XDR
55 56
56/* 57/*
@@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
288 len += XDR_QUADLEN(dummy32) << 2; 289 len += XDR_QUADLEN(dummy32) << 2;
289 READMEM(buf, dummy32); 290 READMEM(buf, dummy32);
290 ace->whotype = nfs4_acl_get_whotype(buf, dummy32); 291 ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
291 host_err = 0; 292 status = nfs_ok;
292 if (ace->whotype != NFS4_ACL_WHO_NAMED) 293 if (ace->whotype != NFS4_ACL_WHO_NAMED)
293 ace->who = 0; 294 ace->who = 0;
294 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) 295 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
295 host_err = nfsd_map_name_to_gid(argp->rqstp, 296 status = nfsd_map_name_to_gid(argp->rqstp,
296 buf, dummy32, &ace->who); 297 buf, dummy32, &ace->who);
297 else 298 else
298 host_err = nfsd_map_name_to_uid(argp->rqstp, 299 status = nfsd_map_name_to_uid(argp->rqstp,
299 buf, dummy32, &ace->who); 300 buf, dummy32, &ace->who);
300 if (host_err) 301 if (status)
301 goto out_nfserr; 302 return status;
302 } 303 }
303 } else 304 } else
304 *acl = NULL; 305 *acl = NULL;
@@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
420 DECODE_TAIL; 421 DECODE_TAIL;
421} 422}
422 423
424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
425{
426 DECODE_HEAD;
427 u32 dummy;
428
429 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
430 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
431 READ32(bcts->dir);
432 /* XXX: Perhaps Tom Tucker could help us figure out how we
433 * should be using ctsa_use_conn_in_rdma_mode: */
434 READ32(dummy);
435
436 DECODE_TAIL;
437}
438
423static __be32 439static __be32
424nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) 440nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
425{ 441{
@@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
847} 863}
848 864
849static __be32 865static __be32
866nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
867 struct nfsd4_secinfo_no_name *sin)
868{
869 DECODE_HEAD;
870
871 READ_BUF(4);
872 READ32(sin->sin_style);
873 DECODE_TAIL;
874}
875
876static __be32
850nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 877nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
851{ 878{
852 __be32 status; 879 __be32 status;
@@ -1005,7 +1032,7 @@ static __be32
1005nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, 1032nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1006 struct nfsd4_exchange_id *exid) 1033 struct nfsd4_exchange_id *exid)
1007{ 1034{
1008 int dummy; 1035 int dummy, tmp;
1009 DECODE_HEAD; 1036 DECODE_HEAD;
1010 1037
1011 READ_BUF(NFS4_VERIFIER_SIZE); 1038 READ_BUF(NFS4_VERIFIER_SIZE);
@@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1053 1080
1054 /* ssp_hash_algs<> */ 1081 /* ssp_hash_algs<> */
1055 READ_BUF(4); 1082 READ_BUF(4);
1056 READ32(dummy); 1083 READ32(tmp);
1057 READ_BUF(dummy); 1084 while (tmp--) {
1058 p += XDR_QUADLEN(dummy); 1085 READ_BUF(4);
1086 READ32(dummy);
1087 READ_BUF(dummy);
1088 p += XDR_QUADLEN(dummy);
1089 }
1059 1090
1060 /* ssp_encr_algs<> */ 1091 /* ssp_encr_algs<> */
1061 READ_BUF(4); 1092 READ_BUF(4);
1062 READ32(dummy); 1093 READ32(tmp);
1063 READ_BUF(dummy); 1094 while (tmp--) {
1064 p += XDR_QUADLEN(dummy); 1095 READ_BUF(4);
1096 READ32(dummy);
1097 READ_BUF(dummy);
1098 p += XDR_QUADLEN(dummy);
1099 }
1065 1100
1066 /* ssp_window and ssp_num_gss_handles */ 1101 /* ssp_window and ssp_num_gss_handles */
1067 READ_BUF(8); 1102 READ_BUF(8);
@@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1339 1374
1340 /* new operations for NFSv4.1 */ 1375 /* new operations for NFSv4.1 */
1341 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, 1376 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
1342 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, 1377 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
1343 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, 1378 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1344 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, 1379 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
1345 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1380 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
@@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1350 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1385 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1351 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1386 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1387 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1353 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, 1388 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1354 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1389 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1355 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1390 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1356 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, 1391 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -1805,19 +1840,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1805 goto out_nfserr; 1840 goto out_nfserr;
1806 } 1841 }
1807 } 1842 }
1808 if ((buflen -= 16) < 0)
1809 goto out_resource;
1810 1843
1811 if (unlikely(bmval2)) { 1844 if (bmval2) {
1845 if ((buflen -= 16) < 0)
1846 goto out_resource;
1812 WRITE32(3); 1847 WRITE32(3);
1813 WRITE32(bmval0); 1848 WRITE32(bmval0);
1814 WRITE32(bmval1); 1849 WRITE32(bmval1);
1815 WRITE32(bmval2); 1850 WRITE32(bmval2);
1816 } else if (likely(bmval1)) { 1851 } else if (bmval1) {
1852 if ((buflen -= 12) < 0)
1853 goto out_resource;
1817 WRITE32(2); 1854 WRITE32(2);
1818 WRITE32(bmval0); 1855 WRITE32(bmval0);
1819 WRITE32(bmval1); 1856 WRITE32(bmval1);
1820 } else { 1857 } else {
1858 if ((buflen -= 8) < 0)
1859 goto out_resource;
1821 WRITE32(1); 1860 WRITE32(1);
1822 WRITE32(bmval0); 1861 WRITE32(bmval0);
1823 } 1862 }
@@ -1828,15 +1867,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1828 u32 word1 = nfsd_suppattrs1(minorversion); 1867 u32 word1 = nfsd_suppattrs1(minorversion);
1829 u32 word2 = nfsd_suppattrs2(minorversion); 1868 u32 word2 = nfsd_suppattrs2(minorversion);
1830 1869
1831 if ((buflen -= 12) < 0)
1832 goto out_resource;
1833 if (!aclsupport) 1870 if (!aclsupport)
1834 word0 &= ~FATTR4_WORD0_ACL; 1871 word0 &= ~FATTR4_WORD0_ACL;
1835 if (!word2) { 1872 if (!word2) {
1873 if ((buflen -= 12) < 0)
1874 goto out_resource;
1836 WRITE32(2); 1875 WRITE32(2);
1837 WRITE32(word0); 1876 WRITE32(word0);
1838 WRITE32(word1); 1877 WRITE32(word1);
1839 } else { 1878 } else {
1879 if ((buflen -= 16) < 0)
1880 goto out_resource;
1840 WRITE32(3); 1881 WRITE32(3);
1841 WRITE32(word0); 1882 WRITE32(word0);
1842 WRITE32(word1); 1883 WRITE32(word1);
@@ -2303,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2303 case nfserr_resource: 2344 case nfserr_resource:
2304 nfserr = nfserr_toosmall; 2345 nfserr = nfserr_toosmall;
2305 goto fail; 2346 goto fail;
2306 case nfserr_dropit:
2307 goto fail;
2308 case nfserr_noent: 2347 case nfserr_noent:
2309 goto skip_entry; 2348 goto skip_entry;
2310 default: 2349 default:
@@ -2359,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2359 return nfserr; 2398 return nfserr;
2360} 2399}
2361 2400
2401static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
2402{
2403 __be32 *p;
2404
2405 if (!nfserr) {
2406 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
2407 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
2408 WRITE32(bcts->dir);
2409 /* XXX: ? */
2410 WRITE32(0);
2411 ADJUST_ARGS();
2412 }
2413 return nfserr;
2414}
2415
2362static __be32 2416static __be32
2363nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) 2417nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
2364{ 2418{
@@ -2820,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2820} 2874}
2821 2875
2822static __be32 2876static __be32
2823nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, 2877nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
2824 struct nfsd4_secinfo *secinfo) 2878 __be32 nfserr, struct svc_export *exp)
2825{ 2879{
2826 int i = 0; 2880 int i = 0;
2827 struct svc_export *exp = secinfo->si_exp;
2828 u32 nflavs; 2881 u32 nflavs;
2829 struct exp_flavor_info *flavs; 2882 struct exp_flavor_info *flavs;
2830 struct exp_flavor_info def_flavs[2]; 2883 struct exp_flavor_info def_flavs[2];
@@ -2886,6 +2939,20 @@ out:
2886 return nfserr; 2939 return nfserr;
2887} 2940}
2888 2941
2942static __be32
2943nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
2944 struct nfsd4_secinfo *secinfo)
2945{
2946 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
2947}
2948
2949static __be32
2950nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
2951 struct nfsd4_secinfo_no_name *secinfo)
2952{
2953 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
2954}
2955
2889/* 2956/*
2890 * The SETATTR encode routine is special -- it always encodes a bitmap, 2957 * The SETATTR encode routine is special -- it always encodes a bitmap,
2891 * regardless of the error status. 2958 * regardless of the error status.
@@ -3070,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3070 WRITE32(seq->seqid); 3137 WRITE32(seq->seqid);
3071 WRITE32(seq->slotid); 3138 WRITE32(seq->slotid);
3072 WRITE32(seq->maxslots); 3139 WRITE32(seq->maxslots);
3073 /* 3140 /* For now: target_maxslots = maxslots */
3074 * FIXME: for now:
3075 * target_maxslots = maxslots
3076 * status_flags = 0
3077 */
3078 WRITE32(seq->maxslots); 3141 WRITE32(seq->maxslots);
3079 WRITE32(0); 3142 WRITE32(seq->status_flags);
3080 3143
3081 ADJUST_ARGS(); 3144 ADJUST_ARGS();
3082 resp->cstate.datap = p; /* DRC cache data pointer */ 3145 resp->cstate.datap = p; /* DRC cache data pointer */
@@ -3137,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3137 3200
3138 /* NFSv4.1 operations */ 3201 /* NFSv4.1 operations */
3139 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, 3202 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3140 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 3203 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
3141 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, 3204 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3142 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, 3205 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3143 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, 3206 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
@@ -3148,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3148 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 3211 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3149 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 3212 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 3213 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, 3214 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3152 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 3215 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3153 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 3216 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3154 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 3217 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b53b1d042f1f..33b3e2b06779 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -8,12 +8,12 @@
8#include <linux/namei.h> 8#include <linux/namei.h>
9#include <linux/ctype.h> 9#include <linux/ctype.h>
10 10
11#include <linux/nfsd_idmap.h>
12#include <linux/sunrpc/svcsock.h> 11#include <linux/sunrpc/svcsock.h>
13#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
16 15
16#include "idmap.h"
17#include "nfsd.h" 17#include "nfsd.h"
18#include "cache.h" 18#include "cache.h"
19 19
@@ -22,6 +22,7 @@
22 */ 22 */
23enum { 23enum {
24 NFSD_Root = 1, 24 NFSD_Root = 1,
25#ifdef CONFIG_NFSD_DEPRECATED
25 NFSD_Svc, 26 NFSD_Svc,
26 NFSD_Add, 27 NFSD_Add,
27 NFSD_Del, 28 NFSD_Del,
@@ -29,6 +30,7 @@ enum {
29 NFSD_Unexport, 30 NFSD_Unexport,
30 NFSD_Getfd, 31 NFSD_Getfd,
31 NFSD_Getfs, 32 NFSD_Getfs,
33#endif
32 NFSD_List, 34 NFSD_List,
33 NFSD_Export_features, 35 NFSD_Export_features,
34 NFSD_Fh, 36 NFSD_Fh,
@@ -54,6 +56,7 @@ enum {
54/* 56/*
55 * write() for these nodes. 57 * write() for these nodes.
56 */ 58 */
59#ifdef CONFIG_NFSD_DEPRECATED
57static ssize_t write_svc(struct file *file, char *buf, size_t size); 60static ssize_t write_svc(struct file *file, char *buf, size_t size);
58static ssize_t write_add(struct file *file, char *buf, size_t size); 61static ssize_t write_add(struct file *file, char *buf, size_t size);
59static ssize_t write_del(struct file *file, char *buf, size_t size); 62static ssize_t write_del(struct file *file, char *buf, size_t size);
@@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size);
61static ssize_t write_unexport(struct file *file, char *buf, size_t size); 64static ssize_t write_unexport(struct file *file, char *buf, size_t size);
62static ssize_t write_getfd(struct file *file, char *buf, size_t size); 65static ssize_t write_getfd(struct file *file, char *buf, size_t size);
63static ssize_t write_getfs(struct file *file, char *buf, size_t size); 66static ssize_t write_getfs(struct file *file, char *buf, size_t size);
67#endif
64static ssize_t write_filehandle(struct file *file, char *buf, size_t size); 68static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
65static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); 69static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
66static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); 70static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
76#endif 80#endif
77 81
78static ssize_t (*write_op[])(struct file *, char *, size_t) = { 82static ssize_t (*write_op[])(struct file *, char *, size_t) = {
83#ifdef CONFIG_NFSD_DEPRECATED
79 [NFSD_Svc] = write_svc, 84 [NFSD_Svc] = write_svc,
80 [NFSD_Add] = write_add, 85 [NFSD_Add] = write_add,
81 [NFSD_Del] = write_del, 86 [NFSD_Del] = write_del,
@@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
83 [NFSD_Unexport] = write_unexport, 88 [NFSD_Unexport] = write_unexport,
84 [NFSD_Getfd] = write_getfd, 89 [NFSD_Getfd] = write_getfd,
85 [NFSD_Getfs] = write_getfs, 90 [NFSD_Getfs] = write_getfs,
91#endif
86 [NFSD_Fh] = write_filehandle, 92 [NFSD_Fh] = write_filehandle,
87 [NFSD_FO_UnlockIP] = write_unlock_ip, 93 [NFSD_FO_UnlockIP] = write_unlock_ip,
88 [NFSD_FO_UnlockFS] = write_unlock_fs, 94 [NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -121,6 +127,16 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
121 127
122static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
123{ 129{
130#ifdef CONFIG_NFSD_DEPRECATED
131 static int warned;
132 if (file->f_dentry->d_name.name[0] == '.' && !warned) {
133 printk(KERN_INFO
134 "Warning: \"%s\" uses deprecated NFSD interface: %s."
135 " This will be removed in 2.6.40\n",
136 current->comm, file->f_dentry->d_name.name);
137 warned = 1;
138 }
139#endif
124 if (! file->private_data) { 140 if (! file->private_data) {
125 /* An attempt to read a transaction file without writing 141 /* An attempt to read a transaction file without writing
126 * causes a 0-byte write so that the file can return 142 * causes a 0-byte write so that the file can return
@@ -137,6 +153,7 @@ static const struct file_operations transaction_ops = {
137 .write = nfsctl_transaction_write, 153 .write = nfsctl_transaction_write,
138 .read = nfsctl_transaction_read, 154 .read = nfsctl_transaction_read,
139 .release = simple_transaction_release, 155 .release = simple_transaction_release,
156 .llseek = default_llseek,
140}; 157};
141 158
142static int exports_open(struct inode *inode, struct file *file) 159static int exports_open(struct inode *inode, struct file *file)
@@ -186,6 +203,7 @@ static const struct file_operations pool_stats_operations = {
186 * payload - write methods 203 * payload - write methods
187 */ 204 */
188 205
206#ifdef CONFIG_NFSD_DEPRECATED
189/** 207/**
190 * write_svc - Start kernel's NFSD server 208 * write_svc - Start kernel's NFSD server
191 * 209 *
@@ -401,7 +419,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
401 419
402 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); 420 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
403 421
404 clp = auth_unix_lookup(&in6); 422 clp = auth_unix_lookup(&init_net, &in6);
405 if (!clp) 423 if (!clp)
406 err = -EPERM; 424 err = -EPERM;
407 else { 425 else {
@@ -464,7 +482,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
464 482
465 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); 483 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
466 484
467 clp = auth_unix_lookup(&in6); 485 clp = auth_unix_lookup(&init_net, &in6);
468 if (!clp) 486 if (!clp)
469 err = -EPERM; 487 err = -EPERM;
470 else { 488 else {
@@ -481,6 +499,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
481 out: 499 out:
482 return err; 500 return err;
483} 501}
502#endif /* CONFIG_NFSD_DEPRECATED */
484 503
485/** 504/**
486 * write_unlock_ip - Release all locks used by a client 505 * write_unlock_ip - Release all locks used by a client
@@ -999,12 +1018,12 @@ static ssize_t __write_ports_addxprt(char *buf)
999 if (err != 0) 1018 if (err != 0)
1000 return err; 1019 return err;
1001 1020
1002 err = svc_create_xprt(nfsd_serv, transport, 1021 err = svc_create_xprt(nfsd_serv, transport, &init_net,
1003 PF_INET, port, SVC_SOCK_ANONYMOUS); 1022 PF_INET, port, SVC_SOCK_ANONYMOUS);
1004 if (err < 0) 1023 if (err < 0)
1005 goto out_err; 1024 goto out_err;
1006 1025
1007 err = svc_create_xprt(nfsd_serv, transport, 1026 err = svc_create_xprt(nfsd_serv, transport, &init_net,
1008 PF_INET6, port, SVC_SOCK_ANONYMOUS); 1027 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1009 if (err < 0 && err != -EAFNOSUPPORT) 1028 if (err < 0 && err != -EAFNOSUPPORT)
1010 goto out_close; 1029 goto out_close;
@@ -1355,6 +1374,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
1355static int nfsd_fill_super(struct super_block * sb, void * data, int silent) 1374static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1356{ 1375{
1357 static struct tree_descr nfsd_files[] = { 1376 static struct tree_descr nfsd_files[] = {
1377#ifdef CONFIG_NFSD_DEPRECATED
1358 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, 1378 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
1359 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, 1379 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
1360 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, 1380 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
@@ -1362,6 +1382,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1362 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, 1382 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
1363 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, 1383 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
1364 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, 1384 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
1385#endif
1365 [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, 1386 [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
1366 [NFSD_Export_features] = {"export_features", 1387 [NFSD_Export_features] = {"export_features",
1367 &export_features_operations, S_IRUGO}, 1388 &export_features_operations, S_IRUGO},
@@ -1386,16 +1407,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1386 return simple_fill_super(sb, 0x6e667364, nfsd_files); 1407 return simple_fill_super(sb, 0x6e667364, nfsd_files);
1387} 1408}
1388 1409
1389static int nfsd_get_sb(struct file_system_type *fs_type, 1410static struct dentry *nfsd_mount(struct file_system_type *fs_type,
1390 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1411 int flags, const char *dev_name, void *data)
1391{ 1412{
1392 return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt); 1413 return mount_single(fs_type, flags, data, nfsd_fill_super);
1393} 1414}
1394 1415
1395static struct file_system_type nfsd_fs_type = { 1416static struct file_system_type nfsd_fs_type = {
1396 .owner = THIS_MODULE, 1417 .owner = THIS_MODULE,
1397 .name = "nfsd", 1418 .name = "nfsd",
1398 .get_sb = nfsd_get_sb, 1419 .mount = nfsd_mount,
1399 .kill_sb = kill_litter_super, 1420 .kill_sb = kill_litter_super,
1400}; 1421};
1401 1422
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index b76ac3a82e39..7ecfa2420307 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void);
158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) 158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) 159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) 160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
161#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
161#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) 162#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
162#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) 163#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
163#define nfserr_grace cpu_to_be32(NFSERR_GRACE) 164#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
@@ -249,7 +250,7 @@ extern time_t nfsd4_grace;
249#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 250#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
250#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 251#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
251 252
252#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ 253#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
253 254
254/* 255/*
255 * The following attributes are currently not supported by the NFSv4 server: 256 * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08e17264784b..e15dc45fc5ec 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -735,9 +735,9 @@ nfserrno (int errno)
735 { nfserr_stale, -ESTALE }, 735 { nfserr_stale, -ESTALE },
736 { nfserr_jukebox, -ETIMEDOUT }, 736 { nfserr_jukebox, -ETIMEDOUT },
737 { nfserr_jukebox, -ERESTARTSYS }, 737 { nfserr_jukebox, -ERESTARTSYS },
738 { nfserr_dropit, -EAGAIN }, 738 { nfserr_jukebox, -EAGAIN },
739 { nfserr_dropit, -ENOMEM }, 739 { nfserr_jukebox, -EWOULDBLOCK },
740 { nfserr_badname, -ESRCH }, 740 { nfserr_jukebox, -ENOMEM },
741 { nfserr_io, -ETXTBSY }, 741 { nfserr_io, -ETXTBSY },
742 { nfserr_notsupp, -EOPNOTSUPP }, 742 { nfserr_notsupp, -EOPNOTSUPP },
743 { nfserr_toosmall, -ETOOSMALL }, 743 { nfserr_toosmall, -ETOOSMALL },
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e2c43464f237..18743c4d8bca 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -16,6 +16,7 @@
16#include <linux/lockd/bind.h> 16#include <linux/lockd/bind.h>
17#include <linux/nfsacl.h> 17#include <linux/nfsacl.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <net/net_namespace.h>
19#include "nfsd.h" 20#include "nfsd.h"
20#include "cache.h" 21#include "cache.h"
21#include "vfs.h" 22#include "vfs.h"
@@ -186,12 +187,12 @@ static int nfsd_init_socks(int port)
186 if (!list_empty(&nfsd_serv->sv_permsocks)) 187 if (!list_empty(&nfsd_serv->sv_permsocks))
187 return 0; 188 return 0;
188 189
189 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, 190 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port,
190 SVC_SOCK_DEFAULTS); 191 SVC_SOCK_DEFAULTS);
191 if (error < 0) 192 if (error < 0)
192 return error; 193 return error;
193 194
194 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, 195 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port,
195 SVC_SOCK_DEFAULTS); 196 SVC_SOCK_DEFAULTS);
196 if (error < 0) 197 if (error < 0)
197 return error; 198 return error;
@@ -607,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
607 /* Now call the procedure handler, and encode NFS status. */ 608 /* Now call the procedure handler, and encode NFS status. */
608 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
609 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 610 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
610 if (nfserr == nfserr_dropit) { 611 if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
611 dprintk("nfsd: Dropping request; may be revisited later\n"); 612 dprintk("nfsd: Dropping request; may be revisited later\n");
612 nfsd_cache_update(rqstp, RC_NOCACHE, NULL); 613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
613 return 0; 614 return 0;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 322518c88e4b..3074656ba7bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
35#ifndef _NFSD4_STATE_H 35#ifndef _NFSD4_STATE_H
36#define _NFSD4_STATE_H 36#define _NFSD4_STATE_H
37 37
38#include <linux/sunrpc/svc_xprt.h>
38#include <linux/nfsd/nfsfh.h> 39#include <linux/nfsd/nfsfh.h>
39#include "nfsfh.h" 40#include "nfsfh.h"
40 41
@@ -64,20 +65,15 @@ typedef struct {
64 (s)->si_fileid, \ 65 (s)->si_fileid, \
65 (s)->si_generation 66 (s)->si_generation
66 67
67struct nfsd4_cb_sequence {
68 /* args/res */
69 u32 cbs_minorversion;
70 struct nfs4_client *cbs_clp;
71};
72
73struct nfs4_rpc_args {
74 void *args_op;
75 struct nfsd4_cb_sequence args_seq;
76};
77
78struct nfsd4_callback { 68struct nfsd4_callback {
79 struct nfs4_rpc_args cb_args; 69 void *cb_op;
70 struct nfs4_client *cb_clp;
71 struct list_head cb_per_client;
72 u32 cb_minorversion;
73 struct rpc_message cb_msg;
74 const struct rpc_call_ops *cb_ops;
80 struct work_struct cb_work; 75 struct work_struct cb_work;
76 bool cb_done;
81}; 77};
82 78
83struct nfs4_delegation { 79struct nfs4_delegation {
@@ -87,11 +83,11 @@ struct nfs4_delegation {
87 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
88 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
89 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
90 struct file_lock *dl_flock; 87 struct file_lock *dl_flock;
91 u32 dl_type; 88 u32 dl_type;
92 time_t dl_time; 89 time_t dl_time;
93/* For recall: */ 90/* For recall: */
94 u32 dl_ident;
95 stateid_t dl_stateid; 91 stateid_t dl_stateid;
96 struct knfsd_fh dl_fh; 92 struct knfsd_fh dl_fh;
97 int dl_retries; 93 int dl_retries;
@@ -102,9 +98,10 @@ struct nfs4_delegation {
102struct nfs4_cb_conn { 98struct nfs4_cb_conn {
103 /* SETCLIENTID info */ 99 /* SETCLIENTID info */
104 struct sockaddr_storage cb_addr; 100 struct sockaddr_storage cb_addr;
101 struct sockaddr_storage cb_saddr;
105 size_t cb_addrlen; 102 size_t cb_addrlen;
106 u32 cb_prog; 103 u32 cb_prog; /* used only in 4.0 case;
107 u32 cb_minorversion; 104 per-session otherwise */
108 u32 cb_ident; /* minorversion 0 only */ 105 u32 cb_ident; /* minorversion 0 only */
109 struct svc_xprt *cb_xprt; /* minorversion 1 only */ 106 struct svc_xprt *cb_xprt; /* minorversion 1 only */
110}; 107};
@@ -153,6 +150,11 @@ struct nfsd4_create_session {
153 u32 gid; 150 u32 gid;
154}; 151};
155 152
153struct nfsd4_bind_conn_to_session {
154 struct nfs4_sessionid sessionid;
155 u32 dir;
156};
157
156/* The single slot clientid cache structure */ 158/* The single slot clientid cache structure */
157struct nfsd4_clid_slot { 159struct nfsd4_clid_slot {
158 u32 sl_seqid; 160 u32 sl_seqid;
@@ -160,6 +162,15 @@ struct nfsd4_clid_slot {
160 struct nfsd4_create_session sl_cr_ses; 162 struct nfsd4_create_session sl_cr_ses;
161}; 163};
162 164
165struct nfsd4_conn {
166 struct list_head cn_persession;
167 struct svc_xprt *cn_xprt;
168 struct svc_xpt_user cn_xpt_user;
169 struct nfsd4_session *cn_session;
170/* CDFC4_FORE, CDFC4_BACK: */
171 unsigned char cn_flags;
172};
173
163struct nfsd4_session { 174struct nfsd4_session {
164 struct kref se_ref; 175 struct kref se_ref;
165 struct list_head se_hash; /* hash by sessionid */ 176 struct list_head se_hash; /* hash by sessionid */
@@ -169,6 +180,9 @@ struct nfsd4_session {
169 struct nfs4_sessionid se_sessionid; 180 struct nfs4_sessionid se_sessionid;
170 struct nfsd4_channel_attrs se_fchannel; 181 struct nfsd4_channel_attrs se_fchannel;
171 struct nfsd4_channel_attrs se_bchannel; 182 struct nfsd4_channel_attrs se_bchannel;
183 struct list_head se_conns;
184 u32 se_cb_prog;
185 u32 se_cb_seq_nr;
172 struct nfsd4_slot *se_slots[]; /* forward channel slots */ 186 struct nfsd4_slot *se_slots[]; /* forward channel slots */
173}; 187};
174 188
@@ -221,24 +235,36 @@ struct nfs4_client {
221 clientid_t cl_clientid; /* generated by server */ 235 clientid_t cl_clientid; /* generated by server */
222 nfs4_verifier cl_confirm; /* generated by server */ 236 nfs4_verifier cl_confirm; /* generated by server */
223 u32 cl_firststate; /* recovery dir creation */ 237 u32 cl_firststate; /* recovery dir creation */
238 u32 cl_minorversion;
224 239
225 /* for v4.0 and v4.1 callbacks: */ 240 /* for v4.0 and v4.1 callbacks: */
226 struct nfs4_cb_conn cl_cb_conn; 241 struct nfs4_cb_conn cl_cb_conn;
242#define NFSD4_CLIENT_CB_UPDATE 1
243#define NFSD4_CLIENT_KILL 2
244 unsigned long cl_cb_flags;
227 struct rpc_clnt *cl_cb_client; 245 struct rpc_clnt *cl_cb_client;
228 atomic_t cl_cb_set; 246 u32 cl_cb_ident;
247#define NFSD4_CB_UP 0
248#define NFSD4_CB_UNKNOWN 1
249#define NFSD4_CB_DOWN 2
250 int cl_cb_state;
251 struct nfsd4_callback cl_cb_null;
252 struct nfsd4_session *cl_cb_session;
253 struct list_head cl_callbacks; /* list of in-progress callbacks */
254
255 /* for all client information that callback code might need: */
256 spinlock_t cl_lock;
229 257
230 /* for nfs41 */ 258 /* for nfs41 */
231 struct list_head cl_sessions; 259 struct list_head cl_sessions;
232 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ 260 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
233 u32 cl_exchange_flags; 261 u32 cl_exchange_flags;
234 struct nfs4_sessionid cl_sessionid;
235 /* number of rpc's in progress over an associated session: */ 262 /* number of rpc's in progress over an associated session: */
236 atomic_t cl_refcount; 263 atomic_t cl_refcount;
237 264
238 /* for nfs41 callbacks */ 265 /* for nfs41 callbacks */
239 /* We currently support a single back channel with a single slot */ 266 /* We currently support a single back channel with a single slot */
240 unsigned long cl_cb_slot_busy; 267 unsigned long cl_cb_slot_busy;
241 u32 cl_cb_seq_nr;
242 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 268 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
243 /* wait here for slots */ 269 /* wait here for slots */
244}; 270};
@@ -440,12 +466,14 @@ extern int nfs4_in_grace(void);
440extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 466extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
441extern void nfs4_free_stateowner(struct kref *kref); 467extern void nfs4_free_stateowner(struct kref *kref);
442extern int set_callback_cred(void); 468extern int set_callback_cred(void);
443extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 469extern void nfsd4_probe_callback(struct nfs4_client *clp);
470extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
471extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
444extern void nfsd4_do_callback_rpc(struct work_struct *); 472extern void nfsd4_do_callback_rpc(struct work_struct *);
445extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 473extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
446extern int nfsd4_create_callback_queue(void); 474extern int nfsd4_create_callback_queue(void);
447extern void nfsd4_destroy_callback_queue(void); 475extern void nfsd4_destroy_callback_queue(void);
448extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *); 476extern void nfsd4_shutdown_callback(struct nfs4_client *);
449extern void nfs4_put_delegation(struct nfs4_delegation *dp); 477extern void nfs4_put_delegation(struct nfs4_delegation *dp);
450extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 478extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
451extern void nfsd4_init_recdir(char *recdir_name); 479extern void nfsd4_init_recdir(char *recdir_name);
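
A note on the new callback state: the cl_cb_flags bits and the NFSD4_CB_* tri-state above replace the old atomic cl_cb_set. A minimal hedged sketch of how a caller might combine them; nfsd4_cb_needs_probe() is an invented name, only the field and constant names come from the hunk above:

static bool nfsd4_cb_needs_probe(struct nfs4_client *clp)
{
	/* cl_cb_flags holds bit numbers, so test_bit() applies */
	return clp->cl_cb_state != NFSD4_CB_UP ||
	       test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
}
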
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 661a6cf8e826..641117f2188d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * File operations used by nfsd. Some of these have been ripped from 2 * File operations used by nfsd. Some of these have been ripped from
4 * other parts of the kernel because they weren't exported, others 3 * other parts of the kernel because they weren't exported, others
@@ -35,8 +34,8 @@
35#endif /* CONFIG_NFSD_V3 */ 34#endif /* CONFIG_NFSD_V3 */
36 35
37#ifdef CONFIG_NFSD_V4 36#ifdef CONFIG_NFSD_V4
38#include <linux/nfs4_acl.h> 37#include "acl.h"
39#include <linux/nfsd_idmap.h> 38#include "idmap.h"
40#endif /* CONFIG_NFSD_V4 */ 39#endif /* CONFIG_NFSD_V4 */
41 40
42#include "nfsd.h" 41#include "nfsd.h"
@@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
88 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
89 int err = 0; 88 int err = 0;
90 89
91 while (d_mountpoint(path.dentry) && follow_down(&path)) 90 err = follow_down(&path, false);
92 ; 91 if (err < 0)
92 goto out;
93 93
94 exp2 = rqst_exp_get_by_name(rqstp, &path); 94 exp2 = rqst_exp_get_by_name(rqstp, &path);
95 if (IS_ERR(exp2)) { 95 if (IS_ERR(exp2)) {
@@ -273,6 +273,13 @@ out:
273 return err; 273 return err;
274} 274}
275 275
276static int nfsd_break_lease(struct inode *inode)
277{
278 if (!S_ISREG(inode->i_mode))
279 return 0;
280 return break_lease(inode, O_WRONLY | O_NONBLOCK);
281}
282
276/* 283/*
277 * Commit metadata changes to stable storage. 284 * Commit metadata changes to stable storage.
278 */ 285 */
@@ -281,23 +288,13 @@ commit_metadata(struct svc_fh *fhp)
281{ 288{
282 struct inode *inode = fhp->fh_dentry->d_inode; 289 struct inode *inode = fhp->fh_dentry->d_inode;
283 const struct export_operations *export_ops = inode->i_sb->s_export_op; 290 const struct export_operations *export_ops = inode->i_sb->s_export_op;
284 int error = 0;
285 291
286 if (!EX_ISSYNC(fhp->fh_export)) 292 if (!EX_ISSYNC(fhp->fh_export))
287 return 0; 293 return 0;
288 294
289 if (export_ops->commit_metadata) { 295 if (export_ops->commit_metadata)
290 error = export_ops->commit_metadata(inode); 296 return export_ops->commit_metadata(inode);
291 } else { 297 return sync_inode_metadata(inode, 1);
292 struct writeback_control wbc = {
293 .sync_mode = WB_SYNC_ALL,
294 .nr_to_write = 0, /* metadata only */
295 };
296
297 error = sync_inode(inode, &wbc);
298 }
299
300 return error;
301} 298}
302 299
303/* 300/*
@@ -385,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
385 goto out; 382 goto out;
386 } 383 }
387 384
388 /*
389 * If we are changing the size of the file, then
390 * we need to break all leases.
391 */
392 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
393 if (host_err == -EWOULDBLOCK)
394 host_err = -ETIMEDOUT;
395 if (host_err) /* ENOMEM or EWOULDBLOCK */
396 goto out_nfserr;
397
398 host_err = get_write_access(inode); 385 host_err = get_write_access(inode);
399 if (host_err) 386 if (host_err)
400 goto out_nfserr; 387 goto out_nfserr;
@@ -435,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
435 422
436 err = nfserr_notsync; 423 err = nfserr_notsync;
437 if (!check_guard || guardtime == inode->i_ctime.tv_sec) { 424 if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
425 host_err = nfsd_break_lease(inode);
426 if (host_err)
427 goto out_nfserr;
438 fh_lock(fhp); 428 fh_lock(fhp);
429
439 host_err = notify_change(dentry, iap); 430 host_err = notify_change(dentry, iap);
440 err = nfserrno(host_err); 431 err = nfserrno(host_err);
441 fh_unlock(fhp); 432 fh_unlock(fhp);
@@ -762,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
762 */ 753 */
763 if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) 754 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
764 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 755 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
765 if (host_err == -EWOULDBLOCK)
766 host_err = -ETIMEDOUT;
767 if (host_err) /* NOMEM or WOULDBLOCK */ 756 if (host_err) /* NOMEM or WOULDBLOCK */
768 goto out_nfserr; 757 goto out_nfserr;
769 758
@@ -855,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
855 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 844 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
856 struct page *page = buf->page; 845 struct page *page = buf->page;
857 size_t size; 846 size_t size;
858 int ret;
859
860 ret = buf->ops->confirm(pipe, buf);
861 if (unlikely(ret))
862 return ret;
863 847
864 size = sd->len; 848 size = sd->len;
865 849
@@ -889,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
889 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 873 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
890} 874}
891 875
892static inline int svc_msnfs(struct svc_fh *ffhp)
893{
894#ifdef MSNFS
895 return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
896#else
897 return 0;
898#endif
899}
900
901static __be32 876static __be32
902nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 877nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
903 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 878 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -910,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
910 err = nfserr_perm; 885 err = nfserr_perm;
911 inode = file->f_path.dentry->d_inode; 886 inode = file->f_path.dentry->d_inode;
912 887
913 if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
914 goto out;
915
916 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 888 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
917 struct splice_desc sd = { 889 struct splice_desc sd = {
918 .len = 0, 890 .len = 0,
@@ -937,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
937 fsnotify_access(file); 909 fsnotify_access(file);
938 } else 910 } else
939 err = nfserrno(host_err); 911 err = nfserrno(host_err);
940out:
941 return err; 912 return err;
942} 913}
943 914
@@ -1002,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1002 int stable = *stablep; 973 int stable = *stablep;
1003 int use_wgather; 974 int use_wgather;
1004 975
1005#ifdef MSNFS
1006 err = nfserr_perm;
1007
1008 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1009 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
1010 goto out;
1011#endif
1012
1013 dentry = file->f_path.dentry; 976 dentry = file->f_path.dentry;
1014 inode = dentry->d_inode; 977 inode = dentry->d_inode;
1015 exp = fhp->fh_export; 978 exp = fhp->fh_export;
@@ -1060,7 +1023,6 @@ out_nfserr:
1060 err = 0; 1023 err = 0;
1061 else 1024 else
1062 err = nfserrno(host_err); 1025 err = nfserrno(host_err);
1063out:
1064 return err; 1026 return err;
1065} 1027}
1066 1028
@@ -1680,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1680 err = nfserrno(host_err); 1642 err = nfserrno(host_err);
1681 goto out_dput; 1643 goto out_dput;
1682 } 1644 }
1645 err = nfserr_noent;
1646 if (!dold->d_inode)
1647 goto out_drop_write;
1648 host_err = nfsd_break_lease(dold->d_inode);
1649 if (host_err)
1650 goto out_drop_write;
1683 host_err = vfs_link(dold, dirp, dnew); 1651 host_err = vfs_link(dold, dirp, dnew);
1684 if (!host_err) { 1652 if (!host_err) {
1685 err = nfserrno(commit_metadata(ffhp)); 1653 err = nfserrno(commit_metadata(ffhp));
@@ -1691,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1691 else 1659 else
1692 err = nfserrno(host_err); 1660 err = nfserrno(host_err);
1693 } 1661 }
1662out_drop_write:
1694 mnt_drop_write(tfhp->fh_export->ex_path.mnt); 1663 mnt_drop_write(tfhp->fh_export->ex_path.mnt);
1695out_dput: 1664out_dput:
1696 dput(dnew); 1665 dput(dnew);
@@ -1765,13 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1765 if (ndentry == trap) 1734 if (ndentry == trap)
1766 goto out_dput_new; 1735 goto out_dput_new;
1767 1736
1768 if (svc_msnfs(ffhp) &&
1769 ((atomic_read(&odentry->d_count) > 1)
1770 || (atomic_read(&ndentry->d_count) > 1))) {
1771 host_err = -EPERM;
1772 goto out_dput_new;
1773 }
1774
1775 host_err = -EXDEV; 1737 host_err = -EXDEV;
1776 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1738 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
1777 goto out_dput_new; 1739 goto out_dput_new;
@@ -1779,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1779 if (host_err) 1741 if (host_err)
1780 goto out_dput_new; 1742 goto out_dput_new;
1781 1743
1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err)
1746 goto out_drop_write;
1782 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1783 if (!host_err) { 1748 if (!host_err) {
1784 host_err = commit_metadata(tfhp); 1749 host_err = commit_metadata(tfhp);
1785 if (!host_err) 1750 if (!host_err)
1786 host_err = commit_metadata(ffhp); 1751 host_err = commit_metadata(ffhp);
1787 } 1752 }
1788 1753out_drop_write:
1789 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1754 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
1790
1791 out_dput_new: 1755 out_dput_new:
1792 dput(ndentry); 1756 dput(ndentry);
1793 out_dput_old: 1757 out_dput_old:
@@ -1850,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1850 if (host_err) 1814 if (host_err)
1851 goto out_nfserr; 1815 goto out_nfserr;
1852 1816
1853 if (type != S_IFDIR) { /* It's UNLINK */ 1817 host_err = nfsd_break_lease(rdentry->d_inode);
1854#ifdef MSNFS 1818 if (host_err)
1855 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1819 goto out_put;
1856 (atomic_read(&rdentry->d_count) > 1)) { 1820 if (type != S_IFDIR)
1857 host_err = -EPERM;
1858 } else
1859#endif
1860 host_err = vfs_unlink(dirp, rdentry); 1821 host_err = vfs_unlink(dirp, rdentry);
1861 } else { /* It's RMDIR */ 1822 else
1862 host_err = vfs_rmdir(dirp, rdentry); 1823 host_err = vfs_rmdir(dirp, rdentry);
1863 } 1824out_put:
1864
1865 dput(rdentry); 1825 dput(rdentry);
1866 1826
1867 if (!host_err) 1827 if (!host_err)
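
The recurring shape in this file's changes: break any lease on the victim inode via the new nfsd_break_lease() helper, then perform the vfs_* operation, letting -EWOULDBLOCK propagate instead of being rewritten to -ETIMEDOUT. A hedged sketch of that pattern; nfsd_unlink_file() is an invented wrapper for illustration:

static int nfsd_unlink_file(struct inode *dir, struct dentry *rdentry)
{
	int host_err;

	host_err = nfsd_break_lease(rdentry->d_inode);
	if (host_err)
		return host_err;	/* may be -EWOULDBLOCK; no -ETIMEDOUT rewrite */
	return vfs_unlink(dir, rdentry);
}
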
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4d476ff08ae6..366401e1a536 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -311,6 +311,11 @@ struct nfsd4_secinfo {
311 struct svc_export *si_exp; /* response */ 311 struct svc_export *si_exp; /* response */
312}; 312};
313 313
314struct nfsd4_secinfo_no_name {
315 u32 sin_style; /* request */
316 struct svc_export *sin_exp; /* response */
317};
318
314struct nfsd4_setattr { 319struct nfsd4_setattr {
315 stateid_t sa_stateid; /* request */ 320 stateid_t sa_stateid; /* request */
316 u32 sa_bmval[3]; /* request */ 321 u32 sa_bmval[3]; /* request */
@@ -373,8 +378,8 @@ struct nfsd4_sequence {
373 u32 cachethis; /* request */ 378 u32 cachethis; /* request */
374#if 0 379#if 0
375 u32 target_maxslots; /* response */ 380 u32 target_maxslots; /* response */
376 u32 status_flags; /* response */
377#endif /* not yet */ 381#endif /* not yet */
382 u32 status_flags; /* response */
378}; 383};
379 384
380struct nfsd4_destroy_session { 385struct nfsd4_destroy_session {
@@ -422,6 +427,7 @@ struct nfsd4_op {
422 427
423 /* NFSv4.1 */ 428 /* NFSv4.1 */
424 struct nfsd4_exchange_id exchange_id; 429 struct nfsd4_exchange_id exchange_id;
430 struct nfsd4_bind_conn_to_session bind_conn_to_session;
425 struct nfsd4_create_session create_session; 431 struct nfsd4_create_session create_session;
426 struct nfsd4_destroy_session destroy_session; 432 struct nfsd4_destroy_session destroy_session;
427 struct nfsd4_sequence sequence; 433 struct nfsd4_sequence sequence;
@@ -484,18 +490,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
484static inline void 490static inline void
485set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) 491set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
486{ 492{
487 BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); 493 BUG_ON(!fhp->fh_pre_saved);
488 cinfo->atomic = 1; 494 cinfo->atomic = fhp->fh_post_saved;
489 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); 495 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
490 if (cinfo->change_supported) { 496
491 cinfo->before_change = fhp->fh_pre_change; 497 cinfo->before_change = fhp->fh_pre_change;
492 cinfo->after_change = fhp->fh_post_change; 498 cinfo->after_change = fhp->fh_post_change;
493 } else { 499 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
494 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; 500 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
495 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; 501 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
496 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; 502 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
497 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; 503
498 }
499} 504}
500 505
501int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); 506int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
@@ -519,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
519 struct nfsd4_sequence *seq); 524 struct nfsd4_sequence *seq);
520extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 525extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
521 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 526 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
527extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
522extern __be32 nfsd4_create_session(struct svc_rqst *, 528extern __be32 nfsd4_create_session(struct svc_rqst *,
523 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
524 struct nfsd4_create_session *); 530 struct nfsd4_create_session *);
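
The set_change_info() rewrite earlier in this file relaxes the old BUG_ON(): missing post-op attributes now simply make the change info non-atomic. A hedged illustration of the new semantics (the assignments are contrived for the example):

static void example_change_info(struct nfsd4_change_info *cinfo,
				struct svc_fh *fhp)
{
	fhp->fh_pre_saved = 1;		/* pre-op attrs were captured */
	fhp->fh_post_saved = 0;		/* post-op attrs were not */
	set_change_info(cinfo, fhp);	/* old code: BUG_ON; now: cinfo->atomic == 0 */
}
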
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index df3e62c1ddc5..85c98737a146 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ 2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \ 3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ 4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o 5 ifile.o alloc.o gcinode.o ioctl.o
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3dbdc1d356bf..3ee67c67cc52 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -35,7 +35,20 @@
35 35
36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) 36struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
37{ 37{
38 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); 38 return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
39}
40
41static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
42 const char *fname, int err)
43{
44 struct inode *inode = bmap->b_inode;
45
46 if (err == -EINVAL) {
47 nilfs_error(inode->i_sb, fname,
48 "broken bmap (inode number=%lu)\n", inode->i_ino);
49 err = -EIO;
50 }
51 return err;
39} 52}
40 53
41/** 54/**
@@ -66,8 +79,10 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
66 79
67 down_read(&bmap->b_sem); 80 down_read(&bmap->b_sem);
68 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); 81 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
69 if (ret < 0) 82 if (ret < 0) {
83 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
70 goto out; 84 goto out;
85 }
71 if (NILFS_BMAP_USE_VBN(bmap)) { 86 if (NILFS_BMAP_USE_VBN(bmap)) {
72 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, 87 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
73 &blocknr); 88 &blocknr);
@@ -88,7 +103,8 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
88 down_read(&bmap->b_sem); 103 down_read(&bmap->b_sem);
89 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); 104 ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
90 up_read(&bmap->b_sem); 105 up_read(&bmap->b_sem);
91 return ret; 106
107 return nilfs_bmap_convert_error(bmap, __func__, ret);
92} 108}
93 109
94static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 110static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -144,7 +160,8 @@ int nilfs_bmap_insert(struct nilfs_bmap *bmap,
144 down_write(&bmap->b_sem); 160 down_write(&bmap->b_sem);
145 ret = nilfs_bmap_do_insert(bmap, key, rec); 161 ret = nilfs_bmap_do_insert(bmap, key, rec);
146 up_write(&bmap->b_sem); 162 up_write(&bmap->b_sem);
147 return ret; 163
164 return nilfs_bmap_convert_error(bmap, __func__, ret);
148} 165}
149 166
150static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) 167static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
@@ -180,9 +197,12 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
180 197
181 down_read(&bmap->b_sem); 198 down_read(&bmap->b_sem);
182 ret = bmap->b_ops->bop_last_key(bmap, &lastkey); 199 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
183 if (!ret)
184 *key = lastkey;
185 up_read(&bmap->b_sem); 200 up_read(&bmap->b_sem);
201
202 if (ret < 0)
203 ret = nilfs_bmap_convert_error(bmap, __func__, ret);
204 else
205 *key = lastkey;
186 return ret; 206 return ret;
187} 207}
188 208
@@ -210,7 +230,8 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
210 down_write(&bmap->b_sem); 230 down_write(&bmap->b_sem);
211 ret = nilfs_bmap_do_delete(bmap, key); 231 ret = nilfs_bmap_do_delete(bmap, key);
212 up_write(&bmap->b_sem); 232 up_write(&bmap->b_sem);
213 return ret; 233
234 return nilfs_bmap_convert_error(bmap, __func__, ret);
214} 235}
215 236
216static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) 237static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
@@ -261,7 +282,8 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
261 down_write(&bmap->b_sem); 282 down_write(&bmap->b_sem);
262 ret = nilfs_bmap_do_truncate(bmap, key); 283 ret = nilfs_bmap_do_truncate(bmap, key);
263 up_write(&bmap->b_sem); 284 up_write(&bmap->b_sem);
264 return ret; 285
286 return nilfs_bmap_convert_error(bmap, __func__, ret);
265} 287}
266 288
267/** 289/**
@@ -300,7 +322,8 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
300 down_write(&bmap->b_sem); 322 down_write(&bmap->b_sem);
301 ret = bmap->b_ops->bop_propagate(bmap, bh); 323 ret = bmap->b_ops->bop_propagate(bmap, bh);
302 up_write(&bmap->b_sem); 324 up_write(&bmap->b_sem);
303 return ret; 325
326 return nilfs_bmap_convert_error(bmap, __func__, ret);
304} 327}
305 328
306/** 329/**
@@ -344,7 +367,8 @@ int nilfs_bmap_assign(struct nilfs_bmap *bmap,
344 down_write(&bmap->b_sem); 367 down_write(&bmap->b_sem);
345 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); 368 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
346 up_write(&bmap->b_sem); 369 up_write(&bmap->b_sem);
347 return ret; 370
371 return nilfs_bmap_convert_error(bmap, __func__, ret);
348} 372}
349 373
350/** 374/**
@@ -373,7 +397,8 @@ int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
373 down_write(&bmap->b_sem); 397 down_write(&bmap->b_sem);
374 ret = bmap->b_ops->bop_mark(bmap, key, level); 398 ret = bmap->b_ops->bop_mark(bmap, key, level);
375 up_write(&bmap->b_sem); 399 up_write(&bmap->b_sem);
376 return ret; 400
401 return nilfs_bmap_convert_error(bmap, __func__, ret);
377} 402}
378 403
379/** 404/**
@@ -533,18 +558,20 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
533 nilfs_btree_init_gc(bmap); 558 nilfs_btree_init_gc(bmap);
534} 559}
535 560
536void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 561void nilfs_bmap_save(const struct nilfs_bmap *bmap,
562 struct nilfs_bmap_store *store)
537{ 563{
538 memcpy(gcbmap, bmap, sizeof(*bmap)); 564 memcpy(store->data, bmap->b_u.u_data, sizeof(store->data));
539 init_rwsem(&gcbmap->b_sem); 565 store->last_allocated_key = bmap->b_last_allocated_key;
540 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 566 store->last_allocated_ptr = bmap->b_last_allocated_ptr;
541 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; 567 store->state = bmap->b_state;
542} 568}
543 569
544void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 570void nilfs_bmap_restore(struct nilfs_bmap *bmap,
571 const struct nilfs_bmap_store *store)
545{ 572{
546 memcpy(bmap, gcbmap, sizeof(*bmap)); 573 memcpy(bmap->b_u.u_data, store->data, sizeof(store->data));
547 init_rwsem(&bmap->b_sem); 574 bmap->b_last_allocated_key = store->last_allocated_key;
548 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 575 bmap->b_last_allocated_ptr = store->last_allocated_ptr;
549 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; 576 bmap->b_state = store->state;
550} 577}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index a20569b19929..bde1c0aa2e15 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -135,6 +135,12 @@ struct nilfs_bmap {
135/* state */ 135/* state */
136#define NILFS_BMAP_DIRTY 0x00000001 136#define NILFS_BMAP_DIRTY 0x00000001
137 137
138struct nilfs_bmap_store {
139 __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)];
140 __u64 last_allocated_key;
141 __u64 last_allocated_ptr;
142 int state;
143};
138 144
139int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); 145int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
140int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); 146int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
@@ -153,9 +159,9 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
153int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); 159int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
154 160
155void nilfs_bmap_init_gc(struct nilfs_bmap *); 161void nilfs_bmap_init_gc(struct nilfs_bmap *);
156void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
157void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
158 162
163void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
164void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
159 165
160static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, 166static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
161 __u64 *ptr) 167 __u64 *ptr)
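
nilfs_bmap_save()/nilfs_bmap_restore() replace the old gcdat copy helpers with an explicit snapshot of the raw bmap data and allocation cursors. A minimal hedged sketch of the save/rollback pairing they enable; nilfs_bmap_try_op() is an invented name, roughly mirroring what the shadow-map code does around GC:

static int nilfs_bmap_try_op(struct nilfs_bmap *bmap,
			     int (*op)(struct nilfs_bmap *))
{
	struct nilfs_bmap_store store;
	int err;

	nilfs_bmap_save(bmap, &store);	/* snapshot b_u.u_data, cursors, state */
	err = op(bmap);
	if (err)
		nilfs_bmap_restore(bmap, &store);	/* roll back on failure */
	return err;
}
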
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f78ab1044d1d..388e9e8f5286 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -37,15 +37,7 @@
37 37
38void nilfs_btnode_cache_init_once(struct address_space *btnc) 38void nilfs_btnode_cache_init_once(struct address_space *btnc)
39{ 39{
40 memset(btnc, 0, sizeof(*btnc)); 40 nilfs_mapping_init_once(btnc);
41 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
42 spin_lock_init(&btnc->tree_lock);
43 INIT_LIST_HEAD(&btnc->private_list);
44 spin_lock_init(&btnc->private_lock);
45
46 spin_lock_init(&btnc->i_mmap_lock);
47 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
48 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
49} 41}
50 42
51static const struct address_space_operations def_btnode_aops = { 43static const struct address_space_operations def_btnode_aops = {
@@ -55,12 +47,7 @@ static const struct address_space_operations def_btnode_aops = {
55void nilfs_btnode_cache_init(struct address_space *btnc, 47void nilfs_btnode_cache_init(struct address_space *btnc,
56 struct backing_dev_info *bdi) 48 struct backing_dev_info *bdi)
57{ 49{
58 btnc->host = NULL; /* can safely set to host inode ? */ 50 nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
59 btnc->flags = 0;
60 mapping_set_gfp_mask(btnc, GFP_NOFS);
61 btnc->assoc_mapping = NULL;
62 btnc->backing_dev_info = bdi;
63 btnc->a_ops = &def_btnode_aops;
64} 51}
65 52
66void nilfs_btnode_cache_clear(struct address_space *btnc) 53void nilfs_btnode_cache_clear(struct address_space *btnc)
@@ -117,8 +104,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
117 if (pblocknr == 0) { 104 if (pblocknr == 0) {
118 pblocknr = blocknr; 105 pblocknr = blocknr;
119 if (inode->i_ino != NILFS_DAT_INO) { 106 if (inode->i_ino != NILFS_DAT_INO) {
120 struct inode *dat = 107 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
121 nilfs_dat_inode(NILFS_I_NILFS(inode));
122 108
123 /* blocknr is a virtual block number */ 109 /* blocknr is a virtual block number */
124 err = nilfs_dat_translate(dat, blocknr, &pblocknr); 110 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 18737818db63..5ff15a8a1024 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -863,26 +863,19 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
863 */ 863 */
864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) 864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
865{ 865{
866 struct the_nilfs *nilfs;
867 int ret; 866 int ret;
868 867
869 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
870
871 switch (mode) { 868 switch (mode) {
872 case NILFS_CHECKPOINT: 869 case NILFS_CHECKPOINT:
873 /* 870 if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno))
874 * Check for protecting existing snapshot mounts: 871 /*
875 * ns_mount_mutex is used to make this operation atomic and 872 * Current implementation does not have to protect
876 * exclusive with a new mount job. Though it doesn't cover 873 * plain read-only mounts since they are exclusive
877 * umount, it's enough for the purpose. 874 * with a read/write mount and are protected from the
878 */ 875 * cleaner.
879 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { 876 */
880 /* Current implementation does not have to protect
881 plain read-only mounts since they are exclusive
882 with a read/write mount and are protected from the
883 cleaner. */
884 ret = -EBUSY; 877 ret = -EBUSY;
885 } else 878 else
886 ret = nilfs_cpfile_clear_snapshot(cpfile, cno); 879 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
887 return ret; 880 return ret;
888 case NILFS_SNAPSHOT: 881 case NILFS_SNAPSHOT:
@@ -933,27 +926,40 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
933} 926}
934 927
935/** 928/**
936 * nilfs_cpfile_read - read cpfile inode 929 * nilfs_cpfile_read - read or get cpfile inode
937 * @cpfile: cpfile inode 930 * @sb: super block instance
938 * @raw_inode: on-disk cpfile inode
939 */
940int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
941{
942 return nilfs_read_inode_common(cpfile, raw_inode);
943}
944
945/**
946 * nilfs_cpfile_new - create cpfile
947 * @nilfs: nilfs object
948 * @cpsize: size of a checkpoint entry 931 * @cpsize: size of a checkpoint entry
932 * @raw_inode: on-disk cpfile inode
933 * @inodep: buffer to store the inode
949 */ 934 */
950struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize) 935int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
936 struct nilfs_inode *raw_inode, struct inode **inodep)
951{ 937{
952 struct inode *cpfile; 938 struct inode *cpfile;
939 int err;
940
941 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
942 if (unlikely(!cpfile))
943 return -ENOMEM;
944 if (!(cpfile->i_state & I_NEW))
945 goto out;
946
947 err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
948 if (err)
949 goto failed;
953 950
954 cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0); 951 nilfs_mdt_set_entry_size(cpfile, cpsize,
955 if (cpfile) 952 sizeof(struct nilfs_cpfile_header));
956 nilfs_mdt_set_entry_size(cpfile, cpsize, 953
957 sizeof(struct nilfs_cpfile_header)); 954 err = nilfs_read_inode_common(cpfile, raw_inode);
958 return cpfile; 955 if (err)
956 goto failed;
957
958 unlock_new_inode(cpfile);
959 out:
960 *inodep = cpfile;
961 return 0;
962 failed:
963 iget_failed(cpfile);
964 return err;
959} 965}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index bc0809e0ab43..a242b9a314f9 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,7 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, 40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
41 size_t); 41 size_t);
42 42
43int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode); 43int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
44struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize); 44 struct nilfs_inode *raw_inode, struct inode **inodep);
45 45
46#endif /* _NILFS_CPFILE_H */ 46#endif /* _NILFS_CPFILE_H */
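
nilfs_cpfile_read() is the first of three constructors (cpfile, dat, ifile) converted from "new" to the same "read or get" shape: look the inode up in the icache, initialize it only if it is I_NEW, publish it with unlock_new_inode(), and tear it down with iget_failed() on error. A hedged distillation of that shared pattern; nilfs_mdt_read_or_get() and its setup callback are invented for illustration:

static int nilfs_mdt_read_or_get(struct super_block *sb, ino_t ino,
				 struct nilfs_inode *raw_inode,
				 int (*setup)(struct inode *),
				 struct inode **inodep)
{
	struct inode *inode;
	int err;

	inode = nilfs_iget_locked(sb, NULL, ino);	/* ifile passes a root instead */
	if (unlikely(!inode))
		return -ENOMEM;
	if (!(inode->i_state & I_NEW))
		goto out;			/* cached and already initialized */

	err = setup(inode);			/* per-file nilfs_mdt_init() etc. */
	if (!err)
		err = nilfs_read_inode_common(inode, raw_inode);
	if (err)
		goto failed;
	unlock_new_inode(inode);
 out:
	*inodep = inode;
	return 0;
 failed:
	iget_failed(inode);			/* unhashes and wakes waiters */
	return err;
}
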
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 013146755683..59e5fe742f7b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -36,6 +36,7 @@
36struct nilfs_dat_info { 36struct nilfs_dat_info {
37 struct nilfs_mdt_info mi; 37 struct nilfs_mdt_info mi;
38 struct nilfs_palloc_cache palloc_cache; 38 struct nilfs_palloc_cache palloc_cache;
39 struct nilfs_shadow_map shadow;
39}; 40};
40 41
41static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) 42static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
@@ -102,7 +103,8 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
102 nilfs_palloc_abort_alloc_entry(dat, req); 103 nilfs_palloc_abort_alloc_entry(dat, req);
103} 104}
104 105
105void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) 106static void nilfs_dat_commit_free(struct inode *dat,
107 struct nilfs_palloc_req *req)
106{ 108{
107 struct nilfs_dat_entry *entry; 109 struct nilfs_dat_entry *entry;
108 void *kaddr; 110 void *kaddr;
@@ -327,6 +329,23 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
327 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); 329 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
328 if (ret < 0) 330 if (ret < 0)
329 return ret; 331 return ret;
332
333 /*
334 * The given disk block number (blocknr) is not yet written to
335 * the device at this point.
336 *
337 * To prevent nilfs_dat_translate() from returning the
338 * uncommitted block number, this makes a copy of the entry
339 * buffer and redirects nilfs_dat_translate() to the copy.
340 */
341 if (!buffer_nilfs_redirected(entry_bh)) {
342 ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
343 if (ret) {
344 brelse(entry_bh);
345 return ret;
346 }
347 }
348
330 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 349 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
331 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 350 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
332 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { 351 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
@@ -371,7 +390,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
371 */ 390 */
372int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) 391int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
373{ 392{
374 struct buffer_head *entry_bh; 393 struct buffer_head *entry_bh, *bh;
375 struct nilfs_dat_entry *entry; 394 struct nilfs_dat_entry *entry;
376 sector_t blocknr; 395 sector_t blocknr;
377 void *kaddr; 396 void *kaddr;
@@ -381,6 +400,15 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
381 if (ret < 0) 400 if (ret < 0)
382 return ret; 401 return ret;
383 402
403 if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
404 bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
405 if (bh) {
406 WARN_ON(!buffer_uptodate(bh));
407 brelse(entry_bh);
408 entry_bh = bh;
409 }
410 }
411
384 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 412 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
385 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 413 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
386 blocknr = le64_to_cpu(entry->de_blocknr); 414 blocknr = le64_to_cpu(entry->de_blocknr);
@@ -436,38 +464,48 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
436} 464}
437 465
438/** 466/**
439 * nilfs_dat_read - read dat inode 467 * nilfs_dat_read - read or get dat inode
440 * @dat: dat inode 468 * @sb: super block instance
441 * @raw_inode: on-disk dat inode
442 */
443int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
444{
445 return nilfs_read_inode_common(dat, raw_inode);
446}
447
448/**
449 * nilfs_dat_new - create dat file
450 * @nilfs: nilfs object
451 * @entry_size: size of a dat entry 469 * @entry_size: size of a dat entry
470 * @raw_inode: on-disk dat inode
471 * @inodep: buffer to store the inode
452 */ 472 */
453struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size) 473int nilfs_dat_read(struct super_block *sb, size_t entry_size,
474 struct nilfs_inode *raw_inode, struct inode **inodep)
454{ 475{
455 static struct lock_class_key dat_lock_key; 476 static struct lock_class_key dat_lock_key;
456 struct inode *dat; 477 struct inode *dat;
457 struct nilfs_dat_info *di; 478 struct nilfs_dat_info *di;
458 int err; 479 int err;
459 480
460 dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di)); 481 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
461 if (dat) { 482 if (unlikely(!dat))
462 err = nilfs_palloc_init_blockgroup(dat, entry_size); 483 return -ENOMEM;
463 if (unlikely(err)) { 484 if (!(dat->i_state & I_NEW))
464 nilfs_mdt_destroy(dat); 485 goto out;
465 return NULL;
466 }
467 486
468 di = NILFS_DAT_I(dat); 487 err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
469 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); 488 if (err)
470 nilfs_palloc_setup_cache(dat, &di->palloc_cache); 489 goto failed;
471 } 490
472 return dat; 491 err = nilfs_palloc_init_blockgroup(dat, entry_size);
492 if (err)
493 goto failed;
494
495 di = NILFS_DAT_I(dat);
496 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
497 nilfs_palloc_setup_cache(dat, &di->palloc_cache);
498 nilfs_mdt_setup_shadow_map(dat, &di->shadow);
499
500 err = nilfs_read_inode_common(dat, raw_inode);
501 if (err)
502 goto failed;
503
504 unlock_new_inode(dat);
505 out:
506 *inodep = dat;
507 return 0;
508 failed:
509 iget_failed(dat);
510 return err;
473} 511}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d31c3aab0efe..cbd8e9732503 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,7 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
53int nilfs_dat_move(struct inode *, __u64, sector_t); 53int nilfs_dat_move(struct inode *, __u64, sector_t);
54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); 54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
55 55
56int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode); 56int nilfs_dat_read(struct super_block *sb, size_t entry_size,
57struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size); 57 struct nilfs_inode *raw_inode, struct inode **inodep);
58 58
59#endif /* _NILFS_DAT_H */ 59#endif /* _NILFS_DAT_H */
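
The frozen-buffer logic added to nilfs_dat_move()/nilfs_dat_translate() exists so that non-GC readers keep resolving a virtual block to its committed disk address even after GC has rewritten the DAT entry in memory. A hedged sketch of the ordering that buys (illustrative only):

static void dat_move_visibility_example(struct inode *dat, __u64 vblocknr,
					sector_t new_blocknr)
{
	sector_t blocknr;

	if (nilfs_dat_move(dat, vblocknr, new_blocknr))	/* freezes + redirects */
		return;
	if (!nilfs_dat_translate(dat, vblocknr, &blocknr))
		WARN_ON(blocknr == new_blocknr);	/* outside GC: still the
							   old, committed number */
}
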
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index cb003c8ee1f6..9d45773b79e6 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -91,7 +91,6 @@ static void nilfs_commit_chunk(struct page *page,
91 unsigned from, unsigned to) 91 unsigned from, unsigned to)
92{ 92{
93 struct inode *dir = mapping->host; 93 struct inode *dir = mapping->host;
94 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
95 loff_t pos = page_offset(page) + from; 94 loff_t pos = page_offset(page) + from;
96 unsigned len = to - from; 95 unsigned len = to - from;
97 unsigned nr_dirty, copied; 96 unsigned nr_dirty, copied;
@@ -103,7 +102,7 @@ static void nilfs_commit_chunk(struct page *page,
103 i_size_write(dir, pos + copied); 102 i_size_write(dir, pos + copied);
104 if (IS_DIRSYNC(dir)) 103 if (IS_DIRSYNC(dir))
105 nilfs_set_transaction_flag(NILFS_TI_SYNC); 104 nilfs_set_transaction_flag(NILFS_TI_SYNC);
106 err = nilfs_set_file_dirty(sbi, dir, nr_dirty); 105 err = nilfs_set_file_dirty(dir, nr_dirty);
107 WARN_ON(err); /* do not happen */ 106 WARN_ON(err); /* do not happen */
108 unlock_page(page); 107 unlock_page(page);
109} 108}
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
new file mode 100644
index 000000000000..a71cc412b651
--- /dev/null
+++ b/fs/nilfs2/export.h
@@ -0,0 +1,17 @@
1#ifndef NILFS_EXPORT_H
2#define NILFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations nilfs_export_ops;
7
8struct nilfs_fid {
9 u64 cno;
10 u64 ino;
11 u32 gen;
12
13 u32 parent_gen;
14 u64 parent_ino;
15} __attribute__ ((packed));
16
17#endif
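
struct nilfs_fid carries the checkpoint number alongside the usual inode number and generation, so snapshot inodes stay resolvable through NFS file handles. A hedged sketch of filling one; nilfs_fid_fill() is invented, and exportfs file-handle lengths are counted in 32-bit words:

static int nilfs_fid_fill(struct nilfs_fid *fid, const struct inode *inode,
			  __u64 cno)
{
	fid->cno = cno;			/* checkpoint the dentry belongs to */
	fid->ino = inode->i_ino;
	fid->gen = inode->i_generation;
	/* length of the non-parent part, in 32-bit words */
	return offsetof(struct nilfs_fid, parent_gen) / 4;
}
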
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c9a30d7ff6fc..2f560c9fb808 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -155,6 +155,7 @@ const struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate, 155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr, 156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission, 157 .permission = nilfs_permission,
158 .fiemap = nilfs_fiemap,
158}; 159};
159 160
160/* end of file */ 161/* end of file */
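
With ->fiemap wired up, nilfs2 regular files now answer the standard FS_IOC_FIEMAP ioctl. A hedged userspace sketch (generic kernel ABI, not part of this patch; passing fm_extent_count == 0 asks only for the extent count):

#include <linux/fiemap.h>
#include <linux/fs.h>
#include <string.h>
#include <sys/ioctl.h>

static int count_extents(int fd)
{
	struct fiemap fm;

	memset(&fm, 0, sizeof(fm));
	fm.fm_length = ~0ULL;		/* map the whole file */
	fm.fm_extent_count = 0;		/* count extents, return none */

	if (ioctl(fd, FS_IOC_FIEMAP, &fm) < 0)
		return -1;
	return fm.fm_mapped_extents;
}
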
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
deleted file mode 100644
index 84a45d1d5464..000000000000
--- a/fs/nilfs2/gcdat.c
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_palloc_clear_cache(dat);
65 nilfs_palloc_clear_cache(gcdat);
66 nilfs_clear_dirty_pages(mapping);
67 nilfs_copy_back_pages(mapping, gmapping);
68 /* note: mdt dirty flags should be cleared by segctor. */
69
70 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
71 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
72
73 up_write(&NILFS_MDT(dat)->mi_sem);
74}
75
76void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
77{
78 struct inode *gcdat = nilfs->ns_gc_dat;
79 struct nilfs_inode_info *gii = NILFS_I(gcdat);
80
81 gcdat->i_state = I_FREEING | I_CLEAR;
82 gii->i_flags = 0;
83
84 nilfs_palloc_clear_cache(gcdat);
85 truncate_inode_pages(gcdat->i_mapping, 0);
86 truncate_inode_pages(&gii->i_btnode_cache, 0);
87}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index bed3a783129b..caf9a6a3fb54 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,13 +28,6 @@
28 * gcinodes), and this file provides lookup function of the dummy 28 * gcinodes), and this file provides lookup function of the dummy
29 * inodes and their buffer read function. 29 * inodes and their buffer read function.
30 * 30 *
31 * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
32 * has to treat blocks that belong to a same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to be specified with a
36 * checkpoint number argument as well as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes will be released each 31 * Buffers and pages held by the dummy inodes will be released each
39 * time after they are copied to a new log. Dirty blocks made on the 32 * time after they are copied to a new log. Dirty blocks made on the
40 * current generation and the blocks to be moved by GC never overlap 33 * current generation and the blocks to be moved by GC never overlap
@@ -175,125 +168,37 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
175 } 168 }
176 nilfs_btnode_mark_dirty(bh); 169 nilfs_btnode_mark_dirty(bh);
177 } else { 170 } else {
178 nilfs_mdt_mark_buffer_dirty(bh); 171 nilfs_mark_buffer_dirty(bh);
179 } 172 }
180 return 0; 173 return 0;
181} 174}
182 175
183/* 176int nilfs_init_gcinode(struct inode *inode)
184 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
185 * @nilfs - the_nilfs
186 *
187 * Return Value: On success, 0.
188 * On error, a negative error code is returned.
189 */
190int nilfs_init_gccache(struct the_nilfs *nilfs)
191{ 177{
192 int loop; 178 struct nilfs_inode_info *ii = NILFS_I(inode);
193
194 BUG_ON(nilfs->ns_gc_inodes_h);
195
196 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
197
198 nilfs->ns_gc_inodes_h =
199 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
200 GFP_NOFS);
201 if (nilfs->ns_gc_inodes_h == NULL)
202 return -ENOMEM;
203
204 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
205 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
206 return 0;
207}
208
209/*
210 * nilfs_destroy_gccache() - free gc_inode hash table
211 * @nilfs - the nilfs
212 */
213void nilfs_destroy_gccache(struct the_nilfs *nilfs)
214{
215 if (nilfs->ns_gc_inodes_h) {
216 nilfs_remove_all_gcinode(nilfs);
217 kfree(nilfs->ns_gc_inodes_h);
218 nilfs->ns_gc_inodes_h = NULL;
219 }
220}
221
222static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
223 __u64 cno)
224{
225 struct inode *inode;
226 struct nilfs_inode_info *ii;
227
228 inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
229 if (!inode)
230 return NULL;
231 179
232 inode->i_op = NULL; 180 inode->i_mode = S_IFREG;
233 inode->i_fop = NULL; 181 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
234 inode->i_mapping->a_ops = &def_gcinode_aops; 182 inode->i_mapping->a_ops = &def_gcinode_aops;
183 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
235 184
236 ii = NILFS_I(inode);
237 ii->i_cno = cno;
238 ii->i_flags = 0; 185 ii->i_flags = 0;
239 ii->i_state = 1 << NILFS_I_GCINODE;
240 ii->i_bh = NULL;
241 nilfs_bmap_init_gc(ii->i_bmap); 186 nilfs_bmap_init_gc(ii->i_bmap);
242 187
243 return inode; 188 return 0;
244}
245
246static unsigned long ihash(ino_t ino, __u64 cno)
247{
248 return hash_long((unsigned long)((ino << 2) + cno),
249 NILFS_GCINODE_HASH_BITS);
250}
251
252/*
253 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
254 */
255struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
256{
257 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
258 struct hlist_node *node;
259 struct inode *inode;
260
261 hlist_for_each_entry(inode, node, head, i_hash) {
262 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
263 return inode;
264 }
265
266 inode = alloc_gcinode(nilfs, ino, cno);
267 if (likely(inode)) {
268 hlist_add_head(&inode->i_hash, head);
269 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
270 }
271 return inode;
272}
273
274/*
275 * nilfs_clear_gcinode() - clear and free a gc inode
276 */
277void nilfs_clear_gcinode(struct inode *inode)
278{
279 nilfs_mdt_destroy(inode);
280} 189}
281 190
282/* 191/**
283 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs 192 * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
284 */ 193 */
285void nilfs_remove_all_gcinode(struct the_nilfs *nilfs) 194void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
286{ 195{
287 struct hlist_head *head = nilfs->ns_gc_inodes_h; 196 struct list_head *head = &nilfs->ns_gc_inodes;
288 struct hlist_node *node, *n; 197 struct nilfs_inode_info *ii;
289 struct inode *inode;
290 int loop;
291 198
292 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) { 199 while (!list_empty(head)) {
293 hlist_for_each_entry_safe(inode, node, n, head, i_hash) { 200 ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
294 hlist_del_init(&inode->i_hash); 201 list_del_init(&ii->i_dirty);
295 list_del_init(&NILFS_I(inode)->i_dirty); 202 iput(&ii->vfs_inode);
296 nilfs_clear_gcinode(inode); /* might sleep */
297 }
298 } 203 }
299} 204}
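
With the private hash table gone, GC dummy inodes are found or created through the regular inode cache, keyed by (ino, cno, for_gc); the lookup helpers appear in the inode.c hunks below. A hedged sketch of the likely constructor shape — nilfs_iget_for_gc() is the name this suggests, following the same I_NEW/iget_failed() convention as the other converted constructors:

static struct inode *nilfs_iget_for_gc(struct super_block *sb,
				       unsigned long ino, __u64 cno)
{
	struct nilfs_iget_args args = {
		.ino = ino, .root = NULL, .cno = cno, .for_gc = 1
	};
	struct inode *inode;
	int err;

	inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
	if (unlikely(!inode))
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* already set up by a prior GC pass */

	err = nilfs_init_gcinode(inode);	/* from the hunk above */
	if (unlikely(err)) {
		iget_failed(inode);
		return ERR_PTR(err);
	}
	unlock_new_inode(inode);
	return inode;
}
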
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 922d9dd42c8f..bfc73d3a30ed 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -149,37 +149,53 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
149 } 149 }
150 150
151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); 151 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
152 if (unlikely(err)) { 152 if (unlikely(err))
153 if (err == -EINVAL) 153 nilfs_warning(sb, __func__, "unable to read inode: %lu",
154 nilfs_error(sb, __func__, "ifile is broken"); 154 (unsigned long) ino);
155 else
156 nilfs_warning(sb, __func__,
157 "unable to read inode: %lu",
158 (unsigned long) ino);
159 }
160 return err; 155 return err;
161} 156}
162 157
163/** 158/**
164 * nilfs_ifile_new - create inode file 159 * nilfs_ifile_read - read or get ifile inode
165 * @sbi: nilfs_sb_info struct 160 * @sb: super block instance
161 * @root: root object
166 * @inode_size: size of an inode 162 * @inode_size: size of an inode
163 * @raw_inode: on-disk ifile inode
164 * @inodep: buffer to store the inode
167 */ 165 */
168struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size) 166int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
167 size_t inode_size, struct nilfs_inode *raw_inode,
168 struct inode **inodep)
169{ 169{
170 struct inode *ifile; 170 struct inode *ifile;
171 int err; 171 int err;
172 172
173 ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO, 173 ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
174 sizeof(struct nilfs_ifile_info)); 174 if (unlikely(!ifile))
175 if (ifile) { 175 return -ENOMEM;
176 err = nilfs_palloc_init_blockgroup(ifile, inode_size); 176 if (!(ifile->i_state & I_NEW))
177 if (unlikely(err)) { 177 goto out;
178 nilfs_mdt_destroy(ifile); 178
179 return NULL; 179 err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
180 } 180 sizeof(struct nilfs_ifile_info));
181 nilfs_palloc_setup_cache(ifile, 181 if (err)
182 &NILFS_IFILE_I(ifile)->palloc_cache); 182 goto failed;
183 } 183
184 return ifile; 184 err = nilfs_palloc_init_blockgroup(ifile, inode_size);
185 if (err)
186 goto failed;
187
188 nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
189
190 err = nilfs_read_inode_common(ifile, raw_inode);
191 if (err)
192 goto failed;
193
194 unlock_new_inode(ifile);
195 out:
196 *inodep = ifile;
197 return 0;
198 failed:
199 iget_failed(ifile);
200 return err;
185} 201}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index cbca32e498f2..59b6f2b51df6 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
49int nilfs_ifile_delete_inode(struct inode *, ino_t); 49int nilfs_ifile_delete_inode(struct inode *, ino_t);
50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); 50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
51 51
52struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size); 52int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
53 size_t inode_size, struct nilfs_inode *raw_inode,
54 struct inode **inodep);
53 55
54#endif /* _NILFS_IFILE_H */ 56#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index eccb2f2e2315..2fd440d8d6b8 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -34,6 +34,12 @@
34#include "cpfile.h" 34#include "cpfile.h"
35#include "ifile.h" 35#include "ifile.h"
36 36
37struct nilfs_iget_args {
38 u64 ino;
39 __u64 cno;
40 struct nilfs_root *root;
41 int for_gc;
42};
37 43
38/** 44/**
39 * nilfs_get_block() - get a file block on the filesystem (callback function) 45 * nilfs_get_block() - get a file block on the filesystem (callback function)
@@ -52,7 +58,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
52 struct nilfs_inode_info *ii = NILFS_I(inode); 58 struct nilfs_inode_info *ii = NILFS_I(inode);
53 __u64 blknum = 0; 59 __u64 blknum = 0;
54 int err = 0, ret; 60 int err = 0, ret;
55 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); 61 struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
56 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 62 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
57 63
58 down_read(&NILFS_MDT(dat)->mi_sem); 64 down_read(&NILFS_MDT(dat)->mi_sem);
@@ -90,11 +96,6 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
90 inode->i_ino, 96 inode->i_ino,
91 (unsigned long long)blkoff); 97 (unsigned long long)blkoff);
92 err = 0; 98 err = 0;
93 } else if (err == -EINVAL) {
94 nilfs_error(inode->i_sb, __func__,
95 "broken bmap (inode=%lu)\n",
96 inode->i_ino);
97 err = -EIO;
98 } 99 }
99 nilfs_transaction_abort(inode->i_sb); 100 nilfs_transaction_abort(inode->i_sb);
100 goto out; 101 goto out;
@@ -103,6 +104,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
103 nilfs_transaction_commit(inode->i_sb); /* never fails */ 104 nilfs_transaction_commit(inode->i_sb); /* never fails */
104 /* Error handling should be detailed */ 105 /* Error handling should be detailed */
105 set_buffer_new(bh_result); 106 set_buffer_new(bh_result);
107 set_buffer_delay(bh_result);
106 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed 108 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
107 to proper value */ 109 to proper value */
108 } else if (ret == -ENOENT) { 110 } else if (ret == -ENOENT) {
@@ -179,10 +181,9 @@ static int nilfs_set_page_dirty(struct page *page)
179 181
180 if (ret) { 182 if (ret) {
181 struct inode *inode = page->mapping->host; 183 struct inode *inode = page->mapping->host;
182 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
183 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 184 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
184 185
185 nilfs_set_file_dirty(sbi, inode, nr_dirty); 186 nilfs_set_file_dirty(inode, nr_dirty);
186 } 187 }
187 return ret; 188 return ret;
188} 189}
@@ -223,7 +224,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
223 start + copied); 224 start + copied);
224 copied = generic_write_end(file, mapping, pos, len, copied, page, 225 copied = generic_write_end(file, mapping, pos, len, copied, page,
225 fsdata); 226 fsdata);
226 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); 227 nilfs_set_file_dirty(inode, nr_dirty);
227 err = nilfs_transaction_commit(inode->i_sb); 228 err = nilfs_transaction_commit(inode->i_sb);
228 return err ? : copied; 229 return err ? : copied;
229} 230}
@@ -279,6 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
279 struct nilfs_sb_info *sbi = NILFS_SB(sb); 280 struct nilfs_sb_info *sbi = NILFS_SB(sb);
280 struct inode *inode; 281 struct inode *inode;
281 struct nilfs_inode_info *ii; 282 struct nilfs_inode_info *ii;
283 struct nilfs_root *root;
282 int err = -ENOMEM; 284 int err = -ENOMEM;
283 ino_t ino; 285 ino_t ino;
284 286
@@ -289,15 +291,17 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
289 mapping_set_gfp_mask(inode->i_mapping, 291 mapping_set_gfp_mask(inode->i_mapping,
290 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 292 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
291 293
294 root = NILFS_I(dir)->i_root;
292 ii = NILFS_I(inode); 295 ii = NILFS_I(inode);
293 ii->i_state = 1 << NILFS_I_NEW; 296 ii->i_state = 1 << NILFS_I_NEW;
297 ii->i_root = root;
294 298
295 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); 299 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
296 if (unlikely(err)) 300 if (unlikely(err))
297 goto failed_ifile_create_inode; 301 goto failed_ifile_create_inode;
298 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 302 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
299 303
300 atomic_inc(&sbi->s_inodes_count); 304 atomic_inc(&root->inodes_count);
301 inode_init_owner(inode, dir, mode); 305 inode_init_owner(inode, dir, mode);
302 inode->i_ino = ino; 306 inode->i_ino = ino;
303 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 307 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -320,7 +324,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
320 /* ii->i_file_acl = 0; */ 324 /* ii->i_file_acl = 0; */
321 /* ii->i_dir_acl = 0; */ 325 /* ii->i_dir_acl = 0; */
322 ii->i_dir_start_lookup = 0; 326 ii->i_dir_start_lookup = 0;
323 ii->i_cno = 0;
324 nilfs_set_inode_flags(inode); 327 nilfs_set_inode_flags(inode);
325 spin_lock(&sbi->s_next_gen_lock); 328 spin_lock(&sbi->s_next_gen_lock);
326 inode->i_generation = sbi->s_next_generation++; 329 inode->i_generation = sbi->s_next_generation++;
@@ -350,16 +353,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
350 return ERR_PTR(err); 353 return ERR_PTR(err);
351} 354}
352 355
353void nilfs_free_inode(struct inode *inode)
354{
355 struct super_block *sb = inode->i_sb;
356 struct nilfs_sb_info *sbi = NILFS_SB(sb);
357
358 /* XXX: check error code? Is there any thing I can do? */
359 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
360 atomic_dec(&sbi->s_inodes_count);
361}
362
363void nilfs_set_inode_flags(struct inode *inode) 356void nilfs_set_inode_flags(struct inode *inode)
364{ 357{
365 unsigned int flags = NILFS_I(inode)->i_flags; 358 unsigned int flags = NILFS_I(inode)->i_flags;
@@ -410,7 +403,6 @@ int nilfs_read_inode_common(struct inode *inode,
410 0 : le32_to_cpu(raw_inode->i_dir_acl); 403 0 : le32_to_cpu(raw_inode->i_dir_acl);
411#endif 404#endif
412 ii->i_dir_start_lookup = 0; 405 ii->i_dir_start_lookup = 0;
413 ii->i_cno = 0;
414 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 406 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
415 407
416 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 408 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -424,21 +416,21 @@ int nilfs_read_inode_common(struct inode *inode,
424 return 0; 416 return 0;
425} 417}
426 418
427static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, 419static int __nilfs_read_inode(struct super_block *sb,
420 struct nilfs_root *root, unsigned long ino,
428 struct inode *inode) 421 struct inode *inode)
429{ 422{
430 struct nilfs_sb_info *sbi = NILFS_SB(sb); 423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
431 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
432 struct buffer_head *bh; 424 struct buffer_head *bh;
433 struct nilfs_inode *raw_inode; 425 struct nilfs_inode *raw_inode;
434 int err; 426 int err;
435 427
436 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 428 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
437 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh); 429 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
438 if (unlikely(err)) 430 if (unlikely(err))
439 goto bad_inode; 431 goto bad_inode;
440 432
441 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 433 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
442 434
443 err = nilfs_read_inode_common(inode, raw_inode); 435 err = nilfs_read_inode_common(inode, raw_inode);
444 if (err) 436 if (err)
@@ -461,33 +453,110 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
461 inode, inode->i_mode, 453 inode, inode->i_mode,
462 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 454 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
463 } 455 }
464 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 456 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
465 brelse(bh); 457 brelse(bh);
466 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 458 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
467 nilfs_set_inode_flags(inode); 459 nilfs_set_inode_flags(inode);
468 return 0; 460 return 0;
469 461
470 failed_unmap: 462 failed_unmap:
471 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 463 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
472 brelse(bh); 464 brelse(bh);
473 465
474 bad_inode: 466 bad_inode:
475 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 467 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
476 return err; 468 return err;
477} 469}
478 470
479struct inode *nilfs_iget(struct super_block *sb, unsigned long ino) 471static int nilfs_iget_test(struct inode *inode, void *opaque)
472{
473 struct nilfs_iget_args *args = opaque;
474 struct nilfs_inode_info *ii;
475
476 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
477 return 0;
478
479 ii = NILFS_I(inode);
480 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
481 return !args->for_gc;
482
483 return args->for_gc && args->cno == ii->i_cno;
484}
485
486static int nilfs_iget_set(struct inode *inode, void *opaque)
487{
488 struct nilfs_iget_args *args = opaque;
489
490 inode->i_ino = args->ino;
491 if (args->for_gc) {
492 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
493 NILFS_I(inode)->i_cno = args->cno;
494 NILFS_I(inode)->i_root = NULL;
495 } else {
496 if (args->root && args->ino == NILFS_ROOT_INO)
497 nilfs_get_root(args->root);
498 NILFS_I(inode)->i_root = args->root;
499 }
500 return 0;
501}
502
503struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
504 unsigned long ino)
505{
506 struct nilfs_iget_args args = {
507 .ino = ino, .root = root, .cno = 0, .for_gc = 0
508 };
509
510 return ilookup5(sb, ino, nilfs_iget_test, &args);
511}
512
513struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
514 unsigned long ino)
515{
516 struct nilfs_iget_args args = {
517 .ino = ino, .root = root, .cno = 0, .for_gc = 0
518 };
519
520 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
521}
522
523struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
524 unsigned long ino)
525{
526 struct inode *inode;
527 int err;
528
529 inode = nilfs_iget_locked(sb, root, ino);
530 if (unlikely(!inode))
531 return ERR_PTR(-ENOMEM);
532 if (!(inode->i_state & I_NEW))
533 return inode;
534
535 err = __nilfs_read_inode(sb, root, ino, inode);
536 if (unlikely(err)) {
537 iget_failed(inode);
538 return ERR_PTR(err);
539 }
540 unlock_new_inode(inode);
541 return inode;
542}
543
544struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
545 __u64 cno)
480{ 546{
547 struct nilfs_iget_args args = {
548 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
549 };
481 struct inode *inode; 550 struct inode *inode;
482 int err; 551 int err;
483 552
484 inode = iget_locked(sb, ino); 553 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
485 if (unlikely(!inode)) 554 if (unlikely(!inode))
486 return ERR_PTR(-ENOMEM); 555 return ERR_PTR(-ENOMEM);
487 if (!(inode->i_state & I_NEW)) 556 if (!(inode->i_state & I_NEW))
488 return inode; 557 return inode;
489 558
490 err = __nilfs_read_inode(sb, ino, inode); 559 err = nilfs_init_gcinode(inode);
491 if (unlikely(err)) { 560 if (unlikely(err)) {
492 iget_failed(inode); 561 iget_failed(inode);
493 return ERR_PTR(err); 562 return ERR_PTR(err);
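Note: the hunks above replace iget_locked(), which keys the inode cache on the inode number alone, with iget5_locked() plus the nilfs_iget_test()/nilfs_iget_set() callbacks, so the same ino can be cached once per checkpoint root (and separately per checkpoint number for GC inodes). Below is a minimal userspace sketch of the two-callback lookup idea; every name in it (demo_*) is hypothetical and it is not part of the patch.

/* Illustrative only -- a userspace model, not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct demo_inode {
	unsigned long ino;
	void *root;              /* which checkpoint tree the inode belongs to */
	struct demo_inode *next; /* cache-chain link */
};

struct demo_key {
	unsigned long ino;
	void *root;
};

/* test(): does a cached inode match the full composite key? */
static int demo_test(struct demo_inode *inode, void *opaque)
{
	struct demo_key *key = opaque;

	return inode->ino == key->ino && inode->root == key->root;
}

/* set(): initialize the key fields of a freshly allocated inode. */
static void demo_set(struct demo_inode *inode, void *opaque)
{
	struct demo_key *key = opaque;

	inode->ino = key->ino;
	inode->root = key->root;
}

static struct demo_inode *cache;

static struct demo_inode *demo_iget5(struct demo_key *key)
{
	struct demo_inode *inode;

	for (inode = cache; inode; inode = inode->next)
		if (demo_test(inode, key))
			return inode;           /* hit on (root, ino) */

	inode = calloc(1, sizeof(*inode));      /* miss: insert a new one */
	if (!inode)
		return NULL;
	demo_set(inode, key);
	inode->next = cache;
	cache = inode;
	return inode;
}

int main(void)
{
	int root_a, root_b;  /* stand-ins for two struct nilfs_root objects */
	struct demo_key k1 = { .ino = 12, .root = &root_a };
	struct demo_key k2 = { .ino = 12, .root = &root_b };

	/* Same inode number under different roots: two distinct inodes. */
	printf("distinct: %d\n", demo_iget5(&k1) != demo_iget5(&k2));
	return 0;
}

The test callback decides whether a cached object matches the full composite key; the set callback initializes the key fields of a newly inserted object before it becomes visible to other lookups.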
@@ -528,21 +597,20 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
528{ 597{
529 ino_t ino = inode->i_ino; 598 ino_t ino = inode->i_ino;
530 struct nilfs_inode_info *ii = NILFS_I(inode); 599 struct nilfs_inode_info *ii = NILFS_I(inode);
531 struct super_block *sb = inode->i_sb; 600 struct inode *ifile = ii->i_root->ifile;
532 struct nilfs_sb_info *sbi = NILFS_SB(sb);
533 struct nilfs_inode *raw_inode; 601 struct nilfs_inode *raw_inode;
534 602
535 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); 603 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
536 604
537 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 605 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
538 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); 606 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
539 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 607 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
540 608
541 nilfs_write_inode_common(inode, raw_inode, 0); 609 nilfs_write_inode_common(inode, raw_inode, 0);
 542 /* XXX: calling with has_bmap = 0 is a workaround to avoid 610 /* XXX: calling with has_bmap = 0 is a workaround to avoid
 543 a bmap deadlock. This delays the update of i_bmap until 611 a bmap deadlock. This delays the update of i_bmap until
 544 just before writing */ 612 just before writing */
545 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); 613 nilfs_ifile_unmap_inode(ifile, ino, ibh);
546} 614}
547 615
548#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 616#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
@@ -555,7 +623,7 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
555 623
556 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 624 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
557 return; 625 return;
558 repeat: 626repeat:
559 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 627 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
560 if (ret == -ENOENT) 628 if (ret == -ENOENT)
561 return; 629 return;
@@ -572,14 +640,10 @@ static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
572 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 640 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
573 goto repeat; 641 goto repeat;
574 642
575 failed: 643failed:
576 if (ret == -EINVAL) 644 nilfs_warning(ii->vfs_inode.i_sb, __func__,
577 nilfs_error(ii->vfs_inode.i_sb, __func__, 645 "failed to truncate bmap (ino=%lu, err=%d)",
578 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); 646 ii->vfs_inode.i_ino, ret);
579 else
580 nilfs_warning(ii->vfs_inode.i_sb, __func__,
581 "failed to truncate bmap (ino=%lu, err=%d)",
582 ii->vfs_inode.i_ino, ret);
583} 647}
584 648
585void nilfs_truncate(struct inode *inode) 649void nilfs_truncate(struct inode *inode)
@@ -608,7 +672,7 @@ void nilfs_truncate(struct inode *inode)
608 nilfs_set_transaction_flag(NILFS_TI_SYNC); 672 nilfs_set_transaction_flag(NILFS_TI_SYNC);
609 673
610 nilfs_mark_inode_dirty(inode); 674 nilfs_mark_inode_dirty(inode);
611 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); 675 nilfs_set_file_dirty(inode, 0);
612 nilfs_transaction_commit(sb); 676 nilfs_transaction_commit(sb);
 613 /* May construct a logical segment and may fail in sync mode, 677 /* May construct a logical segment and may fail in sync mode,
 614 but truncate has no return value. */ 678 but truncate has no return value. */
@@ -617,6 +681,7 @@ void nilfs_truncate(struct inode *inode)
617static void nilfs_clear_inode(struct inode *inode) 681static void nilfs_clear_inode(struct inode *inode)
618{ 682{
619 struct nilfs_inode_info *ii = NILFS_I(inode); 683 struct nilfs_inode_info *ii = NILFS_I(inode);
684 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
620 685
621 /* 686 /*
622 * Free resources allocated in nilfs_read_inode(), here. 687 * Free resources allocated in nilfs_read_inode(), here.
@@ -625,10 +690,16 @@ static void nilfs_clear_inode(struct inode *inode)
625 brelse(ii->i_bh); 690 brelse(ii->i_bh);
626 ii->i_bh = NULL; 691 ii->i_bh = NULL;
627 692
693 if (mdi && mdi->mi_palloc_cache)
694 nilfs_palloc_destroy_cache(inode);
695
628 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 696 if (test_bit(NILFS_I_BMAP, &ii->i_state))
629 nilfs_bmap_clear(ii->i_bmap); 697 nilfs_bmap_clear(ii->i_bmap);
630 698
631 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 699 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
700
701 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
702 nilfs_put_root(ii->i_root);
632} 703}
633 704
634void nilfs_evict_inode(struct inode *inode) 705void nilfs_evict_inode(struct inode *inode)
@@ -637,7 +708,7 @@ void nilfs_evict_inode(struct inode *inode)
637 struct super_block *sb = inode->i_sb; 708 struct super_block *sb = inode->i_sb;
638 struct nilfs_inode_info *ii = NILFS_I(inode); 709 struct nilfs_inode_info *ii = NILFS_I(inode);
639 710
640 if (inode->i_nlink || unlikely(is_bad_inode(inode))) { 711 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
641 if (inode->i_data.nrpages) 712 if (inode->i_data.nrpages)
642 truncate_inode_pages(&inode->i_data, 0); 713 truncate_inode_pages(&inode->i_data, 0);
643 end_writeback(inode); 714 end_writeback(inode);
@@ -649,12 +720,16 @@ void nilfs_evict_inode(struct inode *inode)
649 if (inode->i_data.nrpages) 720 if (inode->i_data.nrpages)
650 truncate_inode_pages(&inode->i_data, 0); 721 truncate_inode_pages(&inode->i_data, 0);
651 722
723 /* TODO: some of the following operations may fail. */
652 nilfs_truncate_bmap(ii, 0); 724 nilfs_truncate_bmap(ii, 0);
653 nilfs_mark_inode_dirty(inode); 725 nilfs_mark_inode_dirty(inode);
654 end_writeback(inode); 726 end_writeback(inode);
727
728 nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
729 atomic_dec(&ii->i_root->inodes_count);
730
655 nilfs_clear_inode(inode); 731 nilfs_clear_inode(inode);
656 nilfs_free_inode(inode); 732
657 /* nilfs_free_inode() marks inode buffer dirty */
658 if (IS_SYNC(inode)) 733 if (IS_SYNC(inode))
659 nilfs_set_transaction_flag(NILFS_TI_SYNC); 734 nilfs_set_transaction_flag(NILFS_TI_SYNC);
660 nilfs_transaction_commit(sb); 735 nilfs_transaction_commit(sb);
@@ -700,17 +775,32 @@ out_err:
700 return err; 775 return err;
701} 776}
702 777
703int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 778int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
704 struct buffer_head **pbh) 779{
780 struct nilfs_root *root;
781
782 if (flags & IPERM_FLAG_RCU)
783 return -ECHILD;
784
785 root = NILFS_I(inode)->i_root;
786 if ((mask & MAY_WRITE) && root &&
787 root->cno != NILFS_CPTREE_CURRENT_CNO)
788 return -EROFS; /* snapshot is not writable */
789
790 return generic_permission(inode, mask, flags, NULL);
791}
792
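Note: nilfs_permission() above layers one filesystem-specific check, rejecting writes when the inode belongs to a snapshot root, in front of generic_permission(). A hedged userspace probe of the expected behavior, assuming a NILFS snapshot is mounted at the hypothetical path /mnt/nilfs-snapshot:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Path is an assumption; point it at any file in a snapshot mount. */
	int fd = open("/mnt/nilfs-snapshot/file", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		printf("open: %s\n", strerror(errno)); /* expect EROFS */
	else
		close(fd);
	return 0;
}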
793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
705{ 794{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
706 struct nilfs_inode_info *ii = NILFS_I(inode); 796 struct nilfs_inode_info *ii = NILFS_I(inode);
707 int err; 797 int err;
708 798
709 spin_lock(&sbi->s_inode_lock); 799 spin_lock(&sbi->s_inode_lock);
710 if (ii->i_bh == NULL) { 800 if (ii->i_bh == NULL) {
711 spin_unlock(&sbi->s_inode_lock); 801 spin_unlock(&sbi->s_inode_lock);
712 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, 802 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
713 pbh); 803 inode->i_ino, pbh);
714 if (unlikely(err)) 804 if (unlikely(err))
715 return err; 805 return err;
716 spin_lock(&sbi->s_inode_lock); 806 spin_lock(&sbi->s_inode_lock);
@@ -743,9 +833,9 @@ int nilfs_inode_dirty(struct inode *inode)
743 return ret; 833 return ret;
744} 834}
745 835
746int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, 836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
747 unsigned nr_dirty)
748{ 837{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
749 struct nilfs_inode_info *ii = NILFS_I(inode); 839 struct nilfs_inode_info *ii = NILFS_I(inode);
750 840
751 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
@@ -778,11 +868,10 @@ int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
778 868
779int nilfs_mark_inode_dirty(struct inode *inode) 869int nilfs_mark_inode_dirty(struct inode *inode)
780{ 870{
781 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
782 struct buffer_head *ibh; 871 struct buffer_head *ibh;
783 int err; 872 int err;
784 873
785 err = nilfs_load_inode_block(sbi, inode, &ibh); 874 err = nilfs_load_inode_block(inode, &ibh);
786 if (unlikely(err)) { 875 if (unlikely(err)) {
787 nilfs_warning(inode->i_sb, __func__, 876 nilfs_warning(inode->i_sb, __func__,
788 "failed to reget inode block.\n"); 877 "failed to reget inode block.\n");
@@ -790,7 +879,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
790 } 879 }
791 nilfs_update_inode(inode, ibh); 880 nilfs_update_inode(inode, ibh);
792 nilfs_mdt_mark_buffer_dirty(ibh); 881 nilfs_mdt_mark_buffer_dirty(ibh);
793 nilfs_mdt_mark_dirty(sbi->s_ifile); 882 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
794 brelse(ibh); 883 brelse(ibh);
795 return 0; 884 return 0;
796} 885}
@@ -808,6 +897,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
808void nilfs_dirty_inode(struct inode *inode) 897void nilfs_dirty_inode(struct inode *inode)
809{ 898{
810 struct nilfs_transaction_info ti; 899 struct nilfs_transaction_info ti;
900 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
811 901
812 if (is_bad_inode(inode)) { 902 if (is_bad_inode(inode)) {
813 nilfs_warning(inode->i_sb, __func__, 903 nilfs_warning(inode->i_sb, __func__,
@@ -815,7 +905,142 @@ void nilfs_dirty_inode(struct inode *inode)
815 dump_stack(); 905 dump_stack();
816 return; 906 return;
817 } 907 }
908 if (mdi) {
909 nilfs_mdt_mark_dirty(inode);
910 return;
911 }
818 nilfs_transaction_begin(inode->i_sb, &ti, 0); 912 nilfs_transaction_begin(inode->i_sb, &ti, 0);
819 nilfs_mark_inode_dirty(inode); 913 nilfs_mark_inode_dirty(inode);
820 nilfs_transaction_commit(inode->i_sb); /* never fails */ 914 nilfs_transaction_commit(inode->i_sb); /* never fails */
821} 915}
916
917int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
918 __u64 start, __u64 len)
919{
920 struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
921 __u64 logical = 0, phys = 0, size = 0;
922 __u32 flags = 0;
923 loff_t isize;
924 sector_t blkoff, end_blkoff;
925 sector_t delalloc_blkoff;
926 unsigned long delalloc_blklen;
927 unsigned int blkbits = inode->i_blkbits;
928 int ret, n;
929
930 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
931 if (ret)
932 return ret;
933
934 mutex_lock(&inode->i_mutex);
935
936 isize = i_size_read(inode);
937
938 blkoff = start >> blkbits;
939 end_blkoff = (start + len - 1) >> blkbits;
940
941 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
942 &delalloc_blkoff);
943
944 do {
945 __u64 blkphy;
946 unsigned int maxblocks;
947
948 if (delalloc_blklen && blkoff == delalloc_blkoff) {
949 if (size) {
950 /* End of the current extent */
951 ret = fiemap_fill_next_extent(
952 fieinfo, logical, phys, size, flags);
953 if (ret)
954 break;
955 }
956 if (blkoff > end_blkoff)
957 break;
958
959 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
960 logical = blkoff << blkbits;
961 phys = 0;
962 size = delalloc_blklen << blkbits;
963
964 blkoff = delalloc_blkoff + delalloc_blklen;
965 delalloc_blklen = nilfs_find_uncommitted_extent(
966 inode, blkoff, &delalloc_blkoff);
967 continue;
968 }
969
970 /*
971 * Limit the number of blocks that we look up so as
972 * not to get into the next delayed allocation extent.
973 */
974 maxblocks = INT_MAX;
975 if (delalloc_blklen)
976 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
977 maxblocks);
978 blkphy = 0;
979
980 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
981 n = nilfs_bmap_lookup_contig(
982 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
983 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
984
985 if (n < 0) {
986 int past_eof;
987
988 if (unlikely(n != -ENOENT))
989 break; /* error */
990
991 /* HOLE */
992 blkoff++;
993 past_eof = ((blkoff << blkbits) >= isize);
994
995 if (size) {
996 /* End of the current extent */
997
998 if (past_eof)
999 flags |= FIEMAP_EXTENT_LAST;
1000
1001 ret = fiemap_fill_next_extent(
1002 fieinfo, logical, phys, size, flags);
1003 if (ret)
1004 break;
1005 size = 0;
1006 }
1007 if (blkoff > end_blkoff || past_eof)
1008 break;
1009 } else {
1010 if (size) {
1011 if (phys && blkphy << blkbits == phys + size) {
1012 /* The current extent goes on */
1013 size += n << blkbits;
1014 } else {
1015 /* Terminate the current extent */
1016 ret = fiemap_fill_next_extent(
1017 fieinfo, logical, phys, size,
1018 flags);
1019 if (ret || blkoff > end_blkoff)
1020 break;
1021
1022 /* Start another extent */
1023 flags = FIEMAP_EXTENT_MERGED;
1024 logical = blkoff << blkbits;
1025 phys = blkphy << blkbits;
1026 size = n << blkbits;
1027 }
1028 } else {
1029 /* Start a new extent */
1030 flags = FIEMAP_EXTENT_MERGED;
1031 logical = blkoff << blkbits;
1032 phys = blkphy << blkbits;
1033 size = n << blkbits;
1034 }
1035 blkoff += n;
1036 }
1037 cond_resched();
1038 } while (true);
1039
1040 /* If ret is 1 then we just hit the end of the extent array */
1041 if (ret == 1)
1042 ret = 0;
1043
1044 mutex_unlock(&inode->i_mutex);
1045 return ret;
1046}
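Note: nilfs_fiemap() above walks the file through three states: delayed-allocation extents reported by nilfs_find_uncommitted_extent(), mapped runs from nilfs_bmap_lookup_contig(), and holes (-ENOENT), merging adjacent blocks into extents and marking the final one with FIEMAP_EXTENT_LAST. A small sketch of a userspace consumer for the FS_IOC_FIEMAP ioctl follows; error handling is abbreviated and the 32-extent buffer size is an arbitrary choice.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;           /* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC; /* the only flag nilfs_fiemap() accepts */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu phys %llu len %llu flags %#x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	close(fd);
	return 0;
}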
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f90a33d9a5b0..496738963fdb 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -22,7 +22,6 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/slab.h> 25#include <linux/slab.h>
27#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
28#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
@@ -118,7 +117,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
118 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 117 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
119 goto out; 118 goto out;
120 119
121 mutex_lock(&nilfs->ns_mount_mutex); 120 down_read(&inode->i_sb->s_umount);
122 121
123 nilfs_transaction_begin(inode->i_sb, &ti, 0); 122 nilfs_transaction_begin(inode->i_sb, &ti, 0);
124 ret = nilfs_cpfile_change_cpmode( 123 ret = nilfs_cpfile_change_cpmode(
@@ -128,7 +127,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
128 else 127 else
129 nilfs_transaction_commit(inode->i_sb); /* never fails */ 128 nilfs_transaction_commit(inode->i_sb); /* never fails */
130 129
131 mutex_unlock(&nilfs->ns_mount_mutex); 130 up_read(&inode->i_sb->s_umount);
132out: 131out:
133 mnt_drop_write(filp->f_path.mnt); 132 mnt_drop_write(filp->f_path.mnt);
134 return ret; 133 return ret;
@@ -234,7 +233,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
234 int ret; 233 int ret;
235 234
236 down_read(&nilfs->ns_segctor_sem); 235 down_read(&nilfs->ns_segctor_sem);
237 ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs); 236 ret = nilfs_dat_get_vinfo(nilfs->ns_dat, buf, size, nmembs);
238 up_read(&nilfs->ns_segctor_sem); 237 up_read(&nilfs->ns_segctor_sem);
239 return ret; 238 return ret;
240} 239}
@@ -243,8 +242,7 @@ static ssize_t
243nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, 242nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
244 void *buf, size_t size, size_t nmembs) 243 void *buf, size_t size, size_t nmembs)
245{ 244{
246 struct inode *dat = nilfs_dat_inode(nilfs); 245 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
247 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
248 struct nilfs_bdesc *bdescs = buf; 246 struct nilfs_bdesc *bdescs = buf;
249 int ret, i; 247 int ret, i;
250 248
@@ -334,10 +332,11 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
334 return 0; 332 return 0;
335} 333}
336 334
337static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, 335static int nilfs_ioctl_move_blocks(struct super_block *sb,
338 struct nilfs_argv *argv, void *buf) 336 struct nilfs_argv *argv, void *buf)
339{ 337{
340 size_t nmembs = argv->v_nmembs; 338 size_t nmembs = argv->v_nmembs;
339 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
341 struct inode *inode; 340 struct inode *inode;
342 struct nilfs_vdesc *vdesc; 341 struct nilfs_vdesc *vdesc;
343 struct buffer_head *bh, *n; 342 struct buffer_head *bh, *n;
@@ -349,19 +348,34 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
349 for (i = 0, vdesc = buf; i < nmembs; ) { 348 for (i = 0, vdesc = buf; i < nmembs; ) {
350 ino = vdesc->vd_ino; 349 ino = vdesc->vd_ino;
351 cno = vdesc->vd_cno; 350 cno = vdesc->vd_cno;
352 inode = nilfs_gc_iget(nilfs, ino, cno); 351 inode = nilfs_iget_for_gc(sb, ino, cno);
353 if (unlikely(inode == NULL)) { 352 if (IS_ERR(inode)) {
354 ret = -ENOMEM; 353 ret = PTR_ERR(inode);
355 goto failed; 354 goto failed;
356 } 355 }
356 if (list_empty(&NILFS_I(inode)->i_dirty)) {
357 /*
 358 * Add the inode to the GC inode list. Garbage collection
 359 * is serialized, so no two processes manipulate the
 360 * list simultaneously.
361 */
362 igrab(inode);
363 list_add(&NILFS_I(inode)->i_dirty,
364 &nilfs->ns_gc_inodes);
365 }
366
357 do { 367 do {
358 ret = nilfs_ioctl_move_inode_block(inode, vdesc, 368 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
359 &buffers); 369 &buffers);
360 if (unlikely(ret < 0)) 370 if (unlikely(ret < 0)) {
371 iput(inode);
361 goto failed; 372 goto failed;
373 }
362 vdesc++; 374 vdesc++;
363 } while (++i < nmembs && 375 } while (++i < nmembs &&
364 vdesc->vd_ino == ino && vdesc->vd_cno == cno); 376 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
377
 378 iput(inode); /* the inode remains on the GC inode list */
365 } 379 }
366 380
367 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { 381 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
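Note: the reference counting in the hunk above is easy to misread. nilfs_iget_for_gc() returns one lookup reference, igrab() takes a second reference owned by ns_gc_inodes when the inode is first linked onto that list, and the per-iteration iput() drops only the lookup reference, so the list keeps the inode alive until nilfs_remove_all_gcinodes() runs. A toy userspace model of that ownership pattern (all names made up):

#include <assert.h>
#include <stdio.h>

struct obj {
	int refcount;
	int on_list;
};

static void get(struct obj *o) { o->refcount++; }

static void put(struct obj *o)
{
	assert(o->refcount > 0);
	o->refcount--;
}

int main(void)
{
	struct obj inode = { .refcount = 1 };   /* the lookup reference */

	if (!inode.on_list) {    /* list_empty(&ii->i_dirty) in the hunk */
		get(&inode);     /* igrab(): a reference owned by the list */
		inode.on_list = 1;
	}
	put(&inode);             /* the per-iteration iput() */

	/* The list reference keeps the inode pinned for the GC pass. */
	printf("refs held after the loop: %d\n", inode.refcount); /* 1 */

	put(&inode);             /* nilfs_remove_all_gcinodes() equivalent */
	return 0;
}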
@@ -406,7 +420,7 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
406 size_t nmembs = argv->v_nmembs; 420 size_t nmembs = argv->v_nmembs;
407 int ret; 421 int ret;
408 422
409 ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); 423 ret = nilfs_dat_freev(nilfs->ns_dat, buf, nmembs);
410 424
411 return (ret < 0) ? ret : nmembs; 425 return (ret < 0) ? ret : nmembs;
412} 426}
@@ -415,8 +429,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
415 struct nilfs_argv *argv, void *buf) 429 struct nilfs_argv *argv, void *buf)
416{ 430{
417 size_t nmembs = argv->v_nmembs; 431 size_t nmembs = argv->v_nmembs;
418 struct inode *dat = nilfs_dat_inode(nilfs); 432 struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap;
419 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
420 struct nilfs_bdesc *bdescs = buf; 433 struct nilfs_bdesc *bdescs = buf;
421 int ret, i; 434 int ret, i;
422 435
@@ -435,7 +448,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
435 /* skip dead block */ 448 /* skip dead block */
436 continue; 449 continue;
437 if (bdescs[i].bd_level == 0) { 450 if (bdescs[i].bd_level == 0) {
438 ret = nilfs_mdt_mark_block_dirty(dat, 451 ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat,
439 bdescs[i].bd_offset); 452 bdescs[i].bd_offset);
440 if (ret < 0) { 453 if (ret < 0) {
441 WARN_ON(ret == -ENOENT); 454 WARN_ON(ret == -ENOENT);
@@ -567,7 +580,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
567 } 580 }
568 581
569 /* 582 /*
570 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), 583 * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(),
 571 * which will operate on an inode list without blocking. 584 * which will operate on an inode list without blocking.
 572 * To protect the list from concurrent operations, 585 * To protect the list from concurrent operations,
 573 * nilfs_ioctl_move_blocks should be an atomic operation. 586 * nilfs_ioctl_move_blocks should be an atomic operation.
@@ -577,15 +590,16 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
577 goto out_free; 590 goto out_free;
578 } 591 }
579 592
580 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); 593 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
594
595 ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
581 if (ret < 0) 596 if (ret < 0)
582 printk(KERN_ERR "NILFS: GC failed during preparation: " 597 printk(KERN_ERR "NILFS: GC failed during preparation: "
583 "cannot read source blocks: err=%d\n", ret); 598 "cannot read source blocks: err=%d\n", ret);
584 else 599 else
585 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 600 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
586 601
587 if (ret < 0) 602 nilfs_remove_all_gcinodes(nilfs);
588 nilfs_remove_all_gcinode(nilfs);
589 clear_nilfs_gc_running(nilfs); 603 clear_nilfs_gc_running(nilfs);
590 604
591out_free: 605out_free:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d01aff4957d9..6a0e2a189f60 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -36,7 +36,6 @@
36 36
37#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) 37#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
38 38
39#define INIT_UNUSED_INODE_FIELDS
40 39
41static int 40static int
42nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, 41nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
@@ -78,25 +77,11 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
78 struct buffer_head *, 77 struct buffer_head *,
79 void *)) 78 void *))
80{ 79{
81 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
82 struct super_block *sb = inode->i_sb; 80 struct super_block *sb = inode->i_sb;
83 struct nilfs_transaction_info ti; 81 struct nilfs_transaction_info ti;
84 struct buffer_head *bh; 82 struct buffer_head *bh;
85 int err; 83 int err;
86 84
87 if (!sb) {
88 /*
89 * Make sure this function is not called from any
90 * read-only context.
91 */
92 if (!nilfs->ns_writer) {
93 WARN_ON(1);
94 err = -EROFS;
95 goto out;
96 }
97 sb = nilfs->ns_writer->s_super;
98 }
99
100 nilfs_transaction_begin(sb, &ti, 0); 85 nilfs_transaction_begin(sb, &ti, 0);
101 86
102 err = -ENOMEM; 87 err = -ENOMEM;
@@ -112,7 +97,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
112 if (buffer_uptodate(bh)) 97 if (buffer_uptodate(bh))
113 goto failed_bh; 98 goto failed_bh;
114 99
115 bh->b_bdev = nilfs->ns_bdev; 100 bh->b_bdev = sb->s_bdev;
116 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); 101 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
117 if (likely(!err)) { 102 if (likely(!err)) {
118 get_bh(bh); 103 get_bh(bh);
@@ -129,7 +114,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
129 err = nilfs_transaction_commit(sb); 114 err = nilfs_transaction_commit(sb);
130 else 115 else
131 nilfs_transaction_abort(sb); 116 nilfs_transaction_abort(sb);
132 out: 117
133 return err; 118 return err;
134} 119}
135 120
@@ -167,9 +152,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
167 unlock_buffer(bh); 152 unlock_buffer(bh);
168 goto failed_bh; 153 goto failed_bh;
169 } 154 }
170 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; 155 map_bh(bh, inode->i_sb, (sector_t)blknum);
171 bh->b_blocknr = (sector_t)blknum;
172 set_buffer_mapped(bh);
173 156
174 bh->b_end_io = end_buffer_read_sync; 157 bh->b_end_io = end_buffer_read_sync;
175 get_bh(bh); 158 get_bh(bh);
@@ -254,8 +237,6 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
254 * 237 *
255 * %-ENOENT - the specified block does not exist (hole block) 238 * %-ENOENT - the specified block does not exist (hole block)
256 * 239 *
257 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
258 *
259 * %-EROFS - Read only filesystem (for create mode) 240 * %-EROFS - Read only filesystem (for create mode)
260 */ 241 */
261int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, 242int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
@@ -290,8 +271,6 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
290 * %-ENOMEM - Insufficient memory available. 271 * %-ENOMEM - Insufficient memory available.
291 * 272 *
292 * %-EIO - I/O error 273 * %-EIO - I/O error
293 *
294 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
295 */ 274 */
296int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) 275int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
297{ 276{
@@ -367,8 +346,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
367 * %-EIO - I/O error 346 * %-EIO - I/O error
368 * 347 *
369 * %-ENOENT - the specified block does not exist (hole block) 348 * %-ENOENT - the specified block does not exist (hole block)
370 *
371 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
372 */ 349 */
373int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) 350int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
374{ 351{
@@ -398,35 +375,24 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
398static int 375static int
399nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) 376nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
400{ 377{
401 struct inode *inode = container_of(page->mapping, 378 struct inode *inode;
402 struct inode, i_data); 379 struct super_block *sb;
403 struct super_block *sb = inode->i_sb;
404 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
405 struct nilfs_sb_info *writer = NULL;
406 int err = 0; 380 int err = 0;
407 381
408 redirty_page_for_writepage(wbc, page); 382 redirty_page_for_writepage(wbc, page);
409 unlock_page(page); 383 unlock_page(page);
410 384
411 if (page->mapping->assoc_mapping) 385 inode = page->mapping->host;
412 return 0; /* Do not request flush for shadow page cache */ 386 if (!inode)
413 if (!sb) { 387 return 0;
414 down_read(&nilfs->ns_writer_sem); 388
415 writer = nilfs->ns_writer; 389 sb = inode->i_sb;
416 if (!writer) {
417 up_read(&nilfs->ns_writer_sem);
418 return -EROFS;
419 }
420 sb = writer->s_super;
421 }
422 390
423 if (wbc->sync_mode == WB_SYNC_ALL) 391 if (wbc->sync_mode == WB_SYNC_ALL)
424 err = nilfs_construct_segment(sb); 392 err = nilfs_construct_segment(sb);
425 else if (wbc->for_reclaim) 393 else if (wbc->for_reclaim)
426 nilfs_flush_segment(sb, inode->i_ino); 394 nilfs_flush_segment(sb, inode->i_ino);
427 395
428 if (writer)
429 up_read(&nilfs->ns_writer_sem);
430 return err; 396 return err;
431} 397}
432 398
@@ -439,105 +405,27 @@ static const struct address_space_operations def_mdt_aops = {
439static const struct inode_operations def_mdt_iops; 405static const struct inode_operations def_mdt_iops;
440static const struct file_operations def_mdt_fops; 406static const struct file_operations def_mdt_fops;
441 407
442/*
443 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
444 * ifile, or gcinodes. This allows the B-tree code and segment constructor
445 * to treat them like regular files, and this helps to simplify the
446 * implementation.
 447 * On the other hand, some of the pseudo inodes have an irregularity:
 448 * they don't have a valid inode->i_sb pointer because their lifetimes are
 449 * longer than those of the super block structs; they may persist across
 450 * several consecutive mounts/umounts. This needs further discussion.
451 */
452/**
453 * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
454 * @nilfs: nilfs object
455 * @sb: super block instance the metadata file belongs to
456 * @ino: inode number
457 * @gfp_mask: gfp mask for data pages
458 * @objsz: size of the private object attached to inode->i_private
459 */
460struct inode *
461nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
462 ino_t ino, gfp_t gfp_mask, size_t objsz)
463{
464 struct inode *inode = nilfs_alloc_inode_common(nilfs);
465 408
466 if (!inode) 409int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
467 return NULL; 410{
468 else { 411 struct nilfs_mdt_info *mi;
469 struct address_space * const mapping = &inode->i_data;
470 struct nilfs_mdt_info *mi;
471
472 mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
473 if (!mi) {
474 nilfs_destroy_inode(inode);
475 return NULL;
476 }
477 mi->mi_nilfs = nilfs;
478 init_rwsem(&mi->mi_sem);
479
480 inode->i_sb = sb; /* sb may be NULL for some meta data files */
481 inode->i_blkbits = nilfs->ns_blocksize_bits;
482 inode->i_flags = 0;
483 atomic_set(&inode->i_count, 1);
484 inode->i_nlink = 1;
485 inode->i_ino = ino;
486 inode->i_mode = S_IFREG;
487 inode->i_private = mi;
488
489#ifdef INIT_UNUSED_INODE_FIELDS
490 atomic_set(&inode->i_writecount, 0);
491 inode->i_size = 0;
492 inode->i_blocks = 0;
493 inode->i_bytes = 0;
494 inode->i_generation = 0;
495#ifdef CONFIG_QUOTA
496 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
497#endif
498 inode->i_pipe = NULL;
499 inode->i_bdev = NULL;
500 inode->i_cdev = NULL;
501 inode->i_rdev = 0;
502#ifdef CONFIG_SECURITY
503 inode->i_security = NULL;
504#endif
505 inode->dirtied_when = 0;
506
507 INIT_LIST_HEAD(&inode->i_list);
508 INIT_LIST_HEAD(&inode->i_sb_list);
509 inode->i_state = 0;
510#endif
511
512 spin_lock_init(&inode->i_lock);
513 mutex_init(&inode->i_mutex);
514 init_rwsem(&inode->i_alloc_sem);
515
516 mapping->host = NULL; /* instead of inode */
517 mapping->flags = 0;
518 mapping_set_gfp_mask(mapping, gfp_mask);
519 mapping->assoc_mapping = NULL;
520 mapping->backing_dev_info = nilfs->ns_bdi;
521
522 inode->i_mapping = mapping;
523 }
524 412
525 return inode; 413 mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
526} 414 if (!mi)
415 return -ENOMEM;
527 416
528struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, 417 init_rwsem(&mi->mi_sem);
529 ino_t ino, size_t objsz) 418 inode->i_private = mi;
530{
531 struct inode *inode;
532 419
533 inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz); 420 inode->i_mode = S_IFREG;
534 if (!inode) 421 mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
535 return NULL; 422 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
536 423
537 inode->i_op = &def_mdt_iops; 424 inode->i_op = &def_mdt_iops;
538 inode->i_fop = &def_mdt_fops; 425 inode->i_fop = &def_mdt_fops;
539 inode->i_mapping->a_ops = &def_mdt_aops; 426 inode->i_mapping->a_ops = &def_mdt_aops;
540 return inode; 427
428 return 0;
541} 429}
542 430
543void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, 431void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
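Note: nilfs_mdt_init() above allocates the private info with kzalloc(max(sizeof(*mi), objsz), GFP_NOFS), a small idiom that lets one helper serve both plain metadata files and callers that embed struct nilfs_mdt_info at the head of a larger object. An illustrative userspace sketch of the same idiom (the struct names are invented):

#include <stdio.h>
#include <stdlib.h>

struct base_info {                    /* plays struct nilfs_mdt_info */
	unsigned int entry_size;
};

struct derived_info {                 /* a larger per-file object */
	struct base_info base;        /* must sit at offset 0 */
	unsigned long extra[16];
};

#define demo_max(a, b) ((a) > (b) ? (a) : (b))

static struct base_info *init_info(size_t objsz)
{
	/* One allocation serves callers that need only the base struct
	 * (objsz == 0) and callers that embed it in a larger object. */
	return calloc(1, demo_max(sizeof(struct base_info), objsz));
}

int main(void)
{
	struct base_info *mi = init_info(sizeof(struct derived_info));
	struct derived_info *di = (struct derived_info *)mi;

	if (!mi)
		return 1;
	di->extra[0] = 42;
	printf("%lu\n", di->extra[0]);
	free(mi);
	return 0;
}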
@@ -550,34 +438,157 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
550 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 438 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
551} 439}
552 440
553void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) 441static const struct address_space_operations shadow_map_aops = {
442 .sync_page = block_sync_page,
443};
444
445/**
446 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
447 * @inode: inode of the metadata file
448 * @shadow: shadow mapping
449 */
450int nilfs_mdt_setup_shadow_map(struct inode *inode,
451 struct nilfs_shadow_map *shadow)
554{ 452{
555 shadow->i_mapping->assoc_mapping = orig->i_mapping; 453 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
556 NILFS_I(shadow)->i_btnode_cache.assoc_mapping = 454 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
557 &NILFS_I(orig)->i_btnode_cache; 455
456 INIT_LIST_HEAD(&shadow->frozen_buffers);
457 nilfs_mapping_init_once(&shadow->frozen_data);
458 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
459 nilfs_mapping_init_once(&shadow->frozen_btnodes);
460 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
461 mi->mi_shadow = shadow;
462 return 0;
558} 463}
559 464
560static void nilfs_mdt_clear(struct inode *inode) 465/**
466 * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
467 * @inode: inode of the metadata file
468 */
469int nilfs_mdt_save_to_shadow_map(struct inode *inode)
561{ 470{
471 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
562 struct nilfs_inode_info *ii = NILFS_I(inode); 472 struct nilfs_inode_info *ii = NILFS_I(inode);
473 struct nilfs_shadow_map *shadow = mi->mi_shadow;
474 int ret;
475
476 ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
477 if (ret)
478 goto out;
479
480 ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
481 &ii->i_btnode_cache);
482 if (ret)
483 goto out;
484
485 nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
486 out:
487 return ret;
488}
489
490int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
491{
492 struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
493 struct buffer_head *bh_frozen;
494 struct page *page;
495 int blkbits = inode->i_blkbits;
496
497 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
498 if (!page)
499 return -ENOMEM;
500
501 if (!page_has_buffers(page))
502 create_empty_buffers(page, 1 << blkbits, 0);
503
504 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
505
506 if (!buffer_uptodate(bh_frozen))
507 nilfs_copy_buffer(bh_frozen, bh);
508 if (list_empty(&bh_frozen->b_assoc_buffers)) {
509 list_add_tail(&bh_frozen->b_assoc_buffers,
510 &shadow->frozen_buffers);
511 set_buffer_nilfs_redirected(bh);
512 } else {
513 brelse(bh_frozen); /* already frozen */
514 }
515
516 unlock_page(page);
517 page_cache_release(page);
518 return 0;
519}
520
521struct buffer_head *
522nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
523{
524 struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
525 struct buffer_head *bh_frozen = NULL;
526 struct page *page;
527 int n;
563 528
564 invalidate_mapping_pages(inode->i_mapping, 0, -1); 529 page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
565 truncate_inode_pages(inode->i_mapping, 0); 530 if (page) {
531 if (page_has_buffers(page)) {
532 n = bh_offset(bh) >> inode->i_blkbits;
533 bh_frozen = nilfs_page_get_nth_block(page, n);
534 }
535 unlock_page(page);
536 page_cache_release(page);
537 }
538 return bh_frozen;
539}
540
541static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
542{
543 struct list_head *head = &shadow->frozen_buffers;
544 struct buffer_head *bh;
566 545
567 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 546 while (!list_empty(head)) {
568 nilfs_bmap_clear(ii->i_bmap); 547 bh = list_first_entry(head, struct buffer_head,
569 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 548 b_assoc_buffers);
549 list_del_init(&bh->b_assoc_buffers);
550 brelse(bh); /* drop ref-count to make it releasable */
551 }
570} 552}
571 553
572void nilfs_mdt_destroy(struct inode *inode) 554/**
555 * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
556 * @inode: inode of the metadata file
557 */
558void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
573{ 559{
574 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 560 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
561 struct nilfs_inode_info *ii = NILFS_I(inode);
562 struct nilfs_shadow_map *shadow = mi->mi_shadow;
575 563
576 if (mdi->mi_palloc_cache) 564 down_write(&mi->mi_sem);
577 nilfs_palloc_destroy_cache(inode); 565
578 nilfs_mdt_clear(inode); 566 if (mi->mi_palloc_cache)
567 nilfs_palloc_clear_cache(inode);
568
569 nilfs_clear_dirty_pages(inode->i_mapping);
570 nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
571
572 nilfs_clear_dirty_pages(&ii->i_btnode_cache);
573 nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
574
575 nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
576
577 up_write(&mi->mi_sem);
578}
579
580/**
581 * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
582 * @inode: inode of the metadata file
583 */
584void nilfs_mdt_clear_shadow_map(struct inode *inode)
585{
586 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
587 struct nilfs_shadow_map *shadow = mi->mi_shadow;
579 588
580 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ 589 down_write(&mi->mi_sem);
581 kfree(mdi); 590 nilfs_release_frozen_buffers(shadow);
582 nilfs_destroy_inode(inode); 591 truncate_inode_pages(&shadow->frozen_data, 0);
592 truncate_inode_pages(&shadow->frozen_btnodes, 0);
593 up_write(&mi->mi_sem);
583} 594}
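Note: the shadow-map functions introduced above implement a save/restore protocol for metadata caches: nilfs_mdt_save_to_shadow_map() copies dirty pages and the bmap state into the frozen_* caches before a checkpoint operation, and nilfs_mdt_restore_from_shadow_map() copies them back if the operation must be rolled back. Reduced to its essence, the pattern looks like the following userspace sketch (data layout and names are purely illustrative):

#include <stdio.h>
#include <string.h>

#define NBLOCKS 4
#define BLKSZ   8

static char live[NBLOCKS][BLKSZ];     /* plays the live page cache */
static char frozen[NBLOCKS][BLKSZ];   /* plays shadow->frozen_data */

static void save_to_shadow(void)      /* nilfs_mdt_save_to_shadow_map() */
{
	memcpy(frozen, live, sizeof(live));
}

static void restore_from_shadow(void) /* ..._restore_from_shadow_map() */
{
	memcpy(live, frozen, sizeof(frozen));
}

int main(void)
{
	strcpy(live[0], "dat-v1");
	save_to_shadow();                /* checkpoint the dirty state */

	strcpy(live[0], "dat-v2");       /* speculative update... */
	restore_from_shadow();           /* ...rolled back on failure */

	printf("%s\n", live[0]);         /* prints "dat-v1" */
	return 0;
}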
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 6c4bbb0470fc..b13734bf3521 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -28,26 +28,33 @@
28#include "nilfs.h" 28#include "nilfs.h"
29#include "page.h" 29#include "page.h"
30 30
31struct nilfs_shadow_map {
32 struct nilfs_bmap_store bmap_store;
33 struct address_space frozen_data;
34 struct address_space frozen_btnodes;
35 struct list_head frozen_buffers;
36};
37
31/** 38/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files 39 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations 40 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking 41 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry 42 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry 43 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block 44 * @mi_entries_per_block: number of entries in a block
39 * @mi_palloc_cache: persistent object allocator cache 45 * @mi_palloc_cache: persistent object allocator cache
46 * @mi_shadow: shadow of bmap and page caches
40 * @mi_blocks_per_group: number of blocks in a group 47 * @mi_blocks_per_group: number of blocks in a group
41 * @mi_blocks_per_desc_block: number of blocks per descriptor block 48 * @mi_blocks_per_desc_block: number of blocks per descriptor block
42 */ 49 */
43struct nilfs_mdt_info { 50struct nilfs_mdt_info {
44 struct the_nilfs *mi_nilfs;
45 struct rw_semaphore mi_sem; 51 struct rw_semaphore mi_sem;
46 struct blockgroup_lock *mi_bgl; 52 struct blockgroup_lock *mi_bgl;
47 unsigned mi_entry_size; 53 unsigned mi_entry_size;
48 unsigned mi_first_entry_offset; 54 unsigned mi_first_entry_offset;
49 unsigned long mi_entries_per_block; 55 unsigned long mi_entries_per_block;
50 struct nilfs_palloc_cache *mi_palloc_cache; 56 struct nilfs_palloc_cache *mi_palloc_cache;
57 struct nilfs_shadow_map *mi_shadow;
51 unsigned long mi_blocks_per_group; 58 unsigned long mi_blocks_per_group;
52 unsigned long mi_blocks_per_desc_block; 59 unsigned long mi_blocks_per_desc_block;
53}; 60};
@@ -59,9 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
59 66
60static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) 67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
61{ 68{
62 struct super_block *sb = inode->i_sb; 69 return NILFS_SB(inode->i_sb)->s_nilfs;
63
64 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
65} 70}
66 71
67/* Default GFP flags using highmem */ 72/* Default GFP flags using highmem */
@@ -76,14 +81,17 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
76int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); 81int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
77int nilfs_mdt_fetch_dirty(struct inode *); 82int nilfs_mdt_fetch_dirty(struct inode *);
78 83
79struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, 84int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
80 size_t);
81struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
82 ino_t, gfp_t, size_t);
83void nilfs_mdt_destroy(struct inode *);
84void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); 85void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
85void nilfs_mdt_set_shadow(struct inode *, struct inode *);
86 86
87int nilfs_mdt_setup_shadow_map(struct inode *inode,
88 struct nilfs_shadow_map *shadow);
89int nilfs_mdt_save_to_shadow_map(struct inode *inode);
90void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
91void nilfs_mdt_clear_shadow_map(struct inode *inode);
92int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
93struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
94 struct buffer_head *bh);
87 95
88#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) 96#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
89 97
@@ -100,7 +108,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
100 108
101static inline __u64 nilfs_mdt_cno(struct inode *inode) 109static inline __u64 nilfs_mdt_cno(struct inode *inode)
102{ 110{
103 return NILFS_MDT(inode)->mi_nilfs->ns_cno; 111 return NILFS_I_NILFS(inode)->ns_cno;
104} 112}
105 113
106#define nilfs_mdt_bgl_lock(inode, bg) \ 114#define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ad6ed2cf19b4..98034271cd02 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -40,7 +40,11 @@
40 40
41#include <linux/pagemap.h> 41#include <linux/pagemap.h>
42#include "nilfs.h" 42#include "nilfs.h"
43#include "export.h"
43 44
45#define NILFS_FID_SIZE_NON_CONNECTABLE \
46 (offsetof(struct nilfs_fid, parent_gen) / 4)
47#define NILFS_FID_SIZE_CONNECTABLE (sizeof(struct nilfs_fid) / 4)
44 48
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) 49static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{ 50{
@@ -70,29 +74,13 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
70 ino = nilfs_inode_by_name(dir, &dentry->d_name); 74 ino = nilfs_inode_by_name(dir, &dentry->d_name);
71 inode = NULL; 75 inode = NULL;
72 if (ino) { 76 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino); 77 inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
74 if (IS_ERR(inode)) 78 if (IS_ERR(inode))
75 return ERR_CAST(inode); 79 return ERR_CAST(inode);
76 } 80 }
77 return d_splice_alias(inode, dentry); 81 return d_splice_alias(inode, dentry);
78} 82}
79 83
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct qstr dotdot = {.name = "..", .len = 2};
85
86 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
87 if (!ino)
88 return ERR_PTR(-ENOENT);
89
90 inode = nilfs_iget(child->d_inode->i_sb, ino);
91 if (IS_ERR(inode))
92 return ERR_CAST(inode);
93 return d_obtain_alias(inode);
94}
95
96/* 84/*
97 * By the time this is called, we already have created 85 * By the time this is called, we already have created
98 * the directory cache entry for the new file, but it 86 * the directory cache entry for the new file, but it
@@ -219,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
219 207
220 inode->i_ctime = CURRENT_TIME; 208 inode->i_ctime = CURRENT_TIME;
221 inode_inc_link_count(inode); 209 inode_inc_link_count(inode);
222 atomic_inc(&inode->i_count); 210 ihold(inode);
223 211
224 err = nilfs_add_nondir(dentry, inode); 212 err = nilfs_add_nondir(dentry, inode);
225 if (!err) 213 if (!err)
@@ -468,6 +456,115 @@ out:
468 return err; 456 return err;
469} 457}
470 458
459/*
460 * Export operations
461 */
462static struct dentry *nilfs_get_parent(struct dentry *child)
463{
464 unsigned long ino;
465 struct inode *inode;
466 struct qstr dotdot = {.name = "..", .len = 2};
467 struct nilfs_root *root;
468
469 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
470 if (!ino)
471 return ERR_PTR(-ENOENT);
472
473 root = NILFS_I(child->d_inode)->i_root;
474
475 inode = nilfs_iget(child->d_inode->i_sb, root, ino);
476 if (IS_ERR(inode))
477 return ERR_CAST(inode);
478
479 return d_obtain_alias(inode);
480}
481
482static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
483 u64 ino, u32 gen)
484{
485 struct nilfs_root *root;
486 struct inode *inode;
487
488 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
489 return ERR_PTR(-ESTALE);
490
491 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
492 if (!root)
493 return ERR_PTR(-ESTALE);
494
495 inode = nilfs_iget(sb, root, ino);
496 nilfs_put_root(root);
497
498 if (IS_ERR(inode))
499 return ERR_CAST(inode);
500 if (gen && inode->i_generation != gen) {
501 iput(inode);
502 return ERR_PTR(-ESTALE);
503 }
504 return d_obtain_alias(inode);
505}
506
507static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
508 int fh_len, int fh_type)
509{
510 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
511
512 if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
513 fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
514 (fh_type != FILEID_NILFS_WITH_PARENT &&
515 fh_type != FILEID_NILFS_WITHOUT_PARENT))
516 return NULL;
517
518 return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen);
519}
520
521static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
522 int fh_len, int fh_type)
523{
524 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
525
526 if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
527 fh_type != FILEID_NILFS_WITH_PARENT)
528 return NULL;
529
530 return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
531}
532
533static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
534 int connectable)
535{
536 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
537 struct inode *inode = dentry->d_inode;
538 struct nilfs_root *root = NILFS_I(inode)->i_root;
539 int type;
540
541 if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
542 (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
543 return 255;
544
545 fid->cno = root->cno;
546 fid->ino = inode->i_ino;
547 fid->gen = inode->i_generation;
548
549 if (connectable && !S_ISDIR(inode->i_mode)) {
550 struct inode *parent;
551
552 spin_lock(&dentry->d_lock);
553 parent = dentry->d_parent->d_inode;
554 fid->parent_ino = parent->i_ino;
555 fid->parent_gen = parent->i_generation;
556 spin_unlock(&dentry->d_lock);
557
558 type = FILEID_NILFS_WITH_PARENT;
559 *lenp = NILFS_FID_SIZE_CONNECTABLE;
560 } else {
561 type = FILEID_NILFS_WITHOUT_PARENT;
562 *lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
563 }
564
565 return type;
566}
567
471const struct inode_operations nilfs_dir_inode_operations = { 568const struct inode_operations nilfs_dir_inode_operations = {
472 .create = nilfs_create, 569 .create = nilfs_create,
473 .lookup = nilfs_lookup, 570 .lookup = nilfs_lookup,
@@ -480,6 +577,7 @@ const struct inode_operations nilfs_dir_inode_operations = {
480 .rename = nilfs_rename, 577 .rename = nilfs_rename,
481 .setattr = nilfs_setattr, 578 .setattr = nilfs_setattr,
482 .permission = nilfs_permission, 579 .permission = nilfs_permission,
580 .fiemap = nilfs_fiemap,
483}; 581};
484 582
485const struct inode_operations nilfs_special_inode_operations = { 583const struct inode_operations nilfs_special_inode_operations = {
@@ -491,4 +589,12 @@ const struct inode_operations nilfs_symlink_inode_operations = {
491 .readlink = generic_readlink, 589 .readlink = generic_readlink,
492 .follow_link = page_follow_link_light, 590 .follow_link = page_follow_link_light,
493 .put_link = page_put_link, 591 .put_link = page_put_link,
592 .permission = nilfs_permission,
593};
594
595const struct export_operations nilfs_export_ops = {
596 .encode_fh = nilfs_encode_fh,
597 .fh_to_dentry = nilfs_fh_to_dentry,
598 .fh_to_parent = nilfs_fh_to_parent,
599 .get_parent = nilfs_get_parent,
494}; 600};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index d3d54046e5f8..777e8fd04304 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -59,6 +59,7 @@ struct nilfs_inode_info {
59#endif 59#endif
60 struct buffer_head *i_bh; /* i_bh contains a new or dirty 60 struct buffer_head *i_bh; /* i_bh contains a new or dirty
61 disk inode */ 61 disk inode */
62 struct nilfs_root *i_root;
62 struct inode vfs_inode; 63 struct inode vfs_inode;
63}; 64};
64 65
@@ -100,7 +101,6 @@ enum {
100 NILFS_I_INODE_DIRTY, /* write_inode is requested */ 101 NILFS_I_INODE_DIRTY, /* write_inode is requested */
101 NILFS_I_BMAP, /* has bmap and btnode_cache */ 102 NILFS_I_BMAP, /* has bmap and btnode_cache */
102 NILFS_I_GCINODE, /* inode for GC, on memory only */ 103 NILFS_I_GCINODE, /* inode for GC, on memory only */
103 NILFS_I_GCDAT, /* shadow DAT, on memory only */
104}; 104};
105 105
106/* 106/*
@@ -190,22 +190,14 @@ static inline int nilfs_doing_construction(void)
190 return nilfs_test_transaction_flag(NILFS_TI_WRITER); 190 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
191} 191}
192 192
193static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
194{
195 return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
196}
197
198/* 193/*
199 * function prototype 194 * function prototype
200 */ 195 */
201#ifdef CONFIG_NILFS_POSIX_ACL 196#ifdef CONFIG_NILFS_POSIX_ACL
202#error "NILFS: not yet supported POSIX ACL" 197#error "NILFS: not yet supported POSIX ACL"
203extern int nilfs_permission(struct inode *, int, struct nameidata *);
204extern int nilfs_acl_chmod(struct inode *); 198extern int nilfs_acl_chmod(struct inode *);
205extern int nilfs_init_acl(struct inode *, struct inode *); 199extern int nilfs_init_acl(struct inode *, struct inode *);
206#else 200#else
207#define nilfs_permission NULL
208
209static inline int nilfs_acl_chmod(struct inode *inode) 201static inline int nilfs_acl_chmod(struct inode *inode)
210{ 202{
211 return 0; 203 return 0;
@@ -247,24 +239,28 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
247extern void nilfs_set_inode_flags(struct inode *); 239extern void nilfs_set_inode_flags(struct inode *);
248extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); 240extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
249extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); 241extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
250extern struct inode *nilfs_iget(struct super_block *, unsigned long); 242struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
243 unsigned long ino);
244struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
245 unsigned long ino);
246struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
247 unsigned long ino);
248extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
249 unsigned long ino, __u64 cno);
251extern void nilfs_update_inode(struct inode *, struct buffer_head *); 250extern void nilfs_update_inode(struct inode *, struct buffer_head *);
252extern void nilfs_truncate(struct inode *); 251extern void nilfs_truncate(struct inode *);
253extern void nilfs_evict_inode(struct inode *); 252extern void nilfs_evict_inode(struct inode *);
254extern int nilfs_setattr(struct dentry *, struct iattr *); 253extern int nilfs_setattr(struct dentry *, struct iattr *);
255extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, 254int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
256 struct buffer_head **); 255int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
257extern int nilfs_inode_dirty(struct inode *); 256extern int nilfs_inode_dirty(struct inode *);
258extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, 257int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
259 unsigned);
260extern int nilfs_mark_inode_dirty(struct inode *); 258extern int nilfs_mark_inode_dirty(struct inode *);
261extern void nilfs_dirty_inode(struct inode *); 259extern void nilfs_dirty_inode(struct inode *);
262 260int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
263/* namei.c */ 261 __u64 start, __u64 len);
264extern struct dentry *nilfs_get_parent(struct dentry *);
265 262
266/* super.c */ 263/* super.c */
267extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
268extern struct inode *nilfs_alloc_inode(struct super_block *); 264extern struct inode *nilfs_alloc_inode(struct super_block *);
269extern void nilfs_destroy_inode(struct inode *); 265extern void nilfs_destroy_inode(struct inode *);
270extern void nilfs_error(struct super_block *, const char *, const char *, ...) 266extern void nilfs_error(struct super_block *, const char *, const char *, ...)
@@ -283,8 +279,9 @@ extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
283 int flip); 279 int flip);
284extern int nilfs_commit_super(struct nilfs_sb_info *, int); 280extern int nilfs_commit_super(struct nilfs_sb_info *, int);
285extern int nilfs_cleanup_super(struct nilfs_sb_info *); 281extern int nilfs_cleanup_super(struct nilfs_sb_info *);
286extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); 282int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
287extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); 283 struct nilfs_root **root);
284int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
288 285
289/* gcinode.c */ 286/* gcinode.c */
290int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, 287int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
@@ -292,16 +289,8 @@ int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
292int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, 289int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
293 struct buffer_head **); 290 struct buffer_head **);
294int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); 291int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
295int nilfs_init_gccache(struct the_nilfs *); 292int nilfs_init_gcinode(struct inode *inode);
296void nilfs_destroy_gccache(struct the_nilfs *); 293void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
297void nilfs_clear_gcinode(struct inode *);
298struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
299void nilfs_remove_all_gcinode(struct the_nilfs *);
300
301/* gcdat.c */
302int nilfs_init_gcdat_inode(struct the_nilfs *);
303void nilfs_commit_gcdat_inode(struct the_nilfs *);
304void nilfs_clear_gcdat_inode(struct the_nilfs *);
305 294
306/* 295/*
307 * Inodes and files operations 296 * Inodes and files operations
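The nilfs.h hunks above switch inode lookup to a per-checkpoint model: nilfs_iget(), nilfs_iget_locked() and nilfs_ilookup() now take a struct nilfs_root, while nilfs_load_inode_block() and nilfs_set_file_dirty() lose their nilfs_sb_info argument. A minimal caller-side sketch of the new signature (the wrapper function itself is hypothetical):

    /* Hypothetical caller; only the prototype change is illustrated. */
    static struct inode *example_lookup(struct super_block *sb,
                                        struct nilfs_root *root,
                                        unsigned long ino)
    {
            /* old: nilfs_iget(sb, ino) resolved against the single ifile;
             * new: the checkpoint tree (root) selects which ifile to use. */
            return nilfs_iget(sb, root, ino);
    }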
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index aab11db2cb08..0c432416cfef 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -79,8 +79,8 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
79{ 79{
80 int blkbits = inode->i_blkbits; 80 int blkbits = inode->i_blkbits;
81 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); 81 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
82 struct page *page, *opage; 82 struct page *page;
83 struct buffer_head *bh, *obh; 83 struct buffer_head *bh;
84 84
85 page = grab_cache_page(mapping, index); 85 page = grab_cache_page(mapping, index);
86 if (unlikely(!page)) 86 if (unlikely(!page))
@@ -92,30 +92,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
92 page_cache_release(page); 92 page_cache_release(page);
93 return NULL; 93 return NULL;
94 } 94 }
95 if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
96 /*
97 * Shadow page cache uses assoc_mapping to point its original
98 * page cache. The following code tries the original cache
99 * if the given cache is a shadow and it didn't hit.
100 */
101 opage = find_lock_page(mapping->assoc_mapping, index);
102 if (!opage)
103 return bh;
104
105 obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
106 b_state);
107 if (buffer_uptodate(obh)) {
108 nilfs_copy_buffer(bh, obh);
109 if (buffer_dirty(obh)) {
110 nilfs_mark_buffer_dirty(bh);
111 if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
112 nilfs_mdt_mark_dirty(inode);
113 }
114 }
115 brelse(obh);
116 unlock_page(opage);
117 page_cache_release(opage);
118 }
119 return bh; 95 return bh;
120} 96}
121 97
@@ -131,6 +107,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
131 lock_buffer(bh); 107 lock_buffer(bh);
132 clear_buffer_nilfs_volatile(bh); 108 clear_buffer_nilfs_volatile(bh);
133 clear_buffer_nilfs_checked(bh); 109 clear_buffer_nilfs_checked(bh);
110 clear_buffer_nilfs_redirected(bh);
134 clear_buffer_dirty(bh); 111 clear_buffer_dirty(bh);
135 if (nilfs_page_buffers_clean(page)) 112 if (nilfs_page_buffers_clean(page))
136 __nilfs_clear_page_dirty(page); 113 __nilfs_clear_page_dirty(page);
@@ -483,6 +460,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
483 clear_buffer_dirty(bh); 460 clear_buffer_dirty(bh);
484 clear_buffer_nilfs_volatile(bh); 461 clear_buffer_nilfs_volatile(bh);
485 clear_buffer_nilfs_checked(bh); 462 clear_buffer_nilfs_checked(bh);
463 clear_buffer_nilfs_redirected(bh);
486 clear_buffer_uptodate(bh); 464 clear_buffer_uptodate(bh);
487 clear_buffer_mapped(bh); 465 clear_buffer_mapped(bh);
488 unlock_buffer(bh); 466 unlock_buffer(bh);
@@ -514,6 +492,31 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
514 return nc; 492 return nc;
515} 493}
516 494
495void nilfs_mapping_init_once(struct address_space *mapping)
496{
497 memset(mapping, 0, sizeof(*mapping));
498 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
499 spin_lock_init(&mapping->tree_lock);
500 INIT_LIST_HEAD(&mapping->private_list);
501 spin_lock_init(&mapping->private_lock);
502
503 spin_lock_init(&mapping->i_mmap_lock);
504 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
505 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
506}
507
508void nilfs_mapping_init(struct address_space *mapping,
509 struct backing_dev_info *bdi,
510 const struct address_space_operations *aops)
511{
512 mapping->host = NULL;
513 mapping->flags = 0;
514 mapping_set_gfp_mask(mapping, GFP_NOFS);
515 mapping->assoc_mapping = NULL;
516 mapping->backing_dev_info = bdi;
517 mapping->a_ops = aops;
518}
519
517/* 520/*
518 * NILFS2 needs clear_page_dirty() in the following two cases: 521 * NILFS2 needs clear_page_dirty() in the following two cases:
519 * 522 *
@@ -543,3 +546,87 @@ int __nilfs_clear_page_dirty(struct page *page)
543 } 546 }
544 return TestClearPageDirty(page); 547 return TestClearPageDirty(page);
545} 548}
549
550/**
551 * nilfs_find_uncommitted_extent - find extent of uncommitted data
552 * @inode: inode
553 * @start_blk: start block offset (in)
554 * @blkoff: start offset of the found extent (out)
555 *
556 * This function searches for an extent of buffers marked
557 * "delayed" that starts at a block offset equal to or larger
558 * than @start_blk. If such an extent is found, the start offset
559 * is stored in @blkoff and its length in blocks is returned.
560 * Otherwise, zero is returned.
561 */
562unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
563 sector_t start_blk,
564 sector_t *blkoff)
565{
566 unsigned int i;
567 pgoff_t index;
568 unsigned int nblocks_in_page;
569 unsigned long length = 0;
570 sector_t b;
571 struct pagevec pvec;
572 struct page *page;
573
574 if (inode->i_mapping->nrpages == 0)
575 return 0;
576
577 index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
578 nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
579
580 pagevec_init(&pvec, 0);
581
582repeat:
583 pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
584 pvec.pages);
585 if (pvec.nr == 0)
586 return length;
587
588 if (length > 0 && pvec.pages[0]->index > index)
589 goto out;
590
591 b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
592 i = 0;
593 do {
594 page = pvec.pages[i];
595
596 lock_page(page);
597 if (page_has_buffers(page)) {
598 struct buffer_head *bh, *head;
599
600 bh = head = page_buffers(page);
601 do {
602 if (b < start_blk)
603 continue;
604 if (buffer_delay(bh)) {
605 if (length == 0)
606 *blkoff = b;
607 length++;
608 } else if (length > 0) {
609 goto out_locked;
610 }
611 } while (++b, bh = bh->b_this_page, bh != head);
612 } else {
613 if (length > 0)
614 goto out_locked;
615
616 b += nblocks_in_page;
617 }
618 unlock_page(page);
619
620 } while (++i < pagevec_count(&pvec));
621
622 index = page->index + 1;
623 pagevec_release(&pvec);
624 cond_resched();
625 goto repeat;
626
627out_locked:
628 unlock_page(page);
629out:
630 pagevec_release(&pvec);
631 return length;
632}
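nilfs_find_uncommitted_extent() above walks the page cache with find_get_pages_contig() and counts consecutive buffers carrying the delayed flag. As a sketch, a fiemap-style caller might loop over the delayed extents like this (report_extent() is a hypothetical stand-in; the real nilfs_fiemap() is only declared in this series):

    /* Hypothetical loop over uncommitted (delayed-allocated) extents. */
    sector_t blkoff;
    sector_t blk = start_blk;
    unsigned long n;

    while ((n = nilfs_find_uncommitted_extent(inode, blk, &blkoff)) != 0) {
            /* [blkoff, blkoff + n) is dirty in memory, not yet on disk */
            report_extent(blkoff, n);  /* e.g. feed fiemap_fill_next_extent() */
            blk = blkoff + n;          /* resume the scan past this extent */
    }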
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index f53d8da41ed7..622df27cd891 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -35,12 +35,14 @@ enum {
35 BH_NILFS_Node, 35 BH_NILFS_Node,
36 BH_NILFS_Volatile, 36 BH_NILFS_Volatile,
37 BH_NILFS_Checked, 37 BH_NILFS_Checked,
38 BH_NILFS_Redirected,
38}; 39};
39 40
40BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ 41BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
41BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ 42BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
42BUFFER_FNS(NILFS_Volatile, nilfs_volatile) 43BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
43BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ 44BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */
45BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */
44 46
45 47
46void nilfs_mark_buffer_dirty(struct buffer_head *bh); 48void nilfs_mark_buffer_dirty(struct buffer_head *bh);
@@ -59,7 +61,14 @@ void nilfs_free_private_page(struct page *);
59int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); 61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
60void nilfs_copy_back_pages(struct address_space *, struct address_space *); 62void nilfs_copy_back_pages(struct address_space *, struct address_space *);
61void nilfs_clear_dirty_pages(struct address_space *); 63void nilfs_clear_dirty_pages(struct address_space *);
64void nilfs_mapping_init_once(struct address_space *mapping);
65void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi,
67 const struct address_space_operations *aops);
62unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk,
71 sector_t *blkoff);
63 72
64#define NILFS_PAGE_BUG(page, m, a...) \ 73#define NILFS_PAGE_BUG(page, m, a...) \
65 do { nilfs_page_bug(page); BUG(); } while (0) 74 do { nilfs_page_bug(page); BUG(); } while (0)
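The new BH_NILFS_Redirected bit is wired up through the kernel's BUFFER_FNS() macro from &lt;linux/buffer_head.h&gt;, which expands to set/clear/test helpers over bh-&gt;b_state. So the one-line addition above yields, schematically:

    /* Helpers generated by BUFFER_FNS(NILFS_Redirected, nilfs_redirected): */
    set_buffer_nilfs_redirected(bh);    /* set_bit(BH_NILFS_Redirected, &bh->b_state) */
    clear_buffer_nilfs_redirected(bh);  /* clear_bit(...), used in page.c/segment.c above */
    if (buffer_nilfs_redirected(bh))    /* test_bit(...) */
            ;                           /* buffer was redirected to a copy for writeback */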
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d0c35ef39f6a..3dfcd3b7d389 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -440,7 +440,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
440 segnum[2] = ri->ri_segnum; 440 segnum[2] = ri->ri_segnum;
441 segnum[3] = ri->ri_nextnum; 441 segnum[3] = ri->ri_nextnum;
442 442
443 nilfs_attach_writer(nilfs, sbi);
444 /* 443 /*
445 * Releasing the next segment of the latest super root. 444 * Releasing the next segment of the latest super root.
446 * The next segment is invalidated by this recovery. 445 * The next segment is invalidated by this recovery.
@@ -480,7 +479,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
480 479
481 failed: 480 failed:
482 /* No need to recover sufile because it will be destroyed on error */ 481 /* No need to recover sufile because it will be destroyed on error */
483 nilfs_detach_writer(nilfs, sbi);
484 return err; 482 return err;
485} 483}
486 484
@@ -504,6 +502,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
504 502
505static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, 503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
506 struct nilfs_sb_info *sbi, 504 struct nilfs_sb_info *sbi,
505 struct nilfs_root *root,
507 struct list_head *head, 506 struct list_head *head,
508 unsigned long *nr_salvaged_blocks) 507 unsigned long *nr_salvaged_blocks)
509{ 508{
@@ -515,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
515 int err = 0, err2 = 0; 514 int err = 0, err2 = 0;
516 515
517 list_for_each_entry_safe(rb, n, head, list) { 516 list_for_each_entry_safe(rb, n, head, list) {
518 inode = nilfs_iget(sbi->s_super, rb->ino); 517 inode = nilfs_iget(sbi->s_super, root, rb->ino);
519 if (IS_ERR(inode)) { 518 if (IS_ERR(inode)) {
520 err = PTR_ERR(inode); 519 err = PTR_ERR(inode);
521 inode = NULL; 520 inode = NULL;
@@ -536,7 +535,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
536 if (unlikely(err)) 535 if (unlikely(err))
537 goto failed_page; 536 goto failed_page;
538 537
539 err = nilfs_set_file_dirty(sbi, inode, 1); 538 err = nilfs_set_file_dirty(inode, 1);
540 if (unlikely(err)) 539 if (unlikely(err))
541 goto failed_page; 540 goto failed_page;
542 541
@@ -578,6 +577,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
578 */ 577 */
579static int nilfs_do_roll_forward(struct the_nilfs *nilfs, 578static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
580 struct nilfs_sb_info *sbi, 579 struct nilfs_sb_info *sbi,
580 struct nilfs_root *root,
581 struct nilfs_recovery_info *ri) 581 struct nilfs_recovery_info *ri)
582{ 582{
583 struct buffer_head *bh_sum = NULL; 583 struct buffer_head *bh_sum = NULL;
@@ -597,7 +597,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
597 }; 597 };
598 int state = RF_INIT_ST; 598 int state = RF_INIT_ST;
599 599
600 nilfs_attach_writer(nilfs, sbi);
601 pseg_start = ri->ri_lsegs_start; 600 pseg_start = ri->ri_lsegs_start;
602 seg_seq = ri->ri_lsegs_start_seq; 601 seg_seq = ri->ri_lsegs_start_seq;
603 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); 602 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
@@ -649,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
649 goto failed; 648 goto failed;
650 if (flags & NILFS_SS_LOGEND) { 649 if (flags & NILFS_SS_LOGEND) {
651 err = nilfs_recover_dsync_blocks( 650 err = nilfs_recover_dsync_blocks(
652 nilfs, sbi, &dsync_blocks, 651 nilfs, sbi, root, &dsync_blocks,
653 &nsalvaged_blocks); 652 &nsalvaged_blocks);
654 if (unlikely(err)) 653 if (unlikely(err))
655 goto failed; 654 goto failed;
@@ -688,7 +687,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
688 out: 687 out:
689 brelse(bh_sum); 688 brelse(bh_sum);
690 dispose_recovery_list(&dsync_blocks); 689 dispose_recovery_list(&dsync_blocks);
691 nilfs_detach_writer(nilfs, sbi);
692 return err; 690 return err;
693 691
694 confused: 692 confused:
@@ -746,19 +744,20 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
746 struct nilfs_sb_info *sbi, 744 struct nilfs_sb_info *sbi,
747 struct nilfs_recovery_info *ri) 745 struct nilfs_recovery_info *ri)
748{ 746{
747 struct nilfs_root *root;
749 int err; 748 int err;
750 749
751 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) 750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
752 return 0; 751 return 0;
753 752
754 err = nilfs_attach_checkpoint(sbi, ri->ri_cno); 753 err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
755 if (unlikely(err)) { 754 if (unlikely(err)) {
756 printk(KERN_ERR 755 printk(KERN_ERR
757 "NILFS: error loading the latest checkpoint.\n"); 756 "NILFS: error loading the latest checkpoint.\n");
758 return err; 757 return err;
759 } 758 }
760 759
761 err = nilfs_do_roll_forward(nilfs, sbi, ri); 760 err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
762 if (unlikely(err)) 761 if (unlikely(err))
763 goto failed; 762 goto failed;
764 763
@@ -770,7 +769,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
770 goto failed; 769 goto failed;
771 } 770 }
772 771
773 err = nilfs_attach_segment_constructor(sbi); 772 err = nilfs_attach_segment_constructor(sbi, root);
774 if (unlikely(err)) 773 if (unlikely(err))
775 goto failed; 774 goto failed;
776 775
@@ -788,7 +787,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
788 } 787 }
789 788
790 failed: 789 failed:
791 nilfs_detach_checkpoint(sbi); 790 nilfs_put_root(root);
792 return err; 791 return err;
793} 792}
794 793
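Recovery now pins the checkpoint tree explicitly: nilfs_attach_checkpoint() hands back a refcounted nilfs_root, and the old nilfs_detach_checkpoint()/nilfs_attach_writer() pairs disappear. The control flow of nilfs_salvage_orphan_logs() above reduces to roughly the following (error handling condensed into comments):

    /* Schematic of the new reference flow in orphan-log salvage. */
    struct nilfs_root *root;
    int err;

    err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root); /* takes a ref */
    if (!err) {
            err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
            /* ... segment preparation and constructor attach as above ... */
            nilfs_put_root(root);                                /* drops the ref */
    }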
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 0776ccc2504a..7a17715f215f 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -27,14 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs; 30struct the_nilfs;
39struct nilfs_sc_info; 31struct nilfs_sc_info;
40 32
@@ -42,11 +34,6 @@ struct nilfs_sc_info;
42 * NILFS super-block data in memory 34 * NILFS super-block data in memory
43 */ 35 */
44struct nilfs_sb_info { 36struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */ 37 /* Mount options */
51 unsigned long s_mount_opt; 38 unsigned long s_mount_opt;
52 uid_t s_resuid; 39 uid_t s_resuid;
@@ -59,8 +46,6 @@ struct nilfs_sb_info {
59 /* Fundamental members */ 46 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */ 47 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs; 48 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63 atomic_t s_count; /* reference count */
64 49
65 /* Segment constructor */ 50 /* Segment constructor */
66 struct list_head s_dirty_files; /* dirty files list */ 51 struct list_head s_dirty_files; /* dirty files list */
@@ -68,9 +53,6 @@ struct nilfs_sb_info {
68 spinlock_t s_inode_lock; /* Lock for the nilfs inode. 53 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
69 It covers s_dirty_files list */ 54 It covers s_dirty_files list */
70 55
71 /* Metadata files */
72 struct inode *s_ifile; /* index file inode */
73
74 /* Inode allocator */ 56 /* Inode allocator */
75 spinlock_t s_next_gen_lock; 57 spinlock_t s_next_gen_lock;
76 u32 s_next_generation; 58 u32 s_next_generation;
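The fields deleted from nilfs_sb_info above (s_snapshot_cno, s_inodes_count, s_blocks_count, s_ifile, and the ns_supers linkage) all migrate to the per-checkpoint nilfs_root object used throughout this series. Its definition is not part of this diff; judging only from the accessors visible here, it looks at least like the following reconstruction (an assumption; see the_nilfs.h for the real layout):

    /* Reconstructed shape of struct nilfs_root (assumed fields only). */
    struct nilfs_root {
            __u64 cno;                /* checkpoint number of this tree */
            struct inode *ifile;      /* per-checkpoint index file */
            atomic_t count;           /* nilfs_get_root()/nilfs_put_root() refs */
            atomic_t inodes_count;    /* was sbi->s_inodes_count */
            atomic_t blocks_count;    /* was sbi->s_blocks_count */
    };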
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 4588fb9e93df..0f83e93935b2 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -371,7 +371,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
371 struct bio *bio = wi->bio; 371 struct bio *bio = wi->bio;
372 int err; 372 int err;
373 373
374 if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) { 374 if (segbuf->sb_nbio > 0 &&
375 bdi_write_congested(segbuf->sb_super->s_bdi)) {
375 wait_for_completion(&segbuf->sb_bio_event); 376 wait_for_completion(&segbuf->sb_bio_event);
376 segbuf->sb_nbio--; 377 segbuf->sb_nbio--;
377 if (unlikely(atomic_read(&segbuf->sb_err))) { 378 if (unlikely(atomic_read(&segbuf->sb_err))) {
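The congestion check above now goes through the superblock's BDI (segbuf-&gt;sb_super-&gt;s_bdi) instead of a the_nilfs back-pointer; the throttling idea is unchanged: if the device is congested and at least one bio is already in flight, reap one completion before submitting the next. Schematically (fields as in the hunk; a sketch, not the full submit path):

    /* Schematic write throttle around bio submission. */
    if (segbuf->sb_nbio > 0 &&
        bdi_write_congested(segbuf->sb_super->s_bdi)) {
            wait_for_completion(&segbuf->sb_bio_event); /* one bio finished */
            segbuf->sb_nbio--;
    }
    submit_bio(mode, bio);   /* mode: WRITE or WRITE_SYNC */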
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9fd051a33c4f..55ebae5c7f39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -191,6 +191,8 @@ int nilfs_transaction_begin(struct super_block *sb,
191 if (ret > 0) 191 if (ret > 0)
192 return 0; 192 return 0;
193 193
194 vfs_check_frozen(sb, SB_FREEZE_WRITE);
195
194 sbi = NILFS_SB(sb); 196 sbi = NILFS_SB(sb);
195 nilfs = sbi->s_nilfs; 197 nilfs = sbi->s_nilfs;
196 down_read(&nilfs->ns_segctor_sem); 198 down_read(&nilfs->ns_segctor_sem);
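The vfs_check_frozen() call inserted above makes writers cooperate with filesystem freezing (the nilfs_freeze() handler added in super.c later in this series): a task entering a new transaction sleeps while the superblock is frozen. Assuming the 2.6.37-era freeze API:

    /* Freeze gate: sleeps until sb->s_frozen drops below SB_FREEZE_WRITE. */
    vfs_check_frozen(sb, SB_FREEZE_WRITE);
    /* only past this point may a new segment-construction transaction begin */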
@@ -366,8 +368,7 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
366 368
367 if (nilfs_doing_gc()) 369 if (nilfs_doing_gc())
368 flags = NILFS_SS_GC; 370 flags = NILFS_SS_GC;
369 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, 371 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno);
370 sci->sc_sbi->s_nilfs->ns_cno);
371 if (unlikely(err)) 372 if (unlikely(err))
372 return err; 373 return err;
373 374
@@ -440,17 +441,26 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
440 struct nilfs_finfo *finfo; 441 struct nilfs_finfo *finfo;
441 struct nilfs_inode_info *ii; 442 struct nilfs_inode_info *ii;
442 struct nilfs_segment_buffer *segbuf; 443 struct nilfs_segment_buffer *segbuf;
444 __u64 cno;
443 445
444 if (sci->sc_blk_cnt == 0) 446 if (sci->sc_blk_cnt == 0)
445 return; 447 return;
446 448
447 ii = NILFS_I(inode); 449 ii = NILFS_I(inode);
450
451 if (test_bit(NILFS_I_GCINODE, &ii->i_state))
452 cno = ii->i_cno;
453 else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
454 cno = 0;
455 else
456 cno = sci->sc_cno;
457
448 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, 458 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
449 sizeof(*finfo)); 459 sizeof(*finfo));
450 finfo->fi_ino = cpu_to_le64(inode->i_ino); 460 finfo->fi_ino = cpu_to_le64(inode->i_ino);
451 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); 461 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
452 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); 462 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
453 finfo->fi_cno = cpu_to_le64(ii->i_cno); 463 finfo->fi_cno = cpu_to_le64(cno);
454 464
455 segbuf = sci->sc_curseg; 465 segbuf = sci->sc_curseg;
456 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + 466 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
@@ -494,17 +504,6 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
494 return err; 504 return err;
495} 505}
496 506
497static int nilfs_handle_bmap_error(int err, const char *fname,
498 struct inode *inode, struct super_block *sb)
499{
500 if (err == -EINVAL) {
501 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
502 inode->i_ino);
503 err = -EIO;
504 }
505 return err;
506}
507
508/* 507/*
509 * Callback functions that enumerate, mark, and collect dirty blocks 508 * Callback functions that enumerate, mark, and collect dirty blocks
510 */ 509 */
@@ -514,9 +513,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
514 int err; 513 int err;
515 514
516 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 515 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
517 if (unlikely(err < 0)) 516 if (err < 0)
518 return nilfs_handle_bmap_error(err, __func__, inode, 517 return err;
519 sci->sc_super);
520 518
521 err = nilfs_segctor_add_file_block(sci, bh, inode, 519 err = nilfs_segctor_add_file_block(sci, bh, inode,
522 sizeof(struct nilfs_binfo_v)); 520 sizeof(struct nilfs_binfo_v));
@@ -529,13 +527,7 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
529 struct buffer_head *bh, 527 struct buffer_head *bh,
530 struct inode *inode) 528 struct inode *inode)
531{ 529{
532 int err; 530 return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
533
534 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
535 if (unlikely(err < 0))
536 return nilfs_handle_bmap_error(err, __func__, inode,
537 sci->sc_super);
538 return 0;
539} 531}
540 532
541static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, 533static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
@@ -578,9 +570,8 @@ static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
578 int err; 570 int err;
579 571
580 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); 572 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
581 if (unlikely(err < 0)) 573 if (err < 0)
582 return nilfs_handle_bmap_error(err, __func__, inode, 574 return err;
583 sci->sc_super);
584 575
585 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); 576 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
586 if (!err) 577 if (!err)
@@ -755,20 +746,19 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
755 } 746 }
756} 747}
757 748
758static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi) 749static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
750 struct nilfs_root *root)
759{ 751{
760 struct the_nilfs *nilfs = sbi->s_nilfs;
761 int ret = 0; 752 int ret = 0;
762 753
763 if (nilfs_mdt_fetch_dirty(sbi->s_ifile)) 754 if (nilfs_mdt_fetch_dirty(root->ifile))
764 ret++; 755 ret++;
765 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) 756 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
766 ret++; 757 ret++;
767 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) 758 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
768 ret++; 759 ret++;
769 if (ret || nilfs_doing_gc()) 760 if ((ret || nilfs_doing_gc()) && nilfs_mdt_fetch_dirty(nilfs->ns_dat))
770 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) 761 ret++;
771 ret++;
772 return ret; 762 return ret;
773} 763}
774 764
@@ -785,7 +775,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
785 struct nilfs_sb_info *sbi = sci->sc_sbi; 775 struct nilfs_sb_info *sbi = sci->sc_sbi;
786 int ret = 0; 776 int ret = 0;
787 777
788 if (nilfs_test_metadata_dirty(sbi)) 778 if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root))
789 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 779 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
790 780
791 spin_lock(&sbi->s_inode_lock); 781 spin_lock(&sbi->s_inode_lock);
@@ -801,10 +791,10 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
801 struct nilfs_sb_info *sbi = sci->sc_sbi; 791 struct nilfs_sb_info *sbi = sci->sc_sbi;
802 struct the_nilfs *nilfs = sbi->s_nilfs; 792 struct the_nilfs *nilfs = sbi->s_nilfs;
803 793
804 nilfs_mdt_clear_dirty(sbi->s_ifile); 794 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
805 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
806 nilfs_mdt_clear_dirty(nilfs->ns_sufile); 796 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
807 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); 797 nilfs_mdt_clear_dirty(nilfs->ns_dat);
808} 798}
809 799
810static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
@@ -848,9 +838,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
848 raw_cp->cp_snapshot_list.ssl_next = 0; 838 raw_cp->cp_snapshot_list.ssl_next = 0;
849 raw_cp->cp_snapshot_list.ssl_prev = 0; 839 raw_cp->cp_snapshot_list.ssl_prev = 0;
850 raw_cp->cp_inodes_count = 840 raw_cp->cp_inodes_count =
851 cpu_to_le64(atomic_read(&sbi->s_inodes_count)); 841 cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
852 raw_cp->cp_blocks_count = 842 raw_cp->cp_blocks_count =
853 cpu_to_le64(atomic_read(&sbi->s_blocks_count)); 843 cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
854 raw_cp->cp_nblk_inc = 844 raw_cp->cp_nblk_inc =
855 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); 845 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
856 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); 846 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
@@ -861,7 +851,8 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
861 else 851 else
862 nilfs_checkpoint_set_minor(raw_cp); 852 nilfs_checkpoint_set_minor(raw_cp);
863 853
864 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); 854 nilfs_write_inode_common(sci->sc_root->ifile,
855 &raw_cp->cp_ifile_inode, 1);
865 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); 856 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
866 return 0; 857 return 0;
867 858
@@ -886,13 +877,12 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
886 } 877 }
887} 878}
888 879
889static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, 880static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
890 struct inode *ifile)
891{ 881{
892 struct nilfs_inode_info *ii; 882 struct nilfs_inode_info *ii;
893 883
894 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { 884 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
895 nilfs_fill_in_file_bmap(ifile, ii); 885 nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii);
896 set_bit(NILFS_I_COLLECTED, &ii->i_state); 886 set_bit(NILFS_I_COLLECTED, &ii->i_state);
897 } 887 }
898} 888}
@@ -913,7 +903,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
913 nilfs->ns_nongc_ctime : sci->sc_seg_ctime); 903 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
914 raw_sr->sr_flags = 0; 904 raw_sr->sr_flags = 0;
915 905
916 nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + 906 nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
917 NILFS_SR_DAT_OFFSET(isz), 1); 907 NILFS_SR_DAT_OFFSET(isz), 1);
918 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + 908 nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
919 NILFS_SR_CPFILE_OFFSET(isz), 1); 909 NILFS_SR_CPFILE_OFFSET(isz), 1);
@@ -1135,7 +1125,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1135 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; 1125 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1136 /* Fall through */ 1126 /* Fall through */
1137 case NILFS_ST_IFILE: 1127 case NILFS_ST_IFILE:
1138 err = nilfs_segctor_scan_file(sci, sbi->s_ifile, 1128 err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
1139 &nilfs_sc_file_ops); 1129 &nilfs_sc_file_ops);
1140 if (unlikely(err)) 1130 if (unlikely(err))
1141 break; 1131 break;
@@ -1169,7 +1159,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1169 sci->sc_stage.scnt++; /* Fall through */ 1159 sci->sc_stage.scnt++; /* Fall through */
1170 case NILFS_ST_DAT: 1160 case NILFS_ST_DAT:
1171 dat_stage: 1161 dat_stage:
1172 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), 1162 err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
1173 &nilfs_sc_dat_ops); 1163 &nilfs_sc_dat_ops);
1174 if (unlikely(err)) 1164 if (unlikely(err))
1175 break; 1165 break;
@@ -1553,7 +1543,6 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1553 return 0; 1543 return 0;
1554 1544
1555 failed_bmap: 1545 failed_bmap:
1556 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1557 return err; 1546 return err;
1558} 1547}
1559 1548
@@ -1599,7 +1588,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1599 kunmap_atomic(kaddr, KM_USER0); 1588 kunmap_atomic(kaddr, KM_USER0);
1600 1589
1601 if (!TestSetPageWriteback(clone_page)) 1590 if (!TestSetPageWriteback(clone_page))
1602 inc_zone_page_state(clone_page, NR_WRITEBACK); 1591 account_page_writeback(clone_page);
1603 unlock_page(clone_page); 1592 unlock_page(clone_page);
1604 1593
1605 return 0; 1594 return 0;
@@ -1773,6 +1762,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1773 if (!err) { 1762 if (!err) {
1774 set_buffer_uptodate(bh); 1763 set_buffer_uptodate(bh);
1775 clear_buffer_dirty(bh); 1764 clear_buffer_dirty(bh);
1765 clear_buffer_delay(bh);
1776 clear_buffer_nilfs_volatile(bh); 1766 clear_buffer_nilfs_volatile(bh);
1777 } 1767 }
1778 brelse(bh); /* for b_assoc_buffers */ 1768 brelse(bh); /* for b_assoc_buffers */
@@ -1899,7 +1889,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1899 b_assoc_buffers) { 1889 b_assoc_buffers) {
1900 set_buffer_uptodate(bh); 1890 set_buffer_uptodate(bh);
1901 clear_buffer_dirty(bh); 1891 clear_buffer_dirty(bh);
1892 clear_buffer_delay(bh);
1902 clear_buffer_nilfs_volatile(bh); 1893 clear_buffer_nilfs_volatile(bh);
1894 clear_buffer_nilfs_redirected(bh);
1903 if (bh == segbuf->sb_super_root) { 1895 if (bh == segbuf->sb_super_root) {
1904 if (bh->b_page != bd_page) { 1896 if (bh->b_page != bd_page) {
1905 end_page_writeback(bd_page); 1897 end_page_writeback(bd_page);
@@ -1936,11 +1928,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1936 1928
1937 nilfs_drop_collected_inodes(&sci->sc_dirty_files); 1929 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
1938 1930
1939 if (nilfs_doing_gc()) { 1931 if (nilfs_doing_gc())
1940 nilfs_drop_collected_inodes(&sci->sc_gc_inodes); 1932 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
1941 if (update_sr) 1933 else
1942 nilfs_commit_gcdat_inode(nilfs);
1943 } else
1944 nilfs->ns_nongc_ctime = sci->sc_seg_ctime; 1934 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
1945 1935
1946 sci->sc_nblk_inc += sci->sc_nblk_this_inc; 1936 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
@@ -1976,7 +1966,7 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1976 struct nilfs_sb_info *sbi) 1966 struct nilfs_sb_info *sbi)
1977{ 1967{
1978 struct nilfs_inode_info *ii, *n; 1968 struct nilfs_inode_info *ii, *n;
1979 __u64 cno = sbi->s_nilfs->ns_cno; 1969 struct inode *ifile = sci->sc_root->ifile;
1980 1970
1981 spin_lock(&sbi->s_inode_lock); 1971 spin_lock(&sbi->s_inode_lock);
1982 retry: 1972 retry:
@@ -1987,14 +1977,14 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1987 1977
1988 spin_unlock(&sbi->s_inode_lock); 1978 spin_unlock(&sbi->s_inode_lock);
1989 err = nilfs_ifile_get_inode_block( 1979 err = nilfs_ifile_get_inode_block(
1990 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh); 1980 ifile, ii->vfs_inode.i_ino, &ibh);
1991 if (unlikely(err)) { 1981 if (unlikely(err)) {
1992 nilfs_warning(sbi->s_super, __func__, 1982 nilfs_warning(sbi->s_super, __func__,
1993 "failed to get inode block.\n"); 1983 "failed to get inode block.\n");
1994 return err; 1984 return err;
1995 } 1985 }
1996 nilfs_mdt_mark_buffer_dirty(ibh); 1986 nilfs_mdt_mark_buffer_dirty(ibh);
1997 nilfs_mdt_mark_dirty(sbi->s_ifile); 1987 nilfs_mdt_mark_dirty(ifile);
1998 spin_lock(&sbi->s_inode_lock); 1988 spin_lock(&sbi->s_inode_lock);
1999 if (likely(!ii->i_bh)) 1989 if (likely(!ii->i_bh))
2000 ii->i_bh = ibh; 1990 ii->i_bh = ibh;
@@ -2002,7 +1992,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2002 brelse(ibh); 1992 brelse(ibh);
2003 goto retry; 1993 goto retry;
2004 } 1994 }
2005 ii->i_cno = cno;
2006 1995
2007 clear_bit(NILFS_I_QUEUED, &ii->i_state); 1996 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2008 set_bit(NILFS_I_BUSY, &ii->i_state); 1997 set_bit(NILFS_I_BUSY, &ii->i_state);
@@ -2011,8 +2000,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2011 } 2000 }
2012 spin_unlock(&sbi->s_inode_lock); 2001 spin_unlock(&sbi->s_inode_lock);
2013 2002
2014 NILFS_I(sbi->s_ifile)->i_cno = cno;
2015
2016 return 0; 2003 return 0;
2017} 2004}
2018 2005
@@ -2021,19 +2008,13 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2021{ 2008{
2022 struct nilfs_transaction_info *ti = current->journal_info; 2009 struct nilfs_transaction_info *ti = current->journal_info;
2023 struct nilfs_inode_info *ii, *n; 2010 struct nilfs_inode_info *ii, *n;
2024 __u64 cno = sbi->s_nilfs->ns_cno;
2025 2011
2026 spin_lock(&sbi->s_inode_lock); 2012 spin_lock(&sbi->s_inode_lock);
2027 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 2013 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2028 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || 2014 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2029 test_bit(NILFS_I_DIRTY, &ii->i_state)) { 2015 test_bit(NILFS_I_DIRTY, &ii->i_state))
2030 /* The current checkpoint number (=nilfs->ns_cno) is
2031 changed between check-in and check-out only if the
2032 super root is written out. So, we can update i_cno
2033 for the inodes that remain in the dirty list. */
2034 ii->i_cno = cno;
2035 continue; 2016 continue;
2036 } 2017
2037 clear_bit(NILFS_I_BUSY, &ii->i_state); 2018 clear_bit(NILFS_I_BUSY, &ii->i_state);
2038 brelse(ii->i_bh); 2019 brelse(ii->i_bh);
2039 ii->i_bh = NULL; 2020 ii->i_bh = NULL;
@@ -2054,12 +2035,13 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2054 int err; 2035 int err;
2055 2036
2056 sci->sc_stage.scnt = NILFS_ST_INIT; 2037 sci->sc_stage.scnt = NILFS_ST_INIT;
2038 sci->sc_cno = nilfs->ns_cno;
2057 2039
2058 err = nilfs_segctor_check_in_files(sci, sbi); 2040 err = nilfs_segctor_check_in_files(sci, sbi);
2059 if (unlikely(err)) 2041 if (unlikely(err))
2060 goto out; 2042 goto out;
2061 2043
2062 if (nilfs_test_metadata_dirty(sbi)) 2044 if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
2063 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2045 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2064 2046
2065 if (nilfs_segctor_clean(sci)) 2047 if (nilfs_segctor_clean(sci))
@@ -2091,7 +2073,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2091 goto failed; 2073 goto failed;
2092 2074
2093 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2075 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2094 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); 2076 nilfs_segctor_fill_in_file_bmap(sci);
2095 2077
2096 if (mode == SC_LSEG_SR && 2078 if (mode == SC_LSEG_SR &&
2097 sci->sc_stage.scnt >= NILFS_ST_CPFILE) { 2079 sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
@@ -2452,9 +2434,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2452 list_for_each_entry_safe(ii, n, head, i_dirty) { 2434 list_for_each_entry_safe(ii, n, head, i_dirty) {
2453 if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) 2435 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2454 continue; 2436 continue;
2455 hlist_del_init(&ii->vfs_inode.i_hash);
2456 list_del_init(&ii->i_dirty); 2437 list_del_init(&ii->i_dirty);
2457 nilfs_clear_gcinode(&ii->vfs_inode); 2438 iput(&ii->vfs_inode);
2458 } 2439 }
2459} 2440}
2460 2441
@@ -2472,13 +2453,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2472 2453
2473 nilfs_transaction_lock(sbi, &ti, 1); 2454 nilfs_transaction_lock(sbi, &ti, 1);
2474 2455
2475 err = nilfs_init_gcdat_inode(nilfs); 2456 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
2476 if (unlikely(err)) 2457 if (unlikely(err))
2477 goto out_unlock; 2458 goto out_unlock;
2478 2459
2479 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); 2460 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
2480 if (unlikely(err)) 2461 if (unlikely(err)) {
2462 nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
2481 goto out_unlock; 2463 goto out_unlock;
2464 }
2482 2465
2483 sci->sc_freesegs = kbufs[4]; 2466 sci->sc_freesegs = kbufs[4];
2484 sci->sc_nfreesegs = argv[4].v_nmembs; 2467 sci->sc_nfreesegs = argv[4].v_nmembs;
@@ -2510,7 +2493,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2510 out_unlock: 2493 out_unlock:
2511 sci->sc_freesegs = NULL; 2494 sci->sc_freesegs = NULL;
2512 sci->sc_nfreesegs = 0; 2495 sci->sc_nfreesegs = 0;
2513 nilfs_clear_gcdat_inode(nilfs); 2496 nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
2514 nilfs_transaction_unlock(sbi); 2497 nilfs_transaction_unlock(sbi);
2515 return err; 2498 return err;
2516} 2499}
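nilfs_clean_segments() above replaces the dedicated GC DAT inode with a shadow map on the regular DAT: save the DAT's state before preparing garbage collection, restore it if preparation fails, and clear the shadow on every exit path. The bracketing, condensed from the hunk:

    /* Shadow-map bracketing around GC preparation (schematic). */
    err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);      /* snapshot DAT */
    if (!err) {
            err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
            if (err)
                    nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat); /* roll back */
            /* ... run the constructor ... */
            nilfs_mdt_clear_shadow_map(nilfs->ns_dat);      /* drop snapshot */
    }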
@@ -2672,6 +2655,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2672} 2655}
2673 2656
2674static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) 2657static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2658 __acquires(&sci->sc_state_lock)
2659 __releases(&sci->sc_state_lock)
2675{ 2660{
2676 sci->sc_state |= NILFS_SEGCTOR_QUIT; 2661 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2677 2662
@@ -2686,7 +2671,8 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2686/* 2671/*
2687 * Setup & clean-up functions 2672 * Setup & clean-up functions
2688 */ 2673 */
2689static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) 2674static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
2675 struct nilfs_root *root)
2690{ 2676{
2691 struct nilfs_sc_info *sci; 2677 struct nilfs_sc_info *sci;
2692 2678
@@ -2697,6 +2683,9 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2697 sci->sc_sbi = sbi; 2683 sci->sc_sbi = sbi;
2698 sci->sc_super = sbi->s_super; 2684 sci->sc_super = sbi->s_super;
2699 2685
2686 nilfs_get_root(root);
2687 sci->sc_root = root;
2688
2700 init_waitqueue_head(&sci->sc_wait_request); 2689 init_waitqueue_head(&sci->sc_wait_request);
2701 init_waitqueue_head(&sci->sc_wait_daemon); 2690 init_waitqueue_head(&sci->sc_wait_daemon);
2702 init_waitqueue_head(&sci->sc_wait_task); 2691 init_waitqueue_head(&sci->sc_wait_task);
@@ -2771,6 +2760,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2771 WARN_ON(!list_empty(&sci->sc_segbufs)); 2760 WARN_ON(!list_empty(&sci->sc_segbufs));
2772 WARN_ON(!list_empty(&sci->sc_write_logs)); 2761 WARN_ON(!list_empty(&sci->sc_write_logs));
2773 2762
2763 nilfs_put_root(sci->sc_root);
2764
2774 down_write(&sbi->s_nilfs->ns_segctor_sem); 2765 down_write(&sbi->s_nilfs->ns_segctor_sem);
2775 2766
2776 del_timer_sync(&sci->sc_timer); 2767 del_timer_sync(&sci->sc_timer);
@@ -2780,6 +2771,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2780/** 2771/**
2781 * nilfs_attach_segment_constructor - attach a segment constructor 2772 * nilfs_attach_segment_constructor - attach a segment constructor
2782 * @sbi: nilfs_sb_info 2773 * @sbi: nilfs_sb_info
2774 * @root: root object of the current filesystem tree
2783 * 2775 *
2784 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2776 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2785 * initializes it, and starts the segment constructor. 2777 * initializes it, and starts the segment constructor.
@@ -2789,9 +2781,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2789 * 2781 *
2790 * %-ENOMEM - Insufficient memory available. 2782 * %-ENOMEM - Insufficient memory available.
2791 */ 2783 */
2792int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) 2784int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
2785 struct nilfs_root *root)
2793{ 2786{
2794 struct the_nilfs *nilfs = sbi->s_nilfs;
2795 int err; 2787 int err;
2796 2788
2797 if (NILFS_SC(sbi)) { 2789 if (NILFS_SC(sbi)) {
@@ -2803,14 +2795,12 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2803 nilfs_detach_segment_constructor(sbi); 2795 nilfs_detach_segment_constructor(sbi);
2804 } 2796 }
2805 2797
2806 sbi->s_sc_info = nilfs_segctor_new(sbi); 2798 sbi->s_sc_info = nilfs_segctor_new(sbi, root);
2807 if (!sbi->s_sc_info) 2799 if (!sbi->s_sc_info)
2808 return -ENOMEM; 2800 return -ENOMEM;
2809 2801
2810 nilfs_attach_writer(nilfs, sbi);
2811 err = nilfs_segctor_start_thread(NILFS_SC(sbi)); 2802 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2812 if (err) { 2803 if (err) {
2813 nilfs_detach_writer(nilfs, sbi);
2814 kfree(sbi->s_sc_info); 2804 kfree(sbi->s_sc_info);
2815 sbi->s_sc_info = NULL; 2805 sbi->s_sc_info = NULL;
2816 } 2806 }
@@ -2847,5 +2837,4 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2847 up_write(&nilfs->ns_segctor_sem); 2837 up_write(&nilfs->ns_segctor_sem);
2848 2838
2849 nilfs_dispose_list(sbi, &garbage_list, 1); 2839 nilfs_dispose_list(sbi, &garbage_list, 1);
2850 nilfs_detach_writer(nilfs, sbi);
2851} 2840}
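A thread running through the segment.c hunks above: nilfs_segctor_do_construct() latches nilfs-&gt;ns_cno into sci-&gt;sc_cno once per construction, so every finfo written into the log is stamped consistently even if ns_cno advances meanwhile. The per-inode selection in nilfs_segctor_end_finfo() then becomes:

    /* Checkpoint number recorded per file in the segment summary. */
    if (test_bit(NILFS_I_GCINODE, &ii->i_state))
            cno = ii->i_cno;        /* GC inode: cno the data originated from */
    else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
            cno = 0;                /* DAT/cpfile/sufile: not per-checkpoint */
    else
            cno = sci->sc_cno;      /* regular file: cno of this log */

This is also why the old i_cno bookkeeping in check-in/check-out could be deleted.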
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 17c487bd8152..cd8056e7cbed 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -29,6 +29,8 @@
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "sb.h" 30#include "sb.h"
31 31
32struct nilfs_root;
33
32/** 34/**
33 * struct nilfs_recovery_info - Recovery information 35 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 36 * @ri_need_recovery: Recovery status
@@ -87,6 +89,7 @@ struct nilfs_segsum_pointer {
87 * struct nilfs_sc_info - Segment constructor information 89 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct 90 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct 91 * @sc_sbi: Back pointer to nilfs_sb_info struct
92 * @sc_root: root object of the current filesystem tree
90 * @sc_nblk_inc: Block count of current generation 93 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written 94 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written 95 * @sc_gc_inodes: List of GC inodes having blocks to be written
@@ -107,6 +110,7 @@ struct nilfs_segsum_pointer {
107 * @sc_datablk_cnt: Data block count of a file 110 * @sc_datablk_cnt: Data block count of a file
108 * @sc_nblk_this_inc: Number of blocks included in the current logical segment 111 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
109 * @sc_seg_ctime: Creation time 112 * @sc_seg_ctime: Creation time
113 * @sc_cno: checkpoint number of current log
110 * @sc_flags: Internal flags 114 * @sc_flags: Internal flags
111 * @sc_state_lock: spinlock for sc_state and so on 115 * @sc_state_lock: spinlock for sc_state and so on
112 * @sc_state: Segctord state flags 116 * @sc_state: Segctord state flags
@@ -128,6 +132,7 @@ struct nilfs_segsum_pointer {
128struct nilfs_sc_info { 132struct nilfs_sc_info {
129 struct super_block *sc_super; 133 struct super_block *sc_super;
130 struct nilfs_sb_info *sc_sbi; 134 struct nilfs_sb_info *sc_sbi;
135 struct nilfs_root *sc_root;
131 136
132 unsigned long sc_nblk_inc; 137 unsigned long sc_nblk_inc;
133 138
@@ -156,7 +161,7 @@ struct nilfs_sc_info {
156 unsigned long sc_datablk_cnt; 161 unsigned long sc_datablk_cnt;
157 unsigned long sc_nblk_this_inc; 162 unsigned long sc_nblk_this_inc;
158 time_t sc_seg_ctime; 163 time_t sc_seg_ctime;
159 164 __u64 sc_cno;
160 unsigned long sc_flags; 165 unsigned long sc_flags;
161 166
162 spinlock_t sc_state_lock; 167 spinlock_t sc_state_lock;
@@ -230,7 +235,8 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
230extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 235extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
231 void **); 236 void **);
232 237
233extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); 238int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
239 struct nilfs_root *root);
234extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); 240extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
235 241
236/* recovery.c */ 242/* recovery.c */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3c6cc6005c2e..1d6f488ccae8 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -505,7 +505,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
505{ 505{
506 struct buffer_head *header_bh; 506 struct buffer_head *header_bh;
507 struct nilfs_sufile_header *header; 507 struct nilfs_sufile_header *header;
508 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 508 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
509 void *kaddr; 509 void *kaddr;
510 int ret; 510 int ret;
511 511
@@ -583,7 +583,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
583 struct nilfs_segment_usage *su; 583 struct nilfs_segment_usage *su;
584 struct nilfs_suinfo *si = buf; 584 struct nilfs_suinfo *si = buf;
585 size_t susz = NILFS_MDT(sufile)->mi_entry_size; 585 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
586 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; 586 struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
587 void *kaddr; 587 void *kaddr;
588 unsigned long nsegs, segusages_per_block; 588 unsigned long nsegs, segusages_per_block;
589 ssize_t n; 589 ssize_t n;
@@ -635,46 +635,55 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
635} 635}
636 636
637/** 637/**
638 * nilfs_sufile_read - read sufile inode 638 * nilfs_sufile_read - read or get sufile inode
639 * @sufile: sufile inode 639 * @sb: super block instance
640 * @susize: size of a segment usage entry
640 * @raw_inode: on-disk sufile inode 641 * @raw_inode: on-disk sufile inode
642 * @inodep: buffer to store the inode
641 */ 643 */
642int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode) 644int nilfs_sufile_read(struct super_block *sb, size_t susize,
645 struct nilfs_inode *raw_inode, struct inode **inodep)
643{ 646{
644 struct nilfs_sufile_info *sui = NILFS_SUI(sufile); 647 struct inode *sufile;
648 struct nilfs_sufile_info *sui;
645 struct buffer_head *header_bh; 649 struct buffer_head *header_bh;
646 struct nilfs_sufile_header *header; 650 struct nilfs_sufile_header *header;
647 void *kaddr; 651 void *kaddr;
648 int ret; 652 int err;
649 653
650 ret = nilfs_read_inode_common(sufile, raw_inode); 654 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
651 if (ret < 0) 655 if (unlikely(!sufile))
652 return ret; 656 return -ENOMEM;
657 if (!(sufile->i_state & I_NEW))
658 goto out;
653 659
654 ret = nilfs_sufile_get_header_block(sufile, &header_bh); 660 err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
655 if (!ret) { 661 if (err)
656 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 662 goto failed;
657 header = kaddr + bh_offset(header_bh);
658 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
659 kunmap_atomic(kaddr, KM_USER0);
660 brelse(header_bh);
661 }
662 return ret;
663}
664 663
665/** 664 nilfs_mdt_set_entry_size(sufile, susize,
666 * nilfs_sufile_new - create sufile 665 sizeof(struct nilfs_sufile_header));
667 * @nilfs: nilfs object 666
668 * @susize: size of a segment usage entry 667 err = nilfs_read_inode_common(sufile, raw_inode);
669 */ 668 if (err)
670struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize) 669 goto failed;
671{ 670
672 struct inode *sufile; 671 err = nilfs_sufile_get_header_block(sufile, &header_bh);
672 if (err)
673 goto failed;
673 674
674 sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO, 675 sui = NILFS_SUI(sufile);
675 sizeof(struct nilfs_sufile_info)); 676 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
676 if (sufile) 677 header = kaddr + bh_offset(header_bh);
677 nilfs_mdt_set_entry_size(sufile, susize, 678 sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
678 sizeof(struct nilfs_sufile_header)); 679 kunmap_atomic(kaddr, KM_USER0);
679 return sufile; 680 brelse(header_bh);
681
682 unlock_new_inode(sufile);
683 out:
684 *inodep = sufile;
685 return 0;
686 failed:
687 iget_failed(sufile);
688 return err;
680} 689}
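nilfs_sufile_read() above folds the old read/new pair into the standard iget_locked() protocol: look the inode up in the icache, initialize from the raw on-disk inode only when I_NEW is set, and route failures through iget_failed() so concurrent waiters see a dead inode. A generic sketch of that pattern (SOME_INO and init_from_disk() are hypothetical placeholders):

    /* Generic read-or-get skeleton for a singleton metadata inode. */
    struct inode *inode = nilfs_iget_locked(sb, NULL, SOME_INO);
    if (!inode)
            return -ENOMEM;
    if (!(inode->i_state & I_NEW))
            goto out;                          /* already initialized */

    err = init_from_disk(inode, raw_inode);    /* hypothetical initializer */
    if (err) {
            iget_failed(inode);                /* mark bad, unlock, iput */
            return err;
    }
    unlock_new_inode(inode);                   /* clear I_NEW, wake waiters */
    out:
    *inodep = inode;
    return 0;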
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 15163b8aff7d..a943fbacb45b 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,7 +31,7 @@
31 31
32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) 32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
33{ 33{
34 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; 34 return NILFS_I_NILFS(sufile)->ns_nsegments;
35} 35}
36 36
37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); 37unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
@@ -61,8 +61,8 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, 61void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
62 struct buffer_head *); 62 struct buffer_head *);
63 63
64int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode); 64int nilfs_sufile_read(struct super_block *sb, size_t susize,
65struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize); 65 struct nilfs_inode *raw_inode, struct inode **inodep);
66 66
67/** 67/**
68 * nilfs_sufile_scrap - make a segment garbage 68 * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 922263393c76..58fd707174e1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -45,14 +45,12 @@
45#include <linux/parser.h> 45#include <linux/parser.h>
46#include <linux/random.h> 46#include <linux/random.h>
47#include <linux/crc32.h> 47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h> 48#include <linux/vfs.h>
50#include <linux/writeback.h> 49#include <linux/writeback.h>
51#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include <linux/seq_file.h> 50#include <linux/seq_file.h>
54#include <linux/mount.h> 51#include <linux/mount.h>
55#include "nilfs.h" 52#include "nilfs.h"
53#include "export.h"
56#include "mdt.h" 54#include "mdt.h"
57#include "alloc.h" 55#include "alloc.h"
58#include "btree.h" 56#include "btree.h"
@@ -69,11 +67,12 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
69 "(NILFS)"); 67 "(NILFS)");
70MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
71 69
72struct kmem_cache *nilfs_inode_cachep; 70static struct kmem_cache *nilfs_inode_cachep;
73struct kmem_cache *nilfs_transaction_cachep; 71struct kmem_cache *nilfs_transaction_cachep;
74struct kmem_cache *nilfs_segbuf_cachep; 72struct kmem_cache *nilfs_segbuf_cachep;
75struct kmem_cache *nilfs_btree_path_cache; 73struct kmem_cache *nilfs_btree_path_cache;
76 74
75static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount);
77static int nilfs_remount(struct super_block *sb, int *flags, char *data); 76static int nilfs_remount(struct super_block *sb, int *flags, char *data);
78 77
79static void nilfs_set_error(struct nilfs_sb_info *sbi) 78static void nilfs_set_error(struct nilfs_sb_info *sbi)
@@ -111,12 +110,17 @@ void nilfs_error(struct super_block *sb, const char *function,
111 const char *fmt, ...) 110 const char *fmt, ...)
112{ 111{
113 struct nilfs_sb_info *sbi = NILFS_SB(sb); 112 struct nilfs_sb_info *sbi = NILFS_SB(sb);
113 struct va_format vaf;
114 va_list args; 114 va_list args;
115 115
116 va_start(args, fmt); 116 va_start(args, fmt);
117 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); 117
118 vprintk(fmt, args); 118 vaf.fmt = fmt;
119 printk("\n"); 119 vaf.va = &args;
120
121 printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n",
122 sb->s_id, function, &vaf);
123
120 va_end(args); 124 va_end(args);
121 125
122 if (!(sb->s_flags & MS_RDONLY)) { 126 if (!(sb->s_flags & MS_RDONLY)) {
@@ -136,18 +140,22 @@ void nilfs_error(struct super_block *sb, const char *function,
136void nilfs_warning(struct super_block *sb, const char *function, 140void nilfs_warning(struct super_block *sb, const char *function,
137 const char *fmt, ...) 141 const char *fmt, ...)
138{ 142{
143 struct va_format vaf;
139 va_list args; 144 va_list args;
140 145
141 va_start(args, fmt); 146 va_start(args, fmt);
142 printk(KERN_WARNING "NILFS warning (device %s): %s: ", 147
143 sb->s_id, function); 148 vaf.fmt = fmt;
144 vprintk(fmt, args); 149 vaf.va = &args;
145 printk("\n"); 150
151 printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
152 sb->s_id, function, &vaf);
153
146 va_end(args); 154 va_end(args);
147} 155}
148 156
149 157
150struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) 158struct inode *nilfs_alloc_inode(struct super_block *sb)
151{ 159{
152 struct nilfs_inode_info *ii; 160 struct nilfs_inode_info *ii;
153 161
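Both printing helpers above switch to the %pV / struct va_format idiom (available since 2.6.36), which folds a format string plus va_list into a single printk() so the prefix and message stay atomic. The generic shape of such a wrapper, mirroring the hunk:

    /* Generic %pV wrapper pattern (kernel >= 2.6.36); "myfs" is a placeholder. */
    void myfs_warning(struct super_block *sb, const char *function,
                      const char *fmt, ...)
    {
            struct va_format vaf;
            va_list args;

            va_start(args, fmt);
            vaf.fmt = fmt;
            vaf.va = &args;
            printk(KERN_WARNING "myfs warning (device %s): %s: %pV\n",
                   sb->s_id, function, &vaf);
            va_end(args);
    }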
@@ -156,19 +164,29 @@ struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
156 return NULL; 164 return NULL;
157 ii->i_bh = NULL; 165 ii->i_bh = NULL;
158 ii->i_state = 0; 166 ii->i_state = 0;
167 ii->i_cno = 0;
159 ii->vfs_inode.i_version = 1; 168 ii->vfs_inode.i_version = 1;
160 nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi); 169 nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
161 return &ii->vfs_inode; 170 return &ii->vfs_inode;
162} 171}
163 172
164struct inode *nilfs_alloc_inode(struct super_block *sb) 173static void nilfs_i_callback(struct rcu_head *head)
165{ 174{
166 return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs); 175 struct inode *inode = container_of(head, struct inode, i_rcu);
176 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
177
178 INIT_LIST_HEAD(&inode->i_dentry);
179
180 if (mdi) {
181 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
182 kfree(mdi);
183 }
184 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
167} 185}
168 186
169void nilfs_destroy_inode(struct inode *inode) 187void nilfs_destroy_inode(struct inode *inode)
170{ 188{
171 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 189 call_rcu(&inode->i_rcu, nilfs_i_callback);
172} 190}
173 191
174static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) 192static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
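nilfs_destroy_inode() above defers the actual free through call_rcu() on inode-&gt;i_rcu, so lock-free (RCU-walk) path lookup can still dereference the inode for a grace period; the callback also releases the mdt private data. The INIT_LIST_HEAD(&inode-&gt;i_dentry) in the callback suggests i_rcu shares storage with i_dentry in that era's struct inode. The bare pattern, with nilfs specifics stripped ("myfs" names are placeholders):

    /* Generic RCU-deferred inode free, as used above. */
    static void myfs_i_callback(struct rcu_head *head)
    {
            struct inode *inode = container_of(head, struct inode, i_rcu);
            kmem_cache_free(myfs_inode_cachep, MYFS_I(inode));
    }

    void myfs_destroy_inode(struct inode *inode)
    {
            call_rcu(&inode->i_rcu, myfs_i_callback);
    }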
@@ -178,17 +196,9 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
178 196
179 retry: 197 retry:
180 set_buffer_dirty(nilfs->ns_sbh[0]); 198 set_buffer_dirty(nilfs->ns_sbh[0]);
181
182 if (nilfs_test_opt(sbi, BARRIER)) { 199 if (nilfs_test_opt(sbi, BARRIER)) {
183 err = __sync_dirty_buffer(nilfs->ns_sbh[0], 200 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
184 WRITE_SYNC | WRITE_BARRIER); 201 WRITE_SYNC | WRITE_FLUSH_FUA);
185 if (err == -EOPNOTSUPP) {
186 nilfs_warning(sbi->s_super, __func__,
187 "barrier-based sync failed. "
188 "disabling barriers\n");
189 nilfs_clear_opt(sbi, BARRIER);
190 goto retry;
191 }
192 } else { 202 } else {
193 err = sync_dirty_buffer(nilfs->ns_sbh[0]); 203 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
194 } 204 }
@@ -342,8 +352,6 @@ static void nilfs_put_super(struct super_block *sb)
342 struct nilfs_sb_info *sbi = NILFS_SB(sb); 352 struct nilfs_sb_info *sbi = NILFS_SB(sb);
343 struct the_nilfs *nilfs = sbi->s_nilfs; 353 struct the_nilfs *nilfs = sbi->s_nilfs;
344 354
345 lock_kernel();
346
347 nilfs_detach_segment_constructor(sbi); 355 nilfs_detach_segment_constructor(sbi);
348 356
349 if (!(sb->s_flags & MS_RDONLY)) { 357 if (!(sb->s_flags & MS_RDONLY)) {
@@ -351,18 +359,15 @@ static void nilfs_put_super(struct super_block *sb)
351 nilfs_cleanup_super(sbi); 359 nilfs_cleanup_super(sbi);
352 up_write(&nilfs->ns_sem); 360 up_write(&nilfs->ns_sem);
353 } 361 }
354 down_write(&nilfs->ns_super_sem);
355 if (nilfs->ns_current == sbi)
356 nilfs->ns_current = NULL;
357 up_write(&nilfs->ns_super_sem);
358 362
359 nilfs_detach_checkpoint(sbi); 363 iput(nilfs->ns_sufile);
360 put_nilfs(sbi->s_nilfs); 364 iput(nilfs->ns_cpfile);
365 iput(nilfs->ns_dat);
366
367 destroy_nilfs(nilfs);
361 sbi->s_super = NULL; 368 sbi->s_super = NULL;
362 sb->s_fs_info = NULL; 369 sb->s_fs_info = NULL;
363 nilfs_put_sbinfo(sbi); 370 kfree(sbi);
364
365 unlock_kernel();
366} 371}
367 372
368static int nilfs_sync_fs(struct super_block *sb, int wait) 373static int nilfs_sync_fs(struct super_block *sb, int wait)
@@ -389,21 +394,22 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
389 return err; 394 return err;
390} 395}
391 396
392int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) 397int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
398 struct nilfs_root **rootp)
393{ 399{
394 struct the_nilfs *nilfs = sbi->s_nilfs; 400 struct the_nilfs *nilfs = sbi->s_nilfs;
401 struct nilfs_root *root;
395 struct nilfs_checkpoint *raw_cp; 402 struct nilfs_checkpoint *raw_cp;
396 struct buffer_head *bh_cp; 403 struct buffer_head *bh_cp;
397 int err; 404 int err = -ENOMEM;
398 405
399 down_write(&nilfs->ns_super_sem); 406 root = nilfs_find_or_create_root(
400 list_add(&sbi->s_list, &nilfs->ns_supers); 407 nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno);
401 up_write(&nilfs->ns_super_sem); 408 if (!root)
409 return err;
402 410
403 err = -ENOMEM; 411 if (root->ifile)
404 sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); 412 goto reuse; /* already attached checkpoint */
405 if (!sbi->s_ifile)
406 goto delist;
407 413
408 down_read(&nilfs->ns_segctor_sem); 414 down_read(&nilfs->ns_segctor_sem);
409 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 415 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -419,45 +425,64 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
419 } 425 }
420 goto failed; 426 goto failed;
421 } 427 }
422 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); 428
423 if (unlikely(err)) 429 err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size,
430 &raw_cp->cp_ifile_inode, &root->ifile);
431 if (err)
424 goto failed_bh; 432 goto failed_bh;
425 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); 433
426 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); 434 atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
435 atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
427 436
428 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 437 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
438
439 reuse:
440 *rootp = root;
429 return 0; 441 return 0;
430 442
431 failed_bh: 443 failed_bh:
432 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 444 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
433 failed: 445 failed:
434 nilfs_mdt_destroy(sbi->s_ifile); 446 nilfs_put_root(root);
435 sbi->s_ifile = NULL;
436 447
437 delist: 448 return err;
438 down_write(&nilfs->ns_super_sem); 449}
439 list_del_init(&sbi->s_list);
440 up_write(&nilfs->ns_super_sem);
441 450
451static int nilfs_freeze(struct super_block *sb)
452{
453 struct nilfs_sb_info *sbi = NILFS_SB(sb);
454 struct the_nilfs *nilfs = sbi->s_nilfs;
455 int err;
456
457 if (sb->s_flags & MS_RDONLY)
458 return 0;
459
460 /* Mark super block clean */
461 down_write(&nilfs->ns_sem);
462 err = nilfs_cleanup_super(sbi);
463 up_write(&nilfs->ns_sem);
442 return err; 464 return err;
443} 465}
444 466
445void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) 467static int nilfs_unfreeze(struct super_block *sb)
446{ 468{
469 struct nilfs_sb_info *sbi = NILFS_SB(sb);
447 struct the_nilfs *nilfs = sbi->s_nilfs; 470 struct the_nilfs *nilfs = sbi->s_nilfs;
448 471
449 nilfs_mdt_destroy(sbi->s_ifile); 472 if (sb->s_flags & MS_RDONLY)
450 sbi->s_ifile = NULL; 473 return 0;
451 down_write(&nilfs->ns_super_sem); 474
452 list_del_init(&sbi->s_list); 475 down_write(&nilfs->ns_sem);
453 up_write(&nilfs->ns_super_sem); 476 nilfs_setup_super(sbi, false);
477 up_write(&nilfs->ns_sem);
478 return 0;
454} 479}
455 480
456static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) 481static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
457{ 482{
458 struct super_block *sb = dentry->d_sb; 483 struct super_block *sb = dentry->d_sb;
459 struct nilfs_sb_info *sbi = NILFS_SB(sb); 484 struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
460 struct the_nilfs *nilfs = sbi->s_nilfs; 485 struct the_nilfs *nilfs = root->nilfs;
461 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 486 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
462 unsigned long long blocks; 487 unsigned long long blocks;
463 unsigned long overhead; 488 unsigned long overhead;
@@ -493,7 +518,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
493 buf->f_bfree = nfreeblocks; 518 buf->f_bfree = nfreeblocks;
494 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? 519 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
495 (buf->f_bfree - nrsvblocks) : 0; 520 (buf->f_bfree - nrsvblocks) : 0;
496 buf->f_files = atomic_read(&sbi->s_inodes_count); 521 buf->f_files = atomic_read(&root->inodes_count);
497 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ 522 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
498 buf->f_namelen = NILFS_NAME_LEN; 523 buf->f_namelen = NILFS_NAME_LEN;
499 buf->f_fsid.val[0] = (u32)id; 524 buf->f_fsid.val[0] = (u32)id;
@@ -506,12 +531,12 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
506{ 531{
507 struct super_block *sb = vfs->mnt_sb; 532 struct super_block *sb = vfs->mnt_sb;
508 struct nilfs_sb_info *sbi = NILFS_SB(sb); 533 struct nilfs_sb_info *sbi = NILFS_SB(sb);
534 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
509 535
510 if (!nilfs_test_opt(sbi, BARRIER)) 536 if (!nilfs_test_opt(sbi, BARRIER))
511 seq_puts(seq, ",nobarrier"); 537 seq_puts(seq, ",nobarrier");
512 if (nilfs_test_opt(sbi, SNAPSHOT)) 538 if (root->cno != NILFS_CPTREE_CURRENT_CNO)
513 seq_printf(seq, ",cp=%llu", 539 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
514 (unsigned long long int)sbi->s_snapshot_cno);
515 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 540 if (nilfs_test_opt(sbi, ERRORS_PANIC))
516 seq_puts(seq, ",errors=panic"); 541 seq_puts(seq, ",errors=panic");
517 if (nilfs_test_opt(sbi, ERRORS_CONT)) 542 if (nilfs_test_opt(sbi, ERRORS_CONT))
@@ -537,6 +562,8 @@ static const struct super_operations nilfs_sops = {
537 .put_super = nilfs_put_super, 562 .put_super = nilfs_put_super,
538 /* .write_super = nilfs_write_super, */ 563 /* .write_super = nilfs_write_super, */
539 .sync_fs = nilfs_sync_fs, 564 .sync_fs = nilfs_sync_fs,
565 .freeze_fs = nilfs_freeze,
566 .unfreeze_fs = nilfs_unfreeze,
540 /* .write_super_lockfs */ 567 /* .write_super_lockfs */
541 /* .unlockfs */ 568 /* .unlockfs */
542 .statfs = nilfs_statfs, 569 .statfs = nilfs_statfs,
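Wiring up .freeze_fs/.unfreeze_fs above lets freeze_bdev() quiesce nilfs, for instance under a device-mapper snapshot. The VFS blocks writers and syncs before invoking the hook, so the handlers only have to manage the on-disk clean/dirty state, as nilfs_freeze() and nilfs_unfreeze() do earlier in this patch. Schematically, with hypothetical myfs_* names:

	#include <linux/fs.h>

	static int myfs_freeze(struct super_block *sb)
	{
		/* Writers are already blocked; record a clean state on disk. */
		return 0;
	}

	static int myfs_unfreeze(struct super_block *sb)
	{
		/* Undo the clean marking; the filesystem is writable again. */
		return 0;
	}

	static const struct super_operations myfs_sops = {
		.freeze_fs	= myfs_freeze,
		.unfreeze_fs	= myfs_unfreeze,
	};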
@@ -545,48 +572,6 @@ static const struct super_operations nilfs_sops = {
545 .show_options = nilfs_show_options 572 .show_options = nilfs_show_options
546}; 573};
547 574
548static struct inode *
549nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
550{
551 struct inode *inode;
552
553 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
554 ino != NILFS_SKETCH_INO)
555 return ERR_PTR(-ESTALE);
556
557 inode = nilfs_iget(sb, ino);
558 if (IS_ERR(inode))
559 return ERR_CAST(inode);
560 if (generation && inode->i_generation != generation) {
561 iput(inode);
562 return ERR_PTR(-ESTALE);
563 }
564
565 return inode;
566}
567
568static struct dentry *
569nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
570 int fh_type)
571{
572 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
573 nilfs_nfs_get_inode);
574}
575
576static struct dentry *
577nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
578 int fh_type)
579{
580 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
581 nilfs_nfs_get_inode);
582}
583
584static const struct export_operations nilfs_export_ops = {
585 .fh_to_dentry = nilfs_fh_to_dentry,
586 .fh_to_parent = nilfs_fh_to_parent,
587 .get_parent = nilfs_get_parent,
588};
589
590enum { 575enum {
591 Opt_err_cont, Opt_err_panic, Opt_err_ro, 576 Opt_err_cont, Opt_err_panic, Opt_err_ro,
592 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 577 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
@@ -612,7 +597,6 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
612 struct nilfs_sb_info *sbi = NILFS_SB(sb); 597 struct nilfs_sb_info *sbi = NILFS_SB(sb);
613 char *p; 598 char *p;
614 substring_t args[MAX_OPT_ARGS]; 599 substring_t args[MAX_OPT_ARGS];
615 int option;
616 600
617 if (!options) 601 if (!options)
618 return 1; 602 return 1;
@@ -650,30 +634,12 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); 634 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
651 break; 635 break;
652 case Opt_snapshot: 636 case Opt_snapshot:
653 if (match_int(&args[0], &option) || option <= 0)
654 return 0;
655 if (is_remount) { 637 if (is_remount) {
656 if (!nilfs_test_opt(sbi, SNAPSHOT)) { 638 printk(KERN_ERR
657 printk(KERN_ERR 639 "NILFS: \"%s\" option is invalid "
658 "NILFS: cannot change regular " 640 "for remount.\n", p);
659 "mount to snapshot.\n");
660 return 0;
661 } else if (option != sbi->s_snapshot_cno) {
662 printk(KERN_ERR
663 "NILFS: cannot remount to a "
664 "different snapshot.\n");
665 return 0;
666 }
667 break;
668 }
669 if (!(sb->s_flags & MS_RDONLY)) {
670 printk(KERN_ERR "NILFS: cannot mount snapshot "
671 "read/write. A read-only option is "
672 "required.\n");
673 return 0; 641 return 0;
674 } 642 }
675 sbi->s_snapshot_cno = option;
676 nilfs_set_opt(sbi, SNAPSHOT);
677 break; 643 break;
678 case Opt_norecovery: 644 case Opt_norecovery:
679 nilfs_set_opt(sbi, NORECOVERY); 645 nilfs_set_opt(sbi, NORECOVERY);
@@ -701,7 +667,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
701 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; 667 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
702} 668}
703 669
704static int nilfs_setup_super(struct nilfs_sb_info *sbi) 670static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount)
705{ 671{
706 struct the_nilfs *nilfs = sbi->s_nilfs; 672 struct the_nilfs *nilfs = sbi->s_nilfs;
707 struct nilfs_super_block **sbp; 673 struct nilfs_super_block **sbp;
@@ -713,6 +679,9 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
713 if (!sbp) 679 if (!sbp)
714 return -EIO; 680 return -EIO;
715 681
682 if (!is_mount)
683 goto skip_mount_setup;
684
716 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); 685 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
717 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); 686 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
718 687
@@ -729,11 +698,14 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
729 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); 698 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
730 699
731 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); 700 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
701 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
702
703skip_mount_setup:
732 sbp[0]->s_state = 704 sbp[0]->s_state =
733 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); 705 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
734 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
735 /* synchronize sbp[1] with sbp[0] */ 706 /* synchronize sbp[1] with sbp[0] */
736 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 707 if (sbp[1])
708 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
737 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 709 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
738} 710}
739 711
@@ -798,22 +770,156 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
798 return 0; 770 return 0;
799} 771}
800 772
773static int nilfs_get_root_dentry(struct super_block *sb,
774 struct nilfs_root *root,
775 struct dentry **root_dentry)
776{
777 struct inode *inode;
778 struct dentry *dentry;
779 int ret = 0;
780
781 inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
782 if (IS_ERR(inode)) {
783 printk(KERN_ERR "NILFS: get root inode failed\n");
784 ret = PTR_ERR(inode);
785 goto out;
786 }
787 if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
788 iput(inode);
789 printk(KERN_ERR "NILFS: corrupt root inode.\n");
790 ret = -EINVAL;
791 goto out;
792 }
793
794 if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
795 dentry = d_find_alias(inode);
796 if (!dentry) {
797 dentry = d_alloc_root(inode);
798 if (!dentry) {
799 iput(inode);
800 ret = -ENOMEM;
801 goto failed_dentry;
802 }
803 } else {
804 iput(inode);
805 }
806 } else {
807 dentry = d_obtain_alias(inode);
808 if (IS_ERR(dentry)) {
809 ret = PTR_ERR(dentry);
810 goto failed_dentry;
811 }
812 }
813 *root_dentry = dentry;
814 out:
815 return ret;
816
817 failed_dentry:
818 printk(KERN_ERR "NILFS: get root dentry failed\n");
819 goto out;
820}
821
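nilfs_get_root_dentry() above distinguishes the current tree, which needs the canonical root dentry, from snapshot trees, which receive disconnected dentries via d_obtain_alias(), the same kind used for NFS file-handle lookup. The reference counting is the subtle part; condensed into a hypothetical wrapper using this tree's pre-3.4 dcache API:

	#include <linux/dcache.h>
	#include <linux/fs.h>

	static struct dentry *get_tree_root(struct inode *inode, int is_current)
	{
		struct dentry *dentry;

		if (!is_current)
			return d_obtain_alias(inode);	/* consumes the inode
							 * ref, even on failure */

		dentry = d_find_alias(inode);	/* referenced dentry or NULL */
		if (dentry) {
			iput(inode);		/* the alias pins the inode */
			return dentry;
		}
		return d_alloc_root(inode);	/* consumes the ref on success;
						 * on NULL the caller iput()s */
	}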
822static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
823 struct dentry **root_dentry)
824{
825 struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs;
826 struct nilfs_root *root;
827 int ret;
828
829 down_read(&nilfs->ns_segctor_sem);
830 ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno);
831 up_read(&nilfs->ns_segctor_sem);
832 if (ret < 0) {
833 ret = (ret == -ENOENT) ? -EINVAL : ret;
834 goto out;
835 } else if (!ret) {
836 printk(KERN_ERR "NILFS: The specified checkpoint is "
837 "not a snapshot (checkpoint number=%llu).\n",
838 (unsigned long long)cno);
839 ret = -EINVAL;
840 goto out;
841 }
842
843 ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root);
844 if (ret) {
845 printk(KERN_ERR "NILFS: error loading snapshot "
846 "(checkpoint number=%llu).\n",
847 (unsigned long long)cno);
848 goto out;
849 }
850 ret = nilfs_get_root_dentry(s, root, root_dentry);
851 nilfs_put_root(root);
852 out:
853 return ret;
854}
855
856static int nilfs_tree_was_touched(struct dentry *root_dentry)
857{
858 return root_dentry->d_count > 1;
859}
860
861/**
862 * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint
863 * @root_dentry: root dentry of the tree to be shrunk
864 *
865 * This function returns true if the tree was in use.
866 */
867static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
868{
869 if (have_submounts(root_dentry))
870 return true;
871 shrink_dcache_parent(root_dentry);
872 return nilfs_tree_was_touched(root_dentry);
873}
874
875int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
876{
877 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
878 struct nilfs_root *root;
879 struct inode *inode;
880 struct dentry *dentry;
881 int ret;
882
883 if (cno < 0 || cno > nilfs->ns_cno)
884 return false;
885
886 if (cno >= nilfs_last_cno(nilfs))
887 return true; /* protect recent checkpoints */
888
889 ret = false;
890 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
891 if (root) {
892 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
893 if (inode) {
894 dentry = d_find_alias(inode);
895 if (dentry) {
896 if (nilfs_tree_was_touched(dentry))
897 ret = nilfs_try_to_shrink_tree(dentry);
898 dput(dentry);
899 }
900 iput(inode);
901 }
902 nilfs_put_root(root);
903 }
904 return ret;
905}
906
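nilfs_checkpoint_is_mounted() above walks root object, root inode, then root dentry, and uses the dentry reference count as the busy test: d_count == 1 means the lookup itself holds the only reference. Shrinking first matters because otherwise child dentries that nobody uses still pin the root; the test reduces to this sketch:

	#include <linux/dcache.h>

	static int tree_is_busy(struct dentry *root_dentry)
	{
		if (have_submounts(root_dentry))
			return 1;		/* something is mounted below */
		shrink_dcache_parent(root_dentry);	/* drop unused children */
		return root_dentry->d_count > 1;	/* any ref besides ours? */
	}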
801/** 907/**
802 * nilfs_fill_super() - initialize a super block instance 908 * nilfs_fill_super() - initialize a super block instance
803 * @sb: super_block 909 * @sb: super_block
804 * @data: mount options 910 * @data: mount options
805 * @silent: silent mode flag 911 * @silent: silent mode flag
806 * @nilfs: the_nilfs struct
807 * 912 *
808 * This function is called exclusively by nilfs->ns_mount_mutex. 913 * This function is called exclusively by nilfs->ns_mount_mutex.
809 * So, the recovery process is protected from other simultaneous mounts. 914 * So, the recovery process is protected from other simultaneous mounts.
810 */ 915 */
811static int 916static int
812nilfs_fill_super(struct super_block *sb, void *data, int silent, 917nilfs_fill_super(struct super_block *sb, void *data, int silent)
813 struct the_nilfs *nilfs)
814{ 918{
919 struct the_nilfs *nilfs;
815 struct nilfs_sb_info *sbi; 920 struct nilfs_sb_info *sbi;
816 struct inode *root; 921 struct nilfs_root *fsroot;
922 struct backing_dev_info *bdi;
817 __u64 cno; 923 __u64 cno;
818 int err; 924 int err;
819 925
@@ -822,19 +928,21 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
822 return -ENOMEM; 928 return -ENOMEM;
823 929
824 sb->s_fs_info = sbi; 930 sb->s_fs_info = sbi;
931 sbi->s_super = sb;
825 932
826 get_nilfs(nilfs); 933 nilfs = alloc_nilfs(sb->s_bdev);
934 if (!nilfs) {
935 err = -ENOMEM;
936 goto failed_sbi;
937 }
827 sbi->s_nilfs = nilfs; 938 sbi->s_nilfs = nilfs;
828 sbi->s_super = sb;
829 atomic_set(&sbi->s_count, 1);
830 939
831 err = init_nilfs(nilfs, sbi, (char *)data); 940 err = init_nilfs(nilfs, sbi, (char *)data);
832 if (err) 941 if (err)
833 goto failed_sbi; 942 goto failed_nilfs;
834 943
835 spin_lock_init(&sbi->s_inode_lock); 944 spin_lock_init(&sbi->s_inode_lock);
836 INIT_LIST_HEAD(&sbi->s_dirty_files); 945 INIT_LIST_HEAD(&sbi->s_dirty_files);
837 INIT_LIST_HEAD(&sbi->s_list);
838 946
839 /* 947 /*
840 * Following initialization is overlapped because 948 * Following initialization is overlapped because
@@ -850,94 +958,59 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
850 sb->s_export_op = &nilfs_export_ops; 958 sb->s_export_op = &nilfs_export_ops;
851 sb->s_root = NULL; 959 sb->s_root = NULL;
852 sb->s_time_gran = 1; 960 sb->s_time_gran = 1;
853 sb->s_bdi = nilfs->ns_bdi; 961
962 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
963 sb->s_bdi = bdi ? : &default_backing_dev_info;
854 964
855 err = load_nilfs(nilfs, sbi); 965 err = load_nilfs(nilfs, sbi);
856 if (err) 966 if (err)
857 goto failed_sbi; 967 goto failed_nilfs;
858 968
859 cno = nilfs_last_cno(nilfs); 969 cno = nilfs_last_cno(nilfs);
860 970 err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
861 if (sb->s_flags & MS_RDONLY) {
862 if (nilfs_test_opt(sbi, SNAPSHOT)) {
863 down_read(&nilfs->ns_segctor_sem);
864 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
865 sbi->s_snapshot_cno);
866 up_read(&nilfs->ns_segctor_sem);
867 if (err < 0) {
868 if (err == -ENOENT)
869 err = -EINVAL;
870 goto failed_sbi;
871 }
872 if (!err) {
873 printk(KERN_ERR
874 "NILFS: The specified checkpoint is "
875 "not a snapshot "
876 "(checkpoint number=%llu).\n",
877 (unsigned long long)sbi->s_snapshot_cno);
878 err = -EINVAL;
879 goto failed_sbi;
880 }
881 cno = sbi->s_snapshot_cno;
882 }
883 }
884
885 err = nilfs_attach_checkpoint(sbi, cno);
886 if (err) { 971 if (err) {
887 printk(KERN_ERR "NILFS: error loading a checkpoint" 972 printk(KERN_ERR "NILFS: error loading last checkpoint "
888 " (checkpoint number=%llu).\n", (unsigned long long)cno); 973 "(checkpoint number=%llu).\n", (unsigned long long)cno);
889 goto failed_sbi; 974 goto failed_unload;
890 } 975 }
891 976
892 if (!(sb->s_flags & MS_RDONLY)) { 977 if (!(sb->s_flags & MS_RDONLY)) {
893 err = nilfs_attach_segment_constructor(sbi); 978 err = nilfs_attach_segment_constructor(sbi, fsroot);
894 if (err) 979 if (err)
895 goto failed_checkpoint; 980 goto failed_checkpoint;
896 } 981 }
897 982
898 root = nilfs_iget(sb, NILFS_ROOT_INO); 983 err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root);
899 if (IS_ERR(root)) { 984 if (err)
900 printk(KERN_ERR "NILFS: get root inode failed\n");
901 err = PTR_ERR(root);
902 goto failed_segctor;
903 }
904 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
905 iput(root);
906 printk(KERN_ERR "NILFS: corrupt root inode.\n");
907 err = -EINVAL;
908 goto failed_segctor;
909 }
910 sb->s_root = d_alloc_root(root);
911 if (!sb->s_root) {
912 iput(root);
913 printk(KERN_ERR "NILFS: get root dentry failed\n");
914 err = -ENOMEM;
915 goto failed_segctor; 985 goto failed_segctor;
916 } 986
987 nilfs_put_root(fsroot);
917 988
918 if (!(sb->s_flags & MS_RDONLY)) { 989 if (!(sb->s_flags & MS_RDONLY)) {
919 down_write(&nilfs->ns_sem); 990 down_write(&nilfs->ns_sem);
920 nilfs_setup_super(sbi); 991 nilfs_setup_super(sbi, true);
921 up_write(&nilfs->ns_sem); 992 up_write(&nilfs->ns_sem);
922 } 993 }
923 994
924 down_write(&nilfs->ns_super_sem);
925 if (!nilfs_test_opt(sbi, SNAPSHOT))
926 nilfs->ns_current = sbi;
927 up_write(&nilfs->ns_super_sem);
928
929 return 0; 995 return 0;
930 996
931 failed_segctor: 997 failed_segctor:
932 nilfs_detach_segment_constructor(sbi); 998 nilfs_detach_segment_constructor(sbi);
933 999
934 failed_checkpoint: 1000 failed_checkpoint:
935 nilfs_detach_checkpoint(sbi); 1001 nilfs_put_root(fsroot);
1002
1003 failed_unload:
1004 iput(nilfs->ns_sufile);
1005 iput(nilfs->ns_cpfile);
1006 iput(nilfs->ns_dat);
1007
1008 failed_nilfs:
1009 destroy_nilfs(nilfs);
936 1010
937 failed_sbi: 1011 failed_sbi:
938 put_nilfs(nilfs);
939 sb->s_fs_info = NULL; 1012 sb->s_fs_info = NULL;
940 nilfs_put_sbinfo(sbi); 1013 kfree(sbi);
941 return err; 1014 return err;
942} 1015}
943 1016
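The rewritten failure path of nilfs_fill_super() (failed_segctor → failed_checkpoint → failed_unload → failed_nilfs → failed_sbi) is the standard kernel goto unwind: each label releases exactly what had been set up before the failing step, in reverse order. Its skeleton, with stub steps for illustration only:

	static int setup_a(void) { return 0; }	/* stand-ins for real steps */
	static int setup_b(void) { return 0; }
	static void teardown_a(void) { }

	static int myfs_setup(void)
	{
		int err;

		err = setup_a();
		if (err)
			goto fail;
		err = setup_b();
		if (err)
			goto out_undo_a;	/* unwind only what succeeded */
		return 0;

	out_undo_a:
		teardown_a();
	fail:
		return err;
	}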
@@ -946,16 +1019,11 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
946 struct nilfs_sb_info *sbi = NILFS_SB(sb); 1019 struct nilfs_sb_info *sbi = NILFS_SB(sb);
947 struct the_nilfs *nilfs = sbi->s_nilfs; 1020 struct the_nilfs *nilfs = sbi->s_nilfs;
948 unsigned long old_sb_flags; 1021 unsigned long old_sb_flags;
949 struct nilfs_mount_options old_opts; 1022 unsigned long old_mount_opt;
950 int was_snapshot, err; 1023 int err;
951
952 lock_kernel();
953 1024
954 down_write(&nilfs->ns_super_sem);
955 old_sb_flags = sb->s_flags; 1025 old_sb_flags = sb->s_flags;
956 old_opts.mount_opt = sbi->s_mount_opt; 1026 old_mount_opt = sbi->s_mount_opt;
957 old_opts.snapshot_cno = sbi->s_snapshot_cno;
958 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
959 1027
960 if (!parse_options(data, sb, 1)) { 1028 if (!parse_options(data, sb, 1)) {
961 err = -EINVAL; 1029 err = -EINVAL;
@@ -964,11 +1032,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
964 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 1032 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
965 1033
966 err = -EINVAL; 1034 err = -EINVAL;
967 if (was_snapshot && !(*flags & MS_RDONLY)) {
968 printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
969 "read/write.\n", sb->s_id);
970 goto restore_opts;
971 }
972 1035
973 if (!nilfs_valid_fs(nilfs)) { 1036 if (!nilfs_valid_fs(nilfs)) {
974 printk(KERN_WARNING "NILFS (device %s): couldn't " 1037 printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -993,6 +1056,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
993 up_write(&nilfs->ns_sem); 1056 up_write(&nilfs->ns_sem);
994 } else { 1057 } else {
995 __u64 features; 1058 __u64 features;
1059 struct nilfs_root *root;
996 1060
997 /* 1061 /*
998 * Mounting a RDONLY partition read-write, so reread and 1062 * Mounting a RDONLY partition read-write, so reread and
@@ -1014,25 +1078,21 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1014 1078
1015 sb->s_flags &= ~MS_RDONLY; 1079 sb->s_flags &= ~MS_RDONLY;
1016 1080
1017 err = nilfs_attach_segment_constructor(sbi); 1081 root = NILFS_I(sb->s_root->d_inode)->i_root;
1082 err = nilfs_attach_segment_constructor(sbi, root);
1018 if (err) 1083 if (err)
1019 goto restore_opts; 1084 goto restore_opts;
1020 1085
1021 down_write(&nilfs->ns_sem); 1086 down_write(&nilfs->ns_sem);
1022 nilfs_setup_super(sbi); 1087 nilfs_setup_super(sbi, true);
1023 up_write(&nilfs->ns_sem); 1088 up_write(&nilfs->ns_sem);
1024 } 1089 }
1025 out: 1090 out:
1026 up_write(&nilfs->ns_super_sem);
1027 unlock_kernel();
1028 return 0; 1091 return 0;
1029 1092
1030 restore_opts: 1093 restore_opts:
1031 sb->s_flags = old_sb_flags; 1094 sb->s_flags = old_sb_flags;
1032 sbi->s_mount_opt = old_opts.mount_opt; 1095 sbi->s_mount_opt = old_mount_opt;
1033 sbi->s_snapshot_cno = old_opts.snapshot_cno;
1034 up_write(&nilfs->ns_super_sem);
1035 unlock_kernel();
1036 return err; 1096 return err;
1037} 1097}
1038 1098
@@ -1052,7 +1112,7 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1052{ 1112{
1053 char *p, *options = data; 1113 char *p, *options = data;
1054 substring_t args[MAX_OPT_ARGS]; 1114 substring_t args[MAX_OPT_ARGS];
1055 int option, token; 1115 int token;
1056 int ret = 0; 1116 int ret = 0;
1057 1117
1058 do { 1118 do {
@@ -1060,16 +1120,18 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1060 if (p != NULL && *p) { 1120 if (p != NULL && *p) {
1061 token = match_token(p, tokens, args); 1121 token = match_token(p, tokens, args);
1062 if (token == Opt_snapshot) { 1122 if (token == Opt_snapshot) {
1063 if (!(sd->flags & MS_RDONLY)) 1123 if (!(sd->flags & MS_RDONLY)) {
1064 ret++; 1124 ret++;
1065 else { 1125 } else {
1066 ret = match_int(&args[0], &option); 1126 sd->cno = simple_strtoull(args[0].from,
1067 if (!ret) { 1127 NULL, 0);
1068 if (option > 0) 1128 /*
1069 sd->cno = option; 1129 * No need to see the end pointer;
1070 else 1130 * match_token() has done syntax
1071 ret++; 1131 * checking.
1072 } 1132 */
1133 if (sd->cno == 0)
1134 ret++;
1073 } 1135 }
1074 } 1136 }
1075 if (ret) 1137 if (ret)
@@ -1086,43 +1148,33 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1086 1148
1087static int nilfs_set_bdev_super(struct super_block *s, void *data) 1149static int nilfs_set_bdev_super(struct super_block *s, void *data)
1088{ 1150{
1089 struct nilfs_super_data *sd = data; 1151 s->s_bdev = data;
1090
1091 s->s_bdev = sd->bdev;
1092 s->s_dev = s->s_bdev->bd_dev; 1152 s->s_dev = s->s_bdev->bd_dev;
1093 return 0; 1153 return 0;
1094} 1154}
1095 1155
1096static int nilfs_test_bdev_super(struct super_block *s, void *data) 1156static int nilfs_test_bdev_super(struct super_block *s, void *data)
1097{ 1157{
1098 struct nilfs_super_data *sd = data; 1158 return (void *)s->s_bdev == data;
1099
1100 return sd->sbi && s->s_fs_info == (void *)sd->sbi;
1101} 1159}
1102 1160
1103static int 1161static struct dentry *
1104nilfs_get_sb(struct file_system_type *fs_type, int flags, 1162nilfs_mount(struct file_system_type *fs_type, int flags,
1105 const char *dev_name, void *data, struct vfsmount *mnt) 1163 const char *dev_name, void *data)
1106{ 1164{
1107 struct nilfs_super_data sd; 1165 struct nilfs_super_data sd;
1108 struct super_block *s; 1166 struct super_block *s;
1109 fmode_t mode = FMODE_READ; 1167 fmode_t mode = FMODE_READ | FMODE_EXCL;
1110 struct the_nilfs *nilfs; 1168 struct dentry *root_dentry;
1111 int err, need_to_close = 1; 1169 int err, s_new = false;
1112 1170
1113 if (!(flags & MS_RDONLY)) 1171 if (!(flags & MS_RDONLY))
1114 mode |= FMODE_WRITE; 1172 mode |= FMODE_WRITE;
1115 1173
1116 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); 1174 sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1117 if (IS_ERR(sd.bdev)) 1175 if (IS_ERR(sd.bdev))
1118 return PTR_ERR(sd.bdev); 1176 return ERR_CAST(sd.bdev);
1119 1177
1120 /*
1121 * To get mount instance using sget() vfs-routine, NILFS needs
1122 * much more information than normal filesystems to identify mount
1123 * instance. For snapshot mounts, not only a mount type (ro-mount
1124 * or rw-mount) but also a checkpoint number is required.
1125 */
1126 sd.cno = 0; 1178 sd.cno = 0;
1127 sd.flags = flags; 1179 sd.flags = flags;
1128 if (nilfs_identify((char *)data, &sd)) { 1180 if (nilfs_identify((char *)data, &sd)) {
@@ -1130,101 +1182,91 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1130 goto failed; 1182 goto failed;
1131 } 1183 }
1132 1184
1133 nilfs = find_or_create_nilfs(sd.bdev);
1134 if (!nilfs) {
1135 err = -ENOMEM;
1136 goto failed;
1137 }
1138
1139 mutex_lock(&nilfs->ns_mount_mutex);
1140
1141 if (!sd.cno) {
1142 /*
1143 * Check if an exclusive mount exists or not.
1144 * Snapshot mounts coexist with a current mount
1145 * (i.e. rw-mount or ro-mount), whereas rw-mount and
1146 * ro-mount are mutually exclusive.
1147 */
1148 down_read(&nilfs->ns_super_sem);
1149 if (nilfs->ns_current &&
1150 ((nilfs->ns_current->s_super->s_flags ^ flags)
1151 & MS_RDONLY)) {
1152 up_read(&nilfs->ns_super_sem);
1153 err = -EBUSY;
1154 goto failed_unlock;
1155 }
1156 up_read(&nilfs->ns_super_sem);
1157 }
1158
1159 /*
1160 * Find existing nilfs_sb_info struct
1161 */
1162 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1163
1164 /* 1185 /*
1165 * Get super block instance holding the nilfs_sb_info struct. 1186 * once the super is inserted into the list by sget, s_umount
1166 * A new instance is allocated if no existing mount is present or 1187 * will protect the lockfs code from trying to start a snapshot
1167 * existing instance has been unmounted. 1188 * while we are mounting
1168 */ 1189 */
1169 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); 1190 mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
1170 if (sd.sbi) 1191 if (sd.bdev->bd_fsfreeze_count > 0) {
1171 nilfs_put_sbinfo(sd.sbi); 1192 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1172 1193 err = -EBUSY;
1194 goto failed;
1195 }
1196 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
1197 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1173 if (IS_ERR(s)) { 1198 if (IS_ERR(s)) {
1174 err = PTR_ERR(s); 1199 err = PTR_ERR(s);
1175 goto failed_unlock; 1200 goto failed;
1176 } 1201 }
1177 1202
1178 if (!s->s_root) { 1203 if (!s->s_root) {
1179 char b[BDEVNAME_SIZE]; 1204 char b[BDEVNAME_SIZE];
1180 1205
1206 s_new = true;
1207
1181 /* New superblock instance created */ 1208 /* New superblock instance created */
1182 s->s_flags = flags; 1209 s->s_flags = flags;
1183 s->s_mode = mode; 1210 s->s_mode = mode;
1184 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1211 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1185 sb_set_blocksize(s, block_size(sd.bdev)); 1212 sb_set_blocksize(s, block_size(sd.bdev));
1186 1213
1187 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, 1214 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1188 nilfs);
1189 if (err) 1215 if (err)
1190 goto cancel_new; 1216 goto failed_super;
1191 1217
1192 s->s_flags |= MS_ACTIVE; 1218 s->s_flags |= MS_ACTIVE;
1193 need_to_close = 0; 1219 } else if (!sd.cno) {
1220 int busy = false;
1221
1222 if (nilfs_tree_was_touched(s->s_root)) {
1223 busy = nilfs_try_to_shrink_tree(s->s_root);
1224 if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
1225 printk(KERN_ERR "NILFS: the device already "
1226 "has a %s mount.\n",
1227 (s->s_flags & MS_RDONLY) ?
1228 "read-only" : "read/write");
1229 err = -EBUSY;
1230 goto failed_super;
1231 }
1232 }
1233 if (!busy) {
1234 /*
1235 * Try remount to setup mount states if the current
1236 * tree is not mounted and only snapshots use this sb.
1237 */
1238 err = nilfs_remount(s, &flags, data);
1239 if (err)
1240 goto failed_super;
1241 }
1194 } 1242 }
1195 1243
1196 mutex_unlock(&nilfs->ns_mount_mutex); 1244 if (sd.cno) {
1197 put_nilfs(nilfs); 1245 err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
1198 if (need_to_close) 1246 if (err)
1199 close_bdev_exclusive(sd.bdev, mode); 1247 goto failed_super;
1200 simple_set_mnt(mnt, s); 1248 } else {
1201 return 0; 1249 root_dentry = dget(s->s_root);
1250 }
1202 1251
1203 failed_unlock: 1252 if (!s_new)
1204 mutex_unlock(&nilfs->ns_mount_mutex); 1253 blkdev_put(sd.bdev, mode);
1205 put_nilfs(nilfs);
1206 failed:
1207 close_bdev_exclusive(sd.bdev, mode);
1208 1254
1209 return err; 1255 return root_dentry;
1210 1256
1211 cancel_new: 1257 failed_super:
1212 /* Abandoning the newly allocated superblock */
1213 mutex_unlock(&nilfs->ns_mount_mutex);
1214 put_nilfs(nilfs);
1215 deactivate_locked_super(s); 1258 deactivate_locked_super(s);
1216 /* 1259
1217 * deactivate_locked_super() invokes close_bdev_exclusive(). 1260 failed:
1218 * We must finish all post-cleaning before this call; 1261 if (!s_new)
1219 * put_nilfs() needs the block device. 1262 blkdev_put(sd.bdev, mode);
1220 */ 1263 return ERR_PTR(err);
1221 return err;
1222} 1264}
1223 1265
1224struct file_system_type nilfs_fs_type = { 1266struct file_system_type nilfs_fs_type = {
1225 .owner = THIS_MODULE, 1267 .owner = THIS_MODULE,
1226 .name = "nilfs2", 1268 .name = "nilfs2",
1227 .get_sb = nilfs_get_sb, 1269 .mount = nilfs_mount,
1228 .kill_sb = kill_block_super, 1270 .kill_sb = kill_block_super,
1229 .fs_flags = FS_REQUIRES_DEV, 1271 .fs_flags = FS_REQUIRES_DEV,
1230}; 1272};
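nilfs_mount() above also adopts the exclusive-open API: blkdev_get_by_path() with FMODE_EXCL replaces open_bdev_exclusive(), the fs_type pointer serves as the exclusion holder, and every successful get must be balanced by blkdev_put() with the same mode, which is why the !s_new branches drop the extra reference once an existing superblock already holds its own. The pairing in isolation, as a sketch (the probe function and path are illustrative):

	#include <linux/blkdev.h>
	#include <linux/fs.h>

	static int probe_backing_dev(const char *path,
				     struct file_system_type *fs_type)
	{
		fmode_t mode = FMODE_READ | FMODE_EXCL;
		struct block_device *bdev;

		bdev = blkdev_get_by_path(path, mode, fs_type);
		if (IS_ERR(bdev))
			return PTR_ERR(bdev);
		/* ... inspect the device exclusively ... */
		blkdev_put(bdev, mode);	/* must pass the same mode bits */
		return 0;
	}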
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ba7c10c917fc..ad4ac607cf57 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,9 +35,6 @@
35#include "segbuf.h" 35#include "segbuf.h"
36 36
37 37
38static LIST_HEAD(nilfs_objects);
39static DEFINE_SPINLOCK(nilfs_lock);
40
41static int nilfs_valid_sb(struct nilfs_super_block *sbp); 38static int nilfs_valid_sb(struct nilfs_super_block *sbp);
42 39
43void nilfs_set_last_segment(struct the_nilfs *nilfs, 40void nilfs_set_last_segment(struct the_nilfs *nilfs,
@@ -61,16 +58,13 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
61} 58}
62 59
63/** 60/**
64 * alloc_nilfs - allocate the_nilfs structure 61 * alloc_nilfs - allocate a nilfs object
65 * @bdev: block device to which the_nilfs is related 62 * @bdev: block device to which the_nilfs is related
66 * 63 *
67 * alloc_nilfs() allocates memory for the_nilfs and
68 * initializes its reference count and locks.
69 *
70 * Return Value: On success, pointer to the_nilfs is returned. 64 * Return Value: On success, pointer to the_nilfs is returned.
71 * On error, NULL is returned. 65 * On error, NULL is returned.
72 */ 66 */
73static struct the_nilfs *alloc_nilfs(struct block_device *bdev) 67struct the_nilfs *alloc_nilfs(struct block_device *bdev)
74{ 68{
75 struct the_nilfs *nilfs; 69 struct the_nilfs *nilfs;
76 70
@@ -79,103 +73,38 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
79 return NULL; 73 return NULL;
80 74
81 nilfs->ns_bdev = bdev; 75 nilfs->ns_bdev = bdev;
82 atomic_set(&nilfs->ns_count, 1);
83 atomic_set(&nilfs->ns_ndirtyblks, 0); 76 atomic_set(&nilfs->ns_ndirtyblks, 0);
84 init_rwsem(&nilfs->ns_sem); 77 init_rwsem(&nilfs->ns_sem);
85 init_rwsem(&nilfs->ns_super_sem); 78 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
86 mutex_init(&nilfs->ns_mount_mutex);
87 init_rwsem(&nilfs->ns_writer_sem);
88 INIT_LIST_HEAD(&nilfs->ns_list);
89 INIT_LIST_HEAD(&nilfs->ns_supers);
90 spin_lock_init(&nilfs->ns_last_segment_lock); 79 spin_lock_init(&nilfs->ns_last_segment_lock);
91 nilfs->ns_gc_inodes_h = NULL; 80 nilfs->ns_cptree = RB_ROOT;
81 spin_lock_init(&nilfs->ns_cptree_lock);
92 init_rwsem(&nilfs->ns_segctor_sem); 82 init_rwsem(&nilfs->ns_segctor_sem);
93 83
94 return nilfs; 84 return nilfs;
95} 85}
96 86
97/** 87/**
98 * find_or_create_nilfs - find or create nilfs object 88 * destroy_nilfs - destroy nilfs object
99 * @bdev: block device to which the_nilfs is related 89 * @nilfs: nilfs object to be released
100 *
101 * find_nilfs() looks up an existent nilfs object created on the
102 * device and gets the reference count of the object. If no nilfs object
103 * is found on the device, a new nilfs object is allocated.
104 *
105 * Return Value: On success, pointer to the nilfs object is returned.
106 * On error, NULL is returned.
107 */
108struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
109{
110 struct the_nilfs *nilfs, *new = NULL;
111
112 retry:
113 spin_lock(&nilfs_lock);
114 list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
115 if (nilfs->ns_bdev == bdev) {
116 get_nilfs(nilfs);
117 spin_unlock(&nilfs_lock);
118 if (new)
119 put_nilfs(new);
120 return nilfs; /* existing object */
121 }
122 }
123 if (new) {
124 list_add_tail(&new->ns_list, &nilfs_objects);
125 spin_unlock(&nilfs_lock);
126 return new; /* new object */
127 }
128 spin_unlock(&nilfs_lock);
129
130 new = alloc_nilfs(bdev);
131 if (new)
132 goto retry;
133 return NULL; /* insufficient memory */
134}
135
136/**
137 * put_nilfs - release a reference to the_nilfs
138 * @nilfs: the_nilfs structure to be released
139 *
140 * put_nilfs() decrements a reference counter of the_nilfs.
141 * If the reference count reaches zero, the_nilfs is freed.
142 */ 90 */
143void put_nilfs(struct the_nilfs *nilfs) 91void destroy_nilfs(struct the_nilfs *nilfs)
144{ 92{
145 spin_lock(&nilfs_lock);
146 if (!atomic_dec_and_test(&nilfs->ns_count)) {
147 spin_unlock(&nilfs_lock);
148 return;
149 }
150 list_del_init(&nilfs->ns_list);
151 spin_unlock(&nilfs_lock);
152
153 /*
154 * Increment of ns_count never occurs below because the caller
155 * of get_nilfs() holds at least one reference to the_nilfs.
156 * Thus its exclusion control is not required here.
157 */
158
159 might_sleep(); 93 might_sleep();
160 if (nilfs_loaded(nilfs)) {
161 nilfs_mdt_destroy(nilfs->ns_sufile);
162 nilfs_mdt_destroy(nilfs->ns_cpfile);
163 nilfs_mdt_destroy(nilfs->ns_dat);
164 nilfs_mdt_destroy(nilfs->ns_gc_dat);
165 }
166 if (nilfs_init(nilfs)) { 94 if (nilfs_init(nilfs)) {
167 nilfs_destroy_gccache(nilfs);
168 brelse(nilfs->ns_sbh[0]); 95 brelse(nilfs->ns_sbh[0]);
169 brelse(nilfs->ns_sbh[1]); 96 brelse(nilfs->ns_sbh[1]);
170 } 97 }
171 kfree(nilfs); 98 kfree(nilfs);
172} 99}
173 100
174static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block) 101static int nilfs_load_super_root(struct the_nilfs *nilfs,
102 struct super_block *sb, sector_t sr_block)
175{ 103{
176 struct buffer_head *bh_sr; 104 struct buffer_head *bh_sr;
177 struct nilfs_super_root *raw_sr; 105 struct nilfs_super_root *raw_sr;
178 struct nilfs_super_block **sbp = nilfs->ns_sbp; 106 struct nilfs_super_block **sbp = nilfs->ns_sbp;
107 struct nilfs_inode *rawi;
179 unsigned dat_entry_size, segment_usage_size, checkpoint_size; 108 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
180 unsigned inode_size; 109 unsigned inode_size;
181 int err; 110 int err;
@@ -192,40 +121,22 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
192 121
193 inode_size = nilfs->ns_inode_size; 122 inode_size = nilfs->ns_inode_size;
194 123
195 err = -ENOMEM; 124 rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size);
196 nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size); 125 err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat);
197 if (unlikely(!nilfs->ns_dat)) 126 if (err)
198 goto failed; 127 goto failed;
199 128
200 nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size); 129 rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size);
201 if (unlikely(!nilfs->ns_gc_dat)) 130 err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile);
131 if (err)
202 goto failed_dat; 132 goto failed_dat;
203 133
204 nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size); 134 rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size);
205 if (unlikely(!nilfs->ns_cpfile)) 135 err = nilfs_sufile_read(sb, segment_usage_size, rawi,
206 goto failed_gc_dat; 136 &nilfs->ns_sufile);
207 137 if (err)
208 nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
209 if (unlikely(!nilfs->ns_sufile))
210 goto failed_cpfile; 138 goto failed_cpfile;
211 139
212 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
213
214 err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
215 NILFS_SR_DAT_OFFSET(inode_size));
216 if (unlikely(err))
217 goto failed_sufile;
218
219 err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
220 NILFS_SR_CPFILE_OFFSET(inode_size));
221 if (unlikely(err))
222 goto failed_sufile;
223
224 err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
225 NILFS_SR_SUFILE_OFFSET(inode_size));
226 if (unlikely(err))
227 goto failed_sufile;
228
229 raw_sr = (struct nilfs_super_root *)bh_sr->b_data; 140 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
230 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); 141 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
231 142
@@ -233,17 +144,11 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
233 brelse(bh_sr); 144 brelse(bh_sr);
234 return err; 145 return err;
235 146
236 failed_sufile:
237 nilfs_mdt_destroy(nilfs->ns_sufile);
238
239 failed_cpfile: 147 failed_cpfile:
240 nilfs_mdt_destroy(nilfs->ns_cpfile); 148 iput(nilfs->ns_cpfile);
241
242 failed_gc_dat:
243 nilfs_mdt_destroy(nilfs->ns_gc_dat);
244 149
245 failed_dat: 150 failed_dat:
246 nilfs_mdt_destroy(nilfs->ns_dat); 151 iput(nilfs->ns_dat);
247 goto failed; 152 goto failed;
248} 153}
249 154
@@ -306,15 +211,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
306 int valid_fs = nilfs_valid_fs(nilfs); 211 int valid_fs = nilfs_valid_fs(nilfs);
307 int err; 212 int err;
308 213
309 if (nilfs_loaded(nilfs)) {
310 if (valid_fs ||
311 ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
312 return 0;
313 printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
314 "recovery state.\n");
315 return -EINVAL;
316 }
317
318 if (!valid_fs) { 214 if (!valid_fs) {
319 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); 215 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
320 if (s_flags & MS_RDONLY) { 216 if (s_flags & MS_RDONLY) {
@@ -375,7 +271,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
375 goto scan_error; 271 goto scan_error;
376 } 272 }
377 273
378 err = nilfs_load_super_root(nilfs, ri.ri_super_root); 274 err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root);
379 if (unlikely(err)) { 275 if (unlikely(err)) {
380 printk(KERN_ERR "NILFS: error loading super root.\n"); 276 printk(KERN_ERR "NILFS: error loading super root.\n");
381 goto failed; 277 goto failed;
@@ -433,7 +329,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
433 printk(KERN_INFO "NILFS: recovery complete.\n"); 329 printk(KERN_INFO "NILFS: recovery complete.\n");
434 330
435 skip_recovery: 331 skip_recovery:
436 set_nilfs_loaded(nilfs);
437 nilfs_clear_recovery_info(&ri); 332 nilfs_clear_recovery_info(&ri);
438 sbi->s_super->s_flags = s_flags; 333 sbi->s_super->s_flags = s_flags;
439 return 0; 334 return 0;
@@ -443,10 +338,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
443 goto failed; 338 goto failed;
444 339
445 failed_unload: 340 failed_unload:
446 nilfs_mdt_destroy(nilfs->ns_cpfile); 341 iput(nilfs->ns_cpfile);
447 nilfs_mdt_destroy(nilfs->ns_sufile); 342 iput(nilfs->ns_sufile);
448 nilfs_mdt_destroy(nilfs->ns_dat); 343 iput(nilfs->ns_dat);
449 nilfs_mdt_destroy(nilfs->ns_gc_dat);
450 344
451 failed: 345 failed:
452 nilfs_clear_recovery_info(&ri); 346 nilfs_clear_recovery_info(&ri);
@@ -468,8 +362,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
468static int nilfs_store_disk_layout(struct the_nilfs *nilfs, 362static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
469 struct nilfs_super_block *sbp) 363 struct nilfs_super_block *sbp)
470{ 364{
471 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { 365 if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
472 printk(KERN_ERR "NILFS: revision mismatch " 366 printk(KERN_ERR "NILFS: unsupported revision "
473 "(superblock rev.=%d.%d, current rev.=%d.%d). " 367 "(superblock rev.=%d.%d, current rev.=%d.%d). "
474 "Please check the version of mkfs.nilfs.\n", 368 "Please check the version of mkfs.nilfs.\n",
475 le32_to_cpu(sbp->s_rev_level), 369 le32_to_cpu(sbp->s_rev_level),
@@ -631,12 +525,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
631 * 525 *
632 * init_nilfs() performs common initialization per block device (e.g. 526 * init_nilfs() performs common initialization per block device (e.g.
633 * reading the super block, getting disk layout information, initializing 527 * reading the super block, getting disk layout information, initializing
634 * shared fields in the_nilfs). It takes on some portion of the jobs 528 * shared fields in the_nilfs).
635 * typically done by a fill_super() routine. This division arises from
636 * the nature that multiple NILFS instances may be simultaneously
637 * mounted on a device.
638 * For multiple mounts on the same device, only the first mount
639 * invokes these tasks.
640 * 529 *
641 * Return Value: On success, 0 is returned. On error, a negative error 530 * Return Value: On success, 0 is returned. On error, a negative error
642 * code is returned. 531 * code is returned.
@@ -645,32 +534,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
645{ 534{
646 struct super_block *sb = sbi->s_super; 535 struct super_block *sb = sbi->s_super;
647 struct nilfs_super_block *sbp; 536 struct nilfs_super_block *sbp;
648 struct backing_dev_info *bdi;
649 int blocksize; 537 int blocksize;
650 int err; 538 int err;
651 539
652 down_write(&nilfs->ns_sem); 540 down_write(&nilfs->ns_sem);
653 if (nilfs_init(nilfs)) {
654 /* Load values from existing the_nilfs */
655 sbp = nilfs->ns_sbp[0];
656 err = nilfs_store_magic_and_option(sb, sbp, data);
657 if (err)
658 goto out;
659
660 err = nilfs_check_feature_compatibility(sb, sbp);
661 if (err)
662 goto out;
663
664 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
665 if (sb->s_blocksize != blocksize &&
666 !sb_set_blocksize(sb, blocksize)) {
667 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
668 blocksize);
669 err = -EINVAL;
670 }
671 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
672 goto out;
673 }
674 541
675 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); 542 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
676 if (!blocksize) { 543 if (!blocksize) {
@@ -729,18 +596,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
729 596
730 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 597 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
731 598
732 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
733 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
734
735 err = nilfs_store_log_cursor(nilfs, sbp); 599 err = nilfs_store_log_cursor(nilfs, sbp);
736 if (err) 600 if (err)
737 goto failed_sbh; 601 goto failed_sbh;
738 602
739 /* Initialize gcinode cache */
740 err = nilfs_init_gccache(nilfs);
741 if (err)
742 goto failed_sbh;
743
744 set_nilfs_init(nilfs); 603 set_nilfs_init(nilfs);
745 err = 0; 604 err = 0;
746 out: 605 out:
@@ -775,9 +634,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
775 ret = blkdev_issue_discard(nilfs->ns_bdev, 634 ret = blkdev_issue_discard(nilfs->ns_bdev,
776 start * sects_per_block, 635 start * sects_per_block,
777 nblocks * sects_per_block, 636 nblocks * sects_per_block,
778 GFP_NOFS, 637 GFP_NOFS, 0);
779 BLKDEV_IFL_WAIT |
780 BLKDEV_IFL_BARRIER);
781 if (ret < 0) 638 if (ret < 0)
782 return ret; 639 return ret;
783 nblocks = 0; 640 nblocks = 0;
@@ -787,19 +644,17 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
787 ret = blkdev_issue_discard(nilfs->ns_bdev, 644 ret = blkdev_issue_discard(nilfs->ns_bdev,
788 start * sects_per_block, 645 start * sects_per_block,
789 nblocks * sects_per_block, 646 nblocks * sects_per_block,
790 GFP_NOFS, 647 GFP_NOFS, 0);
791 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
792 return ret; 648 return ret;
793} 649}
794 650
795int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 651int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
796{ 652{
797 struct inode *dat = nilfs_dat_inode(nilfs);
798 unsigned long ncleansegs; 653 unsigned long ncleansegs;
799 654
800 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 655 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
801 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); 656 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
802 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 657 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
803 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; 658 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
804 return 0; 659 return 0;
805} 660}
@@ -815,79 +670,92 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
815 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; 670 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
816} 671}
817 672
818/** 673struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
819 * nilfs_find_sbinfo - find existing nilfs_sb_info structure
820 * @nilfs: nilfs object
821 * @rw_mount: mount type (non-zero value for read/write mount)
822 * @cno: checkpoint number (zero for read-only mount)
823 *
824 * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
825 * @rw_mount and @cno (in case of snapshots) matched. If no instance
826 * was found, NULL is returned. Although the super block instance can
827 * be unmounted after this function returns, the nilfs_sb_info struct
828 * is kept on memory until nilfs_put_sbinfo() is called.
829 */
830struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
831 int rw_mount, __u64 cno)
832{ 674{
833 struct nilfs_sb_info *sbi; 675 struct rb_node *n;
834 676 struct nilfs_root *root;
835 down_read(&nilfs->ns_super_sem); 677
836 /* 678 spin_lock(&nilfs->ns_cptree_lock);
837 * The SNAPSHOT flag and sb->s_flags are supposed to be 679 n = nilfs->ns_cptree.rb_node;
838 * protected with nilfs->ns_super_sem. 680 while (n) {
839 */ 681 root = rb_entry(n, struct nilfs_root, rb_node);
840 sbi = nilfs->ns_current; 682
841 if (rw_mount) { 683 if (cno < root->cno) {
842 if (sbi && !(sbi->s_super->s_flags & MS_RDONLY)) 684 n = n->rb_left;
843 goto found; /* read/write mount */ 685 } else if (cno > root->cno) {
844 else 686 n = n->rb_right;
845 goto out; 687 } else {
846 } else if (cno == 0) { 688 atomic_inc(&root->count);
847 if (sbi && (sbi->s_super->s_flags & MS_RDONLY)) 689 spin_unlock(&nilfs->ns_cptree_lock);
848 goto found; /* read-only mount */ 690 return root;
849 else 691 }
850 goto out;
851 } 692 }
693 spin_unlock(&nilfs->ns_cptree_lock);
852 694
853 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
854 if (nilfs_test_opt(sbi, SNAPSHOT) &&
855 sbi->s_snapshot_cno == cno)
856 goto found; /* snapshot mount */
857 }
858 out:
859 up_read(&nilfs->ns_super_sem);
860 return NULL; 695 return NULL;
861
862 found:
863 atomic_inc(&sbi->s_count);
864 up_read(&nilfs->ns_super_sem);
865 return sbi;
866} 696}
867 697
868int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, 698struct nilfs_root *
869 int snapshot_mount) 699nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
870{ 700{
871 struct nilfs_sb_info *sbi; 701 struct rb_node **p, *parent;
872 int ret = 0; 702 struct nilfs_root *root, *new;
873 703
874 down_read(&nilfs->ns_super_sem); 704 root = nilfs_lookup_root(nilfs, cno);
875 if (cno == 0 || cno > nilfs->ns_cno) 705 if (root)
876 goto out_unlock; 706 return root;
877 707
878 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { 708 new = kmalloc(sizeof(*root), GFP_KERNEL);
879 if (sbi->s_snapshot_cno == cno && 709 if (!new)
880 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) { 710 return NULL;
881 /* exclude read-only mounts */ 711
882 ret++; 712 spin_lock(&nilfs->ns_cptree_lock);
883 break; 713
714 p = &nilfs->ns_cptree.rb_node;
715 parent = NULL;
716
717 while (*p) {
718 parent = *p;
719 root = rb_entry(parent, struct nilfs_root, rb_node);
720
721 if (cno < root->cno) {
722 p = &(*p)->rb_left;
723 } else if (cno > root->cno) {
724 p = &(*p)->rb_right;
725 } else {
726 atomic_inc(&root->count);
727 spin_unlock(&nilfs->ns_cptree_lock);
728 kfree(new);
729 return root;
884 } 730 }
885 } 731 }
886 /* for protecting recent checkpoints */
887 if (cno >= nilfs_last_cno(nilfs))
888 ret++;
889 732
890 out_unlock: 733 new->cno = cno;
891 up_read(&nilfs->ns_super_sem); 734 new->ifile = NULL;
892 return ret; 735 new->nilfs = nilfs;
736 atomic_set(&new->count, 1);
737 atomic_set(&new->inodes_count, 0);
738 atomic_set(&new->blocks_count, 0);
739
740 rb_link_node(&new->rb_node, parent, p);
741 rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
742
743 spin_unlock(&nilfs->ns_cptree_lock);
744
745 return new;
746}
747
748void nilfs_put_root(struct nilfs_root *root)
749{
750 if (atomic_dec_and_test(&root->count)) {
751 struct the_nilfs *nilfs = root->nilfs;
752
753 spin_lock(&nilfs->ns_cptree_lock);
754 rb_erase(&root->rb_node, &nilfs->ns_cptree);
755 spin_unlock(&nilfs->ns_cptree_lock);
756 if (root->ifile)
757 iput(root->ifile);
758
759 kfree(root);
760 }
893} 761}
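nilfs_find_or_create_root() above is a lock-friendly find-or-create: the candidate is allocated with GFP_KERNEL before the spinlock is taken (the allocation may sleep), the search runs under the lock, and the spare is discarded if another thread inserted the same checkpoint first. Stripped to the rbtree mechanics, as a sketch with a plain refcount managed under the lock:

	#include <linux/rbtree.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>
	#include <linux/types.h>

	struct cnode {
		__u64 key;
		struct rb_node rb;
		int count;	/* refcount, managed under *lock here */
	};

	static struct cnode *find_or_create(struct rb_root *tree,
					    spinlock_t *lock, __u64 key)
	{
		struct rb_node **p, *parent = NULL;
		struct cnode *n, *new;

		new = kmalloc(sizeof(*new), GFP_KERNEL); /* may sleep: unlocked */
		if (!new)
			return NULL;

		spin_lock(lock);
		p = &tree->rb_node;
		while (*p) {
			parent = *p;
			n = rb_entry(parent, struct cnode, rb);
			if (key < n->key) {
				p = &(*p)->rb_left;
			} else if (key > n->key) {
				p = &(*p)->rb_right;
			} else {
				n->count++;		/* lost the race: reuse */
				spin_unlock(lock);
				kfree(new);		/* discard the spare */
				return n;
			}
		}
		new->key = key;
		new->count = 1;
		rb_link_node(&new->rb, parent, p);	/* link at the free slot */
		rb_insert_color(&new->rb, tree);	/* rebalance */
		spin_unlock(lock);
		return new;
	}

The matching put side, as nilfs_put_root() shows, erases the node from the tree under the same lock once the last reference is dropped.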
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index f785a7b0ab99..fd85e4c05c6b 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -26,6 +26,7 @@
26 26
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/rbtree.h>
29#include <linux/fs.h> 30#include <linux/fs.h>
30#include <linux/blkdev.h> 31#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
@@ -35,8 +36,6 @@
35/* the_nilfs struct */ 36/* the_nilfs struct */
36enum { 37enum {
37 THE_NILFS_INIT = 0, /* Information from super_block is set */ 38 THE_NILFS_INIT = 0, /* Information from super_block is set */
38 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
39 the latest checkpoint was loaded */
40 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
41 THE_NILFS_GC_RUNNING, /* gc process is running */ 40 THE_NILFS_GC_RUNNING, /* gc process is running */
42 THE_NILFS_SB_DIRTY, /* super block is dirty */ 41 THE_NILFS_SB_DIRTY, /* super block is dirty */
@@ -45,22 +44,13 @@ enum {
45/** 44/**
46 * struct the_nilfs - struct to supervise multiple nilfs mount points 45 * struct the_nilfs - struct to supervise multiple nilfs mount points
47 * @ns_flags: flags 46 * @ns_flags: flags
48 * @ns_count: reference count
49 * @ns_list: list head for nilfs_list
50 * @ns_bdev: block device 47 * @ns_bdev: block device
51 * @ns_bdi: backing dev info
52 * @ns_writer: back pointer to writable nilfs_sb_info
53 * @ns_sem: semaphore for shared states 48 * @ns_sem: semaphore for shared states
54 * @ns_super_sem: semaphore for global operations across super block instances
55 * @ns_mount_mutex: mutex protecting mount process of nilfs
56 * @ns_writer_sem: semaphore protecting ns_writer attach/detach
57 * @ns_current: back pointer to current mount
58 * @ns_sbh: buffer heads of on-disk super blocks 49 * @ns_sbh: buffer heads of on-disk super blocks
59 * @ns_sbp: pointers to super block data 50 * @ns_sbp: pointers to super block data
60 * @ns_sbwtime: previous write time of super block 51 * @ns_sbwtime: previous write time of super block
61 * @ns_sbwcount: write count of super block 52 * @ns_sbwcount: write count of super block
62 * @ns_sbsize: size of valid data in super block 53 * @ns_sbsize: size of valid data in super block
63 * @ns_supers: list of nilfs super block structs
64 * @ns_seg_seq: segment sequence counter 54 * @ns_seg_seq: segment sequence counter
65 * @ns_segnum: index number of the latest full segment. 55 * @ns_segnum: index number of the latest full segment.
66 * @ns_nextnum: index number of the full segment index to be used next 56 * @ns_nextnum: index number of the full segment index to be used next
@@ -79,9 +69,9 @@ enum {
79 * @ns_dat: DAT file inode 69 * @ns_dat: DAT file inode
80 * @ns_cpfile: checkpoint file inode 70 * @ns_cpfile: checkpoint file inode
81 * @ns_sufile: segusage file inode 71 * @ns_sufile: segusage file inode
82 * @ns_gc_dat: shadow inode of the DAT file inode for GC 72 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
73 * @ns_cptree_lock: lock protecting @ns_cptree
83 * @ns_gc_inodes: dummy inodes to keep live blocks 74 * @ns_gc_inodes: dummy inodes to keep live blocks
84 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
85 * @ns_blocksize_bits: bit length of block size 75 * @ns_blocksize_bits: bit length of block size
86 * @ns_blocksize: block size 76 * @ns_blocksize: block size
87 * @ns_nsegments: number of segments in filesystem 77 * @ns_nsegments: number of segments in filesystem
@@ -95,22 +85,9 @@ enum {
95 */ 85 */
96struct the_nilfs { 86struct the_nilfs {
97 unsigned long ns_flags; 87 unsigned long ns_flags;
98 atomic_t ns_count;
99 struct list_head ns_list;
100 88
101 struct block_device *ns_bdev; 89 struct block_device *ns_bdev;
102 struct backing_dev_info *ns_bdi;
103 struct nilfs_sb_info *ns_writer;
104 struct rw_semaphore ns_sem; 90 struct rw_semaphore ns_sem;
105 struct rw_semaphore ns_super_sem;
106 struct mutex ns_mount_mutex;
107 struct rw_semaphore ns_writer_sem;
108
109 /*
110 * components protected by ns_super_sem
111 */
112 struct nilfs_sb_info *ns_current;
113 struct list_head ns_supers;
114 91
115 /* 92 /*
116 * used for 93 * used for
@@ -163,11 +140,13 @@ struct the_nilfs {
163 struct inode *ns_dat; 140 struct inode *ns_dat;
164 struct inode *ns_cpfile; 141 struct inode *ns_cpfile;
165 struct inode *ns_sufile; 142 struct inode *ns_sufile;
166 struct inode *ns_gc_dat;
167 143
168 /* GC inode list and hash table head */ 144 /* Checkpoint tree */
145 struct rb_root ns_cptree;
146 spinlock_t ns_cptree_lock;
147
148 /* GC inode list */
169 struct list_head ns_gc_inodes; 149 struct list_head ns_gc_inodes;
170 struct hlist_head *ns_gc_inodes_h;
171 150
172 /* Disk layout information (static) */ 151 /* Disk layout information (static) */
173 unsigned int ns_blocksize_bits; 152 unsigned int ns_blocksize_bits;
@@ -182,9 +161,6 @@ struct the_nilfs {
182 u32 ns_crc_seed; 161 u32 ns_crc_seed;
183}; 162};
184 163
185#define NILFS_GCINODE_HASH_BITS 8
186#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
187
188#define THE_NILFS_FNS(bit, name) \ 164#define THE_NILFS_FNS(bit, name) \
189static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ 165static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
190{ \ 166{ \
@@ -200,11 +176,36 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
200} 176}
201 177
202THE_NILFS_FNS(INIT, init) 178THE_NILFS_FNS(INIT, init)
203THE_NILFS_FNS(LOADED, loaded)
204THE_NILFS_FNS(DISCONTINUED, discontinued) 179THE_NILFS_FNS(DISCONTINUED, discontinued)
205THE_NILFS_FNS(GC_RUNNING, gc_running) 180THE_NILFS_FNS(GC_RUNNING, gc_running)
206THE_NILFS_FNS(SB_DIRTY, sb_dirty) 181THE_NILFS_FNS(SB_DIRTY, sb_dirty)
207 182
183/**
184 * struct nilfs_root - nilfs root object
185 * @cno: checkpoint number
186 * @rb_node: red-black tree node
187 * @count: refcount of this structure
188 * @nilfs: nilfs object
189 * @ifile: inode file
190 * @root: root inode
191 * @inodes_count: number of inodes
192 * @blocks_count: number of blocks (Reserved)
193 */
194struct nilfs_root {
195 __u64 cno;
196 struct rb_node rb_node;
197
198 atomic_t count;
199 struct the_nilfs *nilfs;
200 struct inode *ifile;
201
202 atomic_t inodes_count;
203 atomic_t blocks_count;
204};
205
206/* Special checkpoint number */
207#define NILFS_CPTREE_CURRENT_CNO 0
208
208/* Minimum interval of periodical update of superblocks (in seconds) */ 209/* Minimum interval of periodical update of superblocks (in seconds) */
209#define NILFS_SB_FREQ 10 210#define NILFS_SB_FREQ 10
210 211
@@ -221,46 +222,25 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
221} 222}
222 223
223void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 224void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
224struct the_nilfs *find_or_create_nilfs(struct block_device *); 225struct the_nilfs *alloc_nilfs(struct block_device *bdev);
225void put_nilfs(struct the_nilfs *); 226void destroy_nilfs(struct the_nilfs *nilfs);
226int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 227int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
227int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 228int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
228int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 229int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
229int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 230int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
231struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
232struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
233 __u64 cno);
234void nilfs_put_root(struct nilfs_root *root);
230struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 235struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
231int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
232int nilfs_near_disk_full(struct the_nilfs *); 236int nilfs_near_disk_full(struct the_nilfs *);
233void nilfs_fall_back_super_block(struct the_nilfs *); 237void nilfs_fall_back_super_block(struct the_nilfs *);
234void nilfs_swap_super_block(struct the_nilfs *); 238void nilfs_swap_super_block(struct the_nilfs *);
235 239
236 240
237static inline void get_nilfs(struct the_nilfs *nilfs) 241static inline void nilfs_get_root(struct nilfs_root *root)
238{
239 /* Caller must have at least one reference of the_nilfs. */
240 atomic_inc(&nilfs->ns_count);
241}
242
243static inline void
244nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
245{
246 down_write(&nilfs->ns_writer_sem);
247 nilfs->ns_writer = sbi;
248 up_write(&nilfs->ns_writer_sem);
249}
250
251static inline void
252nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
253{
254 down_write(&nilfs->ns_writer_sem);
255 if (sbi == nilfs->ns_writer)
256 nilfs->ns_writer = NULL;
257 up_write(&nilfs->ns_writer_sem);
258}
259
260static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
261{ 242{
262 if (atomic_dec_and_test(&sbi->s_count)) 243 atomic_inc(&root->count);
263 kfree(sbi);
264} 244}
265 245
266static inline int nilfs_valid_fs(struct the_nilfs *nilfs) 246static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
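
The the_nilfs.h changes above replace the old global mount bookkeeping (ns_count, ns_list, ns_supers, ns_current, ns_writer) with a per-device red-black tree of mounted checkpoints, each represented by a refcounted struct nilfs_root. A minimal sketch of a lookup over that tree, assuming the ns_cptree_lock shown above and the stock <linux/rbtree.h> helpers; it mirrors the nilfs_lookup_root() prototype but is illustrative rather than the driver's verbatim code:

/*
 * Sketch (as it would live in the_nilfs.c): find a mounted checkpoint
 * by number in ns_cptree and take a reference under ns_cptree_lock.
 */
static struct nilfs_root *lookup_root_sketch(struct the_nilfs *nilfs,
					     __u64 cno)
{
	struct rb_node *n;
	struct nilfs_root *root;

	spin_lock(&nilfs->ns_cptree_lock);
	n = nilfs->ns_cptree.rb_node;
	while (n) {
		root = rb_entry(n, struct nilfs_root, rb_node);
		if (cno < root->cno) {
			n = n->rb_left;
		} else if (cno > root->cno) {
			n = n->rb_right;
		} else {
			atomic_inc(&root->count);  /* i.e. nilfs_get_root() */
			spin_unlock(&nilfs->ns_cptree_lock);
			return root;
		}
	}
	spin_unlock(&nilfs->ns_cptree_lock);
	return NULL;
}

NILFS_CPTREE_CURRENT_CNO (0) gives the current writable mount a fixed key, so it and the read-only snapshot mounts at real checkpoint numbers share the same tree.
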
diff --git a/fs/no-block.c b/fs/no-block.c
index d269a93d3467..6e40e42a43de 100644
--- a/fs/no-block.c
+++ b/fs/no-block.c
@@ -19,4 +19,5 @@ static int no_blkdev_open(struct inode * inode, struct file * filp)
19 19
20const struct file_operations def_blk_fops = { 20const struct file_operations def_blk_fops = {
21 .open = no_blkdev_open, 21 .open = no_blkdev_open,
22 .llseek = noop_llseek,
22}; 23};
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index b388443c3a09..22c629eedd82 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
3 3
4source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
6#source "fs/notify/fanotify/Kconfig" 6source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 3ac36b7bf6b9..7dceff005a67 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -6,7 +6,7 @@ config FANOTIFY
6 ---help--- 6 ---help---
 7 Say Y here to enable fanotify support. fanotify is a file access 7 Say Y here to enable fanotify support. fanotify is a file access
8 notification system which differs from inotify in that it sends 8 notification system which differs from inotify in that it sends
9 and open file descriptor to the userspace listener along with 9 an open file descriptor to the userspace listener along with
10 the event. 10 the event.
11 11
12 If unsure, say Y. 12 If unsure, say Y.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 85366c78cc37..f35794b97e8e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
92 92
93 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
94 94
95 wait_event(group->fanotify_data.access_waitq, event->response); 95 wait_event(group->fanotify_data.access_waitq, event->response ||
96 atomic_read(&group->fanotify_data.bypass_perm));
97
98 if (!event->response) /* bypass_perm set */
99 return 0;
96 100
97 /* userspace responded, convert to something usable */ 101 /* userspace responded, convert to something usable */
98 spin_lock(&event->lock); 102 spin_lock(&event->lock);
@@ -131,6 +135,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
131 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); 135 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
132 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); 136 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
133 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); 137 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
138 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
134 139
135 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 140 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
136 141
@@ -160,20 +165,21 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
160 __u32 event_mask, void *data, int data_type) 165 __u32 event_mask, void *data, int data_type)
161{ 166{
162 __u32 marks_mask, marks_ignored_mask; 167 __u32 marks_mask, marks_ignored_mask;
168 struct path *path = data;
163 169
164 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " 170 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
165 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 171 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
166 inode_mark, vfsmnt_mark, event_mask, data, data_type); 172 inode_mark, vfsmnt_mark, event_mask, data, data_type);
167 173
168 /* sorry, fanotify only gives a damn about files and dirs */
169 if (!S_ISREG(to_tell->i_mode) &&
170 !S_ISDIR(to_tell->i_mode))
171 return false;
172
173 /* if we don't have enough info to send an event to userspace say no */ 174 /* if we don't have enough info to send an event to userspace say no */
174 if (data_type != FSNOTIFY_EVENT_PATH) 175 if (data_type != FSNOTIFY_EVENT_PATH)
175 return false; 176 return false;
176 177
178 /* sorry, fanotify only gives a damn about files and dirs */
179 if (!S_ISREG(path->dentry->d_inode->i_mode) &&
180 !S_ISDIR(path->dentry->d_inode->i_mode))
181 return false;
182
177 if (inode_mark && vfsmnt_mark) { 183 if (inode_mark && vfsmnt_mark) {
178 marks_mask = (vfsmnt_mark->mask | inode_mark->mask); 184 marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
179 marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); 185 marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
@@ -194,16 +200,29 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
194 BUG(); 200 BUG();
195 } 201 }
196 202
203 if (S_ISDIR(path->dentry->d_inode->i_mode) &&
204 (marks_ignored_mask & FS_ISDIR))
205 return false;
206
197 if (event_mask & marks_mask & ~marks_ignored_mask) 207 if (event_mask & marks_mask & ~marks_ignored_mask)
198 return true; 208 return true;
199 209
200 return false; 210 return false;
201} 211}
202 212
213static void fanotify_free_group_priv(struct fsnotify_group *group)
214{
215 struct user_struct *user;
216
217 user = group->fanotify_data.user;
218 atomic_dec(&user->fanotify_listeners);
219 free_uid(user);
220}
221
203const struct fsnotify_ops fanotify_fsnotify_ops = { 222const struct fsnotify_ops fanotify_fsnotify_ops = {
204 .handle_event = fanotify_handle_event, 223 .handle_event = fanotify_handle_event,
205 .should_send_event = fanotify_should_send_event, 224 .should_send_event = fanotify_should_send_event,
206 .free_group_priv = NULL, 225 .free_group_priv = fanotify_free_group_priv,
207 .free_event_priv = NULL, 226 .free_event_priv = NULL,
208 .freeing_mark = NULL, 227 .freeing_mark = NULL,
209}; 228};
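
fanotify_should_send_event() now derives the inode from the passed-in path, which is why the data_type check moves ahead of the S_ISREG/S_ISDIR test, and it honors an FS_ISDIR bit in the ignored mask so directory events can be suppressed. A sketch of the resulting decision, with the mark bookkeeping flattened into plain mask arguments; the helper name is illustrative:

/*
 * Sketch of the check fanotify_should_send_event() now applies; the
 * real code reads the masks out of the inode and vfsmount marks.
 */
static bool should_send_sketch(__u32 event_mask, bool is_dir,
			       __u32 marks_mask, __u32 marks_ignored_mask)
{
	/* a directory can be opted out wholesale via the ignored mask */
	if (is_dir && (marks_ignored_mask & FS_ISDIR))
		return false;

	/* deliver only bits that are both watched and not ignored */
	return (event_mask & marks_mask & ~marks_ignored_mask) != 0;
}
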
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 5ed8e58d7bfc..8b61220cffc5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,10 @@
16 16
17#include <asm/ioctls.h> 17#include <asm/ioctls.h>
18 18
19#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
20#define FANOTIFY_DEFAULT_MAX_MARKS 8192
21#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
22
19extern const struct fsnotify_ops fanotify_fsnotify_ops; 23extern const struct fsnotify_ops fanotify_fsnotify_ops;
20 24
21static struct kmem_cache *fanotify_mark_cache __read_mostly; 25static struct kmem_cache *fanotify_mark_cache __read_mostly;
@@ -102,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
102 return client_fd; 106 return client_fd;
103} 107}
104 108
105static ssize_t fill_event_metadata(struct fsnotify_group *group, 109static int fill_event_metadata(struct fsnotify_group *group,
106 struct fanotify_event_metadata *metadata, 110 struct fanotify_event_metadata *metadata,
107 struct fsnotify_event *event) 111 struct fsnotify_event *event)
108{ 112{
113 int ret = 0;
114
109 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 115 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
110 group, metadata, event); 116 group, metadata, event);
111 117
112 metadata->event_len = FAN_EVENT_METADATA_LEN; 118 metadata->event_len = FAN_EVENT_METADATA_LEN;
119 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
113 metadata->vers = FANOTIFY_METADATA_VERSION; 120 metadata->vers = FANOTIFY_METADATA_VERSION;
114 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 121 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
115 metadata->pid = pid_vnr(event->tgid); 122 metadata->pid = pid_vnr(event->tgid);
116 metadata->fd = create_fd(group, event); 123 if (unlikely(event->mask & FAN_Q_OVERFLOW))
124 metadata->fd = FAN_NOFD;
125 else {
126 metadata->fd = create_fd(group, event);
127 if (metadata->fd < 0)
128 ret = metadata->fd;
129 }
117 130
118 return metadata->fd; 131 return ret;
119} 132}
120 133
121#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 134#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -196,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
196 209
197 mutex_lock(&group->fanotify_data.access_mutex); 210 mutex_lock(&group->fanotify_data.access_mutex);
198 211
199 if (group->fanotify_data.bypass_perm) { 212 if (atomic_read(&group->fanotify_data.bypass_perm)) {
200 mutex_unlock(&group->fanotify_data.access_mutex); 213 mutex_unlock(&group->fanotify_data.access_mutex);
201 kmem_cache_free(fanotify_response_event_cache, re); 214 kmem_cache_free(fanotify_response_event_cache, re);
202 event->response = FAN_ALLOW; 215 event->response = FAN_ALLOW;
@@ -253,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
253 266
254 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 267 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
255 268
256 fd = fill_event_metadata(group, &fanotify_event_metadata, event); 269 ret = fill_event_metadata(group, &fanotify_event_metadata, event);
257 if (fd < 0) 270 if (ret < 0)
258 return fd; 271 goto out;
259 272
273 fd = fanotify_event_metadata.fd;
260 ret = prepare_for_access_response(group, event, fd); 274 ret = prepare_for_access_response(group, event, fd);
261 if (ret) 275 if (ret)
262 goto out_close_fd; 276 goto out_close_fd;
263 277
264 ret = -EFAULT; 278 ret = -EFAULT;
265 if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN)) 279 if (copy_to_user(buf, &fanotify_event_metadata,
280 fanotify_event_metadata.event_len))
266 goto out_kill_access_response; 281 goto out_kill_access_response;
267 282
268 return FAN_EVENT_METADATA_LEN; 283 return fanotify_event_metadata.event_len;
269 284
270out_kill_access_response: 285out_kill_access_response:
271 remove_access_response(group, event, fd); 286 remove_access_response(group, event, fd);
272out_close_fd: 287out_close_fd:
273 sys_close(fd); 288 if (fd != FAN_NOFD)
289 sys_close(fd);
290out:
291#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
292 if (event->mask & FAN_ALL_PERM_EVENTS) {
293 event->response = FAN_DENY;
294 wake_up(&group->fanotify_data.access_waitq);
295 }
296#endif
274 return ret; 297 return ret;
275} 298}
276 299
@@ -326,7 +349,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
326 ret = -EAGAIN; 349 ret = -EAGAIN;
327 if (file->f_flags & O_NONBLOCK) 350 if (file->f_flags & O_NONBLOCK)
328 break; 351 break;
329 ret = -EINTR; 352 ret = -ERESTARTSYS;
330 if (signal_pending(current)) 353 if (signal_pending(current))
331 break; 354 break;
332 355
@@ -372,14 +395,13 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
372static int fanotify_release(struct inode *ignored, struct file *file) 395static int fanotify_release(struct inode *ignored, struct file *file)
373{ 396{
374 struct fsnotify_group *group = file->private_data; 397 struct fsnotify_group *group = file->private_data;
375 struct fanotify_response_event *re, *lre;
376
377 pr_debug("%s: file=%p group=%p\n", __func__, file, group);
378 398
379#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 399#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
400 struct fanotify_response_event *re, *lre;
401
380 mutex_lock(&group->fanotify_data.access_mutex); 402 mutex_lock(&group->fanotify_data.access_mutex);
381 403
382 group->fanotify_data.bypass_perm = true; 404 atomic_inc(&group->fanotify_data.bypass_perm);
383 405
384 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { 406 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
385 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, 407 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
@@ -433,6 +455,7 @@ static const struct file_operations fanotify_fops = {
433 .release = fanotify_release, 455 .release = fanotify_release,
434 .unlocked_ioctl = fanotify_ioctl, 456 .unlocked_ioctl = fanotify_ioctl,
435 .compat_ioctl = fanotify_ioctl, 457 .compat_ioctl = fanotify_ioctl,
458 .llseek = noop_llseek,
436}; 459};
437 460
438static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) 461static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
@@ -553,18 +576,24 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
553 __u32 mask, 576 __u32 mask,
554 unsigned int flags) 577 unsigned int flags)
555{ 578{
556 __u32 oldmask; 579 __u32 oldmask = -1;
557 580
558 spin_lock(&fsn_mark->lock); 581 spin_lock(&fsn_mark->lock);
559 if (!(flags & FAN_MARK_IGNORED_MASK)) { 582 if (!(flags & FAN_MARK_IGNORED_MASK)) {
560 oldmask = fsn_mark->mask; 583 oldmask = fsn_mark->mask;
561 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); 584 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
562 } else { 585 } else {
563 oldmask = fsn_mark->ignored_mask; 586 __u32 tmask = fsn_mark->ignored_mask | mask;
564 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask)); 587 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
565 if (flags & FAN_MARK_IGNORED_SURV_MODIFY) 588 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
566 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; 589 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
567 } 590 }
591
592 if (!(flags & FAN_MARK_ONDIR)) {
593 __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
594 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
595 }
596
568 spin_unlock(&fsn_mark->lock); 597 spin_unlock(&fsn_mark->lock);
569 598
570 return mask & ~oldmask; 599 return mask & ~oldmask;
@@ -576,10 +605,12 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
576{ 605{
577 struct fsnotify_mark *fsn_mark; 606 struct fsnotify_mark *fsn_mark;
578 __u32 added; 607 __u32 added;
608 int ret = 0;
579 609
580 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 610 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
581 if (!fsn_mark) { 611 if (!fsn_mark) {
582 int ret; 612 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
613 return -ENOSPC;
583 614
584 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 615 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
585 if (!fsn_mark) 616 if (!fsn_mark)
@@ -587,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
587 618
588 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 619 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
589 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); 620 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
590 if (ret) { 621 if (ret)
591 fanotify_free_mark(fsn_mark); 622 goto err;
592 return ret;
593 }
594 } 623 }
595 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 624 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
596 fsnotify_put_mark(fsn_mark); 625
597 if (added & ~mnt->mnt_fsnotify_mask) 626 if (added & ~mnt->mnt_fsnotify_mask)
598 fsnotify_recalc_vfsmount_mask(mnt); 627 fsnotify_recalc_vfsmount_mask(mnt);
599 628err:
600 return 0; 629 fsnotify_put_mark(fsn_mark);
630 return ret;
601} 631}
602 632
603static int fanotify_add_inode_mark(struct fsnotify_group *group, 633static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -606,12 +636,24 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
606{ 636{
607 struct fsnotify_mark *fsn_mark; 637 struct fsnotify_mark *fsn_mark;
608 __u32 added; 638 __u32 added;
639 int ret = 0;
609 640
610 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 641 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
611 642
643 /*
644 * If some other task has this inode open for write we should not add
645 * an ignored mark, unless that ignored mark is supposed to survive
646 * modification changes anyway.
647 */
648 if ((flags & FAN_MARK_IGNORED_MASK) &&
649 !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
650 (atomic_read(&inode->i_writecount) > 0))
651 return 0;
652
612 fsn_mark = fsnotify_find_inode_mark(group, inode); 653 fsn_mark = fsnotify_find_inode_mark(group, inode);
613 if (!fsn_mark) { 654 if (!fsn_mark) {
614 int ret; 655 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
656 return -ENOSPC;
615 657
616 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 658 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
617 if (!fsn_mark) 659 if (!fsn_mark)
@@ -619,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
619 661
620 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 662 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
621 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); 663 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
622 if (ret) { 664 if (ret)
623 fanotify_free_mark(fsn_mark); 665 goto err;
624 return ret;
625 }
626 } 666 }
627 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 667 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
628 fsnotify_put_mark(fsn_mark); 668
629 if (added & ~inode->i_fsnotify_mask) 669 if (added & ~inode->i_fsnotify_mask)
630 fsnotify_recalc_inode_mask(inode); 670 fsnotify_recalc_inode_mask(inode);
631 return 0; 671err:
672 fsnotify_put_mark(fsn_mark);
673 return ret;
632} 674}
633 675
634/* fanotify syscalls */ 676/* fanotify syscalls */
@@ -636,6 +678,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
636{ 678{
637 struct fsnotify_group *group; 679 struct fsnotify_group *group;
638 int f_flags, fd; 680 int f_flags, fd;
681 struct user_struct *user;
639 682
640 pr_debug("%s: flags=%d event_f_flags=%d\n", 683 pr_debug("%s: flags=%d event_f_flags=%d\n",
641 __func__, flags, event_f_flags); 684 __func__, flags, event_f_flags);
@@ -646,6 +689,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
646 if (flags & ~FAN_ALL_INIT_FLAGS) 689 if (flags & ~FAN_ALL_INIT_FLAGS)
647 return -EINVAL; 690 return -EINVAL;
648 691
692 user = get_current_user();
693 if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
694 free_uid(user);
695 return -EMFILE;
696 }
697
649 f_flags = O_RDWR | FMODE_NONOTIFY; 698 f_flags = O_RDWR | FMODE_NONOTIFY;
650 if (flags & FAN_CLOEXEC) 699 if (flags & FAN_CLOEXEC)
651 f_flags |= O_CLOEXEC; 700 f_flags |= O_CLOEXEC;
@@ -654,15 +703,53 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
654 703
655 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ 704 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
656 group = fsnotify_alloc_group(&fanotify_fsnotify_ops); 705 group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
657 if (IS_ERR(group)) 706 if (IS_ERR(group)) {
707 free_uid(user);
658 return PTR_ERR(group); 708 return PTR_ERR(group);
709 }
710
711 group->fanotify_data.user = user;
712 atomic_inc(&user->fanotify_listeners);
659 713
660 group->fanotify_data.f_flags = event_f_flags; 714 group->fanotify_data.f_flags = event_f_flags;
661#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 715#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
662 mutex_init(&group->fanotify_data.access_mutex); 716 mutex_init(&group->fanotify_data.access_mutex);
663 init_waitqueue_head(&group->fanotify_data.access_waitq); 717 init_waitqueue_head(&group->fanotify_data.access_waitq);
664 INIT_LIST_HEAD(&group->fanotify_data.access_list); 718 INIT_LIST_HEAD(&group->fanotify_data.access_list);
719 atomic_set(&group->fanotify_data.bypass_perm, 0);
665#endif 720#endif
721 switch (flags & FAN_ALL_CLASS_BITS) {
722 case FAN_CLASS_NOTIF:
723 group->priority = FS_PRIO_0;
724 break;
725 case FAN_CLASS_CONTENT:
726 group->priority = FS_PRIO_1;
727 break;
728 case FAN_CLASS_PRE_CONTENT:
729 group->priority = FS_PRIO_2;
730 break;
731 default:
732 fd = -EINVAL;
733 goto out_put_group;
734 }
735
736 if (flags & FAN_UNLIMITED_QUEUE) {
737 fd = -EPERM;
738 if (!capable(CAP_SYS_ADMIN))
739 goto out_put_group;
740 group->max_events = UINT_MAX;
741 } else {
742 group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
743 }
744
745 if (flags & FAN_UNLIMITED_MARKS) {
746 fd = -EPERM;
747 if (!capable(CAP_SYS_ADMIN))
748 goto out_put_group;
749 group->fanotify_data.max_marks = UINT_MAX;
750 } else {
751 group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
752 }
666 753
667 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); 754 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
668 if (fd < 0) 755 if (fd < 0)
@@ -696,13 +783,21 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
696 if (flags & ~FAN_ALL_MARK_FLAGS) 783 if (flags & ~FAN_ALL_MARK_FLAGS)
697 return -EINVAL; 784 return -EINVAL;
698 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 785 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
699 case FAN_MARK_ADD: 786 case FAN_MARK_ADD: /* fallthrough */
700 case FAN_MARK_REMOVE: 787 case FAN_MARK_REMOVE:
788 if (!mask)
789 return -EINVAL;
701 case FAN_MARK_FLUSH: 790 case FAN_MARK_FLUSH:
702 break; 791 break;
703 default: 792 default:
704 return -EINVAL; 793 return -EINVAL;
705 } 794 }
795
796 if (mask & FAN_ONDIR) {
797 flags |= FAN_MARK_ONDIR;
798 mask &= ~FAN_ONDIR;
799 }
800
706#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 801#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
707 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) 802 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
708#else 803#else
@@ -718,6 +813,16 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
718 ret = -EINVAL; 813 ret = -EINVAL;
719 if (unlikely(filp->f_op != &fanotify_fops)) 814 if (unlikely(filp->f_op != &fanotify_fops))
720 goto fput_and_out; 815 goto fput_and_out;
816 group = filp->private_data;
817
818 /*
819 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
820 * allowed to set permissions events.
821 */
822 ret = -EINVAL;
823 if (mask & FAN_ALL_PERM_EVENTS &&
824 group->priority == FS_PRIO_0)
825 goto fput_and_out;
721 826
722 ret = fanotify_find_path(dfd, pathname, &path, flags); 827 ret = fanotify_find_path(dfd, pathname, &path, flags);
723 if (ret) 828 if (ret)
@@ -728,7 +833,6 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
728 inode = path.dentry->d_inode; 833 inode = path.dentry->d_inode;
729 else 834 else
730 mnt = path.mnt; 835 mnt = path.mnt;
731 group = filp->private_data;
732 836
733 /* create/update an inode mark */ 837 /* create/update an inode mark */
734 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 838 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
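
Taken together, the fanotify_init() changes introduce listener classes (FAN_CLASS_NOTIF, FAN_CLASS_CONTENT and FAN_CLASS_PRE_CONTENT, mapped to FS_PRIO_0/1/2), a per-user listener limit, and capped event queues and mark counts. A userspace sketch of how a content-class listener would drive this, assuming a libc that ships the <sys/fanotify.h> wrappers and a kernel carrying these patches (FAN_NOFD in particular is new here); error handling is abbreviated and CAP_SYS_ADMIN is required:

/*
 * Sketch: a FAN_CLASS_CONTENT listener that allows every FAN_OPEN_PERM
 * request on the mount containing /tmp. Version checks and poll() are
 * omitted for brevity.
 */
#include <fcntl.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	struct fanotify_event_metadata ev;
	struct fanotify_response resp;
	int fd;

	fd = fanotify_init(FAN_CLOEXEC | FAN_CLASS_CONTENT, O_RDONLY);
	if (fd < 0)
		return 1;

	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
			  FAN_OPEN_PERM, AT_FDCWD, "/tmp") < 0)
		return 1;

	/* one event per read(); event_len == sizeof(ev) in this version */
	while (read(fd, &ev, sizeof(ev)) == (ssize_t)sizeof(ev)) {
		if (ev.fd < 0)		/* FAN_NOFD: queue overflow event */
			continue;
		resp.fd = ev.fd;
		resp.response = FAN_ALLOW;	/* or FAN_DENY to block */
		write(fd, &resp, sizeof(resp));
		close(ev.fd);
	}
	return 0;
}

A group created with plain FAN_CLASS_NOTIF would get -EINVAL from the fanotify_mark() call above, per the new FS_PRIO_0 check in the fanotify_mark syscall.
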
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 36802420d69a..79b47cbb5cd8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -59,7 +59,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
59 /* determine if the children should tell inode about their events */ 59 /* determine if the children should tell inode about their events */
60 watched = fsnotify_inode_watches_children(inode); 60 watched = fsnotify_inode_watches_children(inode);
61 61
62 spin_lock(&dcache_lock); 62 spin_lock(&inode->i_lock);
63 /* run all of the dentries associated with this inode. Since this is a 63 /* run all of the dentries associated with this inode. Since this is a
64 * directory, there damn well better only be one item on this list */ 64 * directory, there damn well better only be one item on this list */
65 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 65 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
@@ -68,75 +68,57 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
68 /* run all of the children of the original inode and fix their 68 /* run all of the children of the original inode and fix their
69 * d_flags to indicate parental interest (their parent is the 69 * d_flags to indicate parental interest (their parent is the
70 * original inode) */ 70 * original inode) */
71 spin_lock(&alias->d_lock);
71 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { 72 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
72 if (!child->d_inode) 73 if (!child->d_inode)
73 continue; 74 continue;
74 75
75 spin_lock(&child->d_lock); 76 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
76 if (watched) 77 if (watched)
77 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; 78 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
78 else 79 else
79 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; 80 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
80 spin_unlock(&child->d_lock); 81 spin_unlock(&child->d_lock);
81 } 82 }
83 spin_unlock(&alias->d_lock);
82 } 84 }
83 spin_unlock(&dcache_lock); 85 spin_unlock(&inode->i_lock);
84} 86}
85 87
86/* Notify this dentry's parent about a child's events. */ 88/* Notify this dentry's parent about a child's events. */
87void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) 89int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
88{ 90{
89 struct dentry *parent; 91 struct dentry *parent;
90 struct inode *p_inode; 92 struct inode *p_inode;
91 bool send = false; 93 int ret = 0;
92 bool should_update_children = false;
93 94
94 if (!dentry) 95 if (!dentry)
95 dentry = path->dentry; 96 dentry = path->dentry;
96 97
97 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) 98 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
98 return; 99 return 0;
99 100
100 spin_lock(&dentry->d_lock); 101 parent = dget_parent(dentry);
101 parent = dentry->d_parent;
102 p_inode = parent->d_inode; 102 p_inode = parent->d_inode;
103 103
104 if (fsnotify_inode_watches_children(p_inode)) { 104 if (unlikely(!fsnotify_inode_watches_children(p_inode)))
105 if (p_inode->i_fsnotify_mask & mask) { 105 __fsnotify_update_child_dentry_flags(p_inode);
106 dget(parent); 106 else if (p_inode->i_fsnotify_mask & mask) {
107 send = true;
108 }
109 } else {
110 /*
111 * The parent doesn't care about events on it's children but
112 * at least one child thought it did. We need to run all the
113 * children and update their d_flags to let them know p_inode
114 * doesn't care about them any more.
115 */
116 dget(parent);
117 should_update_children = true;
118 }
119
120 spin_unlock(&dentry->d_lock);
121
122 if (send) {
123 /* we are notifying a parent so come up with the new mask which 107 /* we are notifying a parent so come up with the new mask which
124 * specifies these are events which came from a child. */ 108 * specifies these are events which came from a child. */
125 mask |= FS_EVENT_ON_CHILD; 109 mask |= FS_EVENT_ON_CHILD;
126 110
127 if (path) 111 if (path)
128 fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, 112 ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
129 dentry->d_name.name, 0); 113 dentry->d_name.name, 0);
130 else 114 else
131 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, 115 ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
132 dentry->d_name.name, 0); 116 dentry->d_name.name, 0);
133 dput(parent);
134 } 117 }
135 118
136 if (unlikely(should_update_children)) { 119 dput(parent);
137 __fsnotify_update_child_dentry_flags(p_inode); 120
138 dput(parent); 121 return ret;
139 }
140} 122}
141EXPORT_SYMBOL_GPL(__fsnotify_parent); 123EXPORT_SYMBOL_GPL(__fsnotify_parent);
142 124
@@ -275,20 +257,23 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
275 257
276 if (inode_group > vfsmount_group) { 258 if (inode_group > vfsmount_group) {
277 /* handle inode */ 259 /* handle inode */
278 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, 260 ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
279 data_is, cookie, file_name, &event); 261 data_is, cookie, file_name, &event);
280 /* we didn't use the vfsmount_mark */ 262 /* we didn't use the vfsmount_mark */
281 vfsmount_group = NULL; 263 vfsmount_group = NULL;
282 } else if (vfsmount_group > inode_group) { 264 } else if (vfsmount_group > inode_group) {
283 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, 265 ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
284 data_is, cookie, file_name, &event); 266 data_is, cookie, file_name, &event);
285 inode_group = NULL; 267 inode_group = NULL;
286 } else { 268 } else {
287 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, 269 ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
288 mask, data, data_is, cookie, file_name, 270 mask, data, data_is, cookie, file_name,
289 &event); 271 &event);
290 } 272 }
291 273
274 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
275 goto out;
276
292 if (inode_group) 277 if (inode_group)
293 inode_node = srcu_dereference(inode_node->next, 278 inode_node = srcu_dereference(inode_node->next,
294 &fsnotify_mark_srcu); 279 &fsnotify_mark_srcu);
@@ -296,7 +281,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
296 vfsmount_node = srcu_dereference(vfsmount_node->next, 281 vfsmount_node = srcu_dereference(vfsmount_node->next,
297 &fsnotify_mark_srcu); 282 &fsnotify_mark_srcu);
298 } 283 }
299 284 ret = 0;
285out:
300 srcu_read_unlock(&fsnotify_mark_srcu, idx); 286 srcu_read_unlock(&fsnotify_mark_srcu, idx);
301 /* 287 /*
302 * fsnotify_create_event() took a reference so the event can't be cleaned 288 * fsnotify_create_event() took a reference so the event can't be cleaned
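
With fsnotify() and __fsnotify_parent() now returning the group's verdict, and permission events (ALL_FSNOTIFY_PERM_EVENTS) short-circuiting the mark walk as soon as a group objects, callers can fail the syscall that raised the event. A sketch of the caller-side contract this enables; the hook name is illustrative, not something this patch adds:

/*
 * Sketch of a permission hook built on the new return values.
 */
static inline int open_perm_sketch(struct file *file)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	int ret;

	ret = __fsnotify_parent(&file->f_path, NULL, FS_OPEN_PERM);
	if (ret)
		return ret;	/* a parent watcher denied the open */

	return fsnotify(inode, FS_OPEN_PERM, &file->f_path,
			FSNOTIFY_EVENT_PATH, NULL, 0);
}
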
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 33297c005060..4c29fcf557d1 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -177,7 +177,8 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
177 * Attach an initialized mark to a given inode. 177 * Attach an initialized mark to a given inode.
178 * These marks may be used for the fsnotify backend to determine which 178 * These marks may be used for the fsnotify backend to determine which
179 * event types should be delivered to which group and for which inodes. These 179 * event types should be delivered to which group and for which inodes. These
180 * marks are ordered according to the group's location in memory. 180 * marks are ordered according to priority, highest number first, and then by
181 * the group's location in memory.
181 */ 182 */
182int fsnotify_add_inode_mark(struct fsnotify_mark *mark, 183int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
183 struct fsnotify_group *group, struct inode *inode, 184 struct fsnotify_group *group, struct inode *inode,
@@ -211,7 +212,11 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
211 goto out; 212 goto out;
212 } 213 }
213 214
214 if (mark->group < lmark->group) 215 if (mark->group->priority < lmark->group->priority)
216 continue;
217
218 if ((mark->group->priority == lmark->group->priority) &&
219 (mark->group < lmark->group))
215 continue; 220 continue;
216 221
217 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); 222 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
@@ -240,6 +245,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
240{ 245{
241 struct inode *inode, *next_i, *need_iput = NULL; 246 struct inode *inode, *next_i, *need_iput = NULL;
242 247
248 spin_lock(&inode_lock);
243 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
244 struct inode *need_iput_tmp; 250 struct inode *need_iput_tmp;
245 251
@@ -297,4 +303,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
297 303
298 spin_lock(&inode_lock); 304 spin_lock(&inode_lock);
299 } 305 }
306 spin_unlock(&inode_lock);
300} 307}
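
Both this hunk and the matching vfsmount_mark.c change below switch the mark lists from ordering by bare group address to ordering by group priority first, which is what guarantees that permission-class (higher-priority) groups are walked before plain notifiers. The rule, restated as a sketch comparator, assuming names not present in the patch:

/*
 * Sketch of the shared ordering rule; a group appears at most once per
 * list, so the address tie-break never compares equal pointers.
 */
static bool mark_goes_before(struct fsnotify_mark *new,
			     struct fsnotify_mark *old)
{
	if (new->group->priority != old->group->priority)
		return new->group->priority > old->group->priority;
	return new->group > old->group;
}
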
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index bf7f6d776c31..4cd5d5d78f9f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -344,6 +344,7 @@ static const struct file_operations inotify_fops = {
344 .release = inotify_release, 344 .release = inotify_release,
345 .unlocked_ioctl = inotify_ioctl, 345 .unlocked_ioctl = inotify_ioctl,
346 .compat_ioctl = inotify_ioctl, 346 .compat_ioctl = inotify_ioctl,
347 .llseek = noop_llseek,
347}; 348};
348 349
349 350
@@ -751,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
751 if (ret >= 0) 752 if (ret >= 0)
752 return ret; 753 return ret;
753 754
755 fsnotify_put_group(group);
754 atomic_dec(&user->inotify_devs); 756 atomic_dec(&user->inotify_devs);
755out_free_uid: 757out_free_uid:
756 free_uid(user); 758 free_uid(user);
@@ -861,7 +863,7 @@ static int __init inotify_user_setup(void)
861 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); 863 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
862 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); 864 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
863 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); 865 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
864 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); 866 BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
865 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); 867 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
866 868
867 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); 869 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 56772b578fbd..85eebff6d0d7 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -169,7 +169,11 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
169 goto out; 169 goto out;
170 } 170 }
171 171
172 if (mark->group < lmark->group) 172 if (mark->group->priority < lmark->group->priority)
173 continue;
174
175 if ((mark->group->priority == lmark->group->priority) &&
176 (mark->group < lmark->group))
173 continue; 177 continue;
174 178
175 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); 179 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 58b6be992544..4ff028fcfd6e 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 113ebd9f25a4..f4b1057abdd2 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s 1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1381 * single-segment behaviour. 1381 * single-segment behaviour.
1382 * 1382 *
1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both 1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
1384 * when atomic and when not atomic. This is ok because 1384 * atomic and when not atomic. This is ok because it calls
1385 * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic() 1385 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1386 * and it is ok to call this when non-atomic. 1386 * fact, the only difference between __copy_from_user_inatomic() and
1387 * Infact, the only difference between __copy_from_user_inatomic() and
1388 * __copy_from_user() is that the latter calls might_sleep() and the former 1387 * __copy_from_user() is that the latter calls might_sleep() and the former
1389 * should not zero the tail of the buffer on error. And on many 1388 * should not zero the tail of the buffer on error. And on many architectures
1390 * architectures __copy_from_user_inatomic() is just defined to 1389 * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
1391 * __copy_from_user() so it makes no difference at all on those architectures. 1390 * makes no difference at all on those architectures.
1392 */ 1391 */
1393static inline size_t ntfs_copy_from_user_iovec(struct page **pages, 1392static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1394 unsigned nr_pages, unsigned ofs, const struct iovec **iov, 1393 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
@@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1409 if (unlikely(copied != len)) { 1408 if (unlikely(copied != len)) {
1410 /* Do it the slow way. */ 1409 /* Do it the slow way. */
1411 addr = kmap(*pages); 1410 addr = kmap(*pages);
1412 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, 1411 copied = __ntfs_copy_from_user_iovec_inatomic(addr +
1413 *iov, *iov_ofs, len); 1412 ofs, *iov, *iov_ofs, len);
1414 /*
1415 * Zero the rest of the target like __copy_from_user().
1416 */
1417 memset(addr + ofs + copied, 0, len - copied);
1418 kunmap(*pages);
1419 if (unlikely(copied != len)) 1413 if (unlikely(copied != len))
1420 goto err_out; 1414 goto err_out;
1415 kunmap(*pages);
1421 } 1416 }
1422 total += len; 1417 total += len;
1418 ntfs_set_next_iovec(iov, iov_ofs, len);
1423 bytes -= len; 1419 bytes -= len;
1424 if (!bytes) 1420 if (!bytes)
1425 break; 1421 break;
1426 ntfs_set_next_iovec(iov, iov_ofs, len);
1427 ofs = 0; 1422 ofs = 0;
1428 } while (++pages < last_page); 1423 } while (++pages < last_page);
1429out: 1424out:
1430 return total; 1425 return total;
1431err_out: 1426err_out:
1432 total += copied; 1427 BUG_ON(copied > len);
1433 /* Zero the rest of the target like __copy_from_user(). */ 1428 /* Zero the rest of the target like __copy_from_user(). */
1429 memset(addr + ofs + copied, 0, len - copied);
1430 kunmap(*pages);
1431 total += copied;
1432 ntfs_set_next_iovec(iov, iov_ofs, copied);
1434 while (++pages < last_page) { 1433 while (++pages < last_page) {
1435 bytes -= len; 1434 bytes -= len;
1436 if (!bytes) 1435 if (!bytes)
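
The reworked loop depends on the fast/slow copy pattern the surrounding comment describes: attempt the atomic copy first, and only on a short copy retake the mapping with kmap() (which may sleep) before zeroing the uncopied tail, as the fixed err_out path now does. A condensed sketch of that pattern, with illustrative names and the two-argument kmap_atomic() of this kernel generation:

/*
 * Sketch of the fast/slow user-copy pattern; not the driver's code.
 */
static size_t copy_page_from_user_sketch(struct page *page, unsigned ofs,
					 const char __user *buf, size_t len)
{
	char *addr;
	size_t copied;

	addr = kmap_atomic(page, KM_USER0);
	copied = len - __copy_from_user_inatomic(addr + ofs, buf, len);
	kunmap_atomic(addr, KM_USER0);
	if (unlikely(copied != len)) {
		addr = kmap(page);	/* slow path: may sleep */
		copied = len - __copy_from_user(addr + ofs, buf, len);
		if (copied != len)	/* zero the tail, as err_out does */
			memset(addr + ofs + copied, 0, len - copied);
		kunmap(page);
	}
	return copied;
}
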
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 93622b175fc7..a627ed82c0a3 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -332,6 +332,13 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
332 return NULL; 332 return NULL;
333} 333}
334 334
335static void ntfs_i_callback(struct rcu_head *head)
336{
337 struct inode *inode = container_of(head, struct inode, i_rcu);
338 INIT_LIST_HEAD(&inode->i_dentry);
339 kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
340}
341
335void ntfs_destroy_big_inode(struct inode *inode) 342void ntfs_destroy_big_inode(struct inode *inode)
336{ 343{
337 ntfs_inode *ni = NTFS_I(inode); 344 ntfs_inode *ni = NTFS_I(inode);
@@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode)
340 BUG_ON(ni->page); 347 BUG_ON(ni->page);
341 if (!atomic_dec_and_test(&ni->count)) 348 if (!atomic_dec_and_test(&ni->count))
342 BUG(); 349 BUG();
343 kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); 350 call_rcu(&inode->i_rcu, ntfs_i_callback);
344} 351}
345 352
346static inline ntfs_inode *ntfs_alloc_extent_inode(void) 353static inline ntfs_inode *ntfs_alloc_extent_inode(void)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b572b6727181..326e7475a22a 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. 2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -2576,6 +2576,8 @@ mft_rec_already_initialized:
2576 flush_dcache_page(page); 2576 flush_dcache_page(page);
2577 SetPageUptodate(page); 2577 SetPageUptodate(page);
2578 if (base_ni) { 2578 if (base_ni) {
2579 MFT_RECORD *m_tmp;
2580
2579 /* 2581 /*
2580 * Setup the base mft record in the extent mft record. This 2582 * Setup the base mft record in the extent mft record. This
2581 * completes initialization of the allocated extent mft record 2583 * completes initialization of the allocated extent mft record
@@ -2588,11 +2590,11 @@ mft_rec_already_initialized:
2588 * attach it to the base inode @base_ni and map, pin, and lock 2590 * attach it to the base inode @base_ni and map, pin, and lock
2589 * its, i.e. the allocated, mft record. 2591 * its, i.e. the allocated, mft record.
2590 */ 2592 */
2591 m = map_extent_mft_record(base_ni, bit, &ni); 2593 m_tmp = map_extent_mft_record(base_ni, bit, &ni);
2592 if (IS_ERR(m)) { 2594 if (IS_ERR(m_tmp)) {
2593 ntfs_error(vol->sb, "Failed to map allocated extent " 2595 ntfs_error(vol->sb, "Failed to map allocated extent "
2594 "mft record 0x%llx.", (long long)bit); 2596 "mft record 0x%llx.", (long long)bit);
2595 err = PTR_ERR(m); 2597 err = PTR_ERR(m_tmp);
2596 /* Set the mft record itself not in use. */ 2598 /* Set the mft record itself not in use. */
2597 m->flags &= cpu_to_le16( 2599 m->flags &= cpu_to_le16(
2598 ~le16_to_cpu(MFT_RECORD_IN_USE)); 2600 ~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2603,6 +2605,7 @@ mft_rec_already_initialized:
2603 ntfs_unmap_page(page); 2605 ntfs_unmap_page(page);
2604 goto undo_mftbmp_alloc; 2606 goto undo_mftbmp_alloc;
2605 } 2607 }
2608 BUG_ON(m != m_tmp);
2606 /* 2609 /*
2607 * Make sure the allocated mft record is written out to disk. 2610 * Make sure the allocated mft record is written out to disk.
2608 * No need to set the inode dirty because the caller is going 2611 * No need to set the inode dirty because the caller is going
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 512806171bfa..29099a07b9fe 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. 2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2001,2002 Richard Russon 5 * Copyright (c) 2001,2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -30,7 +30,6 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h>
34#include <linux/bitmap.h> 33#include <linux/bitmap.h>
35 34
36#include "sysctl.h" 35#include "sysctl.h"
@@ -445,7 +444,6 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
445 444
446 ntfs_debug("Entering with remount options string: %s", opt); 445 ntfs_debug("Entering with remount options string: %s", opt);
447 446
448 lock_kernel();
449#ifndef NTFS_RW 447#ifndef NTFS_RW
450 /* For read-only compiled driver, enforce read-only flag. */ 448 /* For read-only compiled driver, enforce read-only flag. */
451 *flags |= MS_RDONLY; 449 *flags |= MS_RDONLY;
@@ -469,18 +467,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
469 if (NVolErrors(vol)) { 467 if (NVolErrors(vol)) {
470 ntfs_error(sb, "Volume has errors and is read-only%s", 468 ntfs_error(sb, "Volume has errors and is read-only%s",
471 es); 469 es);
472 unlock_kernel();
473 return -EROFS; 470 return -EROFS;
474 } 471 }
475 if (vol->vol_flags & VOLUME_IS_DIRTY) { 472 if (vol->vol_flags & VOLUME_IS_DIRTY) {
476 ntfs_error(sb, "Volume is dirty and read-only%s", es); 473 ntfs_error(sb, "Volume is dirty and read-only%s", es);
477 unlock_kernel();
478 return -EROFS; 474 return -EROFS;
479 } 475 }
480 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { 476 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
481 ntfs_error(sb, "Volume has been modified by chkdsk " 477 ntfs_error(sb, "Volume has been modified by chkdsk "
482 "and is read-only%s", es); 478 "and is read-only%s", es);
483 unlock_kernel();
484 return -EROFS; 479 return -EROFS;
485 } 480 }
486 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { 481 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -488,13 +483,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
488 "(0x%x) and is read-only%s", 483 "(0x%x) and is read-only%s",
489 (unsigned)le16_to_cpu(vol->vol_flags), 484 (unsigned)le16_to_cpu(vol->vol_flags),
490 es); 485 es);
491 unlock_kernel();
492 return -EROFS; 486 return -EROFS;
493 } 487 }
494 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { 488 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
495 ntfs_error(sb, "Failed to set dirty bit in volume " 489 ntfs_error(sb, "Failed to set dirty bit in volume "
496 "information flags%s", es); 490 "information flags%s", es);
497 unlock_kernel();
498 return -EROFS; 491 return -EROFS;
499 } 492 }
500#if 0 493#if 0
@@ -514,21 +507,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
514 ntfs_error(sb, "Failed to empty journal $LogFile%s", 507 ntfs_error(sb, "Failed to empty journal $LogFile%s",
515 es); 508 es);
516 NVolSetErrors(vol); 509 NVolSetErrors(vol);
517 unlock_kernel();
518 return -EROFS; 510 return -EROFS;
519 } 511 }
520 if (!ntfs_mark_quotas_out_of_date(vol)) { 512 if (!ntfs_mark_quotas_out_of_date(vol)) {
521 ntfs_error(sb, "Failed to mark quotas out of date%s", 513 ntfs_error(sb, "Failed to mark quotas out of date%s",
522 es); 514 es);
523 NVolSetErrors(vol); 515 NVolSetErrors(vol);
524 unlock_kernel();
525 return -EROFS; 516 return -EROFS;
526 } 517 }
527 if (!ntfs_stamp_usnjrnl(vol)) { 518 if (!ntfs_stamp_usnjrnl(vol)) {
528 ntfs_error(sb, "Failed to stamp transation log " 519 ntfs_error(sb, "Failed to stamp transation log "
529 "($UsnJrnl)%s", es); 520 "($UsnJrnl)%s", es);
530 NVolSetErrors(vol); 521 NVolSetErrors(vol);
531 unlock_kernel();
532 return -EROFS; 522 return -EROFS;
533 } 523 }
534 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 524 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -544,11 +534,9 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
544 534
545 // TODO: Deal with *flags. 535 // TODO: Deal with *flags.
546 536
547 if (!parse_options(vol, opt)) { 537 if (!parse_options(vol, opt))
548 unlock_kernel();
549 return -EINVAL; 538 return -EINVAL;
550 } 539
551 unlock_kernel();
552 ntfs_debug("Done."); 540 ntfs_debug("Done.");
553 return 0; 541 return 0;
554} 542}
@@ -2261,8 +2249,6 @@ static void ntfs_put_super(struct super_block *sb)
2261 2249
2262 ntfs_debug("Entering."); 2250 ntfs_debug("Entering.");
2263 2251
2264 lock_kernel();
2265
2266#ifdef NTFS_RW 2252#ifdef NTFS_RW
2267 /* 2253 /*
2268 * Commit all inodes while they are still open in case some of them 2254 * Commit all inodes while they are still open in case some of them
@@ -2433,8 +2419,6 @@ static void ntfs_put_super(struct super_block *sb)
2433 2419
2434 sb->s_fs_info = NULL; 2420 sb->s_fs_info = NULL;
2435 kfree(vol); 2421 kfree(vol);
2436
2437 unlock_kernel();
2438} 2422}
2439 2423
2440/** 2424/**
@@ -2772,8 +2756,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2772 init_rwsem(&vol->mftbmp_lock); 2756 init_rwsem(&vol->mftbmp_lock);
2773 init_rwsem(&vol->lcnbmp_lock); 2757 init_rwsem(&vol->lcnbmp_lock);
2774 2758
2775 unlock_kernel();
2776
2777 /* By default, enable sparse support. */ 2759 /* By default, enable sparse support. */
2778 NVolSetSparseEnabled(vol); 2760 NVolSetSparseEnabled(vol);
2779 2761
@@ -2929,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2929 goto unl_upcase_iput_tmp_ino_err_out_now; 2911 goto unl_upcase_iput_tmp_ino_err_out_now;
2930 } 2912 }
2931 if ((sb->s_root = d_alloc_root(vol->root_ino))) { 2913 if ((sb->s_root = d_alloc_root(vol->root_ino))) {
2932 /* We increment i_count simulating an ntfs_iget(). */ 2914 /* We grab a reference, simulating an ntfs_iget(). */
2933 atomic_inc(&vol->root_ino->i_count); 2915 ihold(vol->root_ino);
2934 ntfs_debug("Exiting, status successful."); 2916 ntfs_debug("Exiting, status successful.");
2935 /* Release the default upcase if it has no users. */ 2917 /* Release the default upcase if it has no users. */
2936 mutex_lock(&ntfs_lock); 2918 mutex_lock(&ntfs_lock);
@@ -2940,7 +2922,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2940 } 2922 }
2941 mutex_unlock(&ntfs_lock); 2923 mutex_unlock(&ntfs_lock);
2942 sb->s_export_op = &ntfs_export_ops; 2924 sb->s_export_op = &ntfs_export_ops;
2943 lock_kernel();
2944 lockdep_on(); 2925 lockdep_on();
2945 return 0; 2926 return 0;
2946 } 2927 }
@@ -3040,24 +3021,8 @@ iput_tmp_ino_err_out_now:
3040 if (vol->mft_ino && vol->mft_ino != tmp_ino) 3021 if (vol->mft_ino && vol->mft_ino != tmp_ino)
3041 iput(vol->mft_ino); 3022 iput(vol->mft_ino);
3042 vol->mft_ino = NULL; 3023 vol->mft_ino = NULL;
3043 /*
3044 * This is needed to get ntfs_clear_extent_inode() called for each
3045 * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
3046 * leak resources and B) a subsequent mount fails automatically due to
3047 * ntfs_iget() never calling down into our ntfs_read_locked_inode()
3048 * method again... FIXME: Do we need to do this twice now because of
3049 * attribute inodes? I think not, so leave as is for now... (AIA)
3050 */
3051 if (invalidate_inodes(sb)) {
3052 ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
3053 "driver bug.");
3054 /* Copied from fs/super.c. I just love this message. (-; */
3055 printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
3056 "seconds. Have a nice day...\n");
3057 }
3058 /* Errors at this stage are irrelevant. */ 3024 /* Errors at this stage are irrelevant. */
3059err_out_now: 3025err_out_now:
3060 lock_kernel();
3061 sb->s_fs_info = NULL; 3026 sb->s_fs_info = NULL;
3062 kfree(vol); 3027 kfree(vol);
3063 ntfs_debug("Failed, returning -EINVAL."); 3028 ntfs_debug("Failed, returning -EINVAL.");
@@ -3094,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache;
3094/* Driver wide mutex. */ 3059/* Driver wide mutex. */
3095DEFINE_MUTEX(ntfs_lock); 3060DEFINE_MUTEX(ntfs_lock);
3096 3061
3097static int ntfs_get_sb(struct file_system_type *fs_type, 3062static struct dentry *ntfs_mount(struct file_system_type *fs_type,
3098 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 3063 int flags, const char *dev_name, void *data)
3099{ 3064{
3100 return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super, 3065 return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
3101 mnt);
3102} 3066}
3103 3067
3104static struct file_system_type ntfs_fs_type = { 3068static struct file_system_type ntfs_fs_type = {
3105 .owner = THIS_MODULE, 3069 .owner = THIS_MODULE,
3106 .name = "ntfs", 3070 .name = "ntfs",
3107 .get_sb = ntfs_get_sb, 3071 .mount = ntfs_mount,
3108 .kill_sb = kill_block_super, 3072 .kill_sb = kill_block_super,
3109 .fs_flags = FS_REQUIRES_DEV, 3073 .fs_flags = FS_REQUIRES_DEV,
3110}; 3074};
@@ -3229,8 +3193,8 @@ static void __exit exit_ntfs_fs(void)
3229 ntfs_sysctl(0); 3193 ntfs_sysctl(0);
3230} 3194}
3231 3195
3232MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>"); 3196MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
3233MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov"); 3197MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
3234MODULE_VERSION(NTFS_VERSION); 3198MODULE_VERSION(NTFS_VERSION);
3235MODULE_LICENSE("GPL"); 3199MODULE_LICENSE("GPL");
3236#ifdef DEBUG 3200#ifdef DEBUG
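
Note: the hunks above move ntfs from the removed ->get_sb() callback to the newer ->mount() superblock API; the filesystem now returns the root dentry produced by mount_bdev() instead of filling in a vfsmount. A minimal sketch of the same shape for a hypothetical "examplefs" (every name here is invented for illustration; this is not ntfs code):

/*
 * Illustrative skeleton only: a hypothetical "examplefs" using the
 * same mount_bdev()-based ->mount() shape as the ntfs hunk above.
 */
#include <linux/fs.h>
#include <linux/module.h>

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* A real fill_super reads the on-disk superblock and sets
	 * sb->s_root; this stub just refuses the mount cleanly. */
	return -EINVAL;
}

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
{
	/* mount_bdev() opens the device, creates or reuses the
	 * superblock, and returns the root dentry on success. */
	return mount_bdev(fs_type, flags, dev_name, data, examplefs_fill_super);
}

static struct file_system_type examplefs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};

static int __init examplefs_init(void)
{
	return register_filesystem(&examplefs_fs_type);
}

static void __exit examplefs_exit(void)
{
	unregister_filesystem(&examplefs_fs_type);
}

module_init(examplefs_init);
module_exit(examplefs_exit);
MODULE_LICENSE("GPL");

kill_block_super() pairs with mount_bdev() for FS_REQUIRES_DEV filesystems, which is why the ntfs hunk can leave .kill_sb untouched.
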
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d840669698e..77a8de5f7119 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
1config OCFS2_FS 1config OCFS2_FS
2 tristate "OCFS2 file system support" 2 tristate "OCFS2 file system support"
3 depends on NET && SYSFS 3 depends on NET && SYSFS && CONFIGFS_FS
4 select CONFIGFS_FS
5 select JBD2 4 select JBD2
6 select CRC32 5 select CRC32
7 select QUOTA 6 select QUOTA
@@ -51,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
51 50
52config OCFS2_FS_STATS 51config OCFS2_FS_STATS
53 bool "OCFS2 statistics" 52 bool "OCFS2 statistics"
54 depends on OCFS2_FS 53 depends on OCFS2_FS && DEBUG_FS
55 default y 54 default y
56 help 55 help
57 This option allows some fs statistics to be captured. Enabling 56 This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 391915093fe1..704f6b1742f3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle,
291 return ret; 291 return ret;
292} 292}
293 293
294int ocfs2_check_acl(struct inode *inode, int mask) 294int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
295{ 295{
296 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 296 struct ocfs2_super *osb;
297 struct buffer_head *di_bh = NULL; 297 struct buffer_head *di_bh = NULL;
298 struct posix_acl *acl; 298 struct posix_acl *acl;
299 int ret = -EAGAIN; 299 int ret = -EAGAIN;
300 300
301 if (flags & IPERM_FLAG_RCU)
302 return -ECHILD;
303
304 osb = OCFS2_SB(inode->i_sb);
301 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 305 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
302 return ret; 306 return ret;
303 307
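
Context for the IPERM_FLAG_RCU check added above: the flag is set when the VFS is doing an RCU-mode path walk, where permission callbacks must not sleep. ocfs2_check_acl() may need cluster locks and buffer reads, so it returns -ECHILD to make the VFS fall back to ref-walk mode and call in again under conditions where blocking is allowed; the OCFS2_SB() dereference is simply deferred until after that early return.
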
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 5c5d31f05853..4fe7c9cf4bfb 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
26 __le32 e_id; 26 __le32 e_id;
27}; 27};
28 28
29extern int ocfs2_check_acl(struct inode *, int); 29extern int ocfs2_check_acl(struct inode *, int, unsigned int);
30extern int ocfs2_acl_chmod(struct inode *); 30extern int ocfs2_acl_chmod(struct inode *);
31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
32 struct buffer_head *, struct buffer_head *, 32 struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 592fae5007d1..e4984e259cb6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -565,7 +565,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
565 return ret; 565 return ret;
566} 566}
567 567
568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 568static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
570 struct ocfs2_extent_block *eb); 569 struct ocfs2_extent_block *eb);
571static void ocfs2_adjust_rightmost_records(handle_t *handle, 570static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -5858,6 +5857,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5858 5857
5859 ocfs2_journal_dirty(handle, tl_bh); 5858 ocfs2_journal_dirty(handle, tl_bh);
5860 5859
5860 osb->truncated_clusters += num_clusters;
5861bail: 5861bail:
5862 mlog_exit(status); 5862 mlog_exit(status);
5863 return status; 5863 return status;
@@ -5929,6 +5929,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5929 i--; 5929 i--;
5930 } 5930 }
5931 5931
5932 osb->truncated_clusters = 0;
5933
5932bail: 5934bail:
5933 mlog_exit(status); 5935 mlog_exit(status);
5934 return status; 5936 return status;
@@ -7139,64 +7141,6 @@ bail:
7139} 7141}
7140 7142
7141/* 7143/*
7142 * Expects the inode to already be locked.
7143 */
7144int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7145 struct inode *inode,
7146 struct buffer_head *fe_bh,
7147 struct ocfs2_truncate_context **tc)
7148{
7149 int status;
7150 unsigned int new_i_clusters;
7151 struct ocfs2_dinode *fe;
7152 struct ocfs2_extent_block *eb;
7153 struct buffer_head *last_eb_bh = NULL;
7154
7155 mlog_entry_void();
7156
7157 *tc = NULL;
7158
7159 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7160 i_size_read(inode));
7161 fe = (struct ocfs2_dinode *) fe_bh->b_data;
7162
7163 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7164 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7165 (unsigned long long)le64_to_cpu(fe->i_size));
7166
7167 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7168 if (!(*tc)) {
7169 status = -ENOMEM;
7170 mlog_errno(status);
7171 goto bail;
7172 }
7173 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7174
7175 if (fe->id2.i_list.l_tree_depth) {
7176 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7177 le64_to_cpu(fe->i_last_eb_blk),
7178 &last_eb_bh);
7179 if (status < 0) {
7180 mlog_errno(status);
7181 goto bail;
7182 }
7183 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7184 }
7185
7186 (*tc)->tc_last_eb_bh = last_eb_bh;
7187
7188 status = 0;
7189bail:
7190 if (status < 0) {
7191 if (*tc)
7192 ocfs2_free_truncate_context(*tc);
7193 *tc = NULL;
7194 }
7195 mlog_exit_void();
7196 return status;
7197}
7198
7199/*
7200 * 'start' is inclusive, 'end' is not. 7144 * 'start' is inclusive, 'end' is not.
7201 */ 7145 */
7202int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 7146int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
@@ -7270,18 +7214,3 @@ out_commit:
7270out: 7214out:
7271 return ret; 7215 return ret;
7272} 7216}
7273
7274static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7275{
7276 /*
7277 * The caller is responsible for completing deallocation
7278 * before freeing the context.
7279 */
7280 if (tc->tc_dealloc.c_first_suballocator != NULL)
7281 mlog(ML_NOTICE,
7282 "Truncate completion has non-empty dealloc context\n");
7283
7284 brelse(tc->tc_last_eb_bh);
7285
7286 kfree(tc);
7287}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 55762b554b99..3bd08a03251c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -228,10 +228,6 @@ struct ocfs2_truncate_context {
228 228
229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 229int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
230 u64 range_start, u64 range_end); 230 u64 range_start, u64 range_end);
231int ocfs2_prepare_truncate(struct ocfs2_super *osb,
232 struct inode *inode,
233 struct buffer_head *fe_bh,
234 struct ocfs2_truncate_context **tc);
235int ocfs2_commit_truncate(struct ocfs2_super *osb, 231int ocfs2_commit_truncate(struct ocfs2_super *osb,
236 struct inode *inode, 232 struct inode *inode,
237 struct buffer_head *di_bh); 233 struct buffer_head *di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0de69c9a08be..1fbb0e20131b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
165 * ocfs2 never allocates in this function - the only time we 165 * ocfs2 never allocates in this function - the only time we
166 * need to use BH_New is when we're extending i_size on a file 166 * need to use BH_New is when we're extending i_size on a file
167 * system which doesn't support holes, in which case BH_New 167 * system which doesn't support holes, in which case BH_New
168 * allows block_prepare_write() to zero. 168 * allows __block_write_begin() to zero.
169 * 169 *
170 * If we see this on a sparse file system, then a truncate has 170 * If we see this on a sparse file system, then a truncate has
171 * raced us and removed the cluster. In this case, we clear 171 * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
407 return ret; 407 return ret;
408} 408}
409 409
410/*
411 * This is called from ocfs2_write_zero_page() which has handled it's
412 * own cluster locking and has ensured allocation exists for those
413 * blocks to be written.
414 */
415int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
416 unsigned from, unsigned to)
417{
418 int ret;
419
420 ret = block_prepare_write(page, from, to, ocfs2_get_block);
421
422 return ret;
423}
424
425/* Taken from ext3. We don't necessarily need the full blown 410/* Taken from ext3. We don't necessarily need the full blown
426 * functionality yet, but IMHO it's better to cut and paste the whole 411 * functionality yet, but IMHO it's better to cut and paste the whole
427 * thing so we can avoid introducing our own bugs (and easily pick up 412 * thing so we can avoid introducing our own bugs (and easily pick up
@@ -588,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
588 /* this io's submitter should not have unlocked this before we could */ 573 /* this io's submitter should not have unlocked this before we could */
589 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 574 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
590 575
576 if (ocfs2_iocb_is_sem_locked(iocb)) {
577 up_read(&inode->i_alloc_sem);
578 ocfs2_iocb_clear_sem_locked(iocb);
579 }
580
591 ocfs2_iocb_clear_rw_locked(iocb); 581 ocfs2_iocb_clear_rw_locked(iocb);
592 582
593 level = ocfs2_iocb_rw_locked_level(iocb); 583 level = ocfs2_iocb_rw_locked_level(iocb);
594 if (!level)
595 up_read(&inode->i_alloc_sem);
596 ocfs2_rw_unlock(inode, level); 584 ocfs2_rw_unlock(inode, level);
597 585
598 if (is_async) 586 if (is_async)
@@ -732,7 +720,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
732} 720}
733 721
734/* 722/*
735 * Some of this taken from block_prepare_write(). We already have our 723 * Some of this taken from __block_write_begin(). We already have our
736 * mapping by now though, and the entire write will be allocating or 724 * mapping by now though, and the entire write will be allocating or
737 * it won't, so not much need to use BH_New. 725 * it won't, so not much need to use BH_New.
738 * 726 *
@@ -883,8 +871,8 @@ struct ocfs2_write_ctxt {
883 * out in so that future reads from that region will get 871 * out in so that future reads from that region will get
 884 * zeros. 872 * zeros.
885 */ 873 */
886 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
887 unsigned int w_num_pages; 874 unsigned int w_num_pages;
875 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
888 struct page *w_target_page; 876 struct page *w_target_page;
889 877
890 /* 878 /*
@@ -1642,13 +1630,51 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1642 return ret; 1630 return ret;
1643} 1631}
1644 1632
1645int ocfs2_write_begin_nolock(struct address_space *mapping, 1633/*
1634 * Try to flush truncate logs if we can free enough clusters from it.
1635 * As for return value, "< 0" means error, "0" no space and "1" means
1636 * we have freed enough spaces and let the caller try to allocate again.
1637 */
1638static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
1639 unsigned int needed)
1640{
1641 tid_t target;
1642 int ret = 0;
1643 unsigned int truncated_clusters;
1644
1645 mutex_lock(&osb->osb_tl_inode->i_mutex);
1646 truncated_clusters = osb->truncated_clusters;
1647 mutex_unlock(&osb->osb_tl_inode->i_mutex);
1648
1649 /*
1650 * Check whether we can succeed in allocating if we free
1651 * the truncate log.
1652 */
1653 if (truncated_clusters < needed)
1654 goto out;
1655
1656 ret = ocfs2_flush_truncate_log(osb);
1657 if (ret) {
1658 mlog_errno(ret);
1659 goto out;
1660 }
1661
1662 if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
1663 jbd2_log_wait_commit(osb->journal->j_journal, target);
1664 ret = 1;
1665 }
1666out:
1667 return ret;
1668}
1669
1670int ocfs2_write_begin_nolock(struct file *filp,
1671 struct address_space *mapping,
1646 loff_t pos, unsigned len, unsigned flags, 1672 loff_t pos, unsigned len, unsigned flags,
1647 struct page **pagep, void **fsdata, 1673 struct page **pagep, void **fsdata,
1648 struct buffer_head *di_bh, struct page *mmap_page) 1674 struct buffer_head *di_bh, struct page *mmap_page)
1649{ 1675{
1650 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; 1676 int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
1651 unsigned int clusters_to_alloc, extents_to_split; 1677 unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
1652 struct ocfs2_write_ctxt *wc; 1678 struct ocfs2_write_ctxt *wc;
1653 struct inode *inode = mapping->host; 1679 struct inode *inode = mapping->host;
1654 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1657,7 +1683,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1657 struct ocfs2_alloc_context *meta_ac = NULL; 1683 struct ocfs2_alloc_context *meta_ac = NULL;
1658 handle_t *handle; 1684 handle_t *handle;
1659 struct ocfs2_extent_tree et; 1685 struct ocfs2_extent_tree et;
1686 int try_free = 1, ret1;
1660 1687
1688try_again:
1661 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1689 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1662 if (ret) { 1690 if (ret) {
1663 mlog_errno(ret); 1691 mlog_errno(ret);
@@ -1692,7 +1720,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1692 mlog_errno(ret); 1720 mlog_errno(ret);
1693 goto out; 1721 goto out;
1694 } else if (ret == 1) { 1722 } else if (ret == 1) {
1695 ret = ocfs2_refcount_cow(inode, di_bh, 1723 clusters_need = wc->w_clen;
1724 ret = ocfs2_refcount_cow(inode, filp, di_bh,
1696 wc->w_cpos, wc->w_clen, UINT_MAX); 1725 wc->w_cpos, wc->w_clen, UINT_MAX);
1697 if (ret) { 1726 if (ret) {
1698 mlog_errno(ret); 1727 mlog_errno(ret);
@@ -1706,6 +1735,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1706 mlog_errno(ret); 1735 mlog_errno(ret);
1707 goto out; 1736 goto out;
1708 } 1737 }
1738 clusters_need += clusters_to_alloc;
1709 1739
1710 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1740 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1711 1741
@@ -1828,6 +1858,22 @@ out:
1828 ocfs2_free_alloc_context(data_ac); 1858 ocfs2_free_alloc_context(data_ac);
1829 if (meta_ac) 1859 if (meta_ac)
1830 ocfs2_free_alloc_context(meta_ac); 1860 ocfs2_free_alloc_context(meta_ac);
1861
1862 if (ret == -ENOSPC && try_free) {
1863 /*
1864 * Try to free some truncate log so that we can have enough
1865 * clusters to allocate.
1866 */
1867 try_free = 0;
1868
1869 ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
1870 if (ret1 == 1)
1871 goto try_again;
1872
1873 if (ret1 < 0)
1874 mlog_errno(ret1);
1875 }
1876
1831 return ret; 1877 return ret;
1832} 1878}
1833 1879
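
The retry added above is bounded: on -ENOSPC the code clears try_free, flushes the truncate log at most once, and jumps back to try_again only when ocfs2_try_to_free_truncate_log() reports that enough clusters were freed. A self-contained userspace sketch of the same shape, with invented allocate()/reclaim() helpers standing in for the ocfs2 calls:

/* Userspace analogue of the retry-once-on-ENOSPC pattern above. */
#include <errno.h>
#include <stdio.h>

static int space = 0;                   /* free clusters (toy state) */

static int allocate(unsigned int need)
{
	if (space < (int)need)
		return -ENOSPC;
	space -= need;
	return 0;
}

static int reclaim(unsigned int need)
{
	/* Pretend flushing a log frees enough; return 1 like
	 * ocfs2_try_to_free_truncate_log() when a retry may succeed. */
	space += need;
	return 1;
}

int main(void)
{
	unsigned int need = 8;
	int try_free = 1, ret;

try_again:
	ret = allocate(need);
	if (ret == -ENOSPC && try_free) {
		try_free = 0;           /* retry at most once */
		if (reclaim(need) == 1)
			goto try_again;
	}
	printf("allocate: %d\n", ret);  /* 0 on the retried attempt */
	return 0;
}
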
@@ -1854,7 +1900,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1854 */ 1900 */
1855 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1901 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1856 1902
1857 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1903 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
1858 fsdata, di_bh, NULL); 1904 fsdata, di_bh, NULL);
1859 if (ret) { 1905 if (ret) {
1860 mlog_errno(ret); 1906 mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc513..eceb456037c1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
22#ifndef OCFS2_AOPS_H 22#ifndef OCFS2_AOPS_H
23#define OCFS2_AOPS_H 23#define OCFS2_AOPS_H
24 24
25int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
26 unsigned from, unsigned to);
27
28handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 25handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
29 struct page *page, 26 struct page *page,
30 unsigned from, 27 unsigned from,
@@ -48,7 +45,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
48 loff_t pos, unsigned len, unsigned copied, 45 loff_t pos, unsigned len, unsigned copied,
49 struct page *page, void *fsdata); 46 struct page *page, void *fsdata);
50 47
51int ocfs2_write_begin_nolock(struct address_space *mapping, 48int ocfs2_write_begin_nolock(struct file *filp,
49 struct address_space *mapping,
52 loff_t pos, unsigned len, unsigned flags, 50 loff_t pos, unsigned len, unsigned flags,
53 struct page **pagep, void **fsdata, 51 struct page **pagep, void **fsdata,
54 struct buffer_head *di_bh, struct page *mmap_page); 52 struct buffer_head *di_bh, struct page *mmap_page);
@@ -70,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
70 else 68 else
71 clear_bit(1, (unsigned long *)&iocb->private); 69 clear_bit(1, (unsigned long *)&iocb->private);
72} 70}
71
72/*
 73 * Using a named enum representing lock types in terms of the bit number
 74 * stored in iocb->private, which is used for communication between
75 * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
76 */
77enum ocfs2_iocb_lock_bits {
78 OCFS2_IOCB_RW_LOCK = 0,
79 OCFS2_IOCB_RW_LOCK_LEVEL,
80 OCFS2_IOCB_SEM,
81 OCFS2_IOCB_NUM_LOCKS
82};
83
73#define ocfs2_iocb_clear_rw_locked(iocb) \ 84#define ocfs2_iocb_clear_rw_locked(iocb) \
74 clear_bit(0, (unsigned long *)&iocb->private) 85 clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
75#define ocfs2_iocb_rw_locked_level(iocb) \ 86#define ocfs2_iocb_rw_locked_level(iocb) \
76 test_bit(1, (unsigned long *)&iocb->private) 87 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
88#define ocfs2_iocb_set_sem_locked(iocb) \
89 set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
90#define ocfs2_iocb_clear_sem_locked(iocb) \
91 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
92#define ocfs2_iocb_is_sem_locked(iocb) \
93 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
77#endif /* OCFS2_FILE_H */ 94#endif /* OCFS2_FILE_H */
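
The enum above replaces the magic 0/1 bit indices with names and adds a third bit (OCFS2_IOCB_SEM) recording whether the submitter took i_alloc_sem, so ocfs2_dio_end_io() can drop it exactly once. A userspace sketch of the same bookkeeping, with invented LOCK_* macros playing the role of the kernel's set_bit()/clear_bit()/test_bit():

#include <stdio.h>

enum iocb_lock_bits {                   /* mirrors enum ocfs2_iocb_lock_bits */
	IOCB_RW_LOCK = 0,
	IOCB_RW_LOCK_LEVEL,
	IOCB_SEM,
};

#define LOCK_SET(w, b)    ((w) |=  (1UL << (b)))
#define LOCK_CLEAR(w, b)  ((w) &= ~(1UL << (b)))
#define LOCK_TEST(w, b)   (!!((w) & (1UL << (b))))

int main(void)
{
	unsigned long priv = 0;         /* plays the role of iocb->private */

	LOCK_SET(priv, IOCB_SEM);       /* taken before direct I/O is issued */
	if (LOCK_TEST(priv, IOCB_SEM)) {
		/* end_io path: release the semaphore exactly once */
		LOCK_CLEAR(priv, IOCB_SEM);
	}
	printf("sem bit now %d\n", LOCK_TEST(priv, IOCB_SEM)); /* 0 */
	return 0;
}
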
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f92d56..b108e863d8f6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,10 +62,53 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
62static LIST_HEAD(o2hb_node_events); 62static LIST_HEAD(o2hb_node_events);
63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
64 64
65/*
66 * In global heartbeat, we maintain a series of region bitmaps.
67 * - o2hb_region_bitmap allows us to limit the region number to max region.
68 * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
69 * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
70 * heartbeat on it.
71 * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
72 */
73static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
74static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
75static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
76static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
77
78#define O2HB_DB_TYPE_LIVENODES 0
79#define O2HB_DB_TYPE_LIVEREGIONS 1
80#define O2HB_DB_TYPE_QUORUMREGIONS 2
81#define O2HB_DB_TYPE_FAILEDREGIONS 3
82#define O2HB_DB_TYPE_REGION_LIVENODES 4
83#define O2HB_DB_TYPE_REGION_NUMBER 5
84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85#define O2HB_DB_TYPE_REGION_PINNED 7
86struct o2hb_debug_buf {
87 int db_type;
88 int db_size;
89 int db_len;
90 void *db_data;
91};
92
93static struct o2hb_debug_buf *o2hb_db_livenodes;
94static struct o2hb_debug_buf *o2hb_db_liveregions;
95static struct o2hb_debug_buf *o2hb_db_quorumregions;
96static struct o2hb_debug_buf *o2hb_db_failedregions;
97
65#define O2HB_DEBUG_DIR "o2hb" 98#define O2HB_DEBUG_DIR "o2hb"
66#define O2HB_DEBUG_LIVENODES "livenodes" 99#define O2HB_DEBUG_LIVENODES "livenodes"
100#define O2HB_DEBUG_LIVEREGIONS "live_regions"
101#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
102#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
103#define O2HB_DEBUG_REGION_NUMBER "num"
104#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
105#define O2HB_DEBUG_REGION_PINNED "pinned"
106
67static struct dentry *o2hb_debug_dir; 107static struct dentry *o2hb_debug_dir;
68static struct dentry *o2hb_debug_livenodes; 108static struct dentry *o2hb_debug_livenodes;
109static struct dentry *o2hb_debug_liveregions;
110static struct dentry *o2hb_debug_quorumregions;
111static struct dentry *o2hb_debug_failedregions;
69 112
70static LIST_HEAD(o2hb_all_regions); 113static LIST_HEAD(o2hb_all_regions);
71 114
@@ -77,7 +120,46 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
77 120
78#define O2HB_DEFAULT_BLOCK_BITS 9 121#define O2HB_DEFAULT_BLOCK_BITS 9
79 122
123enum o2hb_heartbeat_modes {
124 O2HB_HEARTBEAT_LOCAL = 0,
125 O2HB_HEARTBEAT_GLOBAL,
126 O2HB_HEARTBEAT_NUM_MODES,
127};
128
129char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
130 "local", /* O2HB_HEARTBEAT_LOCAL */
131 "global", /* O2HB_HEARTBEAT_GLOBAL */
132};
133
80unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 134unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
135unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
136
137/*
138 * o2hb_dependent_users tracks the number of registered callbacks that depend
139 * on heartbeat. o2net and o2dlm are two entities that register this callback.
140 * However only o2dlm depends on the heartbeat. It does not want the heartbeat
141 * to stop while a dlm domain is still active.
142 */
143unsigned int o2hb_dependent_users;
144
145/*
146 * In global heartbeat mode, all regions are pinned if there are one or more
147 * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
148 * regions are unpinned if the region count exceeds the cut off or the number
149 * of dependent users falls to zero.
150 */
151#define O2HB_PIN_CUT_OFF 3
152
153/*
154 * In local heartbeat mode, we assume the dlm domain name to be the same as
155 * region uuid. This is true for domains created for the file system but not
156 * necessarily true for userdlm domains. This is a known limitation.
157 *
158 * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
159 * works for both file system and userdlm domains.
160 */
161static int o2hb_region_pin(const char *region_uuid);
162static void o2hb_region_unpin(const char *region_uuid);
81 163
82/* Only sets a new threshold if there are no active regions. 164/* Only sets a new threshold if there are no active regions.
83 * 165 *
@@ -94,6 +176,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
94 } 176 }
95} 177}
96 178
179static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
180{
181 int ret = -1;
182
183 if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
184 spin_lock(&o2hb_live_lock);
185 if (list_empty(&o2hb_all_regions)) {
186 o2hb_heartbeat_mode = hb_mode;
187 ret = 0;
188 }
189 spin_unlock(&o2hb_live_lock);
190 }
191
192 return ret;
193}
194
97struct o2hb_node_event { 195struct o2hb_node_event {
98 struct list_head hn_item; 196 struct list_head hn_item;
99 enum o2hb_callback_type hn_event_type; 197 enum o2hb_callback_type hn_event_type;
@@ -117,7 +215,9 @@ struct o2hb_region {
117 struct config_item hr_item; 215 struct config_item hr_item;
118 216
119 struct list_head hr_all_item; 217 struct list_head hr_all_item;
120 unsigned hr_unclean_stop:1; 218 unsigned hr_unclean_stop:1,
219 hr_item_pinned:1,
220 hr_item_dropped:1;
121 221
122 /* protected by the hr_callback_sem */ 222 /* protected by the hr_callback_sem */
123 struct task_struct *hr_task; 223 struct task_struct *hr_task;
@@ -135,6 +235,20 @@ struct o2hb_region {
135 struct block_device *hr_bdev; 235 struct block_device *hr_bdev;
136 struct o2hb_disk_slot *hr_slots; 236 struct o2hb_disk_slot *hr_slots;
137 237
238 /* live node map of this region */
239 unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
240 unsigned int hr_region_num;
241
242 struct dentry *hr_debug_dir;
243 struct dentry *hr_debug_livenodes;
244 struct dentry *hr_debug_regnum;
245 struct dentry *hr_debug_elapsed_time;
246 struct dentry *hr_debug_pinned;
247 struct o2hb_debug_buf *hr_db_livenodes;
248 struct o2hb_debug_buf *hr_db_regnum;
249 struct o2hb_debug_buf *hr_db_elapsed_time;
250 struct o2hb_debug_buf *hr_db_pinned;
251
138 /* let the person setting up hb wait for it to return until it 252 /* let the person setting up hb wait for it to return until it
139 * has reached a 'steady' state. This will be fixed when we have 253 * has reached a 'steady' state. This will be fixed when we have
140 * a more complete api that doesn't lead to this sort of fragility. */ 254 * a more complete api that doesn't lead to this sort of fragility. */
@@ -163,8 +277,19 @@ struct o2hb_bio_wait_ctxt {
163 int wc_error; 277 int wc_error;
164}; 278};
165 279
280static int o2hb_pop_count(void *map, int count)
281{
282 int i = -1, pop = 0;
283
284 while ((i = find_next_bit(map, count, i + 1)) < count)
285 pop++;
286 return pop;
287}
288
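
o2hb_pop_count() above is an open-coded population count: each find_next_bit() hit bumps the counter, which is the same number bitmap_weight() would return. A quick userspace check of the idea, using plain shifts in place of the kernel bitmap helpers:

#include <stdio.h>

/* Count set bits in a word the same way the loop above does:
 * visit every bit position and count the ones that are set. */
static int pop_count(unsigned long map, int nbits)
{
	int pop = 0;
	for (int i = 0; i < nbits; i++)
		if (map & (1UL << i))
			pop++;
	return pop;
}

int main(void)
{
	printf("%d\n", pop_count(0xb, 32));  /* bits 0, 1, 3 set: prints 3 */
	return 0;
}
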
166static void o2hb_write_timeout(struct work_struct *work) 289static void o2hb_write_timeout(struct work_struct *work)
167{ 290{
291 int failed, quorum;
292 unsigned long flags;
168 struct o2hb_region *reg = 293 struct o2hb_region *reg =
169 container_of(work, struct o2hb_region, 294 container_of(work, struct o2hb_region,
170 hr_write_timeout_work.work); 295 hr_write_timeout_work.work);
@@ -172,6 +297,28 @@ static void o2hb_write_timeout(struct work_struct *work)
172 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 297 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
173 "milliseconds\n", reg->hr_dev_name, 298 "milliseconds\n", reg->hr_dev_name,
174 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 299 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
300
301 if (o2hb_global_heartbeat_active()) {
302 spin_lock_irqsave(&o2hb_live_lock, flags);
303 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
304 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
305 failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
306 O2NM_MAX_REGIONS);
307 quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
308 O2NM_MAX_REGIONS);
309 spin_unlock_irqrestore(&o2hb_live_lock, flags);
310
311 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
312 quorum, failed);
313
314 /*
315 * Fence if the number of failed regions >= half the number
316 * of quorum regions
317 */
318 if ((failed << 1) < quorum)
319 return;
320 }
321
175 o2quo_disk_timeout(); 322 o2quo_disk_timeout();
176} 323}
177 324
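
The early return above encodes the comment's condition as its negation: o2quo_disk_timeout() (which can fence the node) is only reached when failed << 1 is not less than quorum, i.e. when at least half of the quorum regions have hit write timeouts. A worked check of that comparison (should_fence() is an invented helper, not kernel code):

#include <stdio.h>

static int should_fence(int failed, int quorum)
{
	return !((failed << 1) < quorum);   /* same test as the hunk above */
}

int main(void)
{
	printf("%d\n", should_fence(1, 4)); /* 0: 2 < 4, keep running */
	printf("%d\n", should_fence(2, 4)); /* 1: 4 is not < 4, fence */
	return 0;
}
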
@@ -180,6 +327,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
180 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 327 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
181 O2HB_MAX_WRITE_TIMEOUT_MS); 328 O2HB_MAX_WRITE_TIMEOUT_MS);
182 329
330 if (o2hb_global_heartbeat_active()) {
331 spin_lock(&o2hb_live_lock);
332 clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
333 spin_unlock(&o2hb_live_lock);
334 }
183 cancel_delayed_work(&reg->hr_write_timeout_work); 335 cancel_delayed_work(&reg->hr_write_timeout_work);
184 reg->hr_last_timeout_start = jiffies; 336 reg->hr_last_timeout_start = jiffies;
185 schedule_delayed_work(&reg->hr_write_timeout_work, 337 schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -188,8 +340,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
188 340
189static void o2hb_disarm_write_timeout(struct o2hb_region *reg) 341static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
190{ 342{
191 cancel_delayed_work(&reg->hr_write_timeout_work); 343 cancel_delayed_work_sync(&reg->hr_write_timeout_work);
192 flush_scheduled_work();
193} 344}
194 345
195static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) 346static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -513,6 +664,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
513{ 664{
514 assert_spin_locked(&o2hb_live_lock); 665 assert_spin_locked(&o2hb_live_lock);
515 666
667 BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
668
516 event->hn_event_type = type; 669 event->hn_event_type = type;
517 event->hn_node = node; 670 event->hn_node = node;
518 event->hn_node_num = node_num; 671 event->hn_node_num = node_num;
@@ -554,6 +707,43 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
554 o2nm_node_put(node); 707 o2nm_node_put(node);
555} 708}
556 709
710static void o2hb_set_quorum_device(struct o2hb_region *reg,
711 struct o2hb_disk_slot *slot)
712{
713 assert_spin_locked(&o2hb_live_lock);
714
715 if (!o2hb_global_heartbeat_active())
716 return;
717
718 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
719 return;
720
721 /*
722 * A region can be added to the quorum only when it sees all
723 * live nodes heartbeat on it. In other words, the region has been
724 * added to all nodes.
725 */
726 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
727 sizeof(o2hb_live_node_bitmap)))
728 return;
729
730 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
731 return;
732
733 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
734 config_item_name(&reg->hr_item));
735
736 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
737
738 /*
739 * If global heartbeat active, unpin all regions if the
740 * region count > CUT_OFF
741 */
742 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
743 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
744 o2hb_region_unpin(NULL);
745}
746
557static int o2hb_check_slot(struct o2hb_region *reg, 747static int o2hb_check_slot(struct o2hb_region *reg,
558 struct o2hb_disk_slot *slot) 748 struct o2hb_disk_slot *slot)
559{ 749{
@@ -565,14 +755,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
565 u64 cputime; 755 u64 cputime;
566 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; 756 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
567 unsigned int slot_dead_ms; 757 unsigned int slot_dead_ms;
758 int tmp;
568 759
569 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 760 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
570 761
571 /* Is this correct? Do we assume that the node doesn't exist 762 /*
572 * if we're not configured for him? */ 763 * If a node is no longer configured but is still in the livemap, we
764 * may need to clear that bit from the livemap.
765 */
573 node = o2nm_get_node_by_num(slot->ds_node_num); 766 node = o2nm_get_node_by_num(slot->ds_node_num);
574 if (!node) 767 if (!node) {
575 return 0; 768 spin_lock(&o2hb_live_lock);
769 tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
770 spin_unlock(&o2hb_live_lock);
771 if (!tmp)
772 return 0;
773 }
576 774
577 if (!o2hb_verify_crc(reg, hb_block)) { 775 if (!o2hb_verify_crc(reg, hb_block)) {
578 /* all paths from here will drop o2hb_live_lock for 776 /* all paths from here will drop o2hb_live_lock for
@@ -639,8 +837,12 @@ fire_callbacks:
639 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", 837 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
640 slot->ds_node_num, (long long)slot->ds_last_generation); 838 slot->ds_node_num, (long long)slot->ds_last_generation);
641 839
840 set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
841
642 /* first on the list generates a callback */ 842 /* first on the list generates a callback */
643 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 843 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
844 mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
845 "bitmap\n", slot->ds_node_num);
644 set_bit(slot->ds_node_num, o2hb_live_node_bitmap); 846 set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
645 847
646 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, 848 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -684,13 +886,18 @@ fire_callbacks:
684 mlog(ML_HEARTBEAT, "Node %d left my region\n", 886 mlog(ML_HEARTBEAT, "Node %d left my region\n",
685 slot->ds_node_num); 887 slot->ds_node_num);
686 888
889 clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
890
687 /* last off the live_slot generates a callback */ 891 /* last off the live_slot generates a callback */
688 list_del_init(&slot->ds_live_item); 892 list_del_init(&slot->ds_live_item);
689 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 893 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
894 mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
895 "nodes bitmap\n", slot->ds_node_num);
690 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); 896 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
691 897
692 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, 898 /* node can be null */
693 slot->ds_node_num); 899 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
900 node, slot->ds_node_num);
694 901
695 changed = 1; 902 changed = 1;
696 } 903 }
@@ -706,11 +913,14 @@ fire_callbacks:
706 slot->ds_equal_samples = 0; 913 slot->ds_equal_samples = 0;
707 } 914 }
708out: 915out:
916 o2hb_set_quorum_device(reg, slot);
917
709 spin_unlock(&o2hb_live_lock); 918 spin_unlock(&o2hb_live_lock);
710 919
711 o2hb_run_event_list(&event); 920 o2hb_run_event_list(&event);
712 921
713 o2nm_node_put(node); 922 if (node)
923 o2nm_node_put(node);
714 return changed; 924 return changed;
715} 925}
716 926
@@ -737,6 +947,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
737{ 947{
738 int i, ret, highest_node, change = 0; 948 int i, ret, highest_node, change = 0;
739 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 949 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
950 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
740 struct o2hb_bio_wait_ctxt write_wc; 951 struct o2hb_bio_wait_ctxt write_wc;
741 952
742 ret = o2nm_configured_node_map(configured_nodes, 953 ret = o2nm_configured_node_map(configured_nodes,
@@ -746,6 +957,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
746 return ret; 957 return ret;
747 } 958 }
748 959
960 /*
961 * If a node is not configured but is in the livemap, we still need
962 * to read the slot so as to be able to remove it from the livemap.
963 */
964 o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
965 i = -1;
966 while ((i = find_next_bit(live_node_bitmap,
967 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
968 set_bit(i, configured_nodes);
969 }
970
749 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 971 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
750 if (highest_node >= O2NM_MAX_NODES) { 972 if (highest_node >= O2NM_MAX_NODES) {
751 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 973 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
@@ -860,6 +1082,9 @@ static int o2hb_thread(void *data)
860 1082
861 set_user_nice(current, -20); 1083 set_user_nice(current, -20);
862 1084
1085 /* Pin node */
1086 o2nm_depend_this_node();
1087
863 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1088 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
864 /* We track the time spent inside 1089 /* We track the time spent inside
865 * o2hb_do_disk_heartbeat so that we avoid more than 1090 * o2hb_do_disk_heartbeat so that we avoid more than
@@ -909,6 +1134,9 @@ static int o2hb_thread(void *data)
909 mlog_errno(ret); 1134 mlog_errno(ret);
910 } 1135 }
911 1136
1137 /* Unpin node */
1138 o2nm_undepend_this_node();
1139
912 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1140 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
913 1141
914 return 0; 1142 return 0;
@@ -917,21 +1145,65 @@ static int o2hb_thread(void *data)
917#ifdef CONFIG_DEBUG_FS 1145#ifdef CONFIG_DEBUG_FS
918static int o2hb_debug_open(struct inode *inode, struct file *file) 1146static int o2hb_debug_open(struct inode *inode, struct file *file)
919{ 1147{
1148 struct o2hb_debug_buf *db = inode->i_private;
1149 struct o2hb_region *reg;
920 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1150 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
921 char *buf = NULL; 1151 char *buf = NULL;
922 int i = -1; 1152 int i = -1;
923 int out = 0; 1153 int out = 0;
924 1154
1155 /* max_nodes should be the largest bitmap we pass here */
1156 BUG_ON(sizeof(map) < db->db_size);
1157
925 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 1158 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
926 if (!buf) 1159 if (!buf)
927 goto bail; 1160 goto bail;
928 1161
929 o2hb_fill_node_map(map, sizeof(map)); 1162 switch (db->db_type) {
1163 case O2HB_DB_TYPE_LIVENODES:
1164 case O2HB_DB_TYPE_LIVEREGIONS:
1165 case O2HB_DB_TYPE_QUORUMREGIONS:
1166 case O2HB_DB_TYPE_FAILEDREGIONS:
1167 spin_lock(&o2hb_live_lock);
1168 memcpy(map, db->db_data, db->db_size);
1169 spin_unlock(&o2hb_live_lock);
1170 break;
930 1171
931 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) 1172 case O2HB_DB_TYPE_REGION_LIVENODES:
1173 spin_lock(&o2hb_live_lock);
1174 reg = (struct o2hb_region *)db->db_data;
1175 memcpy(map, reg->hr_live_node_bitmap, db->db_size);
1176 spin_unlock(&o2hb_live_lock);
1177 break;
1178
1179 case O2HB_DB_TYPE_REGION_NUMBER:
1180 reg = (struct o2hb_region *)db->db_data;
1181 out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
1182 reg->hr_region_num);
1183 goto done;
1184
1185 case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
1186 reg = (struct o2hb_region *)db->db_data;
1187 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1188 jiffies_to_msecs(jiffies -
1189 reg->hr_last_timeout_start));
1190 goto done;
1191
1192 case O2HB_DB_TYPE_REGION_PINNED:
1193 reg = (struct o2hb_region *)db->db_data;
1194 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1195 !!reg->hr_item_pinned);
1196 goto done;
1197
1198 default:
1199 goto done;
1200 }
1201
1202 while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
932 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); 1203 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
933 out += snprintf(buf + out, PAGE_SIZE - out, "\n"); 1204 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
934 1205
1206done:
935 i_size_write(inode, out); 1207 i_size_write(inode, out);
936 1208
937 file->private_data = buf; 1209 file->private_data = buf;
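
The reworked open above snapshots the requested bitmap under o2hb_live_lock (or formats a scalar directly), builds the whole text into one page-sized buffer, and parks it in file->private_data for later reads. A userspace sketch of just the snprintf() offset bookkeeping the formatting cases rely on (the bit values are made up):

#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
	static char buf[PAGE_SIZE];
	int out = 0;
	int bits[] = { 0, 3, 7 };       /* pretend these bits were set */

	/* Append each item, advancing the running offset like the
	 * debugfs open routine above does. */
	for (unsigned i = 0; i < sizeof(bits) / sizeof(bits[0]); i++)
		out += snprintf(buf + out, PAGE_SIZE - out, "%d ", bits[i]);
	out += snprintf(buf + out, PAGE_SIZE - out, "\n");

	fputs(buf, stdout);             /* "0 3 7" then a newline */
	return 0;
}
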
@@ -978,10 +1250,104 @@ static const struct file_operations o2hb_debug_fops = {
978 1250
979void o2hb_exit(void) 1251void o2hb_exit(void)
980{ 1252{
981 if (o2hb_debug_livenodes) 1253 kfree(o2hb_db_livenodes);
982 debugfs_remove(o2hb_debug_livenodes); 1254 kfree(o2hb_db_liveregions);
983 if (o2hb_debug_dir) 1255 kfree(o2hb_db_quorumregions);
984 debugfs_remove(o2hb_debug_dir); 1256 kfree(o2hb_db_failedregions);
1257 debugfs_remove(o2hb_debug_failedregions);
1258 debugfs_remove(o2hb_debug_quorumregions);
1259 debugfs_remove(o2hb_debug_liveregions);
1260 debugfs_remove(o2hb_debug_livenodes);
1261 debugfs_remove(o2hb_debug_dir);
1262}
1263
1264static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
1265 struct o2hb_debug_buf **db, int db_len,
1266 int type, int size, int len, void *data)
1267{
1268 *db = kmalloc(db_len, GFP_KERNEL);
1269 if (!*db)
1270 return NULL;
1271
1272 (*db)->db_type = type;
1273 (*db)->db_size = size;
1274 (*db)->db_len = len;
1275 (*db)->db_data = data;
1276
1277 return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
1278 &o2hb_debug_fops);
1279}
1280
1281static int o2hb_debug_init(void)
1282{
1283 int ret = -ENOMEM;
1284
1285 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
1286 if (!o2hb_debug_dir) {
1287 mlog_errno(ret);
1288 goto bail;
1289 }
1290
1291 o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1292 o2hb_debug_dir,
1293 &o2hb_db_livenodes,
1294 sizeof(*o2hb_db_livenodes),
1295 O2HB_DB_TYPE_LIVENODES,
1296 sizeof(o2hb_live_node_bitmap),
1297 O2NM_MAX_NODES,
1298 o2hb_live_node_bitmap);
1299 if (!o2hb_debug_livenodes) {
1300 mlog_errno(ret);
1301 goto bail;
1302 }
1303
1304 o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
1305 o2hb_debug_dir,
1306 &o2hb_db_liveregions,
1307 sizeof(*o2hb_db_liveregions),
1308 O2HB_DB_TYPE_LIVEREGIONS,
1309 sizeof(o2hb_live_region_bitmap),
1310 O2NM_MAX_REGIONS,
1311 o2hb_live_region_bitmap);
1312 if (!o2hb_debug_liveregions) {
1313 mlog_errno(ret);
1314 goto bail;
1315 }
1316
1317 o2hb_debug_quorumregions =
1318 o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
1319 o2hb_debug_dir,
1320 &o2hb_db_quorumregions,
1321 sizeof(*o2hb_db_quorumregions),
1322 O2HB_DB_TYPE_QUORUMREGIONS,
1323 sizeof(o2hb_quorum_region_bitmap),
1324 O2NM_MAX_REGIONS,
1325 o2hb_quorum_region_bitmap);
1326 if (!o2hb_debug_quorumregions) {
1327 mlog_errno(ret);
1328 goto bail;
1329 }
1330
1331 o2hb_debug_failedregions =
1332 o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
1333 o2hb_debug_dir,
1334 &o2hb_db_failedregions,
1335 sizeof(*o2hb_db_failedregions),
1336 O2HB_DB_TYPE_FAILEDREGIONS,
1337 sizeof(o2hb_failed_region_bitmap),
1338 O2NM_MAX_REGIONS,
1339 o2hb_failed_region_bitmap);
1340 if (!o2hb_debug_failedregions) {
1341 mlog_errno(ret);
1342 goto bail;
1343 }
1344
1345 ret = 0;
1346bail:
1347 if (ret)
1348 o2hb_exit();
1349
1350 return ret;
985} 1351}
986 1352
987int o2hb_init(void) 1353int o2hb_init(void)
@@ -997,24 +1363,14 @@ int o2hb_init(void)
997 INIT_LIST_HEAD(&o2hb_node_events); 1363 INIT_LIST_HEAD(&o2hb_node_events);
998 1364
999 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); 1365 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
1366 memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
1367 memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
1368 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
1369 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
1000 1370
1001 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); 1371 o2hb_dependent_users = 0;
1002 if (!o2hb_debug_dir) {
1003 mlog_errno(-ENOMEM);
1004 return -ENOMEM;
1005 }
1006 1372
1007 o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES, 1373 return o2hb_debug_init();
1008 S_IFREG|S_IRUSR,
1009 o2hb_debug_dir, NULL,
1010 &o2hb_debug_fops);
1011 if (!o2hb_debug_livenodes) {
1012 mlog_errno(-ENOMEM);
1013 debugfs_remove(o2hb_debug_dir);
1014 return -ENOMEM;
1015 }
1016
1017 return 0;
1018} 1374}
1019 1375
1020/* if we're already in a callback then we're already serialized by the sem */ 1376/* if we're already in a callback then we're already serialized by the sem */
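
o2hb_init() now delegates all debugfs setup to o2hb_debug_init(), whose bail path simply calls o2hb_exit(); that is safe because debugfs_remove() and kfree() both tolerate NULL, so one teardown routine covers every partial-failure state. A userspace sketch of the same create-all-or-tear-down-everything shape (demo_init()/demo_exit() are invented names):

#include <stdlib.h>

static void *a, *b;

static void demo_exit(void)
{
	/* free(NULL) is a no-op, like debugfs_remove(NULL)/kfree(NULL),
	 * so this is safe no matter how far demo_init() got. */
	free(b);
	free(a);
}

static int demo_init(void)
{
	a = malloc(16);
	if (!a)
		goto bail;
	b = malloc(16);
	if (!b)
		goto bail;
	return 0;
bail:
	demo_exit();
	return -1;
}

int main(void)
{
	return demo_init();
}
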
@@ -1078,6 +1434,14 @@ static void o2hb_region_release(struct config_item *item)
1078 if (reg->hr_slots) 1434 if (reg->hr_slots)
1079 kfree(reg->hr_slots); 1435 kfree(reg->hr_slots);
1080 1436
1437 kfree(reg->hr_db_regnum);
1438 kfree(reg->hr_db_livenodes);
1439 debugfs_remove(reg->hr_debug_livenodes);
1440 debugfs_remove(reg->hr_debug_regnum);
1441 debugfs_remove(reg->hr_debug_elapsed_time);
1442 debugfs_remove(reg->hr_debug_pinned);
1443 debugfs_remove(reg->hr_debug_dir);
1444
1081 spin_lock(&o2hb_live_lock); 1445 spin_lock(&o2hb_live_lock);
1082 list_del(&reg->hr_all_item); 1446 list_del(&reg->hr_all_item);
1083 spin_unlock(&o2hb_live_lock); 1447 spin_unlock(&o2hb_live_lock);
@@ -1365,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1365 goto out; 1729 goto out;
1366 1730
1367 reg->hr_bdev = I_BDEV(filp->f_mapping->host); 1731 reg->hr_bdev = I_BDEV(filp->f_mapping->host);
1368 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); 1732 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
1369 if (ret) { 1733 if (ret) {
1370 reg->hr_bdev = NULL; 1734 reg->hr_bdev = NULL;
1371 goto out; 1735 goto out;
@@ -1441,6 +1805,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1441 /* Ok, we were woken. Make sure it wasn't by drop_item() */ 1805 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1442 spin_lock(&o2hb_live_lock); 1806 spin_lock(&o2hb_live_lock);
1443 hb_task = reg->hr_task; 1807 hb_task = reg->hr_task;
1808 if (o2hb_global_heartbeat_active())
1809 set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
1444 spin_unlock(&o2hb_live_lock); 1810 spin_unlock(&o2hb_live_lock);
1445 1811
1446 if (hb_task) 1812 if (hb_task)
@@ -1448,6 +1814,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1448 else 1814 else
1449 ret = -EIO; 1815 ret = -EIO;
1450 1816
1817 if (hb_task && o2hb_global_heartbeat_active())
1818 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
1819 config_item_name(&reg->hr_item));
1820
1451out: 1821out:
1452 if (filp) 1822 if (filp)
1453 fput(filp); 1823 fput(filp);
@@ -1586,22 +1956,113 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
1586 : NULL; 1956 : NULL;
1587} 1957}
1588 1958
1959static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
1960{
1961 int ret = -ENOMEM;
1962
1963 reg->hr_debug_dir =
1964 debugfs_create_dir(config_item_name(&reg->hr_item), dir);
1965 if (!reg->hr_debug_dir) {
1966 mlog_errno(ret);
1967 goto bail;
1968 }
1969
1970 reg->hr_debug_livenodes =
1971 o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1972 reg->hr_debug_dir,
1973 &(reg->hr_db_livenodes),
1974 sizeof(*(reg->hr_db_livenodes)),
1975 O2HB_DB_TYPE_REGION_LIVENODES,
1976 sizeof(reg->hr_live_node_bitmap),
1977 O2NM_MAX_NODES, reg);
1978 if (!reg->hr_debug_livenodes) {
1979 mlog_errno(ret);
1980 goto bail;
1981 }
1982
1983 reg->hr_debug_regnum =
1984 o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
1985 reg->hr_debug_dir,
1986 &(reg->hr_db_regnum),
1987 sizeof(*(reg->hr_db_regnum)),
1988 O2HB_DB_TYPE_REGION_NUMBER,
1989 0, O2NM_MAX_NODES, reg);
1990 if (!reg->hr_debug_regnum) {
1991 mlog_errno(ret);
1992 goto bail;
1993 }
1994
1995 reg->hr_debug_elapsed_time =
1996 o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
1997 reg->hr_debug_dir,
1998 &(reg->hr_db_elapsed_time),
1999 sizeof(*(reg->hr_db_elapsed_time)),
2000 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
2001 0, 0, reg);
2002 if (!reg->hr_debug_elapsed_time) {
2003 mlog_errno(ret);
2004 goto bail;
2005 }
2006
2007 reg->hr_debug_pinned =
2008 o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
2009 reg->hr_debug_dir,
2010 &(reg->hr_db_pinned),
2011 sizeof(*(reg->hr_db_pinned)),
2012 O2HB_DB_TYPE_REGION_PINNED,
2013 0, 0, reg);
2014 if (!reg->hr_debug_pinned) {
2015 mlog_errno(ret);
2016 goto bail;
2017 }
2018
2019 ret = 0;
2020bail:
2021 return ret;
2022}
2023
1589static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, 2024static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
1590 const char *name) 2025 const char *name)
1591{ 2026{
1592 struct o2hb_region *reg = NULL; 2027 struct o2hb_region *reg = NULL;
2028 int ret;
1593 2029
1594 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); 2030 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
1595 if (reg == NULL) 2031 if (reg == NULL)
1596 return ERR_PTR(-ENOMEM); 2032 return ERR_PTR(-ENOMEM);
1597 2033
1598 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type); 2034 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
2035 ret = -ENAMETOOLONG;
2036 goto free;
2037 }
1599 2038
1600 spin_lock(&o2hb_live_lock); 2039 spin_lock(&o2hb_live_lock);
2040 reg->hr_region_num = 0;
2041 if (o2hb_global_heartbeat_active()) {
2042 reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
2043 O2NM_MAX_REGIONS);
2044 if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
2045 spin_unlock(&o2hb_live_lock);
2046 ret = -EFBIG;
2047 goto free;
2048 }
2049 set_bit(reg->hr_region_num, o2hb_region_bitmap);
2050 }
1601 list_add_tail(&reg->hr_all_item, &o2hb_all_regions); 2051 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1602 spin_unlock(&o2hb_live_lock); 2052 spin_unlock(&o2hb_live_lock);
1603 2053
2054 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
2055
2056 ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
2057 if (ret) {
2058 config_item_put(&reg->hr_item);
2059 goto free;
2060 }
2061
1604 return &reg->hr_item; 2062 return &reg->hr_item;
2063free:
2064 kfree(reg);
2065 return ERR_PTR(ret);
1605} 2066}
1606 2067
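
In global mode the make_item() path above hands out hr_region_num under o2hb_live_lock using find_first_zero_bit(), failing with -EFBIG once all O2NM_MAX_REGIONS slots are taken. A toy userspace version of that allocator (the single-word bitmap and MAX_REGIONS value are simplifications):

#include <errno.h>
#include <stdio.h>

#define MAX_REGIONS 32                  /* stand-in for O2NM_MAX_REGIONS */
static unsigned long region_bitmap;     /* stand-in for o2hb_region_bitmap */

static int alloc_region_num(void)
{
	for (int i = 0; i < MAX_REGIONS; i++) {
		if (!(region_bitmap & (1UL << i))) {
			region_bitmap |= 1UL << i;  /* claim the slot */
			return i;
		}
	}
	return -EFBIG;                  /* all region numbers in use */
}

int main(void)
{
	printf("%d %d\n", alloc_region_num(), alloc_region_num()); /* 0 1 */
	return 0;
}
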
1607static void o2hb_heartbeat_group_drop_item(struct config_group *group, 2068static void o2hb_heartbeat_group_drop_item(struct config_group *group,
@@ -1609,11 +2070,20 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1609{ 2070{
1610 struct task_struct *hb_task; 2071 struct task_struct *hb_task;
1611 struct o2hb_region *reg = to_o2hb_region(item); 2072 struct o2hb_region *reg = to_o2hb_region(item);
2073 int quorum_region = 0;
1612 2074
1613 /* stop the thread when the user removes the region dir */ 2075 /* stop the thread when the user removes the region dir */
1614 spin_lock(&o2hb_live_lock); 2076 spin_lock(&o2hb_live_lock);
2077 if (o2hb_global_heartbeat_active()) {
2078 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2079 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2080 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2081 quorum_region = 1;
2082 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2083 }
1615 hb_task = reg->hr_task; 2084 hb_task = reg->hr_task;
1616 reg->hr_task = NULL; 2085 reg->hr_task = NULL;
2086 reg->hr_item_dropped = 1;
1617 spin_unlock(&o2hb_live_lock); 2087 spin_unlock(&o2hb_live_lock);
1618 2088
1619 if (hb_task) 2089 if (hb_task)
@@ -1628,7 +2098,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1628 wake_up(&o2hb_steady_queue); 2098 wake_up(&o2hb_steady_queue);
1629 } 2099 }
1630 2100
2101 if (o2hb_global_heartbeat_active())
2102 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2103 config_item_name(&reg->hr_item));
2104
1631 config_item_put(item); 2105 config_item_put(item);
2106
2107 if (!o2hb_global_heartbeat_active() || !quorum_region)
2108 return;
2109
2110 /*
2111 * If global heartbeat active and there are dependent users,
2112 * pin all regions if quorum region count <= CUT_OFF
2113 */
2114 spin_lock(&o2hb_live_lock);
2115
2116 if (!o2hb_dependent_users)
2117 goto unlock;
2118
2119 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2120 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2121 o2hb_region_pin(NULL);
2122
2123unlock:
2124 spin_unlock(&o2hb_live_lock);
1632} 2125}
1633 2126
1634struct o2hb_heartbeat_group_attribute { 2127struct o2hb_heartbeat_group_attribute {
@@ -1688,6 +2181,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
1688 return count; 2181 return count;
1689} 2182}
1690 2183
2184static
2185ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
2186 char *page)
2187{
2188 return sprintf(page, "%s\n",
2189 o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
2190}
2191
2192static
2193ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
2194 const char *page, size_t count)
2195{
2196 unsigned int i;
2197 int ret;
2198 size_t len;
2199
2200 len = (page[count - 1] == '\n') ? count - 1 : count;
2201 if (!len)
2202 return -EINVAL;
2203
2204 for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
2205 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
2206 continue;
2207
2208 ret = o2hb_global_hearbeat_mode_set(i);
2209 if (!ret)
2210 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
2211 o2hb_heartbeat_mode_desc[i]);
2212 return count;
2213 }
2214
2215 return -EINVAL;
2216
2217}
2218
1691static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { 2219static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
1692 .attr = { .ca_owner = THIS_MODULE, 2220 .attr = { .ca_owner = THIS_MODULE,
1693 .ca_name = "dead_threshold", 2221 .ca_name = "dead_threshold",
@@ -1696,8 +2224,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
1696 .store = o2hb_heartbeat_group_threshold_store, 2224 .store = o2hb_heartbeat_group_threshold_store,
1697}; 2225};
1698 2226
2227static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
2228 .attr = { .ca_owner = THIS_MODULE,
2229 .ca_name = "mode",
2230 .ca_mode = S_IRUGO | S_IWUSR },
2231 .show = o2hb_heartbeat_group_mode_show,
2232 .store = o2hb_heartbeat_group_mode_store,
2233};
2234
1699static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { 2235static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
1700 &o2hb_heartbeat_group_attr_threshold.attr, 2236 &o2hb_heartbeat_group_attr_threshold.attr,
2237 &o2hb_heartbeat_group_attr_mode.attr,
1701 NULL, 2238 NULL,
1702}; 2239};
1703 2240
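
Usage note: the new "mode" attribute sits next to "dead_threshold" in the o2cb configfs tree, typically /sys/kernel/config/cluster/<cluster>/heartbeat/mode. Writes are matched case-insensitively against the strings in o2hb_heartbeat_mode_desc[] ("local", "global") via strnicmp() above, and o2hb_global_hearbeat_mode_set() only accepts the switch while the region list is empty, i.e. before any heartbeat region is configured.
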
@@ -1770,63 +2307,138 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
1770} 2307}
1771EXPORT_SYMBOL_GPL(o2hb_setup_callback); 2308EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1772 2309
1773static struct o2hb_region *o2hb_find_region(const char *region_uuid) 2310/*
2311 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2312 * In global heartbeat mode, region_uuid passed is NULL.
2313 *
2314 * In local, we only pin the matching region. In global we pin all the active
2315 * regions.
2316 */
2317static int o2hb_region_pin(const char *region_uuid)
1774{ 2318{
1775 struct o2hb_region *p, *reg = NULL; 2319 int ret = 0, found = 0;
2320 struct o2hb_region *reg;
2321 char *uuid;
1776 2322
1777 assert_spin_locked(&o2hb_live_lock); 2323 assert_spin_locked(&o2hb_live_lock);
1778 2324
1779 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { 2325 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
1780 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { 2326 uuid = config_item_name(&reg->hr_item);
1781 reg = p; 2327
1782 break; 2328 /* local heartbeat */
2329 if (region_uuid) {
2330 if (strcmp(region_uuid, uuid))
2331 continue;
2332 found = 1;
2333 }
2334
2335 if (reg->hr_item_pinned || reg->hr_item_dropped)
2336 goto skip_pin;
2337
2338 /* Ignore ENOENT only for local hb (userdlm domain) */
2339 ret = o2nm_depend_item(&reg->hr_item);
2340 if (!ret) {
2341 mlog(ML_CLUSTER, "Pin region %s\n", uuid);
2342 reg->hr_item_pinned = 1;
2343 } else {
2344 if (ret == -ENOENT && found)
2345 ret = 0;
2346 else {
2347 mlog(ML_ERROR, "Pin region %s fails with %d\n",
2348 uuid, ret);
2349 break;
2350 }
1783 } 2351 }
2352skip_pin:
2353 if (found)
2354 break;
1784 } 2355 }
1785 2356
1786 return reg; 2357 return ret;
1787} 2358}
1788 2359
1789static int o2hb_region_get(const char *region_uuid) 2360/*
2361 * In local heartbeat mode, region_uuid passed matches the dlm domain name.
2362 * In global heartbeat mode, region_uuid passed is NULL.
2363 *
2364 * In local, we only unpin the matching region. In global we unpin all the
2365 * active regions.
2366 */
2367static void o2hb_region_unpin(const char *region_uuid)
1790{ 2368{
1791 int ret = 0;
1792 struct o2hb_region *reg; 2369 struct o2hb_region *reg;
2370 char *uuid;
2371 int found = 0;
1793 2372
1794 spin_lock(&o2hb_live_lock); 2373 assert_spin_locked(&o2hb_live_lock);
1795 2374
1796 reg = o2hb_find_region(region_uuid); 2375 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
1797 if (!reg) 2376 uuid = config_item_name(&reg->hr_item);
1798 ret = -ENOENT; 2377 if (region_uuid) {
1799 spin_unlock(&o2hb_live_lock); 2378 if (strcmp(region_uuid, uuid))
2379 continue;
2380 found = 1;
2381 }
1800 2382
1801 if (ret) 2383 if (reg->hr_item_pinned) {
1802 goto out; 2384 mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
2385 o2nm_undepend_item(&reg->hr_item);
2386 reg->hr_item_pinned = 0;
2387 }
2388 if (found)
2389 break;
2390 }
2391}
1803 2392
1804 ret = o2nm_depend_this_node(); 2393static int o2hb_region_inc_user(const char *region_uuid)
1805 if (ret) 2394{
1806 goto out; 2395 int ret = 0;
1807 2396
1808 ret = o2nm_depend_item(&reg->hr_item); 2397 spin_lock(&o2hb_live_lock);
1809 if (ret)
1810 o2nm_undepend_this_node();
1811 2398
1812out: 2399 /* local heartbeat */
2400 if (!o2hb_global_heartbeat_active()) {
2401 ret = o2hb_region_pin(region_uuid);
2402 goto unlock;
2403 }
2404
2405 /*
2406 * if global heartbeat active and this is the first dependent user,
2407 * pin all regions if quorum region count <= CUT_OFF
2408 */
2409 o2hb_dependent_users++;
2410 if (o2hb_dependent_users > 1)
2411 goto unlock;
2412
2413 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
2414 O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
2415 ret = o2hb_region_pin(NULL);
2416
2417unlock:
2418 spin_unlock(&o2hb_live_lock);
1813 return ret; 2419 return ret;
1814} 2420}
1815 2421
1816static void o2hb_region_put(const char *region_uuid) 2422void o2hb_region_dec_user(const char *region_uuid)
1817{ 2423{
1818 struct o2hb_region *reg;
1819
1820 spin_lock(&o2hb_live_lock); 2424 spin_lock(&o2hb_live_lock);
1821 2425
1822 reg = o2hb_find_region(region_uuid); 2426 /* local heartbeat */
2427 if (!o2hb_global_heartbeat_active()) {
2428 o2hb_region_unpin(region_uuid);
2429 goto unlock;
2430 }
1823 2431
1824 spin_unlock(&o2hb_live_lock); 2432 /*
2433 * if global heartbeat active and there are no dependent users,
2434 * unpin all quorum regions
2435 */
2436 o2hb_dependent_users--;
2437 if (!o2hb_dependent_users)
2438 o2hb_region_unpin(NULL);
1825 2439
1826 if (reg) { 2440unlock:
1827 o2nm_undepend_item(&reg->hr_item); 2441 spin_unlock(&o2hb_live_lock);
1828 o2nm_undepend_this_node();
1829 }
1830} 2442}
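
Taken together, o2hb_region_inc_user()/o2hb_region_dec_user() form a pin reference count: only the 0 -> 1 transition pins (and, in global mode, only while the quorum region count is at or below O2HB_PIN_CUT_OFF), and only the 1 -> 0 transition unpins. A compilable sketch of that rule, with stand-in stubs:

#include <stdio.h>

static int dependent_users;

static void pin_all(void)   { puts("pin all regions"); }
static void unpin_all(void) { puts("unpin all regions"); }

/* first user pins, but only at or below the cut-off */
static void inc_user(int quorum_regions, int cut_off)
{
	if (++dependent_users > 1)
		return;			/* later users ride the existing pin */
	if (quorum_regions <= cut_off)
		pin_all();
}

/* last user out unpins */
static void dec_user(void)
{
	if (!--dependent_users)
		unpin_all();
}

int main(void)
{
	inc_user(3, 3);		/* pins   */
	inc_user(3, 3);		/* no-op  */
	dec_user();		/* no-op  */
	dec_user();		/* unpins */
	return 0;
}
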
1831 2443
1832int o2hb_register_callback(const char *region_uuid, 2444int o2hb_register_callback(const char *region_uuid,
@@ -1847,9 +2459,11 @@ int o2hb_register_callback(const char *region_uuid,
1847 } 2459 }
1848 2460
1849 if (region_uuid) { 2461 if (region_uuid) {
1850 ret = o2hb_region_get(region_uuid); 2462 ret = o2hb_region_inc_user(region_uuid);
1851 if (ret) 2463 if (ret) {
2464 mlog_errno(ret);
1852 goto out; 2465 goto out;
2466 }
1853 } 2467 }
1854 2468
1855 down_write(&o2hb_callback_sem); 2469 down_write(&o2hb_callback_sem);
@@ -1867,7 +2481,7 @@ int o2hb_register_callback(const char *region_uuid,
1867 up_write(&o2hb_callback_sem); 2481 up_write(&o2hb_callback_sem);
1868 ret = 0; 2482 ret = 0;
1869out: 2483out:
1870 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", 2484 mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
1871 ret, __builtin_return_address(0), hc); 2485 ret, __builtin_return_address(0), hc);
1872 return ret; 2486 return ret;
1873} 2487}
@@ -1878,7 +2492,7 @@ void o2hb_unregister_callback(const char *region_uuid,
1878{ 2492{
1879 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 2493 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1880 2494
1881 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 2495 mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
1882 __builtin_return_address(0), hc); 2496 __builtin_return_address(0), hc);
1883 2497
1884 /* XXX Can this happen _with_ a region reference? */ 2498 /* XXX Can this happen _with_ a region reference? */
@@ -1886,7 +2500,7 @@ void o2hb_unregister_callback(const char *region_uuid,
1886 return; 2500 return;
1887 2501
1888 if (region_uuid) 2502 if (region_uuid)
1889 o2hb_region_put(region_uuid); 2503 o2hb_region_dec_user(region_uuid);
1890 2504
1891 down_write(&o2hb_callback_sem); 2505 down_write(&o2hb_callback_sem);
1892 2506
@@ -1963,3 +2577,34 @@ void o2hb_stop_all_regions(void)
1963 spin_unlock(&o2hb_live_lock); 2577 spin_unlock(&o2hb_live_lock);
1964} 2578}
1965EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); 2579EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
2580
2581int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
2582{
2583 struct o2hb_region *reg;
2584 int numregs = 0;
2585 char *p;
2586
2587 spin_lock(&o2hb_live_lock);
2588
2589 p = region_uuids;
2590 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2591 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
2592 if (numregs < max_regions) {
2593 memcpy(p, config_item_name(&reg->hr_item),
2594 O2HB_MAX_REGION_NAME_LEN);
2595 p += O2HB_MAX_REGION_NAME_LEN;
2596 }
2597 numregs++;
2598 }
2599
2600 spin_unlock(&o2hb_live_lock);
2601
2602 return numregs;
2603}
2604EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
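
A caller must size region_uuids as max_regions * O2HB_MAX_REGION_NAME_LEN bytes; a return value above max_regions means more regions exist than were copied. A hypothetical caller, using the constants from heartbeat.h and ocfs2_nodemanager.h in this diff:

	char uuids[O2NM_MAX_REGIONS * O2HB_MAX_REGION_NAME_LEN];
	int numregs;

	numregs = o2hb_get_all_regions(uuids, O2NM_MAX_REGIONS);
	if (numregs > O2NM_MAX_REGIONS)
		/* truncated: only the first O2NM_MAX_REGIONS names copied */
		numregs = O2NM_MAX_REGIONS;
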
2605
2606int o2hb_global_heartbeat_active(void)
2607{
2608 return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
2609}
2610EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b49..00ad8e8fea51 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
31 31
32#define O2HB_REGION_TIMEOUT_MS 2000 32#define O2HB_REGION_TIMEOUT_MS 2000
33 33
34#define O2HB_MAX_REGION_NAME_LEN 32
35
34/* number of changes to be seen as live */ 36/* number of changes to be seen as live */
35#define O2HB_LIVE_THRESHOLD 2 37#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 38/* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
81int o2hb_check_node_heartbeating_from_callback(u8 node_num); 83int o2hb_check_node_heartbeating_from_callback(u8 node_num);
82int o2hb_check_local_node_heartbeating(void); 84int o2hb_check_local_node_heartbeating(void);
83void o2hb_stop_all_regions(void); 85void o2hb_stop_all_regions(void);
86int o2hb_get_all_regions(char *region_uuids, u8 numregions);
87int o2hb_global_heartbeat_active(void);
84 88
85#endif /* O2CLUSTER_HEARTBEAT_H */ 89#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index c7fba396392d..6c61771469af 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS), 115 define_mask(BASTS),
116 define_mask(RESERVATIONS),
117 define_mask(CLUSTER),
116 define_mask(ERROR), 118 define_mask(ERROR),
117 define_mask(NOTICE), 119 define_mask(NOTICE),
118 define_mask(KTHREAD), 120 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
120}; 121};
121 122
122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 123static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a2fa56..34d6544357d9 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -81,7 +81,7 @@
81#include <linux/sched.h> 81#include <linux/sched.h>
82 82
83/* bits that are frequently given and infrequently matched in the low word */ 83/* bits that are frequently given and infrequently matched in the low word */
84/* NOTE: If you add a flag, you need to also update mlog.c! */ 84/* NOTE: If you add a flag, you need to also update masklog.c! */
85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */ 85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
86#define ML_EXIT 0x0000000000000002ULL /* func call exit */ 86#define ML_EXIT 0x0000000000000002ULL /* func call exit */
87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ 87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
@@ -114,12 +114,14 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */ 117#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */
118#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
119#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */
120
118/* bits that are infrequently given and frequently matched in the high word */ 121/* bits that are infrequently given and frequently matched in the high word */
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 122#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 123#define ML_NOTICE 0x2000000000000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 124#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
123 125
124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 126#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 127#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
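
The reshuffle packs the frequently-given masks contiguously in the low half and moves the frequently-matched ERROR/NOTICE/KTHREAD bits to the top of the 64-bit word (it also fixes ML_BASTS, which had been assigned a value in the range meant for the infrequent masks). The filtering itself is a plain AND against the enabled set; a userspace model:

#include <stdint.h>
#include <stdio.h>

#define ML_CLUSTER	0x0000000400000000ULL	/* frequently given   */
#define ML_ERROR	0x1000000000000000ULL	/* frequently matched */

static uint64_t mlog_and_mask = ML_ERROR;	/* like MLOG_INITIAL_AND_MASK */

static void mlog(uint64_t mask, const char *msg)
{
	if (mask & mlog_and_mask)
		fputs(msg, stdout);
}

int main(void)
{
	mlog(ML_ERROR, "printed\n");
	mlog(ML_CLUSTER, "suppressed\n");	/* bit not in the and-mask */
	return 0;
}
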
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e52b02..3a5835904b3d 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,15 @@
46#define O2NET_DEBUG_DIR "o2net" 46#define O2NET_DEBUG_DIR "o2net"
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats"
50
51#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1
49 53
50static struct dentry *o2net_dentry; 54static struct dentry *o2net_dentry;
51static struct dentry *sc_dentry; 55static struct dentry *sc_dentry;
52static struct dentry *nst_dentry; 56static struct dentry *nst_dentry;
57static struct dentry *stats_dentry;
53 58
54static DEFINE_SPINLOCK(o2net_debug_lock); 59static DEFINE_SPINLOCK(o2net_debug_lock);
55 60
@@ -123,37 +128,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
123static int nst_seq_show(struct seq_file *seq, void *v) 128static int nst_seq_show(struct seq_file *seq, void *v)
124{ 129{
125 struct o2net_send_tracking *nst, *dummy_nst = seq->private; 130 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
131 ktime_t now;
132 s64 sock, send, status;
126 133
127 spin_lock(&o2net_debug_lock); 134 spin_lock(&o2net_debug_lock);
128 nst = next_nst(dummy_nst); 135 nst = next_nst(dummy_nst);
136 if (!nst)
137 goto out;
129 138
130 if (nst != NULL) { 139 now = ktime_get();
131 /* get_task_comm isn't exported. oh well. */ 140 sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
132 seq_printf(seq, "%p:\n" 141 send = ktime_to_us(ktime_sub(now, nst->st_send_time));
133 " pid: %lu\n" 142 status = ktime_to_us(ktime_sub(now, nst->st_status_time));
134 " tgid: %lu\n" 143
135 " process name: %s\n" 144 /* get_task_comm isn't exported. oh well. */
136 " node: %u\n" 145 seq_printf(seq, "%p:\n"
137 " sc: %p\n" 146 " pid: %lu\n"
138 " message id: %d\n" 147 " tgid: %lu\n"
139 " message type: %u\n" 148 " process name: %s\n"
140 " message key: 0x%08x\n" 149 " node: %u\n"
141 " sock acquiry: %lu.%ld\n" 150 " sc: %p\n"
142 " send start: %lu.%ld\n" 151 " message id: %d\n"
143 " wait start: %lu.%ld\n", 152 " message type: %u\n"
144 nst, (unsigned long)nst->st_task->pid, 153 " message key: 0x%08x\n"
145 (unsigned long)nst->st_task->tgid, 154 " sock acquiry: %lld usecs ago\n"
146 nst->st_task->comm, nst->st_node, 155 " send start: %lld usecs ago\n"
147 nst->st_sc, nst->st_id, nst->st_msg_type, 156 " wait start: %lld usecs ago\n",
148 nst->st_msg_key, 157 nst, (unsigned long)task_pid_nr(nst->st_task),
149 nst->st_sock_time.tv_sec, 158 (unsigned long)nst->st_task->tgid,
150 (long)nst->st_sock_time.tv_usec, 159 nst->st_task->comm, nst->st_node,
151 nst->st_send_time.tv_sec, 160 nst->st_sc, nst->st_id, nst->st_msg_type,
152 (long)nst->st_send_time.tv_usec, 161 nst->st_msg_key,
153 nst->st_status_time.tv_sec, 162 (long long)sock,
154 (long)nst->st_status_time.tv_usec); 163 (long long)send,
155 } 164 (long long)status);
156 165
166out:
157 spin_unlock(&o2net_debug_lock); 167 spin_unlock(&o2net_debug_lock);
158 168
159 return 0; 169 return 0;
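
The conversion replaces three struct timeval pairs with single ktime_t stamps and reports ages as one microsecond delta each, avoiding both the old seconds/microseconds split and wall-clock jumps (ktime_get() is monotonic where do_gettimeofday() was not). The same pattern in portable userspace, with CLOCK_MONOTONIC standing in for ktime_get():

#include <stdio.h>
#include <time.h>

static long long now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000LL + ts.tv_nsec / 1000;
}

int main(void)
{
	long long sock_time = now_us();	/* like o2net_set_nst_sock_time() */

	/* ... send work would happen here ... */
	printf("sock acquiry: %lld usecs ago\n", now_us() - sock_time);
	return 0;
}
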
@@ -228,6 +238,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
228 spin_unlock(&o2net_debug_lock); 238 spin_unlock(&o2net_debug_lock);
229} 239}
230 240
241struct o2net_sock_debug {
242 int dbg_ctxt;
243 struct o2net_sock_container *dbg_sock;
244};
245
231static struct o2net_sock_container 246static struct o2net_sock_container
232 *next_sc(struct o2net_sock_container *sc_start) 247 *next_sc(struct o2net_sock_container *sc_start)
233{ 248{
@@ -253,7 +268,8 @@ static struct o2net_sock_container
253 268
254static void *sc_seq_start(struct seq_file *seq, loff_t *pos) 269static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
255{ 270{
256 struct o2net_sock_container *sc, *dummy_sc = seq->private; 271 struct o2net_sock_debug *sd = seq->private;
272 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
257 273
258 spin_lock(&o2net_debug_lock); 274 spin_lock(&o2net_debug_lock);
259 sc = next_sc(dummy_sc); 275 sc = next_sc(dummy_sc);
@@ -264,7 +280,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
264 280
265static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) 281static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266{ 282{
267 struct o2net_sock_container *sc, *dummy_sc = seq->private; 283 struct o2net_sock_debug *sd = seq->private;
284 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
268 285
269 spin_lock(&o2net_debug_lock); 286 spin_lock(&o2net_debug_lock);
270 sc = next_sc(dummy_sc); 287 sc = next_sc(dummy_sc);
@@ -276,65 +293,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 return sc; /* unused, just needs to be null when done */ 293 return sc; /* unused, just needs to be null when done */
277} 294}
278 295
279#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec 296#ifdef CONFIG_OCFS2_FS_STATS
297# define sc_send_count(_s) ((_s)->sc_send_count)
298# define sc_recv_count(_s) ((_s)->sc_recv_count)
299# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
300# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
301# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
302# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
303#else
304# define sc_send_count(_s) (0U)
305# define sc_recv_count(_s) (0U)
306# define sc_tv_acquiry_total_ns(_s) (0LL)
307# define sc_tv_send_total_ns(_s) (0LL)
308# define sc_tv_status_total_ns(_s) (0LL)
309# define sc_tv_process_total_ns(_s) (0LL)
310#endif
311
312/* So that debugfs.ocfs2 can determine which format is being used */
313#define O2NET_STATS_STR_VERSION 1
314static void sc_show_sock_stats(struct seq_file *seq,
315 struct o2net_sock_container *sc)
316{
317 if (!sc)
318 return;
319
320 seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
321 sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
322 (long long)sc_tv_acquiry_total_ns(sc),
323 (long long)sc_tv_send_total_ns(sc),
324 (long long)sc_tv_status_total_ns(sc),
325 (unsigned long)sc_recv_count(sc),
326 (long long)sc_tv_process_total_ns(sc));
327}
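
Since debugfs.ocfs2 consumes this file, the line is versioned and strictly ordered: version, node, send_count, acquiry_ns, send_ns, status_ns, recv_count, process_ns. A hypothetical userspace consumer could read one line back with a format string mirroring the printf above:

	/* given one line read from the stats file: */
	int ver;
	unsigned int node;
	unsigned long sends, recvs;
	long long acquiry_ns, send_ns, status_ns, process_ns;

	sscanf(line, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld",
	       &ver, &node, &sends, &acquiry_ns, &send_ns,
	       &status_ns, &recvs, &process_ns);
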
328
329static void sc_show_sock_container(struct seq_file *seq,
330 struct o2net_sock_container *sc)
331{
332 struct inet_sock *inet = NULL;
333 __be32 saddr = 0, daddr = 0;
334 __be16 sport = 0, dport = 0;
335
336 if (!sc)
337 return;
338
339 if (sc->sc_sock) {
340 inet = inet_sk(sc->sc_sock->sk);
341 /* the stack's structs aren't sparse endian clean */
342 saddr = (__force __be32)inet->inet_saddr;
343 daddr = (__force __be32)inet->inet_daddr;
344 sport = (__force __be16)inet->inet_sport;
345 dport = (__force __be16)inet->inet_dport;
346 }
347
348 /* XXX sigh, inet-> doesn't have sparse annotation so any
349 * use of it here generates a warning with -Wbitwise */
350 seq_printf(seq, "%p:\n"
351 " krefs: %d\n"
352 " sock: %pI4:%u -> "
353 "%pI4:%u\n"
354 " remote node: %s\n"
355 " page off: %zu\n"
356 " handshake ok: %u\n"
357 " timer: %lld usecs\n"
358 " data ready: %lld usecs\n"
359 " advance start: %lld usecs\n"
360 " advance stop: %lld usecs\n"
361 " func start: %lld usecs\n"
362 " func stop: %lld usecs\n"
363 " func key: 0x%08x\n"
364 " func type: %u\n",
365 sc,
366 atomic_read(&sc->sc_kref.refcount),
367 &saddr, inet ? ntohs(sport) : 0,
368 &daddr, inet ? ntohs(dport) : 0,
369 sc->sc_node->nd_name,
370 sc->sc_page_off,
371 sc->sc_handshake_ok,
372 (long long)ktime_to_us(sc->sc_tv_timer),
373 (long long)ktime_to_us(sc->sc_tv_data_ready),
374 (long long)ktime_to_us(sc->sc_tv_advance_start),
375 (long long)ktime_to_us(sc->sc_tv_advance_stop),
376 (long long)ktime_to_us(sc->sc_tv_func_start),
377 (long long)ktime_to_us(sc->sc_tv_func_stop),
378 sc->sc_msg_key,
379 sc->sc_msg_type);
380}
280 381
281static int sc_seq_show(struct seq_file *seq, void *v) 382static int sc_seq_show(struct seq_file *seq, void *v)
282{ 383{
283 struct o2net_sock_container *sc, *dummy_sc = seq->private; 384 struct o2net_sock_debug *sd = seq->private;
385 struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
284 386
285 spin_lock(&o2net_debug_lock); 387 spin_lock(&o2net_debug_lock);
286 sc = next_sc(dummy_sc); 388 sc = next_sc(dummy_sc);
287 389
288 if (sc != NULL) { 390 if (sc) {
289 struct inet_sock *inet = NULL; 391 if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
290 392 sc_show_sock_container(seq, sc);
291 __be32 saddr = 0, daddr = 0; 393 else
292 __be16 sport = 0, dport = 0; 394 sc_show_sock_stats(seq, sc);
293
294 if (sc->sc_sock) {
295 inet = inet_sk(sc->sc_sock->sk);
296 /* the stack's structs aren't sparse endian clean */
297 saddr = (__force __be32)inet->inet_saddr;
298 daddr = (__force __be32)inet->inet_daddr;
299 sport = (__force __be16)inet->inet_sport;
300 dport = (__force __be16)inet->inet_dport;
301 }
302
303 /* XXX sigh, inet-> doesn't have sparse annotation so any
304 * use of it here generates a warning with -Wbitwise */
305 seq_printf(seq, "%p:\n"
306 " krefs: %d\n"
307 " sock: %pI4:%u -> "
308 "%pI4:%u\n"
309 " remote node: %s\n"
310 " page off: %zu\n"
311 " handshake ok: %u\n"
312 " timer: %lu.%ld\n"
313 " data ready: %lu.%ld\n"
314 " advance start: %lu.%ld\n"
315 " advance stop: %lu.%ld\n"
316 " func start: %lu.%ld\n"
317 " func stop: %lu.%ld\n"
318 " func key: %u\n"
319 " func type: %u\n",
320 sc,
321 atomic_read(&sc->sc_kref.refcount),
322 &saddr, inet ? ntohs(sport) : 0,
323 &daddr, inet ? ntohs(dport) : 0,
324 sc->sc_node->nd_name,
325 sc->sc_page_off,
326 sc->sc_handshake_ok,
327 TV_SEC_USEC(sc->sc_tv_timer),
328 TV_SEC_USEC(sc->sc_tv_data_ready),
329 TV_SEC_USEC(sc->sc_tv_advance_start),
330 TV_SEC_USEC(sc->sc_tv_advance_stop),
331 TV_SEC_USEC(sc->sc_tv_func_start),
332 TV_SEC_USEC(sc->sc_tv_func_stop),
333 sc->sc_msg_key,
334 sc->sc_msg_type);
335 } 395 }
336 396
337
338 spin_unlock(&o2net_debug_lock); 397 spin_unlock(&o2net_debug_lock);
339 398
340 return 0; 399 return 0;
@@ -351,7 +410,7 @@ static const struct seq_operations sc_seq_ops = {
351 .show = sc_seq_show, 410 .show = sc_seq_show,
352}; 411};
353 412
354static int sc_fop_open(struct inode *inode, struct file *file) 413static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
355{ 414{
356 struct o2net_sock_container *dummy_sc; 415 struct o2net_sock_container *dummy_sc;
357 struct seq_file *seq; 416 struct seq_file *seq;
@@ -369,7 +428,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
369 goto out; 428 goto out;
370 429
371 seq = file->private_data; 430 seq = file->private_data;
372 seq->private = dummy_sc; 431 seq->private = sd;
432 sd->dbg_sock = dummy_sc;
373 o2net_debug_add_sc(dummy_sc); 433 o2net_debug_add_sc(dummy_sc);
374 434
375 dummy_sc = NULL; 435 dummy_sc = NULL;
@@ -382,12 +442,48 @@ out:
382static int sc_fop_release(struct inode *inode, struct file *file) 442static int sc_fop_release(struct inode *inode, struct file *file)
383{ 443{
384 struct seq_file *seq = file->private_data; 444 struct seq_file *seq = file->private_data;
385 struct o2net_sock_container *dummy_sc = seq->private; 445 struct o2net_sock_debug *sd = seq->private;
446 struct o2net_sock_container *dummy_sc = sd->dbg_sock;
386 447
387 o2net_debug_del_sc(dummy_sc); 448 o2net_debug_del_sc(dummy_sc);
388 return seq_release_private(inode, file); 449 return seq_release_private(inode, file);
389} 450}
390 451
452static int stats_fop_open(struct inode *inode, struct file *file)
453{
454 struct o2net_sock_debug *sd;
455
456 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
457 if (sd == NULL)
458 return -ENOMEM;
459
460 sd->dbg_ctxt = SHOW_SOCK_STATS;
461 sd->dbg_sock = NULL;
462
463 return sc_common_open(file, sd);
464}
465
466static const struct file_operations stats_seq_fops = {
467 .open = stats_fop_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
470 .release = sc_fop_release,
471};
472
473static int sc_fop_open(struct inode *inode, struct file *file)
474{
475 struct o2net_sock_debug *sd;
476
477 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
478 if (sd == NULL)
479 return -ENOMEM;
480
481 sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
482 sd->dbg_sock = NULL;
483
484 return sc_common_open(file, sd);
485}
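
Both open routines now allocate an o2net_sock_debug, tag it with the file's role, and hand it to sc_common_open(), so one set of seq operations serves two debugfs files. The shape of that per-open-context pattern in a reduced, compilable form (names are illustrative):

#include <stdio.h>

enum { SHOW_SOCK_CONTAINERS, SHOW_SOCK_STATS };

struct sock_debug { int dbg_ctxt; };	/* per-open context */

/* one show routine branches on the per-open tag */
static void show(FILE *out, struct sock_debug *sd)
{
	if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
		fputs("full container dump\n", out);
	else
		fputs("one stats line\n", out);
}

int main(void)
{
	struct sock_debug sc = { SHOW_SOCK_CONTAINERS };
	struct sock_debug st = { SHOW_SOCK_STATS };

	show(stdout, &sc);	/* the "sock_containers" file */
	show(stdout, &st);	/* the "stats" file */
	return 0;
}
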
486
391static const struct file_operations sc_seq_fops = { 487static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 488 .open = sc_fop_open,
393 .read = seq_read, 489 .read = seq_read,
@@ -419,25 +515,29 @@ int o2net_debugfs_init(void)
419 goto bail; 515 goto bail;
420 } 516 }
421 517
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
519 o2net_dentry, NULL,
520 &stats_seq_fops);
521 if (!stats_dentry) {
522 mlog_errno(-ENOMEM);
523 goto bail;
524 }
525
422 return 0; 526 return 0;
423bail: 527bail:
424 if (sc_dentry) 528 debugfs_remove(stats_dentry);
425 debugfs_remove(sc_dentry); 529 debugfs_remove(sc_dentry);
426 if (nst_dentry) 530 debugfs_remove(nst_dentry);
427 debugfs_remove(nst_dentry); 531 debugfs_remove(o2net_dentry);
428 if (o2net_dentry)
429 debugfs_remove(o2net_dentry);
430 return -ENOMEM; 532 return -ENOMEM;
431} 533}
432 534
433void o2net_debugfs_exit(void) 535void o2net_debugfs_exit(void)
434{ 536{
435 if (sc_dentry) 537 debugfs_remove(stats_dentry);
436 debugfs_remove(sc_dentry); 538 debugfs_remove(sc_dentry);
437 if (nst_dentry) 539 debugfs_remove(nst_dentry);
438 debugfs_remove(nst_dentry); 540 debugfs_remove(o2net_dentry);
439 if (o2net_dentry)
440 debugfs_remove(o2net_dentry);
441} 541}
442 542
443#endif /* CONFIG_DEBUG_FS */ 543#endif /* CONFIG_DEBUG_FS */
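
The if (dentry) guards disappear because debugfs_remove() treats a NULL (or error) dentry as a no-op, so both the error path and the exit path can tear down unconditionally:

	debugfs_remove(stats_dentry);	/* safe even if creation failed */
	debugfs_remove(sc_dentry);
	debugfs_remove(nst_dentry);
	debugfs_remove(o2net_dentry);
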
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f367fed..bb240647ca5f 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); 711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
712 spin_lock_init(&node->nd_lock); 712 spin_lock_init(&node->nd_lock);
713 713
714 mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
715
714 return &node->nd_item; 716 return &node->nd_item;
715} 717}
716 718
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
744 } 746 }
745 write_unlock(&cluster->cl_nodes_lock); 747 write_unlock(&cluster->cl_nodes_lock);
746 748
749 mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
750 config_item_name(&node->nd_item));
751
747 config_item_put(item); 752 config_item_put(item);
748} 753}
749 754
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad571..49b594325bec 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
36/* host name, group name, cluster name all 64 bytes */ 36/* host name, group name, cluster name all 64 bytes */
37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN 37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
38 38
39/*
40 * Maximum number of global heartbeat regions allowed.
41 * **CAUTION** Changing this number will break dlm compatibility.
42 */
43#define O2NM_MAX_REGIONS 32
44
39#endif /* _OCFS2_NODEMANAGER_H */ 45#endif /* _OCFS2_NODEMANAGER_H */
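
The compatibility warning is concrete: O2NM_MAX_REGIONS is baked into the on-wire size of the dlm_query_region message added later in this diff, so nodes built with different values would disagree on the message layout:

	/* from dlmcommon.h below: 32 names x 32 bytes = 1024 wire bytes */
	u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
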
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index cf3e16696216..a87366750f23 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -325,5 +325,7 @@ void o2quo_init(void)
325 325
326void o2quo_exit(void) 326void o2quo_exit(void)
327{ 327{
328 flush_scheduled_work(); 328 struct o2quo_state *qs = &o2quo_state;
329
330 flush_work_sync(&qs->qs_work);
329} 331}
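
Flushing only the work item this code owns, rather than the whole shared kernel workqueue, removes a potential deadlock against unrelated queued work and follows the general retirement of flush_scheduled_work():

	flush_work_sync(&qs->qs_work);	/* wait for o2quo's own work only */
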
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index cbe2f057cc28..3b11cb1e38fc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -153,63 +153,114 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
153 nst->st_node = node; 153 nst->st_node = node;
154} 154}
155 155
156static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 156static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
157{ 157{
158 do_gettimeofday(&nst->st_sock_time); 158 nst->st_sock_time = ktime_get();
159} 159}
160 160
161static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 161static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
162{ 162{
163 do_gettimeofday(&nst->st_send_time); 163 nst->st_send_time = ktime_get();
164} 164}
165 165
166static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 166static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
167{ 167{
168 do_gettimeofday(&nst->st_status_time); 168 nst->st_status_time = ktime_get();
169} 169}
170 170
171static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 171static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
172 struct o2net_sock_container *sc) 172 struct o2net_sock_container *sc)
173{ 173{
174 nst->st_sc = sc; 174 nst->st_sc = sc;
175} 175}
176 176
177static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) 177static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
178 u32 msg_id)
178{ 179{
179 nst->st_id = msg_id; 180 nst->st_id = msg_id;
180} 181}
181 182
182#else /* CONFIG_DEBUG_FS */ 183static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
183
184static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
185 u32 msgkey, struct task_struct *task, u8 node)
186{ 184{
185 sc->sc_tv_timer = ktime_get();
187} 186}
188 187
189static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 188static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
190{ 189{
190 sc->sc_tv_data_ready = ktime_get();
191} 191}
192 192
193static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 193static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
194{ 194{
195 sc->sc_tv_advance_start = ktime_get();
195} 196}
196 197
197static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 198static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
198{ 199{
200 sc->sc_tv_advance_stop = ktime_get();
199} 201}
200 202
201static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 203static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
202 struct o2net_sock_container *sc)
203{ 204{
205 sc->sc_tv_func_start = ktime_get();
204} 206}
205 207
206static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, 208static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
207 u32 msg_id)
208{ 209{
210 sc->sc_tv_func_stop = ktime_get();
209} 211}
210 212
213static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
214{
215 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
216}
217#else /* CONFIG_DEBUG_FS */
218# define o2net_init_nst(a, b, c, d, e)
219# define o2net_set_nst_sock_time(a)
220# define o2net_set_nst_send_time(a)
221# define o2net_set_nst_status_time(a)
222# define o2net_set_nst_sock_container(a, b)
223# define o2net_set_nst_msg_id(a, b)
224# define o2net_set_sock_timer(a)
225# define o2net_set_data_ready_time(a)
226# define o2net_set_advance_start_time(a)
227# define o2net_set_advance_stop_time(a)
228# define o2net_set_func_start_time(a)
229# define o2net_set_func_stop_time(a)
230# define o2net_get_func_run_time(a) (ktime_t)0
211#endif /* CONFIG_DEBUG_FS */ 231#endif /* CONFIG_DEBUG_FS */
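
With CONFIG_DEBUG_FS off, the timestamp setters now compile to nothing via empty macros instead of empty inline functions, so no arguments are evaluated and the ktime_t fields need not exist at all. The same compile-out pattern in a standalone form:

#include <stdio.h>

#define MY_DEBUG 0

#if MY_DEBUG
static long long stamp;
# define set_stamp(v)	(stamp = (v))
#else
# define set_stamp(v)	/* expands to nothing */
#endif

int main(void)
{
	set_stamp(42);	/* no code emitted when MY_DEBUG is 0 */
	puts("ok");
	return 0;
}
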
212 232
233#ifdef CONFIG_OCFS2_FS_STATS
234static void o2net_update_send_stats(struct o2net_send_tracking *nst,
235 struct o2net_sock_container *sc)
236{
237 sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
238 ktime_sub(ktime_get(),
239 nst->st_status_time));
240 sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
241 ktime_sub(nst->st_status_time,
242 nst->st_send_time));
243 sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
244 ktime_sub(nst->st_send_time,
245 nst->st_sock_time));
246 sc->sc_send_count++;
247}
248
249static void o2net_update_recv_stats(struct o2net_sock_container *sc)
250{
251 sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
252 o2net_get_func_run_time(sc));
253 sc->sc_recv_count++;
254}
255
256#else
257
258# define o2net_update_send_stats(a, b)
259
260# define o2net_update_recv_stats(sc)
261
262#endif /* CONFIG_OCFS2_FS_STATS */
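
Each completed send is decomposed into three intervals from the nst timestamps and accumulated per socket: acquiry (socket lookup to send start), send (send start to status wait), and status (status wait to completion). In plain arithmetic:

	/* timestamps: sock_time <= send_time <= status_time <= now */
	acquiry_total += send_time   - sock_time;
	send_total    += status_time - send_time;
	status_total  += now         - status_time;
	send_count++;
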
263
213static inline int o2net_reconnect_delay(void) 264static inline int o2net_reconnect_delay(void)
214{ 265{
215 return o2nm_single_cluster->cl_reconnect_delay_ms; 266 return o2nm_single_cluster->cl_reconnect_delay_ms;
@@ -355,6 +406,7 @@ static void sc_kref_release(struct kref *kref)
355 sc->sc_sock = NULL; 406 sc->sc_sock = NULL;
356 } 407 }
357 408
409 o2nm_undepend_item(&sc->sc_node->nd_item);
358 o2nm_node_put(sc->sc_node); 410 o2nm_node_put(sc->sc_node);
359 sc->sc_node = NULL; 411 sc->sc_node = NULL;
360 412
@@ -376,6 +428,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
376{ 428{
377 struct o2net_sock_container *sc, *ret = NULL; 429 struct o2net_sock_container *sc, *ret = NULL;
378 struct page *page = NULL; 430 struct page *page = NULL;
431 int status = 0;
379 432
380 page = alloc_page(GFP_NOFS); 433 page = alloc_page(GFP_NOFS);
381 sc = kzalloc(sizeof(*sc), GFP_NOFS); 434 sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +439,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
386 o2nm_node_get(node); 439 o2nm_node_get(node);
387 sc->sc_node = node; 440 sc->sc_node = node;
388 441
442 /* pin the node item of the remote node */
443 status = o2nm_depend_item(&node->nd_item);
444 if (status) {
445 mlog_errno(status);
446 o2nm_node_put(node);
447 goto out;
448 }
389 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed); 449 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
390 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty); 450 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
391 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); 451 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
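
Pinning the remote node's configfs item for the socket container's lifetime prevents the node from being unconfigured while a connection still references it; the matching unpin sits in sc_kref_release() above, a natural release point because it runs exactly once, when the last reference drops:

	status = o2nm_depend_item(&node->nd_item);	/* pin at sc_alloc() */
	/* ... socket lifetime ... */
	o2nm_undepend_item(&sc->sc_node->nd_item);	/* unpin at release  */
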
@@ -546,7 +606,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
546 if (sk->sk_user_data) { 606 if (sk->sk_user_data) {
547 struct o2net_sock_container *sc = sk->sk_user_data; 607 struct o2net_sock_container *sc = sk->sk_user_data;
548 sclog(sc, "data_ready hit\n"); 608 sclog(sc, "data_ready hit\n");
549 do_gettimeofday(&sc->sc_tv_data_ready); 609 o2net_set_data_ready_time(sc);
550 o2net_sc_queue_work(sc, &sc->sc_rx_work); 610 o2net_sc_queue_work(sc, &sc->sc_rx_work);
551 ready = sc->sc_data_ready; 611 ready = sc->sc_data_ready;
552 } else { 612 } else {
@@ -1070,6 +1130,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1070 o2net_set_nst_status_time(&nst); 1130 o2net_set_nst_status_time(&nst);
1071 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1131 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
1072 1132
1133 o2net_update_send_stats(&nst, sc);
1134
1073 /* Note that we avoid overwriting the callers status return 1135 /* Note that we avoid overwriting the callers status return
1074 * variable if a system error was reported on the other 1136 * variable if a system error was reported on the other
1075 * side. Callers beware. */ 1137 * side. Callers beware. */
@@ -1183,13 +1245,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
1183 if (syserr != O2NET_ERR_NONE) 1245 if (syserr != O2NET_ERR_NONE)
1184 goto out_respond; 1246 goto out_respond;
1185 1247
1186 do_gettimeofday(&sc->sc_tv_func_start); 1248 o2net_set_func_start_time(sc);
1187 sc->sc_msg_key = be32_to_cpu(hdr->key); 1249 sc->sc_msg_key = be32_to_cpu(hdr->key);
1188 sc->sc_msg_type = be16_to_cpu(hdr->msg_type); 1250 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1189 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + 1251 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1190 be16_to_cpu(hdr->data_len), 1252 be16_to_cpu(hdr->data_len),
1191 nmh->nh_func_data, &ret_data); 1253 nmh->nh_func_data, &ret_data);
1192 do_gettimeofday(&sc->sc_tv_func_stop); 1254 o2net_set_func_stop_time(sc);
1255
1256 o2net_update_recv_stats(sc);
1193 1257
1194out_respond: 1258out_respond:
1195 /* this destroys the hdr, so don't use it after this */ 1259 /* this destroys the hdr, so don't use it after this */
@@ -1300,7 +1364,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1300 size_t datalen; 1364 size_t datalen;
1301 1365
1302 sclog(sc, "receiving\n"); 1366 sclog(sc, "receiving\n");
1303 do_gettimeofday(&sc->sc_tv_advance_start); 1367 o2net_set_advance_start_time(sc);
1304 1368
1305 if (unlikely(sc->sc_handshake_ok == 0)) { 1369 if (unlikely(sc->sc_handshake_ok == 0)) {
1306 if(sc->sc_page_off < sizeof(struct o2net_handshake)) { 1370 if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1439,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
1375 1439
1376out: 1440out:
1377 sclog(sc, "ret = %d\n", ret); 1441 sclog(sc, "ret = %d\n", ret);
1378 do_gettimeofday(&sc->sc_tv_advance_stop); 1442 o2net_set_advance_stop_time(sc);
1379 return ret; 1443 return ret;
1380} 1444}
1381 1445
@@ -1475,27 +1539,28 @@ static void o2net_idle_timer(unsigned long data)
1475{ 1539{
1476 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1477 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1478 struct timeval now;
1479 1542
1480 do_gettimeofday(&now); 1543#ifdef CONFIG_DEBUG_FS
1544 ktime_t now = ktime_get();
1545#endif
1481 1546
1482 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1547 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1483 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1548 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1484 o2net_idle_timeout() / 1000, 1549 o2net_idle_timeout() / 1000,
1485 o2net_idle_timeout() % 1000); 1550 o2net_idle_timeout() % 1000);
1486 mlog(ML_NOTICE, "here are some times that might help debug the " 1551
1487 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1552#ifdef CONFIG_DEBUG_FS
1488 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1553 mlog(ML_NOTICE, "Here are some times that might help debug the "
1489 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1554 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1490 now.tv_sec, (long) now.tv_usec, 1555 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1491 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1556 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1492 sc->sc_tv_advance_start.tv_sec, 1557 (long long)ktime_to_us(sc->sc_tv_data_ready),
1493 (long) sc->sc_tv_advance_start.tv_usec, 1558 (long long)ktime_to_us(sc->sc_tv_advance_start),
1494 sc->sc_tv_advance_stop.tv_sec, 1559 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1495 (long) sc->sc_tv_advance_stop.tv_usec,
1496 sc->sc_msg_key, sc->sc_msg_type, 1560 sc->sc_msg_key, sc->sc_msg_type,
1497 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1561 (long long)ktime_to_us(sc->sc_tv_func_start),
1498 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1562 (long long)ktime_to_us(sc->sc_tv_func_stop));
1563#endif
1499 1564
1500 /* 1565 /*
1501 * Initialize the nn_timeout so that the next connection attempt 1566 * Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1576,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1511 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1576 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1512 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1577 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1513 msecs_to_jiffies(o2net_keepalive_delay())); 1578 msecs_to_jiffies(o2net_keepalive_delay()));
1514 do_gettimeofday(&sc->sc_tv_timer); 1579 o2net_set_sock_timer(sc);
1515 mod_timer(&sc->sc_idle_timeout, 1580 mod_timer(&sc->sc_idle_timeout,
1516 jiffies + msecs_to_jiffies(o2net_idle_timeout())); 1581 jiffies + msecs_to_jiffies(o2net_idle_timeout()));
1517} 1582}
@@ -1696,6 +1761,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
1696{ 1761{
1697 o2quo_hb_down(node_num); 1762 o2quo_hb_down(node_num);
1698 1763
1764 if (!node)
1765 return;
1766
1699 if (node_num != o2nm_this_node()) 1767 if (node_num != o2nm_this_node())
1700 o2net_disconnect_node(node); 1768 o2net_disconnect_node(node);
1701 1769
@@ -1709,6 +1777,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1709 1777
1710 o2quo_hb_up(node_num); 1778 o2quo_hb_up(node_num);
1711 1779
1780 BUG_ON(!node);
1781
1712 /* ensure an immediate connect attempt */ 1782 /* ensure an immediate connect attempt */
1713 nn->nn_last_connect_attempt = jiffies - 1783 nn->nn_last_connect_attempt = jiffies -
1714 (msecs_to_jiffies(o2net_reconnect_delay()) + 1); 1784 (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
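
The asymmetry between the two callbacks appears deliberate: a node-up event implies the node is still configured (hence the BUG_ON), while a node-down event can presumably arrive after the node has already been removed from the local configuration, so it is tolerated with an early return:

	if (!node)	/* down event for a node no longer configured */
		return;
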
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 96fa7ebc530c..4cbcb65784a3 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -129,7 +129,7 @@ struct o2net_node {
129 129
130struct o2net_sock_container { 130struct o2net_sock_container {
131 struct kref sc_kref; 131 struct kref sc_kref;
132 /* the next two are vaild for the life time of the sc */ 132 /* the next two are valid for the life time of the sc */
133 struct socket *sc_sock; 133 struct socket *sc_sock;
134 struct o2nm_node *sc_node; 134 struct o2nm_node *sc_node;
135 135
@@ -166,18 +166,27 @@ struct o2net_sock_container {
166 /* original handlers for the sockets */ 166 /* original handlers for the sockets */
167 void (*sc_state_change)(struct sock *sk); 167 void (*sc_state_change)(struct sock *sk);
168 void (*sc_data_ready)(struct sock *sk, int bytes); 168 void (*sc_data_ready)(struct sock *sk, int bytes);
169#ifdef CONFIG_DEBUG_FS 169
170 struct list_head sc_net_debug_item;
171#endif
172 struct timeval sc_tv_timer;
173 struct timeval sc_tv_data_ready;
174 struct timeval sc_tv_advance_start;
175 struct timeval sc_tv_advance_stop;
176 struct timeval sc_tv_func_start;
177 struct timeval sc_tv_func_stop;
178 u32 sc_msg_key; 170 u32 sc_msg_key;
179 u16 sc_msg_type; 171 u16 sc_msg_type;
180 172
173#ifdef CONFIG_DEBUG_FS
174 struct list_head sc_net_debug_item;
175 ktime_t sc_tv_timer;
176 ktime_t sc_tv_data_ready;
177 ktime_t sc_tv_advance_start;
178 ktime_t sc_tv_advance_stop;
179 ktime_t sc_tv_func_start;
180 ktime_t sc_tv_func_stop;
181#endif
182#ifdef CONFIG_OCFS2_FS_STATS
183 ktime_t sc_tv_acquiry_total;
184 ktime_t sc_tv_send_total;
185 ktime_t sc_tv_status_total;
186 u32 sc_send_count;
187 u32 sc_recv_count;
188 ktime_t sc_tv_process_total;
189#endif
181 struct mutex sc_send_lock; 190 struct mutex sc_send_lock;
182}; 191};
183 192
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
220 u32 st_msg_type; 229 u32 st_msg_type;
221 u32 st_msg_key; 230 u32 st_msg_key;
222 u8 st_node; 231 u8 st_node;
223 struct timeval st_sock_time; 232 ktime_t st_sock_time;
224 struct timeval st_send_time; 233 ktime_t st_send_time;
225 struct timeval st_status_time; 234 ktime_t st_status_time;
226}; 235};
227#else 236#else
228struct o2net_send_tracking { 237struct o2net_send_tracking {
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe2..6d80ecc7834f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,22 +40,45 @@
40#include "inode.h" 40#include "inode.h"
41#include "super.h" 41#include "super.h"
42 42
43void ocfs2_dentry_attach_gen(struct dentry *dentry)
44{
45 unsigned long gen =
46 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
47 BUG_ON(dentry->d_inode);
48 dentry->d_fsdata = (void *)gen;
49}
50
43 51
44static int ocfs2_dentry_revalidate(struct dentry *dentry, 52static int ocfs2_dentry_revalidate(struct dentry *dentry,
45 struct nameidata *nd) 53 struct nameidata *nd)
46{ 54{
47 struct inode *inode = dentry->d_inode; 55 struct inode *inode;
48 int ret = 0; /* if all else fails, just return false */ 56 int ret = 0; /* if all else fails, just return false */
49 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 57 struct ocfs2_super *osb;
58
59 if (nd->flags & LOOKUP_RCU)
60 return -ECHILD;
61
62 inode = dentry->d_inode;
63 osb = OCFS2_SB(dentry->d_sb);
50 64
51 mlog_entry("(0x%p, '%.*s')\n", dentry, 65 mlog_entry("(0x%p, '%.*s')\n", dentry,
52 dentry->d_name.len, dentry->d_name.name); 66 dentry->d_name.len, dentry->d_name.name);
53 67
54 /* Never trust a negative dentry - force a new lookup. */ 68 /* For a negative dentry -
69 * check the generation number of the parent and compare with the
70 * one stored in the dentry.
71 */
55 if (inode == NULL) { 72 if (inode == NULL) {
56 mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, 73 unsigned long gen = (unsigned long) dentry->d_fsdata;
57 dentry->d_name.name); 74 unsigned long pgen =
58 goto bail; 75 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
76 mlog(0, "negative dentry: %.*s parent gen: %lu "
77 "dentry gen: %lu\n",
78 dentry->d_name.len, dentry->d_name.name, pgen, gen);
79 if (gen != pgen)
80 goto bail;
81 goto valid;
59 } 82 }
60 83
61 BUG_ON(!osb); 84 BUG_ON(!osb);
@@ -96,6 +119,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
96 goto bail; 119 goto bail;
97 } 120 }
98 121
122valid:
99 ret = 1; 123 ret = 1;
100 124
101bail: 125bail:
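
The new scheme stamps each negative dentry with the parent directory's lock generation at attach time and, at revalidate, trusts the negative entry only while that generation is unchanged, i.e. while the directory's cluster lock has not cycled underneath it. A compilable model of the check (simplified types, not the VFS API):

#include <stdio.h>

struct dir    { unsigned long lock_gen; };
struct dentry { struct dir *parent; unsigned long gen; };

/* like ocfs2_dentry_attach_gen() */
static void attach_gen(struct dentry *d)
{
	d->gen = d->parent->lock_gen;
}

/* 1 = negative entry still valid, 0 = force a new lookup */
static int revalidate_negative(struct dentry *d)
{
	return d->gen == d->parent->lock_gen;
}

int main(void)
{
	struct dir dir = { 1 };
	struct dentry d = { &dir, 0 };

	attach_gen(&d);
	printf("%d\n", revalidate_negative(&d));	/* 1: still valid */
	dir.lock_gen++;		/* the directory lock cycled */
	printf("%d\n", revalidate_negative(&d));	/* 0: re-lookup   */
	return 0;
}
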
@@ -151,23 +175,25 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
151 struct list_head *p; 175 struct list_head *p;
152 struct dentry *dentry = NULL; 176 struct dentry *dentry = NULL;
153 177
154 spin_lock(&dcache_lock); 178 spin_lock(&inode->i_lock);
155
156 list_for_each(p, &inode->i_dentry) { 179 list_for_each(p, &inode->i_dentry) {
157 dentry = list_entry(p, struct dentry, d_alias); 180 dentry = list_entry(p, struct dentry, d_alias);
158 181
182 spin_lock(&dentry->d_lock);
159 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { 183 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
160 mlog(0, "dentry found: %.*s\n", 184 mlog(0, "dentry found: %.*s\n",
161 dentry->d_name.len, dentry->d_name.name); 185 dentry->d_name.len, dentry->d_name.name);
162 186
163 dget_locked(dentry); 187 dget_dlock(dentry);
188 spin_unlock(&dentry->d_lock);
164 break; 189 break;
165 } 190 }
191 spin_unlock(&dentry->d_lock);
166 192
167 dentry = NULL; 193 dentry = NULL;
168 } 194 }
169 195
170 spin_unlock(&dcache_lock); 196 spin_unlock(&inode->i_lock);
171 197
172 return dentry; 198 return dentry;
173} 199}
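
This hunk tracks the VFS-wide removal of the global dcache_lock: the inode's i_dentry alias list is now walked under inode->i_lock, and each candidate is examined and referenced under its own d_lock (dget_dlock() being the variant of dget() for callers already holding d_lock). The resulting nesting, in outline:

	spin_lock(&inode->i_lock);		/* protects inode->i_dentry */
	/* for each alias: */
	spin_lock(&dentry->d_lock);		/* protects this dentry     */
	/* on a match: dget_dlock(dentry) before dropping d_lock */
	spin_unlock(&dentry->d_lock);
	spin_unlock(&inode->i_lock);
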
@@ -227,6 +253,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
227 if (!inode) 253 if (!inode)
228 return 0; 254 return 0;
229 255
256 if (!dentry->d_inode && dentry->d_fsdata) {
257 /* Converting a negative dentry to positive
258 Clear dentry->d_fsdata */
259 dentry->d_fsdata = dl = NULL;
260 }
261
230 if (dl) { 262 if (dl) {
231 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, 263 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
232 " \"%.*s\": old parent: %llu, new: %llu\n", 264 " \"%.*s\": old parent: %llu, new: %llu\n",
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf1..b79eff709958 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
64 struct inode *old_dir, struct inode *new_dir); 64 struct inode *old_dir, struct inode *new_dir);
65 65
66extern spinlock_t dentry_attach_lock; 66extern spinlock_t dentry_attach_lock;
67void ocfs2_dentry_attach_gen(struct dentry *dentry);
67 68
68#endif /* OCFS2_DCACHE_H */ 69#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c49f6de0e7ab..d417b3f9b0c7 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2461 2461
2462 di->i_dx_root = cpu_to_le64(dr_blkno); 2462 di->i_dx_root = cpu_to_le64(dr_blkno);
2463 2463
2464 spin_lock(&OCFS2_I(dir)->ip_lock);
2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2465 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2466 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2467 spin_unlock(&OCFS2_I(dir)->ip_lock);
2466 2468
2467 ocfs2_journal_dirty(handle, di_bh); 2469 ocfs2_journal_dirty(handle, di_bh);
2468 2470
@@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4466 goto out_commit; 4468 goto out_commit;
4467 } 4469 }
4468 4470
4471 spin_lock(&OCFS2_I(dir)->ip_lock);
4469 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; 4472 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4470 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 4473 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4474 spin_unlock(&OCFS2_I(dir)->ip_lock);
4471 di->i_dx_root = cpu_to_le64(0ULL); 4475 di->i_dx_root = cpu_to_le64(0ULL);
4472 4476
4473 ocfs2_journal_dirty(handle, di_bh); 4477 ocfs2_journal_dirty(handle, di_bh);
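
ip_dyn_features is a bitmask updated from several code paths, so both the set and the clear of OCFS2_INDEXED_DIR_FL now happen under ip_lock, making the read-modify-write of the in-memory flags and the copy-out to the on-disk inode a single atomic step:

	spin_lock(&OCFS2_I(dir)->ip_lock);
	OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;	/* or &= ~ */
	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
	spin_unlock(&OCFS2_I(dir)->ip_lock);
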
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f44999156839..3a3ed4bb794b 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -90,19 +90,29 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
90 90
91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 struct dlm_lock_resource *res;
94 94
95 BUG_ON(!dlm); 95 BUG_ON(!dlm);
96 BUG_ON(!lock); 96 BUG_ON(!lock);
97 97
98 res = lock->lockres;
99
98 assert_spin_locked(&dlm->ast_lock); 100 assert_spin_locked(&dlm->ast_lock);
101
99 if (!list_empty(&lock->ast_list)) { 102 if (!list_empty(&lock->ast_list)) {
100 mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", 103 mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
104 "AST list not empty, pending %d, newlevel %d\n",
105 dlm->name, res->lockname.len, res->lockname.name,
106 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
107 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
101 lock->ast_pending, lock->ml.type); 108 lock->ast_pending, lock->ml.type);
102 BUG(); 109 BUG();
103 } 110 }
104 if (lock->ast_pending) 111 if (lock->ast_pending)
105 mlog(0, "lock has an ast getting flushed right now\n"); 112 mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
113 dlm->name, res->lockname.len, res->lockname.name,
114 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
115 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
106 116
107 /* putting lock on list, add a ref */ 117 /* putting lock on list, add a ref */
108 dlm_lock_get(lock); 118 dlm_lock_get(lock);
@@ -110,9 +120,10 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
110 120
111 /* check to see if this ast obsoletes the bast */ 121 /* check to see if this ast obsoletes the bast */
112 if (dlm_should_cancel_bast(dlm, lock)) { 122 if (dlm_should_cancel_bast(dlm, lock)) {
113 struct dlm_lock_resource *res = lock->lockres; 123 mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
114 mlog(0, "%s: cancelling bast for %.*s\n", 124 dlm->name, res->lockname.len, res->lockname.name,
115 dlm->name, res->lockname.len, res->lockname.name); 125 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
126 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
116 lock->bast_pending = 0; 127 lock->bast_pending = 0;
117 list_del_init(&lock->bast_list); 128 list_del_init(&lock->bast_list);
118 lock->ml.highest_blocked = LKM_IVMODE; 129 lock->ml.highest_blocked = LKM_IVMODE;
@@ -134,8 +145,6 @@ void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
134 145
135void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 146void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
136{ 147{
137 mlog_entry_void();
138
139 BUG_ON(!dlm); 148 BUG_ON(!dlm);
140 BUG_ON(!lock); 149 BUG_ON(!lock);
141 150
@@ -147,15 +156,21 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
147 156
148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 157void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 158{
150 mlog_entry_void(); 159 struct dlm_lock_resource *res;
151 160
152 BUG_ON(!dlm); 161 BUG_ON(!dlm);
153 BUG_ON(!lock); 162 BUG_ON(!lock);
163
154 assert_spin_locked(&dlm->ast_lock); 164 assert_spin_locked(&dlm->ast_lock);
155 165
166 res = lock->lockres;
167
156 BUG_ON(!list_empty(&lock->bast_list)); 168 BUG_ON(!list_empty(&lock->bast_list));
157 if (lock->bast_pending) 169 if (lock->bast_pending)
158 mlog(0, "lock has a bast getting flushed right now\n"); 170 mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
171 dlm->name, res->lockname.len, res->lockname.name,
172 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
173 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
159 174
160 /* putting lock on list, add a ref */ 175 /* putting lock on list, add a ref */
161 dlm_lock_get(lock); 176 dlm_lock_get(lock);
@@ -167,8 +182,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
167 182
168void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 183void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
169{ 184{
170 mlog_entry_void();
171
172 BUG_ON(!dlm); 185 BUG_ON(!dlm);
173 BUG_ON(!lock); 186 BUG_ON(!lock);
174 187
@@ -213,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
213 dlm_astlockfunc_t *fn; 226 dlm_astlockfunc_t *fn;
214 struct dlm_lockstatus *lksb; 227 struct dlm_lockstatus *lksb;
215 228
216 mlog_entry_void(); 229 mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
230 res->lockname.len, res->lockname.name,
231 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
232 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
217 233
218 lksb = lock->lksb; 234 lksb = lock->lksb;
219 fn = lock->ast; 235 fn = lock->ast;
@@ -231,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
231 struct dlm_lockstatus *lksb; 247 struct dlm_lockstatus *lksb;
232 int lksbflags; 248 int lksbflags;
233 249
234 mlog_entry_void(); 250 mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
251 res->lockname.len, res->lockname.name,
252 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
253 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
235 254
236 lksb = lock->lksb; 255 lksb = lock->lksb;
237 BUG_ON(lock->ml.node == dlm->node_num); 256 BUG_ON(lock->ml.node == dlm->node_num);
@@ -250,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
250{ 269{
251 dlm_bastlockfunc_t *fn = lock->bast; 270 dlm_bastlockfunc_t *fn = lock->bast;
252 271
253 mlog_entry_void();
254 BUG_ON(lock->ml.node != dlm->node_num); 272 BUG_ON(lock->ml.node != dlm->node_num);
255 273
274 mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
275 dlm->name, res->lockname.len, res->lockname.name,
276 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
277 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
278 blocked_type);
279
256 (*fn)(lock->astdata, blocked_type); 280 (*fn)(lock->astdata, blocked_type);
257} 281}
258 282
@@ -332,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
332 /* cannot get a proxy ast message if this node owns it */ 356 /* cannot get a proxy ast message if this node owns it */
333 BUG_ON(res->owner == dlm->node_num); 357 BUG_ON(res->owner == dlm->node_num);
334 358
335 mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); 359 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
360 res->lockname.name);
336 361
337 spin_lock(&res->spinlock); 362 spin_lock(&res->spinlock);
338 if (res->state & DLM_LOCK_RES_RECOVERING) { 363 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -382,8 +407,12 @@ do_ast:
382 if (past->type == DLM_AST) { 407 if (past->type == DLM_AST) {
383 /* do not alter lock refcount. switching lists. */ 408 /* do not alter lock refcount. switching lists. */
384 list_move_tail(&lock->list, &res->granted); 409 list_move_tail(&lock->list, &res->granted);
385 mlog(0, "ast: Adding to granted list... type=%d, " 410 mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
386 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 411 dlm->name, res->lockname.len, res->lockname.name,
412 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
413 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
414 lock->ml.type, lock->ml.convert_type);
415
387 if (lock->ml.convert_type != LKM_IVMODE) { 416 if (lock->ml.convert_type != LKM_IVMODE) {
388 lock->ml.type = lock->ml.convert_type; 417 lock->ml.type = lock->ml.convert_type;
389 lock->ml.convert_type = LKM_IVMODE; 418 lock->ml.convert_type = LKM_IVMODE;
@@ -426,9 +455,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
426 size_t veclen = 1; 455 size_t veclen = 1;
427 int status; 456 int status;
428 457
429 mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", 458 mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
430 res->lockname.len, res->lockname.name, lock->ml.node, 459 res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
431 msg_type, blocked_type); 460 blocked_type);
432 461
433 memset(&past, 0, sizeof(struct dlm_proxy_ast)); 462 memset(&past, 0, sizeof(struct dlm_proxy_ast));
434 past.node_idx = dlm->node_num; 463 past.node_idx = dlm->node_num;
@@ -441,7 +470,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
441 vec[0].iov_len = sizeof(struct dlm_proxy_ast); 470 vec[0].iov_len = sizeof(struct dlm_proxy_ast);
442 vec[0].iov_base = &past; 471 vec[0].iov_base = &past;
443 if (flags & DLM_LKSB_GET_LVB) { 472 if (flags & DLM_LKSB_GET_LVB) {
444 mlog(0, "returning requested LVB data\n");
445 be32_add_cpu(&past.flags, LKM_GET_LVB); 473 be32_add_cpu(&past.flags, LKM_GET_LVB);
446 vec[1].iov_len = DLM_LVB_LEN; 474 vec[1].iov_len = DLM_LVB_LEN;
447 vec[1].iov_base = lock->lksb->lvb; 475 vec[1].iov_base = lock->lksb->lvb;
@@ -451,8 +479,8 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 479 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 480 lock->ml.node, &status);
453 if (ret < 0) 481 if (ret < 0)
454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 482 mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, 483 dlm->name, res->lockname.len, res->lockname.name, ret,
456 lock->ml.node); 484 lock->ml.node);
457 else { 485 else {
458 if (status == DLM_RECOVERING) { 486 if (status == DLM_RECOVERING) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 765298908f1d..4bdf7baee344 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -50,10 +50,10 @@
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) 50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
51 51
52enum dlm_mle_type { 52enum dlm_mle_type {
53 DLM_MLE_BLOCK, 53 DLM_MLE_BLOCK = 0,
54 DLM_MLE_MASTER, 54 DLM_MLE_MASTER = 1,
55 DLM_MLE_MIGRATION, 55 DLM_MLE_MIGRATION = 2,
56 DLM_MLE_NUM_TYPES 56 DLM_MLE_NUM_TYPES = 3,
57}; 57};
58 58
59struct dlm_master_list_entry { 59struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
82 82
83enum dlm_ast_type { 83enum dlm_ast_type {
84 DLM_AST = 0, 84 DLM_AST = 0,
85 DLM_BAST, 85 DLM_BAST = 1,
86 DLM_ASTUNLOCK 86 DLM_ASTUNLOCK = 2,
87}; 87};
88 88
89 89
@@ -119,9 +119,9 @@ struct dlm_recovery_ctxt
119 119
120enum dlm_ctxt_state { 120enum dlm_ctxt_state {
121 DLM_CTXT_NEW = 0, 121 DLM_CTXT_NEW = 0,
122 DLM_CTXT_JOINED, 122 DLM_CTXT_JOINED = 1,
123 DLM_CTXT_IN_SHUTDOWN, 123 DLM_CTXT_IN_SHUTDOWN = 2,
124 DLM_CTXT_LEAVING, 124 DLM_CTXT_LEAVING = 3,
125}; 125};
126 126
127struct dlm_ctxt 127struct dlm_ctxt
@@ -388,8 +388,8 @@ struct dlm_lock
388 388
389enum dlm_lockres_list { 389enum dlm_lockres_list {
390 DLM_GRANTED_LIST = 0, 390 DLM_GRANTED_LIST = 0,
391 DLM_CONVERTING_LIST, 391 DLM_CONVERTING_LIST = 1,
392 DLM_BLOCKED_LIST 392 DLM_BLOCKED_LIST = 2,
393}; 393};
394 394
395static inline int dlm_lvb_is_empty(char *lvb) 395static inline int dlm_lvb_is_empty(char *lvb)
@@ -427,25 +427,27 @@ struct dlm_node_iter
427 427
428 428
429enum { 429enum {
430 DLM_MASTER_REQUEST_MSG = 500, 430 DLM_MASTER_REQUEST_MSG = 500,
431 DLM_UNUSED_MSG1, /* 501 */ 431 DLM_UNUSED_MSG1 = 501,
432 DLM_ASSERT_MASTER_MSG, /* 502 */ 432 DLM_ASSERT_MASTER_MSG = 502,
433 DLM_CREATE_LOCK_MSG, /* 503 */ 433 DLM_CREATE_LOCK_MSG = 503,
434 DLM_CONVERT_LOCK_MSG, /* 504 */ 434 DLM_CONVERT_LOCK_MSG = 504,
435 DLM_PROXY_AST_MSG, /* 505 */ 435 DLM_PROXY_AST_MSG = 505,
436 DLM_UNLOCK_LOCK_MSG, /* 506 */ 436 DLM_UNLOCK_LOCK_MSG = 506,
437 DLM_DEREF_LOCKRES_MSG, /* 507 */ 437 DLM_DEREF_LOCKRES_MSG = 507,
438 DLM_MIGRATE_REQUEST_MSG, /* 508 */ 438 DLM_MIGRATE_REQUEST_MSG = 508,
439 DLM_MIG_LOCKRES_MSG, /* 509 */ 439 DLM_MIG_LOCKRES_MSG = 509,
440 DLM_QUERY_JOIN_MSG, /* 510 */ 440 DLM_QUERY_JOIN_MSG = 510,
441 DLM_ASSERT_JOINED_MSG, /* 511 */ 441 DLM_ASSERT_JOINED_MSG = 511,
442 DLM_CANCEL_JOIN_MSG, /* 512 */ 442 DLM_CANCEL_JOIN_MSG = 512,
443 DLM_EXIT_DOMAIN_MSG, /* 513 */ 443 DLM_EXIT_DOMAIN_MSG = 513,
444 DLM_MASTER_REQUERY_MSG, /* 514 */ 444 DLM_MASTER_REQUERY_MSG = 514,
445 DLM_LOCK_REQUEST_MSG, /* 515 */ 445 DLM_LOCK_REQUEST_MSG = 515,
446 DLM_RECO_DATA_DONE_MSG, /* 516 */ 446 DLM_RECO_DATA_DONE_MSG = 516,
447 DLM_BEGIN_RECO_MSG, /* 517 */ 447 DLM_BEGIN_RECO_MSG = 517,
448 DLM_FINALIZE_RECO_MSG /* 518 */ 448 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO = 520,
449}; 451};
450 452
451struct dlm_reco_node_data 453struct dlm_reco_node_data
@@ -458,19 +460,19 @@ struct dlm_reco_node_data
458enum { 460enum {
459 DLM_RECO_NODE_DATA_DEAD = -1, 461 DLM_RECO_NODE_DATA_DEAD = -1,
460 DLM_RECO_NODE_DATA_INIT = 0, 462 DLM_RECO_NODE_DATA_INIT = 0,
461 DLM_RECO_NODE_DATA_REQUESTING, 463 DLM_RECO_NODE_DATA_REQUESTING = 1,
462 DLM_RECO_NODE_DATA_REQUESTED, 464 DLM_RECO_NODE_DATA_REQUESTED = 2,
463 DLM_RECO_NODE_DATA_RECEIVING, 465 DLM_RECO_NODE_DATA_RECEIVING = 3,
464 DLM_RECO_NODE_DATA_DONE, 466 DLM_RECO_NODE_DATA_DONE = 4,
465 DLM_RECO_NODE_DATA_FINALIZE_SENT, 467 DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
466}; 468};
467 469
468 470
469enum { 471enum {
470 DLM_MASTER_RESP_NO = 0, 472 DLM_MASTER_RESP_NO = 0,
471 DLM_MASTER_RESP_YES, 473 DLM_MASTER_RESP_YES = 1,
472 DLM_MASTER_RESP_MAYBE, 474 DLM_MASTER_RESP_MAYBE = 2,
473 DLM_MASTER_RESP_ERROR 475 DLM_MASTER_RESP_ERROR = 3,
474}; 476};
475 477
476 478
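The churn in these enums is deliberate: every enumerator that crosses the wire now carries an explicit value, so inserting a new constant can never silently renumber the ones after it. A short demonstration of the hazard with implicit numbering (illustrative enums, not the DLM's):

	#include <assert.h>

	enum v1 { MSG_A = 500, MSG_B, MSG_C };			/* B=501, C=502 */
	enum v2 { MSG_A2 = 500, MSG_NEW, MSG_B2, MSG_C2 };	/* B2 is now 502 */

	int main(void)
	{
		assert(MSG_B == 501 && MSG_B2 == 502);	/* same logical message,
							   different wire value */
		return 0;
	}

With explicit values, a mixed cluster of old and new nodes keeps agreeing on what each message number means.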
@@ -647,9 +649,9 @@ struct dlm_proxy_ast
647#define DLM_MOD_KEY (0x666c6172) 649#define DLM_MOD_KEY (0x666c6172)
648enum dlm_query_join_response_code { 650enum dlm_query_join_response_code {
649 JOIN_DISALLOW = 0, 651 JOIN_DISALLOW = 0,
650 JOIN_OK, 652 JOIN_OK = 1,
651 JOIN_OK_NO_MAP, 653 JOIN_OK_NO_MAP = 2,
652 JOIN_PROTOCOL_MISMATCH, 654 JOIN_PROTOCOL_MISMATCH = 3,
653}; 655};
654 656
655struct dlm_query_join_packet { 657struct dlm_query_join_packet {
@@ -727,6 +729,31 @@ struct dlm_cancel_join
727 u8 domain[O2NM_MAX_NAME_LEN]; 729 u8 domain[O2NM_MAX_NAME_LEN];
728}; 730};
729 731
732struct dlm_query_region {
733 u8 qr_node;
734 u8 qr_numregions;
735 u8 qr_namelen;
736 u8 pad1;
737 u8 qr_domain[O2NM_MAX_NAME_LEN];
738 u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
739};
740
741struct dlm_node_info {
742 u8 ni_nodenum;
743 u8 pad1;
744 u16 ni_ipv4_port;
745 u32 ni_ipv4_address;
746};
747
748struct dlm_query_nodeinfo {
749 u8 qn_nodenum;
750 u8 qn_numnodes;
751 u8 qn_namelen;
752 u8 pad1;
753 u8 qn_domain[O2NM_MAX_NAME_LEN];
754 struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
755};
756
730struct dlm_exit_domain 757struct dlm_exit_domain
731{ 758{
732 u8 node_idx; 759 u8 node_idx;
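Both new query structures are shipped verbatim as message payloads, so their byte layout, including the pad1 bytes that keep the following members naturally aligned, is effectively part of the protocol. A quick layout check with placeholder values for the cluster limits (the real O2NM_*/O2HB_* constants live in the o2nm and o2hb headers):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	#define O2NM_MAX_NAME_LEN	64	/* placeholder value */
	#define O2NM_MAX_NODES		255	/* placeholder value */

	struct dlm_node_info {
		uint8_t  ni_nodenum;
		uint8_t  pad1;
		uint16_t ni_ipv4_port;
		uint32_t ni_ipv4_address;
	};

	struct dlm_query_nodeinfo {
		uint8_t  qn_nodenum;
		uint8_t  qn_numnodes;
		uint8_t  qn_namelen;
		uint8_t  pad1;
		uint8_t  qn_domain[O2NM_MAX_NAME_LEN];
		struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
	};

	int main(void)
	{
		printf("dlm_node_info: %zu bytes, port at %zu, address at %zu\n",
		       sizeof(struct dlm_node_info),
		       offsetof(struct dlm_node_info, ni_ipv4_port),
		       offsetof(struct dlm_node_info, ni_ipv4_address));
		printf("dlm_query_nodeinfo: %zu bytes\n",
		       sizeof(struct dlm_query_nodeinfo));
		return 0;
	}

The port and address fields are carried in network byte order, which is why the handlers in dlmdomain.c print them through ntohs().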
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 901ca52bf86b..04a32be0aeb9 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -370,92 +370,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
370 kref_get(&dc->debug_refcnt); 370 kref_get(&dc->debug_refcnt);
371} 371}
372 372
373static struct debug_buffer *debug_buffer_allocate(void) 373static int debug_release(struct inode *inode, struct file *file)
374{ 374{
375 struct debug_buffer *db = NULL; 375 free_page((unsigned long)file->private_data);
376 376 return 0;
377 db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
378 if (!db)
379 goto bail;
380
381 db->len = PAGE_SIZE;
382 db->buf = kmalloc(db->len, GFP_KERNEL);
383 if (!db->buf)
384 goto bail;
385
386 return db;
387bail:
388 kfree(db);
389 return NULL;
390}
391
392static ssize_t debug_buffer_read(struct file *file, char __user *buf,
393 size_t nbytes, loff_t *ppos)
394{
395 struct debug_buffer *db = file->private_data;
396
397 return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
398}
399
400static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
401{
402 struct debug_buffer *db = file->private_data;
403 loff_t new = -1;
404
405 switch (whence) {
406 case 0:
407 new = off;
408 break;
409 case 1:
410 new = file->f_pos + off;
411 break;
412 }
413
414 if (new < 0 || new > db->len)
415 return -EINVAL;
416
417 return (file->f_pos = new);
418} 377}
419 378
420static int debug_buffer_release(struct inode *inode, struct file *file) 379static ssize_t debug_read(struct file *file, char __user *buf,
380 size_t nbytes, loff_t *ppos)
421{ 381{
422 struct debug_buffer *db = file->private_data; 382 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
423 383 i_size_read(file->f_mapping->host));
424 if (db)
425 kfree(db->buf);
426 kfree(db);
427
428 return 0;
429} 384}
430/* end - util funcs */ 385/* end - util funcs */
431 386
432/* begin - purge list funcs */ 387/* begin - purge list funcs */
433static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 388static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
434{ 389{
435 struct dlm_lock_resource *res; 390 struct dlm_lock_resource *res;
436 int out = 0; 391 int out = 0;
437 unsigned long total = 0; 392 unsigned long total = 0;
438 393
439 out += snprintf(db->buf + out, db->len - out, 394 out += snprintf(buf + out, len - out,
440 "Dumping Purgelist for Domain: %s\n", dlm->name); 395 "Dumping Purgelist for Domain: %s\n", dlm->name);
441 396
442 spin_lock(&dlm->spinlock); 397 spin_lock(&dlm->spinlock);
443 list_for_each_entry(res, &dlm->purge_list, purge) { 398 list_for_each_entry(res, &dlm->purge_list, purge) {
444 ++total; 399 ++total;
445 if (db->len - out < 100) 400 if (len - out < 100)
446 continue; 401 continue;
447 spin_lock(&res->spinlock); 402 spin_lock(&res->spinlock);
448 out += stringify_lockname(res->lockname.name, 403 out += stringify_lockname(res->lockname.name,
449 res->lockname.len, 404 res->lockname.len,
450 db->buf + out, db->len - out); 405 buf + out, len - out);
451 out += snprintf(db->buf + out, db->len - out, "\t%ld\n", 406 out += snprintf(buf + out, len - out, "\t%ld\n",
452 (jiffies - res->last_used)/HZ); 407 (jiffies - res->last_used)/HZ);
453 spin_unlock(&res->spinlock); 408 spin_unlock(&res->spinlock);
454 } 409 }
455 spin_unlock(&dlm->spinlock); 410 spin_unlock(&dlm->spinlock);
456 411
457 out += snprintf(db->buf + out, db->len - out, 412 out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
458 "Total on list: %ld\n", total);
459 413
460 return out; 414 return out;
461} 415}
@@ -463,15 +417,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
463static int debug_purgelist_open(struct inode *inode, struct file *file) 417static int debug_purgelist_open(struct inode *inode, struct file *file)
464{ 418{
465 struct dlm_ctxt *dlm = inode->i_private; 419 struct dlm_ctxt *dlm = inode->i_private;
466 struct debug_buffer *db; 420 char *buf = NULL;
467 421
468 db = debug_buffer_allocate(); 422 buf = (char *) get_zeroed_page(GFP_NOFS);
469 if (!db) 423 if (!buf)
470 goto bail; 424 goto bail;
471 425
472 db->len = debug_purgelist_print(dlm, db); 426 i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
473 427
474 file->private_data = db; 428 file->private_data = buf;
475 429
476 return 0; 430 return 0;
477bail: 431bail:
@@ -480,22 +434,22 @@ bail:
480 434
481static const struct file_operations debug_purgelist_fops = { 435static const struct file_operations debug_purgelist_fops = {
482 .open = debug_purgelist_open, 436 .open = debug_purgelist_open,
483 .release = debug_buffer_release, 437 .release = debug_release,
484 .read = debug_buffer_read, 438 .read = debug_read,
485 .llseek = debug_buffer_llseek, 439 .llseek = generic_file_llseek,
486}; 440};
487/* end - purge list funcs */ 441/* end - purge list funcs */
488 442
489/* begin - debug mle funcs */ 443/* begin - debug mle funcs */
490static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 444static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
491{ 445{
492 struct dlm_master_list_entry *mle; 446 struct dlm_master_list_entry *mle;
493 struct hlist_head *bucket; 447 struct hlist_head *bucket;
494 struct hlist_node *list; 448 struct hlist_node *list;
495 int i, out = 0; 449 int i, out = 0;
496 unsigned long total = 0, longest = 0, bktcnt; 450 unsigned long total = 0, longest = 0, bucket_count = 0;
497 451
498 out += snprintf(db->buf + out, db->len - out, 452 out += snprintf(buf + out, len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 453 "Dumping MLEs for Domain: %s\n", dlm->name);
500 454
501 spin_lock(&dlm->master_lock); 455 spin_lock(&dlm->master_lock);
@@ -505,17 +459,17 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
505 mle = hlist_entry(list, struct dlm_master_list_entry, 459 mle = hlist_entry(list, struct dlm_master_list_entry,
506 master_hash_node); 460 master_hash_node);
507 ++total; 461 ++total;
508 ++bktcnt; 462 ++bucket_count;
509 if (db->len - out < 200) 463 if (len - out < 200)
510 continue; 464 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 465 out += dump_mle(mle, buf + out, len - out);
512 } 466 }
513 longest = max(longest, bktcnt); 467 longest = max(longest, bucket_count);
514 bktcnt = 0; 468 bucket_count = 0;
515 } 469 }
516 spin_unlock(&dlm->master_lock); 470 spin_unlock(&dlm->master_lock);
517 471
518 out += snprintf(db->buf + out, db->len - out, 472 out += snprintf(buf + out, len - out,
519 "Total: %ld, Longest: %ld\n", total, longest); 473 "Total: %ld, Longest: %ld\n", total, longest);
520 return out; 474 return out;
521} 475}
@@ -523,15 +477,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
523static int debug_mle_open(struct inode *inode, struct file *file) 477static int debug_mle_open(struct inode *inode, struct file *file)
524{ 478{
525 struct dlm_ctxt *dlm = inode->i_private; 479 struct dlm_ctxt *dlm = inode->i_private;
526 struct debug_buffer *db; 480 char *buf = NULL;
527 481
528 db = debug_buffer_allocate(); 482 buf = (char *) get_zeroed_page(GFP_NOFS);
529 if (!db) 483 if (!buf)
530 goto bail; 484 goto bail;
531 485
532 db->len = debug_mle_print(dlm, db); 486 i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
533 487
534 file->private_data = db; 488 file->private_data = buf;
535 489
536 return 0; 490 return 0;
537bail: 491bail:
@@ -540,9 +494,9 @@ bail:
540 494
541static const struct file_operations debug_mle_fops = { 495static const struct file_operations debug_mle_fops = {
542 .open = debug_mle_open, 496 .open = debug_mle_open,
543 .release = debug_buffer_release, 497 .release = debug_release,
544 .read = debug_buffer_read, 498 .read = debug_read,
545 .llseek = debug_buffer_llseek, 499 .llseek = generic_file_llseek,
546}; 500};
547 501
548/* end - debug mle funcs */ 502/* end - debug mle funcs */
@@ -757,7 +711,7 @@ static const struct file_operations debug_lockres_fops = {
757/* end - debug lockres funcs */ 711/* end - debug lockres funcs */
758 712
759/* begin - debug state funcs */ 713/* begin - debug state funcs */
760static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 714static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
761{ 715{
762 int out = 0; 716 int out = 0;
763 struct dlm_reco_node_data *node; 717 struct dlm_reco_node_data *node;
@@ -781,33 +735,35 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
781 } 735 }
782 736
783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 737 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
784 out += snprintf(db->buf + out, db->len - out, 738 out += snprintf(buf + out, len - out,
785 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); 739 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
740 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
741 dlm->dlm_locking_proto.pv_minor);
786 742
787 /* Thread Pid: xxx Node: xxx State: xxxxx */ 743 /* Thread Pid: xxx Node: xxx State: xxxxx */
788 out += snprintf(db->buf + out, db->len - out, 744 out += snprintf(buf + out, len - out,
789 "Thread Pid: %d Node: %d State: %s\n", 745 "Thread Pid: %d Node: %d State: %s\n",
790 dlm->dlm_thread_task->pid, dlm->node_num, state); 746 task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
791 747
792 /* Number of Joins: xxx Joining Node: xxx */ 748 /* Number of Joins: xxx Joining Node: xxx */
793 out += snprintf(db->buf + out, db->len - out, 749 out += snprintf(buf + out, len - out,
794 "Number of Joins: %d Joining Node: %d\n", 750 "Number of Joins: %d Joining Node: %d\n",
795 dlm->num_joins, dlm->joining_node); 751 dlm->num_joins, dlm->joining_node);
796 752
797 /* Domain Map: xx xx xx */ 753 /* Domain Map: xx xx xx */
798 out += snprintf(db->buf + out, db->len - out, "Domain Map: "); 754 out += snprintf(buf + out, len - out, "Domain Map: ");
799 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, 755 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
800 db->buf + out, db->len - out); 756 buf + out, len - out);
801 out += snprintf(db->buf + out, db->len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
802 758
803 /* Live Map: xx xx xx */ 759 /* Live Map: xx xx xx */
804 out += snprintf(db->buf + out, db->len - out, "Live Map: "); 760 out += snprintf(buf + out, len - out, "Live Map: ");
805 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
806 db->buf + out, db->len - out); 762 buf + out, len - out);
807 out += snprintf(db->buf + out, db->len - out, "\n"); 763 out += snprintf(buf + out, len - out, "\n");
808 764
809 /* Lock Resources: xxx (xxx) */ 765 /* Lock Resources: xxx (xxx) */
810 out += snprintf(db->buf + out, db->len - out, 766 out += snprintf(buf + out, len - out,
811 "Lock Resources: %d (%d)\n", 767 "Lock Resources: %d (%d)\n",
812 atomic_read(&dlm->res_cur_count), 768 atomic_read(&dlm->res_cur_count),
813 atomic_read(&dlm->res_tot_count)); 769 atomic_read(&dlm->res_tot_count));
@@ -819,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
819 cur_mles += atomic_read(&dlm->mle_cur_count[i]); 775 cur_mles += atomic_read(&dlm->mle_cur_count[i]);
820 776
821 /* MLEs: xxx (xxx) */ 777 /* MLEs: xxx (xxx) */
822 out += snprintf(db->buf + out, db->len - out, 778 out += snprintf(buf + out, len - out,
823 "MLEs: %d (%d)\n", cur_mles, tot_mles); 779 "MLEs: %d (%d)\n", cur_mles, tot_mles);
824 780
825 /* Blocking: xxx (xxx) */ 781 /* Blocking: xxx (xxx) */
826 out += snprintf(db->buf + out, db->len - out, 782 out += snprintf(buf + out, len - out,
827 " Blocking: %d (%d)\n", 783 " Blocking: %d (%d)\n",
828 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), 784 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
829 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); 785 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
830 786
831 /* Mastery: xxx (xxx) */ 787 /* Mastery: xxx (xxx) */
832 out += snprintf(db->buf + out, db->len - out, 788 out += snprintf(buf + out, len - out,
833 " Mastery: %d (%d)\n", 789 " Mastery: %d (%d)\n",
834 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), 790 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
835 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); 791 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
836 792
837 /* Migration: xxx (xxx) */ 793 /* Migration: xxx (xxx) */
838 out += snprintf(db->buf + out, db->len - out, 794 out += snprintf(buf + out, len - out,
839 " Migration: %d (%d)\n", 795 " Migration: %d (%d)\n",
840 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), 796 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
841 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); 797 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
842 798
843 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ 799 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
844 out += snprintf(db->buf + out, db->len - out, 800 out += snprintf(buf + out, len - out,
845 "Lists: Dirty=%s Purge=%s PendingASTs=%s " 801 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
846 "PendingBASTs=%s\n", 802 "PendingBASTs=%s\n",
847 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), 803 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -850,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
850 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); 806 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
851 807
852 /* Purge Count: xxx Refs: xxx */ 808 /* Purge Count: xxx Refs: xxx */
853 out += snprintf(db->buf + out, db->len - out, 809 out += snprintf(buf + out, len - out,
854 "Purge Count: %d Refs: %d\n", dlm->purge_count, 810 "Purge Count: %d Refs: %d\n", dlm->purge_count,
855 atomic_read(&dlm->dlm_refs.refcount)); 811 atomic_read(&dlm->dlm_refs.refcount));
856 812
857 /* Dead Node: xxx */ 813 /* Dead Node: xxx */
858 out += snprintf(db->buf + out, db->len - out, 814 out += snprintf(buf + out, len - out,
859 "Dead Node: %d\n", dlm->reco.dead_node); 815 "Dead Node: %d\n", dlm->reco.dead_node);
860 816
861 /* What about DLM_RECO_STATE_FINALIZE? */ 817 /* What about DLM_RECO_STATE_FINALIZE? */
@@ -865,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
865 state = "INACTIVE"; 821 state = "INACTIVE";
866 822
867 /* Recovery Pid: xxxx Master: xxx State: xxxx */ 823 /* Recovery Pid: xxxx Master: xxx State: xxxx */
868 out += snprintf(db->buf + out, db->len - out, 824 out += snprintf(buf + out, len - out,
869 "Recovery Pid: %d Master: %d State: %s\n", 825 "Recovery Pid: %d Master: %d State: %s\n",
870 dlm->dlm_reco_thread_task->pid, 826 task_pid_nr(dlm->dlm_reco_thread_task),
871 dlm->reco.new_master, state); 827 dlm->reco.new_master, state);
872 828
873 /* Recovery Map: xx xx */ 829 /* Recovery Map: xx xx */
874 out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); 830 out += snprintf(buf + out, len - out, "Recovery Map: ");
875 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, 831 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
876 db->buf + out, db->len - out); 832 buf + out, len - out);
877 out += snprintf(db->buf + out, db->len - out, "\n"); 833 out += snprintf(buf + out, len - out, "\n");
878 834
879 /* Recovery Node State: */ 835 /* Recovery Node State: */
880 out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); 836 out += snprintf(buf + out, len - out, "Recovery Node State:\n");
881 list_for_each_entry(node, &dlm->reco.node_data, list) { 837 list_for_each_entry(node, &dlm->reco.node_data, list) {
882 switch (node->state) { 838 switch (node->state) {
883 case DLM_RECO_NODE_DATA_INIT: 839 case DLM_RECO_NODE_DATA_INIT:
@@ -905,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
905 state = "BAD"; 861 state = "BAD";
906 break; 862 break;
907 } 863 }
908 out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", 864 out += snprintf(buf + out, len - out, "\t%u - %s\n",
909 node->node_num, state); 865 node->node_num, state);
910 } 866 }
911 867
@@ -917,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
917static int debug_state_open(struct inode *inode, struct file *file) 873static int debug_state_open(struct inode *inode, struct file *file)
918{ 874{
919 struct dlm_ctxt *dlm = inode->i_private; 875 struct dlm_ctxt *dlm = inode->i_private;
920 struct debug_buffer *db = NULL; 876 char *buf = NULL;
921 877
922 db = debug_buffer_allocate(); 878 buf = (char *) get_zeroed_page(GFP_NOFS);
923 if (!db) 879 if (!buf)
924 goto bail; 880 goto bail;
925 881
926 db->len = debug_state_print(dlm, db); 882 i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
927 883
928 file->private_data = db; 884 file->private_data = buf;
929 885
930 return 0; 886 return 0;
931bail: 887bail:
@@ -934,9 +890,9 @@ bail:
934 890
935static const struct file_operations debug_state_fops = { 891static const struct file_operations debug_state_fops = {
936 .open = debug_state_open, 892 .open = debug_state_open,
937 .release = debug_buffer_release, 893 .release = debug_release,
938 .read = debug_buffer_read, 894 .read = debug_read,
939 .llseek = debug_buffer_llseek, 895 .llseek = generic_file_llseek,
940}; 896};
941/* end - debug state funcs */ 897/* end - debug state funcs */
942 898
@@ -1000,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
1000 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; 956 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
1001 957
1002 if (dc) { 958 if (dc) {
1003 if (dc->debug_purgelist_dentry) 959 debugfs_remove(dc->debug_purgelist_dentry);
1004 debugfs_remove(dc->debug_purgelist_dentry); 960 debugfs_remove(dc->debug_mle_dentry);
1005 if (dc->debug_mle_dentry) 961 debugfs_remove(dc->debug_lockres_dentry);
1006 debugfs_remove(dc->debug_mle_dentry); 962 debugfs_remove(dc->debug_state_dentry);
1007 if (dc->debug_lockres_dentry)
1008 debugfs_remove(dc->debug_lockres_dentry);
1009 if (dc->debug_state_dentry)
1010 debugfs_remove(dc->debug_state_dentry);
1011 dlm_debug_put(dc); 963 dlm_debug_put(dc);
1012 } 964 }
1013} 965}
@@ -1038,8 +990,7 @@ bail:
1038 990
1039void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 991void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
1040{ 992{
1041 if (dlm->dlm_debugfs_subroot) 993 debugfs_remove(dlm->dlm_debugfs_subroot);
1042 debugfs_remove(dlm->dlm_debugfs_subroot);
1043} 994}
1044 995
1045/* debugfs root */ 996/* debugfs root */
@@ -1055,7 +1006,6 @@ int dlm_create_debugfs_root(void)
1055 1006
1056void dlm_destroy_debugfs_root(void) 1007void dlm_destroy_debugfs_root(void)
1057{ 1008{
1058 if (dlm_debugfs_root) 1009 debugfs_remove(dlm_debugfs_root);
1059 debugfs_remove(dlm_debugfs_root);
1060} 1010}
1061#endif /* CONFIG_DEBUG_FS */ 1011#endif /* CONFIG_DEBUG_FS */
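The net effect of this refactor is a simpler buffering scheme: open() fills one zeroed page and records the number of valid bytes in the inode size, read() becomes a simple_read_from_buffer() bounded by i_size_read(), llseek falls back to generic_file_llseek() (which consults the same inode size), and release() just frees the page. A minimal sketch of that pattern for a hypothetical debugfs file, assuming a kernel-module context (the myfs_* names are invented):

	#include <linux/debugfs.h>
	#include <linux/fs.h>
	#include <linux/gfp.h>
	#include <linux/kernel.h>

	static int myfs_debug_open(struct inode *inode, struct file *file)
	{
		char *buf = (char *)get_zeroed_page(GFP_NOFS);

		if (!buf)
			return -ENOMEM;
		/* Fill the page; the length written becomes the file size. */
		i_size_write(inode, scnprintf(buf, PAGE_SIZE - 1, "hello\n"));
		file->private_data = buf;
		return 0;
	}

	static ssize_t myfs_debug_read(struct file *file, char __user *ubuf,
				       size_t nbytes, loff_t *ppos)
	{
		return simple_read_from_buffer(ubuf, nbytes, ppos,
					       file->private_data,
					       i_size_read(file->f_mapping->host));
	}

	static int myfs_debug_release(struct inode *inode, struct file *file)
	{
		free_page((unsigned long)file->private_data);
		return 0;
	}

	static const struct file_operations myfs_debug_fops = {
		.open		= myfs_debug_open,
		.read		= myfs_debug_read,
		.release	= myfs_debug_release,
		.llseek		= generic_file_llseek,
	};

Storing the length in the inode rather than a private struct is what lets the stock llseek and a single page allocation replace debug_buffer entirely; output beyond one page is truncated, which the print helpers tolerate by checking the remaining length before each snprintf().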
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c7..1f27c4812d1a 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
37 struct dentry *debug_purgelist_dentry; 37 struct dentry *debug_purgelist_dentry;
38}; 38};
39 39
40struct debug_buffer {
41 int len;
42 char *buf;
43};
44
45struct debug_lockres { 40struct debug_lockres {
46 int dl_len; 41 int dl_len;
47 char *dl_buf; 42 char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 11a5c87fd7f7..7e38a072d720 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
128 * will have a negotiated version with the same major number and a minor 128 * will have a negotiated version with the same major number and a minor
129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should 129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
130 * be used to determine what a running domain is actually using. 130 * be used to determine what a running domain is actually using.
131 *
132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
131 */ 135 */
132static const struct dlm_protocol_version dlm_protocol = { 136static const struct dlm_protocol_version dlm_protocol = {
133 .pv_major = 1, 137 .pv_major = 1,
134 .pv_minor = 0, 138 .pv_minor = 1,
135}; 139};
136 140
137#define DLM_DOMAIN_BACKOFF_MS 200 141#define DLM_DOMAIN_BACKOFF_MS 200
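The comment spells out the negotiation rule this version bump relies on: a join only succeeds on an exact major match, and the domain then runs at the smaller of the two minors, so a 1.1 node can still join a 1.0 domain (it simply keeps the new messages to itself). A sketch of a compare routine implementing the described rule (dlm_protocol_compare() itself is defined elsewhere in this file; this is only the behaviour the comment promises):

	#include <stdint.h>

	struct dlm_protocol_version {
		uint8_t pv_major;
		uint8_t pv_minor;
	};

	/* Returns 0 and lowers *request to the negotiated minor,
	 * or nonzero on a major-number mismatch. */
	static int protocol_compare(struct dlm_protocol_version *existing,
				    struct dlm_protocol_version *request)
	{
		if (existing->pv_major != request->pv_major)
			return 1;
		if (request->pv_minor > existing->pv_minor)
			request->pv_minor = existing->pv_minor;
		return 0;
	}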
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
142 void **ret_data); 146 void **ret_data);
143static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 147static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
144 void **ret_data); 148 void **ret_data);
149static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
150 void *data, void **ret_data);
145static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 151static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
146 void **ret_data); 152 void **ret_data);
147static int dlm_protocol_compare(struct dlm_protocol_version *existing, 153static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -454,8 +460,6 @@ redo_bucket:
454 } 460 }
455 cond_resched_lock(&dlm->spinlock); 461 cond_resched_lock(&dlm->spinlock);
456 num += n; 462 num += n;
457 mlog(0, "%s: touched %d lockreses in bucket %d "
458 "(tot=%d)\n", dlm->name, n, i, num);
459 } 463 }
460 spin_unlock(&dlm->spinlock); 464 spin_unlock(&dlm->spinlock);
461 wake_up(&dlm->dlm_thread_wq); 465 wake_up(&dlm->dlm_thread_wq);
@@ -921,6 +925,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
921 return 0; 925 return 0;
922} 926}
923 927
928static int dlm_match_regions(struct dlm_ctxt *dlm,
929 struct dlm_query_region *qr)
930{
931 char *local = NULL, *remote = qr->qr_regions;
932 char *l, *r;
933 int localnr, i, j, foundit;
934 int status = 0;
935
936 if (!o2hb_global_heartbeat_active()) {
937 if (qr->qr_numregions) {
938 mlog(ML_ERROR, "Domain %s: Joining node %d has global "
939 "heartbeat enabled but local node %d does not\n",
940 qr->qr_domain, qr->qr_node, dlm->node_num);
941 status = -EINVAL;
942 }
943 goto bail;
944 }
945
946 if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
947 mlog(ML_ERROR, "Domain %s: Local node %d has global "
948 "heartbeat enabled but joining node %d does not\n",
949 qr->qr_domain, dlm->node_num, qr->qr_node);
950 status = -EINVAL;
951 goto bail;
952 }
953
954 r = remote;
955 for (i = 0; i < qr->qr_numregions; ++i) {
956 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
957 r += O2HB_MAX_REGION_NAME_LEN;
958 }
959
960 local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
961 if (!local) {
962 status = -ENOMEM;
963 goto bail;
964 }
965
966 localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
967
968 /* compare local regions with remote */
969 l = local;
970 for (i = 0; i < localnr; ++i) {
971 foundit = 0;
972 r = remote;
 973 for (j = 0; j < qr->qr_numregions; ++j) {
974 if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
975 foundit = 1;
976 break;
977 }
978 r += O2HB_MAX_REGION_NAME_LEN;
979 }
980 if (!foundit) {
981 status = -EINVAL;
982 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
983 "in local node %d but not in joining node %d\n",
984 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
985 dlm->node_num, qr->qr_node);
986 goto bail;
987 }
988 l += O2HB_MAX_REGION_NAME_LEN;
989 }
990
991 /* compare remote with local regions */
992 r = remote;
993 for (i = 0; i < qr->qr_numregions; ++i) {
994 foundit = 0;
995 l = local;
996 for (j = 0; j < localnr; ++j) {
997 if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
998 foundit = 1;
999 break;
1000 }
1001 l += O2HB_MAX_REGION_NAME_LEN;
1002 }
1003 if (!foundit) {
1004 status = -EINVAL;
1005 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
1006 "in joining node %d but not in local node %d\n",
1007 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
1008 qr->qr_node, dlm->node_num);
1009 goto bail;
1010 }
1011 r += O2HB_MAX_REGION_NAME_LEN;
1012 }
1013
1014bail:
1015 kfree(local);
1016
1017 return status;
1018}
1019
1020static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
1021{
1022 struct dlm_query_region *qr = NULL;
1023 int status, ret = 0, i;
1024 char *p;
1025
1026 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1027 goto bail;
1028
1029 qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
1030 if (!qr) {
1031 ret = -ENOMEM;
1032 mlog_errno(ret);
1033 goto bail;
1034 }
1035
1036 qr->qr_node = dlm->node_num;
1037 qr->qr_namelen = strlen(dlm->name);
1038 memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
1039 /* if local hb, the numregions will be zero */
1040 if (o2hb_global_heartbeat_active())
1041 qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
1042 O2NM_MAX_REGIONS);
1043
1044 p = qr->qr_regions;
1045 for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
1046 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
1047
1048 i = -1;
1049 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1050 i + 1)) < O2NM_MAX_NODES) {
1051 if (i == dlm->node_num)
1052 continue;
1053
1054 mlog(0, "Sending regions to node %d\n", i);
1055
1056 ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
1057 sizeof(struct dlm_query_region),
1058 i, &status);
1059 if (ret >= 0)
1060 ret = status;
1061 if (ret) {
1062 mlog(ML_ERROR, "Region mismatch %d, node %d\n",
1063 ret, i);
1064 break;
1065 }
1066 }
1067
1068bail:
1069 kfree(qr);
1070 return ret;
1071}
1072
1073static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1074 void *data, void **ret_data)
1075{
1076 struct dlm_query_region *qr;
1077 struct dlm_ctxt *dlm = NULL;
1078 int status = 0;
1079 int locked = 0;
1080
1081 qr = (struct dlm_query_region *) msg->buf;
1082
1083 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
1084 qr->qr_domain);
1085
1086 status = -EINVAL;
1087
1088 spin_lock(&dlm_domain_lock);
1089 dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
1090 if (!dlm) {
1091 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1092 "before join domain\n", qr->qr_node, qr->qr_domain);
1093 goto bail;
1094 }
1095
1096 spin_lock(&dlm->spinlock);
1097 locked = 1;
1098 if (dlm->joining_node != qr->qr_node) {
1099 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1100 "but joining node is %d\n", qr->qr_node, qr->qr_domain,
1101 dlm->joining_node);
1102 goto bail;
1103 }
1104
1105 /* Support for global heartbeat was added in 1.1 */
1106 if (dlm->dlm_locking_proto.pv_major == 1 &&
1107 dlm->dlm_locking_proto.pv_minor == 0) {
1108 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1109 "but active dlm protocol is %d.%d\n", qr->qr_node,
1110 qr->qr_domain, dlm->dlm_locking_proto.pv_major,
1111 dlm->dlm_locking_proto.pv_minor);
1112 goto bail;
1113 }
1114
1115 status = dlm_match_regions(dlm, qr);
1116
1117bail:
1118 if (locked)
1119 spin_unlock(&dlm->spinlock);
1120 spin_unlock(&dlm_domain_lock);
1121
1122 return status;
1123}
1124
1125static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
1126{
1127 struct o2nm_node *local;
1128 struct dlm_node_info *remote;
1129 int i, j;
1130 int status = 0;
1131
1132 for (j = 0; j < qn->qn_numnodes; ++j)
1133 mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
1134 &(qn->qn_nodes[j].ni_ipv4_address),
1135 ntohs(qn->qn_nodes[j].ni_ipv4_port));
1136
1137 for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
1138 local = o2nm_get_node_by_num(i);
1139 remote = NULL;
1140 for (j = 0; j < qn->qn_numnodes; ++j) {
1141 if (qn->qn_nodes[j].ni_nodenum == i) {
1142 remote = &(qn->qn_nodes[j]);
1143 break;
1144 }
1145 }
1146
1147 if (!local && !remote)
1148 continue;
1149
1150 if ((local && !remote) || (!local && remote))
1151 status = -EINVAL;
1152
1153 if (!status &&
1154 ((remote->ni_nodenum != local->nd_num) ||
1155 (remote->ni_ipv4_port != local->nd_ipv4_port) ||
1156 (remote->ni_ipv4_address != local->nd_ipv4_address)))
1157 status = -EINVAL;
1158
1159 if (status) {
1160 if (remote && !local)
1161 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1162 "registered in joining node %d but not in "
1163 "local node %d\n", qn->qn_domain,
1164 remote->ni_nodenum,
1165 &(remote->ni_ipv4_address),
1166 ntohs(remote->ni_ipv4_port),
1167 qn->qn_nodenum, dlm->node_num);
1168 if (local && !remote)
1169 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1170 "registered in local node %d but not in "
1171 "joining node %d\n", qn->qn_domain,
1172 local->nd_num, &(local->nd_ipv4_address),
1173 ntohs(local->nd_ipv4_port),
1174 dlm->node_num, qn->qn_nodenum);
1175 BUG_ON((!local && !remote));
1176 }
1177
1178 if (local)
1179 o2nm_node_put(local);
1180 }
1181
1182 return status;
1183}
1184
1185static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
1186{
1187 struct dlm_query_nodeinfo *qn = NULL;
1188 struct o2nm_node *node;
1189 int ret = 0, status, count, i;
1190
1191 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1192 goto bail;
1193
1194 qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
1195 if (!qn) {
1196 ret = -ENOMEM;
1197 mlog_errno(ret);
1198 goto bail;
1199 }
1200
1201 for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
1202 node = o2nm_get_node_by_num(i);
1203 if (!node)
1204 continue;
1205 qn->qn_nodes[count].ni_nodenum = node->nd_num;
1206 qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
1207 qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
1208 mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
1209 &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
1210 ++count;
1211 o2nm_node_put(node);
1212 }
1213
1214 qn->qn_nodenum = dlm->node_num;
1215 qn->qn_numnodes = count;
1216 qn->qn_namelen = strlen(dlm->name);
1217 memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
1218
1219 i = -1;
1220 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1221 i + 1)) < O2NM_MAX_NODES) {
1222 if (i == dlm->node_num)
1223 continue;
1224
1225 mlog(0, "Sending nodeinfo to node %d\n", i);
1226
1227 ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
1228 qn, sizeof(struct dlm_query_nodeinfo),
1229 i, &status);
1230 if (ret >= 0)
1231 ret = status;
1232 if (ret) {
1233 mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
1234 break;
1235 }
1236 }
1237
1238bail:
1239 kfree(qn);
1240 return ret;
1241}
1242
1243static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
1244 void *data, void **ret_data)
1245{
1246 struct dlm_query_nodeinfo *qn;
1247 struct dlm_ctxt *dlm = NULL;
1248 int locked = 0, status = -EINVAL;
1249
1250 qn = (struct dlm_query_nodeinfo *) msg->buf;
1251
1252 mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
1253 qn->qn_domain);
1254
1255 spin_lock(&dlm_domain_lock);
1256 dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
1257 if (!dlm) {
1258 mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
1259 "join domain\n", qn->qn_nodenum, qn->qn_domain);
1260 goto bail;
1261 }
1262
1263 spin_lock(&dlm->spinlock);
1264 locked = 1;
1265 if (dlm->joining_node != qn->qn_nodenum) {
1266 mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
1267 "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
1268 dlm->joining_node);
1269 goto bail;
1270 }
1271
1272 /* Support for node query was added in 1.1 */
1273 if (dlm->dlm_locking_proto.pv_major == 1 &&
1274 dlm->dlm_locking_proto.pv_minor == 0) {
1275 mlog(ML_ERROR, "Node %d queried nodes on domain %s "
1276 "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
1277 qn->qn_domain, dlm->dlm_locking_proto.pv_major,
1278 dlm->dlm_locking_proto.pv_minor);
1279 goto bail;
1280 }
1281
1282 status = dlm_match_nodes(dlm, qn);
1283
1284bail:
1285 if (locked)
1286 spin_unlock(&dlm->spinlock);
1287 spin_unlock(&dlm_domain_lock);
1288
1289 return status;
1290}
1291
924static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 1292static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
925 void **ret_data) 1293 void **ret_data)
926{ 1294{
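dlm_match_regions() establishes set equality between the two heartbeat region lists by scanning in both directions: every local region must appear in the joining node's list and vice versa, with each name occupying a fixed-width slot. A self-contained sketch of that symmetric containment check (the 32-byte slot width is illustrative; the DLM uses O2HB_MAX_REGION_NAME_LEN):

	#include <string.h>

	#define NAME_LEN 32	/* illustrative slot width */

	/* 1 if all 'an' names in a[] also occur among the 'bn' names in b[] */
	static int subset(const char *a, int an, const char *b, int bn)
	{
		int i, j;

		for (i = 0; i < an; i++) {
			for (j = 0; j < bn; j++)
				if (!memcmp(a + i * NAME_LEN,
					    b + j * NAME_LEN, NAME_LEN))
					break;
			if (j == bn)
				return 0;	/* a[i] has no match in b[] */
		}
		return 1;
	}

	static int regions_match(const char *local, int localnr,
				 const char *remote, int remotenr)
	{
		return subset(local, localnr, remote, remotenr) &&
		       subset(remote, remotenr, local, localnr);
	}

Two one-sided scans are what make the error messages precise: each direction knows exactly which node is missing which region.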
@@ -1241,6 +1609,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1241 set_bit(dlm->node_num, dlm->domain_map); 1609 set_bit(dlm->node_num, dlm->domain_map);
1242 spin_unlock(&dlm->spinlock); 1610 spin_unlock(&dlm->spinlock);
1243 1611
1612 /* Support for global heartbeat and node info was added in 1.1 */
1613 if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
1614 status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
1615 if (status) {
1616 mlog_errno(status);
1617 goto bail;
1618 }
1619 status = dlm_send_regions(dlm, ctxt->yes_resp_map);
1620 if (status) {
1621 mlog_errno(status);
1622 goto bail;
1623 }
1624 }
1625
1244 dlm_send_join_asserts(dlm, ctxt->yes_resp_map); 1626 dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
1245 1627
1246 /* Joined state *must* be set before the joining node 1628 /* Joined state *must* be set before the joining node
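Note the asymmetry in how the two ends gate the new traffic: the joining node decides whether to send based on its compiled-in maximum (the static dlm_protocol), while the handlers reject queries based on what the running domain actually negotiated (dlm->dlm_locking_proto). A tiny predicate capturing the "protocol is at least 1.1" test that both checks encode (illustrative helper, not a function in this file):

	#include <stdint.h>

	struct dlm_protocol_version {
		uint8_t pv_major;
		uint8_t pv_minor;
	};

	/* 1 if the protocol supports DLM_QUERY_REGION/DLM_QUERY_NODEINFO,
	 * both of which were introduced in 1.1. */
	static int proto_has_query_msgs(const struct dlm_protocol_version *v)
	{
		return v->pv_major > 1 ||
		       (v->pv_major == 1 && v->pv_minor > 0);
	}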
@@ -1277,8 +1659,8 @@ bail:
1277 1659
1278static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1660static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1279{ 1661{
1280 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); 1662 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
1281 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); 1663 o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
1282 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1664 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1283} 1665}
1284 1666
@@ -1290,13 +1672,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1290 1672
1291 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1673 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1292 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1674 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1293 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); 1675 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
1294 if (status) 1676 if (status)
1295 goto bail; 1677 goto bail;
1296 1678
1297 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1679 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1298 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1680 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1299 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); 1681 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
1300 if (status) 1682 if (status)
1301 goto bail; 1683 goto bail;
1302 1684
@@ -1807,7 +2189,21 @@ static int dlm_register_net_handlers(void)
1807 sizeof(struct dlm_cancel_join), 2189 sizeof(struct dlm_cancel_join),
1808 dlm_cancel_join_handler, 2190 dlm_cancel_join_handler,
1809 NULL, NULL, &dlm_join_handlers); 2191 NULL, NULL, &dlm_join_handlers);
2192 if (status)
2193 goto bail;
2194
2195 status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
2196 sizeof(struct dlm_query_region),
2197 dlm_query_region_handler,
2198 NULL, NULL, &dlm_join_handlers);
1810 2199
2200 if (status)
2201 goto bail;
2202
2203 status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
2204 sizeof(struct dlm_query_nodeinfo),
2205 dlm_query_nodeinfo_handler,
2206 NULL, NULL, &dlm_join_handlers);
1811bail: 2207bail:
1812 if (status < 0) 2208 if (status < 0)
1813 dlm_unregister_net_handlers(); 2209 dlm_unregister_net_handlers();
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 69cf369961c4..7009292aac5a 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -106,6 +106,9 @@ static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
106 106
107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
108 return 0; 108 return 0;
109 if (!dlm_lock_compatible(tmplock->ml.convert_type,
110 lock->ml.type))
111 return 0;
109 } 112 }
110 113
111 return 1; 114 return 1;
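The added test closes a hole in grant decisions: a lock already queued to convert must be judged by the mode it is converting to, not only the mode it currently holds, otherwise a new request could be granted that conflicts with the pending convert. A sketch of the idea using the usual no-lock/protected-read/exclusive compatibility matrix (the matrix is the standard DLM one; the helper and the -1 "not converting" convention are illustrative):

	/* NL is compatible with everything, PR with NL and PR,
	 * EX only with NL. */
	enum { NL, PR, EX };

	static const int compat[3][3] = {
		/*        NL PR EX */
		/* NL */ { 1, 1, 1 },
		/* PR */ { 1, 1, 0 },
		/* EX */ { 1, 0, 0 },
	};

	/* A new request must clear both the held mode and, if the
	 * holder is converting, the mode it is converting to. */
	static int can_grant(int held, int convert_to, int requested)
	{
		if (!compat[held][requested])
			return 0;
		if (convert_to >= 0 && !compat[convert_to][requested])
			return 0;
		return 1;
	}

For example, with a PR lock queued to convert to EX, can_grant(PR, EX, PR) now refuses a new PR request that the held-mode check alone would have let through.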
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f564b0e5f80d..59f0f6bdfc62 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2346 */ 2346 */
2347static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2347static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2348 struct dlm_lock_resource *res, 2348 struct dlm_lock_resource *res,
2349 int *numlocks) 2349 int *numlocks,
2350 int *hasrefs)
2350{ 2351{
2351 int ret; 2352 int ret;
2352 int i; 2353 int i;
@@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2356 2357
2357 assert_spin_locked(&res->spinlock); 2358 assert_spin_locked(&res->spinlock);
2358 2359
2360 *numlocks = 0;
2361 *hasrefs = 0;
2362
2359 ret = -EINVAL; 2363 ret = -EINVAL;
2360 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 2364 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2361 mlog(0, "cannot migrate lockres with unknown owner!\n"); 2365 mlog(0, "cannot migrate lockres with unknown owner!\n");
@@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2386 } 2390 }
2387 2391
2388 *numlocks = count; 2392 *numlocks = count;
2389 mlog(0, "migrateable lockres having %d locks\n", *numlocks); 2393
2394 count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2395 if (count < O2NM_MAX_NODES)
2396 *hasrefs = 1;
2397
2398 mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
2399 res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
2390 2400
2391leave: 2401leave:
2392 return ret; 2402 return ret;
@@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2408 const char *name; 2418 const char *name;
2409 unsigned int namelen; 2419 unsigned int namelen;
2410 int mle_added = 0; 2420 int mle_added = 0;
2411 int numlocks; 2421 int numlocks, hasrefs;
2412 int wake = 0; 2422 int wake = 0;
2413 2423
2414 if (!dlm_grab(dlm)) 2424 if (!dlm_grab(dlm))
@@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2417 name = res->lockname.name; 2427 name = res->lockname.name;
2418 namelen = res->lockname.len; 2428 namelen = res->lockname.len;
2419 2429
2420 mlog(0, "migrating %.*s to %u\n", namelen, name, target); 2430 mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
2421 2431
2422 /* 2432 /*
2423 * ensure this lockres is a proper candidate for migration 2433 * ensure this lockres is a proper candidate for migration
2424 */ 2434 */
2425 spin_lock(&res->spinlock); 2435 spin_lock(&res->spinlock);
2426 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); 2436 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2427 if (ret < 0) { 2437 if (ret < 0) {
2428 spin_unlock(&res->spinlock); 2438 spin_unlock(&res->spinlock);
2429 goto leave; 2439 goto leave;
@@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2431 spin_unlock(&res->spinlock); 2441 spin_unlock(&res->spinlock);
2432 2442
2433 /* no work to do */ 2443 /* no work to do */
2434 if (numlocks == 0) { 2444 if (numlocks == 0 && !hasrefs)
2435 mlog(0, "no locks were found on this lockres! done!\n");
2436 goto leave; 2445 goto leave;
2437 }
2438 2446
2439 /* 2447 /*
2440 * preallocate up front 2448 * preallocate up front
@@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2459 * find a node to migrate the lockres to 2467 * find a node to migrate the lockres to
2460 */ 2468 */
2461 2469
2462 mlog(0, "picking a migration node\n");
2463 spin_lock(&dlm->spinlock); 2470 spin_lock(&dlm->spinlock);
2464 /* pick a new node */ 2471 /* pick a new node */
2465 if (!test_bit(target, dlm->domain_map) || 2472 if (!test_bit(target, dlm->domain_map) ||
2466 target >= O2NM_MAX_NODES) { 2473 target >= O2NM_MAX_NODES) {
2467 target = dlm_pick_migration_target(dlm, res); 2474 target = dlm_pick_migration_target(dlm, res);
2468 } 2475 }
2469 mlog(0, "node %u chosen for migration\n", target); 2476 mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
2477 namelen, name, target);
2470 2478
2471 if (target >= O2NM_MAX_NODES || 2479 if (target >= O2NM_MAX_NODES ||
2472 !test_bit(target, dlm->domain_map)) { 2480 !test_bit(target, dlm->domain_map)) {
@@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2667{ 2675{
2668 int ret; 2676 int ret;
2669 int lock_dropped = 0; 2677 int lock_dropped = 0;
2670 int numlocks; 2678 int numlocks, hasrefs;
2671 2679
2672 spin_lock(&res->spinlock); 2680 spin_lock(&res->spinlock);
2673 if (res->owner != dlm->node_num) { 2681 if (res->owner != dlm->node_num) {
@@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2681 } 2689 }
2682 2690
2683 /* No need to migrate a lockres having no locks */ 2691 /* No need to migrate a lockres having no locks */
2684 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); 2692 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2685 if (ret >= 0 && numlocks == 0) { 2693 if (ret >= 0 && numlocks == 0 && !hasrefs) {
2686 spin_unlock(&res->spinlock); 2694 spin_unlock(&res->spinlock);
2687 goto leave; 2695 goto leave;
2688 } 2696 }
@@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2915 } 2923 }
2916 queue++; 2924 queue++;
2917 } 2925 }
2926
2927 nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2928 if (nodenum < O2NM_MAX_NODES) {
2929 spin_unlock(&res->spinlock);
2930 return nodenum;
2931 }
2918 spin_unlock(&res->spinlock); 2932 spin_unlock(&res->spinlock);
2919 mlog(0, "have not found a suitable target yet! checking domain map\n"); 2933 mlog(0, "have not found a suitable target yet! checking domain map\n");
2920 2934
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2211acf33d9b..1d6d1d22c471 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -122,15 +122,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 122void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
123 struct dlm_lock_resource *res) 123 struct dlm_lock_resource *res)
124{ 124{
125 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
126
127 assert_spin_locked(&dlm->spinlock); 125 assert_spin_locked(&dlm->spinlock);
128 assert_spin_locked(&res->spinlock); 126 assert_spin_locked(&res->spinlock);
129 127
130 if (__dlm_lockres_unused(res)){ 128 if (__dlm_lockres_unused(res)){
131 if (list_empty(&res->purge)) { 129 if (list_empty(&res->purge)) {
132 mlog(0, "putting lockres %.*s:%p onto purge list\n", 130 mlog(0, "%s: Adding res %.*s to purge list\n",
133 res->lockname.len, res->lockname.name, res); 131 dlm->name, res->lockname.len, res->lockname.name);
134 132
135 res->last_used = jiffies; 133 res->last_used = jiffies;
136 dlm_lockres_get(res); 134 dlm_lockres_get(res);
@@ -138,8 +136,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
138 dlm->purge_count++; 136 dlm->purge_count++;
139 } 137 }
140 } else if (!list_empty(&res->purge)) { 138 } else if (!list_empty(&res->purge)) {
141 mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", 139 mlog(0, "%s: Removing res %.*s from purge list\n",
142 res->lockname.len, res->lockname.name, res, res->owner); 140 dlm->name, res->lockname.len, res->lockname.name);
143 141
144 list_del_init(&res->purge); 142 list_del_init(&res->purge);
145 dlm_lockres_put(res); 143 dlm_lockres_put(res);
@@ -150,7 +148,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
150void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 148void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
151 struct dlm_lock_resource *res) 149 struct dlm_lock_resource *res)
152{ 150{
153 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
154 spin_lock(&dlm->spinlock); 151 spin_lock(&dlm->spinlock);
155 spin_lock(&res->spinlock); 152 spin_lock(&res->spinlock);
156 153
@@ -171,9 +168,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
171 168
172 master = (res->owner == dlm->node_num); 169 master = (res->owner == dlm->node_num);
173 170
174 171 mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 172 res->lockname.len, res->lockname.name, master);
176 res->lockname.name, master);
177 173
178 if (!master) { 174 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF; 175 res->state |= DLM_LOCK_RES_DROPPING_REF;
@@ -189,27 +185,25 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
189 /* clear our bit from the master's refmap, ignore errors */ 185 /* clear our bit from the master's refmap, ignore errors */
190 ret = dlm_drop_lockres_ref(dlm, res); 186 ret = dlm_drop_lockres_ref(dlm, res);
191 if (ret < 0) { 187 if (ret < 0) {
192 mlog_errno(ret); 188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
193 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
194 BUG(); 191 BUG();
195 } 192 }
196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
197 dlm->name, res->lockname.len, res->lockname.name, ret);
198 spin_lock(&dlm->spinlock); 193 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock); 194 spin_lock(&res->spinlock);
200 } 195 }
201 196
202 if (!list_empty(&res->purge)) { 197 if (!list_empty(&res->purge)) {
203 mlog(0, "removing lockres %.*s:%p from purgelist, " 198 mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
204 "master = %d\n", res->lockname.len, res->lockname.name, 199 dlm->name, res->lockname.len, res->lockname.name, master);
205 res, master);
206 list_del_init(&res->purge); 200 list_del_init(&res->purge);
207 dlm_lockres_put(res); 201 dlm_lockres_put(res);
208 dlm->purge_count--; 202 dlm->purge_count--;
209 } 203 }
210 204
211 if (!__dlm_lockres_unused(res)) { 205 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", 206 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name); 207 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res); 208 __dlm_print_one_lock_resource(res);
215 BUG(); 209 BUG();
@@ -266,10 +260,10 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
266 unused = __dlm_lockres_unused(lockres); 260 unused = __dlm_lockres_unused(lockres);
267 if (!unused || 261 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) { 262 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or " 263 mlog(0, "%s: res %.*s is in use or being remastered, "
270 "being remastered, used %d, state %d\n", 264 "used %d, state %d\n", dlm->name,
271 dlm->name, lockres->lockname.len, 265 lockres->lockname.len, lockres->lockname.name,
272 lockres->lockname.name, !unused, lockres->state); 266 !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge); 267 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock); 268 spin_unlock(&lockres->spinlock);
275 continue; 269 continue;
@@ -296,15 +290,12 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
296 struct list_head *head; 290 struct list_head *head;
297 int can_grant = 1; 291 int can_grant = 1;
298 292
299 //mlog(0, "res->lockname.len=%d\n", res->lockname.len); 293 /*
300 //mlog(0, "res->lockname.name=%p\n", res->lockname.name); 294 * Because this function is called with the lockres
301 //mlog(0, "shuffle res %.*s\n", res->lockname.len,
302 // res->lockname.name);
303
304 /* because this function is called with the lockres
305 * spinlock, and because we know that it is not migrating/ 295 * spinlock, and because we know that it is not migrating/
306 * recovering/in-progress, it is fine to reserve asts and 296 * recovering/in-progress, it is fine to reserve asts and
307 * basts right before queueing them all throughout */ 297 * basts right before queueing them all throughout
298 */
308 assert_spin_locked(&dlm->ast_lock); 299 assert_spin_locked(&dlm->ast_lock);
309 assert_spin_locked(&res->spinlock); 300 assert_spin_locked(&res->spinlock);
310 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 301 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
@@ -314,13 +305,13 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
314converting: 305converting:
315 if (list_empty(&res->converting)) 306 if (list_empty(&res->converting))
316 goto blocked; 307 goto blocked;
317 mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, 308 mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
318 res->lockname.name); 309 res->lockname.len, res->lockname.name);
319 310
320 target = list_entry(res->converting.next, struct dlm_lock, list); 311 target = list_entry(res->converting.next, struct dlm_lock, list);
321 if (target->ml.convert_type == LKM_IVMODE) { 312 if (target->ml.convert_type == LKM_IVMODE) {
322 mlog(ML_ERROR, "%.*s: converting a lock with no " 313 mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
323 "convert_type!\n", res->lockname.len, res->lockname.name); 314 dlm->name, res->lockname.len, res->lockname.name);
324 BUG(); 315 BUG();
325 } 316 }
326 head = &res->granted; 317 head = &res->granted;
@@ -365,9 +356,12 @@ converting:
365 spin_lock(&target->spinlock); 356 spin_lock(&target->spinlock);
366 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 357 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
367 358
368 mlog(0, "calling ast for converting lock: %.*s, have: %d, " 359 mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
369 "granting: %d, node: %u\n", res->lockname.len, 360 "%d => %d, node %u\n", dlm->name, res->lockname.len,
370 res->lockname.name, target->ml.type, 361 res->lockname.name,
362 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
363 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
364 target->ml.type,
371 target->ml.convert_type, target->ml.node); 365 target->ml.convert_type, target->ml.node);
372 366
373 target->ml.type = target->ml.convert_type; 367 target->ml.type = target->ml.convert_type;
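The reworked messages identify every lock by its cookie split into a node and a sequence part, printed as %u:%llu. A small illustration of that split, assuming the usual o2dlm encoding of the owning node number in the top byte and a 56-bit sequence below it (dlmcommon.h holds the authoritative dlm_get_lock_cookie_node()/..._seq() definitions):

	#include <stdint.h>
	#include <stdio.h>

	static uint8_t cookie_node(uint64_t cookie)
	{
		return (uint8_t)(cookie >> 56);		/* assumed layout */
	}

	static uint64_t cookie_seq(uint64_t cookie)
	{
		return cookie & 0x00ffffffffffffffULL;	/* assumed layout */
	}

	int main(void)
	{
		uint64_t cookie = ((uint64_t)7 << 56) | 42;	/* node 7, seq 42 */

		printf("%u:%llu\n", (unsigned)cookie_node(cookie),
		       (unsigned long long)cookie_seq(cookie));
		return 0;
	}

The be64_to_cpu() in the mlogs is needed because lock->ml.cookie is kept in big-endian (wire) order.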
@@ -428,11 +422,14 @@ blocked:
428 spin_lock(&target->spinlock); 422 spin_lock(&target->spinlock);
429 BUG_ON(target->ml.highest_blocked != LKM_IVMODE); 423 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
430 424
431 mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " 425 mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
432 "node: %u\n", res->lockname.len, res->lockname.name, 426 "node %u\n", dlm->name, res->lockname.len,
427 res->lockname.name,
428 dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
429 dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
433 target->ml.type, target->ml.node); 430 target->ml.type, target->ml.node);
434 431
435 // target->ml.type is already correct 432 /* target->ml.type is already correct */
436 list_move_tail(&target->list, &res->granted); 433 list_move_tail(&target->list, &res->granted);
437 434
438 BUG_ON(!target->lksb); 435 BUG_ON(!target->lksb);
@@ -453,7 +450,6 @@ leave:
 453/* must have NO locks when calling this with res != NULL */ 450/* must have NO locks when calling this with res != NULL */
454void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 451void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
455{ 452{
456 mlog_entry("dlm=%p, res=%p\n", dlm, res);
457 if (res) { 453 if (res) {
458 spin_lock(&dlm->spinlock); 454 spin_lock(&dlm->spinlock);
459 spin_lock(&res->spinlock); 455 spin_lock(&res->spinlock);
@@ -466,8 +462,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
466 462
467void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 463void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
468{ 464{
469 mlog_entry("dlm=%p, res=%p\n", dlm, res);
470
471 assert_spin_locked(&dlm->spinlock); 465 assert_spin_locked(&dlm->spinlock);
472 assert_spin_locked(&res->spinlock); 466 assert_spin_locked(&res->spinlock);
473 467
@@ -484,13 +478,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
484 res->state |= DLM_LOCK_RES_DIRTY; 478 res->state |= DLM_LOCK_RES_DIRTY;
485 } 479 }
486 } 480 }
481
482 mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
483 res->lockname.name);
487} 484}
488 485
489 486
490/* Launch the NM thread for the mounted volume */ 487/* Launch the NM thread for the mounted volume */
491int dlm_launch_thread(struct dlm_ctxt *dlm) 488int dlm_launch_thread(struct dlm_ctxt *dlm)
492{ 489{
493 mlog(0, "starting dlm thread...\n"); 490 mlog(0, "Starting dlm_thread...\n");
494 491
495 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); 492 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
496 if (IS_ERR(dlm->dlm_thread_task)) { 493 if (IS_ERR(dlm->dlm_thread_task)) {
@@ -505,7 +502,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
505void dlm_complete_thread(struct dlm_ctxt *dlm) 502void dlm_complete_thread(struct dlm_ctxt *dlm)
506{ 503{
507 if (dlm->dlm_thread_task) { 504 if (dlm->dlm_thread_task) {
508 mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); 505 mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
509 kthread_stop(dlm->dlm_thread_task); 506 kthread_stop(dlm->dlm_thread_task);
510 dlm->dlm_thread_task = NULL; 507 dlm->dlm_thread_task = NULL;
511 } 508 }
@@ -536,7 +533,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
536 /* get an extra ref on lock */ 533 /* get an extra ref on lock */
537 dlm_lock_get(lock); 534 dlm_lock_get(lock);
538 res = lock->lockres; 535 res = lock->lockres;
539 mlog(0, "delivering an ast for this lockres\n"); 536 mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
537 "node %u\n", dlm->name, res->lockname.len,
538 res->lockname.name,
539 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
540 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
541 lock->ml.type, lock->ml.node);
540 542
541 BUG_ON(!lock->ast_pending); 543 BUG_ON(!lock->ast_pending);
542 544
@@ -557,9 +559,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
557 /* possible that another ast was queued while 559 /* possible that another ast was queued while
558 * we were delivering the last one */ 560 * we were delivering the last one */
559 if (!list_empty(&lock->ast_list)) { 561 if (!list_empty(&lock->ast_list)) {
560 mlog(0, "aha another ast got queued while " 562 mlog(0, "%s: res %.*s, AST queued while flushing last "
561 "we were finishing the last one. will " 563 "one\n", dlm->name, res->lockname.len,
562 "keep the ast_pending flag set.\n"); 564 res->lockname.name);
563 } else 565 } else
564 lock->ast_pending = 0; 566 lock->ast_pending = 0;
565 567
@@ -590,8 +592,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
590 dlm_lock_put(lock); 592 dlm_lock_put(lock);
591 spin_unlock(&dlm->ast_lock); 593 spin_unlock(&dlm->ast_lock);
592 594
593 mlog(0, "delivering a bast for this lockres " 595 mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
594 "(blocked = %d\n", hi); 596 "blocked %d, node %u\n",
597 dlm->name, res->lockname.len, res->lockname.name,
598 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
599 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
600 hi, lock->ml.node);
595 601
596 if (lock->ml.node != dlm->node_num) { 602 if (lock->ml.node != dlm->node_num) {
597 ret = dlm_send_proxy_bast(dlm, res, lock, hi); 603 ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -605,9 +611,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
605 /* possible that another bast was queued while 611 /* possible that another bast was queued while
606 * we were delivering the last one */ 612 * we were delivering the last one */
607 if (!list_empty(&lock->bast_list)) { 613 if (!list_empty(&lock->bast_list)) {
608 mlog(0, "aha another bast got queued while " 614 mlog(0, "%s: res %.*s, BAST queued while flushing last "
609 "we were finishing the last one. will " 615 "one\n", dlm->name, res->lockname.len,
610 "keep the bast_pending flag set.\n"); 616 res->lockname.name);
611 } else 617 } else
612 lock->bast_pending = 0; 618 lock->bast_pending = 0;
613 619
@@ -675,11 +681,12 @@ static int dlm_thread(void *data)
675 spin_lock(&res->spinlock); 681 spin_lock(&res->spinlock);
676 if (res->owner != dlm->node_num) { 682 if (res->owner != dlm->node_num) {
677 __dlm_print_one_lock_resource(res); 683 __dlm_print_one_lock_resource(res);
678 mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", 684 mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
679 res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", 685 " dirty %d\n", dlm->name,
680 res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", 686 !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
681 res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", 687 !!(res->state & DLM_LOCK_RES_MIGRATING),
682 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); 688 !!(res->state & DLM_LOCK_RES_RECOVERING),
689 !!(res->state & DLM_LOCK_RES_DIRTY));
683 } 690 }
684 BUG_ON(res->owner != dlm->node_num); 691 BUG_ON(res->owner != dlm->node_num);
685 692
@@ -693,8 +700,8 @@ static int dlm_thread(void *data)
693 res->state &= ~DLM_LOCK_RES_DIRTY; 700 res->state &= ~DLM_LOCK_RES_DIRTY;
694 spin_unlock(&res->spinlock); 701 spin_unlock(&res->spinlock);
695 spin_unlock(&dlm->ast_lock); 702 spin_unlock(&dlm->ast_lock);
696 mlog(0, "delaying list shuffling for in-" 703 mlog(0, "%s: res %.*s, inprogress, delay list "
697 "progress lockres %.*s, state=%d\n", 704 "shuffle, state %d\n", dlm->name,
698 res->lockname.len, res->lockname.name, 705 res->lockname.len, res->lockname.name,
699 res->state); 706 res->state);
700 delay = 1; 707 delay = 1;
@@ -706,10 +713,6 @@ static int dlm_thread(void *data)
706 * spinlock and do NOT have the dlm lock. 713 * spinlock and do NOT have the dlm lock.
707 * safe to reserve/queue asts and run the lists. */ 714 * safe to reserve/queue asts and run the lists. */
708 715
709 mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
710 "res=%.*s\n", dlm->name,
711 res->lockname.len, res->lockname.name);
712
713 /* called while holding lockres lock */ 716 /* called while holding lockres lock */
714 dlm_shuffle_lists(dlm, res); 717 dlm_shuffle_lists(dlm, res);
715 res->state &= ~DLM_LOCK_RES_DIRTY; 718 res->state &= ~DLM_LOCK_RES_DIRTY;
@@ -733,7 +736,8 @@ in_progress:
733 /* unlikely, but we may need to give time to 736 /* unlikely, but we may need to give time to
734 * other tasks */ 737 * other tasks */
735 if (!--n) { 738 if (!--n) {
736 mlog(0, "throttling dlm_thread\n"); 739 mlog(0, "%s: Throttling dlm thread\n",
740 dlm->name);
737 break; 741 break;
738 } 742 }
739 } 743 }
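
The reworked dlmthread.c messages above identify every lock by decoding its 64-bit cookie with dlm_get_lock_cookie_node() and dlm_get_lock_cookie_seq(), after byte-swapping the wire value with be64_to_cpu(). A minimal sketch of the packing those helper names imply, with hypothetical sketch_* names since the real helpers live in dlmcommon.h: the owning node number is assumed to sit in the top 8 bits and a per-node sequence number in the low 56.

	#include <stdint.h>

	/* Assumed layout: high byte = node number, low 56 bits = sequence. */
	static inline uint8_t sketch_get_lock_cookie_node(uint64_t cookie)
	{
		return (uint8_t)(cookie >> 56);
	}

	static inline uint64_t sketch_get_lock_cookie_seq(uint64_t cookie)
	{
		return cookie & 0x00ffffffffffffffULL;
	}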
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index c2903b84bb7a..8c5c0eddc365 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
351 return &ip->ip_vfs_inode; 351 return &ip->ip_vfs_inode;
352} 352}
353 353
354static void dlmfs_destroy_inode(struct inode *inode) 354static void dlmfs_i_callback(struct rcu_head *head)
355{ 355{
356 struct inode *inode = container_of(head, struct inode, i_rcu);
357 INIT_LIST_HEAD(&inode->i_dentry);
356 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); 358 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
357} 359}
358 360
361static void dlmfs_destroy_inode(struct inode *inode)
362{
363 call_rcu(&inode->i_rcu, dlmfs_i_callback);
364}
365
359static void dlmfs_evict_inode(struct inode *inode) 366static void dlmfs_evict_inode(struct inode *inode)
360{ 367{
361 int status; 368 int status;
@@ -400,6 +407,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
400 if (inode) { 407 if (inode) {
401 ip = DLMFS_I(inode); 408 ip = DLMFS_I(inode);
402 409
410 inode->i_ino = get_next_ino();
403 inode->i_mode = mode; 411 inode->i_mode = mode;
404 inode->i_uid = current_fsuid(); 412 inode->i_uid = current_fsuid();
405 inode->i_gid = current_fsgid(); 413 inode->i_gid = current_fsgid();
@@ -425,6 +433,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
425 if (!inode) 433 if (!inode)
426 return NULL; 434 return NULL;
427 435
436 inode->i_ino = get_next_ino();
428 inode->i_mode = mode; 437 inode->i_mode = mode;
429 inode->i_uid = current_fsuid(); 438 inode->i_uid = current_fsuid();
430 inode->i_gid = current_fsgid(); 439 inode->i_gid = current_fsgid();
@@ -612,6 +621,7 @@ static const struct file_operations dlmfs_file_operations = {
612 .poll = dlmfs_file_poll, 621 .poll = dlmfs_file_poll,
613 .read = dlmfs_file_read, 622 .read = dlmfs_file_read,
614 .write = dlmfs_file_write, 623 .write = dlmfs_file_write,
624 .llseek = default_llseek,
615}; 625};
616 626
617static const struct inode_operations dlmfs_dir_inode_operations = { 627static const struct inode_operations dlmfs_dir_inode_operations = {
@@ -640,16 +650,16 @@ static const struct inode_operations dlmfs_file_inode_operations = {
640 .setattr = dlmfs_file_setattr, 650 .setattr = dlmfs_file_setattr,
641}; 651};
642 652
643static int dlmfs_get_sb(struct file_system_type *fs_type, 653static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
644 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 654 int flags, const char *dev_name, void *data)
645{ 655{
646 return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); 656 return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
647} 657}
648 658
649static struct file_system_type dlmfs_fs_type = { 659static struct file_system_type dlmfs_fs_type = {
650 .owner = THIS_MODULE, 660 .owner = THIS_MODULE,
651 .name = "ocfs2_dlmfs", 661 .name = "ocfs2_dlmfs",
652 .get_sb = dlmfs_get_sb, 662 .mount = dlmfs_mount,
653 .kill_sb = kill_litter_super, 663 .kill_sb = kill_litter_super,
654}; 664};
655 665
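
The dlmfs hunk above converts inode freeing to the RCU-deferred pattern so that lockless path walkers can still dereference an inode while it is being torn down. A minimal sketch of the same pattern, with hypothetical foo_* names standing in for dlmfs_inode_cache and DLMFS_I():

	static void foo_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		/* i_rcu shares storage with i_dentry, so reinitialize it */
		INIT_LIST_HEAD(&inode->i_dentry);
		kmem_cache_free(foo_inode_cache, FOO_I(inode));
	}

	static void foo_destroy_inode(struct inode *inode)
	{
		/* defer the real free until every RCU reader has finished */
		call_rcu(&inode->i_rcu, foo_i_callback);
	}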
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 5e02a893f46e..e8d94d722ecb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
3635{ 3635{
3636 struct inode *inode; 3636 struct inode *inode;
3637 struct address_space *mapping; 3637 struct address_space *mapping;
3638 struct ocfs2_inode_info *oi;
3638 3639
3639 inode = ocfs2_lock_res_inode(lockres); 3640 inode = ocfs2_lock_res_inode(lockres);
3640 mapping = inode->i_mapping; 3641 mapping = inode->i_mapping;
3641 3642
3643 if (S_ISDIR(inode->i_mode)) {
3644 oi = OCFS2_I(inode);
3645 oi->ip_dir_lock_gen++;
3646 mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
3647 goto out;
3648 }
3649
3642 if (!S_ISREG(inode->i_mode)) 3650 if (!S_ISREG(inode->i_mode))
3643 goto out; 3651 goto out;
3644 3652
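
The dlmglue change above stops scrubbing a directory's page cache on downconvert and instead bumps ip_dir_lock_gen, letting readers detect staleness lazily. A sketch of the assumed consumer pattern (the real check sits in ocfs2's readdir/lookup paths, not shown in this diff):

	/* Caller snapshots the generation while holding the cluster lock. */
	static int dir_gen_still_valid(struct ocfs2_inode_info *oi, u32 saved_gen)
	{
		/* any downconvert since the snapshot bumped the counter */
		return oi->ip_dir_lock_gen == saved_gen;
	}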
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 19ad145d2af3..5dbc3062b4fd 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -137,9 +137,7 @@ check_gen:
137 } 137 }
138 138
139 result = d_obtain_alias(inode); 139 result = d_obtain_alias(inode);
140 if (!IS_ERR(result)) 140 if (IS_ERR(result))
141 result->d_op = &ocfs2_dentry_ops;
142 else
143 mlog_errno(PTR_ERR(result)); 141 mlog_errno(PTR_ERR(result));
144 142
145bail: 143bail:
@@ -175,8 +173,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
175 } 173 }
176 174
177 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); 175 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
178 if (!IS_ERR(parent))
179 parent->d_op = &ocfs2_dentry_ops;
180 176
181bail_unlock: 177bail_unlock:
182 ocfs2_inode_unlock(dir, 0); 178 ocfs2_inode_unlock(dir, 0);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9a03c151b5ce..a6651956482e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -64,12 +64,6 @@
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
67static int ocfs2_sync_inode(struct inode *inode)
68{
69 filemap_fdatawrite(inode->i_mapping);
70 return sync_mapping_buffers(inode->i_mapping);
71}
72
73static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
74{ 68{
75 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
@@ -180,16 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync)
180{ 174{
181 int err = 0; 175 int err = 0;
182 journal_t *journal; 176 journal_t *journal;
183 struct dentry *dentry = file->f_path.dentry;
184 struct inode *inode = file->f_mapping->host; 177 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 179
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 180 mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
188 dentry->d_name.len, dentry->d_name.name); 181 file->f_path.dentry, file->f_path.dentry->d_name.len,
189 182 file->f_path.dentry->d_name.name);
190 err = ocfs2_sync_inode(dentry->d_inode);
191 if (err)
192 goto bail;
193 183
194 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 184 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
195 /* 185 /*
@@ -197,8 +187,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
197 * platter 187 * platter
198 */ 188 */
199 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 189 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
200 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 190 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
201 NULL, BLKDEV_IFL_WAIT);
202 goto bail; 191 goto bail;
203 } 192 }
204 193
@@ -370,7 +359,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 359 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
371 goto out; 360 goto out;
372 361
373 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); 362 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
374 363
375out: 364out:
376 return status; 365 return status;
@@ -807,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
807 block_end = block_start + (1 << inode->i_blkbits); 796 block_end = block_start + (1 << inode->i_blkbits);
808 797
809 /* 798 /*
810 * block_start is block-aligned. Bump it by one to 799 * block_start is block-aligned. Bump it by one to force
811 * force ocfs2_{prepare,commit}_write() to zero the 800 * __block_write_begin and block_commit_write to zero the
812 * whole block. 801 * whole block.
813 */ 802 */
814 ret = ocfs2_prepare_write_nolock(inode, page, 803 ret = __block_write_begin(page, block_start + 1, 0,
815 block_start + 1, 804 ocfs2_get_block);
816 block_start + 1);
817 if (ret < 0) { 805 if (ret < 0) {
818 mlog_errno(ret); 806 mlog_errno(ret);
819 goto out_unlock; 807 goto out_unlock;
@@ -913,8 +901,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
913 zero_clusters = last_cpos - zero_cpos; 901 zero_clusters = last_cpos - zero_cpos;
914 902
915 if (needs_cow) { 903 if (needs_cow) {
916 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, 904 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
917 UINT_MAX); 905 zero_clusters, UINT_MAX);
918 if (rc) { 906 if (rc) {
919 mlog_errno(rc); 907 mlog_errno(rc);
920 goto out; 908 goto out;
@@ -1319,10 +1307,13 @@ bail:
1319 return err; 1307 return err;
1320} 1308}
1321 1309
1322int ocfs2_permission(struct inode *inode, int mask) 1310int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1323{ 1311{
1324 int ret; 1312 int ret;
1325 1313
1314 if (flags & IPERM_FLAG_RCU)
1315 return -ECHILD;
1316
1326 mlog_entry_void(); 1317 mlog_entry_void();
1327 1318
1328 ret = ocfs2_inode_lock(inode, NULL, 0); 1319 ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1332,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask)
1332 goto out; 1323 goto out;
1333 } 1324 }
1334 1325
1335 ret = generic_permission(inode, mask, ocfs2_check_acl); 1326 ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
1336 1327
1337 ocfs2_inode_unlock(inode, 0); 1328 ocfs2_inode_unlock(inode, 0);
1338out: 1329out:
@@ -1998,28 +1989,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1998 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1989 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1999} 1990}
2000 1991
2001static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1992static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
2002 loff_t len) 1993 loff_t len)
2003{ 1994{
1995 struct inode *inode = file->f_path.dentry->d_inode;
2004 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1996 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2005 struct ocfs2_space_resv sr; 1997 struct ocfs2_space_resv sr;
2006 int change_size = 1; 1998 int change_size = 1;
1999 int cmd = OCFS2_IOC_RESVSP64;
2007 2000
2001 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2002 return -EOPNOTSUPP;
2008 if (!ocfs2_writes_unwritten_extents(osb)) 2003 if (!ocfs2_writes_unwritten_extents(osb))
2009 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
2010 2005
2011 if (S_ISDIR(inode->i_mode))
2012 return -ENODEV;
2013
2014 if (mode & FALLOC_FL_KEEP_SIZE) 2006 if (mode & FALLOC_FL_KEEP_SIZE)
2015 change_size = 0; 2007 change_size = 0;
2016 2008
2009 if (mode & FALLOC_FL_PUNCH_HOLE)
2010 cmd = OCFS2_IOC_UNRESVSP64;
2011
2017 sr.l_whence = 0; 2012 sr.l_whence = 0;
2018 sr.l_start = (s64)offset; 2013 sr.l_start = (s64)offset;
2019 sr.l_len = (s64)len; 2014 sr.l_len = (s64)len;
2020 2015
2021 return __ocfs2_change_file_space(NULL, inode, offset, 2016 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2022 OCFS2_IOC_RESVSP64, &sr, change_size); 2017 change_size);
2023} 2018}
2024 2019
2025int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 2020int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
@@ -2062,6 +2057,7 @@ out:
2062} 2057}
2063 2058
2064static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2059static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2060 struct file *file,
2065 loff_t pos, size_t count, 2061 loff_t pos, size_t count,
2066 int *meta_level) 2062 int *meta_level)
2067{ 2063{
@@ -2079,7 +2075,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2079 2075
2080 *meta_level = 1; 2076 *meta_level = 1;
2081 2077
2082 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 2078 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2083 if (ret) 2079 if (ret)
2084 mlog_errno(ret); 2080 mlog_errno(ret);
2085out: 2081out:
@@ -2087,7 +2083,7 @@ out:
2087 return ret; 2083 return ret;
2088} 2084}
2089 2085
2090static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 2086static int ocfs2_prepare_inode_for_write(struct file *file,
2091 loff_t *ppos, 2087 loff_t *ppos,
2092 size_t count, 2088 size_t count,
2093 int appending, 2089 int appending,
@@ -2095,6 +2091,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2095 int *has_refcount) 2091 int *has_refcount)
2096{ 2092{
2097 int ret = 0, meta_level = 0; 2093 int ret = 0, meta_level = 0;
2094 struct dentry *dentry = file->f_path.dentry;
2098 struct inode *inode = dentry->d_inode; 2095 struct inode *inode = dentry->d_inode;
2099 loff_t saved_pos, end; 2096 loff_t saved_pos, end;
2100 2097
@@ -2150,6 +2147,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2150 meta_level = -1; 2147 meta_level = -1;
2151 2148
2152 ret = ocfs2_prepare_inode_for_refcount(inode, 2149 ret = ocfs2_prepare_inode_for_refcount(inode,
2150 file,
2153 saved_pos, 2151 saved_pos,
2154 count, 2152 count,
2155 &meta_level); 2153 &meta_level);
@@ -2232,6 +2230,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2232 struct file *file = iocb->ki_filp; 2230 struct file *file = iocb->ki_filp;
2233 struct inode *inode = file->f_path.dentry->d_inode; 2231 struct inode *inode = file->f_path.dentry->d_inode;
2234 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2232 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2233 int full_coherency = !(osb->s_mount_opt &
2234 OCFS2_MOUNT_COHERENCY_BUFFERED);
2235 2235
2236 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2236 mlog_entry("(0x%p, %u, '%.*s')\n", file,
2237 (unsigned int)nr_segs, 2237 (unsigned int)nr_segs,
@@ -2248,23 +2248,50 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2248 2248
2249 mutex_lock(&inode->i_mutex); 2249 mutex_lock(&inode->i_mutex);
2250 2250
2251 ocfs2_iocb_clear_sem_locked(iocb);
2252
2251relock: 2253relock:
2252 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2254 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
2253 if (direct_io) { 2255 if (direct_io) {
2254 down_read(&inode->i_alloc_sem); 2256 down_read(&inode->i_alloc_sem);
2255 have_alloc_sem = 1; 2257 have_alloc_sem = 1;
2258 /* communicate with ocfs2_dio_end_io */
2259 ocfs2_iocb_set_sem_locked(iocb);
2256 } 2260 }
2257 2261
2258 /* concurrent O_DIRECT writes are allowed */ 2262 /*
2259 rw_level = !direct_io; 2263 * Concurrent O_DIRECT writes are allowed with
2264 * mount_option "coherency=buffered".
2265 */
2266 rw_level = (!direct_io || full_coherency);
2267
2260 ret = ocfs2_rw_lock(inode, rw_level); 2268 ret = ocfs2_rw_lock(inode, rw_level);
2261 if (ret < 0) { 2269 if (ret < 0) {
2262 mlog_errno(ret); 2270 mlog_errno(ret);
2263 goto out_sems; 2271 goto out_sems;
2264 } 2272 }
2265 2273
2274 /*
2275 * O_DIRECT writes with "coherency=full" need to take EX cluster
2276 * inode_lock to guarantee coherency.
2277 */
2278 if (direct_io && full_coherency) {
2279 /*
2280 * We need to take and drop the inode lock to force
2281 * other nodes to drop their caches. Buffered I/O
2282 * already does this in write_begin().
2283 */
2284 ret = ocfs2_inode_lock(inode, NULL, 1);
2285 if (ret < 0) {
2286 mlog_errno(ret);
2287 goto out_sems;
2288 }
2289
2290 ocfs2_inode_unlock(inode, 1);
2291 }
2292
2266 can_do_direct = direct_io; 2293 can_do_direct = direct_io;
2267 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 2294 ret = ocfs2_prepare_inode_for_write(file, ppos,
2268 iocb->ki_left, appending, 2295 iocb->ki_left, appending,
2269 &can_do_direct, &has_refcount); 2296 &can_do_direct, &has_refcount);
2270 if (ret < 0) { 2297 if (ret < 0) {
@@ -2312,17 +2339,6 @@ relock:
2312 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2339 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2313 ppos, count, ocount); 2340 ppos, count, ocount);
2314 if (written < 0) { 2341 if (written < 0) {
2315 /*
2316 * direct write may have instantiated a few
2317 * blocks outside i_size. Trim these off again.
2318 * Don't need i_size_read because we hold i_mutex.
2319 *
2320 * XXX(truncate): this looks buggy because ocfs2 did not
2321 * actually implement ->truncate. Take a look at
2322 * the new truncate sequence and update this accordingly
2323 */
2324 if (*ppos + count > inode->i_size)
2325 truncate_setsize(inode, inode->i_size);
2326 ret = written; 2342 ret = written;
2327 goto out_dio; 2343 goto out_dio;
2328 } 2344 }
@@ -2377,8 +2393,10 @@ out:
2377 ocfs2_rw_unlock(inode, rw_level); 2393 ocfs2_rw_unlock(inode, rw_level);
2378 2394
2379out_sems: 2395out_sems:
2380 if (have_alloc_sem) 2396 if (have_alloc_sem) {
2381 up_read(&inode->i_alloc_sem); 2397 up_read(&inode->i_alloc_sem);
2398 ocfs2_iocb_clear_sem_locked(iocb);
2399 }
2382 2400
2383 mutex_unlock(&inode->i_mutex); 2401 mutex_unlock(&inode->i_mutex);
2384 2402
@@ -2394,7 +2412,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2394{ 2412{
2395 int ret; 2413 int ret;
2396 2414
2397 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2415 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2398 sd->total_len, 0, NULL, NULL); 2416 sd->total_len, 0, NULL, NULL);
2399 if (ret < 0) { 2417 if (ret < 0) {
2400 mlog_errno(ret); 2418 mlog_errno(ret);
@@ -2522,6 +2540,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2522 goto bail; 2540 goto bail;
2523 } 2541 }
2524 2542
2543 ocfs2_iocb_clear_sem_locked(iocb);
2544
2525 /* 2545 /*
2526 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2546 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2527 * need locks to protect pending reads from racing with truncate. 2547 * need locks to protect pending reads from racing with truncate.
@@ -2529,6 +2549,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2529 if (filp->f_flags & O_DIRECT) { 2549 if (filp->f_flags & O_DIRECT) {
2530 down_read(&inode->i_alloc_sem); 2550 down_read(&inode->i_alloc_sem);
2531 have_alloc_sem = 1; 2551 have_alloc_sem = 1;
2552 ocfs2_iocb_set_sem_locked(iocb);
2532 2553
2533 ret = ocfs2_rw_lock(inode, 0); 2554 ret = ocfs2_rw_lock(inode, 0);
2534 if (ret < 0) { 2555 if (ret < 0) {
@@ -2570,8 +2591,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2570 } 2591 }
2571 2592
2572bail: 2593bail:
2573 if (have_alloc_sem) 2594 if (have_alloc_sem) {
2574 up_read(&inode->i_alloc_sem); 2595 up_read(&inode->i_alloc_sem);
2596 ocfs2_iocb_clear_sem_locked(iocb);
2597 }
2575 if (rw_level != -1) 2598 if (rw_level != -1)
2576 ocfs2_rw_unlock(inode, rw_level); 2599 ocfs2_rw_unlock(inode, rw_level);
2577 mlog_exit(ret); 2600 mlog_exit(ret);
@@ -2587,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = {
2587 .getxattr = generic_getxattr, 2610 .getxattr = generic_getxattr,
2588 .listxattr = ocfs2_listxattr, 2611 .listxattr = ocfs2_listxattr,
2589 .removexattr = generic_removexattr, 2612 .removexattr = generic_removexattr,
2590 .fallocate = ocfs2_fallocate,
2591 .fiemap = ocfs2_fiemap, 2613 .fiemap = ocfs2_fiemap,
2592}; 2614};
2593 2615
@@ -2619,6 +2641,7 @@ const struct file_operations ocfs2_fops = {
2619 .flock = ocfs2_flock, 2641 .flock = ocfs2_flock,
2620 .splice_read = ocfs2_file_splice_read, 2642 .splice_read = ocfs2_file_splice_read,
2621 .splice_write = ocfs2_file_splice_write, 2643 .splice_write = ocfs2_file_splice_write,
2644 .fallocate = ocfs2_fallocate,
2622}; 2645};
2623 2646
2624const struct file_operations ocfs2_dops = { 2647const struct file_operations ocfs2_dops = {
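
With ->fallocate moved from the inode operations to the file operations, the handler above maps FALLOC_FL_PUNCH_HOLE onto the existing OCFS2_IOC_UNRESVSP64 space-change path and rejects any other mode bits. A userspace sketch (hypothetical mount point and file name) exercising both accepted modes:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/ocfs2/testfile", O_RDWR | O_CREAT, 0644);
		if (fd < 0)
			return 1;

		/* preallocate 1 MiB without changing i_size (RESVSP64 path) */
		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
			perror("preallocate");

		/* punch a 64 KiB hole (UNRESVSP64 path); later kernels require
		 * KEEP_SIZE alongside PUNCH_HOLE, so pass both */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      0, 64 << 10))
			perror("punch hole");

		close(fd);
		return 0;
	}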
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..f5afbbef6703 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
63 struct kstat *stat); 63 struct kstat *stat);
64int ocfs2_permission(struct inode *inode, int mask); 64int ocfs2_permission(struct inode *inode, int mask, unsigned int flags);
65 65
66int ocfs2_should_update_atime(struct inode *inode, 66int ocfs2_should_update_atime(struct inode *inode,
67 struct vfsmount *vfsmnt); 67 struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index eece3e05d9d0..4068c6c4c6f6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
335 else 335 else
336 inode->i_fop = &ocfs2_dops_no_plocks; 336 inode->i_fop = &ocfs2_dops_no_plocks;
337 i_size_write(inode, le64_to_cpu(fe->i_size)); 337 i_size_write(inode, le64_to_cpu(fe->i_size));
338 OCFS2_I(inode)->ip_dir_lock_gen = 1;
338 break; 339 break;
339 case S_IFLNK: 340 case S_IFLNK:
340 if (ocfs2_inode_is_fast_symlink(inode)) 341 if (ocfs2_inode_is_fast_symlink(inode))
@@ -433,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
433 * #1 and #2 can be simply solved by never taking the lock 434 * #1 and #2 can be simply solved by never taking the lock
434 * here for system files (which are the only type we read 435 * here for system files (which are the only type we read
435 * during mount). It's a heavier approach, but our main 436 * during mount). It's a heavier approach, but our main
436 * concern is user-accesible files anyway. 437 * concern is user-accessible files anyway.
437 * 438 *
438 * #3 works itself out because we'll eventually take the 439 * #3 works itself out because we'll eventually take the
439 * cluster lock before trusting anything anyway. 440 * cluster lock before trusting anything anyway.
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 6de5a869db30..1c508b149b3a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
46 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 47 spinlock_t ip_lock;
48 u32 ip_open_count; 48 u32 ip_open_count;
49 u32 ip_clusters;
50 struct list_head ip_io_markers; 49 struct list_head ip_io_markers;
50 u32 ip_clusters;
51 51
52 u16 ip_dyn_features;
52 struct mutex ip_io_mutex; 53 struct mutex ip_io_mutex;
53
54 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
55 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
56 u16 ip_dyn_features;
57 56
58 /* protected by recovery_lock. */ 57 /* protected by recovery_lock. */
59 struct inode *ip_next_orphan; 58 struct inode *ip_next_orphan;
60 59
61 u32 ip_dir_start_lookup;
62
63 struct ocfs2_caching_info ip_metadata_cache; 60 struct ocfs2_caching_info ip_metadata_cache;
64
65 struct ocfs2_extent_map ip_extent_map; 61 struct ocfs2_extent_map ip_extent_map;
66
67 struct inode vfs_inode; 62 struct inode vfs_inode;
68 struct jbd2_inode ip_jinode; 63 struct jbd2_inode ip_jinode;
69 64
65 u32 ip_dir_start_lookup;
66
70 /* Only valid if the inode is the dir. */ 67 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 68 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 69 u64 ip_last_used_group;
70 u32 ip_dir_lock_gen;
73 71
74 struct ocfs2_alloc_reservation ip_la_data_resv; 72 struct ocfs2_alloc_reservation ip_la_data_resv;
75}; 73};
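
The inode.h hunk above is a pure reorder of ocfs2_inode_info: no field changes meaning, the smaller members are just regrouped so fewer alignment holes are left on 64-bit builds. A generic sketch of the effect, using kernel integer types:

	struct padded {			/* 24 bytes on x86_64 */
		u32 a;			/* 4 bytes + 4 bytes of padding */
		u64 b;			/* 8 bytes */
		u16 c;			/* 2 bytes + 6 bytes of tail padding */
	};

	struct packed_by_order {	/* 16 bytes on x86_64 */
		u64 b;			/* 8 bytes */
		u32 a;			/* 4 bytes */
		u16 c;			/* 2 bytes + 2 bytes of tail padding */
	};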
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132cef..7a4868196152 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
26 26
27#include <linux/ext2_fs.h> 27#include <linux/ext2_fs.h>
28 28
29#define o2info_from_user(a, b) \
30 copy_from_user(&(a), (b), sizeof(a))
31#define o2info_to_user(a, b) \
32 copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
33
34/*
35 * This call is void because we are already reporting an error that may
36 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
37 * just a best-effort to tell userspace that this request caused the error.
38 */
39static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
40 struct ocfs2_info_request __user *req)
41{
42 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
43 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
44}
45
46#define o2info_set_request_error(a, b) \
47 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
48
29static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 49static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
30{ 50{
31 int status; 51 int status;
@@ -109,6 +129,328 @@ bail:
109 return status; 129 return status;
110} 130}
111 131
132int ocfs2_info_handle_blocksize(struct inode *inode,
133 struct ocfs2_info_request __user *req)
134{
135 int status = -EFAULT;
136 struct ocfs2_info_blocksize oib;
137
138 if (o2info_from_user(oib, req))
139 goto bail;
140
141 oib.ib_blocksize = inode->i_sb->s_blocksize;
142 oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
143
144 if (o2info_to_user(oib, req))
145 goto bail;
146
147 status = 0;
148bail:
149 if (status)
150 o2info_set_request_error(oib, req);
151
152 return status;
153}
154
155int ocfs2_info_handle_clustersize(struct inode *inode,
156 struct ocfs2_info_request __user *req)
157{
158 int status = -EFAULT;
159 struct ocfs2_info_clustersize oic;
160 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
161
162 if (o2info_from_user(oic, req))
163 goto bail;
164
165 oic.ic_clustersize = osb->s_clustersize;
166 oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
167
168 if (o2info_to_user(oic, req))
169 goto bail;
170
171 status = 0;
172bail:
173 if (status)
174 o2info_set_request_error(oic, req);
175
176 return status;
177}
178
179int ocfs2_info_handle_maxslots(struct inode *inode,
180 struct ocfs2_info_request __user *req)
181{
182 int status = -EFAULT;
183 struct ocfs2_info_maxslots oim;
184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
185
186 if (o2info_from_user(oim, req))
187 goto bail;
188
189 oim.im_max_slots = osb->max_slots;
190 oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
191
192 if (o2info_to_user(oim, req))
193 goto bail;
194
195 status = 0;
196bail:
197 if (status)
198 o2info_set_request_error(oim, req);
199
200 return status;
201}
202
203int ocfs2_info_handle_label(struct inode *inode,
204 struct ocfs2_info_request __user *req)
205{
206 int status = -EFAULT;
207 struct ocfs2_info_label oil;
208 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
209
210 if (o2info_from_user(oil, req))
211 goto bail;
212
213 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
214 oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
215
216 if (o2info_to_user(oil, req))
217 goto bail;
218
219 status = 0;
220bail:
221 if (status)
222 o2info_set_request_error(oil, req);
223
224 return status;
225}
226
227int ocfs2_info_handle_uuid(struct inode *inode,
228 struct ocfs2_info_request __user *req)
229{
230 int status = -EFAULT;
231 struct ocfs2_info_uuid oiu;
232 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
233
234 if (o2info_from_user(oiu, req))
235 goto bail;
236
237 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
238 oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
239
240 if (o2info_to_user(oiu, req))
241 goto bail;
242
243 status = 0;
244bail:
245 if (status)
246 o2info_set_request_error(oiu, req);
247
248 return status;
249}
250
251int ocfs2_info_handle_fs_features(struct inode *inode,
252 struct ocfs2_info_request __user *req)
253{
254 int status = -EFAULT;
255 struct ocfs2_info_fs_features oif;
256 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
257
258 if (o2info_from_user(oif, req))
259 goto bail;
260
261 oif.if_compat_features = osb->s_feature_compat;
262 oif.if_incompat_features = osb->s_feature_incompat;
263 oif.if_ro_compat_features = osb->s_feature_ro_compat;
264 oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
265
266 if (o2info_to_user(oif, req))
267 goto bail;
268
269 status = 0;
270bail:
271 if (status)
272 o2info_set_request_error(oif, req);
273
274 return status;
275}
276
277int ocfs2_info_handle_journal_size(struct inode *inode,
278 struct ocfs2_info_request __user *req)
279{
280 int status = -EFAULT;
281 struct ocfs2_info_journal_size oij;
282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
283
284 if (o2info_from_user(oij, req))
285 goto bail;
286
287 oij.ij_journal_size = osb->journal->j_inode->i_size;
288
289 oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
290
291 if (o2info_to_user(oij, req))
292 goto bail;
293
294 status = 0;
295bail:
296 if (status)
297 o2info_set_request_error(oij, req);
298
299 return status;
300}
301
302int ocfs2_info_handle_unknown(struct inode *inode,
303 struct ocfs2_info_request __user *req)
304{
305 int status = -EFAULT;
306 struct ocfs2_info_request oir;
307
308 if (o2info_from_user(oir, req))
309 goto bail;
310
311 oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
312
313 if (o2info_to_user(oir, req))
314 goto bail;
315
316 status = 0;
317bail:
318 if (status)
319 o2info_set_request_error(oir, req);
320
321 return status;
322}
323
324/*
325 * Validate and distinguish OCFS2_IOC_INFO requests.
326 *
327 * - validate the magic number.
328 * - distinguish different requests.
329 * - validate size of different requests.
330 */
331int ocfs2_info_handle_request(struct inode *inode,
332 struct ocfs2_info_request __user *req)
333{
334 int status = -EFAULT;
335 struct ocfs2_info_request oir;
336
337 if (o2info_from_user(oir, req))
338 goto bail;
339
340 status = -EINVAL;
341 if (oir.ir_magic != OCFS2_INFO_MAGIC)
342 goto bail;
343
344 switch (oir.ir_code) {
345 case OCFS2_INFO_BLOCKSIZE:
346 if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
347 status = ocfs2_info_handle_blocksize(inode, req);
348 break;
349 case OCFS2_INFO_CLUSTERSIZE:
350 if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
351 status = ocfs2_info_handle_clustersize(inode, req);
352 break;
353 case OCFS2_INFO_MAXSLOTS:
354 if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
355 status = ocfs2_info_handle_maxslots(inode, req);
356 break;
357 case OCFS2_INFO_LABEL:
358 if (oir.ir_size == sizeof(struct ocfs2_info_label))
359 status = ocfs2_info_handle_label(inode, req);
360 break;
361 case OCFS2_INFO_UUID:
362 if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
363 status = ocfs2_info_handle_uuid(inode, req);
364 break;
365 case OCFS2_INFO_FS_FEATURES:
366 if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
367 status = ocfs2_info_handle_fs_features(inode, req);
368 break;
369 case OCFS2_INFO_JOURNAL_SIZE:
370 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
371 status = ocfs2_info_handle_journal_size(inode, req);
372 break;
373 default:
374 status = ocfs2_info_handle_unknown(inode, req);
375 break;
376 }
377
378bail:
379 return status;
380}
381
382int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
383 u64 *req_addr, int compat_flag)
384{
385 int status = -EFAULT;
386 u64 __user *bp = NULL;
387
388 if (compat_flag) {
389#ifdef CONFIG_COMPAT
390 /*
 391 * Pointer bp stores the base address of a pointer array, which
 392 * collects the addresses of the separate requests.
393 */
394 bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
395#else
396 BUG();
397#endif
398 } else
399 bp = (u64 __user *)(unsigned long)(info->oi_requests);
400
401 if (o2info_from_user(*req_addr, bp + idx))
402 goto bail;
403
404 status = 0;
405bail:
406 return status;
407}
408
409/*
410 * OCFS2_IOC_INFO handles an array of requests passed from userspace.
411 *
 412 * ocfs2_info_handle() receives a large info aggregation, grabs and
 413 * validates the request count from the header, then breaks it into
 414 * small pieces that the specific handlers can process one by one.
 415 *
 416 * The idea is to keep each separate request small enough to ensure
 417 * better backward and forward compatibility, since a small request
 418 * is less likely to break if the disk layout changes.
419 */
420int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
421 int compat_flag)
422{
423 int i, status = 0;
424 u64 req_addr;
425 struct ocfs2_info_request __user *reqp;
426
427 if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
428 (!info->oi_requests)) {
429 status = -EINVAL;
430 goto bail;
431 }
432
433 for (i = 0; i < info->oi_count; i++) {
434
435 status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
436 if (status)
437 break;
438
439 reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
440 if (!reqp) {
441 status = -EINVAL;
442 goto bail;
443 }
444
445 status = ocfs2_info_handle_request(inode, reqp);
446 if (status)
447 break;
448 }
449
450bail:
451 return status;
452}
453
112long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 454long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
113{ 455{
114 struct inode *inode = filp->f_path.dentry->d_inode; 456 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
120 struct reflink_arguments args; 462 struct reflink_arguments args;
121 const char *old_path, *new_path; 463 const char *old_path, *new_path;
122 bool preserve; 464 bool preserve;
465 struct ocfs2_info info;
123 466
124 switch (cmd) { 467 switch (cmd) {
125 case OCFS2_IOC_GETFLAGS: 468 case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
174 preserve = (args.preserve != 0); 517 preserve = (args.preserve != 0);
175 518
176 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); 519 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
520 case OCFS2_IOC_INFO:
521 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
522 sizeof(struct ocfs2_info)))
523 return -EFAULT;
524
525 return ocfs2_info_handle(inode, &info, 0);
177 default: 526 default:
178 return -ENOTTY; 527 return -ENOTTY;
179 } 528 }
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
185 bool preserve; 534 bool preserve;
186 struct reflink_arguments args; 535 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode; 536 struct inode *inode = file->f_path.dentry->d_inode;
537 struct ocfs2_info info;
188 538
189 switch (cmd) { 539 switch (cmd) {
190 case OCFS2_IOC32_GETFLAGS: 540 case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
209 559
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), 560 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve); 561 compat_ptr(args.new_path), preserve);
562 case OCFS2_IOC_INFO:
563 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
564 sizeof(struct ocfs2_info)))
565 return -EFAULT;
566
567 return ocfs2_info_handle(inode, &info, 1);
212 default: 568 default:
213 return -ENOIOCTLCMD; 569 return -ENOIOCTLCMD;
214 } 570 }
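
The new OCFS2_IOC_INFO plumbing above takes an ocfs2_info header holding a request count and an array of request pointers, then dispatches each request by its ir_code. A userspace sketch of a single blocksize query, using the structure and flag names from this patch; the header include path and mount point are assumptions:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <ocfs2/ocfs2_ioctl.h>	/* assumed install location */

	int main(void)
	{
		struct ocfs2_info_blocksize oib;
		struct ocfs2_info info;
		uint64_t reqs[1];
		int fd = open("/mnt/ocfs2", O_RDONLY);	/* hypothetical mount */

		if (fd < 0)
			return 1;

		memset(&oib, 0, sizeof(oib));
		oib.ib_req.ir_magic = OCFS2_INFO_MAGIC;
		oib.ib_req.ir_code = OCFS2_INFO_BLOCKSIZE;
		oib.ib_req.ir_size = sizeof(oib);

		reqs[0] = (uint64_t)(unsigned long)&oib;

		memset(&info, 0, sizeof(info));
		info.oi_requests = (uint64_t)(unsigned long)reqs;
		info.oi_count = 1;

		if (!ioctl(fd, OCFS2_IOC_INFO, &info) &&
		    (oib.ib_req.ir_flags & OCFS2_INFO_FL_FILLED))
			printf("blocksize: %u\n", (unsigned)oib.ib_blocksize);

		close(fd);
		return 0;
	}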
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9b57c0350ff9..faa2303dbf0a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
301{ 301{
302 int status = 0; 302 int status = 0;
303 unsigned int flushed; 303 unsigned int flushed;
304 unsigned long old_id;
305 struct ocfs2_journal *journal = NULL; 304 struct ocfs2_journal *journal = NULL;
306 305
307 mlog_entry_void(); 306 mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
326 goto finally; 325 goto finally;
327 } 326 }
328 327
329 old_id = ocfs2_inc_trans_id(journal); 328 ocfs2_inc_trans_id(journal);
330 329
331 flushed = atomic_read(&journal->j_num_trans); 330 flushed = atomic_read(&journal->j_num_trans);
332 atomic_set(&journal->j_num_trans, 0); 331 atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
342 return status; 341 return status;
343} 342}
344 343
345/* pass it NULL and it will allocate a new handle object for you. If
346 * you pass it a handle however, it may still return error, in which
347 * case it has free'd the passed handle for you. */
348handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) 344handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
349{ 345{
350 journal_t *journal = osb->journal->j_journal; 346 journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1888 1884
1889 os = &osb->osb_orphan_scan; 1885 os = &osb->osb_orphan_scan;
1890 1886
1887 mlog(0, "Begin orphan scan\n");
1888
1891 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 1889 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1892 goto out; 1890 goto out;
1893 1891
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1920unlock: 1918unlock:
1921 ocfs2_orphan_scan_unlock(osb, seqno); 1919 ocfs2_orphan_scan_unlock(osb, seqno);
1922out: 1920out:
1921 mlog(0, "Orphan scan completed\n");
1923 return; 1922 return;
1924} 1923}
1925 1924
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710f..43e56b97f9c0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
67 struct buffer_head *j_bh; /* Journal disk inode block */ 67 struct buffer_head *j_bh; /* Journal disk inode block */
68 atomic_t j_num_trans; /* Number of transactions 68 atomic_t j_num_trans; /* Number of transactions
69 * currently in the system. */ 69 * currently in the system. */
70 spinlock_t j_lock;
70 unsigned long j_trans_id; 71 unsigned long j_trans_id;
71 struct rw_semaphore j_trans_barrier; 72 struct rw_semaphore j_trans_barrier;
72 wait_queue_head_t j_checkpointed; 73 wait_queue_head_t j_checkpointed;
73 74
 74 spinlock_t j_lock; 75 /* both fields protected by j_lock */
75 struct list_head j_la_cleanups; 76 struct list_head j_la_cleanups;
76 struct work_struct j_recovery_work; 77 struct work_struct j_recovery_work;
77}; 78};
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 4c18f4ad93b4..7e32db9c2c99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
59 return ret; 59 return ret;
60} 60}
61 61
62static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, 62static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
63 struct page *page) 63 struct page *page)
64{ 64{
65 int ret; 65 int ret;
66 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 67 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 68 loff_t pos = page_offset(page);
68 unsigned int len = PAGE_CACHE_SIZE; 69 unsigned int len = PAGE_CACHE_SIZE;
@@ -111,7 +112,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
111 if (page->index == last_index) 112 if (page->index == last_index)
112 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; 113 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
113 114
114 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, 115 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
115 &fsdata, di_bh, page); 116 &fsdata, di_bh, page);
116 if (ret) { 117 if (ret) {
117 if (ret != -ENOSPC) 118 if (ret != -ENOSPC)
@@ -159,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
159 */ 160 */
160 down_write(&OCFS2_I(inode)->ip_alloc_sem); 161 down_write(&OCFS2_I(inode)->ip_alloc_sem);
161 162
162 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 163 ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
163 164
164 up_write(&OCFS2_I(inode)->ip_alloc_sem); 165 up_write(&OCFS2_I(inode)->ip_alloc_sem);
165 166
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a00dda2e4f16..849fb4a2e814 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
147 spin_unlock(&oi->ip_lock); 147 spin_unlock(&oi->ip_lock);
148 148
149bail_add: 149bail_add:
150 dentry->d_op = &ocfs2_dentry_ops;
151 ret = d_splice_alias(inode, dentry); 150 ret = d_splice_alias(inode, dentry);
152 151
153 if (inode) { 152 if (inode) {
@@ -171,7 +170,8 @@ bail_add:
171 ret = ERR_PTR(status); 170 ret = ERR_PTR(status);
172 goto bail_unlock; 171 goto bail_unlock;
173 } 172 }
174 } 173 } else
174 ocfs2_dentry_attach_gen(dentry);
175 175
176bail_unlock: 176bail_unlock:
177 /* Don't drop the cluster lock until *after* the d_add -- 177 /* Don't drop the cluster lock until *after* the d_add --
@@ -414,7 +414,6 @@ static int ocfs2_mknod(struct inode *dir,
414 mlog_errno(status); 414 mlog_errno(status);
415 goto leave; 415 goto leave;
416 } 416 }
417 dentry->d_op = &ocfs2_dentry_ops;
418 417
419 status = ocfs2_add_entry(handle, dentry, inode, 418 status = ocfs2_add_entry(handle, dentry, inode,
420 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 419 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -741,8 +740,7 @@ static int ocfs2_link(struct dentry *old_dentry,
741 goto out_commit; 740 goto out_commit;
742 } 741 }
743 742
744 atomic_inc(&inode->i_count); 743 ihold(inode);
745 dentry->d_op = &ocfs2_dentry_ops;
746 d_instantiate(dentry, inode); 744 d_instantiate(dentry, inode);
747 745
748out_commit: 746out_commit:
@@ -1016,8 +1014,11 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1016 * An error return must mean that no cluster locks 1014 * An error return must mean that no cluster locks
1017 * were held on function exit. 1015 * were held on function exit.
1018 */ 1016 */
1019 if (oi1->ip_blkno != oi2->ip_blkno) 1017 if (oi1->ip_blkno != oi2->ip_blkno) {
1020 ocfs2_inode_unlock(inode2, 1); 1018 ocfs2_inode_unlock(inode2, 1);
1019 brelse(*bh2);
1020 *bh2 = NULL;
1021 }
1021 1022
1022 if (status != -ENOENT) 1023 if (status != -ENOENT)
1023 mlog_errno(status); 1024 mlog_errno(status);
@@ -1793,7 +1794,6 @@ static int ocfs2_symlink(struct inode *dir,
1793 mlog_errno(status); 1794 mlog_errno(status);
1794 goto bail; 1795 goto bail;
1795 } 1796 }
1796 dentry->d_op = &ocfs2_dentry_ops;
1797 1797
1798 status = ocfs2_add_entry(handle, dentry, inode, 1798 status = ocfs2_add_entry(handle, dentry, inode,
1799 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1799 le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2458,7 +2458,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2458 goto out_commit; 2458 goto out_commit;
2459 } 2459 }
2460 2460
2461 dentry->d_op = &ocfs2_dentry_ops;
2462 d_instantiate(dentry, inode); 2461 d_instantiate(dentry, inode);
2463 status = 0; 2462 status = 0;
2464out_commit: 2463out_commit:
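
The namei.c hunks above drop the per-dentry `dentry->d_op = &ocfs2_dentry_ops;` assignments and replace the raw `atomic_inc(&inode->i_count)` with ihold(). The d_op removals rely on a companion change (not shown in this diff) that installs the operations once per superblock, roughly:

	static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* ... */
		/* every dentry allocated under sb now inherits these ops */
		sb->s_d_op = &ocfs2_dentry_ops;
		/* ... */
		return 0;
	}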
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a2..51cd6898e7f1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
150struct ocfs2_lock_res { 150struct ocfs2_lock_res {
151 void *l_priv; 151 void *l_priv;
152 struct ocfs2_lock_res_ops *l_ops; 152 struct ocfs2_lock_res_ops *l_ops;
153 spinlock_t l_lock; 153
154 154
155 struct list_head l_blocked_list; 155 struct list_head l_blocked_list;
156 struct list_head l_mask_waiters; 156 struct list_head l_mask_waiters;
157 157
158 enum ocfs2_lock_type l_type;
159 unsigned long l_flags; 158 unsigned long l_flags;
160 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
161 int l_level;
162 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
163 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
164 struct ocfs2_dlm_lksb l_lksb; 162 signed char l_level;
163 signed char l_requested;
164 signed char l_blocking;
165
166 /* Data packed - type enum ocfs2_lock_type */
167 unsigned char l_type;
165 168
166 /* used from AST/BAST funcs. */ 169 /* used from AST/BAST funcs. */
167 enum ocfs2_ast_action l_action; 170 /* Data packed - enum type ocfs2_ast_action */
168 enum ocfs2_unlock_action l_unlock_action; 171 unsigned char l_action;
169 int l_requested; 172 /* Data packed - enum type ocfs2_unlock_action */
170 int l_blocking; 173 unsigned char l_unlock_action;
171 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
172 175
176 spinlock_t l_lock;
177
178 struct ocfs2_dlm_lksb l_lksb;
179
173 wait_queue_head_t l_event; 180 wait_queue_head_t l_event;
174 181
175 struct list_head l_debug_list; 182 struct list_head l_debug_list;
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state
243 250
244enum ocfs2_mount_options 251enum ocfs2_mount_options
245{ 252{
246 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ 253 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
247 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ 254 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
248 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 255 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
249 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 256 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +263,10 @@ enum ocfs2_mount_options
256 control lists */ 263 control lists */
257 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
258 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
266 OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
267 writes */
268 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
269 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
259}; 270};
260 271
261#define OCFS2_OSB_SOFT_RO 0x0001 272#define OCFS2_OSB_SOFT_RO 0x0001
@@ -277,7 +288,8 @@ struct ocfs2_super
277 struct super_block *sb; 288 struct super_block *sb;
278 struct inode *root_inode; 289 struct inode *root_inode;
279 struct inode *sys_root_inode; 290 struct inode *sys_root_inode;
280 struct inode *system_inodes[NUM_SYSTEM_INODES]; 291 struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
292 struct inode **local_system_inodes;
281 293
282 struct ocfs2_slot_info *slot_info; 294 struct ocfs2_slot_info *slot_info;
283 295
@@ -368,6 +380,8 @@ struct ocfs2_super
368 struct ocfs2_alloc_stats alloc_stats; 380 struct ocfs2_alloc_stats alloc_stats;
369 char dev_str[20]; /* "major,minor" of the device */ 381 char dev_str[20]; /* "major,minor" of the device */
370 382
383 u8 osb_stackflags;
384
371 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 385 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
372 struct ocfs2_cluster_connection *cconn; 386 struct ocfs2_cluster_connection *cconn;
373 struct ocfs2_lock_res osb_super_lockres; 387 struct ocfs2_lock_res osb_super_lockres;
@@ -406,6 +420,11 @@ struct ocfs2_super
406 struct inode *osb_tl_inode; 420 struct inode *osb_tl_inode;
407 struct buffer_head *osb_tl_bh; 421 struct buffer_head *osb_tl_bh;
408 struct delayed_work osb_truncate_log_wq; 422 struct delayed_work osb_truncate_log_wq;
423 /*
 424 * The number of clusters in our truncate log.
 425 * Protected by osb_tl_inode->i_mutex.
426 */
427 unsigned int truncated_clusters;
409 428
410 struct ocfs2_node_map osb_recovering_orphan_dirs; 429 struct ocfs2_node_map osb_recovering_orphan_dirs;
411 unsigned int *osb_orphan_wipes; 430 unsigned int *osb_orphan_wipes;
@@ -601,10 +620,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
601 return ret; 620 return ret;
602} 621}
603 622
604static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) 623static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
605{ 624{
606 return (osb->s_feature_incompat & 625 return (osb->s_feature_incompat &
607 OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); 626 (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
627 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
628}
629
630static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
631{
632 if (ocfs2_clusterinfo_valid(osb) &&
633 memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
634 OCFS2_STACK_LABEL_LEN))
635 return 1;
636 return 0;
637}
638
639static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
640{
641 if (ocfs2_clusterinfo_valid(osb) &&
642 !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
643 OCFS2_STACK_LABEL_LEN))
644 return 1;
645 return 0;
646}
647
648static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
649{
650 return ocfs2_o2cb_stack(osb) &&
651 (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
608} 652}
609 653
610static inline int ocfs2_mount_local(struct ocfs2_super *osb) 654static inline int ocfs2_mount_local(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index fa31d05e41b7..bf2e7764920e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) 104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -170,6 +171,13 @@
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 171#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171 172
172/* 173/*
 174 * Incompat bit to indicate usable clusterinfo with stackflags for all
 175 * cluster stacks (userspace and o2cb). If this bit is set,
176 * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
177 */
178#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
179
180/*
173 * backup superblock flag is used to indicate that this volume 181 * backup superblock flag is used to indicate that this volume
174 * has backup superblocks. 182 * has backup superblocks.
175 */ 183 */
@@ -292,10 +300,13 @@
292#define OCFS2_VOL_UUID_LEN 16 300#define OCFS2_VOL_UUID_LEN 16
293#define OCFS2_MAX_VOL_LABEL_LEN 64 301#define OCFS2_MAX_VOL_LABEL_LEN 64
294 302
295/* The alternate, userspace stack fields */ 303/* The cluster stack fields */
296#define OCFS2_STACK_LABEL_LEN 4 304#define OCFS2_STACK_LABEL_LEN 4
297#define OCFS2_CLUSTER_NAME_LEN 16 305#define OCFS2_CLUSTER_NAME_LEN 16
298 306
307/* Classic (historically speaking) cluster stack */
308#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
309
299/* Journal limits (in bytes) */ 310/* Journal limits (in bytes) */
300#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 311#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
301 312
@@ -305,6 +316,11 @@
305 */ 316 */
306#define OCFS2_MIN_XATTR_INLINE_SIZE 256 317#define OCFS2_MIN_XATTR_INLINE_SIZE 256
307 318
319/*
320 * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
321 */
322#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
323
308struct ocfs2_system_inode_info { 324struct ocfs2_system_inode_info {
309 char *si_name; 325 char *si_name;
310 int si_iflags; 326 int si_iflags;
@@ -322,6 +338,7 @@ enum {
322 USER_QUOTA_SYSTEM_INODE, 338 USER_QUOTA_SYSTEM_INODE,
323 GROUP_QUOTA_SYSTEM_INODE, 339 GROUP_QUOTA_SYSTEM_INODE,
324#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE 340#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
341#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
325 ORPHAN_DIR_SYSTEM_INODE, 342 ORPHAN_DIR_SYSTEM_INODE,
326 EXTENT_ALLOC_SYSTEM_INODE, 343 EXTENT_ALLOC_SYSTEM_INODE,
327 INODE_ALLOC_SYSTEM_INODE, 344 INODE_ALLOC_SYSTEM_INODE,
@@ -330,8 +347,12 @@ enum {
330 TRUNCATE_LOG_SYSTEM_INODE, 347 TRUNCATE_LOG_SYSTEM_INODE,
331 LOCAL_USER_QUOTA_SYSTEM_INODE, 348 LOCAL_USER_QUOTA_SYSTEM_INODE,
332 LOCAL_GROUP_QUOTA_SYSTEM_INODE, 349 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
333 NUM_SYSTEM_INODES 351 NUM_SYSTEM_INODES
334}; 352};
353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
354#define NUM_LOCAL_SYSTEM_INODES \
355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
335 356
336static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { 357static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
337 /* Global system inodes (single copy) */ 358 /* Global system inodes (single copy) */
@@ -360,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
360/* Parameter passed from mount.ocfs2 to module */ 381/* Parameter passed from mount.ocfs2 to module */
361#define OCFS2_HB_NONE "heartbeat=none" 382#define OCFS2_HB_NONE "heartbeat=none"
362#define OCFS2_HB_LOCAL "heartbeat=local" 383#define OCFS2_HB_LOCAL "heartbeat=local"
384#define OCFS2_HB_GLOBAL "heartbeat=global"
363 385
364/* 386/*
365 * OCFS2 directory file types. Only the low 3 bits are used. The 387 * OCFS2 directory file types. Only the low 3 bits are used. The
@@ -566,9 +588,21 @@ struct ocfs2_slot_map_extended {
566 */ 588 */
567}; 589};
568 590
591/*
592 * ci_stackflags is only valid if the incompat bit
593 * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
594 */
569struct ocfs2_cluster_info { 595struct ocfs2_cluster_info {
570/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; 596/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
571 __le32 ci_reserved; 597 union {
598 __le32 ci_reserved;
599 struct {
600 __u8 ci_stackflags;
601 __u8 ci_reserved1;
602 __u8 ci_reserved2;
603 __u8 ci_reserved3;
604 };
605 };
572/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; 606/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
573/*18*/ 607/*18*/
574}; 608};
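
The union above lets new code read per-stack flags out of what used to be a reserved little-endian word, without changing the on-disk layout. Below is a minimal standalone sketch of that overlay, with the constants copied from the definitions above; the anonymous struct member needs GNU C or C11, which the kernel assumes. It is an illustration, not the kernel's code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define OCFS2_STACK_LABEL_LEN			4
#define OCFS2_CLUSTER_NAME_LEN			16
#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT	(0x01)

struct cluster_info {
	uint8_t ci_stack[OCFS2_STACK_LABEL_LEN];
	union {
		uint32_t ci_reserved;		/* old layout: one reserved word */
		struct {
			uint8_t ci_stackflags;	/* aliases its first byte */
			uint8_t ci_reserved1;
			uint8_t ci_reserved2;
			uint8_t ci_reserved3;
		};
	};
	uint8_t ci_cluster[OCFS2_CLUSTER_NAME_LEN];
};

int main(void)
{
	struct cluster_info ci;

	memset(&ci, 0, sizeof(ci));
	memcpy(ci.ci_stack, "o2cb", OCFS2_STACK_LABEL_LEN);
	ci.ci_stackflags = OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT;

	/* ci_stackflags is always the first byte of the union, i.e. the
	 * low byte of the little-endian ci_reserved as stored on disk. */
	printf("global heartbeat: %d\n",
	       !!(ci.ci_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT));
	return 0;
}
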
@@ -605,9 +639,9 @@ struct ocfs2_super_block {
605 * group header */ 639 * group header */
606/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 640/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
607/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ 641/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
608/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace 642/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
609 stack. Only valid 643 userspace or clusterinfo
610 with INCOMPAT flag. */ 644 INCOMPAT flag set. */
611/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 645/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
612 for this fs*/ 646 for this fs*/
613 __le16 s_reserved0; 647 __le16 s_reserved0;
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 5d241505690b..b46f39bf7438 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -76,4 +76,99 @@ struct reflink_arguments {
76}; 76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) 77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78 78
79/* Following definitions dedicated for ocfs2_info_request ioctls. */
80#define OCFS2_INFO_MAX_REQUEST (50)
81#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
82
83/* Magic number of all requests */
84#define OCFS2_INFO_MAGIC (0x4F32494E)
85
86/*
87 * Always try to separate an info request into small pieces to
88 * guarantee backward and forward compatibility.
89 */
90struct ocfs2_info {
91 __u64 oi_requests; /* Array of __u64 pointers to requests */
92 __u32 oi_count; /* Number of requests in info_requests */
93 __u32 oi_pad;
94};
95
96struct ocfs2_info_request {
97/*00*/ __u32 ir_magic; /* Magic number */
98 __u32 ir_code; /* Info request code */
99 __u32 ir_size; /* Size of request */
100 __u32 ir_flags; /* Request flags */
101/*10*/ /* Request specific fields */
102};
103
104struct ocfs2_info_clustersize {
105 struct ocfs2_info_request ic_req;
106 __u32 ic_clustersize;
107 __u32 ic_pad;
108};
109
110struct ocfs2_info_blocksize {
111 struct ocfs2_info_request ib_req;
112 __u32 ib_blocksize;
113 __u32 ib_pad;
114};
115
116struct ocfs2_info_maxslots {
117 struct ocfs2_info_request im_req;
118 __u32 im_max_slots;
119 __u32 im_pad;
120};
121
122struct ocfs2_info_label {
123 struct ocfs2_info_request il_req;
124 __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
125} __attribute__ ((packed));
126
127struct ocfs2_info_uuid {
128 struct ocfs2_info_request iu_req;
129 __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
130} __attribute__ ((packed));
131
132struct ocfs2_info_fs_features {
133 struct ocfs2_info_request if_req;
134 __u32 if_compat_features;
135 __u32 if_incompat_features;
136 __u32 if_ro_compat_features;
137 __u32 if_pad;
138};
139
140struct ocfs2_info_journal_size {
141 struct ocfs2_info_request ij_req;
142 __u64 ij_journal_size;
143};
144
145/* Codes for ocfs2_info_request */
146enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1,
148 OCFS2_INFO_BLOCKSIZE,
149 OCFS2_INFO_MAXSLOTS,
150 OCFS2_INFO_LABEL,
151 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE,
154 OCFS2_INFO_NUM_TYPES
155};
156
157/* Flags for struct ocfs2_info_request */
158/* Filled by the caller */
159#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
160 required. This is a hint.
161 It is up to ocfs2 whether
162 the request can be fulfilled
163 without locking. */
164/* Filled by ocfs2 */
165#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
166 this request and
167 filled in the answer */
168
169#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
170 request handling. */
171
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173
79#endif /* OCFS2_IOCTL_H */ 174#endif /* OCFS2_IOCTL_H */
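
The request protocol above is self-describing: userspace chains ocfs2_info_request-derived structures through an array of pointers carried by struct ocfs2_info, and the kernel marks each request it understood with OCFS2_INFO_FL_FILLED. A hedged userspace sketch of a single OCFS2_INFO_BLOCKSIZE query follows; it assumes the definitions above have been copied into scope (e.g. from ocfs2_ioctl.h), and the command-line file argument is just any file on the mounted ocfs2 volume.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
	struct ocfs2_info_blocksize bs;
	struct ocfs2_info info;
	uint64_t reqs[1];
	int fd, ret;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* any file on the ocfs2 volume */
	if (fd < 0)
		return 1;

	memset(&bs, 0, sizeof(bs));
	bs.ib_req.ir_magic = OCFS2_INFO_MAGIC;
	bs.ib_req.ir_code  = OCFS2_INFO_BLOCKSIZE;
	bs.ib_req.ir_size  = sizeof(bs);
	bs.ib_req.ir_flags = OCFS2_INFO_FL_NON_COHERENT; /* lockless hint */

	reqs[0] = (uint64_t)(unsigned long)&bs;
	memset(&info, 0, sizeof(info));
	info.oi_requests = (uint64_t)(unsigned long)reqs;
	info.oi_count = 1;

	ret = ioctl(fd, OCFS2_IOC_INFO, &info);
	if (!ret && (bs.ib_req.ir_flags & OCFS2_INFO_FL_FILLED))
		printf("blocksize: %u\n", bs.ib_blocksize);
	close(fd);
	return ret;
}
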
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index efdd75607406..b5f9160e93e9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
49 49
50struct ocfs2_cow_context { 50struct ocfs2_cow_context {
51 struct inode *inode; 51 struct inode *inode;
52 struct file *file;
52 u32 cow_start; 53 u32 cow_start;
53 u32 cow_len; 54 u32 cow_len;
54 struct ocfs2_extent_tree data_et; 55 struct ocfs2_extent_tree data_et;
@@ -2932,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = context->inode->i_mapping;
2938 2939
2939 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2940 new_cluster, new_len, cpos); 2941 new_cluster, new_len, cpos);
2941 2942
2943 readahead_pages =
2944 (ocfs2_cow_contig_clusters(sb) <<
2945 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2942 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2946 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2943 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2947 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2944 /* 2948 /*
@@ -2969,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2969 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2970 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2971 2975
2976 if (PageReadahead(page) && context->file) {
2977 page_cache_async_readahead(mapping,
2978 &context->file->f_ra,
2979 context->file,
2980 page, page_index,
2981 readahead_pages);
2982 }
2983
2972 if (!PageUptodate(page)) { 2984 if (!PageUptodate(page)) {
2973 ret = block_read_full_page(page, ocfs2_get_block); 2985 ret = block_read_full_page(page, ocfs2_get_block);
2974 if (ret) { 2986 if (ret) {
@@ -3409,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3409 return ret; 3421 return ret;
3410} 3422}
3411 3423
3424static void ocfs2_readahead_for_cow(struct inode *inode,
3425 struct file *file,
3426 u32 start, u32 len)
3427{
3428 struct address_space *mapping;
3429 pgoff_t index;
3430 unsigned long num_pages;
3431 int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3432
3433 if (!file)
3434 return;
3435
3436 mapping = file->f_mapping;
3437 num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3438 if (!num_pages)
3439 num_pages = 1;
3440
3441 index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3442 page_cache_sync_readahead(mapping, &file->f_ra, file,
3443 index, num_pages);
3444}
3445
3412/* 3446/*
3413 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3447 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3414 * past max_cpos. This will stop when it runs into a hole or an 3448 * past max_cpos. This will stop when it runs into a hole or an
3415 * unrefcounted extent. 3449 * unrefcounted extent.
3416 */ 3450 */
3417static int ocfs2_refcount_cow_hunk(struct inode *inode, 3451static int ocfs2_refcount_cow_hunk(struct inode *inode,
3452 struct file *file,
3418 struct buffer_head *di_bh, 3453 struct buffer_head *di_bh,
3419 u32 cpos, u32 write_len, u32 max_cpos) 3454 u32 cpos, u32 write_len, u32 max_cpos)
3420{ 3455{
@@ -3443,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3443 3478
3444 BUG_ON(cow_len == 0); 3479 BUG_ON(cow_len == 0);
3445 3480
3481 ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3482
3446 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3483 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3447 if (!context) { 3484 if (!context) {
3448 ret = -ENOMEM; 3485 ret = -ENOMEM;
@@ -3464,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3464 context->ref_root_bh = ref_root_bh; 3501 context->ref_root_bh = ref_root_bh;
3465 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3502 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3466 context->get_clusters = ocfs2_di_get_clusters; 3503 context->get_clusters = ocfs2_di_get_clusters;
3504 context->file = file;
3467 3505
3468 ocfs2_init_dinode_extent_tree(&context->data_et, 3506 ocfs2_init_dinode_extent_tree(&context->data_et,
3469 INODE_CACHE(inode), di_bh); 3507 INODE_CACHE(inode), di_bh);
@@ -3492,6 +3530,7 @@ out:
3492 * clusters between cpos and cpos+write_len are safe to modify. 3530 * clusters between cpos and cpos+write_len are safe to modify.
3493 */ 3531 */
3494int ocfs2_refcount_cow(struct inode *inode, 3532int ocfs2_refcount_cow(struct inode *inode,
3533 struct file *file,
3495 struct buffer_head *di_bh, 3534 struct buffer_head *di_bh,
3496 u32 cpos, u32 write_len, u32 max_cpos) 3535 u32 cpos, u32 write_len, u32 max_cpos)
3497{ 3536{
@@ -3511,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
3511 num_clusters = write_len; 3550 num_clusters = write_len;
3512 3551
3513 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3552 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3514 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, 3553 ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
3515 num_clusters, max_cpos); 3554 num_clusters, max_cpos);
3516 if (ret) { 3555 if (ret) {
3517 mlog_errno(ret); 3556 mlog_errno(ret);
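
The readahead window sizing in ocfs2_readahead_for_cow() above is plain shift arithmetic: a cluster span is widened to bytes with s_clustersize_bits and then narrowed to page-cache pages with PAGE_CACHE_SHIFT. A standalone sketch of that conversion, using illustrative sizes (4K pages, 256K clusters) rather than the kernel's own helpers:

#include <stdio.h>

#define PAGE_SHIFT 12			/* 4K pages, for illustration */

int main(void)
{
	unsigned int cs_bits = 18;	/* e.g. 256K clusters */
	unsigned int start = 7, len = 3; /* cluster span to CoW */
	unsigned long long index, num_pages;

	index = ((unsigned long long)start << cs_bits) >> PAGE_SHIFT;
	num_pages = ((unsigned long long)len << cs_bits) >> PAGE_SHIFT;
	if (!num_pages)
		num_pages = 1;		/* clusters smaller than a page */

	printf("readahead %llu pages from page index %llu\n",
	       num_pages, index);
	return 0;
}
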
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e2..c8ce46f7d8e3 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
21 struct rb_node rf_node; 21 struct rb_node rf_node;
22 u64 rf_blkno; 22 u64 rf_blkno;
23 u32 rf_generation; 23 u32 rf_generation;
24 struct kref rf_getcnt;
24 struct rw_semaphore rf_sem; 25 struct rw_semaphore rf_sem;
25 struct ocfs2_lock_res rf_lockres; 26 struct ocfs2_lock_res rf_lockres;
26 struct kref rf_getcnt;
27 int rf_removed; 27 int rf_removed;
28 28
29 /* the following 4 fields are used by caching_info. */ 29 /* the following 4 fields are used by caching_info. */
30 struct ocfs2_caching_info rf_ci;
31 spinlock_t rf_lock; 30 spinlock_t rf_lock;
31 struct ocfs2_caching_info rf_ci;
32 struct mutex rf_io_mutex; 32 struct mutex rf_io_mutex;
33 struct super_block *rf_sb; 33 struct super_block *rf_sb;
34}; 34};
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 int *ref_blocks); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode,
56 struct file *filep, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 57 u32 cpos, u32 write_len, u32 max_cpos);
57 58
58typedef int (ocfs2_post_refcount_func)(struct inode *inode, 59typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949f..ab4e0172cc1d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
357{ 357{
358 int status = 0; 358 int status = 0;
359 u64 blkno; 359 u64 blkno;
360 unsigned long long blocks, bytes; 360 unsigned long long blocks, bytes = 0;
361 unsigned int i; 361 unsigned int i;
362 struct buffer_head *bh; 362 struct buffer_head *bh;
363 363
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f696c5..19965b00c43c 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
283 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
284 * in the heartbeat universe */ 284 * in the heartbeat universe */
285 if (!o2hb_check_local_node_heartbeating()) { 285 if (!o2hb_check_local_node_heartbeating()) {
286 if (o2hb_global_heartbeat_active())
287 mlog(ML_ERROR, "Global heartbeat not started\n");
286 rc = -EINVAL; 288 rc = -EINVAL;
287 goto out; 289 goto out;
288 } 290 }
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca0688..a5ebe421195f 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/smp_lock.h>
26#include <linux/reboot.h> 25#include <linux/reboot.h>
27#include <asm/uaccess.h> 26#include <asm/uaccess.h>
28 27
@@ -191,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
191 return c; 190 return c;
192 } 191 }
193 192
194 return c; 193 return NULL;
195} 194}
196 195
197/* 196/*
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
612 return -ENOMEM; 611 return -ENOMEM;
613 p->op_this_node = -1; 612 p->op_this_node = -1;
614 613
615 lock_kernel();
616 mutex_lock(&ocfs2_control_lock); 614 mutex_lock(&ocfs2_control_lock);
617 file->private_data = p; 615 file->private_data = p;
618 list_add(&p->op_list, &ocfs2_control_private_list); 616 list_add(&p->op_list, &ocfs2_control_private_list);
619 mutex_unlock(&ocfs2_control_lock); 617 mutex_unlock(&ocfs2_control_lock);
620 unlock_kernel();
621 618
622 return 0; 619 return 0;
623} 620}
@@ -628,6 +625,7 @@ static const struct file_operations ocfs2_control_fops = {
628 .read = ocfs2_control_read, 625 .read = ocfs2_control_read,
629 .write = ocfs2_control_write, 626 .write = ocfs2_control_write,
630 .owner = THIS_MODULE, 627 .owner = THIS_MODULE,
628 .llseek = default_llseek,
631}; 629};
632 630
633static struct miscdevice ocfs2_control_device = { 631static struct miscdevice ocfs2_control_device = {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 849c2f0e0a0e..71998d4d61d5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1380 } 1380 }
1381 1381
1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1383 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1384 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1385 " count %u but claims %u are freed. num_bits %d",
1386 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1387 le16_to_cpu(bg->bg_bits),
1388 le16_to_cpu(bg->bg_free_bits_count), num_bits);
1389 return -EROFS;
1390 }
1383 while(num_bits--) 1391 while(num_bits--)
1384 ocfs2_set_bit(bit_off++, bitmap); 1392 ocfs2_set_bit(bit_off++, bitmap);
1385 1393
@@ -1908,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1908 if (res->sr_bg_blkno) { 1916 if (res->sr_bg_blkno) {
1909 /* Attempt to short-circuit the usual search mechanism 1917 /* Attempt to short-circuit the usual search mechanism
1910 * by jumping straight to the most recently used 1918 * by jumping straight to the most recently used
1911 * allocation group. This helps us mantain some 1919 * allocation group. This helps us maintain some
1912 * contiguousness across allocations. */ 1920 * contiguousness across allocations. */
1913 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1921 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1914 min_bits, res, &bits_left); 1922 min_bits, res, &bits_left);
@@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2419 (unsigned long *) undo_bg->bg_bitmap); 2427 (unsigned long *) undo_bg->bg_bitmap);
2420 } 2428 }
2421 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2429 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2430 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2431 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2432 " count %u but claims %u are freed. num_bits %d",
2433 (unsigned long long)le64_to_cpu(bg->bg_blkno),
2434 le16_to_cpu(bg->bg_bits),
2435 le16_to_cpu(bg->bg_free_bits_count), num_bits);
2436 return -EROFS;
2437 }
2422 2438
2423 if (undo_fn) 2439 if (undo_fn)
2424 jbd_unlock_bh_state(group_bh); 2440 jbd_unlock_bh_state(group_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fa1be1b304d1..38f986d2447e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,7 +41,6 @@
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/smp_lock.h>
45 44
46#define MLOG_MASK_PREFIX ML_SUPER 45#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -162,6 +161,7 @@ enum {
162 Opt_nointr, 161 Opt_nointr,
163 Opt_hb_none, 162 Opt_hb_none,
164 Opt_hb_local, 163 Opt_hb_local,
164 Opt_hb_global,
165 Opt_data_ordered, 165 Opt_data_ordered,
166 Opt_data_writeback, 166 Opt_data_writeback,
167 Opt_atime_quantum, 167 Opt_atime_quantum,
@@ -177,6 +177,8 @@ enum {
177 Opt_noacl, 177 Opt_noacl,
178 Opt_usrquota, 178 Opt_usrquota,
179 Opt_grpquota, 179 Opt_grpquota,
180 Opt_coherency_buffered,
181 Opt_coherency_full,
180 Opt_resv_level, 182 Opt_resv_level,
181 Opt_dir_resv_level, 183 Opt_dir_resv_level,
182 Opt_err, 184 Opt_err,
@@ -190,6 +192,7 @@ static const match_table_t tokens = {
190 {Opt_nointr, "nointr"}, 192 {Opt_nointr, "nointr"},
191 {Opt_hb_none, OCFS2_HB_NONE}, 193 {Opt_hb_none, OCFS2_HB_NONE},
192 {Opt_hb_local, OCFS2_HB_LOCAL}, 194 {Opt_hb_local, OCFS2_HB_LOCAL},
195 {Opt_hb_global, OCFS2_HB_GLOBAL},
193 {Opt_data_ordered, "data=ordered"}, 196 {Opt_data_ordered, "data=ordered"},
194 {Opt_data_writeback, "data=writeback"}, 197 {Opt_data_writeback, "data=writeback"},
195 {Opt_atime_quantum, "atime_quantum=%u"}, 198 {Opt_atime_quantum, "atime_quantum=%u"},
@@ -205,6 +208,8 @@ static const match_table_t tokens = {
205 {Opt_noacl, "noacl"}, 208 {Opt_noacl, "noacl"},
206 {Opt_usrquota, "usrquota"}, 209 {Opt_usrquota, "usrquota"},
207 {Opt_grpquota, "grpquota"}, 210 {Opt_grpquota, "grpquota"},
211 {Opt_coherency_buffered, "coherency=buffered"},
212 {Opt_coherency_full, "coherency=full"},
208 {Opt_resv_level, "resv_level=%u"}, 213 {Opt_resv_level, "resv_level=%u"},
209 {Opt_dir_resv_level, "dir_resv_level=%u"}, 214 {Opt_dir_resv_level, "dir_resv_level=%u"},
210 {Opt_err, NULL} 215 {Opt_err, NULL}
@@ -514,11 +519,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
514 519
515 mlog_entry_void(); 520 mlog_entry_void();
516 521
517 for (i = 0; i < NUM_SYSTEM_INODES; i++) { 522 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
518 inode = osb->system_inodes[i]; 523 inode = osb->global_system_inodes[i];
519 if (inode) { 524 if (inode) {
520 iput(inode); 525 iput(inode);
521 osb->system_inodes[i] = NULL; 526 osb->global_system_inodes[i] = NULL;
522 } 527 }
523 } 528 }
524 529
@@ -534,6 +539,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
534 osb->root_inode = NULL; 539 osb->root_inode = NULL;
535 } 540 }
536 541
542 if (!osb->local_system_inodes)
543 goto out;
544
545 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
546 if (osb->local_system_inodes[i]) {
547 iput(osb->local_system_inodes[i]);
548 osb->local_system_inodes[i] = NULL;
549 }
550 }
551
552 kfree(osb->local_system_inodes);
553 osb->local_system_inodes = NULL;
554
555out:
537 mlog_exit(0); 556 mlog_exit(0);
538} 557}
539 558
@@ -550,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
550 return &oi->vfs_inode; 569 return &oi->vfs_inode;
551} 570}
552 571
553static void ocfs2_destroy_inode(struct inode *inode) 572static void ocfs2_i_callback(struct rcu_head *head)
554{ 573{
574 struct inode *inode = container_of(head, struct inode, i_rcu);
575 INIT_LIST_HEAD(&inode->i_dentry);
555 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); 576 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
556} 577}
557 578
579static void ocfs2_destroy_inode(struct inode *inode)
580{
581 call_rcu(&inode->i_rcu, ocfs2_i_callback);
582}
583
558static unsigned long long ocfs2_max_file_offset(unsigned int bbits, 584static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
559 unsigned int cbits) 585 unsigned int cbits)
560{ 586{
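
The destroy_inode change above is part of the tree-wide conversion to RCU-freed inodes: the cache object must outlive any lock-free path walk still dereferencing it, so the actual kmem_cache_free() is deferred past a grace period. A generic kernel-side sketch of the pattern, assuming only <linux/rcupdate.h> and <linux/slab.h>; names are illustrative:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	struct rcu_head rcu;
	/* ... payload read by RCU readers ... */
};

static void obj_free_rcu(struct rcu_head *head)
{
	struct obj *o = container_of(head, struct obj, rcu);

	kfree(o);			/* runs after all current readers */
}

static void obj_destroy(struct obj *o)
{
	call_rcu(&o->rcu, obj_free_rcu);	/* never free inline */
}
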
@@ -608,8 +634,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
608 int ret = 0; 634 int ret = 0;
609 struct mount_options parsed_options; 635 struct mount_options parsed_options;
610 struct ocfs2_super *osb = OCFS2_SB(sb); 636 struct ocfs2_super *osb = OCFS2_SB(sb);
611 637 u32 tmp;
612 lock_kernel();
613 638
614 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
615 !ocfs2_check_set_options(sb, &parsed_options)) { 640 !ocfs2_check_set_options(sb, &parsed_options)) {
@@ -617,8 +642,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
617 goto out; 642 goto out;
618 } 643 }
619 644
620 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != 645 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
621 (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 646 OCFS2_MOUNT_HB_NONE;
647 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
622 ret = -EINVAL; 648 ret = -EINVAL;
623 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); 649 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
624 goto out; 650 goto out;
@@ -717,7 +743,6 @@ unlock_osb:
717 MS_POSIXACL : 0); 743 MS_POSIXACL : 0);
718 } 744 }
719out: 745out:
720 unlock_kernel();
721 return ret; 746 return ret;
722} 747}
723 748
@@ -809,23 +834,29 @@ bail:
809 834
810static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) 835static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
811{ 836{
812 if (ocfs2_mount_local(osb)) { 837 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
813 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 838
839 if (osb->s_mount_opt & hb_enabled) {
840 if (ocfs2_mount_local(osb)) {
814 mlog(ML_ERROR, "Cannot heartbeat on a locally " 841 mlog(ML_ERROR, "Cannot heartbeat on a locally "
815 "mounted device.\n"); 842 "mounted device.\n");
816 return -EINVAL; 843 return -EINVAL;
817 } 844 }
818 } 845 if (ocfs2_userspace_stack(osb)) {
819
820 if (ocfs2_userspace_stack(osb)) {
821 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
822 mlog(ML_ERROR, "Userspace stack expected, but " 846 mlog(ML_ERROR, "Userspace stack expected, but "
823 "o2cb heartbeat arguments passed to mount\n"); 847 "o2cb heartbeat arguments passed to mount\n");
824 return -EINVAL; 848 return -EINVAL;
825 } 849 }
850 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
851 !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
852 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
853 ocfs2_cluster_o2cb_global_heartbeat(osb))) {
854 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
855 return -EINVAL;
856 }
826 } 857 }
827 858
828 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 859 if (!(osb->s_mount_opt & hb_enabled)) {
829 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && 860 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
830 !ocfs2_userspace_stack(osb)) { 861 !ocfs2_userspace_stack(osb)) {
831 mlog(ML_ERROR, "Heartbeat has to be started to mount " 862 mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -962,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
962} 993}
963 994
964/* Handle quota on quotactl */ 995/* Handle quota on quotactl */
965static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 996static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
966 char *path)
967{ 997{
968 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 998 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
969 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 999 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -982,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type)
982} 1012}
983 1013
984static const struct quotactl_ops ocfs2_quotactl_ops = { 1014static const struct quotactl_ops ocfs2_quotactl_ops = {
985 .quota_on = ocfs2_quota_on, 1015 .quota_on_meta = ocfs2_quota_on,
986 .quota_off = ocfs2_quota_off, 1016 .quota_off = ocfs2_quota_off,
987 .quota_sync = dquot_quota_sync, 1017 .quota_sync = dquot_quota_sync,
988 .get_info = dquot_get_dqinfo, 1018 .get_info = dquot_get_dqinfo,
@@ -1211,14 +1241,12 @@ read_super_error:
1211 return status; 1241 return status;
1212} 1242}
1213 1243
1214static int ocfs2_get_sb(struct file_system_type *fs_type, 1244static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1215 int flags, 1245 int flags,
1216 const char *dev_name, 1246 const char *dev_name,
1217 void *data, 1247 void *data)
1218 struct vfsmount *mnt)
1219{ 1248{
1220 return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super, 1249 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1221 mnt);
1222} 1250}
1223 1251
1224static void ocfs2_kill_sb(struct super_block *sb) 1252static void ocfs2_kill_sb(struct super_block *sb)
@@ -1242,8 +1270,7 @@ out:
1242static struct file_system_type ocfs2_fs_type = { 1270static struct file_system_type ocfs2_fs_type = {
1243 .owner = THIS_MODULE, 1271 .owner = THIS_MODULE,
1244 .name = "ocfs2", 1272 .name = "ocfs2",
1245 .get_sb = ocfs2_get_sb, /* is this called when we mount 1273 .mount = ocfs2_mount,
1246 * the fs? */
1247 .kill_sb = ocfs2_kill_sb, 1274 .kill_sb = ocfs2_kill_sb,
1248 1275
1249 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1276 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
@@ -1291,6 +1318,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1291{ 1318{
1292 int status; 1319 int status;
1293 char *p; 1320 char *p;
1321 u32 tmp;
1294 1322
1295 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 1323 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
1296 options ? options : "(none)"); 1324 options ? options : "(none)");
@@ -1322,7 +1350,10 @@ static int ocfs2_parse_options(struct super_block *sb,
1322 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; 1350 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
1323 break; 1351 break;
1324 case Opt_hb_none: 1352 case Opt_hb_none:
1325 mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; 1353 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
1354 break;
1355 case Opt_hb_global:
1356 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
1326 break; 1357 break;
1327 case Opt_barrier: 1358 case Opt_barrier:
1328 if (match_int(&args[0], &option)) { 1359 if (match_int(&args[0], &option)) {
@@ -1438,6 +1469,12 @@ static int ocfs2_parse_options(struct super_block *sb,
1438 case Opt_grpquota: 1469 case Opt_grpquota:
1439 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1470 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1440 break; 1471 break;
1472 case Opt_coherency_buffered:
1473 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
1474 break;
1475 case Opt_coherency_full:
1476 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
1477 break;
1441 case Opt_acl: 1478 case Opt_acl:
1442 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1479 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1443 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; 1480 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1477,6 +1514,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1477 } 1514 }
1478 } 1515 }
1479 1516
1517 /* Ensure only one heartbeat mode */
1518 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
1519 OCFS2_MOUNT_HB_NONE);
1520 if (hweight32(tmp) != 1) {
1521 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1522 status = 0;
1523 goto bail;
1524 }
1525
1480 status = 1; 1526 status = 1;
1481 1527
1482bail: 1528bail:
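
Since heartbeat=none is now a real bit rather than merely the absence of the local bit, exactly one of the three modes must survive option parsing; hweight32(), a population count, turns that into an exact-one test. A tiny sketch of the same check with illustrative flag values (using the compiler's popcount builtin as a stand-in for hweight32()):

#include <stdio.h>

#define HB_LOCAL  (1u << 0)
#define HB_GLOBAL (1u << 1)
#define HB_NONE   (1u << 2)

static int valid_hb(unsigned int opt)
{
	unsigned int hb = opt & (HB_LOCAL | HB_GLOBAL | HB_NONE);

	return __builtin_popcount(hb) == 1;	/* stand-in for hweight32() */
}

int main(void)
{
	printf("%d %d %d\n",
	       valid_hb(HB_LOCAL),		/* 1: exactly one mode */
	       valid_hb(HB_LOCAL | HB_GLOBAL),	/* 0: two modes */
	       valid_hb(0));			/* 0: none chosen */
	return 0;
}
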
@@ -1490,10 +1536,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1490 unsigned long opts = osb->s_mount_opt; 1536 unsigned long opts = osb->s_mount_opt;
1491 unsigned int local_alloc_megs; 1537 unsigned int local_alloc_megs;
1492 1538
1493 if (opts & OCFS2_MOUNT_HB_LOCAL) 1539 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
1494 seq_printf(s, ",_netdev,heartbeat=local"); 1540 seq_printf(s, ",_netdev");
1495 else 1541 if (opts & OCFS2_MOUNT_HB_LOCAL)
1496 seq_printf(s, ",heartbeat=none"); 1542 seq_printf(s, ",%s", OCFS2_HB_LOCAL);
1543 else
1544 seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
1545 } else
1546 seq_printf(s, ",%s", OCFS2_HB_NONE);
1497 1547
1498 if (opts & OCFS2_MOUNT_NOINTR) 1548 if (opts & OCFS2_MOUNT_NOINTR)
1499 seq_printf(s, ",nointr"); 1549 seq_printf(s, ",nointr");
@@ -1536,6 +1586,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1536 if (opts & OCFS2_MOUNT_GRPQUOTA) 1586 if (opts & OCFS2_MOUNT_GRPQUOTA)
1537 seq_printf(s, ",grpquota"); 1587 seq_printf(s, ",grpquota");
1538 1588
1589 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
1590 seq_printf(s, ",coherency=buffered");
1591 else
1592 seq_printf(s, ",coherency=full");
1593
1539 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1594 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1540 seq_printf(s, ",nouser_xattr"); 1595 seq_printf(s, ",nouser_xattr");
1541 else 1596 else
@@ -1640,13 +1695,9 @@ static void ocfs2_put_super(struct super_block *sb)
1640{ 1695{
1641 mlog_entry("(0x%p)\n", sb); 1696 mlog_entry("(0x%p)\n", sb);
1642 1697
1643 lock_kernel();
1644
1645 ocfs2_sync_blockdev(sb); 1698 ocfs2_sync_blockdev(sb);
1646 ocfs2_dismount_volume(sb, 0); 1699 ocfs2_dismount_volume(sb, 0);
1647 1700
1648 unlock_kernel();
1649
1650 mlog_exit_void(); 1701 mlog_exit_void();
1651} 1702}
1652 1703
@@ -1990,6 +2041,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
1990 return 0; 2041 return 0;
1991} 2042}
1992 2043
2044/* Make sure entire volume is addressable by our journal. Requires
2045 osb_clusters_at_boot to be valid and for the journal to have been
2046 initialized by ocfs2_journal_init(). */
2047static int ocfs2_journal_addressable(struct ocfs2_super *osb)
2048{
2049 int status = 0;
2050 u64 max_block =
2051 ocfs2_clusters_to_blocks(osb->sb,
2052 osb->osb_clusters_at_boot) - 1;
2053
2054 /* 32-bit block number is always OK. */
2055 if (max_block <= (u32)~0ULL)
2056 goto out;
2057
2058 /* Volume is "huge", so see if our journal is new enough to
2059 support it. */
2060 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
2061 OCFS2_FEATURE_COMPAT_JBD2_SB) &&
2062 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
2063 JBD2_FEATURE_INCOMPAT_64BIT))) {
2064 mlog(ML_ERROR, "The journal cannot address the entire volume. "
2065 "Enable the 'block64' journal option with tunefs.ocfs2");
2066 status = -EFBIG;
2067 goto out;
2068 }
2069
2070 out:
2071 return status;
2072}
2073
1993static int ocfs2_initialize_super(struct super_block *sb, 2074static int ocfs2_initialize_super(struct super_block *sb,
1994 struct buffer_head *bh, 2075 struct buffer_head *bh,
1995 int sector_size, 2076 int sector_size,
@@ -2002,6 +2083,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2002 struct ocfs2_journal *journal; 2083 struct ocfs2_journal *journal;
2003 __le32 uuid_net_key; 2084 __le32 uuid_net_key;
2004 struct ocfs2_super *osb; 2085 struct ocfs2_super *osb;
2086 u64 total_blocks;
2005 2087
2006 mlog_entry_void(); 2088 mlog_entry_void();
2007 2089
@@ -2014,6 +2096,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2014 2096
2015 sb->s_fs_info = osb; 2097 sb->s_fs_info = osb;
2016 sb->s_op = &ocfs2_sops; 2098 sb->s_op = &ocfs2_sops;
2099 sb->s_d_op = &ocfs2_dentry_ops;
2017 sb->s_export_op = &ocfs2_export_ops; 2100 sb->s_export_op = &ocfs2_export_ops;
2018 sb->s_qcop = &ocfs2_quotactl_ops; 2101 sb->s_qcop = &ocfs2_quotactl_ops;
2019 sb->dq_op = &ocfs2_quota_operations; 2102 sb->dq_op = &ocfs2_quota_operations;
@@ -2060,6 +2143,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2060 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2143 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
2061 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2144 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
2062 2145
2146 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2147 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2148 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2149 osb->max_slots);
2150 status = -EINVAL;
2151 goto bail;
2152 }
2153 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2154
2063 ocfs2_orphan_scan_init(osb); 2155 ocfs2_orphan_scan_init(osb);
2064 2156
2065 status = ocfs2_recovery_init(osb); 2157 status = ocfs2_recovery_init(osb);
@@ -2098,15 +2190,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2098 goto bail; 2190 goto bail;
2099 } 2191 }
2100 2192
2101 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2102 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2103 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2104 osb->max_slots);
2105 status = -EINVAL;
2106 goto bail;
2107 }
2108 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2109
2110 osb->slot_recovery_generations = 2193 osb->slot_recovery_generations =
2111 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), 2194 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
2112 GFP_KERNEL); 2195 GFP_KERNEL);
@@ -2149,7 +2232,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2149 goto bail; 2232 goto bail;
2150 } 2233 }
2151 2234
2152 if (ocfs2_userspace_stack(osb)) { 2235 if (ocfs2_clusterinfo_valid(osb)) {
2236 osb->osb_stackflags =
2237 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2153 memcpy(osb->osb_cluster_stack, 2238 memcpy(osb->osb_cluster_stack,
2154 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2239 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2155 OCFS2_STACK_LABEL_LEN); 2240 OCFS2_STACK_LABEL_LEN);
@@ -2214,11 +2299,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2214 goto bail; 2299 goto bail;
2215 } 2300 }
2216 2301
2217 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) 2302 total_blocks = ocfs2_clusters_to_blocks(osb->sb,
2218 > (u32)~0UL) { 2303 le32_to_cpu(di->i_clusters));
2219 mlog(ML_ERROR, "Volume might try to write to blocks beyond " 2304
2220 "what jbd can address in 32 bits.\n"); 2305 status = generic_check_addressable(osb->sb->s_blocksize_bits,
2221 status = -EINVAL; 2306 total_blocks);
2307 if (status) {
2308 mlog(ML_ERROR, "Volume too large "
2309 "to mount safely on this system");
2310 status = -EFBIG;
2222 goto bail; 2311 goto bail;
2223 } 2312 }
2224 2313
@@ -2380,6 +2469,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2380 goto finally; 2469 goto finally;
2381 } 2470 }
2382 2471
2472 /* Now that the journal has been initialized, check to make sure
2473 the entire volume is addressable. */
2474 status = ocfs2_journal_addressable(osb);
2475 if (status)
2476 goto finally;
2477
2383 /* If the journal was unmounted cleanly then we don't want to 2478 /* If the journal was unmounted cleanly then we don't want to
2384 * recover anything. Otherwise, journal_load will do that 2479 * recover anything. Otherwise, journal_load will do that
2385 * dirty work for us :) */ 2480 * dirty work for us :) */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190cdbf1..902efb23b6a6 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
44 int type, 44 int type,
45 u32 slot); 45 u32 slot);
46 46
47static inline int is_global_system_inode(int type);
48static inline int is_in_system_inode_array(struct ocfs2_super *osb,
49 int type,
50 u32 slot);
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC 47#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; 48static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
54#endif 49#endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
59 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; 54 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
60} 55}
61 56
62static inline int is_in_system_inode_array(struct ocfs2_super *osb, 57static struct inode **get_local_system_inode(struct ocfs2_super *osb,
63 int type, 58 int type,
64 u32 slot) 59 u32 slot)
65{ 60{
66 return slot == osb->slot_num || is_global_system_inode(type); 61 int index;
62 struct inode **local_system_inodes, **free = NULL;
63
64 BUG_ON(slot == OCFS2_INVALID_SLOT);
65 BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
66 type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
67
68 spin_lock(&osb->osb_lock);
69 local_system_inodes = osb->local_system_inodes;
70 spin_unlock(&osb->osb_lock);
71
72 if (unlikely(!local_system_inodes)) {
73 local_system_inodes = kzalloc(sizeof(struct inode *) *
74 NUM_LOCAL_SYSTEM_INODES *
75 osb->max_slots,
76 GFP_NOFS);
77 if (!local_system_inodes) {
78 mlog_errno(-ENOMEM);
79 /*
80 * return NULL here so that ocfs2_get_system_file_inode
81 * will try to create an inode and use it. We will try
82 * to initialize local_system_inodes next time.
83 */
84 return NULL;
85 }
86
87 spin_lock(&osb->osb_lock);
88 if (osb->local_system_inodes) {
89 /* Someone has initialized it for us. */
90 free = local_system_inodes;
91 local_system_inodes = osb->local_system_inodes;
92 } else
93 osb->local_system_inodes = local_system_inodes;
94 spin_unlock(&osb->osb_lock);
95 if (unlikely(free))
96 kfree(free);
97 }
98
99 index = (slot * NUM_LOCAL_SYSTEM_INODES) +
100 (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
101
102 return &local_system_inodes[index];
67} 103}
68 104
69struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, 105struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
74 struct inode **arr = NULL; 110 struct inode **arr = NULL;
75 111
76 /* avoid the lookup if cached in local system file array */ 112 /* avoid the lookup if cached in local system file array */
77 if (is_in_system_inode_array(osb, type, slot)) 113 if (is_global_system_inode(type)) {
78 arr = &(osb->system_inodes[type]); 114 arr = &(osb->global_system_inodes[type]);
115 } else
116 arr = get_local_system_inode(osb, type, slot);
79 117
80 if (arr && ((inode = *arr) != NULL)) { 118 if (arr && ((inode = *arr) != NULL)) {
81 /* get a ref in addition to the array ref */ 119 /* get a ref in addition to the array ref */
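
Two things happen in the sysfile.c change above: local system inodes move out of the old fixed array into a lazily allocated max_slots x NUM_LOCAL_SYSTEM_INODES table, and the table itself is allocated optimistically outside osb_lock with a recheck under it, freeing the loser on a race. A generic userspace sketch of that pattern; the names and the pthread spinlock are stand-ins for the kernel's osb_lock, and the lock is assumed already initialized with pthread_spin_init().

#include <stdlib.h>
#include <pthread.h>

#define NUM_LOCAL 8			/* illustrative table width */

struct osb_like {
	pthread_spinlock_t lock;	/* stand-in for osb_lock */
	void **local;			/* NUM_LOCAL * max_slots entries */
	unsigned max_slots;
};

static void **get_slot_entry(struct osb_like *osb, unsigned type,
			     unsigned slot)
{
	void **arr, **loser = NULL;

	pthread_spin_lock(&osb->lock);
	arr = osb->local;
	pthread_spin_unlock(&osb->lock);

	if (!arr) {
		/* allocate outside the lock ... */
		arr = calloc(NUM_LOCAL * osb->max_slots, sizeof(*arr));
		if (!arr)
			return NULL;	/* caller falls back to a fresh lookup */

		/* ... then recheck under it */
		pthread_spin_lock(&osb->lock);
		if (osb->local) {	/* someone beat us to it */
			loser = arr;
			arr = osb->local;
		} else
			osb->local = arr;
		pthread_spin_unlock(&osb->lock);
		free(loser);
	}

	/* flattened 2-D indexing: one row of NUM_LOCAL entries per slot */
	return &arr[slot * NUM_LOCAL + type];
}
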
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 06fa5e77c40e..67cd43914641 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
7081 goto out; 7081 goto out;
7082 } 7082 }
7083 7083
7084 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) 7084 if (!indexed)
7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); 7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
7086 else 7086 else
7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); 7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 14a22863291a..e043c4cb9a97 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -557,17 +557,16 @@ end:
557 return ret; 557 return ret;
558} 558}
559 559
560static int omfs_get_sb(struct file_system_type *fs_type, 560static struct dentry *omfs_mount(struct file_system_type *fs_type,
561 int flags, const char *dev_name, 561 int flags, const char *dev_name, void *data)
562 void *data, struct vfsmount *m)
563{ 562{
564 return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m); 563 return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
565} 564}
566 565
567static struct file_system_type omfs_fs_type = { 566static struct file_system_type omfs_fs_type = {
568 .owner = THIS_MODULE, 567 .owner = THIS_MODULE,
569 .name = "omfs", 568 .name = "omfs",
570 .get_sb = omfs_get_sb, 569 .mount = omfs_mount,
571 .kill_sb = kill_block_super, 570 .kill_sb = kill_block_super,
572 .fs_flags = FS_REQUIRES_DEV, 571 .fs_flags = FS_REQUIRES_DEV,
573}; 572};
diff --git a/fs/open.c b/fs/open.c
index d74e1983e8dc..e52389e1f05b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
223 return -EINVAL; 223 return -EINVAL;
224 224
225 /* Return error if mode is not supported */ 225 /* Return error if mode is not supported */
226 if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) 226 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
227 return -EOPNOTSUPP;
228
229 /* Punch hole must have keep size set */
230 if ((mode & FALLOC_FL_PUNCH_HOLE) &&
231 !(mode & FALLOC_FL_KEEP_SIZE))
227 return -EOPNOTSUPP; 232 return -EOPNOTSUPP;
228 233
229 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
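
With the hook moved from inode_operations to file_operations, the mode check above also admits FALLOC_FL_PUNCH_HOLE, but only in combination with FALLOC_FL_KEEP_SIZE. A userspace sketch of a valid punch-hole call; the file name is illustrative, and the filesystem may still return EOPNOTSUPP if it does not implement hole punching:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 1 << 20))	/* 1 MiB of (sparse) file size */
		return 1;

	/* Deallocate 256K at offset 128K without changing i_size. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 * 1024, 256 * 1024))
		perror("fallocate");	/* EOPNOTSUPP if fs lacks support */

	close(fd);
	return 0;
}
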
@@ -250,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
250 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 255 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
251 return -EFBIG; 256 return -EFBIG;
252 257
253 if (!inode->i_op->fallocate) 258 if (!file->f_op->fallocate)
254 return -EOPNOTSUPP; 259 return -EOPNOTSUPP;
255 260
256 return inode->i_op->fallocate(inode, mode, offset, len); 261 return file->f_op->fallocate(file, mode, offset, len);
257} 262}
258 263
259SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 264SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
@@ -786,11 +791,11 @@ struct file *nameidata_to_filp(struct nameidata *nd)
786 /* Pick up the filp from the open intent */ 791 /* Pick up the filp from the open intent */
787 filp = nd->intent.open.file; 792 filp = nd->intent.open.file;
788 /* Has the filesystem initialised the file for us? */ 793 /* Has the filesystem initialised the file for us? */
789 if (filp->f_path.dentry == NULL) 794 if (filp->f_path.dentry == NULL) {
795 path_get(&nd->path);
790 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, 796 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
791 NULL, cred); 797 NULL, cred);
792 else 798 }
793 path_put(&nd->path);
794 return filp; 799 return filp;
795} 800}
796 801
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ffcd04f0012c..a2a5bff774e3 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
343 return &oi->vfs_inode; 343 return &oi->vfs_inode;
344} 344}
345 345
346static void openprom_destroy_inode(struct inode *inode) 346static void openprom_i_callback(struct rcu_head *head)
347{ 347{
348 struct inode *inode = container_of(head, struct inode, i_rcu);
349 INIT_LIST_HEAD(&inode->i_dentry);
348 kmem_cache_free(op_inode_cachep, OP_I(inode)); 350 kmem_cache_free(op_inode_cachep, OP_I(inode));
349} 351}
350 352
353static void openprom_destroy_inode(struct inode *inode)
354{
355 call_rcu(&inode->i_rcu, openprom_i_callback);
356}
357
351static struct inode *openprom_iget(struct super_block *sb, ino_t ino) 358static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
352{ 359{
353 struct inode *inode; 360 struct inode *inode;
@@ -415,16 +422,16 @@ out_no_root:
415 return ret; 422 return ret;
416} 423}
417 424
418static int openprom_get_sb(struct file_system_type *fs_type, 425static struct dentry *openprom_mount(struct file_system_type *fs_type,
419 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 426 int flags, const char *dev_name, void *data)
420{ 427{
421 return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt); 428 return mount_single(fs_type, flags, data, openprom_fill_super);
422} 429}
423 430
424static struct file_system_type openprom_fs_type = { 431static struct file_system_type openprom_fs_type = {
425 .owner = THIS_MODULE, 432 .owner = THIS_MODULE,
426 .name = "openpromfs", 433 .name = "openpromfs",
427 .get_sb = openprom_get_sb, 434 .mount = openprom_mount,
428 .kill_sb = kill_anon_super, 435 .kill_sb = kill_anon_super,
429}; 436};
430 437
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..9c21119512b9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev,
237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
238} 238}
239 239
240ssize_t part_ro_show(struct device *dev,
241 struct device_attribute *attr, char *buf)
242{
243 struct hd_struct *p = dev_to_part(dev);
244 return sprintf(buf, "%d\n", p->policy ? 1 : 0);
245}
246
240ssize_t part_alignment_offset_show(struct device *dev, 247ssize_t part_alignment_offset_show(struct device *dev,
241 struct device_attribute *attr, char *buf) 248 struct device_attribute *attr, char *buf)
242{ 249{
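
The new attribute simply exports hd_struct.policy, so each partition's read-only state becomes visible in sysfs next to start and size. A small userspace sketch reading it; the sda/sda1 path is illustrative:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sda/sda1/ro", "r");
	int ro;

	if (!f)
		return 1;
	if (fscanf(f, "%d", &ro) == 1)
		printf("sda1 is %s\n", ro ? "read-only" : "read-write");
	fclose(f);
	return 0;
}
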
@@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev,
312static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); 319static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
313static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 320static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
314static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 321static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
322static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
315static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 323static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
316static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, 324static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
317 NULL); 325 NULL);
@@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = {
326 &dev_attr_partition.attr, 334 &dev_attr_partition.attr,
327 &dev_attr_start.attr, 335 &dev_attr_start.attr,
328 &dev_attr_size.attr, 336 &dev_attr_size.attr,
337 &dev_attr_ro.attr,
329 &dev_attr_alignment_offset.attr, 338 &dev_attr_alignment_offset.attr,
330 &dev_attr_discard_alignment.attr, 339 &dev_attr_discard_alignment.attr,
331 &dev_attr_stat.attr, 340 &dev_attr_stat.attr,
@@ -352,6 +361,7 @@ static void part_release(struct device *dev)
352{ 361{
353 struct hd_struct *p = dev_to_part(dev); 362 struct hd_struct *p = dev_to_part(dev);
354 free_part_stats(p); 363 free_part_stats(p);
364 free_part_info(p);
355 kfree(p); 365 kfree(p);
356} 366}
357 367
@@ -371,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
371 put_device(part_to_dev(part)); 381 put_device(part_to_dev(part));
372} 382}
373 383
384void __delete_partition(struct hd_struct *part)
385{
386 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
387}
388
374void delete_partition(struct gendisk *disk, int partno) 389void delete_partition(struct gendisk *disk, int partno)
375{ 390{
376 struct disk_part_tbl *ptbl = disk->part_tbl; 391 struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -389,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno)
389 kobject_put(part->holder_dir); 404 kobject_put(part->holder_dir);
390 device_del(part_to_dev(part)); 405 device_del(part_to_dev(part));
391 406
392 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 407 hd_struct_put(part);
393} 408}
394 409
395static ssize_t whole_disk_show(struct device *dev, 410static ssize_t whole_disk_show(struct device *dev,
@@ -401,7 +416,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
401 whole_disk_show, NULL); 416 whole_disk_show, NULL);
402 417
403struct hd_struct *add_partition(struct gendisk *disk, int partno, 418struct hd_struct *add_partition(struct gendisk *disk, int partno,
404 sector_t start, sector_t len, int flags) 419 sector_t start, sector_t len, int flags,
420 struct partition_meta_info *info)
405{ 421{
406 struct hd_struct *p; 422 struct hd_struct *p;
407 dev_t devt = MKDEV(0, 0); 423 dev_t devt = MKDEV(0, 0);
@@ -438,6 +454,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
438 p->partno = partno; 454 p->partno = partno;
439 p->policy = get_disk_ro(disk); 455 p->policy = get_disk_ro(disk);
440 456
457 if (info) {
458 struct partition_meta_info *pinfo = alloc_part_info(disk);
459 if (!pinfo)
460 goto out_free_stats;
461 memcpy(pinfo, info, sizeof(*info));
462 p->info = pinfo;
463 }
464
441 dname = dev_name(ddev); 465 dname = dev_name(ddev);
442 if (isdigit(dname[strlen(dname) - 1])) 466 if (isdigit(dname[strlen(dname) - 1]))
443 dev_set_name(pdev, "%sp%d", dname, partno); 467 dev_set_name(pdev, "%sp%d", dname, partno);
@@ -451,7 +475,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
451 475
452 err = blk_alloc_devt(p, &devt); 476 err = blk_alloc_devt(p, &devt);
453 if (err) 477 if (err)
454 goto out_free_stats; 478 goto out_free_info;
455 pdev->devt = devt; 479 pdev->devt = devt;
456 480
457 /* delay uevent until 'holders' subdir is created */ 481 /* delay uevent until 'holders' subdir is created */
@@ -479,8 +503,11 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
479 if (!dev_get_uevent_suppress(ddev)) 503 if (!dev_get_uevent_suppress(ddev))
480 kobject_uevent(&pdev->kobj, KOBJ_ADD); 504 kobject_uevent(&pdev->kobj, KOBJ_ADD);
481 505
506 hd_ref_init(p);
482 return p; 507 return p;
483 508
509out_free_info:
510 free_part_info(p);
484out_free_stats: 511out_free_stats:
485 free_part_stats(p); 512 free_part_stats(p);
486out_free: 513out_free:
@@ -495,65 +522,6 @@ out_put:
495 return ERR_PTR(err); 522 return ERR_PTR(err);
496} 523}
497 524
498/* Not exported, helper to add_disk(). */
499void register_disk(struct gendisk *disk)
500{
501 struct device *ddev = disk_to_dev(disk);
502 struct block_device *bdev;
503 struct disk_part_iter piter;
504 struct hd_struct *part;
505 int err;
506
507 ddev->parent = disk->driverfs_dev;
508
509 dev_set_name(ddev, disk->disk_name);
510
511 /* delay uevents, until we scanned partition table */
512 dev_set_uevent_suppress(ddev, 1);
513
514 if (device_add(ddev))
515 return;
516#ifndef CONFIG_SYSFS_DEPRECATED
517 err = sysfs_create_link(block_depr, &ddev->kobj,
518 kobject_name(&ddev->kobj));
519 if (err) {
520 device_del(ddev);
521 return;
522 }
523#endif
524 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
525 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
526
527 /* No minors to use for partitions */
528 if (!disk_partitionable(disk))
529 goto exit;
530
531 /* No such device (e.g., media were just removed) */
532 if (!get_capacity(disk))
533 goto exit;
534
535 bdev = bdget_disk(disk, 0);
536 if (!bdev)
537 goto exit;
538
539 bdev->bd_invalidated = 1;
540 err = blkdev_get(bdev, FMODE_READ);
541 if (err < 0)
542 goto exit;
543 blkdev_put(bdev, FMODE_READ);
544
545exit:
546 /* announce disk after possible partitions are created */
547 dev_set_uevent_suppress(ddev, 0);
548 kobject_uevent(&ddev->kobj, KOBJ_ADD);
549
550 /* announce possible partitions */
551 disk_part_iter_init(&piter, disk, 0);
552 while ((part = disk_part_iter_next(&piter)))
553 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
554 disk_part_iter_exit(&piter);
555}
556
557static bool disk_unlock_native_capacity(struct gendisk *disk) 525static bool disk_unlock_native_capacity(struct gendisk *disk)
558{ 526{
559 const struct block_device_operations *bdops = disk->fops; 527 const struct block_device_operations *bdops = disk->fops;
@@ -642,6 +610,7 @@ rescan:
642 /* add partitions */ 610 /* add partitions */
643 for (p = 1; p < state->limit; p++) { 611 for (p = 1; p < state->limit; p++) {
644 sector_t size, from; 612 sector_t size, from;
613 struct partition_meta_info *info = NULL;
645 614
646 size = state->parts[p].size; 615 size = state->parts[p].size;
647 if (!size) 616 if (!size)
@@ -675,8 +644,12 @@ rescan:
675 size = get_capacity(disk) - from; 644 size = get_capacity(disk) - from;
676 } 645 }
677 } 646 }
647
648 if (state->parts[p].has_info)
649 info = &state->parts[p].info;
678 part = add_partition(disk, p, from, size, 650 part = add_partition(disk, p, from, size,
679 state->parts[p].flags); 651 state->parts[p].flags,
652 &state->parts[p].info);
680 if (IS_ERR(part)) { 653 if (IS_ERR(part)) {
681 printk(KERN_ERR " %s: p%d could not be added: %ld\n", 654 printk(KERN_ERR " %s: p%d could not be added: %ld\n",
682 disk->disk_name, p, -PTR_ERR(part)); 655 disk->disk_name, p, -PTR_ERR(part));
@@ -711,34 +684,3 @@ fail:
711} 684}
712 685
713EXPORT_SYMBOL(read_dev_sector); 686EXPORT_SYMBOL(read_dev_sector);
714
715void del_gendisk(struct gendisk *disk)
716{
717 struct disk_part_iter piter;
718 struct hd_struct *part;
719
720 /* invalidate stuff */
721 disk_part_iter_init(&piter, disk,
722 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
723 while ((part = disk_part_iter_next(&piter))) {
724 invalidate_partition(disk, part->partno);
725 delete_partition(disk, part->partno);
726 }
727 disk_part_iter_exit(&piter);
728
729 invalidate_partition(disk, 0);
730 blk_free_devt(disk_to_dev(disk)->devt);
731 set_capacity(disk, 0);
732 disk->flags &= ~GENHD_FL_UP;
733 unlink_gendisk(disk);
734 part_stat_set_all(&disk->part0, 0);
735 disk->part0.stamp = 0;
736
737 kobject_put(disk->part0.holder_dir);
738 kobject_put(disk->slave_dir);
739 disk->driverfs_dev = NULL;
740#ifndef CONFIG_SYSFS_DEPRECATED
741 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
742#endif
743 device_del(disk_to_dev(disk));
744}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 8e4e103ba216..d68bf4dc3bc2 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
1#include <linux/pagemap.h> 1#include <linux/pagemap.h>
2#include <linux/blkdev.h> 2#include <linux/blkdev.h>
3#include <linux/genhd.h>
3 4
4/* 5/*
5 * add_gd_partition adds a partition's details to the device's partition 6
@@ -12,6 +13,8 @@ struct parsed_partitions {
12 sector_t from; 13 sector_t from;
13 sector_t size; 14 sector_t size;
14 int flags; 15 int flags;
16 bool has_info;
17 struct partition_meta_info info;
15 } parts[DISK_MAX_PARTS]; 18 } parts[DISK_MAX_PARTS];
16 int next; 19 int next;
17 int limit; 20 int limit;
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index dbb44d4bb8a7..ac0ccb5026a2 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
94 * 94 *
95 ************************************************************/ 95 ************************************************************/
96#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/ctype.h>
97#include <linux/math64.h> 98#include <linux/math64.h>
98#include <linux/slab.h> 99#include <linux/slab.h>
99#include "check.h" 100#include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
604 gpt_entry *ptes = NULL; 605 gpt_entry *ptes = NULL;
605 u32 i; 606 u32 i;
606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512; 607 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
608 u8 unparsed_guid[37];
607 609
608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { 610 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
609 kfree(gpt); 611 kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
614 pr_debug("GUID Partition Table is valid! Yea!\n"); 616 pr_debug("GUID Partition Table is valid! Yea!\n");
615 617
616 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 618 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
619 struct partition_meta_info *info;
620 unsigned label_count = 0;
621 unsigned label_max;
617 u64 start = le64_to_cpu(ptes[i].starting_lba); 622 u64 start = le64_to_cpu(ptes[i].starting_lba);
618 u64 size = le64_to_cpu(ptes[i].ending_lba) - 623 u64 size = le64_to_cpu(ptes[i].ending_lba) -
619 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 624 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state)
627 if (!efi_guidcmp(ptes[i].partition_type_guid, 632 if (!efi_guidcmp(ptes[i].partition_type_guid,
628 PARTITION_LINUX_RAID_GUID)) 633 PARTITION_LINUX_RAID_GUID))
629 state->parts[i + 1].flags = ADDPART_FLAG_RAID; 634 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
635
636 info = &state->parts[i + 1].info;
637 /* Instead of doing a manual swap to big endian, reuse the
638 * common ASCII hex format as the interim.
639 */
640 efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
641 part_pack_uuid(unparsed_guid, info->uuid);
642
643 /* Naively convert UTF16-LE to 7 bits. */
644 label_max = min(sizeof(info->volname) - 1,
645 sizeof(ptes[i].partition_name));
646 info->volname[label_max] = 0;
647 while (label_count < label_max) {
648 u8 c = ptes[i].partition_name[label_count] & 0xff;
649 if (c && !isprint(c))
650 c = '!';
651 info->volname[label_count] = c;
652 label_count++;
653 }
654 state->parts[i + 1].has_info = true;
630 } 655 }
631 kfree(ptes); 656 kfree(ptes);
632 kfree(gpt); 657 kfree(gpt);
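The label loop above takes the low byte of each UTF-16LE code unit and substitutes '!' for unprintable values. The same conversion as a self-contained helper; gpt_label_to_ascii() is hypothetical, and le16_to_cpu() stands in for the patch's direct low-byte mask:

#include <linux/ctype.h>
#include <linux/kernel.h>

static void gpt_label_to_ascii(const __le16 *name, unsigned int nunits,
                               u8 *out, unsigned int outlen)
{
        unsigned int i, max = min(outlen - 1, nunits);

        for (i = 0; i < max; i++) {
                u8 c = le16_to_cpu(name[i]) & 0xff;

                /* keep NULs and printable ASCII, mark everything else */
                out[i] = (c && !isprint(c)) ? '!' : c;
        }
        out[i] = '\0';
}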
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 5bf8a04b5d9b..789c625c7aa5 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it under 10 * This program is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU General Public License as published by the Free Software 11 * the terms of the GNU General Public License as published by the Free Software
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d1fb50b28d86..374242c0971a 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it 10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free 11 * under the terms of the GNU General Public License as published by the Free
diff --git a/fs/pipe.c b/fs/pipe.c
index 279eef96c51c..da42f7db50de 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -382,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
382 error = ops->confirm(pipe, buf); 382 error = ops->confirm(pipe, buf);
383 if (error) { 383 if (error) {
384 if (!ret) 384 if (!ret)
385 error = ret; 385 ret = error;
386 break; 386 break;
387 } 387 }
388 388
@@ -441,7 +441,7 @@ redo:
441 break; 441 break;
442 } 442 }
443 if (do_wakeup) { 443 if (do_wakeup) {
444 wake_up_interruptible_sync(&pipe->wait); 444 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
446 } 446 }
447 pipe_wait(pipe); 447 pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
450 450
451 /* Signal writers asynchronously that there is more room. */ 451 /* Signal writers asynchronously that there is more room. */
452 if (do_wakeup) { 452 if (do_wakeup) {
453 wake_up_interruptible_sync(&pipe->wait); 453 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
455 } 455 }
456 if (ret > 0) 456 if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
612 break; 612 break;
613 } 613 }
614 if (do_wakeup) { 614 if (do_wakeup) {
615 wake_up_interruptible_sync(&pipe->wait); 615 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
617 do_wakeup = 0; 617 do_wakeup = 0;
618 } 618 }
@@ -623,7 +623,7 @@ redo2:
623out: 623out:
624 mutex_unlock(&inode->i_mutex); 624 mutex_unlock(&inode->i_mutex);
625 if (do_wakeup) { 625 if (do_wakeup) {
626 wake_up_interruptible_sync(&pipe->wait); 626 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
628 } 628 }
629 if (ret > 0) 629 if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
715 if (!pipe->readers && !pipe->writers) { 715 if (!pipe->readers && !pipe->writers) {
716 free_pipe_info(inode); 716 free_pipe_info(inode);
717 } else { 717 } else {
718 wake_up_interruptible_sync(&pipe->wait); 718 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
721 } 721 }
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
954 if (!inode) 954 if (!inode)
955 goto fail_inode; 955 goto fail_inode;
956 956
957 inode->i_ino = get_next_ino();
958
957 pipe = alloc_pipe_info(inode); 959 pipe = alloc_pipe_info(inode);
958 if (!pipe) 960 if (!pipe)
959 goto fail_iput; 961 goto fail_iput;
@@ -997,12 +999,11 @@ struct file *create_write_pipe(int flags)
997 goto err; 999 goto err;
998 1000
999 err = -ENOMEM; 1001 err = -ENOMEM;
1000 path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); 1002 path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
1001 if (!path.dentry) 1003 if (!path.dentry)
1002 goto err_inode; 1004 goto err_inode;
1003 path.mnt = mntget(pipe_mnt); 1005 path.mnt = mntget(pipe_mnt);
1004 1006
1005 path.dentry->d_op = &pipefs_dentry_operations;
1006 d_instantiate(path.dentry, inode); 1007 d_instantiate(path.dentry, inode);
1007 1008
1008 err = -ENFILE; 1009 err = -ENFILE;
@@ -1197,12 +1198,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
1197 return ret; 1198 return ret;
1198} 1199}
1199 1200
1201/*
1202 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1203 * location, so checking ->i_pipe is not enough to verify that this is a
1204 * pipe.
1205 */
1206struct pipe_inode_info *get_pipe_info(struct file *file)
1207{
1208 struct inode *i = file->f_path.dentry->d_inode;
1209
1210 return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
1211}
1212
1200long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 1213long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1201{ 1214{
1202 struct pipe_inode_info *pipe; 1215 struct pipe_inode_info *pipe;
1203 long ret; 1216 long ret;
1204 1217
1205 pipe = file->f_path.dentry->d_inode->i_pipe; 1218 pipe = get_pipe_info(file);
1206 if (!pipe) 1219 if (!pipe)
1207 return -EBADF; 1220 return -EBADF;
1208 1221
@@ -1239,22 +1252,26 @@ out:
1239 return ret; 1252 return ret;
1240} 1253}
1241 1254
1255static const struct super_operations pipefs_ops = {
1256 .destroy_inode = free_inode_nonrcu,
1257};
1258
1242/* 1259/*
1243 * pipefs should _never_ be mounted by userland - too much of security hassle, 1260 * pipefs should _never_ be mounted by userland - too much of security hassle,
1244 * no real gain from having the whole whorehouse mounted. So we don't need 1261 * no real gain from having the whole whorehouse mounted. So we don't need
1245 * any operations on the root directory. However, we need a non-trivial 1262 * any operations on the root directory. However, we need a non-trivial
1246 * d_name - pipe: will go nicely and kill the special-casing in procfs. 1263 * d_name - pipe: will go nicely and kill the special-casing in procfs.
1247 */ 1264 */
1248static int pipefs_get_sb(struct file_system_type *fs_type, 1265static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1249 int flags, const char *dev_name, void *data, 1266 int flags, const char *dev_name, void *data)
1250 struct vfsmount *mnt)
1251{ 1267{
1252 return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); 1268 return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
1269 &pipefs_dentry_operations, PIPEFS_MAGIC);
1253} 1270}
1254 1271
1255static struct file_system_type pipe_fs_type = { 1272static struct file_system_type pipe_fs_type = {
1256 .name = "pipefs", 1273 .name = "pipefs",
1257 .get_sb = pipefs_get_sb, 1274 .mount = pipefs_mount,
1258 .kill_sb = kill_anon_super, 1275 .kill_sb = kill_anon_super,
1259}; 1276};
1260 1277
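The new get_pipe_info() is the only safe way to reach i_pipe now that the field shares a union slot. A hypothetical caller, mirroring the pipe_fcntl() conversion above:

static long example_pipe_query(struct file *file)
{
        struct pipe_inode_info *pipe = get_pipe_info(file);

        if (!pipe)      /* the union slot may really hold i_bdev or i_cdev */
                return -EBADF;

        /* i_mode said S_ISFIFO, so the pointer is trustworthy here */
        return pipe->buffers;
}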
diff --git a/fs/pnode.c b/fs/pnode.c
index 8066b8dd748f..d42514e32380 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -288,7 +288,7 @@ out:
288 */ 288 */
289static inline int do_refcount_check(struct vfsmount *mnt, int count) 289static inline int do_refcount_check(struct vfsmount *mnt, int count)
290{ 290{
291 int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts; 291 int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
292 return (mycount > count); 292 return (mycount > count);
293} 293}
294 294
@@ -300,7 +300,7 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
300 * Check if any of these mounts that **do not have submounts** 300 * Check if any of these mounts that **do not have submounts**
301 * have more references than 'refcnt'. If so return busy. 301 * have more references than 'refcnt'. If so return busy.
302 * 302 *
303 * vfsmount lock must be held for read or write 303 * vfsmount lock must be held for write
304 */ 304 */
305int propagate_mount_busy(struct vfsmount *mnt, int refcnt) 305int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
306{ 306{
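mnt_get_count() replaces the plain atomic_read() because mount refcounts became per-cpu; the total is only coherent while the vfsmount lock is held for write, which is why the locking comment tightens. Roughly what the SMP summation has to look like (a sketch, not the exact fs/namespace.c code):

static unsigned int mnt_get_count_sketch(struct vfsmount *mnt)
{
        unsigned int count = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
        return count;
}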
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a0ec25..b1cf6bf4b41d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
25EXPORT_SYMBOL(posix_acl_init);
25EXPORT_SYMBOL(posix_acl_alloc); 26EXPORT_SYMBOL(posix_acl_alloc);
26EXPORT_SYMBOL(posix_acl_clone); 27EXPORT_SYMBOL(posix_acl_clone);
27EXPORT_SYMBOL(posix_acl_valid); 28EXPORT_SYMBOL(posix_acl_valid);
@@ -32,6 +33,16 @@ EXPORT_SYMBOL(posix_acl_chmod_masq);
32EXPORT_SYMBOL(posix_acl_permission); 33EXPORT_SYMBOL(posix_acl_permission);
33 34
34/* 35/*
36 * Init a fresh posix_acl
37 */
38void
39posix_acl_init(struct posix_acl *acl, int count)
40{
41 atomic_set(&acl->a_refcount, 1);
42 acl->a_count = count;
43}
44
45/*
35 * Allocate a new ACL with the specified number of entries. 46 * Allocate a new ACL with the specified number of entries.
36 */ 47 */
37struct posix_acl * 48struct posix_acl *
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
40 const size_t size = sizeof(struct posix_acl) + 51 const size_t size = sizeof(struct posix_acl) +
41 count * sizeof(struct posix_acl_entry); 52 count * sizeof(struct posix_acl_entry);
42 struct posix_acl *acl = kmalloc(size, flags); 53 struct posix_acl *acl = kmalloc(size, flags);
43 if (acl) { 54 if (acl)
44 atomic_set(&acl->a_refcount, 1); 55 posix_acl_init(acl, count);
45 acl->a_count = count;
46 }
47 return acl; 56 return acl;
48} 57}
49 58
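Exporting posix_acl_init() lets callers that allocate or embed their own struct posix_acl skip posix_acl_alloc(). A hedged illustration; alloc_acl_example() and its sizing are hypothetical:

#include <linux/slab.h>
#include <linux/posix_acl.h>

static struct posix_acl *alloc_acl_example(int count, gfp_t flags)
{
        struct posix_acl *acl = kmalloc(sizeof(*acl) +
                        count * sizeof(struct posix_acl_entry), flags);

        if (acl)
                posix_acl_init(acl, count);     /* refcount 1, a_count set */
        return acl;
}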
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f06..15af6222f8a4 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
1config PROC_FS 1config PROC_FS
2 bool "/proc file system support" if EMBEDDED 2 bool "/proc file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 This is a virtual file system providing information about the status 5 This is a virtual file system providing information about the status
@@ -33,14 +33,14 @@ config PROC_KCORE
33 depends on PROC_FS && MMU 33 depends on PROC_FS && MMU
34 34
35config PROC_VMCORE 35config PROC_VMCORE
36 bool "/proc/vmcore support (EXPERIMENTAL)" 36 bool "/proc/vmcore support"
37 depends on PROC_FS && CRASH_DUMP 37 depends on PROC_FS && CRASH_DUMP
38 default y 38 default y
39 help 39 help
40 Exports the dump image of crashed kernel in ELF format. 40 Exports the dump image of crashed kernel in ELF format.
41 41
42config PROC_SYSCTL 42config PROC_SYSCTL
43 bool "Sysctl support (/proc/sys)" if EMBEDDED 43 bool "Sysctl support (/proc/sys)" if EXPERT
44 depends on PROC_FS 44 depends on PROC_FS
45 select SYSCTL 45 select SYSCTL
46 default y 46 default y
@@ -61,7 +61,7 @@ config PROC_SYSCTL
61config PROC_PAGE_MONITOR 61config PROC_PAGE_MONITOR
62 default y 62 default y
63 depends on PROC_FS && MMU 63 depends on PROC_FS && MMU
64 bool "Enable /proc page monitoring" if EMBEDDED 64 bool "Enable /proc page monitoring" if EXPERT
65 help 65 help
66 Various /proc files exist to monitor process memory utilization: 66 Various /proc files exist to monitor process memory utilization:
67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, 67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 2758e2afc518..df434c5f28fb 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU) := mmu.o task_mmu.o
10proc-y += inode.o root.o base.o generic.o array.o \ 10proc-y += inode.o root.o base.o generic.o array.o \
11 proc_tty.o 11 proc_tty.o
12proc-y += cmdline.o 12proc-y += cmdline.o
13proc-y += consoles.o
13proc-y += cpuinfo.o 14proc-y += cpuinfo.o
14proc-y += devices.o 15proc-y += devices.o
15proc-y += interrupts.o 16proc-y += interrupts.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fff6572676ae..df2b703b9d0f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
95 95
96 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
97 97
98 seq_printf(m, "Name:\t"); 98 seq_puts(m, "Name:\t");
99 end = m->buf + m->size; 99 end = m->buf + m->size;
100 buf = m->buf + m->count; 100 buf = m->buf + m->count;
101 name = tcomm; 101 name = tcomm;
@@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
122 buf++; 122 buf++;
123 } 123 }
124 m->count = buf - m->buf; 124 m->count = buf - m->buf;
125 seq_printf(m, "\n"); 125 seq_putc(m, '\n');
126} 126}
127 127
128/* 128/*
@@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
208 seq_printf(m, "%d ", GROUP_AT(group_info, g)); 208 seq_printf(m, "%d ", GROUP_AT(group_info, g));
209 put_cred(cred); 209 put_cred(cred);
210 210
211 seq_printf(m, "\n"); 211 seq_putc(m, '\n');
212} 212}
213 213
214static void render_sigset_t(struct seq_file *m, const char *header, 214static void render_sigset_t(struct seq_file *m, const char *header,
@@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
216{ 216{
217 int i; 217 int i;
218 218
219 seq_printf(m, "%s", header); 219 seq_puts(m, header);
220 220
221 i = _NSIG; 221 i = _NSIG;
222 do { 222 do {
@@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
230 seq_printf(m, "%x", x); 230 seq_printf(m, "%x", x);
231 } while (i >= 4); 231 } while (i >= 4);
232 232
233 seq_printf(m, "\n"); 233 seq_putc(m, '\n');
234} 234}
235 235
236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, 236static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
@@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header,
291{ 291{
292 unsigned __capi; 292 unsigned __capi;
293 293
294 seq_printf(m, "%s", header); 294 seq_puts(m, header);
295 CAP_FOR_EACH_U32(__capi) { 295 CAP_FOR_EACH_U32(__capi) {
296 seq_printf(m, "%08x", 296 seq_printf(m, "%08x",
297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); 297 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
298 } 298 }
299 seq_printf(m, "\n"); 299 seq_putc(m, '\n');
300} 300}
301 301
302static inline void task_cap(struct seq_file *m, struct task_struct *p) 302static inline void task_cap(struct seq_file *m, struct task_struct *p)
@@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m,
329 329
330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 330static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
331{ 331{
332 seq_printf(m, "Cpus_allowed:\t"); 332 seq_puts(m, "Cpus_allowed:\t");
333 seq_cpumask(m, &task->cpus_allowed); 333 seq_cpumask(m, &task->cpus_allowed);
334 seq_printf(m, "\n"); 334 seq_putc(m, '\n');
335 seq_printf(m, "Cpus_allowed_list:\t"); 335 seq_puts(m, "Cpus_allowed_list:\t");
336 seq_cpumask_list(m, &task->cpus_allowed); 336 seq_cpumask_list(m, &task->cpus_allowed);
337 seq_printf(m, "\n"); 337 seq_putc(m, '\n');
338} 338}
339 339
340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 340int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, 535int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
536 struct pid *pid, struct task_struct *task) 536 struct pid *pid, struct task_struct *task)
537{ 537{
538 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; 538 unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
539 struct mm_struct *mm = get_task_mm(task); 539 struct mm_struct *mm = get_task_mm(task);
540 540
541 if (mm) { 541 if (mm) {
542 size = task_statm(mm, &shared, &text, &data, &resident); 542 size = task_statm(mm, &shared, &text, &data, &resident);
543 mmput(mm); 543 mmput(mm);
544 } 544 }
545 seq_printf(m, "%d %d %d %d %d %d %d\n", 545 seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
546 size, resident, shared, text, lib, data, 0); 546 size, resident, shared, text, data);
547 547
548 return 0; 548 return 0;
549} 549}
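The seq_file conversions above all follow one rule: only calls that actually format a value keep seq_printf(). A compact restatement with a hypothetical show helper:

static void show_value_example(struct seq_file *m, const char *name,
                               unsigned long v)
{
        seq_puts(m, name);              /* constant string: no format parsing */
        seq_printf(m, "%08lx", v);      /* formatted value: keep seq_printf */
        seq_putc(m, '\n');              /* single character: cheapest call */
}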
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e4addaa5424..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
226{ 226{
227 struct mm_struct *mm; 227 struct mm_struct *mm;
228 228
229 if (mutex_lock_killable(&task->cred_guard_mutex)) 229 if (mutex_lock_killable(&task->signal->cred_guard_mutex))
230 return NULL; 230 return NULL;
231 231
232 mm = get_task_mm(task); 232 mm = get_task_mm(task);
@@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
235 mmput(mm); 235 mmput(mm);
236 mm = NULL; 236 mm = NULL;
237 } 237 }
238 mutex_unlock(&task->cred_guard_mutex); 238 mutex_unlock(&task->signal->cred_guard_mutex);
239 239
240 return mm; 240 return mm;
241} 241}
@@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v)
373 return -ESRCH; 373 return -ESRCH;
374 seq_puts(m, "Latency Top version : v0.1\n"); 374 seq_puts(m, "Latency Top version : v0.1\n");
375 for (i = 0; i < 32; i++) { 375 for (i = 0; i < 32; i++) {
376 if (task->latency_record[i].backtrace[0]) { 376 struct latency_record *lr = &task->latency_record[i];
377 if (lr->backtrace[0]) {
377 int q; 378 int q;
378 seq_printf(m, "%i %li %li ", 379 seq_printf(m, "%i %li %li",
379 task->latency_record[i].count, 380 lr->count, lr->time, lr->max);
380 task->latency_record[i].time,
381 task->latency_record[i].max);
382 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 381 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
383 char sym[KSYM_SYMBOL_LEN]; 382 unsigned long bt = lr->backtrace[q];
384 char *c; 383 if (!bt)
385 if (!task->latency_record[i].backtrace[q])
386 break; 384 break;
387 if (task->latency_record[i].backtrace[q] == ULONG_MAX) 385 if (bt == ULONG_MAX)
388 break; 386 break;
389 sprint_symbol(sym, task->latency_record[i].backtrace[q]); 387 seq_printf(m, " %ps", (void *)bt);
390 c = strchr(sym, '+');
391 if (c)
392 *c = 0;
393 seq_printf(m, "%s ", sym);
394 } 388 }
395 seq_printf(m, "\n"); 389 seq_putc(m, '\n');
396 } 390 }
397 391
398 } 392 }
@@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v)
751 745
752static int proc_single_open(struct inode *inode, struct file *filp) 746static int proc_single_open(struct inode *inode, struct file *filp)
753{ 747{
754 int ret; 748 return single_open(filp, proc_single_show, inode);
755 ret = single_open(filp, proc_single_show, NULL);
756 if (!ret) {
757 struct seq_file *m = filp->private_data;
758
759 m->private = inode;
760 }
761 return ret;
762} 749}
763 750
764static const struct file_operations proc_single_file_operations = { 751static const struct file_operations proc_single_file_operations = {
@@ -771,6 +758,8 @@ static const struct file_operations proc_single_file_operations = {
771static int mem_open(struct inode* inode, struct file* file) 758static int mem_open(struct inode* inode, struct file* file)
772{ 759{
773 file->private_data = (void*)((long)current->self_exec_id); 760 file->private_data = (void*)((long)current->self_exec_id);
761 /* OK to pass negative loff_t, we can catch out-of-range */
762 file->f_mode |= FMODE_UNSIGNED_OFFSET;
774 return 0; 763 return 0;
775} 764}
776 765
@@ -1023,28 +1012,47 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1023 memset(buffer, 0, sizeof(buffer)); 1012 memset(buffer, 0, sizeof(buffer));
1024 if (count > sizeof(buffer) - 1) 1013 if (count > sizeof(buffer) - 1)
1025 count = sizeof(buffer) - 1; 1014 count = sizeof(buffer) - 1;
1026 if (copy_from_user(buffer, buf, count)) 1015 if (copy_from_user(buffer, buf, count)) {
1027 return -EFAULT; 1016 err = -EFAULT;
1017 goto out;
1018 }
1028 1019
1029 err = strict_strtol(strstrip(buffer), 0, &oom_adjust); 1020 err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
1030 if (err) 1021 if (err)
1031 return -EINVAL; 1022 goto out;
1032 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && 1023 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1033 oom_adjust != OOM_DISABLE) 1024 oom_adjust != OOM_DISABLE) {
1034 return -EINVAL; 1025 err = -EINVAL;
1026 goto out;
1027 }
1035 1028
1036 task = get_proc_task(file->f_path.dentry->d_inode); 1029 task = get_proc_task(file->f_path.dentry->d_inode);
1037 if (!task) 1030 if (!task) {
1038 return -ESRCH; 1031 err = -ESRCH;
1032 goto out;
1033 }
1034
1035 task_lock(task);
1036 if (!task->mm) {
1037 err = -EINVAL;
1038 goto err_task_lock;
1039 }
1040
1039 if (!lock_task_sighand(task, &flags)) { 1041 if (!lock_task_sighand(task, &flags)) {
1040 put_task_struct(task); 1042 err = -ESRCH;
1041 return -ESRCH; 1043 goto err_task_lock;
1042 } 1044 }
1043 1045
1044 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { 1046 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1045 unlock_task_sighand(task, &flags); 1047 err = -EACCES;
1046 put_task_struct(task); 1048 goto err_sighand;
1047 return -EACCES; 1049 }
1050
1051 if (oom_adjust != task->signal->oom_adj) {
1052 if (oom_adjust == OOM_DISABLE)
1053 atomic_inc(&task->mm->oom_disable_count);
1054 if (task->signal->oom_adj == OOM_DISABLE)
1055 atomic_dec(&task->mm->oom_disable_count);
1048 } 1056 }
1049 1057
1050 /* 1058 /*
@@ -1065,10 +1073,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1065 else 1073 else
1066 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / 1074 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
1067 -OOM_DISABLE; 1075 -OOM_DISABLE;
1076err_sighand:
1068 unlock_task_sighand(task, &flags); 1077 unlock_task_sighand(task, &flags);
1078err_task_lock:
1079 task_unlock(task);
1069 put_task_struct(task); 1080 put_task_struct(task);
1070 1081out:
1071 return count; 1082 return err < 0 ? err : count;
1072} 1083}
1073 1084
1074static const struct file_operations proc_oom_adjust_operations = { 1085static const struct file_operations proc_oom_adjust_operations = {
@@ -1109,31 +1120,52 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1109 memset(buffer, 0, sizeof(buffer)); 1120 memset(buffer, 0, sizeof(buffer));
1110 if (count > sizeof(buffer) - 1) 1121 if (count > sizeof(buffer) - 1)
1111 count = sizeof(buffer) - 1; 1122 count = sizeof(buffer) - 1;
1112 if (copy_from_user(buffer, buf, count)) 1123 if (copy_from_user(buffer, buf, count)) {
1113 return -EFAULT; 1124 err = -EFAULT;
1125 goto out;
1126 }
1114 1127
1115 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); 1128 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
1116 if (err) 1129 if (err)
1117 return -EINVAL; 1130 goto out;
1118 if (oom_score_adj < OOM_SCORE_ADJ_MIN || 1131 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1119 oom_score_adj > OOM_SCORE_ADJ_MAX) 1132 oom_score_adj > OOM_SCORE_ADJ_MAX) {
1120 return -EINVAL; 1133 err = -EINVAL;
1134 goto out;
1135 }
1121 1136
1122 task = get_proc_task(file->f_path.dentry->d_inode); 1137 task = get_proc_task(file->f_path.dentry->d_inode);
1123 if (!task) 1138 if (!task) {
1124 return -ESRCH; 1139 err = -ESRCH;
1140 goto out;
1141 }
1142
1143 task_lock(task);
1144 if (!task->mm) {
1145 err = -EINVAL;
1146 goto err_task_lock;
1147 }
1148
1125 if (!lock_task_sighand(task, &flags)) { 1149 if (!lock_task_sighand(task, &flags)) {
1126 put_task_struct(task); 1150 err = -ESRCH;
1127 return -ESRCH; 1151 goto err_task_lock;
1128 } 1152 }
1129 if (oom_score_adj < task->signal->oom_score_adj && 1153
1154 if (oom_score_adj < task->signal->oom_score_adj_min &&
1130 !capable(CAP_SYS_RESOURCE)) { 1155 !capable(CAP_SYS_RESOURCE)) {
1131 unlock_task_sighand(task, &flags); 1156 err = -EACCES;
1132 put_task_struct(task); 1157 goto err_sighand;
1133 return -EACCES;
1134 } 1158 }
1135 1159
1160 if (oom_score_adj != task->signal->oom_score_adj) {
1161 if (oom_score_adj == OOM_SCORE_ADJ_MIN)
1162 atomic_inc(&task->mm->oom_disable_count);
1163 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1164 atomic_dec(&task->mm->oom_disable_count);
1165 }
1136 task->signal->oom_score_adj = oom_score_adj; 1166 task->signal->oom_score_adj = oom_score_adj;
1167 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1168 task->signal->oom_score_adj_min = oom_score_adj;
1137 /* 1169 /*
1138 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is 1170 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1139 * always attainable. 1171 * always attainable.
@@ -1143,14 +1175,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1143 else 1175 else
1144 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / 1176 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
1145 OOM_SCORE_ADJ_MAX; 1177 OOM_SCORE_ADJ_MAX;
1178err_sighand:
1146 unlock_task_sighand(task, &flags); 1179 unlock_task_sighand(task, &flags);
1180err_task_lock:
1181 task_unlock(task);
1147 put_task_struct(task); 1182 put_task_struct(task);
1148 return count; 1183out:
1184 return err < 0 ? err : count;
1149} 1185}
1150 1186
1151static const struct file_operations proc_oom_score_adj_operations = { 1187static const struct file_operations proc_oom_score_adj_operations = {
1152 .read = oom_score_adj_read, 1188 .read = oom_score_adj_read,
1153 .write = oom_score_adj_write, 1189 .write = oom_score_adj_write,
1190 .llseek = default_llseek,
1154}; 1191};
1155 1192
1156#ifdef CONFIG_AUDITSYSCALL 1193#ifdef CONFIG_AUDITSYSCALL
@@ -1338,9 +1375,77 @@ sched_write(struct file *file, const char __user *buf,
1338 1375
1339static int sched_open(struct inode *inode, struct file *filp) 1376static int sched_open(struct inode *inode, struct file *filp)
1340{ 1377{
1378 return single_open(filp, sched_show, inode);
1379}
1380
1381static const struct file_operations proc_pid_sched_operations = {
1382 .open = sched_open,
1383 .read = seq_read,
1384 .write = sched_write,
1385 .llseek = seq_lseek,
1386 .release = single_release,
1387};
1388
1389#endif
1390
1391#ifdef CONFIG_SCHED_AUTOGROUP
1392/*
1393 * Print out autogroup related information:
1394 */
1395static int sched_autogroup_show(struct seq_file *m, void *v)
1396{
1397 struct inode *inode = m->private;
1398 struct task_struct *p;
1399
1400 p = get_proc_task(inode);
1401 if (!p)
1402 return -ESRCH;
1403 proc_sched_autogroup_show_task(p, m);
1404
1405 put_task_struct(p);
1406
1407 return 0;
1408}
1409
1410static ssize_t
1411sched_autogroup_write(struct file *file, const char __user *buf,
1412 size_t count, loff_t *offset)
1413{
1414 struct inode *inode = file->f_path.dentry->d_inode;
1415 struct task_struct *p;
1416 char buffer[PROC_NUMBUF];
1417 long nice;
1418 int err;
1419
1420 memset(buffer, 0, sizeof(buffer));
1421 if (count > sizeof(buffer) - 1)
1422 count = sizeof(buffer) - 1;
1423 if (copy_from_user(buffer, buf, count))
1424 return -EFAULT;
1425
1426 err = strict_strtol(strstrip(buffer), 0, &nice);
1427 if (err)
1428 return -EINVAL;
1429
1430 p = get_proc_task(inode);
1431 if (!p)
1432 return -ESRCH;
1433
1434 err = nice;
1435 err = proc_sched_autogroup_set_nice(p, &err);
1436 if (err)
1437 count = err;
1438
1439 put_task_struct(p);
1440
1441 return count;
1442}
1443
1444static int sched_autogroup_open(struct inode *inode, struct file *filp)
1445{
1341 int ret; 1446 int ret;
1342 1447
1343 ret = single_open(filp, sched_show, NULL); 1448 ret = single_open(filp, sched_autogroup_show, NULL);
1344 if (!ret) { 1449 if (!ret) {
1345 struct seq_file *m = filp->private_data; 1450 struct seq_file *m = filp->private_data;
1346 1451
@@ -1349,15 +1454,15 @@ static int sched_open(struct inode *inode, struct file *filp)
1349 return ret; 1454 return ret;
1350} 1455}
1351 1456
1352static const struct file_operations proc_pid_sched_operations = { 1457static const struct file_operations proc_pid_sched_autogroup_operations = {
1353 .open = sched_open, 1458 .open = sched_autogroup_open,
1354 .read = seq_read, 1459 .read = seq_read,
1355 .write = sched_write, 1460 .write = sched_autogroup_write,
1356 .llseek = seq_lseek, 1461 .llseek = seq_lseek,
1357 .release = single_release, 1462 .release = single_release,
1358}; 1463};
1359 1464
1360#endif 1465#endif /* CONFIG_SCHED_AUTOGROUP */
1361 1466
1362static ssize_t comm_write(struct file *file, const char __user *buf, 1467static ssize_t comm_write(struct file *file, const char __user *buf,
1363 size_t count, loff_t *offset) 1468 size_t count, loff_t *offset)
@@ -1406,15 +1511,7 @@ static int comm_show(struct seq_file *m, void *v)
1406 1511
1407static int comm_open(struct inode *inode, struct file *filp) 1512static int comm_open(struct inode *inode, struct file *filp)
1408{ 1513{
1409 int ret; 1514 return single_open(filp, comm_show, inode);
1410
1411 ret = single_open(filp, comm_show, NULL);
1412 if (!ret) {
1413 struct seq_file *m = filp->private_data;
1414
1415 m->private = inode;
1416 }
1417 return ret;
1418} 1515}
1419 1516
1420static const struct file_operations proc_pid_set_comm_operations = { 1517static const struct file_operations proc_pid_set_comm_operations = {
@@ -1526,7 +1623,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1526 if (!tmp) 1623 if (!tmp)
1527 return -ENOMEM; 1624 return -ENOMEM;
1528 1625
1529 pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE); 1626 pathname = d_path(path, tmp, PAGE_SIZE);
1530 len = PTR_ERR(pathname); 1627 len = PTR_ERR(pathname);
1531 if (IS_ERR(pathname)) 1628 if (IS_ERR(pathname))
1532 goto out; 1629 goto out;
@@ -1600,6 +1697,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1600 1697
1601 /* Common stuff */ 1698 /* Common stuff */
1602 ei = PROC_I(inode); 1699 ei = PROC_I(inode);
1700 inode->i_ino = get_next_ino();
1603 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1701 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1604 inode->i_op = &proc_def_inode_operations; 1702 inode->i_op = &proc_def_inode_operations;
1605 1703
@@ -1670,10 +1768,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1670 */ 1768 */
1671static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1769static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1672{ 1770{
1673 struct inode *inode = dentry->d_inode; 1771 struct inode *inode;
1674 struct task_struct *task = get_proc_task(inode); 1772 struct task_struct *task;
1675 const struct cred *cred; 1773 const struct cred *cred;
1676 1774
1775 if (nd && nd->flags & LOOKUP_RCU)
1776 return -ECHILD;
1777
1778 inode = dentry->d_inode;
1779 task = get_proc_task(inode);
1780
1677 if (task) { 1781 if (task) {
1678 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1782 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1679 task_dumpable(task)) { 1783 task_dumpable(task)) {
@@ -1695,7 +1799,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1695 return 0; 1799 return 0;
1696} 1800}
1697 1801
1698static int pid_delete_dentry(struct dentry * dentry) 1802static int pid_delete_dentry(const struct dentry * dentry)
1699{ 1803{
1700 /* Is the task we represent dead? 1804 /* Is the task we represent dead?
1701 * If so, then don't put the dentry on the lru list, 1805 * If so, then don't put the dentry on the lru list,
@@ -1839,12 +1943,19 @@ static int proc_fd_link(struct inode *inode, struct path *path)
1839 1943
1840static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) 1944static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1841{ 1945{
1842 struct inode *inode = dentry->d_inode; 1946 struct inode *inode;
1843 struct task_struct *task = get_proc_task(inode); 1947 struct task_struct *task;
1844 int fd = proc_fd(inode); 1948 int fd;
1845 struct files_struct *files; 1949 struct files_struct *files;
1846 const struct cred *cred; 1950 const struct cred *cred;
1847 1951
1952 if (nd && nd->flags & LOOKUP_RCU)
1953 return -ECHILD;
1954
1955 inode = dentry->d_inode;
1956 task = get_proc_task(inode);
1957 fd = proc_fd(inode);
1958
1848 if (task) { 1959 if (task) {
1849 files = get_files_struct(task); 1960 files = get_files_struct(task);
1850 if (files) { 1961 if (files) {
@@ -1920,7 +2031,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
1920 inode->i_op = &proc_pid_link_inode_operations; 2031 inode->i_op = &proc_pid_link_inode_operations;
1921 inode->i_size = 64; 2032 inode->i_size = 64;
1922 ei->op.proc_get_link = proc_fd_link; 2033 ei->op.proc_get_link = proc_fd_link;
1923 dentry->d_op = &tid_fd_dentry_operations; 2034 d_set_d_op(dentry, &tid_fd_dentry_operations);
1924 d_add(dentry, inode); 2035 d_add(dentry, inode);
1925 /* Close the race of the process dying before we return the dentry */ 2036 /* Close the race of the process dying before we return the dentry */
1926 if (tid_fd_revalidate(dentry, NULL)) 2037 if (tid_fd_revalidate(dentry, NULL))
@@ -2039,22 +2150,26 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
2039static const struct file_operations proc_fdinfo_file_operations = { 2150static const struct file_operations proc_fdinfo_file_operations = {
2040 .open = nonseekable_open, 2151 .open = nonseekable_open,
2041 .read = proc_fdinfo_read, 2152 .read = proc_fdinfo_read,
2153 .llseek = no_llseek,
2042}; 2154};
2043 2155
2044static const struct file_operations proc_fd_operations = { 2156static const struct file_operations proc_fd_operations = {
2045 .read = generic_read_dir, 2157 .read = generic_read_dir,
2046 .readdir = proc_readfd, 2158 .readdir = proc_readfd,
2159 .llseek = default_llseek,
2047}; 2160};
2048 2161
2049/* 2162/*
2050 * /proc/pid/fd needs a special permission handler so that a process can still 2163 * /proc/pid/fd needs a special permission handler so that a process can still
2051 * access /proc/self/fd after it has executed a setuid(). 2164 * access /proc/self/fd after it has executed a setuid().
2052 */ 2165 */
2053static int proc_fd_permission(struct inode *inode, int mask) 2166static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
2054{ 2167{
2055 int rv; 2168 int rv;
2056 2169
2057 rv = generic_permission(inode, mask, NULL); 2170 if (flags & IPERM_FLAG_RCU)
2171 return -ECHILD;
2172 rv = generic_permission(inode, mask, flags, NULL);
2058 if (rv == 0) 2173 if (rv == 0)
2059 return 0; 2174 return 0;
2060 if (task_pid(current) == proc_pid(inode)) 2175 if (task_pid(current) == proc_pid(inode))
@@ -2086,7 +2201,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
2086 ei->fd = fd; 2201 ei->fd = fd;
2087 inode->i_mode = S_IFREG | S_IRUSR; 2202 inode->i_mode = S_IFREG | S_IRUSR;
2088 inode->i_fop = &proc_fdinfo_file_operations; 2203 inode->i_fop = &proc_fdinfo_file_operations;
2089 dentry->d_op = &tid_fd_dentry_operations; 2204 d_set_d_op(dentry, &tid_fd_dentry_operations);
2090 d_add(dentry, inode); 2205 d_add(dentry, inode);
2091 /* Close the race of the process dying before we return the dentry */ 2206 /* Close the race of the process dying before we return the dentry */
2092 if (tid_fd_revalidate(dentry, NULL)) 2207 if (tid_fd_revalidate(dentry, NULL))
@@ -2112,6 +2227,7 @@ static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
2112static const struct file_operations proc_fdinfo_operations = { 2227static const struct file_operations proc_fdinfo_operations = {
2113 .read = generic_read_dir, 2228 .read = generic_read_dir,
2114 .readdir = proc_readfdinfo, 2229 .readdir = proc_readfdinfo,
2230 .llseek = default_llseek,
2115}; 2231};
2116 2232
2117/* 2233/*
@@ -2144,7 +2260,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2144 if (p->fop) 2260 if (p->fop)
2145 inode->i_fop = p->fop; 2261 inode->i_fop = p->fop;
2146 ei->op = p->op; 2262 ei->op = p->op;
2147 dentry->d_op = &pid_dentry_operations; 2263 d_set_d_op(dentry, &pid_dentry_operations);
2148 d_add(dentry, inode); 2264 d_add(dentry, inode);
2149 /* Close the race of the process dying before we return the dentry */ 2265 /* Close the race of the process dying before we return the dentry */
2150 if (pid_revalidate(dentry, NULL)) 2266 if (pid_revalidate(dentry, NULL))
@@ -2302,14 +2418,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2302 goto out_free; 2418 goto out_free;
2303 2419
2304 /* Guard against adverse ptrace interaction */ 2420 /* Guard against adverse ptrace interaction */
2305 length = mutex_lock_interruptible(&task->cred_guard_mutex); 2421 length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
2306 if (length < 0) 2422 if (length < 0)
2307 goto out_free; 2423 goto out_free;
2308 2424
2309 length = security_setprocattr(task, 2425 length = security_setprocattr(task,
2310 (char*)file->f_path.dentry->d_name.name, 2426 (char*)file->f_path.dentry->d_name.name,
2311 (void*)page, count); 2427 (void*)page, count);
2312 mutex_unlock(&task->cred_guard_mutex); 2428 mutex_unlock(&task->signal->cred_guard_mutex);
2313out_free: 2429out_free:
2314 free_page((unsigned long) page); 2430 free_page((unsigned long) page);
2315out: 2431out:
@@ -2343,6 +2459,7 @@ static int proc_attr_dir_readdir(struct file * filp,
2343static const struct file_operations proc_attr_dir_operations = { 2459static const struct file_operations proc_attr_dir_operations = {
2344 .read = generic_read_dir, 2460 .read = generic_read_dir,
2345 .readdir = proc_attr_dir_readdir, 2461 .readdir = proc_attr_dir_readdir,
2462 .llseek = default_llseek,
2346}; 2463};
2347 2464
2348static struct dentry *proc_attr_dir_lookup(struct inode *dir, 2465static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@ -2510,8 +2627,14 @@ static const struct pid_entry proc_base_stuff[] = {
2510 */ 2627 */
2511static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) 2628static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2512{ 2629{
2513 struct inode *inode = dentry->d_inode; 2630 struct inode *inode;
2514 struct task_struct *task = get_proc_task(inode); 2631 struct task_struct *task;
2632
2633 if (nd->flags & LOOKUP_RCU)
2634 return -ECHILD;
2635
2636 inode = dentry->d_inode;
2637 task = get_proc_task(inode);
2515 if (task) { 2638 if (task) {
2516 put_task_struct(task); 2639 put_task_struct(task);
2517 return 1; 2640 return 1;
@@ -2542,6 +2665,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2542 2665
2543 /* Initialize the inode */ 2666 /* Initialize the inode */
2544 ei = PROC_I(inode); 2667 ei = PROC_I(inode);
2668 inode->i_ino = get_next_ino();
2545 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 2669 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2546 2670
2547 /* 2671 /*
@@ -2561,7 +2685,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2561 if (p->fop) 2685 if (p->fop)
2562 inode->i_fop = p->fop; 2686 inode->i_fop = p->fop;
2563 ei->op = p->op; 2687 ei->op = p->op;
2564 dentry->d_op = &proc_base_dentry_operations; 2688 d_set_d_op(dentry, &proc_base_dentry_operations);
2565 d_add(dentry, inode); 2689 d_add(dentry, inode);
2566 error = NULL; 2690 error = NULL;
2567out: 2691out:
@@ -2679,6 +2803,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2679#ifdef CONFIG_SCHED_DEBUG 2803#ifdef CONFIG_SCHED_DEBUG
2680 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2804 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2681#endif 2805#endif
2806#ifdef CONFIG_SCHED_AUTOGROUP
2807 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
2808#endif
2682 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2809 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2683#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2810#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2684 INF("syscall", S_IRUSR, proc_pid_syscall), 2811 INF("syscall", S_IRUSR, proc_pid_syscall),
@@ -2751,6 +2878,7 @@ static int proc_tgid_base_readdir(struct file * filp,
2751static const struct file_operations proc_tgid_base_operations = { 2878static const struct file_operations proc_tgid_base_operations = {
2752 .read = generic_read_dir, 2879 .read = generic_read_dir,
2753 .readdir = proc_tgid_base_readdir, 2880 .readdir = proc_tgid_base_readdir,
2881 .llseek = default_llseek,
2754}; 2882};
2755 2883
2756static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ 2884static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -2871,7 +2999,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2871 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, 2999 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
2872 ARRAY_SIZE(tgid_base_stuff)); 3000 ARRAY_SIZE(tgid_base_stuff));
2873 3001
2874 dentry->d_op = &pid_dentry_operations; 3002 d_set_d_op(dentry, &pid_dentry_operations);
2875 3003
2876 d_add(dentry, inode); 3004 d_add(dentry, inode);
2877 /* Close the race of the process dying before we return the dentry */ 3005 /* Close the race of the process dying before we return the dentry */
@@ -3088,6 +3216,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
3088static const struct file_operations proc_tid_base_operations = { 3216static const struct file_operations proc_tid_base_operations = {
3089 .read = generic_read_dir, 3217 .read = generic_read_dir,
3090 .readdir = proc_tid_base_readdir, 3218 .readdir = proc_tid_base_readdir,
3219 .llseek = default_llseek,
3091}; 3220};
3092 3221
3093static const struct inode_operations proc_tid_base_inode_operations = { 3222static const struct inode_operations proc_tid_base_inode_operations = {
@@ -3113,7 +3242,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3113 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, 3242 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
3114 ARRAY_SIZE(tid_base_stuff)); 3243 ARRAY_SIZE(tid_base_stuff));
3115 3244
3116 dentry->d_op = &pid_dentry_operations; 3245 d_set_d_op(dentry, &pid_dentry_operations);
3117 3246
3118 d_add(dentry, inode); 3247 d_add(dentry, inode);
3119 /* Close the race of the process dying before we return the dentry */ 3248 /* Close the race of the process dying before we return the dentry */
@@ -3324,4 +3453,5 @@ static const struct inode_operations proc_task_inode_operations = {
3324static const struct file_operations proc_task_operations = { 3453static const struct file_operations proc_task_operations = {
3325 .read = generic_read_dir, 3454 .read = generic_read_dir,
3326 .readdir = proc_task_readdir, 3455 .readdir = proc_task_readdir,
3456 .llseek = default_llseek,
3327}; 3457};
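Several of the base.c hunks collapse an open routine to one line because single_open() already stores its third argument in seq_file->private. The pattern, with hypothetical example names:

static int example_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;       /* stashed by single_open() */

        seq_printf(m, "ino %lu\n", inode->i_ino);
        return 0;
}

static int example_open(struct inode *inode, struct file *filp)
{
        /* before: single_open(filp, ..., NULL) then m->private = inode */
        return single_open(filp, example_show, inode);
}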
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
new file mode 100644
index 000000000000..b701eaa482bf
--- /dev/null
+++ b/fs/proc/consoles.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright (c) 2010 Werner Fink, Jiri Slaby
3 *
4 * Licensed under GPLv2
5 */
6
7#include <linux/console.h>
8#include <linux/kernel.h>
9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
11#include <linux/tty_driver.h>
12
13/*
14 * This is the handler for /proc/consoles
15 */
16static int show_console_dev(struct seq_file *m, void *v)
17{
18 static const struct {
19 short flag;
20 char name;
21 } con_flags[] = {
22 { CON_ENABLED, 'E' },
23 { CON_CONSDEV, 'C' },
24 { CON_BOOT, 'B' },
25 { CON_PRINTBUFFER, 'p' },
26 { CON_BRL, 'b' },
27 { CON_ANYTIME, 'a' },
28 };
29 char flags[ARRAY_SIZE(con_flags) + 1];
30 struct console *con = v;
31 unsigned int a;
32 int len;
33 dev_t dev = 0;
34
35 if (con->device) {
36 const struct tty_driver *driver;
37 int index;
38 driver = con->device(con, &index);
39 if (driver) {
40 dev = MKDEV(driver->major, driver->minor_start);
41 dev += index;
42 }
43 }
44
45 for (a = 0; a < ARRAY_SIZE(con_flags); a++)
46 flags[a] = (con->flags & con_flags[a].flag) ?
47 con_flags[a].name : ' ';
48 flags[a] = 0;
49
50 seq_printf(m, "%s%d%n", con->name, con->index, &len);
51 len = 21 - len;
52 if (len < 1)
53 len = 1;
54 seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
55 con->write ? 'W' : '-', con->unblank ? 'U' : '-',
56 flags);
57 if (dev)
58 seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
59
60 seq_printf(m, "\n");
61
62 return 0;
63}
64
65static void *c_start(struct seq_file *m, loff_t *pos)
66{
67 struct console *con;
68 loff_t off = 0;
69
70 console_lock();
71 for_each_console(con)
72 if (off++ == *pos)
73 break;
74
75 return con;
76}
77
78static void *c_next(struct seq_file *m, void *v, loff_t *pos)
79{
80 struct console *con = v;
81 ++*pos;
82 return con->next;
83}
84
85static void c_stop(struct seq_file *m, void *v)
86{
87 console_unlock();
88}
89
90static const struct seq_operations consoles_op = {
91 .start = c_start,
92 .next = c_next,
93 .stop = c_stop,
94 .show = show_console_dev
95};
96
97static int consoles_open(struct inode *inode, struct file *file)
98{
99 return seq_open(file, &consoles_op);
100}
101
102static const struct file_operations proc_consoles_operations = {
103 .open = consoles_open,
104 .read = seq_read,
105 .llseek = seq_lseek,
106 .release = seq_release,
107};
108
109static int __init proc_consoles_init(void)
110{
111 proc_create("consoles", 0, NULL, &proc_consoles_operations);
112 return 0;
113}
114module_init(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da959c9..b14347167c35 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v)
9 9
10 if (i < CHRDEV_MAJOR_HASH_SIZE) { 10 if (i < CHRDEV_MAJOR_HASH_SIZE) {
11 if (i == 0) 11 if (i == 0)
12 seq_printf(f, "Character devices:\n"); 12 seq_puts(f, "Character devices:\n");
13 chrdev_show(f, i); 13 chrdev_show(f, i);
14 } 14 }
15#ifdef CONFIG_BLOCK 15#ifdef CONFIG_BLOCK
16 else { 16 else {
17 i -= CHRDEV_MAJOR_HASH_SIZE; 17 i -= CHRDEV_MAJOR_HASH_SIZE;
18 if (i == 0) 18 if (i == 0)
19 seq_printf(f, "\nBlock devices:\n"); 19 seq_puts(f, "\nBlock devices:\n");
20 blkdev_show(f, i); 20 blkdev_show(f, i);
21 } 21 }
22#endif 22#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index dd29f0337661..01e07f2a188f 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = {
400 * smarter: we could keep a "volatile" flag in the 400 * smarter: we could keep a "volatile" flag in the
401 * inode to indicate which ones to keep. 401 * inode to indicate which ones to keep.
402 */ 402 */
403static int proc_delete_dentry(struct dentry * dentry) 403static int proc_delete_dentry(const struct dentry * dentry)
404{ 404{
405 return 1; 405 return 1;
406} 406}
@@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
425 if (de->namelen != dentry->d_name.len) 425 if (de->namelen != dentry->d_name.len)
426 continue; 426 continue;
427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { 427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
428 unsigned int ino;
429
430 ino = de->low_ino;
431 pde_get(de); 428 pde_get(de);
432 spin_unlock(&proc_subdir_lock); 429 spin_unlock(&proc_subdir_lock);
433 error = -EINVAL; 430 error = -EINVAL;
434 inode = proc_get_inode(dir->i_sb, ino, de); 431 inode = proc_get_inode(dir->i_sb, de);
435 goto out_unlock; 432 goto out_unlock;
436 } 433 }
437 } 434 }
@@ -439,7 +436,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
439out_unlock: 436out_unlock:
440 437
441 if (inode) { 438 if (inode) {
442 dentry->d_op = &proc_dentry_operations; 439 d_set_d_op(dentry, &proc_dentry_operations);
443 d_add(dentry, inode); 440 d_add(dentry, inode);
444 return NULL; 441 return NULL;
445 } 442 }
@@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data);
768 765
769static void free_proc_entry(struct proc_dir_entry *de) 766static void free_proc_entry(struct proc_dir_entry *de)
770{ 767{
771 unsigned int ino = de->low_ino; 768 release_inode_number(de->low_ino);
772
773 if (ino < PROC_DYNAMIC_FIRST)
774 return;
775
776 release_inode_number(ino);
777 769
778 if (S_ISLNK(de->mode)) 770 if (S_ISLNK(de->mode))
779 kfree(de->data); 771 kfree(de->data);
@@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
834 826
835 wait_for_completion(de->pde_unload_completion); 827 wait_for_completion(de->pde_unload_completion);
836 828
837 goto continue_removing; 829 spin_lock(&de->pde_unload_lock);
838 } 830 }
839 spin_unlock(&de->pde_unload_lock);
840 831
841continue_removing:
842 spin_lock(&de->pde_unload_lock);
843 while (!list_empty(&de->pde_openers)) { 832 while (!list_empty(&de->pde_openers)) {
844 struct pde_opener *pdeo; 833 struct pde_opener *pdeo;
845 834
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9c2b5f484879..176ce4cda68a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -16,7 +16,6 @@
16#include <linux/limits.h> 16#include <linux/limits.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 19#include <linux/sysctl.h>
21#include <linux/slab.h> 20#include <linux/slab.h>
22 21
@@ -66,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
66 return inode; 65 return inode;
67} 66}
68 67
69static void proc_destroy_inode(struct inode *inode) 68static void proc_i_callback(struct rcu_head *head)
70{ 69{
70 struct inode *inode = container_of(head, struct inode, i_rcu);
71 INIT_LIST_HEAD(&inode->i_dentry);
71 kmem_cache_free(proc_inode_cachep, PROC_I(inode)); 72 kmem_cache_free(proc_inode_cachep, PROC_I(inode));
72} 73}
73 74
75static void proc_destroy_inode(struct inode *inode)
76{
77 call_rcu(&inode->i_rcu, proc_i_callback);
78}
79
74static void init_once(void *foo) 80static void init_once(void *foo)
75{ 81{
76 struct proc_inode *ei = (struct proc_inode *) foo; 82 struct proc_inode *ei = (struct proc_inode *) foo;
@@ -410,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
410}; 416};
411#endif 417#endif
412 418
413struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, 419struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
414 struct proc_dir_entry *de)
415{ 420{
416 struct inode * inode; 421 struct inode * inode;
417 422
418 inode = iget_locked(sb, ino); 423 inode = iget_locked(sb, de->low_ino);
419 if (!inode) 424 if (!inode)
420 return NULL; 425 return NULL;
421 if (inode->i_state & I_NEW) { 426 if (inode->i_state & I_NEW) {
@@ -465,7 +470,7 @@ int proc_fill_super(struct super_block *s)
465 s->s_time_gran = 1; 470 s->s_time_gran = 1;
466 471
467 pde_get(&proc_root); 472 pde_get(&proc_root);
468 root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); 473 root_inode = proc_get_inode(s, &proc_root);
469 if (!root_inode) 474 if (!root_inode)
470 goto out_no_root; 475 goto out_no_root;
471 root_inode->i_uid = 0; 476 root_inode->i_uid = 0;
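proc's destroy_inode now defers the actual free by one RCU grace period so lockless (RCU-walk) path lookup never dereferences freed memory. The same template, generalized with hypothetical names (example_inode_cachep, EXAMPLE_I), for any filesystem making this conversion:

static void example_i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);

        /* i_rcu overlays i_dentry in a union; restore the list head
         * before handing the object back to the slab cache */
        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(example_inode_cachep, EXAMPLE_I(inode));
}

static void example_destroy_inode(struct inode *inode)
{
        call_rcu(&inode->i_rcu, example_i_callback);
}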
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3eddd12..9ad561ded409 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock;
96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); 96struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); 97int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
98unsigned long task_vsize(struct mm_struct *); 98unsigned long task_vsize(struct mm_struct *);
99int task_statm(struct mm_struct *, int *, int *, int *, int *); 99unsigned long task_statm(struct mm_struct *,
100 unsigned long *, unsigned long *, unsigned long *, unsigned long *);
100void task_mem(struct seq_file *, struct mm_struct *); 101void task_mem(struct seq_file *, struct mm_struct *);
101 102
102static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) 103static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
@@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde);
108 109
109extern struct vfsmount *proc_mnt; 110extern struct vfsmount *proc_mnt;
110int proc_fill_super(struct super_block *); 111int proc_fill_super(struct super_block *);
111struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); 112struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
112 113
113/* 114/*
114 * These are generic /proc routines that use the internal 115 * These are generic /proc routines that use the internal
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6f37c391468d..d245cb23dd72 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
558static const struct file_operations proc_kcore_operations = { 558static const struct file_operations proc_kcore_operations = {
559 .read = read_kcore, 559 .read = read_kcore,
560 .open = open_kcore, 560 .open = open_kcore,
561 .llseek = generic_file_llseek, 561 .llseek = default_llseek,
562}; 562};
563 563
564#ifdef CONFIG_MEMORY_HOTPLUG 564#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_MEMORY_FAILURE
         "HardwareCorrupted: %5lu kB\n"
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        "AnonHugePages: %8lu kB\n"
+#endif
         ,
         K(i.totalram),
         K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
         K(i.freeswap),
         K(global_page_state(NR_FILE_DIRTY)),
         K(global_page_state(NR_WRITEBACK)),
-        K(global_page_state(NR_ANON_PAGES)),
+        K(global_page_state(NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+          + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+          HPAGE_PMD_NR
+#endif
+          ),
         K(global_page_state(NR_FILE_MAPPED)),
         K(global_page_state(NR_SHMEM)),
         K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_MEMORY_FAILURE
         ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+           HPAGE_PMD_NR)
+#endif
         );
 
     hugetlb_report_meminfo(m);
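
Aside: the AnonHugePages accounting above is plain unit conversion: the
NR_ANON_TRANSPARENT_HUGEPAGES counter is kept in huge pages, while
/proc/meminfo reports kB of base pages. A sketch of the arithmetic
(anon_huge_kb is a hypothetical helper; the file's K() macro performs
the same page-to-kB shift):

    /* HPAGE_PMD_NR = base pages per THP, e.g. 512 for a 2 MiB THP on
     * a 4 KiB base page; PAGE_SHIFT - 10 converts pages to kB. */
    static unsigned long anon_huge_kb(unsigned long nr_thp)
    {
        unsigned long base_pages = nr_thp * HPAGE_PMD_NR;

        return base_pages << (PAGE_SHIFT - 10);
    }
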
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 3b8b45660331..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
             ppage = pfn_to_page(pfn);
         else
             ppage = NULL;
-        if (!ppage)
+        if (!ppage || PageSlab(ppage))
             pcount = 0;
         else
             pcount = page_mapcount(ppage);
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
     if (PageHuge(page))
         u |= 1 << KPF_HUGE;
 
-    u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
-
     /*
-     * Caveats on high order pages:
-     * PG_buddy will only be set on the head page; SLUB/SLQB do the same
-     * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+     * Caveats on high order pages: page->_count will only be set
+     * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+     * SLOB won't set PG_slab at all on compound pages.
      */
+    if (PageBuddy(page))
+        u |= 1 << KPF_BUDDY;
+
+    u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
+
     u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
-    u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
 
     u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
     u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
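
Aside: stable_page_flags() composes a u64 bitmap one flag at a time;
kpf_copy_bit() presumably just moves one bit of the raw flags word into
the exported bit position, along these lines (a sketch of the helper's
shape, not a quote of the file):

    /* copy bit `kbit` of kflags into bit `ubit` of the result */
    static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
    {
        return ((kflags >> kbit) & 1) << ubit;
    }

The hunk derives KPF_BUDDY from PageBuddy() rather than a bit copy,
which suggests PG_buddy is no longer a real page flag in this series.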
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5be436ea088e..09a1f92a34ef 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -5,6 +5,7 @@
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
+#include <linux/namei.h>
 #include "internal.h"
 
 static const struct dentry_operations proc_sys_dentry_operations;
@@ -23,6 +24,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
     if (!inode)
         goto out;
 
+    inode->i_ino = get_next_ino();
+
     sysctl_head_get(head);
     ei = PROC_I(inode);
     ei->sysctl = head;
@@ -118,7 +121,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
         goto out;
 
     err = NULL;
-    dentry->d_op = &proc_sys_dentry_operations;
+    d_set_d_op(dentry, &proc_sys_dentry_operations);
     d_add(dentry, inode);
 
 out:
@@ -199,7 +202,7 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
             dput(child);
             return -ENOMEM;
         } else {
-            child->d_op = &proc_sys_dentry_operations;
+            d_set_d_op(child, &proc_sys_dentry_operations);
             d_add(child, inode);
         }
     } else {
@@ -292,7 +295,7 @@ out:
     return ret;
 }
 
-static int proc_sys_permission(struct inode *inode, int mask)
+static int proc_sys_permission(struct inode *inode, int mask, unsigned int flags)
 {
     /*
      * sysctl entries that are not writeable,
@@ -302,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask)
     struct ctl_table *table;
     int error;
 
+    if (flags & IPERM_FLAG_RCU)
+        return -ECHILD;
+
     /* Executable files are not allowed under /proc/sys/ */
     if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
         return -EACCES;
@@ -364,6 +370,7 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 static const struct file_operations proc_sys_file_operations = {
     .read = proc_sys_read,
     .write = proc_sys_write,
+    .llseek = default_llseek,
 };
 
 static const struct file_operations proc_sys_dir_file_operations = {
@@ -386,23 +393,30 @@ static const struct inode_operations proc_sys_dir_operations = {
 
 static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+    if (nd->flags & LOOKUP_RCU)
+        return -ECHILD;
     return !PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
 
-static int proc_sys_delete(struct dentry *dentry)
+static int proc_sys_delete(const struct dentry *dentry)
 {
     return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
 
-static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
-                            struct qstr *name)
+static int proc_sys_compare(const struct dentry *parent,
+        const struct inode *pinode,
+        const struct dentry *dentry, const struct inode *inode,
+        unsigned int len, const char *str, const struct qstr *name)
 {
-    struct dentry *dentry = container_of(qstr, struct dentry, d_name);
-    if (qstr->len != name->len)
+    /* Although proc doesn't have negative dentries, rcu-walk means
+     * that inode here can be NULL */
+    if (!inode)
+        return 0;
+    if (name->len != len)
         return 1;
-    if (memcmp(qstr->name, name->name, name->len))
+    if (memcmp(name->name, str, len))
         return 1;
-    return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
+    return !sysctl_is_seen(PROC_I(inode)->sysctl);
 }
 
 static const struct dentry_operations proc_sys_dentry_operations = {
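
Aside: the proc_sys changes above are the standard rcu-walk adaptation:
any dentry or permission hook that cannot run lockless bails out with
-ECHILD so the VFS falls back to ref-walk, and ->d_compare() must
tolerate a NULL inode. A skeleton of the bail-out pattern
(example_permission is a hypothetical name, using the signature from
the hunk):

    static int example_permission(struct inode *inode, int mask,
                                  unsigned int flags)
    {
        if (flags & IPERM_FLAG_RCU)
            return -ECHILD;    /* can't sleep or take refs in rcu-walk */
        /* ... blocking checks are safe from here on ... */
        return 0;
    }
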
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc869437..cb761f010300 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
     }
     switch (p->type) {
     case TTY_DRIVER_TYPE_SYSTEM:
-        seq_printf(m, "system");
+        seq_puts(m, "system");
         if (p->subtype == SYSTEM_TYPE_TTY)
-            seq_printf(m, ":/dev/tty");
+            seq_puts(m, ":/dev/tty");
         else if (p->subtype == SYSTEM_TYPE_SYSCONS)
-            seq_printf(m, ":console");
+            seq_puts(m, ":console");
         else if (p->subtype == SYSTEM_TYPE_CONSOLE)
-            seq_printf(m, ":vtmaster");
+            seq_puts(m, ":vtmaster");
         break;
     case TTY_DRIVER_TYPE_CONSOLE:
-        seq_printf(m, "console");
+        seq_puts(m, "console");
         break;
     case TTY_DRIVER_TYPE_SERIAL:
-        seq_printf(m, "serial");
+        seq_puts(m, "serial");
         break;
     case TTY_DRIVER_TYPE_PTY:
         if (p->subtype == PTY_TYPE_MASTER)
-            seq_printf(m, "pty:master");
+            seq_puts(m, "pty:master");
         else if (p->subtype == PTY_TYPE_SLAVE)
-            seq_printf(m, "pty:slave");
+            seq_puts(m, "pty:slave");
         else
-            seq_printf(m, "pty");
+            seq_puts(m, "pty");
         break;
     default:
         seq_printf(m, "type:%d.%d", p->type, p->subtype);
@@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v)
         /* pseudo-drivers first */
         seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
         seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
-        seq_printf(m, "system:/dev/tty\n");
+        seq_puts(m, "system:/dev/tty\n");
         seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
         seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
-        seq_printf(m, "system:console\n");
+        seq_puts(m, "system:console\n");
 #ifdef CONFIG_UNIX98_PTYS
         seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
         seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
-        seq_printf(m, "system\n");
+        seq_puts(m, "system\n");
 #endif
 #ifdef CONFIG_VT
         seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
         seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
-        seq_printf(m, "system:vtmaster\n");
+        seq_puts(m, "system:vtmaster\n");
 #endif
     }
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384ed22d..ef9fa8e24ad6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data)
     return set_anon_super(sb, NULL);
 }
 
-static int proc_get_sb(struct file_system_type *fs_type,
-    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *proc_mount(struct file_system_type *fs_type,
+    int flags, const char *dev_name, void *data)
 {
     int err;
     struct super_block *sb;
@@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type,
 
     sb = sget(fs_type, proc_test_super, proc_set_super, ns);
     if (IS_ERR(sb))
-        return PTR_ERR(sb);
+        return ERR_CAST(sb);
 
     if (!sb->s_root) {
         sb->s_flags = flags;
         err = proc_fill_super(sb);
         if (err) {
             deactivate_locked_super(sb);
-            return err;
+            return ERR_PTR(err);
         }
 
         ei = PROC_I(sb->s_root->d_inode);
@@ -79,11 +79,9 @@ static int proc_get_sb(struct file_system_type *fs_type,
     }
 
         sb->s_flags |= MS_ACTIVE;
-        ns->proc_mnt = mnt;
     }
 
-    simple_set_mnt(mnt, sb);
-    return 0;
+    return dget(sb->s_root);
 }
 
 static void proc_kill_sb(struct super_block *sb)
@@ -97,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb)
 
 static struct file_system_type proc_fs_type = {
     .name = "proc",
-    .get_sb = proc_get_sb,
+    .mount = proc_mount,
     .kill_sb = proc_kill_sb,
 };
 
@@ -115,6 +113,7 @@ void __init proc_root_init(void)
         return;
     }
 
+    init_pid_ns.proc_mnt = proc_mnt;
     proc_symlink("mounts", NULL, "self/mounts");
 
     proc_net_init();
@@ -179,6 +178,7 @@ static int proc_root_readdir(struct file * filp,
 static const struct file_operations proc_root_operations = {
     .read = generic_read_dir,
     .readdir = proc_root_readdir,
+    .llseek = default_llseek,
 };
 
 /*
@@ -212,6 +212,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
     if (IS_ERR(mnt))
         return PTR_ERR(mnt);
 
+    ns->proc_mnt = mnt;
     return 0;
 }
 
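
Aside: proc_mount() follows the tree-wide switch from ->get_sb() (fill
in a vfsmount, return int) to ->mount() (return the root dentry, or an
ERR_PTR). For a filesystem with no special superblock sharing the whole
conversion collapses to one helper call; a minimal sketch with
hypothetical names (example_fill_super is assumed to exist):

    static struct dentry *example_mount(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *data)
    {
        /* mount_nodev() replaces get_sb_nodev() and returns the dentry */
        return mount_nodev(fs_type, flags, data, example_fill_super);
    }

    static struct file_system_type example_fs_type = {
        .name    = "example",
        .mount   = example_mount,
        .kill_sb = kill_litter_super,
    };

proc needs the open-coded sget() path above only because it shares one
superblock per pid namespace.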
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f17..62604be9f58d 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v)
 {
     int i, j;
 
-    seq_printf(p, " ");
+    seq_puts(p, " ");
     for_each_possible_cpu(i)
         seq_printf(p, "CPU%-8d", i);
-    seq_printf(p, "\n");
+    seq_putc(p, '\n');
 
     for (i = 0; i < NR_SOFTIRQS; i++) {
-        seq_printf(p, "%8s:", softirq_to_name[i]);
+        seq_printf(p, "%12s:", softirq_to_name[i]);
         for_each_possible_cpu(j)
             seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
-        seq_printf(p, "\n");
+        seq_putc(p, '\n');
     }
     return 0;
 }
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf31b03fc275..1cffa2b8a2fc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v)
     u64 sum_softirq = 0;
     unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
     struct timespec boottime;
-    unsigned int per_irq_sum;
 
     user = nice = system = idle = iowait =
         irq = softirq = steal = cputime64_zero;
@@ -52,9 +51,7 @@ static int show_stat(struct seq_file *p, void *v)
         guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
         guest_nice = cputime64_add(guest_nice,
             kstat_cpu(i).cpustat.guest_nice);
-        for_each_irq_nr(j) {
-            sum += kstat_irqs_cpu(j, i);
-        }
+        sum += kstat_cpu_irqs_sum(i);
         sum += arch_irq_stat_cpu(i);
 
         for (j = 0; j < NR_SOFTIRQS; j++) {
@@ -110,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v)
     seq_printf(p, "intr %llu", (unsigned long long)sum);
 
     /* sum again ? it could be updated? */
-    for_each_irq_nr(j) {
-        per_irq_sum = 0;
-        for_each_possible_cpu(i)
-            per_irq_sum += kstat_irqs_cpu(j, i);
-
-        seq_printf(p, " %u", per_irq_sum);
-    }
+    for_each_irq_nr(j)
+        seq_printf(p, " %u", kstat_irqs(j));
 
     seq_printf(p,
         "\nctxt %llu\n"
@@ -134,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v)
 
     for (i = 0; i < NR_SOFTIRQS; i++)
         seq_printf(p, " %u", per_softirq_sums[i]);
-    seq_printf(p, "\n");
+    seq_putc(p, '\n');
 
     return 0;
 }
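
Aside: the /proc/stat hunks replace two O(nr_irqs * nr_cpus) loops with
precomputed totals. What the removed inner loop computed, conceptually
(irqs_sum_slow is a hypothetical name; kstat_cpu_irqs_sum() and
kstat_irqs() are presumed to maintain these sums incrementally so the
read path no longer has to):

    static u64 irqs_sum_slow(int cpu)
    {
        u64 sum = 0;
        int j;

        for_each_irq_nr(j)
            sum += kstat_irqs_cpu(j, cpu);    /* walk every irq */
        return sum;
    }
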
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1dbca4e8cc16..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm)
     return PAGE_SIZE * mm->total_vm;
 }
 
-int task_statm(struct mm_struct *mm, int *shared, int *text,
-               int *data, int *resident)
+unsigned long task_statm(struct mm_struct *mm,
+                         unsigned long *shared, unsigned long *text,
+                         unsigned long *data, unsigned long *resident)
 {
     *shared = get_mm_counter(mm, MM_FILEPAGES);
     *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
@@ -327,6 +328,7 @@ struct mem_size_stats {
     unsigned long private_clean;
     unsigned long private_dirty;
     unsigned long referenced;
+    unsigned long anonymous;
     unsigned long swap;
     u64 pss;
 };
@@ -357,6 +359,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         if (!page)
             continue;
 
+        if (PageAnon(page))
+            mss->anonymous += PAGE_SIZE;
+
         mss->resident += PAGE_SIZE;
         /* Accumulate the size in pages that have been accessed. */
         if (pte_young(ptent) || PageReferenced(page))
@@ -410,9 +415,11 @@ static int show_smap(struct seq_file *m, void *v)
            "Private_Clean: %8lu kB\n"
            "Private_Dirty: %8lu kB\n"
            "Referenced: %8lu kB\n"
+           "Anonymous: %8lu kB\n"
            "Swap: %8lu kB\n"
            "KernelPageSize: %8lu kB\n"
-           "MMUPageSize: %8lu kB\n",
+           "MMUPageSize: %8lu kB\n"
+           "Locked: %8lu kB\n",
            (vma->vm_end - vma->vm_start) >> 10,
            mss.resident >> 10,
            (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -421,9 +428,12 @@ static int show_smap(struct seq_file *m, void *v)
            mss.private_clean >> 10,
            mss.private_dirty >> 10,
            mss.referenced >> 10,
+           mss.anonymous >> 10,
            mss.swap >> 10,
            vma_kernel_pagesize(vma) >> 10,
-           vma_mmu_pagesize(vma) >> 10);
+           vma_mmu_pagesize(vma) >> 10,
+           (vma->vm_flags & VM_LOCKED) ?
+               (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
     if (m->count < m->size)  /* vma is copied successfully */
         m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
@@ -539,6 +549,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 
 const struct file_operations proc_clear_refs_operations = {
     .write = clear_refs_write,
+    .llseek = noop_llseek,
 };
 
 struct pagemapread {
@@ -699,6 +710,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
  * skip over unmapped regions.
  */
 #define PAGEMAP_WALK_SIZE (PMD_SIZE)
+#define PAGEMAP_WALK_MASK (PMD_MASK)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
                             size_t count, loff_t *ppos)
 {
@@ -769,7 +781,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
         unsigned long end;
 
         pm.pos = 0;
-        end = start_vaddr + PAGEMAP_WALK_SIZE;
+        end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
         /* overflow ? */
         if (end < start_vaddr || end > end_vaddr)
             end = end_vaddr;
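
Aside: the PAGEMAP_WALK_MASK fix rounds each chunk's end down to a PMD
boundary so one pagemap_read() walk never straddles two PMDs. Worked
through with a 2 MiB PMD (PMD_MASK == ~(2 MiB - 1)): for start_vaddr
0x1ff000, 0x1ff000 + 0x200000 = 0x3ff000, and 0x3ff000 & PMD_MASK =
0x200000, so the first chunk stops exactly at the boundary rather than
at 0x3ff000. In code:

    static unsigned long walk_end(unsigned long start_vaddr)
    {
        /* old: start_vaddr + PAGEMAP_WALK_SIZE (could cross a PMD) */
        return (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
    }
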
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index cb6306e63843..b535d3e5d5f1 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm)
     return vsize;
 }
 
-int task_statm(struct mm_struct *mm, int *shared, int *text,
-               int *data, int *resident)
+unsigned long task_statm(struct mm_struct *mm,
+                         unsigned long *shared, unsigned long *text,
+                         unsigned long *data, unsigned long *resident)
 {
     struct vm_area_struct *vma;
     struct vm_region *region;
     struct rb_node *p;
-    int size = kobjsize(mm);
+    unsigned long size = kobjsize(mm);
 
     down_read(&mm->mmap_sem);
     for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 2367fb3f70bc..74802bc5ded9 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -499,7 +499,7 @@ static int __init parse_crash_elf64_headers(void)
     /* Do some basic Verification. */
     if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
         (ehdr.e_type != ET_CORE) ||
-        !vmcore_elf_check_arch(&ehdr) ||
+        !vmcore_elf64_check_arch(&ehdr) ||
         ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
         ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
         ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6e8fc62b40a8..7b0329468a5d 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
  * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
@@ -29,8 +28,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
     QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
     QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
 
-    lock_kernel();
-
     while (filp->f_pos < inode->i_size) {
         blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
         bh = sb_bread(inode->i_sb, blknum);
@@ -71,7 +68,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
         brelse(bh);
     }
 out:
-    unlock_kernel();
     return 0;
 }
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 16829722be93..e63b4171d583 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -16,7 +16,6 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/highuid.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
@@ -157,8 +156,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
     struct super_block *sb = dentry->d_sb;
     u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
-    lock_kernel();
-
     buf->f_type = sb->s_magic;
     buf->f_bsize = sb->s_blocksize;
     buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8;
@@ -168,8 +165,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
     buf->f_fsid.val[0] = (u32)id;
     buf->f_fsid.val[1] = (u32)(id >> 32);
 
-    unlock_kernel();
-
     return 0;
 }
 
@@ -283,7 +278,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
         goto outi;
 
     brelse(bh);
-
     return 0;
 
  outi:
@@ -431,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb)
     return &ei->vfs_inode;
 }
 
-static void qnx4_destroy_inode(struct inode *inode)
+static void qnx4_i_callback(struct rcu_head *head)
 {
+    struct inode *inode = container_of(head, struct inode, i_rcu);
+    INIT_LIST_HEAD(&inode->i_dentry);
     kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
 
+static void qnx4_destroy_inode(struct inode *inode)
+{
+    call_rcu(&inode->i_rcu, qnx4_i_callback);
+}
+
 static void init_once(void *foo)
 {
     struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
@@ -460,17 +461,16 @@ static void destroy_inodecache(void)
     kmem_cache_destroy(qnx4_inode_cachep);
 }
 
-static int qnx4_get_sb(struct file_system_type *fs_type,
-    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *qnx4_mount(struct file_system_type *fs_type,
+    int flags, const char *dev_name, void *data)
 {
-    return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
-                       mnt);
+    return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
 }
 
 static struct file_system_type qnx4_fs_type = {
     .owner = THIS_MODULE,
     .name = "qnx4",
-    .get_sb = qnx4_get_sb,
+    .mount = qnx4_mount,
     .kill_sb = kill_block_super,
     .fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 58703ebba879..275327b5615e 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
  * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
@@ -109,7 +108,6 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
     int len = dentry->d_name.len;
     struct inode *foundinode = NULL;
 
-    lock_kernel();
     if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino)))
         goto out;
     /* The entry is linked, let's get the real info */
@@ -123,13 +121,11 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
 
     foundinode = qnx4_iget(dir->i_sb, ino);
     if (IS_ERR(foundinode)) {
-        unlock_kernel();
         QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
                    PTR_ERR(foundinode)));
         return ERR_CAST(foundinode);
     }
 out:
-    unlock_kernel();
     d_add(dentry, foundinode);
 
     return NULL;
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 3e21b1e2ad3a..880fd9884366 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -4,6 +4,7 @@
 
 config QUOTA
     bool "Quota support"
+    select QUOTACTL
     help
       If you say Y here, you will be able to set per user limits for disk
       usage (also called disk quotas). Currently, it works for the
@@ -65,8 +66,7 @@ config QFMT_V2
 
 config QUOTACTL
     bool
-    depends on XFS_QUOTA || QUOTA
-    default y
+    default n
 
 config QUOTACTL_COMPAT
     bool
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index aad1316a977f..a2a622e079f0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -133,16 +133,20 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
 
 void __quota_error(struct super_block *sb, const char *func,
                    const char *fmt, ...)
 {
-    va_list args;
-
     if (printk_ratelimit()) {
+        va_list args;
+        struct va_format vaf;
+
         va_start(args, fmt);
-        printk(KERN_ERR "Quota error (device %s): %s: ",
-               sb->s_id, func);
-        vprintk(fmt, args);
-        printk("\n");
+
+        vaf.fmt = fmt;
+        vaf.va = &args;
+
+        printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
+               sb->s_id, func, &vaf);
+
         va_end(args);
     }
 }
@@ -1386,6 +1390,9 @@ static void __dquot_initialize(struct inode *inode, int type)
         /* Avoid races with quotaoff() */
         if (!sb_has_quota_active(sb, cnt))
             continue;
+        /* We could race with quotaon or dqget() could have failed */
+        if (!got[cnt])
+            continue;
         if (!inode->i_dquot[cnt]) {
             inode->i_dquot[cnt] = got[cnt];
             got[cnt] = NULL;
@@ -1736,6 +1743,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
     qsize_t rsv_space = 0;
     struct dquot *transfer_from[MAXQUOTAS] = {};
     int cnt, ret = 0;
+    char is_valid[MAXQUOTAS] = {};
     char warntype_to[MAXQUOTAS];
     char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
 
@@ -1757,8 +1765,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
     space = cur_space + rsv_space;
     /* Build the transfer_from list and check the limits */
     for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+        /*
+         * Skip changes for same uid or gid or for turned off quota-type.
+         */
         if (!transfer_to[cnt])
             continue;
+        /* Avoid races with quotaoff() */
+        if (!sb_has_quota_active(inode->i_sb, cnt))
+            continue;
+        is_valid[cnt] = 1;
         transfer_from[cnt] = inode->i_dquot[cnt];
         ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
         if (ret)
@@ -1772,12 +1787,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
      * Finally perform the needed transfer from transfer_from to transfer_to
      */
     for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-        /*
-         * Skip changes for same uid or gid or for turned off quota-type.
-         */
-        if (!transfer_to[cnt])
+        if (!is_valid[cnt])
             continue;
-
         /* Due to IO error we might not have transfer_from[] structure */
         if (transfer_from[cnt]) {
             warntype_from_inodes[cnt] =
@@ -1801,18 +1812,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 
     mark_all_dquot_dirty(transfer_from);
     mark_all_dquot_dirty(transfer_to);
-    /* Pass back references to put */
-    for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-        transfer_to[cnt] = transfer_from[cnt];
-warn:
     flush_warnings(transfer_to, warntype_to);
     flush_warnings(transfer_from, warntype_from_inodes);
     flush_warnings(transfer_from, warntype_from_space);
-    return ret;
+    /* Pass back references to put */
+    for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+        if (is_valid[cnt])
+            transfer_to[cnt] = transfer_from[cnt];
+    return 0;
 over_quota:
     spin_unlock(&dq_data_lock);
     up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-    goto warn;
+    flush_warnings(transfer_to, warntype_to);
+    return ret;
 }
 EXPORT_SYMBOL(__dquot_transfer);
 
@@ -2177,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type)
 }
 EXPORT_SYMBOL(dquot_resume);
 
-int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
-                        struct path *path)
+int dquot_quota_on(struct super_block *sb, int type, int format_id,
+                   struct path *path)
 {
     int error = security_quota_on(path->dentry);
     if (error)
@@ -2192,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
                                 DQUOT_LIMITS_ENABLED);
     return error;
 }
-EXPORT_SYMBOL(dquot_quota_on_path);
-
-int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
-{
-    struct path path;
-    int error;
-
-    error = kern_path(name, LOOKUP_FOLLOW, &path);
-    if (!error) {
-        error = dquot_quota_on_path(sb, type, format_id, &path);
-        path_put(&path);
-    }
-    return error;
-}
 EXPORT_SYMBOL(dquot_quota_on);
 
 /*
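
Aside: the __quota_error() rewrite uses the %pV/struct va_format idiom
so the device prefix and the caller's message are emitted by a single
printk() and cannot interleave with output from other CPUs. The core of
the idiom, reduced to a sketch (example_error is a hypothetical name):

    static void example_error(const char *fmt, ...)
    {
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        /* one atomic line: prefix plus the expanded caller format */
        printk(KERN_ERR "example: %pV\n", &vaf);
        va_end(args);
    }
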
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b299961e1edb..b34bdb25490c 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -64,18 +64,15 @@ static int quota_sync_all(int type)
 }
 
 static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
-                         void __user *addr)
+                         struct path *path)
 {
-    char *pathname;
-    int ret = -ENOSYS;
-
-    pathname = getname(addr);
-    if (IS_ERR(pathname))
-        return PTR_ERR(pathname);
-    if (sb->s_qcop->quota_on)
-        ret = sb->s_qcop->quota_on(sb, type, id, pathname);
-    putname(pathname);
-    return ret;
+    if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
+        return -ENOSYS;
+    if (sb->s_qcop->quota_on_meta)
+        return sb->s_qcop->quota_on_meta(sb, type, id);
+    if (IS_ERR(path))
+        return PTR_ERR(path);
+    return sb->s_qcop->quota_on(sb, type, id, path);
 }
 
 static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
@@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
 
 /* Copy parameters and call proper function */
 static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
-                       void __user *addr)
+                       void __user *addr, struct path *path)
 {
     int ret;
 
@@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 
     switch (cmd) {
     case Q_QUOTAON:
-        return quota_quotaon(sb, type, cmd, id, addr);
+        return quota_quotaon(sb, type, cmd, id, path);
     case Q_QUOTAOFF:
         if (!sb->s_qcop->quota_off)
             return -ENOSYS;
@@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 {
     uint cmds, type;
     struct super_block *sb = NULL;
+    struct path path, *pathp = NULL;
     int ret;
 
     cmds = cmd >> SUBCMDSHIFT;
@@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
         return -ENODEV;
     }
 
+    /*
+     * Path for quotaon has to be resolved before grabbing superblock
+     * because that gets s_umount sem which is also possibly needed by path
+     * resolution (think about autofs) and thus deadlocks could arise.
+     */
+    if (cmds == Q_QUOTAON) {
+        ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path);
+        if (ret)
+            pathp = ERR_PTR(ret);
+        else
+            pathp = &path;
+    }
+
     sb = quotactl_block(special);
     if (IS_ERR(sb))
         return PTR_ERR(sb);
 
-    ret = do_quotactl(sb, type, cmds, id, addr);
+    ret = do_quotactl(sb, type, cmds, id, addr, pathp);
 
     drop_super(sb);
+    if (pathp && !IS_ERR(pathp))
+        path_put(pathp);
     return ret;
 }
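
Aside: the Q_QUOTAON reordering is a lock-ordering fix. Path resolution
may itself need s_umount (think of a component that triggers an autofs
mount), while quotactl_block() takes s_umount during the superblock
lookup, so the path has to be resolved first and only handed to
->quota_on() afterwards. The ordering in outline (quotaon_ordering is a
hypothetical helper, not part of the patch):

    static int quotaon_ordering(const char __user *addr, struct path *path)
    {
        /* 1. resolve the target while no s_umount is held;
         *    this may sleep and may trigger an automount */
        int ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, path);

        if (ret)
            return ret;
        /* 2. only now look up the superblock (takes s_umount);
         * 3. call ->quota_on(sb, type, id, path), then path_put() */
        return 0;
    }
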
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 9e48874eabcc..e41c1becf096 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -468,8 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
             return -ENOMEM;
         ret = read_blk(info, *blk, buf);
         if (ret < 0) {
-            quota_error(dquot->dq_sb, "Can't read quota data "
-                        "block %u", blk);
+            quota_error(dquot->dq_sb, "Can't read quota data block %u",
+                        *blk);
             goto out_buf;
         }
         newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -493,8 +493,9 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
         } else {
             ret = write_blk(info, *blk, buf);
             if (ret < 0)
-                quota_error(dquot->dq_sb, "Can't write quota "
-                            "tree block %u", blk);
+                quota_error(dquot->dq_sb,
+                            "Can't write quota tree block %u",
+                            *blk);
         }
     }
 out_buf:
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6d..eacb166fb259 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
     struct inode * inode = new_inode(sb);
 
     if (inode) {
+        inode->i_ino = get_next_ino();
         inode_init_owner(inode, dir, mode);
         inode->i_mapping->a_ops = &ramfs_aops;
         inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
@@ -254,17 +255,16 @@ fail:
     return err;
 }
 
-int ramfs_get_sb(struct file_system_type *fs_type,
-    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+struct dentry *ramfs_mount(struct file_system_type *fs_type,
+    int flags, const char *dev_name, void *data)
 {
-    return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
+    return mount_nodev(fs_type, flags, data, ramfs_fill_super);
 }
 
-static int rootfs_get_sb(struct file_system_type *fs_type,
-    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *rootfs_mount(struct file_system_type *fs_type,
+    int flags, const char *dev_name, void *data)
 {
-    return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
-                        mnt);
+    return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
 }
 
 static void ramfs_kill_sb(struct super_block *sb)
@@ -275,12 +275,12 @@ static void ramfs_kill_sb(struct super_block *sb)
 
 static struct file_system_type ramfs_fs_type = {
     .name = "ramfs",
-    .get_sb = ramfs_get_sb,
+    .mount = ramfs_mount,
     .kill_sb = ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
     .name = "rootfs",
-    .get_sb = rootfs_get_sb,
+    .mount = rootfs_mount,
     .kill_sb = kill_litter_super,
 };
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 74e36586e4d3..5520f8ad5504 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
 #include <linux/fcntl.h>
 #include <linux/file.h>
 #include <linux/uio.h>
-#include <linux/smp_lock.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include <linux/module.h>
@@ -31,6 +30,11 @@ const struct file_operations generic_ro_fops = {
 
 EXPORT_SYMBOL(generic_ro_fops);
 
+static inline int unsigned_offsets(struct file *file)
+{
+    return file->f_mode & FMODE_UNSIGNED_OFFSET;
+}
+
 /**
  * generic_file_llseek_unlocked - lockless generic llseek implementation
  * @file: file structure to seek on
@@ -62,7 +66,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
         break;
     }
 
-    if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+    if (offset < 0 && !unsigned_offsets(file))
+        return -EINVAL;
+    if (offset > inode->i_sb->s_maxbytes)
         return -EINVAL;
 
     /* Special lock needed here? */
@@ -124,7 +130,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 {
     loff_t retval;
 
-    lock_kernel();
+    mutex_lock(&file->f_dentry->d_inode->i_mutex);
     switch (origin) {
         case SEEK_END:
             offset += i_size_read(file->f_path.dentry->d_inode);
@@ -137,7 +143,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
             offset += file->f_pos;
     }
     retval = -EINVAL;
-    if (offset >= 0) {
+    if (offset >= 0 || unsigned_offsets(file)) {
         if (offset != file->f_pos) {
             file->f_pos = offset;
             file->f_version = 0;
@@ -145,7 +151,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
         retval = offset;
     }
 out:
-    unlock_kernel();
+    mutex_unlock(&file->f_dentry->d_inode->i_mutex);
     return retval;
 }
 EXPORT_SYMBOL(default_llseek);
@@ -156,7 +162,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 
     fn = no_llseek;
     if (file->f_mode & FMODE_LSEEK) {
-        fn = default_llseek;
         if (file->f_op && file->f_op->llseek)
             fn = file->f_op->llseek;
     }
@@ -222,13 +227,12 @@ bad:
 }
 #endif
 
+
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
  * won't have to do range checks all the time.
  */
-#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
-
 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 {
     struct inode *inode;
@@ -239,8 +243,15 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
     if (unlikely((ssize_t) count < 0))
         return retval;
     pos = *ppos;
-    if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
-        return retval;
+    if (unlikely(pos < 0)) {
+        if (!unsigned_offsets(file))
+            return retval;
+        if (count >= -pos) /* both values are in 0..LLONG_MAX */
+            return -EOVERFLOW;
+    } else if (unlikely((loff_t) (pos + count) < 0)) {
+        if (!unsigned_offsets(file))
+            return retval;
+    }
 
     if (unlikely(inode->i_flock && mandatory_lock(inode))) {
         retval = locks_mandatory_area(
@@ -565,65 +576,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
                               unsigned long nr_segs, unsigned long fast_segs,
                               struct iovec *fast_pointer,
                               struct iovec **ret_pointer)
- {
+{
     unsigned long seg;
     ssize_t ret;
     struct iovec *iov = fast_pointer;
 
     /*
     * SuS says "The readv() function *may* fail if the iovcnt argument
     * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
     * traditionally returned zero for zero segments, so...
     */
     if (nr_segs == 0) {
         ret = 0;
         goto out;
     }
 
     /*
     * First get the "struct iovec" from user memory and
     * verify all the pointers
     */
     if (nr_segs > UIO_MAXIOV) {
         ret = -EINVAL;
         goto out;
     }
     if (nr_segs > fast_segs) {
         iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
         if (iov == NULL) {
             ret = -ENOMEM;
             goto out;
         }
     }
     if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
         ret = -EFAULT;
         goto out;
     }
 
     /*
     * According to the Single Unix Specification we should return EINVAL
     * if an element length is < 0 when cast to ssize_t or if the
     * total length would overflow the ssize_t return value of the
     * system call.
-    */
+    *
+    * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
+    * overflow case.
+    */
     ret = 0;
     for (seg = 0; seg < nr_segs; seg++) {
         void __user *buf = iov[seg].iov_base;
         ssize_t len = (ssize_t)iov[seg].iov_len;
 
         /* see if we we're about to use an invalid len or if
         * it's about to overflow ssize_t */
-        if (len < 0 || (ret + len < ret)) {
+        if (len < 0) {
             ret = -EINVAL;
             goto out;
         }
         if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
             ret = -EFAULT;
             goto out;
+        }
+        if (len > MAX_RW_COUNT - ret) {
+            len = MAX_RW_COUNT - ret;
+            iov[seg].iov_len = len;
         }
-
         ret += len;
     }
 out:
     *ret_pointer = iov;
     return ret;
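
Aside: rw_copy_check_uvector() now clamps instead of failing: once the
running total reaches MAX_RW_COUNT (INT_MAX rounded down to a page
boundary, per the #define moved out of this file above), the offending
segment is shortened and later segments are clamped to zero, so one
readv()/writev() is silently truncated rather than rejected. A worked
sketch of the capping loop (cap_iov_total is a hypothetical name):

    static ssize_t cap_iov_total(struct iovec *iov, unsigned long nr_segs)
    {
        ssize_t ret = 0;
        unsigned long seg;

        for (seg = 0; seg < nr_segs; seg++) {
            ssize_t len = (ssize_t)iov[seg].iov_len;

            if (len < 0)
                return -EINVAL;
            if (len > MAX_RW_COUNT - ret) {
                /* e.g. three 1 GiB segments: the third is cut short */
                len = MAX_RW_COUNT - ret;
                iov[seg].iov_len = len;
            }
            ret += len;
        }
        return ret;    /* never exceeds MAX_RW_COUNT */
    }
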
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 513f431038f9..7cd46666ba2c 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -10,7 +10,8 @@ config REISERFS_FS
 
       In general, ReiserFS is as fast as ext2, but is very efficient with
       large directories and small files. Additional patches are needed
-      for NFS and quotas, please see <http://www.namesys.com/> for links.
+      for NFS and quotas, please see
+      <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
 
       It is more easily extended to have features currently found in
       database and keyword search systems than block allocation based file
@@ -18,7 +19,8 @@ config REISERFS_FS
       plugins consistent with our motto ``It takes more than a license to
       make source code open.''
 
-      Read <http://www.namesys.com/> to learn more about reiserfs.
+      Read <https://reiser4.wiki.kernel.org/index.php/Main_Page>
+      to learn more about reiserfs.
 
       Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
 
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 14e8c9d460e5..e2f7a264e3ff 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -43,7 +43,7 @@ to address the fair crediting issue in the next GPL version.)
 [END LICENSING]
 
 Reiserfs is a file system based on balanced tree algorithms, which is
-described at http://devlinux.com/namesys.
+described at https://reiser4.wiki.kernel.org/index.php/Main_Page
 
 Stop reading here. Go there, then return.
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6846371498b6..91f080cc76c8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
     barrier_done = reiserfs_commit_for_inode(inode);
     reiserfs_write_unlock(inode->i_sb);
     if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-            BLKDEV_IFL_WAIT);
+        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
     if (barrier_done < 0)
         return barrier_done;
     return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index caa758377d66..0bae036831e2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -8,7 +8,6 @@
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
@@ -22,8 +21,6 @@
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 
 void reiserfs_evict_inode(struct inode *inode)
 {
@@ -165,7 +162,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
 ** but tail is still sitting in a direct item, and we can't write to
 ** it. So, look through this page, and check all the mapped buffers
 ** to make sure they have valid block numbers. Any that don't need
-** to be unmapped, so that block_prepare_write will correctly call
+** to be unmapped, so that __block_write_begin will correctly call
 ** reiserfs_get_block to convert the tail into an unformatted node
 */
 static inline void fix_tail_page_for_writing(struct page *page)
@@ -439,13 +436,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
 }
 
 /* special version of get_block that is only used by grab_tail_page right
-** now. It is sent to block_prepare_write, and when you try to get a
+** now. It is sent to __block_write_begin, and when you try to get a
 ** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer. block_prepare_write expects to
+** -ENOENT instead of a valid buffer. __block_write_begin expects to
 ** be able to do i/o on the buffers returned, unless an error value
 ** is also returned.
 **
-** So, this allows block_prepare_write to be used for reading a single block
+** So, this allows __block_write_begin to be used for reading a single block
 ** in a page. Where it does not produce a valid page for holes, or past the
 ** end of the file. This turns out to be exactly what we need for reading
 ** tails for conversion.
@@ -558,11 +555,12 @@ static int convert_tail_for_hole(struct inode *inode,
 	**
 	** We must fix the tail page for writing because it might have buffers
 	** that are mapped, but have a block number of 0. This indicates tail
-	** data that has been read directly into the page, and block_prepare_write
-	** won't trigger a get_block in this case.
+	** data that has been read directly into the page, and
+	** __block_write_begin won't trigger a get_block in this case.
 	*/
 	fix_tail_page_for_writing(tail_page);
-	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+	retval = __reiserfs_write_begin(tail_page, tail_start,
+					tail_end - tail_start);
 	if (retval)
 		goto unlock;
 
@@ -2033,7 +2031,7 @@ static int grab_tail_page(struct inode *inode,
 	/* start within the page of the last block in the file */
 	start = (offset / blocksize) * blocksize;
 
-	error = block_prepare_write(page, start, offset,
+	error = __block_write_begin(page, start, offset - start,
 				    reiserfs_get_block_create_0);
 	if (error)
 		goto unlock;
@@ -2438,7 +2436,7 @@ static int reiserfs_write_full_page(struct page *page,
 	/* from this point on, we know the buffer is mapped to a
 	 * real block and not a direct item
 	 */
-	if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+	if (wbc->sync_mode != WB_SYNC_NONE) {
 		lock_buffer(bh);
 	} else {
 		if (!trylock_buffer(bh)) {
@@ -2628,8 +2626,7 @@ static int reiserfs_write_begin(struct file *file,
 	return ret;
 }
 
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to)
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
 {
 	struct inode *inode = page->mapping->host;
 	int ret;
@@ -2650,7 +2647,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
 		th->t_refcount++;
 	}
 
-	ret = block_prepare_write(page, from, to, reiserfs_get_block);
+	ret = __block_write_begin(page, from, len, reiserfs_get_block);
 	if (ret && reiserfs_transaction_running(inode->i_sb)) {
 		struct reiserfs_transaction_handle *th = current->journal_info;
 		/* this gets a little ugly. If reiserfs_get_block returned an
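
A minimal sketch of the (from, to) -> (pos, len) argument conversion this patch applies when replacing block_prepare_write() with __block_write_begin(); example_prepare() is hypothetical. Only the offset of pos within the passed page is used, so an in-page offset works:

	#include <linux/buffer_head.h>

	static int example_prepare(struct page *page, unsigned from, unsigned to,
				   get_block_t *get_block)
	{
		/* old convention: block_prepare_write(page, from, to, get_block) */
		return __block_write_begin(page, from, to - from, get_block);
	}
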
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 5cbb81e134ac..79265fdc317a 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,7 +9,6 @@
 #include <linux/time.h>
 #include <asm/uaccess.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/compat.h>
 
 /*
@@ -160,8 +159,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 /*
 ** reiserfs_unpack
 ** Function try to convert tail from direct item into indirect.
@@ -186,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 		return 0;
 	}
 
-	/* we need to make sure nobody is changing the file size beneath
-	** us
-	*/
-	reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
 	depth = reiserfs_write_lock_once(inode->i_sb);
 
+	/* we need to make sure nobody is changing the file size beneath us */
+	reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
+
 	write_from = inode->i_size & (blocksize - 1);
 	/* if we are on a block boundary, we are already unpacked. */
 	if (write_from == 0) {
@@ -200,7 +196,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	}
 
 	/* we unpack by finding the page with the tail, and calling
-	** reiserfs_prepare_write on that page. This will force a
+	** __reiserfs_write_begin on that page. This will force a
 	** reiserfs_get_block to unpack the tail for us.
 	*/
 	index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -210,7 +206,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	if (!page) {
 		goto out;
 	}
-	retval = reiserfs_prepare_write(NULL, page, write_from, write_from);
+	retval = __reiserfs_write_begin(page, write_from, 0);
 	if (retval)
 		goto out_unlock;
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 812e2c05aa29..3eea859e6990 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -43,7 +43,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/writeback.h>
@@ -138,13 +137,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
 	return 0;
 }
 
-static void disable_barrier(struct super_block *s)
-{
-	REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
-	printk("reiserfs: disabling flush barriers on %s\n",
-	       reiserfs_bdevname(s));
-}
-
 static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
 							 *sb)
 {
@@ -677,30 +669,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
 	submit_bh(WRITE, bh);
 }
 
-static int submit_barrier_buffer(struct buffer_head *bh)
-{
-	get_bh(bh);
-	bh->b_end_io = reiserfs_end_ordered_io;
-	clear_buffer_dirty(bh);
-	if (!buffer_uptodate(bh))
-		BUG();
-	return submit_bh(WRITE_BARRIER, bh);
-}
-
-static void check_barrier_completion(struct super_block *s,
-				     struct buffer_head *bh)
-{
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		disable_barrier(s);
-		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
-		reiserfs_write_unlock(s);
-		sync_dirty_buffer(bh);
-		reiserfs_write_lock(s);
-	}
-}
-
 #define CHUNK_SIZE 32
 struct buffer_chunk {
 	struct buffer_head *bh[CHUNK_SIZE];
@@ -1009,7 +977,6 @@ static int flush_commit_list(struct super_block *s,
 	struct buffer_head *tbh = NULL;
 	unsigned int trans_id = jl->j_trans_id;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int barrier = 0;
 	int retval = 0;
 	int write_len;
 
@@ -1094,24 +1061,6 @@ static int flush_commit_list(struct super_block *s,
 	}
 	atomic_dec(&journal->j_async_throttle);
 
-	/* We're skipping the commit if there's an error */
-	if (retval || reiserfs_is_journal_aborted(journal))
-		barrier = 0;
-
-	/* wait on everything written so far before writing the commit
-	 * if we are in barrier mode, send the commit down now
-	 */
-	barrier = reiserfs_barrier_flush(s);
-	if (barrier) {
-		int ret;
-		lock_buffer(jl->j_commit_bh);
-		ret = submit_barrier_buffer(jl->j_commit_bh);
-		if (ret == -EOPNOTSUPP) {
-			set_buffer_uptodate(jl->j_commit_bh);
-			disable_barrier(s);
-			barrier = 0;
-		}
-	}
 	for (i = 0; i < (jl->j_len + 1); i++) {
 		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
 		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1143,27 +1092,22 @@ static int flush_commit_list(struct super_block *s,
 
 	BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
 
-	if (!barrier) {
-		/* If there was a write error in the journal - we can't commit
-		 * this transaction - it will be invalid and, if successful,
-		 * will just end up propagating the write error out to
-		 * the file system. */
-		if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
-			if (buffer_dirty(jl->j_commit_bh))
-				BUG();
-			mark_buffer_dirty(jl->j_commit_bh) ;
-			reiserfs_write_unlock(s);
-			sync_dirty_buffer(jl->j_commit_bh) ;
-			reiserfs_write_lock(s);
-		}
-	} else {
+	/* If there was a write error in the journal - we can't commit
+	 * this transaction - it will be invalid and, if successful,
+	 * will just end up propagating the write error out to
+	 * the file system. */
+	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
+		if (buffer_dirty(jl->j_commit_bh))
+			BUG();
+		mark_buffer_dirty(jl->j_commit_bh) ;
 		reiserfs_write_unlock(s);
-		wait_on_buffer(jl->j_commit_bh);
+		if (reiserfs_barrier_flush(s))
+			__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(jl->j_commit_bh);
 		reiserfs_write_lock(s);
 	}
 
-	check_barrier_completion(s, jl->j_commit_bh);
-
 	/* If there was a write error in the journal - we can't commit this
 	 * transaction - it will be invalid and, if successful, will just end
 	 * up propagating the write error out to the filesystem. */
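
A minimal sketch of the commit-block idiom introduced above: an explicit flush+FUA write via __sync_dirty_buffer() replaces the old WRITE_BARRIER submission and its -EOPNOTSUPP fallback machinery; example_write_commit_block() is hypothetical, not part of this patch:

	#include <linux/buffer_head.h>
	#include <linux/fs.h>

	static void example_write_commit_block(struct buffer_head *bh, int barrier)
	{
		mark_buffer_dirty(bh);
		if (barrier)
			/* drain the device cache, then force the write through it */
			__sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
		else
			sync_dirty_buffer(bh);
	}
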
@@ -1319,26 +1263,15 @@ static int _update_journal_header_block(struct super_block *sb,
 		jh->j_first_unflushed_offset = cpu_to_le32(offset);
 		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
 
-		if (reiserfs_barrier_flush(sb)) {
-			int ret;
-			lock_buffer(journal->j_header_bh);
-			ret = submit_barrier_buffer(journal->j_header_bh);
-			if (ret == -EOPNOTSUPP) {
-				set_buffer_uptodate(journal->j_header_bh);
-				disable_barrier(sb);
-				goto sync;
-			}
-			reiserfs_write_unlock(sb);
-			wait_on_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-			check_barrier_completion(sb, journal->j_header_bh);
-		} else {
-		      sync:
-			set_buffer_dirty(journal->j_header_bh);
-			reiserfs_write_unlock(sb);
-			sync_dirty_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-		}
+		set_buffer_dirty(journal->j_header_bh);
+		reiserfs_write_unlock(sb);
+
+		if (reiserfs_barrier_flush(sb))
+			__sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(journal->j_header_bh);
+
+		reiserfs_write_lock(sb);
 		if (!buffer_uptodate(journal->j_header_bh)) {
 			reiserfs_warning(sb, "journal-837",
 					 "IO error during journal replay");
@@ -2618,8 +2551,6 @@ static int release_journal_dev(struct super_block *super,
 	result = 0;
 
 	if (journal->j_dev_bd != NULL) {
-		if (journal->j_dev_bd->bd_dev != super->s_dev)
-			bd_release(journal->j_dev_bd);
 		result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
 		journal->j_dev_bd = NULL;
 	}
@@ -2637,7 +2568,7 @@ static int journal_init_dev(struct super_block *super,
 {
 	int result;
 	dev_t jdev;
-	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
+	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
 	char b[BDEVNAME_SIZE];
 
 	result = 0;
@@ -2651,7 +2582,10 @@ static int journal_init_dev(struct super_block *super,
 
 	/* there is no "jdev" option and journal is on separate device */
 	if ((!jdev_name || !jdev_name[0])) {
-		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+		if (jdev == super->s_dev)
+			blkdev_mode &= ~FMODE_EXCL;
+		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
+						      journal);
 		journal->j_dev_mode = blkdev_mode;
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
@@ -2660,22 +2594,14 @@ static int journal_init_dev(struct super_block *super,
 					  "cannot init journal device '%s': %i",
 					  __bdevname(jdev, b), result);
 			return result;
-		} else if (jdev != super->s_dev) {
-			result = bd_claim(journal->j_dev_bd, journal);
-			if (result) {
-				blkdev_put(journal->j_dev_bd, blkdev_mode);
-				return result;
-			}
-
+		} else if (jdev != super->s_dev)
 			set_blocksize(journal->j_dev_bd, super->s_blocksize);
-		}
 
 		return 0;
 	}
 
 	journal->j_dev_mode = blkdev_mode;
-	journal->j_dev_bd = open_bdev_exclusive(jdev_name,
-						blkdev_mode, journal);
+	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
 	if (IS_ERR(journal->j_dev_bd)) {
 		result = PTR_ERR(journal->j_dev_bd);
 		journal->j_dev_bd = NULL;
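
A minimal sketch of the exclusive-open idiom the journal code switches to here: FMODE_EXCL plus a holder cookie replaces open_by_devnum()/open_bdev_exclusive() with a separate bd_claim(); example_open_log_dev() is hypothetical:

	#include <linux/blkdev.h>

	static struct block_device *example_open_log_dev(dev_t dev, void *holder)
	{
		const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;

		/* release later with blkdev_put(bdev, mode), same mode bits,
		 * which also drops the exclusive claim held by "holder" */
		return blkdev_get_by_dev(dev, mode, holder);
	}
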
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086a..ba5f51ec3458 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 	inode->i_ctime = CURRENT_TIME_SEC;
 	reiserfs_update_sd(&th, inode);
 
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(dentry, inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 	reiserfs_write_unlock(dir->i_sb);
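
A minimal sketch of the link(2) idiom after i_count became private to the VFS: ihold() takes the reference that d_instantiate() hands to the new dentry; example_link_finish() is hypothetical:

	#include <linux/fs.h>

	static void example_link_finish(struct dentry *dentry, struct inode *inode)
	{
		ihold(inode);		/* was: atomic_inc(&inode->i_count) */
		d_instantiate(dentry, inode);
	}
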
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index adbc6f538515..45de98b59466 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int l
 	va_list args;
 	int mode, first, last;
 
-	va_start(args, bh);
-
 	if (!bh) {
 		printk("print_block: buffer is NULL\n");
 		return;
 	}
 
+	va_start(args, bh);
+
 	mode = va_arg(args, int);
 	first = va_arg(args, int);
 	last = va_arg(args, int);
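
A minimal sketch of the varargs rule behind this fix: every va_start() must be paired with a va_end() on every return path, so the cheap NULL check moves ahead of va_start(); example_print() is hypothetical:

	#include <stdarg.h>

	static void example_print(void *obj, ...)
	{
		va_list args;

		if (!obj)
			return;		/* nothing started, nothing to clean up */

		va_start(args, obj);
		/* ... va_arg() consumption goes here ... */
		va_end(args);
	}
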
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index e15ff612002d..0aab04f46827 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,7 +28,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 
 struct file_system_type reiserfs_fs_type;
 
@@ -530,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void reiserfs_destroy_inode(struct inode *inode)
+static void reiserfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
 
+static void reiserfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, reiserfs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
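
A minimal sketch of the RCU-delayed inode free adopted here for RCU path walk: the inode memory must survive a grace period after ->destroy_inode(), and i_dentry is reinitialized because it shares storage with i_rcu in this era's struct inode. The example_* names are hypothetical, and for simplicity the cache is assumed to hold bare struct inode objects (real filesystems free the containing fs-specific struct):

	#include <linux/fs.h>
	#include <linux/slab.h>

	static struct kmem_cache *example_inode_cachep;

	static void example_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		INIT_LIST_HEAD(&inode->i_dentry);	/* unions with i_rcu */
		kmem_cache_free(example_inode_cachep, inode);
	}

	static void example_destroy_inode(struct inode *inode)
	{
		/* free only after all RCU-walk readers are done */
		call_rcu(&inode->i_rcu, example_i_callback);
	}
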
@@ -626,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *);
+static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
 
 static const struct dquot_operations reiserfs_quota_operations = {
 	.write_dquot = reiserfs_write_dquot,
@@ -2042,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *name)
+			     struct path *path)
 {
 	int err;
-	struct path path;
 	struct inode *inode;
 	struct reiserfs_transaction_handle th;
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
 
-	err = kern_path(name, LOOKUP_FOLLOW, &path);
-	if (err)
-		return err;
 	/* Quotafile not on the same filesystem? */
-	if (path.mnt->mnt_sb != sb) {
+	if (path->mnt->mnt_sb != sb) {
 		err = -EXDEV;
 		goto out;
 	}
-	inode = path.dentry->d_inode;
+	inode = path->dentry->d_inode;
 	/* We must not pack tails for quota files on reiserfs for quota IO to work */
 	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
 		err = reiserfs_unpack(inode, NULL);
@@ -2076,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	/* Journaling quota? */
 	if (REISERFS_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
-		if (path.dentry->d_parent != sb->s_root)
+		if (path->dentry->d_parent != sb->s_root)
 			reiserfs_warning(sb, "super-6521",
 				 "Quota file not on filesystem root. "
 				 "Journalled quota will not work.");
@@ -2095,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = dquot_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on(sb, type, format_id, path);
 out:
-	path_put(&path);
 	return err;
 }
 
@@ -2213,12 +2214,11 @@ out:
 
 #endif
 
-static int get_super_block(struct file_system_type *fs_type,
+static struct dentry *get_super_block(struct file_system_type *fs_type,
 			   int flags, const char *dev_name,
-			   void *data, struct vfsmount *mnt)
+			   void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
-			   mnt);
+	return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
 }
 
 static int __init init_reiserfs_fs(void)
@@ -2253,7 +2253,7 @@ static void __exit exit_reiserfs_fs(void)
 struct file_system_type reiserfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "reiserfs",
-	.get_sb = get_super_block,
+	.mount = get_super_block,
 	.kill_sb = reiserfs_kill_sb,
 	.fs_flags = FS_REQUIRES_DEV,
 };
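
A minimal sketch of the .get_sb -> .mount conversion applied above: the new hook returns the root dentry instead of filling in a vfsmount. The example_* names are hypothetical:

	#include <linux/fs.h>
	#include <linux/module.h>

	static int example_fill_super(struct super_block *sb, void *data, int silent);

	static struct dentry *example_mount(struct file_system_type *fs_type,
					    int flags, const char *dev_name,
					    void *data)
	{
		return mount_bdev(fs_type, flags, dev_name, data, example_fill_super);
	}

	static struct file_system_type example_fs_type = {
		.owner    = THIS_MODULE,
		.name     = "example",
		.mount    = example_mount,
		.kill_sb  = kill_block_super,
		.fs_flags = FS_REQUIRES_DEV,
	};
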
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c672..3cfb2e933644 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len)
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to);
 
 static void update_ctime(struct inode *inode)
 {
 	struct timespec now = current_fs_time(inode->i_sb);
-	if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
+	if (inode_unhashed(inode) || !inode->i_nlink ||
 	    timespec_equal(&inode->i_ctime, &now))
 		return;
 
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 			rxh->h_hash = cpu_to_le32(xahash);
 		}
 
-		err = reiserfs_prepare_write(NULL, page, page_offset,
-					    page_offset + chunk + skip);
+		err = __reiserfs_write_begin(page, page_offset, chunk + skip);
 		if (!err) {
 			if (buffer)
 				memcpy(data + skip, buffer + buffer_pos, chunk);
@@ -873,11 +870,14 @@ out:
 	return err;
 }
 
-static int reiserfs_check_acl(struct inode *inode, int mask)
+static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 	int error = -EAGAIN; /* do regular unix permission checks by default */
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
 
 	if (acl) {
@@ -954,8 +954,10 @@ static int xattr_mount_check(struct super_block *s)
 	return 0;
 }
 
-int reiserfs_permission(struct inode *inode, int mask)
+int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 	/*
 	 * We don't do permission checks on the internal objects.
 	 * Permissions are determined by the "owning" object.
@@ -968,13 +970,16 @@ int reiserfs_permission(struct inode *inode, int mask)
 	 * Stat data v1 doesn't support ACLs.
 	 */
 	if (get_inode_sd_version(inode) != STAT_DATA_V1)
-		return generic_permission(inode, mask, reiserfs_check_acl);
+		return generic_permission(inode, mask, flags,
+					  reiserfs_check_acl);
 #endif
-	return generic_permission(inode, mask, NULL);
+	return generic_permission(inode, mask, flags, NULL);
 }
 
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
 	return -EPERM;
 }
 
@@ -993,7 +998,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 			      strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		dentry->d_op = &xattr_lookup_poison_ops;
+		d_set_d_op(dentry, &xattr_lookup_poison_ops);
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
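
A minimal sketch of the RCU-walk contract added above: ->permission() and ACL checkers now receive a flags argument, and anything that might sleep must return -ECHILD under IPERM_FLAG_RCU so the VFS retries the lookup in ref-walk mode; example_permission() is hypothetical:

	#include <linux/fs.h>

	static int example_permission(struct inode *inode, int mask, unsigned int flags)
	{
		if (flags & IPERM_FLAG_RCU)
			return -ECHILD;	/* may not block under rcu_read_lock() */

		return generic_permission(inode, mask, flags, NULL);
	}
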
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 536d697a8a28..90d2fcb67a31 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode)
 		struct reiserfs_transaction_handle th;
 		size_t size = reiserfs_xattr_nblocks(inode,
 				reiserfs_acl_size(clone->a_count));
-		reiserfs_write_lock(inode->i_sb);
+		int depth;
+
+		depth = reiserfs_write_lock_once(inode->i_sb);
 		error = journal_begin(&th, inode->i_sb, size * 2);
 		if (!error) {
 			int error2;
@@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode)
 			if (error2)
 				error = error2;
 		}
-		reiserfs_write_unlock(inode->i_sb);
+		reiserfs_write_unlock_once(inode->i_sb, depth);
 	}
 	posix_acl_release(clone);
 	return error;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 42d213546894..2305e3121cb1 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -282,6 +282,7 @@ error:
 static const struct file_operations romfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= romfs_readdir,
+	.llseek		= default_llseek,
 };
 
 static const struct inode_operations romfs_dir_inode_operations = {
@@ -399,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
 /*
  * return a spent inode to the slab cache
  */
-static void romfs_destroy_inode(struct inode *inode)
+static void romfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
 
+static void romfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, romfs_i_callback);
+}
+
 /*
  * get filesystem statistics
  */
@@ -551,20 +559,19 @@ error_rsb:
 /*
  * get a superblock for mounting
  */
-static int romfs_get_sb(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
+static struct dentry *romfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data)
 {
-	int ret = -EINVAL;
+	struct dentry *ret = ERR_PTR(-EINVAL);
 
 #ifdef CONFIG_ROMFS_ON_MTD
-	ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
-			 mnt);
+	ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
 #endif
 #ifdef CONFIG_ROMFS_ON_BLOCK
-	if (ret == -EINVAL)
-		ret = get_sb_bdev(fs_type, flags, dev_name, data,
-				  romfs_fill_super, mnt);
+	if (ret == ERR_PTR(-EINVAL))
+		ret = mount_bdev(fs_type, flags, dev_name, data,
+				 romfs_fill_super);
 #endif
 	return ret;
 }
@@ -591,7 +598,7 @@ static void romfs_kill_sb(struct super_block *sb)
 static struct file_system_type romfs_fs_type = {
 	.owner		= THIS_MODULE,
	.name		= "romfs",
-	.get_sb		= romfs_get_sb,
+	.mount		= romfs_mount,
 	.kill_sb	= romfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
diff --git a/fs/select.c b/fs/select.c
index 500a669f7790..e56560d2b08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
 	return slack;
 }
 
-static long estimate_accuracy(struct timespec *tv)
+long select_estimate_accuracy(struct timespec *tv)
 {
 	unsigned long ret;
 	struct timespec now;
@@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
 		rts.tv_sec = rts.tv_nsec = 0;
 
 	if (timeval) {
+		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
+			memset(&rtv, 0, sizeof(rtv));
 		rtv.tv_sec = rts.tv_sec;
 		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
 
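
A minimal sketch of the information-leak fix above: on ABIs where struct timeval carries padding, the padding must be zeroed before the structure is copied out, or uninitialized kernel stack bytes reach user space; example_put_tv() is hypothetical:

	#include <linux/string.h>
	#include <linux/time.h>
	#include <linux/uaccess.h>

	static int example_put_tv(struct timeval __user *p, const struct timespec *ts)
	{
		struct timeval tv;

		if (sizeof(tv) > sizeof(tv.tv_sec) + sizeof(tv.tv_usec))
			memset(&tv, 0, sizeof(tv));	/* clear any padding */
		tv.tv_sec = ts->tv_sec;
		tv.tv_usec = ts->tv_nsec / NSEC_PER_USEC;
		return copy_to_user(p, &tv, sizeof(tv)) ? -EFAULT : 0;
	}
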
@@ -417,7 +419,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	}
 
 	if (end_time && !timed_out)
-		slack = estimate_accuracy(end_time);
+		slack = select_estimate_accuracy(end_time);
 
 	retval = 0;
 	for (;;) {
@@ -769,7 +771,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 	}
 
 	if (end_time && !timed_out)
-		slack = estimate_accuracy(end_time);
+		slack = select_estimate_accuracy(end_time);
 
 	for (;;) {
 		struct poll_list *walk;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e1f437be6c3c..05d6b0e78c95 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -131,7 +131,7 @@ Eoverflow:
  */
 ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	size_t copied = 0;
 	loff_t pos;
 	size_t n;
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(seq_read);
  */
 loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	loff_t retval = -EINVAL;
 
 	mutex_lock(&m->lock);
@@ -324,7 +324,7 @@ EXPORT_SYMBOL(seq_lseek);
  */
 int seq_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct seq_file *m = file->private_data;
 	kfree(m->buf);
 	kfree(m);
 	return 0;
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
 	if (size) {
 		char *p;
 
-		spin_lock(&dcache_lock);
 		p = __d_path(path, root, buf, size);
-		spin_unlock(&dcache_lock);
 		res = PTR_ERR(p);
 		if (!IS_ERR(p)) {
 			char *end = mangle_path(buf, p, esc);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1c5a6add779d..492465b451dd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 #ifdef __ARCH_SI_TRAPNO
 		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
 #endif
+#ifdef BUS_MCEERR_AO
+		/*
+		 * Other callers might not initialize the si_lsb field,
+		 * so check explicitly for the right codes here.
+		 */
+		if (kinfo->si_code == BUS_MCEERR_AR ||
+		    kinfo->si_code == BUS_MCEERR_AO)
+			err |= __put_user((short) kinfo->si_addr_lsb,
+					  &uinfo->ssi_addr_lsb);
+#endif
 		break;
 	case __SI_CHLD:
 		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
@@ -206,6 +216,7 @@ static const struct file_operations signalfd_fops = {
 	.release	= signalfd_release,
 	.poll		= signalfd_poll,
 	.read		= signalfd_read,
+	.llseek		= noop_llseek,
 };
 
 SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
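
A minimal sketch of why .llseek lines are being added across this series: with the BKL removed from the llseek default path, every file_operations is expected to pick a seek behavior explicitly, and a non-seekable fd such as signalfd chooses noop_llseek(), which succeeds without moving f_pos; example_fops/example_read are hypothetical:

	#include <linux/fs.h>

	static ssize_t example_read(struct file *f, char __user *b, size_t n, loff_t *pos);

	static const struct file_operations example_fops = {
		.read	= example_read,
		.llseek	= noop_llseek,	/* explicit: this fd is not seekable */
	};
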
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
deleted file mode 100644
index e668127c8b2e..000000000000
--- a/fs/smbfs/Kconfig
+++ /dev/null
@@ -1,55 +0,0 @@
-config SMB_FS
-	tristate "SMB file system support (OBSOLETE, please use CIFS)"
-	depends on INET
-	select NLS
-	help
-	  SMB (Server Message Block) is the protocol Windows for Workgroups
-	  (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
-	  files and printers over local networks. Saying Y here allows you to
-	  mount their file systems (often called "shares" in this context) and
-	  access them just like any other Unix directory. Currently, this
-	  works only if the Windows machines use TCP/IP as the underlying
-	  transport protocol, and not NetBEUI. For details, read
-	  <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>.
-
-	  Note: if you just want your box to act as an SMB *server* and make
-	  files and printing services available to Windows clients (which need
-	  to have a TCP/IP stack), you don't need to say Y here; you can use
-	  the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
-	  for that.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  To compile the SMB support as a module, choose M here:
-	  the module will be called smbfs. Most people say N, however.
-
-config SMB_NLS_DEFAULT
-	bool "Use a default NLS"
-	depends on SMB_FS
-	help
-	  Enabling this will make smbfs use nls translations by default. You
-	  need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
-	  settings and you need to give the default nls for the SMB server as
-	  CONFIG_SMB_NLS_REMOTE.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
-
-config SMB_NLS_REMOTE
-	string "Default Remote NLS Option"
-	depends on SMB_NLS_DEFAULT
-	default "cp437"
-	help
-	  This setting allows you to specify a default value for which
-	  codepage the server uses. If this field is left blank no
-	  translations will be done by default. The local codepage/charset
-	  default to CONFIG_NLS_DEFAULT.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
deleted file mode 100644
index 4faf8c4722c3..000000000000
--- a/fs/smbfs/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# Makefile for the linux smb-filesystem routines.
-#
-
-obj-$(CONFIG_SMB_FS) += smbfs.o
-
-smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
-		symlink.o smbiod.o request.o
-
-# If you want debugging output, you may add these flags to the EXTRA_CFLAGS
-# SMBFS_PARANOIA should normally be enabled.
-
-EXTRA_CFLAGS += -DSMBFS_PARANOIA
-#EXTRA_CFLAGS += -DSMBFS_DEBUG
-#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
-#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
-#EXTRA_CFLAGS += -Werror
-
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
deleted file mode 100644
index 8c177eb7e344..000000000000
--- a/fs/smbfs/cache.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * cache.c
- *
- * Copyright (C) 1997 by Bill Hawes
- *
- * Routines to support directory cacheing using the page cache.
- * This cache code is almost directly taken from ncpfs.
- *
- * Please add a note about your changes to smbfs in the ChangeLog file.
- */
-
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/smb_fs.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-
-#include <asm/page.h>
-
-#include "smb_debug.h"
-#include "proto.h"
-
-/*
- * Force the next attempt to use the cache to be a timeout.
- * If we can't find the page that's fine, it will cause a refresh.
- */
-void
-smb_invalid_dir_cache(struct inode * dir)
-{
-	struct smb_sb_info *server = server_from_inode(dir);
-	union smb_dir_cache *cache = NULL;
-	struct page *page = NULL;
-
-	page = grab_cache_page(&dir->i_data, 0);
-	if (!page)
-		goto out;
-
-	if (!PageUptodate(page))
-		goto out_unlock;
-
-	cache = kmap(page);
-	cache->head.time = jiffies - SMB_MAX_AGE(server);
-
-	kunmap(page);
-	SetPageUptodate(page);
-out_unlock:
-	unlock_page(page);
-	page_cache_release(page);
-out:
-	return;
-}
-
-/*
- * Mark all dentries for 'parent' as invalid, forcing them to be re-read
- */
-void
-smb_invalidate_dircache_entries(struct dentry *parent)
-{
-	struct smb_sb_info *server = server_from_dentry(parent);
-	struct list_head *next;
-	struct dentry *dentry;
-
-	spin_lock(&dcache_lock);
-	next = parent->d_subdirs.next;
-	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_u.d_child);
-		dentry->d_fsdata = NULL;
-		smb_age_dentry(server, dentry);
-		next = next->next;
-	}
-	spin_unlock(&dcache_lock);
-}
-
-/*
- * dget, but require that fpos and parent matches what the dentry contains.
- * dentry is not known to be a valid pointer at entry.
- */
-struct dentry *
-smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
-{
-	struct dentry *dent = dentry;
-	struct list_head *next;
-
-	if (d_validate(dent, parent)) {
-		if (dent->d_name.len <= SMB_MAXNAMELEN &&
-		    (unsigned long)dent->d_fsdata == fpos) {
-			if (!dent->d_inode) {
-				dput(dent);
-				dent = NULL;
-			}
-			return dent;
-		}
-		dput(dent);
-	}
-
-	/* If a pointer is invalid, we search the dentry. */
-	spin_lock(&dcache_lock);
-	next = parent->d_subdirs.next;
-	while (next != &parent->d_subdirs) {
-		dent = list_entry(next, struct dentry, d_u.d_child);
-		if ((unsigned long)dent->d_fsdata == fpos) {
-			if (dent->d_inode)
-				dget_locked(dent);
-			else
-				dent = NULL;
-			goto out_unlock;
-		}
-		next = next->next;
-	}
-	dent = NULL;
-out_unlock:
-	spin_unlock(&dcache_lock);
-	return dent;
-}
-
-
-/*
- * Create dentry/inode for this file and add it to the dircache.
- */
-int
-smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	       struct smb_cache_control *ctrl, struct qstr *qname,
-	       struct smb_fattr *entry)
-{
-	struct dentry *newdent, *dentry = filp->f_path.dentry;
-	struct inode *newino, *inode = dentry->d_inode;
-	struct smb_cache_control ctl = *ctrl;
-	int valid = 0;
-	int hashed = 0;
-	ino_t ino = 0;
-
-	qname->hash = full_name_hash(qname->name, qname->len);
-
-	if (dentry->d_op && dentry->d_op->d_hash)
-		if (dentry->d_op->d_hash(dentry, qname) != 0)
-			goto end_advance;
-
-	newdent = d_lookup(dentry, qname);
-
-	if (!newdent) {
-		newdent = d_alloc(dentry, qname);
-		if (!newdent)
-			goto end_advance;
-	} else {
-		hashed = 1;
-		memcpy((char *) newdent->d_name.name, qname->name,
-		       newdent->d_name.len);
-	}
-
-	if (!newdent->d_inode) {
-		smb_renew_times(newdent);
-		entry->f_ino = iunique(inode->i_sb, 2);
-		newino = smb_iget(inode->i_sb, entry);
-		if (newino) {
-			smb_new_dentry(newdent);
-			d_instantiate(newdent, newino);
-			if (!hashed)
-				d_rehash(newdent);
-		}
-	} else
-		smb_set_inode_attr(newdent->d_inode, entry);
-
-	if (newdent->d_inode) {
-		ino = newdent->d_inode->i_ino;
-		newdent->d_fsdata = (void *) ctl.fpos;
-		smb_new_dentry(newdent);
-	}
-
-	if (ctl.idx >= SMB_DIRCACHE_SIZE) {
-		if (ctl.page) {
-			kunmap(ctl.page);
-			SetPageUptodate(ctl.page);
-			unlock_page(ctl.page);
-			page_cache_release(ctl.page);
-		}
-		ctl.cache = NULL;
-		ctl.idx -= SMB_DIRCACHE_SIZE;
-		ctl.ofs += 1;
-		ctl.page = grab_cache_page(&inode->i_data, ctl.ofs);
-		if (ctl.page)
-			ctl.cache = kmap(ctl.page);
-	}
-	if (ctl.cache) {
-		ctl.cache->dentry[ctl.idx] = newdent;
-		valid = 1;
-	}
-	dput(newdent);
-
-end_advance:
-	if (!valid)
-		ctl.valid = 0;
-	if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
-		if (!ino)
-			ino = find_inode_number(dentry, qname);
-		if (!ino)
-			ino = iunique(inode->i_sb, 2);
-		ctl.filled = filldir(dirent, qname->name, qname->len,
-				     filp->f_pos, ino, DT_UNKNOWN);
-		if (!ctl.filled)
-			filp->f_pos += 1;
-	}
-	ctl.fpos += 1;
-	ctl.idx += 1;
-	*ctrl = ctl;
-	return (ctl.valid || !ctl.filled);
-}
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
deleted file mode 100644
index 00a70cab1f36..000000000000
--- a/fs/smbfs/dir.c
+++ /dev/null
@@ -1,702 +0,0 @@
1/*
2 * dir.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/time.h>
11#include <linux/errno.h>
12#include <linux/kernel.h>
13#include <linux/smp_lock.h>
14#include <linux/ctype.h>
15#include <linux/net.h>
16#include <linux/sched.h>
17
18#include <linux/smb_fs.h>
19#include <linux/smb_mount.h>
20#include <linux/smbno.h>
21
22#include "smb_debug.h"
23#include "proto.h"
24
25static int smb_readdir(struct file *, void *, filldir_t);
26static int smb_dir_open(struct inode *, struct file *);
27
28static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *);
29static int smb_create(struct inode *, struct dentry *, int, struct nameidata *);
30static int smb_mkdir(struct inode *, struct dentry *, int);
31static int smb_rmdir(struct inode *, struct dentry *);
32static int smb_unlink(struct inode *, struct dentry *);
33static int smb_rename(struct inode *, struct dentry *,
34 struct inode *, struct dentry *);
35static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
36static int smb_link(struct dentry *, struct inode *, struct dentry *);
37
38const struct file_operations smb_dir_operations =
39{
40 .llseek = generic_file_llseek,
41 .read = generic_read_dir,
42 .readdir = smb_readdir,
43 .unlocked_ioctl = smb_ioctl,
44 .open = smb_dir_open,
45};
46
47const struct inode_operations smb_dir_inode_operations =
48{
49 .create = smb_create,
50 .lookup = smb_lookup,
51 .unlink = smb_unlink,
52 .mkdir = smb_mkdir,
53 .rmdir = smb_rmdir,
54 .rename = smb_rename,
55 .getattr = smb_getattr,
56 .setattr = smb_notify_change,
57};
58
59const struct inode_operations smb_dir_inode_operations_unix =
60{
61 .create = smb_create,
62 .lookup = smb_lookup,
63 .unlink = smb_unlink,
64 .mkdir = smb_mkdir,
65 .rmdir = smb_rmdir,
66 .rename = smb_rename,
67 .getattr = smb_getattr,
68 .setattr = smb_notify_change,
69 .symlink = smb_symlink,
70 .mknod = smb_make_node,
71 .link = smb_link,
72};
73
74/*
75 * Read a directory, using filldir to fill the dirent memory.
76 * smb_proc_readdir does the actual reading from the smb server.
77 *
78 * The cache code is almost directly taken from ncpfs
79 */
80static int
81smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
82{
83 struct dentry *dentry = filp->f_path.dentry;
84 struct inode *dir = dentry->d_inode;
85 struct smb_sb_info *server = server_from_dentry(dentry);
86 union smb_dir_cache *cache = NULL;
87 struct smb_cache_control ctl;
88 struct page *page = NULL;
89 int result;
90
91 ctl.page = NULL;
92 ctl.cache = NULL;
93
94 VERBOSE("reading %s/%s, f_pos=%d\n",
95 DENTRY_PATH(dentry), (int) filp->f_pos);
96
97 result = 0;
98
99 lock_kernel();
100
101 switch ((unsigned int) filp->f_pos) {
102 case 0:
103 if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
104 goto out;
105 filp->f_pos = 1;
106 /* fallthrough */
107 case 1:
108 if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
109 goto out;
110 filp->f_pos = 2;
111 }
112
113 /*
114 * Make sure our inode is up-to-date.
115 */
116 result = smb_revalidate_inode(dentry);
117 if (result)
118 goto out;
119
120
121 page = grab_cache_page(&dir->i_data, 0);
122 if (!page)
123 goto read_really;
124
125 ctl.cache = cache = kmap(page);
126 ctl.head = cache->head;
127
128 if (!PageUptodate(page) || !ctl.head.eof) {
129 VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
130 DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof);
131 goto init_cache;
132 }
133
134 if (filp->f_pos == 2) {
135 if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
136 goto init_cache;
137
138 /*
139 * N.B. ncpfs checks mtime of dentry too here, we don't.
140 * 1. common smb servers do not update mtime on dir changes
141 * 2. it requires an extra smb request
142 * (revalidate has the same timeout as ctl.head.time)
143 *
144 * Instead smbfs invalidates its own cache on local changes
145 * and remote changes are not seen until timeout.
146 */
147 }
148
149 if (filp->f_pos > ctl.head.end)
150 goto finished;
151
152 ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
153 ctl.ofs = ctl.fpos / SMB_DIRCACHE_SIZE;
154 ctl.idx = ctl.fpos % SMB_DIRCACHE_SIZE;
155
156 for (;;) {
157 if (ctl.ofs != 0) {
158 ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
159 if (!ctl.page)
160 goto invalid_cache;
161 ctl.cache = kmap(ctl.page);
162 if (!PageUptodate(ctl.page))
163 goto invalid_cache;
164 }
165 while (ctl.idx < SMB_DIRCACHE_SIZE) {
166 struct dentry *dent;
167 int res;
168
169 dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
170 dentry, filp->f_pos);
171 if (!dent)
172 goto invalid_cache;
173
174 res = filldir(dirent, dent->d_name.name,
175 dent->d_name.len, filp->f_pos,
176 dent->d_inode->i_ino, DT_UNKNOWN);
177 dput(dent);
178 if (res)
179 goto finished;
180 filp->f_pos += 1;
181 ctl.idx += 1;
182 if (filp->f_pos > ctl.head.end)
183 goto finished;
184 }
185 if (ctl.page) {
186 kunmap(ctl.page);
187 SetPageUptodate(ctl.page);
188 unlock_page(ctl.page);
189 page_cache_release(ctl.page);
190 ctl.page = NULL;
191 }
192 ctl.idx = 0;
193 ctl.ofs += 1;
194 }
195invalid_cache:
196 if (ctl.page) {
197 kunmap(ctl.page);
198 unlock_page(ctl.page);
199 page_cache_release(ctl.page);
200 ctl.page = NULL;
201 }
202 ctl.cache = cache;
203init_cache:
204 smb_invalidate_dircache_entries(dentry);
205 ctl.head.time = jiffies;
206 ctl.head.eof = 0;
207 ctl.fpos = 2;
208 ctl.ofs = 0;
209 ctl.idx = SMB_DIRCACHE_START;
210 ctl.filled = 0;
211 ctl.valid = 1;
212read_really:
213 result = server->ops->readdir(filp, dirent, filldir, &ctl);
214 if (result == -ERESTARTSYS && page)
215 ClearPageUptodate(page);
216 if (ctl.idx == -1)
217 goto invalid_cache; /* retry */
218 ctl.head.end = ctl.fpos - 1;
219 ctl.head.eof = ctl.valid;
220finished:
221 if (page) {
222 cache->head = ctl.head;
223 kunmap(page);
224 if (result != -ERESTARTSYS)
225 SetPageUptodate(page);
226 unlock_page(page);
227 page_cache_release(page);
228 }
229 if (ctl.page) {
230 kunmap(ctl.page);
231 SetPageUptodate(ctl.page);
232 unlock_page(ctl.page);
233 page_cache_release(ctl.page);
234 }
235out:
236 unlock_kernel();
237 return result;
238}
239
240static int
241smb_dir_open(struct inode *dir, struct file *file)
242{
243 struct dentry *dentry = file->f_path.dentry;
244 struct smb_sb_info *server;
245 int error = 0;
246
247 VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name,
248 file->f_path.dentry->d_name.name);
249
250 /*
251 * Directory timestamps in the core protocol aren't updated
252 * when a file is added, so we give them a very short TTL.
253 */
254 lock_kernel();
255 server = server_from_dentry(dentry);
256 if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) {
257 unsigned long age = jiffies - SMB_I(dir)->oldmtime;
258 if (age > 2*HZ)
259 smb_invalid_dir_cache(dir);
260 }
261
262 /*
263 * Note: in order to allow the smbmount process to open the
264 * mount point, we only revalidate if the connection is valid or
265 * if the process is trying to access something other than the root.
266 */
267 if (server->state == CONN_VALID || !IS_ROOT(dentry))
268 error = smb_revalidate_inode(dentry);
269 unlock_kernel();
270 return error;
271}
272
273/*
274 * Dentry operations routines
275 */
276static int smb_lookup_validate(struct dentry *, struct nameidata *);
277static int smb_hash_dentry(struct dentry *, struct qstr *);
278static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
279static int smb_delete_dentry(struct dentry *);
280
281static const struct dentry_operations smbfs_dentry_operations =
282{
283 .d_revalidate = smb_lookup_validate,
284 .d_hash = smb_hash_dentry,
285 .d_compare = smb_compare_dentry,
286 .d_delete = smb_delete_dentry,
287};
288
289static const struct dentry_operations smbfs_dentry_operations_case =
290{
291 .d_revalidate = smb_lookup_validate,
292 .d_delete = smb_delete_dentry,
293};
294
295
296/*
297 * This is the callback when the dcache has a lookup hit.
298 */
299static int
300smb_lookup_validate(struct dentry * dentry, struct nameidata *nd)
301{
302 struct smb_sb_info *server = server_from_dentry(dentry);
303 struct inode * inode = dentry->d_inode;
304 unsigned long age = jiffies - dentry->d_time;
305 int valid;
306
307 /*
308 * The default validation is based on dentry age:
309 * we believe in dentries for a few seconds. (But each
310 * successful server lookup renews the timestamp.)
311 */
312 valid = (age <= SMB_MAX_AGE(server));
313#ifdef SMBFS_DEBUG_VERBOSE
314 if (!valid)
315 VERBOSE("%s/%s not valid, age=%lu\n",
316 DENTRY_PATH(dentry), age);
317#endif
318
319 if (inode) {
320 lock_kernel();
321 if (is_bad_inode(inode)) {
322 PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry));
323 valid = 0;
324 } else if (!valid)
325 valid = (smb_revalidate_inode(dentry) == 0);
326 unlock_kernel();
327 } else {
328 /*
329 * What should we do for negative dentries?
330 */
331 }
332 return valid;
333}
334
335static int
336smb_hash_dentry(struct dentry *dir, struct qstr *this)
337{
338 unsigned long hash;
339 int i;
340
341 hash = init_name_hash();
342 for (i=0; i < this->len ; i++)
343 hash = partial_name_hash(tolower(this->name[i]), hash);
344 this->hash = end_name_hash(hash);
345
346 return 0;
347}
348
349static int
350smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b)
351{
352 int i, result = 1;
353
354 if (a->len != b->len)
355 goto out;
356 for (i=0; i < a->len; i++) {
357 if (tolower(a->name[i]) != tolower(b->name[i]))
358 goto out;
359 }
360 result = 0;
361out:
362 return result;
363}
364
365/*
366 * This is the callback from dput() when d_count is going to 0.
367 * We use this to unhash dentries with bad inodes.
368 */
369static int
370smb_delete_dentry(struct dentry * dentry)
371{
372 if (dentry->d_inode) {
373 if (is_bad_inode(dentry->d_inode)) {
374 PARANOIA("bad inode, unhashing %s/%s\n",
375 DENTRY_PATH(dentry));
376 return 1;
377 }
378 } else {
379 /* N.B. Unhash negative dentries? */
380 }
381 return 0;
382}
383
384/*
385 * Initialize a new dentry
386 */
387void
388smb_new_dentry(struct dentry *dentry)
389{
390 struct smb_sb_info *server = server_from_dentry(dentry);
391
392 if (server->mnt->flags & SMB_MOUNT_CASE)
393 dentry->d_op = &smbfs_dentry_operations_case;
394 else
395 dentry->d_op = &smbfs_dentry_operations;
396 dentry->d_time = jiffies;
397}
398
399
400/*
401 * Whenever a lookup succeeds, we know the parent directories
402 * are all valid, so we want to update the dentry timestamps.
403 * N.B. Move this to dcache?
404 */
405void
406smb_renew_times(struct dentry * dentry)
407{
408 dget(dentry);
409 spin_lock(&dentry->d_lock);
410 for (;;) {
411 struct dentry *parent;
412
413 dentry->d_time = jiffies;
414 if (IS_ROOT(dentry))
415 break;
416 parent = dentry->d_parent;
417 dget(parent);
418 spin_unlock(&dentry->d_lock);
419 dput(dentry);
420 dentry = parent;
421 spin_lock(&dentry->d_lock);
422 }
423 spin_unlock(&dentry->d_lock);
424 dput(dentry);
425}
426
427static struct dentry *
428smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
429{
430 struct smb_fattr finfo;
431 struct inode *inode;
432 int error;
433 struct smb_sb_info *server;
434
435 error = -ENAMETOOLONG;
436 if (dentry->d_name.len > SMB_MAXNAMELEN)
437 goto out;
438
439 /* Do not allow lookup of names with backslashes in */
440 error = -EINVAL;
441 if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
442 goto out;
443
444 lock_kernel();
445 error = smb_proc_getattr(dentry, &finfo);
446#ifdef SMBFS_PARANOIA
447 if (error && error != -ENOENT)
448 PARANOIA("find %s/%s failed, error=%d\n",
449 DENTRY_PATH(dentry), error);
450#endif
451
452 inode = NULL;
453 if (error == -ENOENT)
454 goto add_entry;
455 if (!error) {
456 error = -EACCES;
457 finfo.f_ino = iunique(dentry->d_sb, 2);
458 inode = smb_iget(dir->i_sb, &finfo);
459 if (inode) {
460 add_entry:
461 server = server_from_dentry(dentry);
462 if (server->mnt->flags & SMB_MOUNT_CASE)
463 dentry->d_op = &smbfs_dentry_operations_case;
464 else
465 dentry->d_op = &smbfs_dentry_operations;
466
467 d_add(dentry, inode);
468 smb_renew_times(dentry);
469 error = 0;
470 }
471 }
472 unlock_kernel();
473out:
474 return ERR_PTR(error);
475}
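/*
 * Editorial note (not part of the original patch): the add_entry label
 * above is reached two ways.  On success, d_add() attaches the freshly
 * built inode; on -ENOENT the code jumps there with inode still NULL,
 * so d_add(dentry, NULL) caches a negative dentry and error is reset
 * to 0.  Later lookups of the missing name can then fail fast from the
 * dcache, without another round trip to the server, until the dentry
 * ages past SMB_MAX_AGE() and is revalidated.
 */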
476
477/*
478 * This code is common to all routines creating a new inode.
479 */
480static int
481smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id)
482{
483 struct smb_sb_info *server = server_from_dentry(dentry);
484 struct inode *inode;
485 int error;
486 struct smb_fattr fattr;
487
488 VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid);
489
490 error = smb_proc_getattr(dentry, &fattr);
491 if (error)
492 goto out_close;
493
494 smb_renew_times(dentry);
495 fattr.f_ino = iunique(dentry->d_sb, 2);
496 inode = smb_iget(dentry->d_sb, &fattr);
497 if (!inode)
498 goto out_no_inode;
499
500 if (have_id) {
501 struct smb_inode_info *ei = SMB_I(inode);
502 ei->fileid = fileid;
503 ei->access = SMB_O_RDWR;
504 ei->open = server->generation;
505 }
506 d_instantiate(dentry, inode);
507out:
508 return error;
509
510out_no_inode:
511 error = -EACCES;
512out_close:
513 if (have_id) {
514 PARANOIA("%s/%s failed, error=%d, closing %u\n",
515 DENTRY_PATH(dentry), error, fileid);
516 smb_close_fileid(dentry, fileid);
517 }
518 goto out;
519}
520
521/* N.B. How should the mode argument be used? */
522static int
523smb_create(struct inode *dir, struct dentry *dentry, int mode,
524 struct nameidata *nd)
525{
526 struct smb_sb_info *server = server_from_dentry(dentry);
527 __u16 fileid;
528 int error;
529 struct iattr attr;
530
531 VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode);
532
533 lock_kernel();
534 smb_invalid_dir_cache(dir);
535 error = smb_proc_create(dentry, 0, get_seconds(), &fileid);
536 if (!error) {
537 if (server->opt.capabilities & SMB_CAP_UNIX) {
538 /* Set attributes for new file */
539 attr.ia_valid = ATTR_MODE;
540 attr.ia_mode = mode;
541 error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
542 }
543 error = smb_instantiate(dentry, fileid, 1);
544 } else {
545 PARANOIA("%s/%s failed, error=%d\n",
546 DENTRY_PATH(dentry), error);
547 }
548 unlock_kernel();
549 return error;
550}
551
552/* N.B. How should the mode argument be used? */
553static int
554smb_mkdir(struct inode *dir, struct dentry *dentry, int mode)
555{
556 struct smb_sb_info *server = server_from_dentry(dentry);
557 int error;
558 struct iattr attr;
559
560 lock_kernel();
561 smb_invalid_dir_cache(dir);
562 error = smb_proc_mkdir(dentry);
563 if (!error) {
564 if (server->opt.capabilities & SMB_CAP_UNIX) {
565 /* Set attributes for new directory */
566 attr.ia_valid = ATTR_MODE;
567 attr.ia_mode = mode;
568 error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
569 }
570 error = smb_instantiate(dentry, 0, 0);
571 }
572 unlock_kernel();
573 return error;
574}
575
576static int
577smb_rmdir(struct inode *dir, struct dentry *dentry)
578{
579 struct inode *inode = dentry->d_inode;
580 int error;
581
582 /*
583 * Close the directory if it's open.
584 */
585 lock_kernel();
586 smb_close(inode);
587
588 /*
 589 * Check that nobody else is using the directory.
590 */
591 error = -EBUSY;
592 if (!d_unhashed(dentry))
593 goto out;
594
595 smb_invalid_dir_cache(dir);
596 error = smb_proc_rmdir(dentry);
597
598out:
599 unlock_kernel();
600 return error;
601}
602
603static int
604smb_unlink(struct inode *dir, struct dentry *dentry)
605{
606 int error;
607
608 /*
609 * Close the file if it's open.
610 */
611 lock_kernel();
612 smb_close(dentry->d_inode);
613
614 smb_invalid_dir_cache(dir);
615 error = smb_proc_unlink(dentry);
616 if (!error)
617 smb_renew_times(dentry);
618 unlock_kernel();
619 return error;
620}
621
622static int
623smb_rename(struct inode *old_dir, struct dentry *old_dentry,
624 struct inode *new_dir, struct dentry *new_dentry)
625{
626 int error;
627
628 /*
629 * Close any open files, and check whether to delete the
630 * target before attempting the rename.
631 */
632 lock_kernel();
633 if (old_dentry->d_inode)
634 smb_close(old_dentry->d_inode);
635 if (new_dentry->d_inode) {
636 smb_close(new_dentry->d_inode);
637 error = smb_proc_unlink(new_dentry);
638 if (error) {
639 VERBOSE("unlink %s/%s, error=%d\n",
640 DENTRY_PATH(new_dentry), error);
641 goto out;
642 }
643 /* FIXME */
644 d_delete(new_dentry);
645 }
646
647 smb_invalid_dir_cache(old_dir);
648 smb_invalid_dir_cache(new_dir);
649 error = smb_proc_mv(old_dentry, new_dentry);
650 if (!error) {
651 smb_renew_times(old_dentry);
652 smb_renew_times(new_dentry);
653 }
654out:
655 unlock_kernel();
656 return error;
657}
658
659/*
660 * FIXME: samba servers won't let you create device nodes unless uid/gid
661 * matches the connection credentials (and we don't know which those are ...)
662 */
663static int
664smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
665{
666 int error;
667 struct iattr attr;
668
669 attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
670 attr.ia_mode = mode;
671 current_euid_egid(&attr.ia_uid, &attr.ia_gid);
672
673 if (!new_valid_dev(dev))
674 return -EINVAL;
675
676 smb_invalid_dir_cache(dir);
677 error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev));
678 if (!error) {
679 error = smb_instantiate(dentry, 0, 0);
680 }
681 return error;
682}
683
684/*
685 * dentry = existing file
686 * new_dentry = new file
687 */
688static int
689smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry)
690{
691 int error;
692
693 DEBUG1("smb_link old=%s/%s new=%s/%s\n",
694 DENTRY_PATH(dentry), DENTRY_PATH(new_dentry));
695 smb_invalid_dir_cache(dir);
696 error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry);
697 if (!error) {
698 smb_renew_times(dentry);
699 error = smb_instantiate(new_dentry, 0, 0);
700 }
701 return error;
702}
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
deleted file mode 100644
index 8e187a0f94bb..000000000000
--- a/fs/smbfs/file.c
+++ /dev/null
@@ -1,454 +0,0 @@
1/*
2 * file.c
3 *
4 * Copyright (C) 1995, 1996, 1997 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/time.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/fcntl.h>
14#include <linux/stat.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/net.h>
19#include <linux/aio.h>
20
21#include <asm/uaccess.h>
22#include <asm/system.h>
23
24#include <linux/smbno.h>
25#include <linux/smb_fs.h>
26
27#include "smb_debug.h"
28#include "proto.h"
29
30static int
31smb_fsync(struct file *file, int datasync)
32{
33 struct dentry *dentry = file->f_path.dentry;
34 struct smb_sb_info *server = server_from_dentry(dentry);
35 int result;
36
37 VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry));
38
39 /*
40 * The VFS will writepage() all dirty pages for us, but we
41 * should send a SMBflush to the server, letting it know that
42 * we want things synchronized with actual storage.
43 *
44 * Note: this function requires all pages to have been written already
45 * (should be ok with writepage_sync)
46 */
47 result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid);
48 return result;
49}
50
51/*
52 * Read a page synchronously.
53 */
54static int
55smb_readpage_sync(struct dentry *dentry, struct page *page)
56{
57 char *buffer = kmap(page);
58 loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
59 struct smb_sb_info *server = server_from_dentry(dentry);
60 unsigned int rsize = smb_get_rsize(server);
61 int count = PAGE_SIZE;
62 int result;
63
64 VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n",
65 DENTRY_PATH(dentry), count, offset, rsize);
66
67 result = smb_open(dentry, SMB_O_RDONLY);
68 if (result < 0)
69 goto io_error;
70
71 do {
72 if (count < rsize)
73 rsize = count;
74
75 result = server->ops->read(dentry->d_inode,offset,rsize,buffer);
76 if (result < 0)
77 goto io_error;
78
79 count -= result;
80 offset += result;
81 buffer += result;
82 dentry->d_inode->i_atime =
83 current_fs_time(dentry->d_inode->i_sb);
84 if (result < rsize)
85 break;
86 } while (count);
87
88 memset(buffer, 0, count);
89 flush_dcache_page(page);
90 SetPageUptodate(page);
91 result = 0;
92
93io_error:
94 kunmap(page);
95 unlock_page(page);
96 return result;
97}
98
99/*
100 * We are called with the page locked and we unlock it when done.
101 */
102static int
103smb_readpage(struct file *file, struct page *page)
104{
105 int error;
106 struct dentry *dentry = file->f_path.dentry;
107
108 page_cache_get(page);
109 error = smb_readpage_sync(dentry, page);
110 page_cache_release(page);
111 return error;
112}
113
114/*
115 * Write a page synchronously.
116 * Offset is the data offset within the page.
117 */
118static int
119smb_writepage_sync(struct inode *inode, struct page *page,
120 unsigned long pageoffset, unsigned int count)
121{
122 loff_t offset;
123 char *buffer = kmap(page) + pageoffset;
124 struct smb_sb_info *server = server_from_inode(inode);
125 unsigned int wsize = smb_get_wsize(server);
126 int ret = 0;
127
128 offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset;
129 VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n",
130 inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize);
131
132 do {
133 int write_ret;
134
135 if (count < wsize)
136 wsize = count;
137
138 write_ret = server->ops->write(inode, offset, wsize, buffer);
139 if (write_ret < 0) {
140 PARANOIA("failed write, wsize=%d, write_ret=%d\n",
141 wsize, write_ret);
142 ret = write_ret;
143 break;
144 }
145 /* N.B. what if result < wsize?? */
146#ifdef SMBFS_PARANOIA
147 if (write_ret < wsize)
148 PARANOIA("short write, wsize=%d, write_ret=%d\n",
149 wsize, write_ret);
150#endif
151 buffer += wsize;
152 offset += wsize;
153 count -= wsize;
154 /*
155 * Update the inode now rather than waiting for a refresh.
156 */
157 inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb);
158 SMB_I(inode)->flags |= SMB_F_LOCALWRITE;
159 if (offset > inode->i_size)
160 inode->i_size = offset;
161 } while (count);
162
163 kunmap(page);
164 return ret;
165}
166
167/*
168 * Write a page to the server. This will be used for NFS swapping only
169 * (for now), and we currently do this synchronously only.
170 *
171 * We are called with the page locked and we unlock it when done.
172 */
173static int
174smb_writepage(struct page *page, struct writeback_control *wbc)
175{
176 struct address_space *mapping = page->mapping;
177 struct inode *inode;
178 unsigned long end_index;
179 unsigned offset = PAGE_CACHE_SIZE;
180 int err;
181
182 BUG_ON(!mapping);
183 inode = mapping->host;
184 BUG_ON(!inode);
185
186 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
187
188 /* easy case */
189 if (page->index < end_index)
190 goto do_it;
191 /* things got complicated... */
192 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
193 /* OK, are we completely out? */
194 if (page->index >= end_index+1 || !offset)
195 return 0; /* truncated - don't care */
196do_it:
197 page_cache_get(page);
198 err = smb_writepage_sync(inode, page, 0, offset);
199 SetPageUptodate(page);
200 unlock_page(page);
201 page_cache_release(page);
202 return err;
203}
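/*
 * Editorial note (not part of the original patch): a worked example of
 * the end-of-file handling above, assuming PAGE_CACHE_SIZE is 4096 and
 * i_size is 5000 (values chosen for illustration):
 *
 *   end_index = 5000 >> 12 = 1
 *   page 0:  0 < 1, the "easy case", write the full 4096 bytes
 *   page 1:  offset = 5000 & 4095 = 904, write only those 904 bytes
 *   page 2:  2 >= 1 + 1 and past EOF, return 0 (truncated, don't care)
 */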
204
205static int
206smb_updatepage(struct file *file, struct page *page, unsigned long offset,
207 unsigned int count)
208{
209 struct dentry *dentry = file->f_path.dentry;
210
211 DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
212 ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
213
214 return smb_writepage_sync(dentry->d_inode, page, offset, count);
215}
216
217static ssize_t
218smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
219 unsigned long nr_segs, loff_t pos)
220{
221 struct file * file = iocb->ki_filp;
222 struct dentry * dentry = file->f_path.dentry;
223 ssize_t status;
224
225 VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
226 (unsigned long) iocb->ki_left, (unsigned long) pos);
227
228 status = smb_revalidate_inode(dentry);
229 if (status) {
230 PARANOIA("%s/%s validation failed, error=%Zd\n",
231 DENTRY_PATH(dentry), status);
232 goto out;
233 }
234
235 VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
236 (long)dentry->d_inode->i_size,
237 dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
238
239 status = generic_file_aio_read(iocb, iov, nr_segs, pos);
240out:
241 return status;
242}
243
244static int
245smb_file_mmap(struct file * file, struct vm_area_struct * vma)
246{
247 struct dentry * dentry = file->f_path.dentry;
248 int status;
249
250 VERBOSE("file %s/%s, address %lu - %lu\n",
251 DENTRY_PATH(dentry), vma->vm_start, vma->vm_end);
252
253 status = smb_revalidate_inode(dentry);
254 if (status) {
255 PARANOIA("%s/%s validation failed, error=%d\n",
256 DENTRY_PATH(dentry), status);
257 goto out;
258 }
259 status = generic_file_mmap(file, vma);
260out:
261 return status;
262}
263
264static ssize_t
265smb_file_splice_read(struct file *file, loff_t *ppos,
266 struct pipe_inode_info *pipe, size_t count,
267 unsigned int flags)
268{
269 struct dentry *dentry = file->f_path.dentry;
270 ssize_t status;
271
272 VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
273 DENTRY_PATH(dentry), *ppos, count);
274
275 status = smb_revalidate_inode(dentry);
276 if (status) {
277 PARANOIA("%s/%s validation failed, error=%Zd\n",
278 DENTRY_PATH(dentry), status);
279 goto out;
280 }
281 status = generic_file_splice_read(file, ppos, pipe, count, flags);
282out:
283 return status;
284}
285
286/*
287 * This does the "real" work of the write. The generic routine has
 288 * allocated the page, locked it, and done all the page alignment
 289 * calculations. Now we should just copy the data from user
 290 * space and write it back to the real medium.
 291 *
 292 * If the writer ends up delaying the write, it needs to
 293 * increment the page use counts until it is done with the page.
294 */
295static int smb_write_begin(struct file *file, struct address_space *mapping,
296 loff_t pos, unsigned len, unsigned flags,
297 struct page **pagep, void **fsdata)
298{
299 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
300 *pagep = grab_cache_page_write_begin(mapping, index, flags);
301 if (!*pagep)
302 return -ENOMEM;
303 return 0;
304}
305
306static int smb_write_end(struct file *file, struct address_space *mapping,
307 loff_t pos, unsigned len, unsigned copied,
308 struct page *page, void *fsdata)
309{
310 int status;
311 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
312
313 lock_kernel();
314 status = smb_updatepage(file, page, offset, copied);
315 unlock_kernel();
316
317 if (!status) {
318 if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
319 SetPageUptodate(page);
320 status = copied;
321 }
322
323 unlock_page(page);
324 page_cache_release(page);
325
326 return status;
327}
328
329const struct address_space_operations smb_file_aops = {
330 .readpage = smb_readpage,
331 .writepage = smb_writepage,
332 .write_begin = smb_write_begin,
333 .write_end = smb_write_end,
334};
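/*
 * Editorial note (not part of the original patch): with these aops the
 * buffered write path is driven entirely by generic code.
 * generic_file_aio_write() calls smb_write_begin() to pin and lock the
 * page, copies the user data into it, then calls smb_write_end(),
 * which pushes the bytes to the server synchronously via
 * smb_updatepage() and smb_writepage_sync().  There is no delayed
 * writeback on this path; every write_end is a wire round trip.
 */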
335
336/*
337 * Write to a file (through the page cache).
338 */
339static ssize_t
340smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
341 unsigned long nr_segs, loff_t pos)
342{
343 struct file * file = iocb->ki_filp;
344 struct dentry * dentry = file->f_path.dentry;
345 ssize_t result;
346
347 VERBOSE("file %s/%s, count=%lu@%lu\n",
348 DENTRY_PATH(dentry),
349 (unsigned long) iocb->ki_left, (unsigned long) pos);
350
351 result = smb_revalidate_inode(dentry);
352 if (result) {
353 PARANOIA("%s/%s validation failed, error=%Zd\n",
354 DENTRY_PATH(dentry), result);
355 goto out;
356 }
357
358 result = smb_open(dentry, SMB_O_WRONLY);
359 if (result)
360 goto out;
361
362 if (iocb->ki_left > 0) {
363 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
364 VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
365 (long) file->f_pos, (long) dentry->d_inode->i_size,
366 dentry->d_inode->i_mtime.tv_sec,
367 dentry->d_inode->i_atime.tv_sec);
368 }
369out:
370 return result;
371}
372
373static int
374smb_file_open(struct inode *inode, struct file * file)
375{
376 int result;
377 struct dentry *dentry = file->f_path.dentry;
378 int smb_mode = (file->f_mode & O_ACCMODE) - 1;
379
380 lock_kernel();
381 result = smb_open(dentry, smb_mode);
382 if (result)
383 goto out;
384 SMB_I(inode)->openers++;
385out:
386 unlock_kernel();
387 return result;
388}
389
390static int
391smb_file_release(struct inode *inode, struct file * file)
392{
393 lock_kernel();
394 if (!--SMB_I(inode)->openers) {
395 /* We must flush any dirty pages now as we won't be able to
396 write anything after close. mmap can trigger this.
397 "openers" should perhaps include mmap'ers ... */
398 filemap_write_and_wait(inode->i_mapping);
399 smb_close(inode);
400 }
401 unlock_kernel();
402 return 0;
403}
404
405/*
406 * Check whether the required access is compatible with
407 * an inode's permission. SMB doesn't recognize superuser
408 * privileges, so we need our own check for this.
409 */
410static int
411smb_file_permission(struct inode *inode, int mask)
412{
413 int mode = inode->i_mode;
414 int error = 0;
415
416 VERBOSE("mode=%x, mask=%x\n", mode, mask);
417
418 /* Look at user permissions */
419 mode >>= 6;
420 if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
421 error = -EACCES;
422 return error;
423}
424
425static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
426{
427 loff_t ret;
428 lock_kernel();
429 ret = generic_file_llseek_unlocked(file, offset, origin);
430 unlock_kernel();
431 return ret;
432}
433
434const struct file_operations smb_file_operations =
435{
436 .llseek = smb_remote_llseek,
437 .read = do_sync_read,
438 .aio_read = smb_file_aio_read,
439 .write = do_sync_write,
440 .aio_write = smb_file_aio_write,
441 .unlocked_ioctl = smb_ioctl,
442 .mmap = smb_file_mmap,
443 .open = smb_file_open,
444 .release = smb_file_release,
445 .fsync = smb_fsync,
446 .splice_read = smb_file_splice_read,
447};
448
449const struct inode_operations smb_file_inode_operations =
450{
451 .permission = smb_file_permission,
452 .getattr = smb_getattr,
453 .setattr = smb_notify_change,
454};
diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c
deleted file mode 100644
index 7ae0f5273ab1..000000000000
--- a/fs/smbfs/getopt.c
+++ /dev/null
@@ -1,64 +0,0 @@
1/*
2 * getopt.c
3 */
4
5#include <linux/kernel.h>
6#include <linux/string.h>
7#include <linux/net.h>
8
9#include "getopt.h"
10
11/**
12 * smb_getopt - option parser
13 * @caller: name of the caller, for error messages
14 * @options: the options string
15 * @opts: an array of &struct option entries controlling parser operations
16 * @optopt: output; will contain the current option
17 * @optarg: output; will contain the value (if one exists)
18 * @flag: output; may be NULL; should point to a long for or'ing flags
19 * @value: output; may be NULL; will be overwritten with the integer value
20 * of the current argument.
21 *
 22 * Helper to parse options in the format used by mount ("a=b,c=d,e,f").
23 * Returns opts->val if a matching entry in the 'opts' array is found,
24 * 0 when no more tokens are found, -1 if an error is encountered.
25 */
26int smb_getopt(char *caller, char **options, struct option *opts,
27 char **optopt, char **optarg, unsigned long *flag,
28 unsigned long *value)
29{
30 char *token;
31 char *val;
32 int i;
33
34 do {
35 if ((token = strsep(options, ",")) == NULL)
36 return 0;
37 } while (*token == '\0');
38 *optopt = token;
39
40 *optarg = NULL;
41 if ((val = strchr (token, '=')) != NULL) {
42 *val++ = 0;
43 if (value)
44 *value = simple_strtoul(val, NULL, 0);
45 *optarg = val;
46 }
47
48 for (i = 0; opts[i].name != NULL; i++) {
49 if (!strcmp(opts[i].name, token)) {
50 if (!opts[i].flag && (!val || !*val)) {
51 printk("%s: the %s option requires an argument\n",
52 caller, token);
53 return -1;
54 }
55
56 if (flag && opts[i].flag)
57 *flag |= opts[i].flag;
58
59 return opts[i].val;
60 }
61 }
62 printk("%s: Unrecognized mount option %s\n", caller, token);
63 return -1;
64}
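/*
 * Editorial note (sketch, not part of the original patch): a hedged
 * usage example for smb_getopt().  The demo_opts table, flag bit and
 * input string are invented for illustration; struct option comes from
 * getopt.h below, and the calling convention (loop while the return
 * value is positive, flag options or'ed into *flag, "name=value"
 * options parsed into *optarg/*value) is the one implemented above and
 * relied on by parse_options() in inode.c.
 */
static struct option demo_opts[] = {
	{ "case", 0x0008, 1   },	/* flag option: no argument needed */
	{ "uid",  0,      'u' },	/* value option: requires "uid=N" */
	{ NULL,   0,      0   }
};

static int demo_parse(char *options)	/* e.g. "uid=500,case" */
{
	char *optopt, *optarg;
	unsigned long flags = 0, value = 0;
	unsigned long uid = 0;
	int c;

	while ((c = smb_getopt("demo", &options, demo_opts,
			       &optopt, &optarg, &flags, &value)) > 0) {
		if (c == 'u')
			uid = value;	/* 500 after "uid=500" */
	}
	/* c == 0: all tokens consumed; c == -1: error already logged. */
	/* flags now has 0x0008 set iff "case" appeared in the string. */
	return c;
}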
diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h
deleted file mode 100644
index 146219ac7c46..000000000000
--- a/fs/smbfs/getopt.h
+++ /dev/null
@@ -1,14 +0,0 @@
1#ifndef _LINUX_GETOPT_H
2#define _LINUX_GETOPT_H
3
4struct option {
5 const char *name;
6 unsigned long flag;
7 int val;
8};
9
10extern int smb_getopt(char *caller, char **options, struct option *opts,
11 char **optopt, char **optarg, unsigned long *flag,
12 unsigned long *value);
13
14#endif /* _LINUX_GETOPT_H */
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
deleted file mode 100644
index 450c91941988..000000000000
--- a/fs/smbfs/inode.c
+++ /dev/null
@@ -1,839 +0,0 @@
1/*
2 * inode.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/module.h>
11#include <linux/time.h>
12#include <linux/kernel.h>
13#include <linux/mm.h>
14#include <linux/string.h>
15#include <linux/stat.h>
16#include <linux/errno.h>
17#include <linux/slab.h>
18#include <linux/init.h>
19#include <linux/file.h>
20#include <linux/dcache.h>
21#include <linux/smp_lock.h>
22#include <linux/nls.h>
23#include <linux/seq_file.h>
24#include <linux/mount.h>
25#include <linux/net.h>
26#include <linux/vfs.h>
27#include <linux/highuid.h>
28#include <linux/sched.h>
29#include <linux/smb_fs.h>
30#include <linux/smbno.h>
31#include <linux/smb_mount.h>
32
33#include <asm/system.h>
34#include <asm/uaccess.h>
35
36#include "smb_debug.h"
37#include "getopt.h"
38#include "proto.h"
39
40/* Always pick a default string */
41#ifdef CONFIG_SMB_NLS_REMOTE
42#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE
43#else
44#define SMB_NLS_REMOTE ""
45#endif
46
47#define SMB_TTL_DEFAULT 1000
48
49static void smb_evict_inode(struct inode *);
50static void smb_put_super(struct super_block *);
51static int smb_statfs(struct dentry *, struct kstatfs *);
52static int smb_show_options(struct seq_file *, struct vfsmount *);
53
54static struct kmem_cache *smb_inode_cachep;
55
56static struct inode *smb_alloc_inode(struct super_block *sb)
57{
58 struct smb_inode_info *ei;
59 ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
60 if (!ei)
61 return NULL;
62 return &ei->vfs_inode;
63}
64
65static void smb_destroy_inode(struct inode *inode)
66{
67 kmem_cache_free(smb_inode_cachep, SMB_I(inode));
68}
69
70static void init_once(void *foo)
71{
72 struct smb_inode_info *ei = (struct smb_inode_info *) foo;
73
74 inode_init_once(&ei->vfs_inode);
75}
76
77static int init_inodecache(void)
78{
79 smb_inode_cachep = kmem_cache_create("smb_inode_cache",
80 sizeof(struct smb_inode_info),
81 0, (SLAB_RECLAIM_ACCOUNT|
82 SLAB_MEM_SPREAD),
83 init_once);
84 if (smb_inode_cachep == NULL)
85 return -ENOMEM;
86 return 0;
87}
88
89static void destroy_inodecache(void)
90{
91 kmem_cache_destroy(smb_inode_cachep);
92}
93
94static int smb_remount(struct super_block *sb, int *flags, char *data)
95{
96 *flags |= MS_NODIRATIME;
97 return 0;
98}
99
100static const struct super_operations smb_sops =
101{
102 .alloc_inode = smb_alloc_inode,
103 .destroy_inode = smb_destroy_inode,
104 .drop_inode = generic_delete_inode,
105 .evict_inode = smb_evict_inode,
106 .put_super = smb_put_super,
107 .statfs = smb_statfs,
108 .show_options = smb_show_options,
109 .remount_fs = smb_remount,
110};
111
112
113/* We are always generating a new inode here */
114struct inode *
115smb_iget(struct super_block *sb, struct smb_fattr *fattr)
116{
117 struct smb_sb_info *server = SMB_SB(sb);
118 struct inode *result;
119
120 DEBUG1("smb_iget: %p\n", fattr);
121
122 result = new_inode(sb);
123 if (!result)
124 return result;
125 result->i_ino = fattr->f_ino;
126 SMB_I(result)->open = 0;
127 SMB_I(result)->fileid = 0;
128 SMB_I(result)->access = 0;
129 SMB_I(result)->flags = 0;
130 SMB_I(result)->closed = 0;
131 SMB_I(result)->openers = 0;
132 smb_set_inode_attr(result, fattr);
133 if (S_ISREG(result->i_mode)) {
134 result->i_op = &smb_file_inode_operations;
135 result->i_fop = &smb_file_operations;
136 result->i_data.a_ops = &smb_file_aops;
137 } else if (S_ISDIR(result->i_mode)) {
138 if (server->opt.capabilities & SMB_CAP_UNIX)
139 result->i_op = &smb_dir_inode_operations_unix;
140 else
141 result->i_op = &smb_dir_inode_operations;
142 result->i_fop = &smb_dir_operations;
143 } else if (S_ISLNK(result->i_mode)) {
144 result->i_op = &smb_link_inode_operations;
145 } else {
146 init_special_inode(result, result->i_mode, fattr->f_rdev);
147 }
148 insert_inode_hash(result);
149 return result;
150}
151
152/*
153 * Copy the inode data to a smb_fattr structure.
154 */
155void
156smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
157{
158 memset(fattr, 0, sizeof(struct smb_fattr));
159 fattr->f_mode = inode->i_mode;
160 fattr->f_nlink = inode->i_nlink;
161 fattr->f_ino = inode->i_ino;
162 fattr->f_uid = inode->i_uid;
163 fattr->f_gid = inode->i_gid;
164 fattr->f_size = inode->i_size;
165 fattr->f_mtime = inode->i_mtime;
166 fattr->f_ctime = inode->i_ctime;
167 fattr->f_atime = inode->i_atime;
168 fattr->f_blocks = inode->i_blocks;
169
170 fattr->attr = SMB_I(inode)->attr;
171 /*
172 * Keep the attributes in sync with the inode permissions.
173 */
174 if (fattr->f_mode & S_IWUSR)
175 fattr->attr &= ~aRONLY;
176 else
177 fattr->attr |= aRONLY;
178}
179
180/*
181 * Update the inode, possibly causing it to invalidate its pages if mtime/size
182 * is different from last time.
183 */
184void
185smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
186{
187 struct smb_inode_info *ei = SMB_I(inode);
188
189 /*
190 * A size change should have a different mtime, or same mtime
191 * but different size.
192 */
193 time_t last_time = inode->i_mtime.tv_sec;
194 loff_t last_sz = inode->i_size;
195
196 inode->i_mode = fattr->f_mode;
197 inode->i_nlink = fattr->f_nlink;
198 inode->i_uid = fattr->f_uid;
199 inode->i_gid = fattr->f_gid;
200 inode->i_ctime = fattr->f_ctime;
201 inode->i_blocks = fattr->f_blocks;
202 inode->i_size = fattr->f_size;
203 inode->i_mtime = fattr->f_mtime;
204 inode->i_atime = fattr->f_atime;
205 ei->attr = fattr->attr;
206
207 /*
208 * Update the "last time refreshed" field for revalidation.
209 */
210 ei->oldmtime = jiffies;
211
212 if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) {
213 VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n",
214 inode->i_ino,
215 (long) last_time, (long) inode->i_mtime.tv_sec,
216 (long) last_sz, (long) inode->i_size);
217
218 if (!S_ISDIR(inode->i_mode))
219 invalidate_remote_inode(inode);
220 }
221}
222
223/*
224 * This is called if the connection has gone bad ...
225 * try to kill off all the current inodes.
226 */
227void
228smb_invalidate_inodes(struct smb_sb_info *server)
229{
230 VERBOSE("\n");
231 shrink_dcache_sb(SB_of(server));
232 invalidate_inodes(SB_of(server));
233}
234
235/*
236 * This is called to update the inode attributes after
237 * we've made changes to a file or directory.
238 */
239static int
240smb_refresh_inode(struct dentry *dentry)
241{
242 struct inode *inode = dentry->d_inode;
243 int error;
244 struct smb_fattr fattr;
245
246 error = smb_proc_getattr(dentry, &fattr);
247 if (!error) {
248 smb_renew_times(dentry);
249 /*
250 * Check whether the type part of the mode changed,
251 * and don't update the attributes if it did.
252 *
253 * And don't dick with the root inode
254 */
255 if (inode->i_ino == 2)
256 return error;
257 if (S_ISLNK(inode->i_mode))
258 return error; /* VFS will deal with it */
259
260 if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) {
261 smb_set_inode_attr(inode, &fattr);
262 } else {
263 /*
264 * Big trouble! The inode has become a new object,
265 * so any operations attempted on it are invalid.
266 *
267 * To limit damage, mark the inode as bad so that
268 * subsequent lookup validations will fail.
269 */
270 PARANOIA("%s/%s changed mode, %07o to %07o\n",
271 DENTRY_PATH(dentry),
272 inode->i_mode, fattr.f_mode);
273
274 fattr.f_mode = inode->i_mode; /* save mode */
275 make_bad_inode(inode);
276 inode->i_mode = fattr.f_mode; /* restore mode */
277 /*
278 * No need to worry about unhashing the dentry: the
279 * lookup validation will see that the inode is bad.
280 * But we do want to invalidate the caches ...
281 */
282 if (!S_ISDIR(inode->i_mode))
283 invalidate_remote_inode(inode);
284 else
285 smb_invalid_dir_cache(inode);
286 error = -EIO;
287 }
288 }
289 return error;
290}
291
292/*
293 * This is called when we want to check whether the inode
294 * has changed on the server. If it has changed, we must
295 * invalidate our local caches.
296 */
297int
298smb_revalidate_inode(struct dentry *dentry)
299{
300 struct smb_sb_info *s = server_from_dentry(dentry);
301 struct inode *inode = dentry->d_inode;
302 int error = 0;
303
304 DEBUG1("smb_revalidate_inode\n");
305 lock_kernel();
306
307 /*
308 * Check whether we've recently refreshed the inode.
309 */
310 if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) {
311 VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n",
312 inode->i_ino, jiffies, SMB_I(inode)->oldmtime);
313 goto out;
314 }
315
316 error = smb_refresh_inode(dentry);
317out:
318 unlock_kernel();
319 return error;
320}
321
322/*
323 * This routine is called when i_nlink == 0 and i_count goes to 0.
324 * All blocking cleanup operations need to go here to avoid races.
325 */
326static void
327smb_evict_inode(struct inode *ino)
328{
329 DEBUG1("ino=%ld\n", ino->i_ino);
330 truncate_inode_pages(&ino->i_data, 0);
331 end_writeback(ino);
332 lock_kernel();
333 if (smb_close(ino))
334 PARANOIA("could not close inode %ld\n", ino->i_ino);
335 unlock_kernel();
336}
337
338static struct option opts[] = {
339 { "version", 0, 'v' },
340 { "win95", SMB_MOUNT_WIN95, 1 },
341 { "oldattr", SMB_MOUNT_OLDATTR, 1 },
342 { "dirattr", SMB_MOUNT_DIRATTR, 1 },
343 { "case", SMB_MOUNT_CASE, 1 },
344 { "uid", 0, 'u' },
345 { "gid", 0, 'g' },
346 { "file_mode", 0, 'f' },
347 { "dir_mode", 0, 'd' },
348 { "iocharset", 0, 'i' },
349 { "codepage", 0, 'c' },
350 { "ttl", 0, 't' },
351 { NULL, 0, 0}
352};
353
354static int
355parse_options(struct smb_mount_data_kernel *mnt, char *options)
356{
357 int c;
358 unsigned long flags;
359 unsigned long value;
360 char *optarg;
361 char *optopt;
362
363 flags = 0;
364 while ( (c = smb_getopt("smbfs", &options, opts,
365 &optopt, &optarg, &flags, &value)) > 0) {
366
367 VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
368 switch (c) {
369 case 1:
370 /* got a "flag" option */
371 break;
372 case 'v':
373 if (value != SMB_MOUNT_VERSION) {
374 printk ("smbfs: Bad mount version %ld, expected %d\n",
375 value, SMB_MOUNT_VERSION);
376 return 0;
377 }
378 mnt->version = value;
379 break;
380 case 'u':
381 mnt->uid = value;
382 flags |= SMB_MOUNT_UID;
383 break;
384 case 'g':
385 mnt->gid = value;
386 flags |= SMB_MOUNT_GID;
387 break;
388 case 'f':
389 mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
390 flags |= SMB_MOUNT_FMODE;
391 break;
392 case 'd':
393 mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
394 flags |= SMB_MOUNT_DMODE;
395 break;
396 case 'i':
397 strlcpy(mnt->codepage.local_name, optarg,
398 SMB_NLS_MAXNAMELEN);
399 break;
400 case 'c':
401 strlcpy(mnt->codepage.remote_name, optarg,
402 SMB_NLS_MAXNAMELEN);
403 break;
404 case 't':
405 mnt->ttl = value;
406 break;
407 default:
408 printk ("smbfs: Unrecognized mount option %s\n",
409 optopt);
410 return -1;
411 }
412 }
413 mnt->flags = flags;
414 return c;
415}
416
417/*
418 * smb_show_options() is for displaying mount options in /proc/mounts.
419 * It tries to avoid showing settings that were not changed from their
420 * defaults.
421 */
422static int
423smb_show_options(struct seq_file *s, struct vfsmount *m)
424{
425 struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt;
426 int i;
427
428 for (i = 0; opts[i].name != NULL; i++)
429 if (mnt->flags & opts[i].flag)
430 seq_printf(s, ",%s", opts[i].name);
431
432 if (mnt->flags & SMB_MOUNT_UID)
433 seq_printf(s, ",uid=%d", mnt->uid);
434 if (mnt->flags & SMB_MOUNT_GID)
435 seq_printf(s, ",gid=%d", mnt->gid);
436 if (mnt->mounted_uid != 0)
437 seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
438
439 /*
440 * Defaults for file_mode and dir_mode are unknown to us; they
441 * depend on the current umask of the user doing the mount.
442 */
443 if (mnt->flags & SMB_MOUNT_FMODE)
444 seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
445 if (mnt->flags & SMB_MOUNT_DMODE)
446 seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
447
448 if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
449 seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
450 if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE))
451 seq_printf(s, ",codepage=%s", mnt->codepage.remote_name);
452
453 if (mnt->ttl != SMB_TTL_DEFAULT)
454 seq_printf(s, ",ttl=%d", mnt->ttl);
455
456 return 0;
457}
458
459static void
460smb_unload_nls(struct smb_sb_info *server)
461{
462 unload_nls(server->remote_nls);
463 unload_nls(server->local_nls);
464}
465
466static void
467smb_put_super(struct super_block *sb)
468{
469 struct smb_sb_info *server = SMB_SB(sb);
470
471 lock_kernel();
472
473 smb_lock_server(server);
474 server->state = CONN_INVALID;
475 smbiod_unregister_server(server);
476
477 smb_close_socket(server);
478
479 if (server->conn_pid)
480 kill_pid(server->conn_pid, SIGTERM, 1);
481
482 bdi_destroy(&server->bdi);
483 kfree(server->ops);
484 smb_unload_nls(server);
485 sb->s_fs_info = NULL;
486 smb_unlock_server(server);
487 put_pid(server->conn_pid);
488 kfree(server);
489
490 unlock_kernel();
491}
492
493static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
494{
495 struct smb_sb_info *server;
496 struct smb_mount_data_kernel *mnt;
497 struct smb_mount_data *oldmnt;
498 struct inode *root_inode;
499 struct smb_fattr root;
500 int ver;
501 void *mem;
502 static int warn_count;
503
504 if (warn_count < 5) {
505 warn_count++;
506 printk(KERN_EMERG "smbfs is deprecated and will be removed"
507 " from the 2.6.27 kernel. Please migrate to cifs\n");
508 }
509
510 if (!raw_data)
511 goto out_no_data;
512
513 oldmnt = (struct smb_mount_data *) raw_data;
514 ver = oldmnt->version;
515 if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII)
516 goto out_wrong_data;
517
518 sb->s_flags |= MS_NODIRATIME;
519 sb->s_blocksize = 1024; /* Eh... Is this correct? */
520 sb->s_blocksize_bits = 10;
521 sb->s_magic = SMB_SUPER_MAGIC;
522 sb->s_op = &smb_sops;
523 sb->s_time_gran = 100;
524
525 server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
526 if (!server)
527 goto out_no_server;
528 sb->s_fs_info = server;
529
530 if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
531 goto out_bdi;
532
533 sb->s_bdi = &server->bdi;
534
535 server->super_block = sb;
536 server->mnt = NULL;
537 server->sock_file = NULL;
538 init_waitqueue_head(&server->conn_wq);
539 init_MUTEX(&server->sem);
540 INIT_LIST_HEAD(&server->entry);
541 INIT_LIST_HEAD(&server->xmitq);
542 INIT_LIST_HEAD(&server->recvq);
543 server->conn_error = 0;
544 server->conn_pid = NULL;
545 server->state = CONN_INVALID; /* no connection yet */
546 server->generation = 0;
547
548 /* Allocate the global temp buffer and some superblock helper structs */
549 /* FIXME: move these to the smb_sb_info struct */
550 VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
551 sizeof(struct smb_mount_data_kernel));
552 mem = kmalloc(sizeof(struct smb_ops) +
553 sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
554 if (!mem)
555 goto out_no_mem;
556
557 server->ops = mem;
558 smb_install_null_ops(server->ops);
559 server->mnt = mem + sizeof(struct smb_ops);
560
561 /* Setup NLS stuff */
562 server->remote_nls = NULL;
563 server->local_nls = NULL;
564
565 mnt = server->mnt;
566
567 memset(mnt, 0, sizeof(struct smb_mount_data_kernel));
568 strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT,
569 SMB_NLS_MAXNAMELEN);
570 strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE,
571 SMB_NLS_MAXNAMELEN);
572
573 mnt->ttl = SMB_TTL_DEFAULT;
574 if (ver == SMB_MOUNT_OLDVERSION) {
575 mnt->version = oldmnt->version;
576
577 SET_UID(mnt->uid, oldmnt->uid);
578 SET_GID(mnt->gid, oldmnt->gid);
579
580 mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
581 mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
582
583 mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
584 SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
585 } else {
586 mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
587 S_IROTH | S_IXOTH | S_IFREG;
588 mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
589 S_IROTH | S_IXOTH | S_IFDIR;
590 if (parse_options(mnt, raw_data))
591 goto out_bad_option;
592 }
593 mnt->mounted_uid = current_uid();
594 smb_setcodepage(server, &mnt->codepage);
595
596 /*
597 * Display the enabled options
598 * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2)
599 */
600 if (mnt->flags & SMB_MOUNT_OLDATTR)
601 printk("SMBFS: Using core getattr (Win 95 speedup)\n");
602 else if (mnt->flags & SMB_MOUNT_DIRATTR)
603 printk("SMBFS: Using dir ff getattr\n");
604
605 if (smbiod_register_server(server) < 0) {
606 printk(KERN_ERR "smbfs: failed to start smbiod\n");
607 goto out_no_smbiod;
608 }
609
610 /*
611 * Keep the super block locked while we get the root inode.
612 */
613 smb_init_root_dirent(server, &root, sb);
614 root_inode = smb_iget(sb, &root);
615 if (!root_inode)
616 goto out_no_root;
617
618 sb->s_root = d_alloc_root(root_inode);
619 if (!sb->s_root)
620 goto out_no_root;
621
622 smb_new_dentry(sb->s_root);
623
624 return 0;
625
626out_no_root:
627 iput(root_inode);
628out_no_smbiod:
629 smb_unload_nls(server);
630out_bad_option:
631 kfree(mem);
632out_no_mem:
633 bdi_destroy(&server->bdi);
634out_bdi:
635 if (!server->mnt)
636 printk(KERN_ERR "smb_fill_super: allocation failure\n");
637 sb->s_fs_info = NULL;
638 kfree(server);
639 goto out_fail;
640out_wrong_data:
641 printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
642 goto out_fail;
643out_no_data:
644 printk(KERN_ERR "smb_fill_super: missing data argument\n");
645out_fail:
646 return -EINVAL;
647out_no_server:
648 printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
649 return -ENOMEM;
650}
651
652static int
653smb_statfs(struct dentry *dentry, struct kstatfs *buf)
654{
655 int result;
656
657 lock_kernel();
658
659 result = smb_proc_dskattr(dentry, buf);
660
661 unlock_kernel();
662
663 buf->f_type = SMB_SUPER_MAGIC;
664 buf->f_namelen = SMB_MAXPATHLEN;
665 return result;
666}
667
668int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
669{
670 int err = smb_revalidate_inode(dentry);
671 if (!err)
672 generic_fillattr(dentry->d_inode, stat);
673 return err;
674}
675
676int
677smb_notify_change(struct dentry *dentry, struct iattr *attr)
678{
679 struct inode *inode = dentry->d_inode;
680 struct smb_sb_info *server = server_from_dentry(dentry);
681 unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO);
682 int error, changed, refresh = 0;
683 struct smb_fattr fattr;
684
685 lock_kernel();
686
687 error = smb_revalidate_inode(dentry);
688 if (error)
689 goto out;
690
691 if ((error = inode_change_ok(inode, attr)) < 0)
692 goto out;
693
694 error = -EPERM;
695 if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid))
696 goto out;
697
 698	if ((attr->ia_valid & ATTR_GID) && (attr->ia_gid != server->mnt->gid))
699 goto out;
700
701 if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask))
702 goto out;
703
704 if ((attr->ia_valid & ATTR_SIZE) != 0) {
705 VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n",
706 DENTRY_PATH(dentry),
707 (long) inode->i_size, (long) attr->ia_size);
708
709 filemap_write_and_wait(inode->i_mapping);
710
711 error = smb_open(dentry, O_WRONLY);
712 if (error)
713 goto out;
714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error)
716 goto out;
717 truncate_setsize(inode, attr->ia_size);
718 refresh = 1;
719 }
720
721 if (server->opt.capabilities & SMB_CAP_UNIX) {
722 /* For now we don't want to set the size with setattr_unix */
723 attr->ia_valid &= ~ATTR_SIZE;
724 /* FIXME: only call if we actually want to set something? */
725 error = smb_proc_setattr_unix(dentry, attr, 0, 0);
726 if (!error)
727 refresh = 1;
728
729 goto out;
730 }
731
732 /*
733 * Initialize the fattr and check for changed fields.
734 * Note: CTIME under SMB is creation time rather than
735 * change time, so we don't attempt to change it.
736 */
737 smb_get_inode_attr(inode, &fattr);
738
739 changed = 0;
740 if ((attr->ia_valid & ATTR_MTIME) != 0) {
741 fattr.f_mtime = attr->ia_mtime;
742 changed = 1;
743 }
744 if ((attr->ia_valid & ATTR_ATIME) != 0) {
745 fattr.f_atime = attr->ia_atime;
746 /* Earlier protocols don't have an access time */
747 if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2)
748 changed = 1;
749 }
750 if (changed) {
751 error = smb_proc_settime(dentry, &fattr);
752 if (error)
753 goto out;
754 refresh = 1;
755 }
756
757 /*
758 * Check for mode changes ... we're extremely limited in
759 * what can be set for SMB servers: just the read-only bit.
760 */
761 if ((attr->ia_valid & ATTR_MODE) != 0) {
762 VERBOSE("%s/%s mode change, old=%x, new=%x\n",
763 DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode);
764 changed = 0;
765 if (attr->ia_mode & S_IWUSR) {
766 if (fattr.attr & aRONLY) {
767 fattr.attr &= ~aRONLY;
768 changed = 1;
769 }
770 } else {
771 if (!(fattr.attr & aRONLY)) {
772 fattr.attr |= aRONLY;
773 changed = 1;
774 }
775 }
776 if (changed) {
777 error = smb_proc_setattr(dentry, &fattr);
778 if (error)
779 goto out;
780 refresh = 1;
781 }
782 }
783 error = 0;
784
785out:
786 if (refresh)
787 smb_refresh_inode(dentry);
788 unlock_kernel();
789 return error;
790}
791
792static int smb_get_sb(struct file_system_type *fs_type,
793 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
794{
795 return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
796}
797
798static struct file_system_type smb_fs_type = {
799 .owner = THIS_MODULE,
800 .name = "smbfs",
801 .get_sb = smb_get_sb,
802 .kill_sb = kill_anon_super,
803 .fs_flags = FS_BINARY_MOUNTDATA,
804};
805
806static int __init init_smb_fs(void)
807{
808 int err;
809 DEBUG1("registering ...\n");
810
811 err = init_inodecache();
812 if (err)
813 goto out_inode;
814 err = smb_init_request_cache();
815 if (err)
816 goto out_request;
817 err = register_filesystem(&smb_fs_type);
818 if (err)
819 goto out;
820 return 0;
821out:
822 smb_destroy_request_cache();
823out_request:
824 destroy_inodecache();
825out_inode:
826 return err;
827}
828
829static void __exit exit_smb_fs(void)
830{
831 DEBUG1("unregistering ...\n");
832 unregister_filesystem(&smb_fs_type);
833 smb_destroy_request_cache();
834 destroy_inodecache();
835}
836
837module_init(init_smb_fs)
838module_exit(exit_smb_fs)
839MODULE_LICENSE("GPL");
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
deleted file mode 100644
index 07215312ad39..000000000000
--- a/fs/smbfs/ioctl.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/*
2 * ioctl.c
3 *
4 * Copyright (C) 1995, 1996 by Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/errno.h>
11#include <linux/fs.h>
12#include <linux/ioctl.h>
13#include <linux/time.h>
14#include <linux/mm.h>
15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
17#include <linux/net.h>
18
19#include <linux/smb_fs.h>
20#include <linux/smb_mount.h>
21
22#include <asm/uaccess.h>
23
24#include "proto.h"
25
26long
27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
28{
29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt;
31 int result = -EINVAL;
32
33 lock_kernel();
34 switch (cmd) {
35 uid16_t uid16;
36 uid_t uid32;
37 case SMB_IOC_GETMOUNTUID:
38 SET_UID(uid16, server->mnt->mounted_uid);
39 result = put_user(uid16, (uid16_t __user *) arg);
40 break;
41 case SMB_IOC_GETMOUNTUID32:
42 SET_UID(uid32, server->mnt->mounted_uid);
43 result = put_user(uid32, (uid_t __user *) arg);
44 break;
45
46 case SMB_IOC_NEWCONN:
47 /* arg is smb_conn_opt, or NULL if no connection was made */
48 if (!arg) {
49 result = 0;
50 smb_lock_server(server);
51 server->state = CONN_RETRIED;
52 printk(KERN_ERR "Connection attempt failed! [%d]\n",
53 server->conn_error);
54 smbiod_flush(server);
55 smb_unlock_server(server);
56 break;
57 }
58
59 result = -EFAULT;
60 if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt)))
61 result = smb_newconn(server, &opt);
62 break;
63 default:
64 break;
65 }
66 unlock_kernel();
67
68 return result;
69}
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
deleted file mode 100644
index 71c29b6670b4..000000000000
--- a/fs/smbfs/proc.c
+++ /dev/null
@@ -1,3507 +0,0 @@
1/*
2 * proc.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/types.h>
11#include <linux/capability.h>
12#include <linux/errno.h>
13#include <linux/slab.h>
14#include <linux/fs.h>
15#include <linux/file.h>
16#include <linux/stat.h>
17#include <linux/fcntl.h>
18#include <linux/dcache.h>
19#include <linux/nls.h>
20#include <linux/smp_lock.h>
21#include <linux/net.h>
22#include <linux/vfs.h>
23#include <linux/smb_fs.h>
24#include <linux/smbno.h>
25#include <linux/smb_mount.h>
26
27#include <net/sock.h>
28
29#include <asm/string.h>
30#include <asm/div64.h>
31
32#include "smb_debug.h"
33#include "proto.h"
34#include "request.h"
35
36
 37/* Features. Undefine if they cause problems; this should perhaps be a
38 config option. */
39#define SMBFS_POSIX_UNLINK 1
40
41/* Allow smb_retry to be interrupted. */
42#define SMB_RETRY_INTR
43
44#define SMB_VWV(packet) ((packet) + SMB_HEADER_LEN)
45#define SMB_CMD(packet) (*(packet+8))
46#define SMB_WCT(packet) (*(packet+SMB_HEADER_LEN - 1))
47
48#define SMB_DIRINFO_SIZE 43
49#define SMB_STATUS_SIZE 21
50
51#define SMB_ST_BLKSIZE (PAGE_SIZE)
52#define SMB_ST_BLKSHIFT (PAGE_SHIFT)
53
54static struct smb_ops smb_ops_core;
55static struct smb_ops smb_ops_os2;
56static struct smb_ops smb_ops_win95;
57static struct smb_ops smb_ops_winNT;
58static struct smb_ops smb_ops_unix;
59static struct smb_ops smb_ops_null;
60
61static void
62smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
63static void
64smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
65static int
66smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
67 struct smb_fattr *fattr);
68static int
69smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
70 struct smb_fattr *fattr);
71static int
72smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
73 u16 attr);
74static int
75smb_proc_setattr_ext(struct smb_sb_info *server,
76 struct inode *inode, struct smb_fattr *fattr);
77static int
78smb_proc_query_cifsunix(struct smb_sb_info *server);
79static void
80install_ops(struct smb_ops *dst, struct smb_ops *src);
81
82
83static void
84str_upper(char *name, int len)
85{
86 while (len--)
87 {
88 if (*name >= 'a' && *name <= 'z')
89 *name -= ('a' - 'A');
90 name++;
91 }
92}
93
94#if 0
95static void
96str_lower(char *name, int len)
97{
98 while (len--)
99 {
100 if (*name >= 'A' && *name <= 'Z')
101 *name += ('a' - 'A');
102 name++;
103 }
104}
105#endif
106
107/* reverse a string inline. This is used by the dircache walking routines */
108static void reverse_string(char *buf, int len)
109{
110 char c;
111 char *end = buf+len-1;
112
113 while(buf < end) {
114 c = *buf;
115 *(buf++) = *end;
116 *(end--) = c;
117 }
118}
119
120/* no conversion, just a wrapper for memcpy. */
121static int convert_memcpy(unsigned char *output, int olen,
122 const unsigned char *input, int ilen,
123 struct nls_table *nls_from,
124 struct nls_table *nls_to)
125{
126 if (olen < ilen)
127 return -ENAMETOOLONG;
128 memcpy(output, input, ilen);
129 return ilen;
130}
131
132static inline int write_char(unsigned char ch, char *output, int olen)
133{
134 if (olen < 4)
135 return -ENAMETOOLONG;
136 sprintf(output, ":x%02x", ch);
137 return 4;
138}
139
140static inline int write_unichar(wchar_t ch, char *output, int olen)
141{
142 if (olen < 5)
143 return -ENAMETOOLONG;
144 sprintf(output, ":%04x", ch);
145 return 5;
146}
147
148/* convert from one "codepage" to another (possibly being utf8). */
149static int convert_cp(unsigned char *output, int olen,
150 const unsigned char *input, int ilen,
151 struct nls_table *nls_from,
152 struct nls_table *nls_to)
153{
154 int len = 0;
155 int n;
156 wchar_t ch;
157
158 while (ilen > 0) {
159 /* convert by changing to unicode and back to the new cp */
160 n = nls_from->char2uni(input, ilen, &ch);
161 if (n == -EINVAL) {
162 ilen--;
163 n = write_char(*input++, output, olen);
164 if (n < 0)
165 goto fail;
166 output += n;
167 olen -= n;
168 len += n;
169 continue;
170 } else if (n < 0)
171 goto fail;
172 input += n;
173 ilen -= n;
174
175 n = nls_to->uni2char(ch, output, olen);
176 if (n == -EINVAL)
177 n = write_unichar(ch, output, olen);
178 if (n < 0)
179 goto fail;
180 output += n;
181 olen -= n;
182
183 len += n;
184 }
185 return len;
186fail:
187 return n;
188}
189
190/* ----------------------------------------------------------- */
191
192/*
193 * nls_unicode
194 *
195 * This encodes/decodes little endian unicode format
196 */
197
198static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
199{
200 if (boundlen < 2)
201 return -EINVAL;
202 *out++ = uni & 0xff;
203 *out++ = uni >> 8;
204 return 2;
205}
206
207static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
208{
209 if (boundlen < 2)
210 return -EINVAL;
211 *uni = (rawstring[1] << 8) | rawstring[0];
212 return 2;
213}
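/*
 * Editorial note (not part of the original patch): this pair implements
 * the UCS-2 little-endian encoding SMB uses for Unicode strings on the
 * wire, low byte first.  Examples: 'A' (U+0041) becomes the bytes
 * 41 00, and U+263A becomes 3a 26; char2uni() reverses the same
 * two-byte transform.  Code points outside the Basic Multilingual
 * Plane are not representable in this table.
 */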
214
215static struct nls_table unicode_table = {
216 .charset = "unicode",
217 .uni2char = uni2char,
218 .char2uni = char2uni,
219};
220
221/* ----------------------------------------------------------- */
222
223static int setcodepage(struct nls_table **p, char *name)
224{
225 struct nls_table *nls;
226
227 if (!name || !*name) {
228 nls = NULL;
229 } else if ( (nls = load_nls(name)) == NULL) {
230 printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name);
231 return -EINVAL;
232 }
233
234 /* if already set, unload the previous one. */
235 if (*p && *p != &unicode_table)
236 unload_nls(*p);
237 *p = nls;
238
239 return 0;
240}
241
242/* Handles all changes to codepage settings. */
243int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp)
244{
245 int n = 0;
246
247 smb_lock_server(server);
248
249 /* Don't load any nls_* at all, if no remote is requested */
250 if (!*cp->remote_name)
251 goto out;
252
253 /* local */
254 n = setcodepage(&server->local_nls, cp->local_name);
255 if (n != 0)
256 goto out;
257
258 /* remote */
259 if (!strcmp(cp->remote_name, "unicode")) {
260 server->remote_nls = &unicode_table;
261 } else {
262 n = setcodepage(&server->remote_nls, cp->remote_name);
263 if (n != 0)
264 setcodepage(&server->local_nls, NULL);
265 }
266
267out:
268 if (server->local_nls != NULL && server->remote_nls != NULL)
269 server->ops->convert = convert_cp;
270 else
271 server->ops->convert = convert_memcpy;
272
273 smb_unlock_server(server);
274 return n;
275}
276
277
278/*****************************************************************************/
279/* */
280/* Encoding/Decoding section */
281/* */
282/*****************************************************************************/
283
284static __u8 *
285smb_encode_smb_length(__u8 * p, __u32 len)
286{
287 *p = 0;
288 *(p+1) = 0;
289 *(p+2) = (len & 0xFF00) >> 8;
290 *(p+3) = (len & 0xFF);
291 if (len > 0xFFFF)
292 {
293 *(p+1) = 1;
294 }
295 return p + 4;
296}
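/*
 * Editorial note (not part of the original patch): this writes the
 * four-byte NetBIOS session header in front of each SMB: byte 0 is the
 * message type (0), byte 1 carries bit 16 of the length, and bytes 2-3
 * hold the low 16 bits, most significant byte first.  Worked example
 * with an arbitrary length: len = 0x1abcd encodes as 00 01 ab cd, and
 * smb_len() further down recovers 0x1abcd from the same four bytes.
 */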
297
298/*
299 * smb_build_path: build the path to entry and name storing it in buf.
300 * The path returned will have the trailing '\0'.
301 */
302static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
303 int maxlen,
304 struct dentry *entry, struct qstr *name)
305{
306 unsigned char *path = buf;
307 int len;
308 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
309
310 if (maxlen < (2<<unicode))
311 return -ENAMETOOLONG;
312
313 if (maxlen > SMB_MAXPATHLEN + 1)
314 maxlen = SMB_MAXPATHLEN + 1;
315
316 if (entry == NULL)
317 goto test_name_and_out;
318
319 /*
 320 * If IS_ROOT, we don't have to walk at all.
321 */
322 if (IS_ROOT(entry) && !name) {
323 *path++ = '\\';
324 if (unicode) *path++ = '\0';
325 *path++ = '\0';
326 if (unicode) *path++ = '\0';
327 return path-buf;
328 }
329
330 /*
331 * Build the path string walking the tree backward from end to ROOT
332 * and store it in reversed order [see reverse_string()]
333 */
334 dget(entry);
335 spin_lock(&entry->d_lock);
336 while (!IS_ROOT(entry)) {
337 struct dentry *parent;
338
339 if (maxlen < (3<<unicode)) {
340 spin_unlock(&entry->d_lock);
341 dput(entry);
342 return -ENAMETOOLONG;
343 }
344
345 len = server->ops->convert(path, maxlen-2,
346 entry->d_name.name, entry->d_name.len,
347 server->local_nls, server->remote_nls);
348 if (len < 0) {
349 spin_unlock(&entry->d_lock);
350 dput(entry);
351 return len;
352 }
353 reverse_string(path, len);
354 path += len;
355 if (unicode) {
356 /* Note: reverse order */
357 *path++ = '\0';
358 maxlen--;
359 }
360 *path++ = '\\';
361 maxlen -= len+1;
362
363 parent = entry->d_parent;
364 dget(parent);
365 spin_unlock(&entry->d_lock);
366 dput(entry);
367 entry = parent;
368 spin_lock(&entry->d_lock);
369 }
370 spin_unlock(&entry->d_lock);
371 dput(entry);
372 reverse_string(buf, path-buf);
373
374 /* maxlen has space for at least one char */
375test_name_and_out:
376 if (name) {
377 if (maxlen < (3<<unicode))
378 return -ENAMETOOLONG;
379 *path++ = '\\';
380 if (unicode) {
381 *path++ = '\0';
382 maxlen--;
383 }
384 len = server->ops->convert(path, maxlen-2,
385 name->name, name->len,
386 server->local_nls, server->remote_nls);
387 if (len < 0)
388 return len;
389 path += len;
390 maxlen -= len+1;
391 }
392 /* maxlen has space for at least one char */
393 *path++ = '\0';
394 if (unicode) *path++ = '\0';
395 return path-buf;
396}
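/*
 * Editorial note (not part of the original patch): a worked example of
 * the reversed-order construction above, for a dentry "bc" under
 * directory "a" (names chosen for illustration).  Walking child to
 * root, each component is converted, reversed in place, and followed
 * by a backslash:
 *
 *   after "bc":  buf = "cb\"
 *   after "a":   buf = "cb\a\"
 *
 * One final reverse_string() over the whole buffer then yields
 * "\a\bc", the server-side path, before the terminating NUL (doubled
 * in the unicode case) is appended.
 */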
397
398static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
399 struct dentry *dir, struct qstr *name)
400{
401 int result;
402
403 result = smb_build_path(server, buf, maxlen, dir, name);
404 if (result < 0)
405 goto out;
406 if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
407 str_upper(buf, result);
408out:
409 return result;
410}
411
412/* encode_path for non-trans2 request SMBs */
413static int smb_simple_encode_path(struct smb_request *req, char **p,
414 struct dentry * entry, struct qstr * name)
415{
416 struct smb_sb_info *server = req->rq_server;
417 char *s = *p;
418 int res;
419 int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
420 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
421
422 if (!maxlen)
423 return -ENAMETOOLONG;
424 *s++ = 4; /* ASCII data format */
425
426 /*
 427	 * SMB Unicode strings must be 16-bit aligned relative to the start of
 428	 * the packet. If they are not, they must be padded with a 0.
429 */
430 if (unicode) {
431 int align = s - (char *)req->rq_buffer;
432 if (!(align & 1)) {
433 *s++ = '\0';
434 maxlen--;
435 }
436 }
437
438 res = smb_encode_path(server, s, maxlen-1, entry, name);
439 if (res < 0)
440 return res;
441 *p = s + res;
442 return 0;
443}
444
445/* The following are taken directly from msdos-fs */
446
447/* Linear day numbers of the respective 1sts in non-leap years. */
448
449static int day_n[] =
450{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
451 /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */
452
453
454static time_t
455utc2local(struct smb_sb_info *server, time_t time)
456{
457 return time - server->opt.serverzone*60;
458}
459
460static time_t
461local2utc(struct smb_sb_info *server, time_t time)
462{
463 return time + server->opt.serverzone*60;
464}
465
466/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
467
468static time_t
469date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time)
470{
471 int month, year;
472 time_t secs;
473
 474 /* Subtract first and mask afterwards; otherwise the month
 475 underflows when date == 0. */
476 month = ((date >> 5) - 1) & 15;
477 year = date >> 9;
 478 secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + 3600 * (time >> 11)
 479 + 86400 * ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365
 480 - ((year & 3) == 0 && month < 2 ? 1 : 0) + 3653);
 481 /* days since 1.1.1970, plus the leap day of 1980 */
482 return local2utc(server, secs);
483}
484
485
486/* Convert linear UNIX date to a MS-DOS time/date pair. */
487
488static void
489date_unix2dos(struct smb_sb_info *server,
490 int unix_date, __u16 *date, __u16 *time)
491{
492 int day, year, nl_day, month;
493
494 unix_date = utc2local(server, unix_date);
495 if (unix_date < 315532800)
496 unix_date = 315532800;
497
498 *time = (unix_date % 60) / 2 +
499 (((unix_date / 60) % 60) << 5) +
500 (((unix_date / 3600) % 24) << 11);
501
502 day = unix_date / 86400 - 3652;
503 year = day / 365;
504 if ((year + 3) / 4 + 365 * year > day)
505 year--;
506 day -= (year + 3) / 4 + 365 * year;
507 if (day == 59 && !(year & 3)) {
508 nl_day = day;
509 month = 2;
510 } else {
511 nl_day = (year & 3) || day <= 59 ? day : day - 1;
512 for (month = 1; month < 12; month++)
513 if (day_n[month] > nl_day)
514 break;
515 }
516 *date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9);
517}
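
/*
 * Round-trip sketch (illustration only, assuming serverzone == 0 so
 * that utc2local()/local2utc() are identities): 1 Jan 1980 00:00:00 is
 * DOS date 0x0021 (day 1, month 1, year 0 since 1980), time 0x0000,
 * which is 315532800 in Unix time.
 */
#if 0
static void smb_dos_date_example(struct smb_sb_info *server)
{
	__u16 date, time;
	time_t t;

	t = date_dos2unix(server, 0x0021, 0x0000);
	/* expect t == 315532800 when server->opt.serverzone == 0 */
	date_unix2dos(server, t, &date, &time);
	/* expect date == 0x0021 and time == 0x0000 again */
}
#endif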
518
519/* The following are taken from fs/ntfs/util.c */
520
521#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
522
523/*
524 * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
525 * into Unix UTC (based 1970-01-01, in seconds).
526 */
527static struct timespec
528smb_ntutc2unixutc(u64 ntutc)
529{
530 struct timespec ts;
531 /* FIXME: what about the timezone difference? */
532 /* Subtract the NTFS time offset, then convert to 1s intervals. */
533 u64 t = ntutc - NTFS_TIME_OFFSET;
534 ts.tv_nsec = do_div(t, 10000000) * 100;
535 ts.tv_sec = t;
536 return ts;
537}
538
539/* Convert the Unix UTC into NT time */
540static u64
541smb_unixutc2ntutc(struct timespec ts)
542{
543 /* Note: timezone conversion is probably wrong. */
544 /* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */
545 return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET;
546}
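
/*
 * Sketch (illustration only): NTFS_TIME_OFFSET is 1970-01-01 expressed
 * on the NT scale (369 years plus 89 leap days, in 100ns units), so it
 * must map to the Unix epoch and round-trip unchanged.
 */
#if 0
static void smb_nttime_example(void)
{
	struct timespec ts = smb_ntutc2unixutc(NTFS_TIME_OFFSET);
	u64 nt = smb_unixutc2ntutc(ts);
	/* expect ts.tv_sec == 0, ts.tv_nsec == 0, nt == NTFS_TIME_OFFSET */
}
#endif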
547
548#define MAX_FILE_MODE 6
549static mode_t file_mode[] = {
550 S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK
551};
552
553static int smb_filetype_to_mode(u32 filetype)
554{
555 if (filetype > MAX_FILE_MODE) {
556 PARANOIA("Filetype out of range: %d\n", filetype);
557 return S_IFREG;
558 }
559 return file_mode[filetype];
560}
561
562static u32 smb_filetype_from_mode(int mode)
563{
564 if (S_ISREG(mode))
565 return UNIX_TYPE_FILE;
566 if (S_ISDIR(mode))
567 return UNIX_TYPE_DIR;
568 if (S_ISLNK(mode))
569 return UNIX_TYPE_SYMLINK;
570 if (S_ISCHR(mode))
571 return UNIX_TYPE_CHARDEV;
572 if (S_ISBLK(mode))
573 return UNIX_TYPE_BLKDEV;
574 if (S_ISFIFO(mode))
575 return UNIX_TYPE_FIFO;
576 if (S_ISSOCK(mode))
577 return UNIX_TYPE_SOCKET;
578 return UNIX_TYPE_UNKNOWN;
579}
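
/*
 * The two helpers above are inverses over the known types, assuming
 * the UNIX_TYPE_* constants follow the order of file_mode[] (i.e.
 * UNIX_TYPE_FILE == 0, UNIX_TYPE_DIR == 1, and so on). Sketch:
 */
#if 0
static void smb_filetype_example(void)
{
	u32 t = smb_filetype_from_mode(S_IFDIR | 0755);	/* UNIX_TYPE_DIR */
	mode_t m = smb_filetype_to_mode(t);		/* back to S_IFDIR */
}
#endif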
580
581
582/*****************************************************************************/
583/* */
584/* Support section. */
585/* */
586/*****************************************************************************/
587
588__u32
589smb_len(__u8 * p)
590{
591 return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3);
592}
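
/*
 * Example (illustration only): the NetBIOS session header keeps a
 * 17-bit payload length in the low bit of byte 1 plus bytes 2-3,
 * big-endian.
 */
#if 0
static void smb_len_example(void)
{
	__u8 hdr[4] = { 0x00, 0x01, 0x00, 0x2c };
	/* smb_len(hdr) == 0x1002c == 65580 */
}
#endif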
593
594static __u16
595smb_bcc(__u8 * packet)
596{
597 int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16);
598 return WVAL(packet, pos);
599}
600
 601 /* smb_valid_packet: check whether the packet fulfills the basic
 602 requirements of an SMB packet */
603
604static int
605smb_valid_packet(__u8 * packet)
606{
607 return (packet[4] == 0xff
608 && packet[5] == 'S'
609 && packet[6] == 'M'
610 && packet[7] == 'B'
611 && (smb_len(packet) + 4 == SMB_HEADER_LEN
612 + SMB_WCT(packet) * 2 + smb_bcc(packet)));
613}
614
615/* smb_verify: We check if we got the answer we expected, and if we
616 got enough data. If bcc == -1, we don't care. */
617
618static int
619smb_verify(__u8 * packet, int command, int wct, int bcc)
620{
621 if (SMB_CMD(packet) != command)
622 goto bad_command;
623 if (SMB_WCT(packet) < wct)
624 goto bad_wct;
625 if (bcc != -1 && smb_bcc(packet) < bcc)
626 goto bad_bcc;
627 return 0;
628
629bad_command:
630 printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n",
631 command, SMB_CMD(packet));
632 goto fail;
633bad_wct:
634 printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n",
635 command, wct, SMB_WCT(packet));
636 goto fail;
637bad_bcc:
638 printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n",
639 command, bcc, smb_bcc(packet));
640fail:
641 return -EIO;
642}
643
644/*
 645 * Returns the maximum read or write size for the "payload", such that
 646 * the whole packet fits within the negotiated max_xmit size.
647 *
648 * N.B. Since this value is usually computed before locking the server,
649 * the server's packet size must never be decreased!
650 */
651static inline int
652smb_get_xmitsize(struct smb_sb_info *server, int overhead)
653{
654 return server->opt.max_xmit - overhead;
655}
656
657/*
658 * Calculate the maximum read size
659 */
660int
661smb_get_rsize(struct smb_sb_info *server)
662{
663 /* readX has 12 parameters, read has 5 */
664 int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2;
665 int size = smb_get_xmitsize(server, overhead);
666
667 VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
668
669 return size;
670}
671
672/*
673 * Calculate the maximum write size
674 */
675int
676smb_get_wsize(struct smb_sb_info *server)
677{
678 /* writeX has 14 parameters, write has 5 */
679 int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2;
680 int size = smb_get_xmitsize(server, overhead);
681
682 VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
683
684 return size;
685}
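
/*
 * Arithmetic sketch (assuming SMB_HEADER_LEN is 37): readX overhead is
 * 37 + 12*2 + 2 + 1 + 2 = 66 bytes, so a negotiated max_xmit of 4356
 * gives an rsize of 4290; writeX carries two more parameter words,
 * giving a wsize of 4286.
 */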
686
687/*
688 * Convert SMB error codes to -E... errno values.
689 */
690int
691smb_errno(struct smb_request *req)
692{
693 int errcls = req->rq_rcls;
694 int error = req->rq_err;
695 char *class = "Unknown";
696
697 VERBOSE("errcls %d code %d from command 0x%x\n",
698 errcls, error, SMB_CMD(req->rq_header));
699
700 if (errcls == ERRDOS) {
701 switch (error) {
702 case ERRbadfunc:
703 return -EINVAL;
704 case ERRbadfile:
705 case ERRbadpath:
706 return -ENOENT;
707 case ERRnofids:
708 return -EMFILE;
709 case ERRnoaccess:
710 return -EACCES;
711 case ERRbadfid:
712 return -EBADF;
713 case ERRbadmcb:
714 return -EREMOTEIO;
715 case ERRnomem:
716 return -ENOMEM;
717 case ERRbadmem:
718 return -EFAULT;
719 case ERRbadenv:
720 case ERRbadformat:
721 return -EREMOTEIO;
722 case ERRbadaccess:
723 return -EACCES;
724 case ERRbaddata:
725 return -E2BIG;
726 case ERRbaddrive:
727 return -ENXIO;
728 case ERRremcd:
729 return -EREMOTEIO;
730 case ERRdiffdevice:
731 return -EXDEV;
732 case ERRnofiles:
733 return -ENOENT;
734 case ERRbadshare:
735 return -ETXTBSY;
736 case ERRlock:
737 return -EDEADLK;
738 case ERRfilexists:
739 return -EEXIST;
740 case ERROR_INVALID_PARAMETER:
741 return -EINVAL;
742 case ERROR_DISK_FULL:
743 return -ENOSPC;
744 case ERROR_INVALID_NAME:
745 return -ENOENT;
746 case ERROR_DIR_NOT_EMPTY:
747 return -ENOTEMPTY;
748 case ERROR_NOT_LOCKED:
749 return -ENOLCK;
750 case ERROR_ALREADY_EXISTS:
751 return -EEXIST;
752 default:
753 class = "ERRDOS";
754 goto err_unknown;
755 }
756 } else if (errcls == ERRSRV) {
757 switch (error) {
758 /* N.B. This is wrong ... EIO ? */
759 case ERRerror:
760 return -ENFILE;
761 case ERRbadpw:
762 return -EINVAL;
763 case ERRbadtype:
764 case ERRtimeout:
765 return -EIO;
766 case ERRaccess:
767 return -EACCES;
768 /*
769 * This is a fatal error, as it means the "tree ID"
770 * for this connection is no longer valid. We map
771 * to a special error code and get a new connection.
772 */
773 case ERRinvnid:
774 return -EBADSLT;
775 default:
776 class = "ERRSRV";
777 goto err_unknown;
778 }
779 } else if (errcls == ERRHRD) {
780 switch (error) {
781 case ERRnowrite:
782 return -EROFS;
783 case ERRbadunit:
784 return -ENODEV;
785 case ERRnotready:
786 return -EUCLEAN;
787 case ERRbadcmd:
788 case ERRdata:
789 return -EIO;
790 case ERRbadreq:
791 return -ERANGE;
792 case ERRbadshare:
793 return -ETXTBSY;
794 case ERRlock:
795 return -EDEADLK;
796 case ERRdiskfull:
797 return -ENOSPC;
798 default:
799 class = "ERRHRD";
800 goto err_unknown;
801 }
802 } else if (errcls == ERRCMD) {
803 class = "ERRCMD";
804 } else if (errcls == SUCCESS) {
805 return 0; /* This is the only valid 0 return */
806 }
807
808err_unknown:
809 printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n",
810 class, error, SMB_CMD(req->rq_header));
811 return -EIO;
812}
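
/*
 * Example: an ERRDOS/ERRbadfile reply maps to -ENOENT above, while
 * ERRSRV/ERRinvnid maps to the special -EBADSLT so the caller can set
 * up a new connection.
 */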
813
814/* smb_request_ok: We expect the server to be locked. Then we do the
815 request and check the answer completely. When smb_request_ok
816 returns 0, you can be quite sure that everything went well. When
 817 the answer is < 0, the returned number is a valid unix errno. */
818
819static int
820smb_request_ok(struct smb_request *req, int command, int wct, int bcc)
821{
822 int result;
823
824 req->rq_resp_wct = wct;
825 req->rq_resp_bcc = bcc;
826
827 result = smb_add_request(req);
828 if (result != 0) {
829 DEBUG1("smb_request failed\n");
830 goto out;
831 }
832
 833 if (!smb_valid_packet(req->rq_header)) {
 834 PARANOIA("invalid packet!\n");
 835 result = -EIO;
 836 goto out;
 837 }
838 result = smb_verify(req->rq_header, command, wct, bcc);
839
840out:
841 return result;
842}
843
844/*
845 * This implements the NEWCONN ioctl. It installs the server pid,
846 * sets server->state to CONN_VALID, and wakes up the waiting process.
847 */
848int
849smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
850{
851 struct file *filp;
852 struct sock *sk;
853 int error;
854
855 VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid);
856
857 smb_lock_server(server);
858
859 /*
860 * Make sure we don't already have a valid connection ...
861 */
862 error = -EINVAL;
863 if (server->state == CONN_VALID)
864 goto out;
865
866 error = -EACCES;
867 if (current_uid() != server->mnt->mounted_uid &&
868 !capable(CAP_SYS_ADMIN))
869 goto out;
870
871 error = -EBADF;
872 filp = fget(opt->fd);
873 if (!filp)
874 goto out;
875 if (!smb_valid_socket(filp->f_path.dentry->d_inode))
876 goto out_putf;
877
878 server->sock_file = filp;
879 server->conn_pid = get_pid(task_pid(current));
880 server->opt = *opt;
881 server->generation += 1;
882 server->state = CONN_VALID;
883 error = 0;
884
885 if (server->conn_error) {
 886 /*
 887 * conn_error is the return code that originally made us drop
 888 * the old connection. This message should sound positive, so
 889 * it doesn't make people ask why smbfs is printing error
 890 * messages ...
 891 */
892 printk(KERN_INFO "SMB connection re-established (%d)\n",
893 server->conn_error);
894 server->conn_error = 0;
895 }
896
897 /*
898 * Store the server in sock user_data (Only used by sunrpc)
899 */
900 sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk;
901 sk->sk_user_data = server;
902
903 /* chain into the data_ready callback */
904 server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready);
905
906 /* check if we have an old smbmount that uses seconds for the
907 serverzone */
908 if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60)
909 server->opt.serverzone /= 60;
910
911 /* now that we have an established connection we can detect the server
912 type and enable bug workarounds */
913 if (server->opt.protocol < SMB_PROTOCOL_LANMAN2)
914 install_ops(server->ops, &smb_ops_core);
915 else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2)
916 install_ops(server->ops, &smb_ops_os2);
917 else if (server->opt.protocol == SMB_PROTOCOL_NT1 &&
918 (server->opt.max_xmit < 0x1000) &&
919 !(server->opt.capabilities & SMB_CAP_NT_SMBS)) {
920 /* FIXME: can we kill the WIN95 flag now? */
921 server->mnt->flags |= SMB_MOUNT_WIN95;
922 VERBOSE("detected WIN95 server\n");
923 install_ops(server->ops, &smb_ops_win95);
924 } else {
925 /*
926 * Samba has max_xmit 65535
927 * NT4spX has max_xmit 4536 (or something like that)
928 * win2k has ...
929 */
930 VERBOSE("detected NT1 (Samba, NT4/5) server\n");
931 install_ops(server->ops, &smb_ops_winNT);
932 }
933
934 /* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */
935 if (server->mnt->flags & SMB_MOUNT_OLDATTR) {
936 server->ops->getattr = smb_proc_getattr_core;
937 } else if (server->mnt->flags & SMB_MOUNT_DIRATTR) {
938 server->ops->getattr = smb_proc_getattr_ff;
939 }
940
941 /* Decode server capabilities */
942 if (server->opt.capabilities & SMB_CAP_LARGE_FILES) {
943 /* Should be ok to set this now, as no one can access the
944 mount until the connection has been established. */
945 SB_of(server)->s_maxbytes = ~0ULL >> 1;
946 VERBOSE("LFS enabled\n");
947 }
948 if (server->opt.capabilities & SMB_CAP_UNICODE) {
949 server->mnt->flags |= SMB_MOUNT_UNICODE;
950 VERBOSE("Unicode enabled\n");
951 } else {
952 server->mnt->flags &= ~SMB_MOUNT_UNICODE;
953 }
954#if 0
955 /* flags we may test for other patches ... */
956 if (server->opt.capabilities & SMB_CAP_LARGE_READX) {
957 VERBOSE("Large reads enabled\n");
958 }
959 if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) {
960 VERBOSE("Large writes enabled\n");
961 }
962#endif
963 if (server->opt.capabilities & SMB_CAP_UNIX) {
964 struct inode *inode;
965 VERBOSE("Using UNIX CIFS extensions\n");
966 install_ops(server->ops, &smb_ops_unix);
967 inode = SB_of(server)->s_root->d_inode;
968 if (inode)
969 inode->i_op = &smb_dir_inode_operations_unix;
970 }
971
972 VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n",
973 server->opt.protocol, server->opt.max_xmit,
974 pid_nr(server->conn_pid), server->opt.capabilities);
975
976 /* FIXME: this really should be done by smbmount. */
977 if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) {
978 server->opt.max_xmit = SMB_MAX_PACKET_SIZE;
979 }
980
981 smb_unlock_server(server);
982 smbiod_wake_up();
983 if (server->opt.capabilities & SMB_CAP_UNIX)
984 smb_proc_query_cifsunix(server);
985
986 server->conn_complete++;
987 wake_up_interruptible_all(&server->conn_wq);
988 return error;
989
990out:
991 smb_unlock_server(server);
992 smbiod_wake_up();
993 return error;
994
995out_putf:
996 fput(filp);
997 goto out;
998}
999
1000/* smb_setup_header: We completely set up the packet. You only have to
1001 insert the command-specific fields */
1002
1003__u8 *
1004smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc)
1005{
1006 __u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2;
1007 __u8 *p = req->rq_header;
1008 struct smb_sb_info *server = req->rq_server;
1009
1010 p = smb_encode_smb_length(p, xmit_len - 4);
1011
1012 *p++ = 0xff;
1013 *p++ = 'S';
1014 *p++ = 'M';
1015 *p++ = 'B';
1016 *p++ = command;
1017
 1018 memset(p, '\0', 19); /* zero error class/code, flg, flg2 and the reserved bytes */
 1019 p += 19;
 1020 p += 8; /* skip tid, pid, uid and mid */
1021
1022 if (server->opt.protocol > SMB_PROTOCOL_CORE) {
1023 int flags = SMB_FLAGS_CASELESS_PATHNAMES;
1024 int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS |
1025 SMB_FLAGS2_EXTENDED_ATTRIBUTES; /* EA? not really ... */
1026
1027 *(req->rq_header + smb_flg) = flags;
1028 if (server->mnt->flags & SMB_MOUNT_UNICODE)
1029 flags2 |= SMB_FLAGS2_UNICODE_STRINGS;
1030 WSET(req->rq_header, smb_flg2, flags2);
1031 }
1032 *p++ = wct; /* wct */
1033 p += 2 * wct;
1034 WSET(p, 0, bcc);
1035
1036 /* Include the header in the data to send */
1037 req->rq_iovlen = 1;
1038 req->rq_iov[0].iov_base = req->rq_header;
1039 req->rq_iov[0].iov_len = xmit_len - bcc;
1040
1041 return req->rq_buffer;
1042}
1043
1044static void
1045smb_setup_bcc(struct smb_request *req, __u8 *p)
1046{
1047 u16 bcc = p - req->rq_buffer;
1048 u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header);
1049
1050 WSET(pbcc, 0, bcc);
1051
1052 smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN +
1053 2*SMB_WCT(req->rq_header) - 2 + bcc);
1054
1055 /* Include the "bytes" in the data to send */
1056 req->rq_iovlen = 2;
1057 req->rq_iov[1].iov_base = req->rq_buffer;
1058 req->rq_iov[1].iov_len = bcc;
1059}
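
/*
 * Typical use of the two helpers above (sketch only): set up the
 * header with the word count, fill in the vwv words, append the data
 * bytes, then let smb_setup_bcc() patch in the byte count and the
 * NetBIOS length. smb_proc_open() below follows this pattern.
 */
#if 0
static void smb_request_pattern(struct smb_request *req)
{
	__u8 *p = smb_setup_header(req, SMBopen, 2, 0);

	WSET(req->rq_header, smb_vwv0, 0x42);	/* e.g. open r/w */
	/* ... append the "bytes" section at p ... */
	smb_setup_bcc(req, p);
}
#endif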
1060
1061static int
1062smb_proc_seek(struct smb_sb_info *server, __u16 fileid,
1063 __u16 mode, off_t offset)
1064{
1065 int result;
1066 struct smb_request *req;
1067
1068 result = -ENOMEM;
1069 if (! (req = smb_alloc_request(server, 0)))
1070 goto out;
1071
1072 smb_setup_header(req, SMBlseek, 4, 0);
1073 WSET(req->rq_header, smb_vwv0, fileid);
1074 WSET(req->rq_header, smb_vwv1, mode);
1075 DSET(req->rq_header, smb_vwv2, offset);
1076 req->rq_flags |= SMB_REQ_NORETRY;
1077
1078 result = smb_request_ok(req, SMBlseek, 2, 0);
1079 if (result < 0) {
1080 result = 0;
1081 goto out_free;
1082 }
1083
1084 result = DVAL(req->rq_header, smb_vwv0);
1085out_free:
1086 smb_rput(req);
1087out:
1088 return result;
1089}
1090
1091static int
1092smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish)
1093{
1094 struct inode *ino = dentry->d_inode;
1095 struct smb_inode_info *ei = SMB_I(ino);
1096 int mode, read_write = 0x42, read_only = 0x40;
1097 int res;
1098 char *p;
1099 struct smb_request *req;
1100
1101 /*
1102 * Attempt to open r/w, unless there are no write privileges.
1103 */
1104 mode = read_write;
1105 if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH)))
1106 mode = read_only;
1107#if 0
 1108 /* FIXME: why is this code disabled? Below we make sure a caller
 1109 wanting RO doesn't get RW. smb_revalidate_inode does some
 1110 optimization based on the access mode; tail -f needs it to be correct.
 1111
 1112 We must open r/w since we don't repeat the open when called a second
 1113 time with a different 'wish'. Is that not supported by SMB servers? */
1114 if (!(wish & (O_WRONLY | O_RDWR)))
1115 mode = read_only;
1116#endif
1117
1118 res = -ENOMEM;
1119 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1120 goto out;
1121
1122 retry:
1123 p = smb_setup_header(req, SMBopen, 2, 0);
1124 WSET(req->rq_header, smb_vwv0, mode);
1125 WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR);
1126 res = smb_simple_encode_path(req, &p, dentry, NULL);
1127 if (res < 0)
1128 goto out_free;
1129 smb_setup_bcc(req, p);
1130
1131 res = smb_request_ok(req, SMBopen, 7, 0);
1132 if (res != 0) {
1133 if (mode == read_write &&
1134 (res == -EACCES || res == -ETXTBSY || res == -EROFS))
1135 {
1136 VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n",
1137 DENTRY_PATH(dentry), res);
1138 mode = read_only;
1139 req->rq_flags = 0;
1140 goto retry;
1141 }
1142 goto out_free;
1143 }
1144 /* We should now have data in vwv[0..6]. */
1145
1146 ei->fileid = WVAL(req->rq_header, smb_vwv0);
1147 ei->attr = WVAL(req->rq_header, smb_vwv1);
1148 /* smb_vwv2 has mtime */
1149 /* smb_vwv4 has size */
1150 ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK);
1151 ei->open = server->generation;
1152
1153out_free:
1154 smb_rput(req);
1155out:
1156 return res;
1157}
1158
1159/*
 1160 * Make sure the file is open, and check that the granted access
 1161 * is compatible with the desired access mode.
1162 */
1163int
1164smb_open(struct dentry *dentry, int wish)
1165{
1166 struct inode *inode = dentry->d_inode;
1167 int result;
1168 __u16 access;
1169
1170 result = -ENOENT;
1171 if (!inode) {
1172 printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n",
1173 DENTRY_PATH(dentry));
1174 goto out;
1175 }
1176
1177 if (!smb_is_open(inode)) {
1178 struct smb_sb_info *server = server_from_inode(inode);
1179 result = 0;
1180 if (!smb_is_open(inode))
1181 result = smb_proc_open(server, dentry, wish);
1182 if (result)
1183 goto out;
1184 /*
1185 * A successful open means the path is still valid ...
1186 */
1187 smb_renew_times(dentry);
1188 }
1189
1190 /*
1191 * Check whether the access is compatible with the desired mode.
1192 */
1193 result = 0;
1194 access = SMB_I(inode)->access;
1195 if (access != wish && access != SMB_O_RDWR) {
1196 PARANOIA("%s/%s access denied, access=%x, wish=%x\n",
1197 DENTRY_PATH(dentry), access, wish);
1198 result = -EACCES;
1199 }
1200out:
1201 return result;
1202}
1203
1204static int
1205smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime)
1206{
1207 struct smb_request *req;
1208 int result = -ENOMEM;
1209
1210 if (! (req = smb_alloc_request(server, 0)))
1211 goto out;
1212
1213 smb_setup_header(req, SMBclose, 3, 0);
1214 WSET(req->rq_header, smb_vwv0, fileid);
1215 DSET(req->rq_header, smb_vwv1, utc2local(server, mtime));
1216 req->rq_flags |= SMB_REQ_NORETRY;
1217 result = smb_request_ok(req, SMBclose, 0, 0);
1218
1219 smb_rput(req);
1220out:
1221 return result;
1222}
1223
1224/*
1225 * Win NT 4.0 has an apparent bug in that it fails to update the
1226 * modify time when writing to a file. As a workaround, we update
1227 * both modify and access time locally, and post the times to the
1228 * server when closing the file.
1229 */
1230static int
1231smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino)
1232{
1233 struct smb_inode_info *ei = SMB_I(ino);
1234 int result = 0;
1235 if (smb_is_open(ino))
1236 {
1237 /*
1238 * We clear the open flag in advance, in case another
1239 * process observes the value while we block below.
1240 */
1241 ei->open = 0;
1242
1243 /*
1244 * Kludge alert: SMB timestamps are accurate only to
1245 * two seconds ... round the times to avoid needless
1246 * cache invalidations!
1247 */
1248 if (ino->i_mtime.tv_sec & 1) {
1249 ino->i_mtime.tv_sec--;
1250 ino->i_mtime.tv_nsec = 0;
1251 }
1252 if (ino->i_atime.tv_sec & 1) {
1253 ino->i_atime.tv_sec--;
1254 ino->i_atime.tv_nsec = 0;
1255 }
1256 /*
1257 * If the file is open with write permissions,
1258 * update the time stamps to sync mtime and atime.
1259 */
1260 if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 &&
1261 (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) &&
1262 !(ei->access == SMB_O_RDONLY))
1263 {
1264 struct smb_fattr fattr;
1265 smb_get_inode_attr(ino, &fattr);
1266 smb_proc_setattr_ext(server, ino, &fattr);
1267 }
1268
1269 result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec);
1270 /*
1271 * Force a revalidation after closing ... some servers
1272 * don't post the size until the file has been closed.
1273 */
1274 if (server->opt.protocol < SMB_PROTOCOL_NT1)
1275 ei->oldmtime = 0;
1276 ei->closed = jiffies;
1277 }
1278 return result;
1279}
1280
1281int
1282smb_close(struct inode *ino)
1283{
1284 int result = 0;
1285
1286 if (smb_is_open(ino)) {
1287 struct smb_sb_info *server = server_from_inode(ino);
1288 result = smb_proc_close_inode(server, ino);
1289 }
1290 return result;
1291}
1292
1293/*
1294 * This is used to close a file following a failed instantiate.
1295 * Since we don't have an inode, we can't use any of the above.
1296 */
1297int
1298smb_close_fileid(struct dentry *dentry, __u16 fileid)
1299{
1300 struct smb_sb_info *server = server_from_dentry(dentry);
1301 int result;
1302
1303 result = smb_proc_close(server, fileid, get_seconds());
1304 return result;
1305}
1306
1307/* In smb_proc_read and smb_proc_write we do not retry, because the
1308 file-id would not be valid after a reconnection. */
1309
1310static void
1311smb_proc_read_data(struct smb_request *req)
1312{
1313 req->rq_iov[0].iov_base = req->rq_buffer;
1314 req->rq_iov[0].iov_len = 3;
1315
1316 req->rq_iov[1].iov_base = req->rq_page;
1317 req->rq_iov[1].iov_len = req->rq_rsize;
1318 req->rq_iovlen = 2;
1319
1320 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
1321}
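
/*
 * Receive-side note: the SMBread reply carries three format bytes (a
 * data block token plus a 16-bit length) ahead of the payload, so the
 * callback above splits the receive into a 3-byte head landing in
 * rq_buffer and the payload landing directly in the caller's page.
 */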
1322
1323static int
1324smb_proc_read(struct inode *inode, loff_t offset, int count, char *data)
1325{
1326 struct smb_sb_info *server = server_from_inode(inode);
1327 __u16 returned_count, data_len;
1328 unsigned char *buf;
1329 int result;
1330 struct smb_request *req;
1331 u8 rbuf[4];
1332
1333 result = -ENOMEM;
1334 if (! (req = smb_alloc_request(server, 0)))
1335 goto out;
1336
1337 smb_setup_header(req, SMBread, 5, 0);
1338 buf = req->rq_header;
1339 WSET(buf, smb_vwv0, SMB_I(inode)->fileid);
1340 WSET(buf, smb_vwv1, count);
1341 DSET(buf, smb_vwv2, offset);
1342 WSET(buf, smb_vwv4, 0);
1343
1344 req->rq_page = data;
1345 req->rq_rsize = count;
1346 req->rq_callback = smb_proc_read_data;
1347 req->rq_buffer = rbuf;
1348 req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC;
1349
1350 result = smb_request_ok(req, SMBread, 5, -1);
1351 if (result < 0)
1352 goto out_free;
1353 returned_count = WVAL(req->rq_header, smb_vwv0);
1354
1355 data_len = WVAL(rbuf, 1);
1356
1357 if (returned_count != data_len) {
1358 printk(KERN_NOTICE "smb_proc_read: returned != data_len\n");
1359 printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n",
1360 returned_count, data_len);
1361 }
1362 result = data_len;
1363
1364out_free:
1365 smb_rput(req);
1366out:
1367 VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
1368 inode->i_ino, SMB_I(inode)->fileid, count, result);
1369 return result;
1370}
1371
1372static int
1373smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data)
1374{
1375 struct smb_sb_info *server = server_from_inode(inode);
1376 int result;
1377 u16 fileid = SMB_I(inode)->fileid;
1378 u8 buf[4];
1379 struct smb_request *req;
1380
1381 result = -ENOMEM;
1382 if (! (req = smb_alloc_request(server, 0)))
1383 goto out;
1384
1385 VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
1386 inode->i_ino, fileid, count, offset);
1387
1388 smb_setup_header(req, SMBwrite, 5, count + 3);
1389 WSET(req->rq_header, smb_vwv0, fileid);
1390 WSET(req->rq_header, smb_vwv1, count);
1391 DSET(req->rq_header, smb_vwv2, offset);
1392 WSET(req->rq_header, smb_vwv4, 0);
1393
1394 buf[0] = 1;
1395 WSET(buf, 1, count); /* yes, again ... */
1396 req->rq_iov[1].iov_base = buf;
1397 req->rq_iov[1].iov_len = 3;
1398 req->rq_iov[2].iov_base = (char *) data;
1399 req->rq_iov[2].iov_len = count;
1400 req->rq_iovlen = 3;
1401 req->rq_flags |= SMB_REQ_NORETRY;
1402
1403 result = smb_request_ok(req, SMBwrite, 1, 0);
1404 if (result >= 0)
1405 result = WVAL(req->rq_header, smb_vwv0);
1406
1407 smb_rput(req);
1408out:
1409 return result;
1410}
1411
1412/*
1413 * In smb_proc_readX and smb_proc_writeX we do not retry, because the
1414 * file-id would not be valid after a reconnection.
1415 */
1416
1417#define SMB_READX_MAX_PAD 64
1418static void
1419smb_proc_readX_data(struct smb_request *req)
1420{
1421 /* header length, excluding the netbios length (-4) */
1422 int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
1423 int data_off = WVAL(req->rq_header, smb_vwv6);
1424
1425 /*
1426 * Some genius made the padding to the data bytes arbitrary.
1427 * So we must first calculate the amount of padding used by the server.
1428 */
1429 data_off -= hdrlen;
1430 if (data_off > SMB_READX_MAX_PAD || data_off < 0) {
1431 PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n");
1432 PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off);
1433 req->rq_rlen = req->rq_bufsize + 1;
1434 return;
1435 }
1436 req->rq_iov[0].iov_base = req->rq_buffer;
1437 req->rq_iov[0].iov_len = data_off;
1438
1439 req->rq_iov[1].iov_base = req->rq_page;
1440 req->rq_iov[1].iov_len = req->rq_rsize;
1441 req->rq_iovlen = 2;
1442
1443 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
1444}
1445
1446static int
1447smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data)
1448{
1449 struct smb_sb_info *server = server_from_inode(inode);
1450 unsigned char *buf;
1451 int result;
1452 struct smb_request *req;
1453 static char pad[SMB_READX_MAX_PAD];
1454
1455 result = -ENOMEM;
1456 if (! (req = smb_alloc_request(server, 0)))
1457 goto out;
1458
1459 smb_setup_header(req, SMBreadX, 12, 0);
1460 buf = req->rq_header;
1461 WSET(buf, smb_vwv0, 0x00ff);
1462 WSET(buf, smb_vwv1, 0);
1463 WSET(buf, smb_vwv2, SMB_I(inode)->fileid);
1464 DSET(buf, smb_vwv3, (u32)offset); /* low 32 bits */
1465 WSET(buf, smb_vwv5, count);
1466 WSET(buf, smb_vwv6, 0);
1467 DSET(buf, smb_vwv7, 0);
1468 WSET(buf, smb_vwv9, 0);
1469 DSET(buf, smb_vwv10, (u32)(offset >> 32)); /* high 32 bits */
1470 WSET(buf, smb_vwv11, 0);
1471
1472 req->rq_page = data;
1473 req->rq_rsize = count;
1474 req->rq_callback = smb_proc_readX_data;
1475 req->rq_buffer = pad;
1476 req->rq_bufsize = SMB_READX_MAX_PAD;
1477 req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY;
1478
1479 result = smb_request_ok(req, SMBreadX, 12, -1);
1480 if (result < 0)
1481 goto out_free;
1482 result = WVAL(req->rq_header, smb_vwv5);
1483
1484out_free:
1485 smb_rput(req);
1486out:
1487 VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
1488 inode->i_ino, SMB_I(inode)->fileid, count, result);
1489 return result;
1490}
1491
1492static int
1493smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data)
1494{
1495 struct smb_sb_info *server = server_from_inode(inode);
1496 int result;
1497 u8 *p;
1498 static u8 pad[4];
1499 struct smb_request *req;
1500
1501 result = -ENOMEM;
1502 if (! (req = smb_alloc_request(server, 0)))
1503 goto out;
1504
1505 VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
1506 inode->i_ino, SMB_I(inode)->fileid, count, offset);
1507
1508 p = smb_setup_header(req, SMBwriteX, 14, count + 1);
1509 WSET(req->rq_header, smb_vwv0, 0x00ff);
1510 WSET(req->rq_header, smb_vwv1, 0);
1511 WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid);
1512 DSET(req->rq_header, smb_vwv3, (u32)offset); /* low 32 bits */
1513 DSET(req->rq_header, smb_vwv5, 0);
1514 WSET(req->rq_header, smb_vwv7, 0); /* write mode */
1515 WSET(req->rq_header, smb_vwv8, 0);
1516 WSET(req->rq_header, smb_vwv9, 0);
1517 WSET(req->rq_header, smb_vwv10, count); /* data length */
1518 WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1);
1519 DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32));
1520
1521 req->rq_iov[1].iov_base = pad;
1522 req->rq_iov[1].iov_len = 1;
1523 req->rq_iov[2].iov_base = (char *) data;
1524 req->rq_iov[2].iov_len = count;
1525 req->rq_iovlen = 3;
1526 req->rq_flags |= SMB_REQ_NORETRY;
1527
1528 result = smb_request_ok(req, SMBwriteX, 6, 0);
1529 if (result >= 0)
1530 result = WVAL(req->rq_header, smb_vwv2);
1531
1532 smb_rput(req);
1533out:
1534 return result;
1535}
1536
1537int
1538smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid)
1539{
1540 struct smb_sb_info *server = server_from_dentry(dentry);
1541 char *p;
1542 int result;
1543 struct smb_request *req;
1544
1545 result = -ENOMEM;
1546 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1547 goto out;
1548
1549 p = smb_setup_header(req, SMBcreate, 3, 0);
1550 WSET(req->rq_header, smb_vwv0, attr);
1551 DSET(req->rq_header, smb_vwv1, utc2local(server, ctime));
1552 result = smb_simple_encode_path(req, &p, dentry, NULL);
1553 if (result < 0)
1554 goto out_free;
1555 smb_setup_bcc(req, p);
1556
1557 result = smb_request_ok(req, SMBcreate, 1, 0);
1558 if (result < 0)
1559 goto out_free;
1560
1561 *fileid = WVAL(req->rq_header, smb_vwv0);
1562 result = 0;
1563
1564out_free:
1565 smb_rput(req);
1566out:
1567 return result;
1568}
1569
1570int
1571smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry)
1572{
1573 struct smb_sb_info *server = server_from_dentry(old_dentry);
1574 char *p;
1575 int result;
1576 struct smb_request *req;
1577
1578 result = -ENOMEM;
1579 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1580 goto out;
1581
1582 p = smb_setup_header(req, SMBmv, 1, 0);
1583 WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR);
1584 result = smb_simple_encode_path(req, &p, old_dentry, NULL);
1585 if (result < 0)
1586 goto out_free;
1587 result = smb_simple_encode_path(req, &p, new_dentry, NULL);
1588 if (result < 0)
1589 goto out_free;
1590 smb_setup_bcc(req, p);
1591
1592 if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0)
1593 goto out_free;
1594 result = 0;
1595
1596out_free:
1597 smb_rput(req);
1598out:
1599 return result;
1600}
1601
1602/*
1603 * Code common to mkdir and rmdir.
1604 */
1605static int
1606smb_proc_generic_command(struct dentry *dentry, __u8 command)
1607{
1608 struct smb_sb_info *server = server_from_dentry(dentry);
1609 char *p;
1610 int result;
1611 struct smb_request *req;
1612
1613 result = -ENOMEM;
1614 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1615 goto out;
1616
1617 p = smb_setup_header(req, command, 0, 0);
1618 result = smb_simple_encode_path(req, &p, dentry, NULL);
1619 if (result < 0)
1620 goto out_free;
1621 smb_setup_bcc(req, p);
1622
1623 result = smb_request_ok(req, command, 0, 0);
1624 if (result < 0)
1625 goto out_free;
1626 result = 0;
1627
1628out_free:
1629 smb_rput(req);
1630out:
1631 return result;
1632}
1633
1634int
1635smb_proc_mkdir(struct dentry *dentry)
1636{
1637 return smb_proc_generic_command(dentry, SMBmkdir);
1638}
1639
1640int
1641smb_proc_rmdir(struct dentry *dentry)
1642{
1643 return smb_proc_generic_command(dentry, SMBrmdir);
1644}
1645
1646#if SMBFS_POSIX_UNLINK
1647/*
 1648 * Removes the read-only attribute from a file. Used by unlink() to
 1649 * give POSIX semantics.
1650 */
1651static int
1652smb_set_rw(struct dentry *dentry,struct smb_sb_info *server)
1653{
1654 int result;
1655 struct smb_fattr fattr;
1656
1657 /* FIXME: cifsUE should allow removing a readonly file. */
1658
1659 /* first get current attribute */
1660 smb_init_dirent(server, &fattr);
1661 result = server->ops->getattr(server, dentry, &fattr);
1662 smb_finish_dirent(server, &fattr);
1663 if (result < 0)
1664 return result;
1665
1666 /* if RONLY attribute is set, remove it */
1667 if (fattr.attr & aRONLY) { /* read only attribute is set */
1668 fattr.attr &= ~aRONLY;
1669 result = smb_proc_setattr_core(server, dentry, fattr.attr);
1670 }
1671 return result;
1672}
1673#endif
1674
1675int
1676smb_proc_unlink(struct dentry *dentry)
1677{
1678 struct smb_sb_info *server = server_from_dentry(dentry);
1679 int flag = 0;
1680 char *p;
1681 int result;
1682 struct smb_request *req;
1683
1684 result = -ENOMEM;
1685 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
1686 goto out;
1687
1688 retry:
1689 p = smb_setup_header(req, SMBunlink, 1, 0);
1690 WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN);
1691 result = smb_simple_encode_path(req, &p, dentry, NULL);
1692 if (result < 0)
1693 goto out_free;
1694 smb_setup_bcc(req, p);
1695
1696 if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) {
1697#if SMBFS_POSIX_UNLINK
1698 if (result == -EACCES && !flag) {
 1699 /* POSIX semantics require the read-only state
 1700 of a file to be ignored in unlink(). In the
 1701 SMB world an unlink() is refused on a
 1702 read-only file. To make things easier for
 1703 unix users we try to override the file's
 1704 permissions if the unlink fails with the
 1705 right error.
 1706 This introduces a race condition that could
 1707 lead to a file being written by someone who
 1708 shouldn't have access, but as far as I can
 1709 tell that is unavoidable */
1710
1711 /* remove RONLY attribute and try again */
1712 result = smb_set_rw(dentry,server);
1713 if (result == 0) {
1714 flag = 1;
1715 req->rq_flags = 0;
1716 goto retry;
1717 }
1718 }
1719#endif
1720 goto out_free;
1721 }
1722 result = 0;
1723
1724out_free:
1725 smb_rput(req);
1726out:
1727 return result;
1728}
1729
1730int
1731smb_proc_flush(struct smb_sb_info *server, __u16 fileid)
1732{
1733 int result;
1734 struct smb_request *req;
1735
1736 result = -ENOMEM;
1737 if (! (req = smb_alloc_request(server, 0)))
1738 goto out;
1739
1740 smb_setup_header(req, SMBflush, 1, 0);
1741 WSET(req->rq_header, smb_vwv0, fileid);
1742 req->rq_flags |= SMB_REQ_NORETRY;
1743 result = smb_request_ok(req, SMBflush, 0, 0);
1744
1745 smb_rput(req);
1746out:
1747 return result;
1748}
1749
1750static int
1751smb_proc_trunc32(struct inode *inode, loff_t length)
1752{
1753 /*
 1754 * Writing 0 bytes is old-SMB magic for truncating files.
 1755 * MAX_NON_LFS should prevent this from being called with too
 1756 * large an offset.
1757 */
1758 return smb_proc_write(inode, length, 0, NULL);
1759}
1760
1761static int
1762smb_proc_trunc64(struct inode *inode, loff_t length)
1763{
1764 struct smb_sb_info *server = server_from_inode(inode);
1765 int result;
1766 char *param;
1767 char *data;
1768 struct smb_request *req;
1769
1770 result = -ENOMEM;
1771 if (! (req = smb_alloc_request(server, 14)))
1772 goto out;
1773
1774 param = req->rq_buffer;
1775 data = req->rq_buffer + 6;
1776
1777 /* FIXME: must we also set allocation size? winNT seems to do that */
1778 WSET(param, 0, SMB_I(inode)->fileid);
1779 WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO);
1780 WSET(param, 4, 0);
1781 LSET(data, 0, length);
1782
1783 req->rq_trans2_command = TRANSACT2_SETFILEINFO;
1784 req->rq_ldata = 8;
1785 req->rq_data = data;
1786 req->rq_lparm = 6;
1787 req->rq_parm = param;
1788 req->rq_flags |= SMB_REQ_NORETRY;
1789 result = smb_add_request(req);
1790 if (result < 0)
1791 goto out_free;
1792
1793 result = 0;
1794 if (req->rq_rcls != 0)
1795 result = smb_errno(req);
1796
1797out_free:
1798 smb_rput(req);
1799out:
1800 return result;
1801}
1802
1803static int
1804smb_proc_trunc95(struct inode *inode, loff_t length)
1805{
1806 struct smb_sb_info *server = server_from_inode(inode);
1807 int result = smb_proc_trunc32(inode, length);
1808
1809 /*
1810 * win9x doesn't appear to update the size immediately.
1811 * It will return the old file size after the truncate,
1812 * confusing smbfs. So we force an update.
1813 *
1814 * FIXME: is this still necessary?
1815 */
1816 smb_proc_flush(server, SMB_I(inode)->fileid);
1817 return result;
1818}
1819
1820static void
1821smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
1822{
1823 memset(fattr, 0, sizeof(*fattr));
1824
1825 fattr->f_nlink = 1;
1826 fattr->f_uid = server->mnt->uid;
1827 fattr->f_gid = server->mnt->gid;
1828 fattr->f_unix = 0;
1829}
1830
1831static void
1832smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
1833{
1834 if (fattr->f_unix)
1835 return;
1836
1837 fattr->f_mode = server->mnt->file_mode;
1838 if (fattr->attr & aDIR) {
1839 fattr->f_mode = server->mnt->dir_mode;
1840 fattr->f_size = SMB_ST_BLKSIZE;
1841 }
1842 /* Check the read-only flag */
1843 if (fattr->attr & aRONLY)
1844 fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
1845
1846 /* How many 512 byte blocks do we need for this file? */
1847 fattr->f_blocks = 0;
1848 if (fattr->f_size != 0)
1849 fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9);
1850 return;
1851}
1852
1853void
1854smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
1855 struct super_block *sb)
1856{
1857 smb_init_dirent(server, fattr);
1858 fattr->attr = aDIR;
1859 fattr->f_ino = 2; /* traditional root inode number */
1860 fattr->f_mtime = current_fs_time(sb);
1861 smb_finish_dirent(server, fattr);
1862}
1863
1864/*
1865 * Decode a dirent for old protocols
1866 *
1867 * qname is filled with the decoded, and possibly translated, name.
1868 * fattr receives decoded attributes
1869 *
1870 * Bugs Noted:
1871 * (1) Pathworks servers may pad the name with extra spaces.
1872 */
1873static char *
1874smb_decode_short_dirent(struct smb_sb_info *server, char *p,
1875 struct qstr *qname, struct smb_fattr *fattr,
1876 unsigned char *name_buf)
1877{
1878 int len;
1879
1880 /*
1881 * SMB doesn't have a concept of inode numbers ...
1882 */
1883 smb_init_dirent(server, fattr);
1884 fattr->f_ino = 0; /* FIXME: do we need this? */
1885
1886 p += SMB_STATUS_SIZE; /* reserved (search_status) */
1887 fattr->attr = *p;
1888 fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1));
1889 fattr->f_mtime.tv_nsec = 0;
1890 fattr->f_size = DVAL(p, 5);
1891 fattr->f_ctime = fattr->f_mtime;
1892 fattr->f_atime = fattr->f_mtime;
1893 qname->name = p + 9;
1894 len = strnlen(qname->name, 12);
1895
1896 /*
1897 * Trim trailing blanks for Pathworks servers
1898 */
1899 while (len > 2 && qname->name[len-1] == ' ')
1900 len--;
1901
1902 smb_finish_dirent(server, fattr);
1903
1904#if 0
1905 /* FIXME: These only work for ascii chars, and recent smbmount doesn't
1906 allow the flag to be set anyway. It kills const. Remove? */
1907 switch (server->opt.case_handling) {
1908 case SMB_CASE_UPPER:
1909 str_upper(entry->name, len);
1910 break;
1911 case SMB_CASE_LOWER:
1912 str_lower(entry->name, len);
1913 break;
1914 default:
1915 break;
1916 }
1917#endif
1918
1919 qname->len = 0;
1920 len = server->ops->convert(name_buf, SMB_MAXNAMELEN,
1921 qname->name, len,
1922 server->remote_nls, server->local_nls);
1923 if (len > 0) {
1924 qname->len = len;
1925 qname->name = name_buf;
1926 DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name);
1927 }
1928
1929 return p + 22;
1930}
1931
1932/*
1933 * This routine is used to read in directory entries from the network.
1934 * Note that it is for short directory name seeks, i.e.: protocol <
1935 * SMB_PROTOCOL_LANMAN2
1936 */
1937static int
1938smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir,
1939 struct smb_cache_control *ctl)
1940{
1941 struct dentry *dir = filp->f_path.dentry;
1942 struct smb_sb_info *server = server_from_dentry(dir);
1943 struct qstr qname;
1944 struct smb_fattr fattr;
1945 char *p;
1946 int result;
1947 int i, first, entries_seen, entries;
1948 int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE;
1949 __u16 bcc;
1950 __u16 count;
1951 char status[SMB_STATUS_SIZE];
1952 static struct qstr mask = {
1953 .name = "*.*",
1954 .len = 3,
1955 };
1956 unsigned char *last_status;
1957 struct smb_request *req;
1958 unsigned char *name_buf;
1959
1960 VERBOSE("%s/%s\n", DENTRY_PATH(dir));
1961
1962 lock_kernel();
1963
1964 result = -ENOMEM;
1965 if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL)))
1966 goto out;
1967
1968 first = 1;
1969 entries = 0;
1970 entries_seen = 2; /* implicit . and .. */
1971
1972 result = -ENOMEM;
1973 if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
1974 goto out_name;
1975
1976 while (1) {
1977 p = smb_setup_header(req, SMBsearch, 2, 0);
1978 WSET(req->rq_header, smb_vwv0, entries_asked);
1979 WSET(req->rq_header, smb_vwv1, aDIR);
1980 if (first == 1) {
1981 result = smb_simple_encode_path(req, &p, dir, &mask);
1982 if (result < 0)
1983 goto out_free;
1984 if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) {
1985 result = -ENAMETOOLONG;
1986 goto out_free;
1987 }
1988 *p++ = 5;
1989 WSET(p, 0, 0);
1990 p += 2;
1991 first = 0;
1992 } else {
1993 if (p + 5 + SMB_STATUS_SIZE >
1994 (char *)req->rq_buffer + req->rq_bufsize) {
1995 result = -ENAMETOOLONG;
1996 goto out_free;
1997 }
1998
1999 *p++ = 4;
2000 *p++ = 0;
2001 *p++ = 5;
2002 WSET(p, 0, SMB_STATUS_SIZE);
2003 p += 2;
2004 memcpy(p, status, SMB_STATUS_SIZE);
2005 p += SMB_STATUS_SIZE;
2006 }
2007
2008 smb_setup_bcc(req, p);
2009
2010 result = smb_request_ok(req, SMBsearch, 1, -1);
2011 if (result < 0) {
2012 if ((req->rq_rcls == ERRDOS) &&
2013 (req->rq_err == ERRnofiles))
2014 break;
2015 goto out_free;
2016 }
2017 count = WVAL(req->rq_header, smb_vwv0);
2018 if (count <= 0)
2019 break;
2020
2021 result = -EIO;
2022 bcc = smb_bcc(req->rq_header);
2023 if (bcc != count * SMB_DIRINFO_SIZE + 3)
2024 goto out_free;
2025 p = req->rq_buffer + 3;
2026
2027
2028 /* Make sure the response fits in the buffer. Fixed sized
2029 entries means we don't have to check in the decode loop. */
2030
2031 last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE;
2032
2033 if (last_status + SMB_DIRINFO_SIZE >=
2034 req->rq_buffer + req->rq_bufsize) {
2035 printk(KERN_ERR "smb_proc_readdir_short: "
2036 "last dir entry outside buffer! "
2037 "%d@%p %d@%p\n", SMB_DIRINFO_SIZE, last_status,
2038 req->rq_bufsize, req->rq_buffer);
2039 goto out_free;
2040 }
2041
2042 /* Read the last entry into the status field. */
2043 memcpy(status, last_status, SMB_STATUS_SIZE);
2044
2045
2046 /* Now we are ready to parse smb directory entries. */
2047
2048 for (i = 0; i < count; i++) {
2049 p = smb_decode_short_dirent(server, p,
2050 &qname, &fattr, name_buf);
2051 if (qname.len == 0)
2052 continue;
2053
2054 if (entries_seen == 2 && qname.name[0] == '.') {
2055 if (qname.len == 1)
2056 continue;
2057 if (qname.name[1] == '.' && qname.len == 2)
2058 continue;
2059 }
2060 if (!smb_fill_cache(filp, dirent, filldir, ctl,
2061 &qname, &fattr))
2062 ; /* stop reading? */
2063 entries_seen++;
2064 }
2065 }
2066 result = entries;
2067
2068out_free:
2069 smb_rput(req);
2070out_name:
2071 kfree(name_buf);
2072out:
2073 unlock_kernel();
2074 return result;
2075}
2076
2077static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
2078{
2079 u64 size, disk_bytes;
2080
2081 /* FIXME: verify nls support. all is sent as utf8? */
2082
2083 fattr->f_unix = 1;
2084 fattr->f_mode = 0;
2085
2086 /* FIXME: use the uniqueID from the remote instead? */
2087 /* 0 L file size in bytes */
2088 /* 8 L file size on disk in bytes (block count) */
2089 /* 40 L uid */
2090 /* 48 L gid */
2091 /* 56 W file type */
2092 /* 60 L devmajor */
2093 /* 68 L devminor */
2094 /* 76 L unique ID (inode) */
2095 /* 84 L permissions */
2096 /* 92 L link count */
2097
2098 size = LVAL(p, 0);
2099 disk_bytes = LVAL(p, 8);
2100
2101 /*
2102 * Some samba versions round up on-disk byte usage
2103 * to 1MB boundaries, making it useless. When seeing
2104 * that, use the size instead.
2105 */
2106 if (!(disk_bytes & 0xfffff))
2107 disk_bytes = size+511;
2108
2109 fattr->f_size = size;
2110 fattr->f_blocks = disk_bytes >> 9;
2111 fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
2112 fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
2113 fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
2114
2115 if (server->mnt->flags & SMB_MOUNT_UID)
2116 fattr->f_uid = server->mnt->uid;
2117 else
2118 fattr->f_uid = LVAL(p, 40);
2119
2120 if (server->mnt->flags & SMB_MOUNT_GID)
2121 fattr->f_gid = server->mnt->gid;
2122 else
2123 fattr->f_gid = LVAL(p, 48);
2124
2125 fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
2126
2127 if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
2128 __u64 major = LVAL(p, 60);
2129 __u64 minor = LVAL(p, 68);
2130
2131 fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
2132 if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
2133 MINOR(fattr->f_rdev) != (minor & 0xffffffff))
2134 fattr->f_rdev = 0;
2135 }
2136
2137 fattr->f_mode |= LVAL(p, 84);
2138
2139 if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
2140 (S_ISDIR(fattr->f_mode)) )
2141 fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR;
2142 else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
2143 !(S_ISDIR(fattr->f_mode)) )
2144 fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) |
2145 (fattr->f_mode & S_IFMT);
2146
2147}
2148
2149/*
2150 * Interpret a long filename structure using the specified info level:
2151 * level 1 for anything below NT1 protocol
2152 * level 260 for NT1 protocol
2153 *
2154 * qname is filled with the decoded, and possibly translated, name
2155 * fattr receives decoded attributes.
2156 *
2157 * Bugs Noted:
2158 * (1) Win NT 4.0 appends a null byte to names and counts it in the length!
2159 */
2160static char *
2161smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
2162 struct qstr *qname, struct smb_fattr *fattr,
2163 unsigned char *name_buf)
2164{
2165 char *result;
2166 unsigned int len = 0;
2167 int n;
2168 __u16 date, time;
2169 int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
2170
2171 /*
2172 * SMB doesn't have a concept of inode numbers ...
2173 */
2174 smb_init_dirent(server, fattr);
2175 fattr->f_ino = 0; /* FIXME: do we need this? */
2176
2177 switch (level) {
2178 case 1:
2179 len = *((unsigned char *) p + 22);
2180 qname->name = p + 23;
2181 result = p + 24 + len;
2182
2183 date = WVAL(p, 0);
2184 time = WVAL(p, 2);
2185 fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2186 fattr->f_ctime.tv_nsec = 0;
2187
2188 date = WVAL(p, 4);
2189 time = WVAL(p, 6);
2190 fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
2191 fattr->f_atime.tv_nsec = 0;
2192
2193 date = WVAL(p, 8);
2194 time = WVAL(p, 10);
2195 fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2196 fattr->f_mtime.tv_nsec = 0;
2197 fattr->f_size = DVAL(p, 12);
2198 /* ULONG allocation size */
2199 fattr->attr = WVAL(p, 20);
2200
2201 VERBOSE("info 1 at %p, len=%d, name=%.*s\n",
2202 p, len, len, qname->name);
2203 break;
2204 case 260:
2205 result = p + WVAL(p, 0);
2206 len = DVAL(p, 60);
2207 if (len > 255) len = 255;
2208 /* NT4 null terminates, unless we are using unicode ... */
2209 qname->name = p + 94;
2210 if (!unicode && len && qname->name[len-1] == '\0')
2211 len--;
2212
2213 fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8));
2214 fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16));
2215 fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24));
2216 /* change time (32) */
2217 fattr->f_size = LVAL(p, 40);
2218 /* alloc size (48) */
2219 fattr->attr = DVAL(p, 56);
2220
2221 VERBOSE("info 260 at %p, len=%d, name=%.*s\n",
2222 p, len, len, qname->name);
2223 break;
2224 case SMB_FIND_FILE_UNIX:
2225 result = p + WVAL(p, 0);
2226 qname->name = p + 108;
2227
2228 len = strlen(qname->name);
2229 /* FIXME: should we check the length?? */
2230
2231 p += 8;
2232 smb_decode_unix_basic(fattr, server, p);
2233 VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
2234 p, len, len, qname->name);
2235 break;
2236 default:
2237 PARANOIA("Unknown info level %d\n", level);
2238 result = p + WVAL(p, 0);
2239 goto out;
2240 }
2241
2242 smb_finish_dirent(server, fattr);
2243
2244#if 0
2245 /* FIXME: These only work for ascii chars, and recent smbmount doesn't
2246 allow the flag to be set anyway. Remove? */
2247 switch (server->opt.case_handling) {
2248 case SMB_CASE_UPPER:
2249 str_upper(qname->name, len);
2250 break;
2251 case SMB_CASE_LOWER:
2252 str_lower(qname->name, len);
2253 break;
2254 default:
2255 break;
2256 }
2257#endif
2258
2259 qname->len = 0;
2260 n = server->ops->convert(name_buf, SMB_MAXNAMELEN,
2261 qname->name, len,
2262 server->remote_nls, server->local_nls);
2263 if (n > 0) {
2264 qname->len = n;
2265 qname->name = name_buf;
2266 }
2267
2268out:
2269 return result;
2270}
2271
2272/* findfirst/findnext flags */
2273#define SMB_CLOSE_AFTER_FIRST (1<<0)
2274#define SMB_CLOSE_IF_END (1<<1)
2275#define SMB_REQUIRE_RESUME_KEY (1<<2)
2276#define SMB_CONTINUE_BIT (1<<3)
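
/*
 * Layout sketch of the trans2 parameter blocks built below (byte
 * offsets; W = 16-bit word, D = 32-bit dword):
 *
 *	FINDFIRST			FINDNEXT
 *	 0 W search attributes		 0 W search handle
 *	 2 W max matches		 2 W max matches
 *	 4 W flags (see above)		 4 W info level
 *	 6 W info level			 6 D resume key
 *	 8 D storage type		10 W flags (see above)
 *	12   search mask, NUL-ended	12   resume file name
 */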
2277
2278/*
2279 * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in
2280 * source/libsmb/clilist.c. When looking for smb bugs in the readdir code,
 2281 * go there for advice.
2282 *
2283 * Bugs Noted:
2284 * (1) When using Info Level 1 Win NT 4.0 truncates directory listings
2285 * for certain patterns of names and/or lengths. The breakage pattern
2286 * is completely reproducible and can be toggled by the creation of a
2287 * single file. (E.g. echo hi >foo breaks, rm -f foo works.)
2288 */
2289static int
2290smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
2291 struct smb_cache_control *ctl)
2292{
2293 struct dentry *dir = filp->f_path.dentry;
2294 struct smb_sb_info *server = server_from_dentry(dir);
2295 struct qstr qname;
2296 struct smb_fattr fattr;
2297
2298 unsigned char *p, *lastname;
2299 char *mask, *param;
2300 __u16 command;
2301 int first, entries_seen;
2302
2303 /* Both NT and OS/2 accept info level 1 (but see note below). */
2304 int info_level = 260;
2305 const int max_matches = 512;
2306
2307 unsigned int ff_searchcount = 0;
2308 unsigned int ff_eos = 0;
2309 unsigned int ff_lastname = 0;
2310 unsigned int ff_dir_handle = 0;
2311 unsigned int loop_count = 0;
2312 unsigned int mask_len, i;
2313 int result;
2314 struct smb_request *req;
2315 unsigned char *name_buf;
2316 static struct qstr star = {
2317 .name = "*",
2318 .len = 1,
2319 };
2320
2321 lock_kernel();
2322
2323 /*
2324 * We always prefer unix style. Use info level 1 for older
2325 * servers that don't do 260.
2326 */
2327 if (server->opt.capabilities & SMB_CAP_UNIX)
2328 info_level = SMB_FIND_FILE_UNIX;
2329 else if (server->opt.protocol < SMB_PROTOCOL_NT1)
2330 info_level = 1;
2331
2332 result = -ENOMEM;
2333 if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL)))
2334 goto out;
2335 if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
2336 goto out_name;
2337 param = req->rq_buffer;
2338
2339 /*
2340 * Encode the initial path
2341 */
2342 mask = param + 12;
2343
2344 result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star);
2345 if (result <= 0)
2346 goto out_free;
2347 mask_len = result - 1; /* mask_len is strlen, not #bytes */
2348 result = 0;
2349 first = 1;
2350 VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask);
2351
2352 entries_seen = 2;
2353 ff_eos = 0;
2354
2355 while (ff_eos == 0) {
2356 loop_count += 1;
2357 if (loop_count > 10) {
2358 printk(KERN_WARNING "smb_proc_readdir_long: "
2359 "Looping in FIND_NEXT??\n");
2360 result = -EIO;
2361 break;
2362 }
2363
2364 if (first != 0) {
2365 command = TRANSACT2_FINDFIRST;
2366 WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
2367 WSET(param, 2, max_matches); /* max count */
2368 WSET(param, 4, SMB_CLOSE_IF_END);
2369 WSET(param, 6, info_level);
2370 DSET(param, 8, 0);
2371 } else {
2372 command = TRANSACT2_FINDNEXT;
2373
2374 VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n",
2375 ff_dir_handle, ff_lastname, mask_len, mask);
2376
2377 WSET(param, 0, ff_dir_handle); /* search handle */
2378 WSET(param, 2, max_matches); /* max count */
2379 WSET(param, 4, info_level);
2380 DSET(param, 6, 0);
2381 WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END);
2382 }
2383
2384 req->rq_trans2_command = command;
2385 req->rq_ldata = 0;
2386 req->rq_data = NULL;
2387 req->rq_lparm = 12 + mask_len + 1;
2388 req->rq_parm = param;
2389 req->rq_flags = 0;
2390 result = smb_add_request(req);
2391 if (result < 0) {
2392 PARANOIA("error=%d, breaking\n", result);
2393 break;
2394 }
2395
2396 if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
2397 /* a damn Win95 bug - sometimes it clags if you
2398 ask it too fast */
2399 schedule_timeout_interruptible(msecs_to_jiffies(200));
2400 continue;
2401 }
2402
2403 if (req->rq_rcls != 0) {
2404 result = smb_errno(req);
2405 PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n",
2406 mask, result, req->rq_rcls, req->rq_err);
2407 break;
2408 }
2409
2410 /* parse out some important return info */
2411 if (first != 0) {
2412 ff_dir_handle = WVAL(req->rq_parm, 0);
2413 ff_searchcount = WVAL(req->rq_parm, 2);
2414 ff_eos = WVAL(req->rq_parm, 4);
2415 ff_lastname = WVAL(req->rq_parm, 8);
2416 } else {
2417 ff_searchcount = WVAL(req->rq_parm, 0);
2418 ff_eos = WVAL(req->rq_parm, 2);
2419 ff_lastname = WVAL(req->rq_parm, 6);
2420 }
2421
2422 if (ff_searchcount == 0)
2423 break;
2424
2425 /* Now we are ready to parse smb directory entries. */
2426
2427 /* point to the data bytes */
2428 p = req->rq_data;
2429 for (i = 0; i < ff_searchcount; i++) {
2430 /* make sure we stay within the buffer */
2431 if (p >= req->rq_data + req->rq_ldata) {
2432 printk(KERN_ERR "smb_proc_readdir_long: "
2433 "dirent pointer outside buffer! "
2434 "%p %d@%p\n",
2435 p, req->rq_ldata, req->rq_data);
2436 result = -EIO; /* always a comm. error? */
2437 goto out_free;
2438 }
2439
2440 p = smb_decode_long_dirent(server, p, info_level,
2441 &qname, &fattr, name_buf);
2442
2443 /* ignore . and .. from the server */
2444 if (entries_seen == 2 && qname.name[0] == '.') {
2445 if (qname.len == 1)
2446 continue;
2447 if (qname.name[1] == '.' && qname.len == 2)
2448 continue;
2449 }
2450
2451 if (!smb_fill_cache(filp, dirent, filldir, ctl,
2452 &qname, &fattr))
2453 ; /* stop reading? */
2454 entries_seen++;
2455 }
2456
2457 VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos);
2458
2459 /*
2460 * We might need the lastname for continuations.
2461 *
2462 * Note that some servers (win95?) point to the filename and
2463 * others (NT4, Samba using NT1) to the dir entry. We assume
2464 * here that those who do not point to a filename do not need
2465 * this info to continue the listing.
2466 *
2467 * OS/2 needs this and talks infolevel 1.
2468 * NetApps want lastname with infolevel 260.
 2469 * win2k wants lastname with infolevel 260, and points to
2470 * the record not to the name.
2471 * Samba+CifsUnixExt doesn't need lastname.
2472 *
2473 * Both are happy if we return the data they point to. So we do.
2474 * (FIXME: above is not true with win2k)
2475 */
2476 mask_len = 0;
2477 if (info_level != SMB_FIND_FILE_UNIX &&
2478 ff_lastname > 0 && ff_lastname < req->rq_ldata) {
2479 lastname = req->rq_data + ff_lastname;
2480
2481 switch (info_level) {
2482 case 260:
2483 mask_len = req->rq_ldata - ff_lastname;
2484 break;
2485 case 1:
2486 /* lastname points to a length byte */
2487 mask_len = *lastname++;
2488 if (ff_lastname + 1 + mask_len > req->rq_ldata)
2489 mask_len = req->rq_ldata - ff_lastname - 1;
2490 break;
2491 }
2492
2493 /*
2494 * Update the mask string for the next message.
2495 */
2496 if (mask_len > 255)
2497 mask_len = 255;
2498 if (mask_len)
2499 strncpy(mask, lastname, mask_len);
2500 }
2501 mask_len = strnlen(mask, mask_len);
2502 VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n",
2503 mask_len, ff_lastname, req->rq_ldata, mask_len, mask);
2504
2505 first = 0;
2506 loop_count = 0;
2507 }
2508
2509out_free:
2510 smb_rput(req);
2511out_name:
2512 kfree(name_buf);
2513out:
2514 unlock_kernel();
2515 return result;
2516}
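
The loop above drives the TRANS2 FINDFIRST/FINDNEXT continuation: the first
reply hands back a directory handle plus a "lastname" offset, and each later
FINDNEXT resumes from the mask copied out of the previous reply. Below is a
minimal user-space sketch of that resume step; the struct and function names
are illustrative, not the smbfs API.

#include <stdio.h>
#include <string.h>

/*
 * Hypothetical sketch of the FINDFIRST/FINDNEXT resume-mask logic:
 * after each response, copy the server-supplied "lastname" out of the
 * data area into the mask buffer so the next FINDNEXT continues where
 * the previous reply stopped (info level 260 case, where lastname
 * points at the trailing filename).
 */
struct find_reply {
	const char *data;   /* response data area */
	int ldata;          /* length of the data area */
	int lastname;       /* offset of the continuation name, or 0 */
};

static int update_mask(char *mask, size_t masksize, const struct find_reply *r)
{
	int len;

	if (r->lastname <= 0 || r->lastname >= r->ldata)
		return 0;                     /* nothing to resume from */

	len = r->ldata - r->lastname;         /* name runs to end of data */
	if (len > (int)masksize - 1)
		len = (int)masksize - 1;
	memcpy(mask, r->data + r->lastname, len);
	mask[len] = '\0';
	return len;
}

int main(void)
{
	char mask[256] = "*";
	struct find_reply r = { "ignored....zfile.txt", 20, 11 };

	update_mask(mask, sizeof(mask), &r);
	printf("next FINDNEXT mask: %s\n", mask);  /* "zfile.txt" */
	return 0;
}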
2517
2518/*
2519 * This version uses the trans2 TRANSACT2_FINDFIRST message
2520 * to get the attribute data.
2521 *
2522 * Bugs Noted:
2523 */
2524static int
2525smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
2526 struct smb_fattr *fattr)
2527{
2528 char *param, *mask;
2529 __u16 date, time;
2530 int mask_len, result;
2531 struct smb_request *req;
2532
2533 result = -ENOMEM;
2534 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2535 goto out;
2536 param = req->rq_buffer;
2537 mask = param + 12;
2538
2539 mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry, NULL);
2540 if (mask_len < 0) {
2541 result = mask_len;
2542 goto out_free;
2543 }
2544 VERBOSE("name=%s, len=%d\n", mask, mask_len);
2545 WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
2546 WSET(param, 2, 1); /* max count */
2547 WSET(param, 4, 1); /* close after this call */
2548 WSET(param, 6, 1); /* info_level */
2549 DSET(param, 8, 0);
2550
2551 req->rq_trans2_command = TRANSACT2_FINDFIRST;
2552 req->rq_ldata = 0;
2553 req->rq_data = NULL;
2554 req->rq_lparm = 12 + mask_len;
2555 req->rq_parm = param;
2556 req->rq_flags = 0;
2557 result = smb_add_request(req);
2558 if (result < 0)
2559 goto out_free;
2560 if (req->rq_rcls != 0) {
2561 result = smb_errno(req);
2562#ifdef SMBFS_PARANOIA
2563 if (result != -ENOENT)
2564 PARANOIA("error for %s, rcls=%d, err=%d\n",
2565 mask, req->rq_rcls, req->rq_err);
2566#endif
2567 goto out_free;
2568 }
2569 /* Make sure we got enough data ... */
2570 result = -EINVAL;
2571 if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) {
2572 PARANOIA("bad result for %s, len=%d, count=%d\n",
2573 mask, req->rq_ldata, WVAL(req->rq_parm, 2));
2574 goto out_free;
2575 }
2576
2577 /*
2578 * Decode the response into the fattr ...
2579 */
2580 date = WVAL(req->rq_data, 0);
2581 time = WVAL(req->rq_data, 2);
2582 fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2583 fattr->f_ctime.tv_nsec = 0;
2584
2585 date = WVAL(req->rq_data, 4);
2586 time = WVAL(req->rq_data, 6);
2587 fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
2588 fattr->f_atime.tv_nsec = 0;
2589
2590 date = WVAL(req->rq_data, 8);
2591 time = WVAL(req->rq_data, 10);
2592 fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2593 fattr->f_mtime.tv_nsec = 0;
2594 VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
2595 mask, date, time, fattr->f_mtime.tv_sec);
2596 fattr->f_size = DVAL(req->rq_data, 12);
2597 /* ULONG allocation size */
2598 fattr->attr = WVAL(req->rq_data, 20);
2599 result = 0;
2600
2601out_free:
2602 smb_rput(req);
2603out:
2604 return result;
2605}
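
The getattr paths keep converting 16-bit DOS date/time words through
date_dos2unix(). The bit layout is the standard FAT/SMB encoding; a hedged
sketch of the conversion follows. The real helper also folds in the server's
timezone offset, which this illustration omits.

#include <stdio.h>
#include <time.h>

/*
 * Standard FAT/SMB 16-bit date/time layout:
 *   date: bits 15-9 year-1980, 8-5 month (1-12), 4-0 day (1-31)
 *   time: bits 15-11 hour, 10-5 minute, 4-0 second/2
 * Illustrative only; the kernel's date_dos2unix() also applies the
 * server's UTC offset.
 */
static time_t dos2unix(unsigned short date, unsigned short time)
{
	struct tm tm = {0};

	tm.tm_year = ((date >> 9) & 0x7f) + 80;   /* years since 1900 */
	tm.tm_mon  = ((date >> 5) & 0x0f) - 1;
	tm.tm_mday = date & 0x1f;
	tm.tm_hour = (time >> 11) & 0x1f;
	tm.tm_min  = (time >> 5) & 0x3f;
	tm.tm_sec  = (time & 0x1f) * 2;           /* 2-second resolution */
	tm.tm_isdst = -1;
	return mktime(&tm);                       /* local-time based */
}

int main(void)
{
	/* 2003-09-13 17:18:50 */
	unsigned short d = ((2003 - 1980) << 9) | (9 << 5) | 13;
	unsigned short t = (17 << 11) | (18 << 5) | (50 / 2);
	printf("%s", ctime((time_t[]){ dos2unix(d, t) }));
	return 0;
}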
2606
2607static int
2608smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
2609 struct smb_fattr *fattr)
2610{
2611 int result;
2612 char *p;
2613 struct smb_request *req;
2614
2615 result = -ENOMEM;
2616 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2617 goto out;
2618
2619 p = smb_setup_header(req, SMBgetatr, 0, 0);
2620 result = smb_simple_encode_path(req, &p, dir, NULL);
2621 if (result < 0)
2622 goto out_free;
2623 smb_setup_bcc(req, p);
2624
2625 if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0)
2626 goto out_free;
2627 fattr->attr = WVAL(req->rq_header, smb_vwv0);
2628 fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1));
2629 fattr->f_mtime.tv_nsec = 0;
2630 fattr->f_size = DVAL(req->rq_header, smb_vwv3);
2631 fattr->f_ctime = fattr->f_mtime;
2632 fattr->f_atime = fattr->f_mtime;
2633#ifdef SMBFS_DEBUG_TIMESTAMP
2634 printk("getattr_core: %s/%s, mtime=%ld\n",
2635 DENTRY_PATH(dir), fattr->f_mtime);
2636#endif
2637 result = 0;
2638
2639out_free:
2640 smb_rput(req);
2641out:
2642 return result;
2643}
2644
2645/*
2646 * Bugs Noted:
2647 * (1) Win 95 swaps the date and time fields in the standard info level.
2648 */
2649static int
2650smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir,
2651 struct smb_request *req, int infolevel)
2652{
2653 char *p, *param;
2654 int result;
2655
2656 param = req->rq_buffer;
2657 WSET(param, 0, infolevel);
2658 DSET(param, 2, 0);
2659 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
2660 if (result < 0)
2661 goto out;
2662 p = param + 6 + result;
2663
2664 req->rq_trans2_command = TRANSACT2_QPATHINFO;
2665 req->rq_ldata = 0;
2666 req->rq_data = NULL;
2667 req->rq_lparm = p - param;
2668 req->rq_parm = param;
2669 req->rq_flags = 0;
2670 result = smb_add_request(req);
2671 if (result < 0)
2672 goto out;
2673 if (req->rq_rcls != 0) {
2674 VERBOSE("for %s: result=%d, rcls=%d, err=%d\n",
2675 &param[6], result, req->rq_rcls, req->rq_err);
2676 result = smb_errno(req);
2677 goto out;
2678 }
2679 result = -ENOENT;
2680 if (req->rq_ldata < 22) {
2681 PARANOIA("not enough data for %s, len=%d\n",
2682 &param[6], req->rq_ldata);
2683 goto out;
2684 }
2685
2686 result = 0;
2687out:
2688 return result;
2689}
2690
2691static int
2692smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir,
2693 struct smb_fattr *attr)
2694{
2695 u16 date, time;
2696 int off_date = 0, off_time = 2;
2697 int result;
2698 struct smb_request *req;
2699
2700 result = -ENOMEM;
2701 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2702 goto out;
2703
2704 result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD);
2705 if (result < 0)
2706 goto out_free;
2707
2708 /*
2709 * Kludge alert: Win 95 swaps the date and time field,
2710 * contrary to the CIFS docs and Win NT practice.
2711 */
2712 if (server->mnt->flags & SMB_MOUNT_WIN95) {
2713 off_date = 2;
2714 off_time = 0;
2715 }
2716 date = WVAL(req->rq_data, off_date);
2717 time = WVAL(req->rq_data, off_time);
2718 attr->f_ctime.tv_sec = date_dos2unix(server, date, time);
2719 attr->f_ctime.tv_nsec = 0;
2720
2721 date = WVAL(req->rq_data, 4 + off_date);
2722 time = WVAL(req->rq_data, 4 + off_time);
2723 attr->f_atime.tv_sec = date_dos2unix(server, date, time);
2724 attr->f_atime.tv_nsec = 0;
2725
2726 date = WVAL(req->rq_data, 8 + off_date);
2727 time = WVAL(req->rq_data, 8 + off_time);
2728 attr->f_mtime.tv_sec = date_dos2unix(server, date, time);
2729 attr->f_mtime.tv_nsec = 0;
2730#ifdef SMBFS_DEBUG_TIMESTAMP
2731 printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
2732 DENTRY_PATH(dir), date, time, attr->f_mtime);
2733#endif
2734 attr->f_size = DVAL(req->rq_data, 12);
2735 attr->attr = WVAL(req->rq_data, 20);
2736
2737out_free:
2738 smb_rput(req);
2739out:
2740 return result;
2741}
2742
2743static int
2744smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir,
2745 struct smb_fattr *attr)
2746{
2747 struct smb_request *req;
2748 int result;
2749
2750 result = -ENOMEM;
2751 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2752 goto out;
2753
2754 result = smb_proc_getattr_trans2(server, dir, req,
2755 SMB_QUERY_FILE_ALL_INFO);
2756 if (result < 0)
2757 goto out_free;
2758
2759 attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0));
2760 attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8));
2761 attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16));
2762 /* change (24) */
2763 attr->attr = WVAL(req->rq_data, 32);
2764 /* pad? (34) */
2765 /* allocated size (40) */
2766 attr->f_size = LVAL(req->rq_data, 48);
2767
2768out_free:
2769 smb_rput(req);
2770out:
2771 return result;
2772}
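
The ALL_INFO level returns 64-bit NT timestamps (100-nanosecond ticks since
1601-01-01), which smb_ntutc2unixutc() folds into Unix epoch time. The
arithmetic is a fixed offset and scale; here is an illustrative version using
the standard epoch difference of 11644473600 seconds, not the kernel
implementation itself.

#include <stdio.h>
#include <stdint.h>

/*
 * NT timestamps count 100ns ticks since 1601-01-01; the Unix epoch
 * starts 11644473600 seconds later.  Sketch of what an
 * smb_ntutc2unixutc()-style helper has to compute.
 */
#define NT_TICKS_PER_SEC   10000000ULL
#define NT_UNIX_EPOCH_DIFF 11644473600ULL

struct unix_ts { long long sec; long nsec; };

static struct unix_ts ntutc2unix(uint64_t ntutc)
{
	struct unix_ts ts;

	ts.sec  = (long long)(ntutc / NT_TICKS_PER_SEC) - NT_UNIX_EPOCH_DIFF;
	ts.nsec = (long)(ntutc % NT_TICKS_PER_SEC) * 100;
	return ts;
}

int main(void)
{
	/* 116444736000000000 ticks == 1970-01-01 00:00:00 UTC */
	struct unix_ts ts = ntutc2unix(116444736000000000ULL);
	printf("sec=%lld nsec=%ld\n", ts.sec, ts.nsec);  /* sec=0 nsec=0 */
	return 0;
}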
2773
2774static int
2775smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
2776 struct smb_fattr *attr)
2777{
2778 struct smb_request *req;
2779 int result;
2780
2781 result = -ENOMEM;
2782 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2783 goto out;
2784
2785 result = smb_proc_getattr_trans2(server, dir, req,
2786 SMB_QUERY_FILE_UNIX_BASIC);
2787 if (result < 0)
2788 goto out_free;
2789
2790 smb_decode_unix_basic(attr, server, req->rq_data);
2791
2792out_free:
2793 smb_rput(req);
2794out:
2795 return result;
2796}
2797
2798static int
2799smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir,
2800 struct smb_fattr *attr)
2801{
2802 struct inode *inode = dir->d_inode;
2803 int result;
2804
2805 /* FIXME: why not use the "all" version? */
2806 result = smb_proc_getattr_trans2_std(server, dir, attr);
2807 if (result < 0)
2808 goto out;
2809
2810 /*
2811 * None of the getattr versions here can make win9x return the right
2812 * filesize if there are changes made to an open file.
2813 * A seek-to-end does return the right size, but we only need to do
2814 * that on files we have written.
2815 */
2816 if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE &&
2817 smb_is_open(inode))
2818 {
2819 __u16 fileid = SMB_I(inode)->fileid;
2820 attr->f_size = smb_proc_seek(server, fileid, 2, 0);
2821 }
2822
2823out:
2824 return result;
2825}
2826
2827static int
2828smb_proc_ops_wait(struct smb_sb_info *server)
2829{
2830 int result;
2831
2832 result = wait_event_interruptible_timeout(server->conn_wq,
2833 server->conn_complete, 30*HZ);
2834
2835 if (!result || signal_pending(current))
2836 return -EIO;
2837
2838 return 0;
2839}
2840
2841static int
2842smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir,
2843 struct smb_fattr *fattr)
2844{
2845 int result;
2846
2847 if (smb_proc_ops_wait(server) < 0)
2848 return -EIO;
2849
2850 smb_init_dirent(server, fattr);
2851 result = server->ops->getattr(server, dir, fattr);
2852 smb_finish_dirent(server, fattr);
2853
2854 return result;
2855}
2856
2857static int
2858smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir,
2859 struct smb_cache_control *ctl)
2860{
2861 struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry);
2862
2863 if (smb_proc_ops_wait(server) < 0)
2864 return -EIO;
2865
2866 return server->ops->readdir(filp, dirent, filldir, ctl);
2867}
2868
2869int
2870smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr)
2871{
2872 struct smb_sb_info *server = server_from_dentry(dir);
2873 int result;
2874
2875 smb_init_dirent(server, fattr);
2876 result = server->ops->getattr(server, dir, fattr);
2877 smb_finish_dirent(server, fattr);
2878
2879 return result;
2880}
2881
2882
2883/*
2884 * Because of bugs in the core protocol, we use this only to set
2885 * attributes. See smb_proc_settime() below for timestamp handling.
2886 *
2887 * Bugs Noted:
2888 * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail
2889 * with an undocumented error (ERRDOS code 50). Setting
2890 * mtime to 0 allows the attributes to be set.
2891 * (2) The extra parameters following the name string aren't
2892 * in the CIFS docs, but seem to be necessary for operation.
2893 */
2894static int
2895smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
2896 __u16 attr)
2897{
2898 char *p;
2899 int result;
2900 struct smb_request *req;
2901
2902 result = -ENOMEM;
2903 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
2904 goto out;
2905
2906 p = smb_setup_header(req, SMBsetatr, 8, 0);
2907 WSET(req->rq_header, smb_vwv0, attr);
2908 DSET(req->rq_header, smb_vwv1, 0); /* mtime */
2909 WSET(req->rq_header, smb_vwv3, 0); /* reserved values */
2910 WSET(req->rq_header, smb_vwv4, 0);
2911 WSET(req->rq_header, smb_vwv5, 0);
2912 WSET(req->rq_header, smb_vwv6, 0);
2913 WSET(req->rq_header, smb_vwv7, 0);
2914 result = smb_simple_encode_path(req, &p, dentry, NULL);
2915 if (result < 0)
2916 goto out_free;
2917 if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) {
2918 result = -ENAMETOOLONG;
2919 goto out_free;
2920 }
2921 *p++ = 4;
2922 *p++ = 0;
2923 smb_setup_bcc(req, p);
2924
2925 result = smb_request_ok(req, SMBsetatr, 0, 0);
2926 if (result < 0)
2927 goto out_free;
2928 result = 0;
2929
2930out_free:
2931 smb_rput(req);
2932out:
2933 return result;
2934}
2935
2936/*
2937 * Because of bugs in the trans2 setattr messages, we must set
2938 * attributes and timestamps separately. The core SMBsetatr
2939 * message seems to be the only reliable way to set attributes.
2940 */
2941int
2942smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr)
2943{
2944 struct smb_sb_info *server = server_from_dentry(dir);
2945 int result;
2946
2947 VERBOSE("setting %s/%s, open=%d\n",
2948 DENTRY_PATH(dir), smb_is_open(dir->d_inode));
2949 result = smb_proc_setattr_core(server, dir, fattr->attr);
2950 return result;
2951}
2952
2953/*
2954 * Sets the timestamps for a file opened with write permissions.
2955 */
2956static int
2957smb_proc_setattr_ext(struct smb_sb_info *server,
2958 struct inode *inode, struct smb_fattr *fattr)
2959{
2960 __u16 date, time;
2961 int result;
2962 struct smb_request *req;
2963
2964 result = -ENOMEM;
2965 if (! (req = smb_alloc_request(server, 0)))
2966 goto out;
2967
2968 smb_setup_header(req, SMBsetattrE, 7, 0);
2969 WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
2970 /* We don't change the creation time */
2971 WSET(req->rq_header, smb_vwv1, 0);
2972 WSET(req->rq_header, smb_vwv2, 0);
2973 date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
2974 WSET(req->rq_header, smb_vwv3, date);
2975 WSET(req->rq_header, smb_vwv4, time);
2976 date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
2977 WSET(req->rq_header, smb_vwv5, date);
2978 WSET(req->rq_header, smb_vwv6, time);
2979#ifdef SMBFS_DEBUG_TIMESTAMP
2980 printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n",
2981 date, time, fattr->f_mtime);
2982#endif
2983
2984 req->rq_flags |= SMB_REQ_NORETRY;
2985 result = smb_request_ok(req, SMBsetattrE, 0, 0);
2986 if (result < 0)
2987 goto out_free;
2988 result = 0;
2989out_free:
2990 smb_rput(req);
2991out:
2992 return result;
2993}
2994
2995/*
2996 * Bugs Noted:
2997 * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't
2998 * set the file's attribute flags.
2999 */
3000static int
3001smb_proc_setattr_trans2(struct smb_sb_info *server,
3002 struct dentry *dir, struct smb_fattr *fattr)
3003{
3004 __u16 date, time;
3005 char *p, *param;
3006 int result;
3007 char data[26];
3008 struct smb_request *req;
3009
3010 result = -ENOMEM;
3011 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3012 goto out;
3013 param = req->rq_buffer;
3014
3015 WSET(param, 0, 1); /* Info level SMB_INFO_STANDARD */
3016 DSET(param, 2, 0);
3017 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
3018 if (result < 0)
3019 goto out_free;
3020 p = param + 6 + result;
3021
3022 WSET(data, 0, 0); /* creation time */
3023 WSET(data, 2, 0);
3024 date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
3025 WSET(data, 4, date);
3026 WSET(data, 6, time);
3027 date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
3028 WSET(data, 8, date);
3029 WSET(data, 10, time);
3030#ifdef SMBFS_DEBUG_TIMESTAMP
3031 printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
3032 DENTRY_PATH(dir), date, time, fattr->f_mtime);
3033#endif
3034 DSET(data, 12, 0); /* size */
3035 DSET(data, 16, 0); /* blksize */
3036 WSET(data, 20, 0); /* attr */
3037 DSET(data, 22, 0); /* ULONG EA size */
3038
3039 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3040 req->rq_ldata = 26;
3041 req->rq_data = data;
3042 req->rq_lparm = p - param;
3043 req->rq_parm = param;
3044 req->rq_flags = 0;
3045 result = smb_add_request(req);
3046 if (result < 0)
3047 goto out_free;
3048 result = 0;
3049 if (req->rq_rcls != 0)
3050 result = smb_errno(req);
3051
3052out_free:
3053 smb_rput(req);
3054out:
3055 return result;
3056}
3057
3058/*
3059 * ATTR_MODE 0x001
3060 * ATTR_UID 0x002
3061 * ATTR_GID 0x004
3062 * ATTR_SIZE 0x008
3063 * ATTR_ATIME 0x010
3064 * ATTR_MTIME 0x020
3065 * ATTR_CTIME 0x040
3066 * ATTR_ATIME_SET 0x080
3067 * ATTR_MTIME_SET 0x100
3068 * ATTR_FORCE 0x200
3069 * ATTR_ATTR_FLAG 0x400
3070 *
3071 * major/minor should only be set by mknod.
3072 */
3073int
3074smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
3075 unsigned int major, unsigned int minor)
3076{
3077 struct smb_sb_info *server = server_from_dentry(d);
3078 u64 nttime;
3079 char *p, *param;
3080 int result;
3081 char data[100];
3082 struct smb_request *req;
3083
3084 result = -ENOMEM;
3085 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3086 goto out;
3087 param = req->rq_buffer;
3088
3089 DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
3090
3091 WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
3092 DSET(param, 2, 0);
3093 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
3094 if (result < 0)
3095 goto out_free;
3096 p = param + 6 + result;
3097
3098 /* 0 L file size in bytes */
3099 /* 8 L file size on disk in bytes (block count) */
3100 /* 40 L uid */
3101 /* 48 L gid */
3102 /* 56 W file type enum */
3103 /* 60 L devmajor */
3104 /* 68 L devminor */
3105 /* 76 L unique ID (inode) */
3106 /* 84 L permissions */
3107 /* 92 L link count */
3108 LSET(data, 0, SMB_SIZE_NO_CHANGE);
3109 LSET(data, 8, SMB_SIZE_NO_CHANGE);
3110 LSET(data, 16, SMB_TIME_NO_CHANGE);
3111 LSET(data, 24, SMB_TIME_NO_CHANGE);
3112 LSET(data, 32, SMB_TIME_NO_CHANGE);
3113 LSET(data, 40, SMB_UID_NO_CHANGE);
3114 LSET(data, 48, SMB_GID_NO_CHANGE);
3115 DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
3116 LSET(data, 60, major);
3117 LSET(data, 68, minor);
3118 LSET(data, 76, 0);
3119 LSET(data, 84, SMB_MODE_NO_CHANGE);
3120 LSET(data, 92, 0);
3121
3122 if (attr->ia_valid & ATTR_SIZE) {
3123 LSET(data, 0, attr->ia_size);
3124 LSET(data, 8, 0); /* can't set anyway */
3125 }
3126
3127 /*
3128 * FIXME: check that the conversion function is the correct one
3129 *
3130 * we can't set ctime but we might as well pass this to the server
3131 * and let it ignore it.
3132 */
3133 if (attr->ia_valid & ATTR_CTIME) {
3134 nttime = smb_unixutc2ntutc(attr->ia_ctime);
3135 LSET(data, 16, nttime);
3136 }
3137 if (attr->ia_valid & ATTR_ATIME) {
3138 nttime = smb_unixutc2ntutc(attr->ia_atime);
3139 LSET(data, 24, nttime);
3140 }
3141 if (attr->ia_valid & ATTR_MTIME) {
3142 nttime = smb_unixutc2ntutc(attr->ia_mtime);
3143 LSET(data, 32, nttime);
3144 }
3145
3146 if (attr->ia_valid & ATTR_UID) {
3147 LSET(data, 40, attr->ia_uid);
3148 }
3149 if (attr->ia_valid & ATTR_GID) {
3150 LSET(data, 48, attr->ia_gid);
3151 }
3152
3153 if (attr->ia_valid & ATTR_MODE) {
3154 LSET(data, 84, attr->ia_mode);
3155 }
3156
3157 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3158 req->rq_ldata = 100;
3159 req->rq_data = data;
3160 req->rq_lparm = p - param;
3161 req->rq_parm = param;
3162 req->rq_flags = 0;
3163 result = smb_add_request(req);
3164
3165out_free:
3166 smb_rput(req);
3167out:
3168 return result;
3169}
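
The function above serialises a 100-byte SET_FILE_UNIX_BASIC record at fixed
little-endian offsets, filling untouched fields with *_NO_CHANGE sentinels.
The sketch below mirrors that layout with plain byte-store helpers standing in
for the kernel's LSET/DSET macros; treating the sentinels as all-ones values
is an assumption made here, not taken from the source.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Little-endian stores, standing in for LSET (64-bit) / DSET (32-bit). */
static void put_le32(unsigned char *p, uint32_t v)
{
	p[0] = v; p[1] = v >> 8; p[2] = v >> 16; p[3] = v >> 24;
}

static void put_le64(unsigned char *p, uint64_t v)
{
	put_le32(p, (uint32_t)v);
	put_le32(p + 4, (uint32_t)(v >> 32));
}

#define NO_CHANGE64 0xffffffffffffffffULL   /* assumed sentinel value */

int main(void)
{
	unsigned char data[100];

	memset(data, 0, sizeof(data));          /* zeroes inode/nlink slots */
	put_le64(data,      NO_CHANGE64);       /*  0: file size            */
	put_le64(data + 8,  NO_CHANGE64);       /*  8: size on disk         */
	put_le64(data + 16, NO_CHANGE64);       /* 16: ctime                */
	put_le64(data + 24, NO_CHANGE64);       /* 24: atime                */
	put_le64(data + 32, NO_CHANGE64);       /* 32: mtime                */
	put_le64(data + 40, NO_CHANGE64);       /* 40: uid                  */
	put_le64(data + 48, NO_CHANGE64);       /* 48: gid                  */
	put_le64(data + 84, 0644);              /* 84: permissions          */
	printf("mode low byte at offset 84: %02x\n", data[84]);
	return 0;
}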
3170
3171
3172/*
3173 * Set the modify and access timestamps for a file.
3174 *
3175 * Incredibly enough, in all of SMB there is no message to allow
3176 * setting both attributes and timestamps at once.
3177 *
3178 * Bugs Noted:
3179 * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message
3180 * with info level 1 (INFO_STANDARD).
3181 * (2) Win 95 seems not to support setting directory timestamps.
3182 * (3) Under the core protocol apparently the only way to set the
3183 * timestamp is to open and close the file.
3184 */
3185int
3186smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
3187{
3188 struct smb_sb_info *server = server_from_dentry(dentry);
3189 struct inode *inode = dentry->d_inode;
3190 int result;
3191
3192 VERBOSE("setting %s/%s, open=%d\n",
3193 DENTRY_PATH(dentry), smb_is_open(inode));
3194
3195 /* setting the time on a Win95 server fails (tridge) */
3196 if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 &&
3197 !(server->mnt->flags & SMB_MOUNT_WIN95)) {
3198 if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
3199 result = smb_proc_setattr_ext(server, inode, fattr);
3200 else
3201 result = smb_proc_setattr_trans2(server, dentry, fattr);
3202 } else {
3203 /*
3204 * Fail silently on directories ... timestamp can't be set?
3205 */
3206 result = 0;
3207 if (S_ISREG(inode->i_mode)) {
3208 /*
3209 * Set the mtime by opening and closing the file.
3210 * Note that the file is opened read-only, but this
3211 * still allows us to set the date (tridge)
3212 */
3213 result = -EACCES;
3214 if (!smb_is_open(inode))
3215 smb_proc_open(server, dentry, SMB_O_RDONLY);
3216 if (smb_is_open(inode)) {
3217 inode->i_mtime = fattr->f_mtime;
3218 result = smb_proc_close_inode(server, inode);
3219 }
3220 }
3221 }
3222
3223 return result;
3224}
3225
3226int
3227smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
3228{
3229 struct smb_sb_info *server = SMB_SB(dentry->d_sb);
3230 int result;
3231 char *p;
3232 long unit;
3233 struct smb_request *req;
3234
3235 result = -ENOMEM;
3236 if (! (req = smb_alloc_request(server, 0)))
3237 goto out;
3238
3239 smb_setup_header(req, SMBdskattr, 0, 0);
3240 if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
3241 goto out_free;
3242 p = SMB_VWV(req->rq_header);
3243 unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
3244 attr->f_blocks = WVAL(p, 0) * unit;
3245 attr->f_bsize = SMB_ST_BLKSIZE;
3246 attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
3247 result = 0;
3248
3249out_free:
3250 smb_rput(req);
3251out:
3252 return result;
3253}
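
SMBdskattr reports sizes in allocation units, and the code converts
(sectors per unit x bytes per sector) into statfs blocks with
SMB_ST_BLKSHIFT. A worked example of that arithmetic, assuming the usual
SMB_ST_BLKSIZE of 512 (a shift of 9):

#include <stdio.h>

/*
 * Worked example of the SMBdskattr arithmetic, assuming
 * SMB_ST_BLKSIZE = 512 / SMB_ST_BLKSHIFT = 9.  Sample values are
 * invented for illustration.
 */
int main(void)
{
	unsigned total_units = 65535;   /* WVAL(p, 0): total units      */
	unsigned sectors     = 64;      /* WVAL(p, 2): sectors per unit */
	unsigned sectsize    = 512;     /* WVAL(p, 4): bytes per sector */
	unsigned free_units  = 32768;   /* WVAL(p, 6): free units       */

	long unit = (long)(sectors * sectsize) >> 9;  /* 512B blocks/unit */
	printf("f_blocks=%ld f_bfree=%ld (x512 bytes)\n",
	       total_units * unit, free_units * unit);
	return 0;
}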
3254
3255int
3256smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
3257 char *buffer, int len)
3258{
3259 char *p, *param;
3260 int result;
3261 struct smb_request *req;
3262
3263 DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
3264
3265 result = -ENOMEM;
3266 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3267 goto out;
3268 param = req->rq_buffer;
3269
3270 WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
3271 DSET(param, 2, 0);
3272 result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
3273 if (result < 0)
3274 goto out_free;
3275 p = param + 6 + result;
3276
3277 req->rq_trans2_command = TRANSACT2_QPATHINFO;
3278 req->rq_ldata = 0;
3279 req->rq_data = NULL;
3280 req->rq_lparm = p - param;
3281 req->rq_parm = param;
3282 req->rq_flags = 0;
3283 result = smb_add_request(req);
3284 if (result < 0)
3285 goto out_free;
3286 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3287 &param[6], result, req->rq_rcls, req->rq_err);
3288
3289 /* copy data up to the \0 or buffer length */
3290 result = len;
3291 if (req->rq_ldata < len)
3292 result = req->rq_ldata;
3293 strncpy(buffer, req->rq_data, result);
3294
3295out_free:
3296 smb_rput(req);
3297out:
3298 return result;
3299}
3300
3301
3302/*
3303 * Create a symlink object called dentry which points to oldpath.
3304 * Samba does not permit dangling links but returns a suitable error message.
3305 */
3306int
3307smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
3308 const char *oldpath)
3309{
3310 char *p, *param;
3311 int result;
3312 struct smb_request *req;
3313
3314 result = -ENOMEM;
3315 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3316 goto out;
3317 param = req->rq_buffer;
3318
3319 WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
3320 DSET(param, 2, 0);
3321 result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
3322 if (result < 0)
3323 goto out_free;
3324 p = param + 6 + result;
3325
3326 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3327 req->rq_ldata = strlen(oldpath) + 1;
3328 req->rq_data = (char *) oldpath;
3329 req->rq_lparm = p - param;
3330 req->rq_parm = param;
3331 req->rq_flags = 0;
3332 result = smb_add_request(req);
3333 if (result < 0)
3334 goto out_free;
3335
3336 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3337 &param[6], result, req->rq_rcls, req->rq_err);
3338 result = 0;
3339
3340out_free:
3341 smb_rput(req);
3342out:
3343 return result;
3344}
3345
3346/*
3347 * Create a hard link object called new_dentry which points to dentry.
3348 */
3349int
3350smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
3351 struct dentry *new_dentry)
3352{
3353 char *p, *param;
3354 int result;
3355 struct smb_request *req;
3356
3357 result = -ENOMEM;
3358 if (! (req = smb_alloc_request(server, PAGE_SIZE)))
3359 goto out;
3360 param = req->rq_buffer;
3361
3362 WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
3363 DSET(param, 2, 0);
3364 result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
3365 new_dentry, NULL);
3366 if (result < 0)
3367 goto out_free;
3368 p = param + 6 + result;
3369
3370 /* Grr, pointless separation of parameters and data ... */
3371 req->rq_data = p;
3372 req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
3373 dentry, NULL);
3374
3375 req->rq_trans2_command = TRANSACT2_SETPATHINFO;
3376 req->rq_lparm = p - param;
3377 req->rq_parm = param;
3378 req->rq_flags = 0;
3379 result = smb_add_request(req);
3380 if (result < 0)
3381 goto out_free;
3382
3383 DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
3384 &param[6], result, req->rq_rcls, req->rq_err);
3385 result = 0;
3386
3387out_free:
3388 smb_rput(req);
3389out:
3390 return result;
3391}
3392
3393static int
3394smb_proc_query_cifsunix(struct smb_sb_info *server)
3395{
3396 int result;
3397 int major, minor;
3398 u64 caps;
3399 char param[2];
3400 struct smb_request *req;
3401
3402 result = -ENOMEM;
3403 if (! (req = smb_alloc_request(server, 100)))
3404 goto out;
3405
3406 WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
3407
3408 req->rq_trans2_command = TRANSACT2_QFSINFO;
3409 req->rq_ldata = 0;
3410 req->rq_data = NULL;
3411 req->rq_lparm = 2;
3412 req->rq_parm = param;
3413 req->rq_flags = 0;
3414 result = smb_add_request(req);
3415 if (result < 0)
3416 goto out_free;
3417
3418 if (req->rq_ldata < 12) {
3419 PARANOIA("Not enough data\n");
3420 goto out_free;
3421 }
3422 major = WVAL(req->rq_data, 0);
3423 minor = WVAL(req->rq_data, 2);
3424
3425 DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
3426 major, minor);
3427 /* FIXME: verify that we are ok with this major/minor? */
3428
3429 caps = LVAL(req->rq_data, 4);
3430 DEBUG1("Server capabilities 0x%016llx\n", caps);
3431
3432out_free:
3433 smb_rput(req);
3434out:
3435 return result;
3436}
3437
3438
3439static void
3440install_ops(struct smb_ops *dst, struct smb_ops *src)
3441{
3442 memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
3443}
3444
3445/* < LANMAN2 */
3446static struct smb_ops smb_ops_core =
3447{
3448 .read = smb_proc_read,
3449 .write = smb_proc_write,
3450 .readdir = smb_proc_readdir_short,
3451 .getattr = smb_proc_getattr_core,
3452 .truncate = smb_proc_trunc32,
3453};
3454
3455/* LANMAN2, OS/2, others? */
3456static struct smb_ops smb_ops_os2 =
3457{
3458 .read = smb_proc_read,
3459 .write = smb_proc_write,
3460 .readdir = smb_proc_readdir_long,
3461 .getattr = smb_proc_getattr_trans2_std,
3462 .truncate = smb_proc_trunc32,
3463};
3464
3465/* Win95, and possibly some NetApp versions too */
3466static struct smb_ops smb_ops_win95 =
3467{
3468 .read = smb_proc_read, /* does not support 12word readX */
3469 .write = smb_proc_write,
3470 .readdir = smb_proc_readdir_long,
3471 .getattr = smb_proc_getattr_95,
3472 .truncate = smb_proc_trunc95,
3473};
3474
3475/* Samba, NT4 and NT5 */
3476static struct smb_ops smb_ops_winNT =
3477{
3478 .read = smb_proc_readX,
3479 .write = smb_proc_writeX,
3480 .readdir = smb_proc_readdir_long,
3481 .getattr = smb_proc_getattr_trans2_all,
3482 .truncate = smb_proc_trunc64,
3483};
3484
3485/* Samba w/ unix extensions. Others? */
3486static struct smb_ops smb_ops_unix =
3487{
3488 .read = smb_proc_readX,
3489 .write = smb_proc_writeX,
3490 .readdir = smb_proc_readdir_long,
3491 .getattr = smb_proc_getattr_unix,
3492 /* FIXME: core/ext/time setattr needs to be cleaned up! */
3493 /* .setattr = smb_proc_setattr_unix, */
3494 .truncate = smb_proc_trunc64,
3495};
3496
3497/* Place holder until real ops are in place */
3498static struct smb_ops smb_ops_null =
3499{
3500 .readdir = smb_proc_readdir_null,
3501 .getattr = smb_proc_getattr_null,
3502};
3503
3504void smb_install_null_ops(struct smb_ops *ops)
3505{
3506 install_ops(ops, &smb_ops_null);
3507}
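
Everything above funnels through a per-dialect table of function pointers:
the negotiated protocol selects one of the smb_ops variants and callers
indirect through server->ops, with install_ops() doing a plain memcpy of the
pointer slots. The sketch below shows the same dispatch pattern in miniature;
all names in it are illustrative.

#include <stdio.h>
#include <string.h>

struct ops {
	int (*getattr)(const char *name);
};

static int getattr_core(const char *name)
{ printf("core getattr: %s\n", name); return 0; }

static int getattr_trans2(const char *name)
{ printf("trans2 getattr: %s\n", name); return 0; }

static const struct ops ops_core  = { getattr_core };
static const struct ops ops_winNT = { getattr_trans2 };

int main(void)
{
	struct ops server_ops;
	int protocol_is_nt1 = 1;   /* stand-in for the negotiated level */

	/* "install_ops": copy the chosen table's pointer slots */
	memcpy(&server_ops, protocol_is_nt1 ? &ops_winNT : &ops_core,
	       sizeof(server_ops));
	return server_ops.getattr("/foo");
}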
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
deleted file mode 100644
index 05939a6f43e6..000000000000
--- a/fs/smbfs/proto.h
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * Autogenerated with cproto on: Sat Sep 13 17:18:51 CEST 2003
3 */
4
5struct smb_request;
6struct sock;
7struct statfs;
8
9/* proc.c */
10extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp);
11extern __u32 smb_len(__u8 *p);
12extern int smb_get_rsize(struct smb_sb_info *server);
13extern int smb_get_wsize(struct smb_sb_info *server);
14extern int smb_errno(struct smb_request *req);
15extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt);
16extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc);
17extern int smb_open(struct dentry *dentry, int wish);
18extern int smb_close(struct inode *ino);
19extern int smb_close_fileid(struct dentry *dentry, __u16 fileid);
20extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid);
21extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry);
22extern int smb_proc_mkdir(struct dentry *dentry);
23extern int smb_proc_rmdir(struct dentry *dentry);
24extern int smb_proc_unlink(struct dentry *dentry);
25extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
26extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
27 struct super_block *sb);
28extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
29extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
30extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
31extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
32extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
33extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
34extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
35extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
36extern void smb_install_null_ops(struct smb_ops *ops);
37/* dir.c */
38extern const struct file_operations smb_dir_operations;
39extern const struct inode_operations smb_dir_inode_operations;
40extern const struct inode_operations smb_dir_inode_operations_unix;
41extern void smb_new_dentry(struct dentry *dentry);
42extern void smb_renew_times(struct dentry *dentry);
43/* cache.c */
44extern void smb_invalid_dir_cache(struct inode *dir);
45extern void smb_invalidate_dircache_entries(struct dentry *parent);
46extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos);
47extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry);
48/* sock.c */
49extern void smb_data_ready(struct sock *sk, int len);
50extern int smb_valid_socket(struct inode *inode);
51extern void smb_close_socket(struct smb_sb_info *server);
52extern int smb_recv_available(struct smb_sb_info *server);
53extern int smb_receive_header(struct smb_sb_info *server);
54extern int smb_receive_drop(struct smb_sb_info *server);
55extern int smb_receive(struct smb_sb_info *server, struct smb_request *req);
56extern int smb_send_request(struct smb_request *req);
57/* inode.c */
58extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr);
59extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr);
60extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr);
61extern void smb_invalidate_inodes(struct smb_sb_info *server);
62extern int smb_revalidate_inode(struct dentry *dentry);
63extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
64extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
65/* file.c */
66extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */
70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */
72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server);
74extern void smbiod_unregister_server(struct smb_sb_info *server);
75extern void smbiod_flush(struct smb_sb_info *server);
76extern int smbiod_retry(struct smb_sb_info *server);
77/* request.c */
78extern int smb_init_request_cache(void);
79extern void smb_destroy_request_cache(void);
80extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize);
81extern void smb_rput(struct smb_request *req);
82extern int smb_add_request(struct smb_request *req);
83extern int smb_request_send_server(struct smb_sb_info *server);
84extern int smb_request_recv(struct smb_sb_info *server);
85/* symlink.c */
86extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname);
87extern const struct inode_operations smb_link_inode_operations;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
deleted file mode 100644
index 45f45933e862..000000000000
--- a/fs/smbfs/request.c
+++ /dev/null
@@ -1,818 +0,0 @@
1/*
2 * request.c
3 *
4 * Copyright (C) 2001 by Urban Widmark
5 *
6 * Please add a note about your changes to smbfs in the ChangeLog file.
7 */
8
9#include <linux/kernel.h>
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/slab.h>
13#include <linux/net.h>
14#include <linux/sched.h>
15
16#include <linux/smb_fs.h>
17#include <linux/smbno.h>
18#include <linux/smb_mount.h>
19
20#include "smb_debug.h"
21#include "request.h"
22#include "proto.h"
23
24/* #define SMB_SLAB_DEBUG (SLAB_RED_ZONE | SLAB_POISON) */
25#define SMB_SLAB_DEBUG 0
26
27/* cache for request structures */
28static struct kmem_cache *req_cachep;
29
30static int smb_request_send_req(struct smb_request *req);
31
32/*
33 /proc/slabinfo:
34 name, active, num, objsize, active_slabs, num_slabs, #pages
35*/
36
37
38int smb_init_request_cache(void)
39{
40 req_cachep = kmem_cache_create("smb_request",
41 sizeof(struct smb_request), 0,
42 SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
43 NULL);
44 if (req_cachep == NULL)
45 return -ENOMEM;
46
47 return 0;
48}
49
50void smb_destroy_request_cache(void)
51{
52 kmem_cache_destroy(req_cachep);
53}
54
55/*
56 * Allocate and initialise a request structure
57 */
58static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
59 int bufsize)
60{
61 struct smb_request *req;
62 unsigned char *buf = NULL;
63
64 req = kmem_cache_zalloc(req_cachep, GFP_KERNEL);
65 VERBOSE("allocating request: %p\n", req);
66 if (!req)
67 goto out;
68
69 if (bufsize > 0) {
70 buf = kmalloc(bufsize, GFP_NOFS);
71 if (!buf) {
72 kmem_cache_free(req_cachep, req);
73 return NULL;
74 }
75 }
76
77 req->rq_buffer = buf;
78 req->rq_bufsize = bufsize;
79 req->rq_server = server;
80 init_waitqueue_head(&req->rq_wait);
81 INIT_LIST_HEAD(&req->rq_queue);
82 atomic_set(&req->rq_count, 1);
83
84out:
85 return req;
86}
87
88struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
89{
90 struct smb_request *req = NULL;
91
92 for (;;) {
93 atomic_inc(&server->nr_requests);
94 if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) {
95 req = smb_do_alloc_request(server, bufsize);
96 if (req != NULL)
97 break;
98 }
99
100#if 0
101 /*
102 * Try to free up at least one request in order to stay
103 * below the hard limit
104 */
105 if (nfs_try_to_free_pages(server))
106 continue;
107
108 if (fatal_signal_pending(current))
109 return ERR_PTR(-ERESTARTSYS);
110 current->policy = SCHED_YIELD;
111 schedule();
112#else
113 /* FIXME: we want something like nfs does above, but that
114 requires changes to all callers and can wait. */
115 break;
116#endif
117 }
118 return req;
119}
120
121static void smb_free_request(struct smb_request *req)
122{
123 atomic_dec(&req->rq_server->nr_requests);
124 if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
125 kfree(req->rq_buffer);
126 kfree(req->rq_trans2buffer);
127 kmem_cache_free(req_cachep, req);
128}
129
130/*
131 * What prevents an rget from racing with an rput? The count must never drop
132 * to zero while the request is in use. Only rput when it is ok to free it.
133 */
134static void smb_rget(struct smb_request *req)
135{
136 atomic_inc(&req->rq_count);
137}
138void smb_rput(struct smb_request *req)
139{
140 if (atomic_dec_and_test(&req->rq_count)) {
141 list_del_init(&req->rq_queue);
142 smb_free_request(req);
143 }
144}
145
146/* setup to receive the data part of the SMB */
147static int smb_setup_bcc(struct smb_request *req)
148{
149 int result = 0;
150 req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
151
152 if (req->rq_rlen > req->rq_bufsize) {
153 PARANOIA("Packet too large %d > %d\n",
154 req->rq_rlen, req->rq_bufsize);
155 return -ENOBUFS;
156 }
157
158 req->rq_iov[0].iov_base = req->rq_buffer;
159 req->rq_iov[0].iov_len = req->rq_rlen;
160 req->rq_iovlen = 1;
161
162 return result;
163}
164
165/*
166 * Prepare a "normal" request structure.
167 */
168static int smb_setup_request(struct smb_request *req)
169{
170 int len = smb_len(req->rq_header) + 4;
171 req->rq_slen = len;
172
173 /* if we expect a data part in the reply we set the iov's to read it */
174 if (req->rq_resp_bcc)
175 req->rq_setup_read = smb_setup_bcc;
176
177 /* This tries to support re-using the same request */
178 req->rq_bytes_sent = 0;
179 req->rq_rcls = 0;
180 req->rq_err = 0;
181 req->rq_errno = 0;
182 req->rq_fragment = 0;
183 kfree(req->rq_trans2buffer);
184 req->rq_trans2buffer = NULL;
185
186 return 0;
187}
188
189/*
190 * Prepare a transaction2 request structure
191 */
192static int smb_setup_trans2request(struct smb_request *req)
193{
194 struct smb_sb_info *server = req->rq_server;
195 int mparam, mdata;
196 static unsigned char padding[4];
197
198 /* I know the following is very ugly, but I want to build the
199 smb packet as efficiently as possible. */
200
201 const int smb_parameters = 15;
202 const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
203 const int oparam = ALIGN(header + 3, sizeof(u32));
204 const int odata = ALIGN(oparam + req->rq_lparm, sizeof(u32));
205 const int bcc = (req->rq_data ? odata + req->rq_ldata :
206 oparam + req->rq_lparm) - header;
207
208 if ((bcc + oparam) > server->opt.max_xmit)
209 return -ENOMEM;
210 smb_setup_header(req, SMBtrans2, smb_parameters, bcc);
211
212 /*
213 * max parameters + max data + max setup == bufsize to make NT4 happy
214 * and not abort the transfer or split into multiple responses. It also
215 * makes smbfs happy as handling packets larger than the buffer size
216 * is extra work.
217 *
218 * OS/2 is probably going to hate me for this ...
219 */
220 mparam = SMB_TRANS2_MAX_PARAM;
221 mdata = req->rq_bufsize - mparam;
222
223 mdata = server->opt.max_xmit - mparam - 100;
224 if (mdata < 1024) {
225 mdata = 1024;
226 mparam = 20;
227 }
228
229#if 0
230 /* NT/win2k has ~4k max_xmit, so with this we request more than it wants
231 to return as one SMB. Useful for testing the fragmented trans2
232 handling. */
233 mdata = 8192;
234#endif
235
236 WSET(req->rq_header, smb_tpscnt, req->rq_lparm);
237 WSET(req->rq_header, smb_tdscnt, req->rq_ldata);
238 WSET(req->rq_header, smb_mprcnt, mparam);
239 WSET(req->rq_header, smb_mdrcnt, mdata);
240 WSET(req->rq_header, smb_msrcnt, 0); /* max setup always 0 ? */
241 WSET(req->rq_header, smb_flags, 0);
242 DSET(req->rq_header, smb_timeout, 0);
243 WSET(req->rq_header, smb_pscnt, req->rq_lparm);
244 WSET(req->rq_header, smb_psoff, oparam - 4);
245 WSET(req->rq_header, smb_dscnt, req->rq_ldata);
246 WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0);
247 *(req->rq_header + smb_suwcnt) = 0x01; /* setup count */
248 *(req->rq_header + smb_suwcnt + 1) = 0x00; /* reserved */
249 WSET(req->rq_header, smb_setup0, req->rq_trans2_command);
250
251 req->rq_iovlen = 2;
252 req->rq_iov[0].iov_base = (void *) req->rq_header;
253 req->rq_iov[0].iov_len = oparam;
254 req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm;
255 req->rq_iov[1].iov_len = req->rq_lparm;
256 req->rq_slen = oparam + req->rq_lparm;
257
258 if (req->rq_data) {
259 req->rq_iovlen += 2;
260 req->rq_iov[2].iov_base = padding;
261 req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm;
262 req->rq_iov[3].iov_base = req->rq_data;
263 req->rq_iov[3].iov_len = req->rq_ldata;
264 req->rq_slen = odata + req->rq_ldata;
265 }
266
267 /* always a data part for trans2 replies */
268 req->rq_setup_read = smb_setup_bcc;
269
270 return 0;
271}
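
smb_setup_trans2request() lays the packet out as fixed header, 4-byte-aligned
parameter area at oparam, 4-byte-aligned data area at odata, with bcc covering
everything past the header. A worked example of the offset arithmetic,
assuming SMB_HEADER_LEN is 37 (the 4-byte NetBIOS length plus a 33-byte SMB
header) and the SETPATHINFO sizes seen earlier in this file:

#include <stdio.h>

#define ALIGN4(x) (((x) + 3) & ~3)

int main(void)
{
	const int smb_parameters = 15;
	const int hdr = 37 + 2 * smb_parameters + 2;   /* 69 */
	int lparm = 14, ldata = 26;      /* e.g. a small SETPATHINFO */

	int oparam = ALIGN4(hdr + 3);                  /* 72 */
	int odata  = ALIGN4(oparam + lparm);           /* 88 */
	int bcc    = odata + ldata - hdr;              /* 45 */

	printf("oparam=%d odata=%d bcc=%d\n", oparam, odata, bcc);
	return 0;
}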
272
273/*
274 * Add a request and tell smbiod to process it
275 */
276int smb_add_request(struct smb_request *req)
277{
278 long timeleft;
279 struct smb_sb_info *server = req->rq_server;
280 int result = 0;
281
282 smb_setup_request(req);
283 if (req->rq_trans2_command) {
284 if (req->rq_buffer == NULL) {
285 PARANOIA("trans2 attempted without response buffer!\n");
286 return -EIO;
287 }
288 result = smb_setup_trans2request(req);
289 }
290 if (result < 0)
291 return result;
292
293#ifdef SMB_DEBUG_PACKET_SIZE
294 add_xmit_stats(req);
295#endif
296
297 /* add 'req' to the queue of requests */
298 if (smb_lock_server_interruptible(server))
299 return -EINTR;
300
301 /*
302 * Try to send the request as the process. If that fails we queue the
303 * request and let smbiod send it later.
304 */
305
306 /* FIXME: each server has a number on the maximum number of parallel
307 requests. 10, 50 or so. We should not allow more requests to be
308 active. */
309 if (server->mid > 0xf000)
310 server->mid = 0;
311 req->rq_mid = server->mid++;
312 WSET(req->rq_header, smb_mid, req->rq_mid);
313
314 result = 0;
315 if (server->state == CONN_VALID) {
316 if (list_empty(&server->xmitq))
317 result = smb_request_send_req(req);
318 if (result < 0) {
319 /* Connection lost? */
320 server->conn_error = result;
321 server->state = CONN_INVALID;
322 }
323 }
324 if (result != 1)
325 list_add_tail(&req->rq_queue, &server->xmitq);
326 smb_rget(req);
327
328 if (server->state != CONN_VALID)
329 smbiod_retry(server);
330
331 smb_unlock_server(server);
332
333 smbiod_wake_up();
334
335 timeleft = wait_event_interruptible_timeout(req->rq_wait,
336 req->rq_flags & SMB_REQ_RECEIVED, 30*HZ);
337 if (!timeleft || signal_pending(current)) {
338 /*
339 * On timeout or on interrupt we want to try and remove the
340 * request from the recvq/xmitq.
341 * First check if the request is still part of a queue. (May
342 * have been removed by some error condition)
343 */
344 smb_lock_server(server);
345 if (!list_empty(&req->rq_queue)) {
346 list_del_init(&req->rq_queue);
347 smb_rput(req);
348 }
349 smb_unlock_server(server);
350 }
351
352 if (!timeleft) {
353 PARANOIA("request [%p, mid=%d] timed out!\n",
354 req, req->rq_mid);
355 VERBOSE("smb_com: %02x\n", *(req->rq_header + smb_com));
356 VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls));
357 VERBOSE("smb_flg: %02x\n", *(req->rq_header + smb_flg));
358 VERBOSE("smb_tid: %04x\n", WVAL(req->rq_header, smb_tid));
359 VERBOSE("smb_pid: %04x\n", WVAL(req->rq_header, smb_pid));
360 VERBOSE("smb_uid: %04x\n", WVAL(req->rq_header, smb_uid));
361 VERBOSE("smb_mid: %04x\n", WVAL(req->rq_header, smb_mid));
362 VERBOSE("smb_wct: %02x\n", *(req->rq_header + smb_wct));
363
364 req->rq_rcls = ERRSRV;
365 req->rq_err = ERRtimeout;
366
367 /* Just in case it was "stuck" */
368 smbiod_wake_up();
369 }
370 VERBOSE("woke up, rcls=%d\n", req->rq_rcls);
371
372 if (req->rq_rcls != 0)
373 req->rq_errno = smb_errno(req);
374 if (signal_pending(current))
375 req->rq_errno = -ERESTARTSYS;
376 return req->rq_errno;
377}
378
379/*
380 * Send a request and place it on the recvq if successfully sent.
381 * Must be called with the server lock held.
382 */
383static int smb_request_send_req(struct smb_request *req)
384{
385 struct smb_sb_info *server = req->rq_server;
386 int result;
387
388 if (req->rq_bytes_sent == 0) {
389 WSET(req->rq_header, smb_tid, server->opt.tid);
390 WSET(req->rq_header, smb_pid, 1);
391 WSET(req->rq_header, smb_uid, server->opt.server_uid);
392 }
393
394 result = smb_send_request(req);
395 if (result < 0 && result != -EAGAIN)
396 goto out;
397
398 result = 0;
399 if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
400 goto out;
401
402 list_move_tail(&req->rq_queue, &server->recvq);
403 result = 1;
404out:
405 return result;
406}
407
408/*
409 * Sends one request for this server. (smbiod)
410 * Must be called with the server lock held.
411 * Returns: <0 on error
412 * 0 if no request could be completely sent
413 * 1 if all data for one request was sent
414 */
415int smb_request_send_server(struct smb_sb_info *server)
416{
417 struct list_head *head;
418 struct smb_request *req;
419 int result;
420
421 if (server->state != CONN_VALID)
422 return 0;
423
424 /* dequeue first request, if any */
425 req = NULL;
426 head = server->xmitq.next;
427 if (head != &server->xmitq) {
428 req = list_entry(head, struct smb_request, rq_queue);
429 }
430 if (!req)
431 return 0;
432
433 result = smb_request_send_req(req);
434 if (result < 0) {
435 server->conn_error = result;
436 list_move(&req->rq_queue, &server->xmitq);
437 result = -EIO;
438 goto out;
439 }
440
441out:
442 return result;
443}
444
445/*
446 * Try to find a request matching this "mid". Typically the first entry will
447 * be the matching one.
448 */
449static struct smb_request *find_request(struct smb_sb_info *server, int mid)
450{
451 struct list_head *tmp;
452 struct smb_request *req = NULL;
453
454 list_for_each(tmp, &server->recvq) {
455 req = list_entry(tmp, struct smb_request, rq_queue);
456 if (req->rq_mid == mid) {
457 break;
458 }
459 req = NULL;
460 }
461
462 if (!req) {
463 VERBOSE("received reply with mid %d but no request!\n",
464 WVAL(server->header, smb_mid));
465 server->rstate = SMB_RECV_DROP;
466 }
467
468 return req;
469}
470
471/*
472 * Called when we have read the smb header and believe this is a response.
473 */
474static int smb_init_request(struct smb_sb_info *server, struct smb_request *req)
475{
476 int hdrlen, wct;
477
478 memcpy(req->rq_header, server->header, SMB_HEADER_LEN);
479
480 wct = *(req->rq_header + smb_wct);
481 if (wct > 20) {
482 PARANOIA("wct too large, %d > 20\n", wct);
483 server->rstate = SMB_RECV_DROP;
484 return 0;
485 }
486
487 req->rq_resp_wct = wct;
488 hdrlen = SMB_HEADER_LEN + wct*2 + 2;
489 VERBOSE("header length: %d smb_wct: %2d\n", hdrlen, wct);
490
491 req->rq_bytes_recvd = SMB_HEADER_LEN;
492 req->rq_rlen = hdrlen;
493 req->rq_iov[0].iov_base = req->rq_header;
494 req->rq_iov[0].iov_len = hdrlen;
495 req->rq_iovlen = 1;
496 server->rstate = SMB_RECV_PARAM;
497
498#ifdef SMB_DEBUG_PACKET_SIZE
499 add_recv_stats(smb_len(server->header));
500#endif
501 return 0;
502}
503
504/*
505 * Reads the SMB parameters
506 */
507static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req)
508{
509 int result;
510
511 result = smb_receive(server, req);
512 if (result < 0)
513 return result;
514 if (req->rq_bytes_recvd < req->rq_rlen)
515 return 0;
516
517 VERBOSE("result: %d smb_bcc: %04x\n", result,
518 WVAL(req->rq_header, SMB_HEADER_LEN +
519 (*(req->rq_header + smb_wct) * 2)));
520
521 result = 0;
522 req->rq_iov[0].iov_base = NULL;
523 req->rq_rlen = 0;
524 if (req->rq_callback)
525 req->rq_callback(req);
526 else if (req->rq_setup_read)
527 result = req->rq_setup_read(req);
528 if (result < 0) {
529 server->rstate = SMB_RECV_DROP;
530 return result;
531 }
532
533 server->rstate = req->rq_rlen > 0 ? SMB_RECV_DATA : SMB_RECV_END;
534
535 req->rq_bytes_recvd = 0; // recvd out of the iov
536
537 VERBOSE("rlen: %d\n", req->rq_rlen);
538 if (req->rq_rlen < 0) {
539 PARANOIA("Parameters read beyond end of packet!\n");
540 server->rstate = SMB_RECV_END;
541 return -EIO;
542 }
543 return 0;
544}
545
546/*
547 * Reads the SMB data
548 */
549static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req)
550{
551 int result;
552
553 result = smb_receive(server, req);
554 if (result < 0)
555 goto out;
556 if (req->rq_bytes_recvd < req->rq_rlen)
557 goto out;
558 server->rstate = SMB_RECV_END;
559out:
560 VERBOSE("result: %d\n", result);
561 return result;
562}
563
564/*
565 * Receive a transaction2 response
566 * Return: 0 if the response has been fully read
567 * 1 if there are further "fragments" to read
568 * <0 if there is an error
569 */
570static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
571{
572 unsigned char *inbuf;
573 unsigned int parm_disp, parm_offset, parm_count, parm_tot;
574 unsigned int data_disp, data_offset, data_count, data_tot;
575 int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
576
577 VERBOSE("handling trans2\n");
578
579 inbuf = req->rq_header;
580 data_tot = WVAL(inbuf, smb_tdrcnt);
581 parm_tot = WVAL(inbuf, smb_tprcnt);
582 parm_disp = WVAL(inbuf, smb_prdisp);
583 parm_offset = WVAL(inbuf, smb_proff);
584 parm_count = WVAL(inbuf, smb_prcnt);
585 data_disp = WVAL(inbuf, smb_drdisp);
586 data_offset = WVAL(inbuf, smb_droff);
587 data_count = WVAL(inbuf, smb_drcnt);
588
589 /* Modify offset for the split header/buffer we use */
590 if (data_count || data_offset) {
591 if (unlikely(data_offset < hdrlen))
592 goto out_bad_data;
593 else
594 data_offset -= hdrlen;
595 }
596 if (parm_count || parm_offset) {
597 if (unlikely(parm_offset < hdrlen))
598 goto out_bad_parm;
599 else
600 parm_offset -= hdrlen;
601 }
602
603 if (parm_count == parm_tot && data_count == data_tot) {
604 /*
605 * This packet has all the trans2 data.
606 *
607 * We setup the request so that this will be the common
608 * case. It may be a server error to not return a
609 * response that fits.
610 */
611 VERBOSE("single trans2 response "
612 "dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
613 data_count, parm_count,
614 data_offset, parm_offset);
615 req->rq_ldata = data_count;
616 req->rq_lparm = parm_count;
617 req->rq_data = req->rq_buffer + data_offset;
618 req->rq_parm = req->rq_buffer + parm_offset;
619 if (unlikely(parm_offset + parm_count > req->rq_rlen))
620 goto out_bad_parm;
621 if (unlikely(data_offset + data_count > req->rq_rlen))
622 goto out_bad_data;
623 return 0;
624 }
625
626 VERBOSE("multi trans2 response "
627 "frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
628 req->rq_fragment,
629 data_count, parm_count,
630 data_offset, parm_offset);
631
632 if (!req->rq_fragment) {
633 int buf_len;
634
635 /* We got the first trans2 fragment */
636 req->rq_fragment = 1;
637 req->rq_total_data = data_tot;
638 req->rq_total_parm = parm_tot;
639 req->rq_ldata = 0;
640 req->rq_lparm = 0;
641
642 buf_len = data_tot + parm_tot;
643 if (buf_len > SMB_MAX_PACKET_SIZE)
644 goto out_too_long;
645
646 req->rq_trans2bufsize = buf_len;
647 req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
648 if (!req->rq_trans2buffer)
649 goto out_no_mem;
650
651 req->rq_parm = req->rq_trans2buffer;
652 req->rq_data = req->rq_trans2buffer + parm_tot;
653 } else if (unlikely(req->rq_total_data < data_tot ||
654 req->rq_total_parm < parm_tot))
655 goto out_data_grew;
656
657 if (unlikely(parm_disp + parm_count > req->rq_total_parm ||
658 parm_offset + parm_count > req->rq_rlen))
659 goto out_bad_parm;
660 if (unlikely(data_disp + data_count > req->rq_total_data ||
661 data_offset + data_count > req->rq_rlen))
662 goto out_bad_data;
663
664 inbuf = req->rq_buffer;
665 memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count);
666 memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count);
667
668 req->rq_ldata += data_count;
669 req->rq_lparm += parm_count;
670
671 /*
672 * Check whether we've received all of the data. Note that
673 * we use the packet totals -- total lengths might shrink!
674 */
675 if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) {
676 req->rq_ldata = data_tot;
677 req->rq_lparm = parm_tot;
678 return 0;
679 }
680 return 1;
681
682out_too_long:
683 printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n",
684 data_tot, parm_tot);
685 goto out_EIO;
686out_no_mem:
687 printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n",
688 req->rq_trans2bufsize);
689 req->rq_errno = -ENOMEM;
690 goto out;
691out_data_grew:
692 printk(KERN_ERR "smb_trans2: data/params grew!\n");
693 goto out_EIO;
694out_bad_parm:
695 printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
696 parm_disp, parm_count, parm_tot, parm_offset);
697 goto out_EIO;
698out_bad_data:
699 printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
700 data_disp, data_count, data_tot, data_offset);
701out_EIO:
702 req->rq_errno = -EIO;
703out:
704 return req->rq_errno;
705}
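
smb_recv_trans2() reassembles fragmented responses by allocating a buffer
sized from the advertised totals and copying each fragment in at its
displacement, bounds-checking every copy against both the total and the
received packet length. A stripped-down sketch of that pattern, with
illustrative names rather than the kernel code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct reasm {
	unsigned char *buf;
	unsigned int total;     /* advertised total length  */
	unsigned int received;  /* bytes accumulated so far */
};

/* Returns 0 when complete, 1 when more fragments are expected, -1 on error. */
static int add_fragment(struct reasm *r, const unsigned char *pkt,
			unsigned int pkt_len, unsigned int disp,
			unsigned int off, unsigned int count)
{
	if (disp + count > r->total || off + count > pkt_len)
		return -1;                   /* would overflow: -EIO case */
	memcpy(r->buf + disp, pkt + off, count);
	r->received += count;
	return r->received >= r->total ? 0 : 1;
}

int main(void)
{
	struct reasm r = { malloc(10), 10, 0 };
	int more;

	if (!r.buf)
		return 1;
	more = add_fragment(&r, (const unsigned char *)"hello", 5, 0, 0, 5);
	printf("more=%d\n", more);
	more = add_fragment(&r, (const unsigned char *)"world", 5, 5, 0, 5);
	printf("more=%d buf=%.10s\n", more, (char *)r.buf);
	free(r.buf);
	return 0;
}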
706
707/*
708 * State machine for receiving responses. We handle the fact that we can't
709 * read the full response in one try by having states telling us how much we
710 * have read.
711 *
712 * Must be called with the server lock held (only called from smbiod).
713 *
714 * Return: <0 on error
715 */
716int smb_request_recv(struct smb_sb_info *server)
717{
718 struct smb_request *req = NULL;
719 int result = 0;
720
721 if (smb_recv_available(server) <= 0)
722 return 0;
723
724 VERBOSE("state: %d\n", server->rstate);
725 switch (server->rstate) {
726 case SMB_RECV_DROP:
727 result = smb_receive_drop(server);
728 if (result < 0)
729 break;
730 if (server->rstate == SMB_RECV_DROP)
731 break;
732 server->rstate = SMB_RECV_START;
733 /* fallthrough */
734 case SMB_RECV_START:
735 server->smb_read = 0;
736 server->rstate = SMB_RECV_HEADER;
737 /* fallthrough */
738 case SMB_RECV_HEADER:
739 result = smb_receive_header(server);
740 if (result < 0)
741 break;
742 if (server->rstate == SMB_RECV_HEADER)
743 break;
744 if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) {
745 server->rstate = SMB_RECV_REQUEST;
746 break;
747 }
748 if (server->rstate != SMB_RECV_HCOMPLETE)
749 break;
750 /* fallthrough */
751 case SMB_RECV_HCOMPLETE:
752 req = find_request(server, WVAL(server->header, smb_mid));
753 if (!req)
754 break;
755 smb_init_request(server, req);
756 req->rq_rcls = *(req->rq_header + smb_rcls);
757 req->rq_err = WVAL(req->rq_header, smb_err);
758 if (server->rstate != SMB_RECV_PARAM)
759 break;
760 /* fallthrough */
761 case SMB_RECV_PARAM:
762 if (!req)
763 req = find_request(server,WVAL(server->header,smb_mid));
764 if (!req)
765 break;
766 result = smb_recv_param(server, req);
767 if (result < 0)
768 break;
769 if (server->rstate != SMB_RECV_DATA)
770 break;
771 /* fallthrough */
772 case SMB_RECV_DATA:
773 if (!req)
774 req = find_request(server,WVAL(server->header,smb_mid));
775 if (!req)
776 break;
777 result = smb_recv_data(server, req);
778 if (result < 0)
779 break;
780 break;
781
782 /* We should never be called with any of these states */
783 case SMB_RECV_END:
784 case SMB_RECV_REQUEST:
785 BUG();
786 }
787
788 if (result < 0) {
789 /* We saw an error */
790 return result;
791 }
792
793 if (server->rstate != SMB_RECV_END)
794 return 0;
795
796 result = 0;
797 if (req->rq_trans2_command && req->rq_rcls == SUCCESS)
798 result = smb_recv_trans2(server, req);
799
800 /*
801 * Response completely read. Drop any extra bytes sent by the server.
802 * (Yes, servers sometimes add extra bytes to responses)
803 */
804 VERBOSE("smb_len: %d smb_read: %d\n",
805 server->smb_len, server->smb_read);
806 if (server->smb_read < server->smb_len)
807 smb_receive_drop(server);
808
809 server->rstate = SMB_RECV_START;
810
811 if (!result) {
812 list_del_init(&req->rq_queue);
813 req->rq_flags |= SMB_REQ_RECEIVED;
814 smb_rput(req);
815 wake_up_interruptible(&req->rq_wait);
816 }
817 return 0;
818}
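
The receiver above is a fall-through state machine: each case consumes
whatever has arrived and drops into the next state once enough bytes are in,
so a clean single-packet reply walks DROP/START -> HEADER -> HCOMPLETE ->
PARAM -> DATA -> END across one or more smbiod invocations. A minimal
illustration of the idiom:

#include <stdio.h>

enum rstate { RECV_START, RECV_HEADER, RECV_PARAM, RECV_DATA, RECV_END };

int main(void)
{
	enum rstate s = RECV_START;

	switch (s) {
	case RECV_START:
		puts("reset counters");
		/* fallthrough */
	case RECV_HEADER:
		puts("read smb header");
		/* fallthrough */
	case RECV_PARAM:
		puts("read parameter words");
		/* fallthrough */
	case RECV_DATA:
		puts("read data area");
		break;
	case RECV_END:
		break;
	}
	return 0;
}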
diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h
deleted file mode 100644
index efb21451e7c9..000000000000
--- a/fs/smbfs/request.h
+++ /dev/null
@@ -1,70 +0,0 @@
1#include <linux/list.h>
2#include <linux/types.h>
3#include <linux/uio.h>
4#include <linux/wait.h>
5
6struct smb_request {
7 struct list_head rq_queue; /* recvq or xmitq for the server */
8
9 atomic_t rq_count;
10
11 wait_queue_head_t rq_wait;
12 int rq_flags;
13 int rq_mid; /* multiplex ID, set by request.c */
14
15 struct smb_sb_info *rq_server;
16
17 /* header + word count + parameter words + byte count */
18 unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2];
19
20 int rq_bufsize;
21 unsigned char *rq_buffer;
22
23 /* FIXME: this is not good enough for merging IO requests. */
24 unsigned char *rq_page;
25 int rq_rsize;
26
27 int rq_resp_wct;
28 int rq_resp_bcc;
29
30 int rq_rlen;
31 int rq_bytes_recvd;
32
33 int rq_slen;
34 int rq_bytes_sent;
35
36 int rq_iovlen;
37 struct kvec rq_iov[4];
38
39 int (*rq_setup_read) (struct smb_request *);
40 void (*rq_callback) (struct smb_request *);
41
42 /* ------ trans2 stuff ------ */
43
44 u16 rq_trans2_command; /* 0 if not a trans2 request */
45 unsigned int rq_ldata;
46 unsigned char *rq_data;
47 unsigned int rq_lparm;
48 unsigned char *rq_parm;
49
50 int rq_fragment;
51 u32 rq_total_data;
52 u32 rq_total_parm;
53 int rq_trans2bufsize;
54 unsigned char *rq_trans2buffer;
55
56 /* ------ response ------ */
57
58 unsigned short rq_rcls;
59 unsigned short rq_err;
60 int rq_errno;
61};
62
63#define SMB_REQ_STATIC 0x0001 /* rq_buffer is static */
64#define SMB_REQ_NORETRY 0x0002 /* request is invalid after retry */
65
66#define SMB_REQ_TRANSMITTED 0x4000 /* all data has been sent */
67#define SMB_REQ_RECEIVED 0x8000 /* reply received, smbiod is done */
68
69#define xSMB_REQ_NOREPLY 0x0004 /* we don't want the reply (if any) */
70#define xSMB_REQ_NORECEIVER 0x0008 /* caller doesn't wait for response */
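
The rq_iov/rq_iovlen pair above lets one request describe its header and payload as separate buffers that go out in a single send. The same gather-write idea in user space, using writev(); the 4-byte length framing mimics the NetBIOS header and the function name is invented:

#include <sys/uio.h>
#include <unistd.h>

/*
 * Gather-write: ship a 4-byte length header and the payload in one
 * writev() call, with no intermediate copy into a contiguous buffer.
 */
static ssize_t send_framed(int fd, const void *payload, size_t len)
{
	unsigned char hdr[4] = {
		0, (unsigned char)(len >> 16),
		(unsigned char)(len >> 8), (unsigned char)len
	};
	struct iovec iov[2] = {
		{ .iov_base = hdr,             .iov_len = sizeof(hdr) },
		{ .iov_base = (void *)payload, .iov_len = len },
	};

	return writev(fd, iov, 2);
}
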
diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h
deleted file mode 100644
index fc4b1a5dd755..000000000000
--- a/fs/smbfs/smb_debug.h
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * Defines some debug macros for smbfs.
3 */
4
5/* This makes a dentry parent/child name pair. Useful for debugging printk's */
6#define DENTRY_PATH(dentry) \
7 (dentry)->d_parent->d_name.name,(dentry)->d_name.name
8
9/*
10 * Safety checks for conditions that should never happen.
11 * These are normally enabled.
12 */
13#ifdef SMBFS_PARANOIA
14# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a)
15#else
16# define PARANOIA(f, a...) do { ; } while(0)
17#endif
18
19/* lots of debug messages */
20#ifdef SMBFS_DEBUG_VERBOSE
21# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
22#else
23# define VERBOSE(f, a...) do { ; } while(0)
24#endif
25
26/*
27 * "normal" debug messages, but not with a normal DEBUG define ... way
28 * too common name.
29 */
30#ifdef SMBFS_DEBUG
31#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
32#else
33#define DEBUG1(f, a...) do { ; } while(0)
34#endif
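
All three macro families above compile to empty statements unless the matching SMBFS_* symbol is defined, so the debug call sites cost nothing in normal builds. A generic user-space version of the pattern (MYDEBUG and DPRINT are placeholder names; ## __VA_ARGS__ is the same GNU extension the kernel's "f, a..." form relies on):

#include <stdio.h>

/* Define MYDEBUG at build time (-DMYDEBUG) to enable the output. */
#ifdef MYDEBUG
# define DPRINT(fmt, ...) \
	fprintf(stderr, "%s: " fmt, __func__, ##__VA_ARGS__)
#else
# define DPRINT(fmt, ...) do { } while (0)
#endif

int main(void)
{
	DPRINT("starting, value=%d\n", 42);	/* vanishes without MYDEBUG */
	return 0;
}
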
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
deleted file mode 100644
index 0e39a924f10a..000000000000
--- a/fs/smbfs/smbiod.c
+++ /dev/null
@@ -1,344 +0,0 @@
1/*
2 * smbiod.c
3 *
4 * Copyright (C) 2000, Charles Loep / Corel Corp.
5 * Copyright (C) 2001, Urban Widmark
6 */
7
8
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/mm.h>
12#include <linux/string.h>
13#include <linux/stat.h>
14#include <linux/errno.h>
15#include <linux/init.h>
16#include <linux/file.h>
17#include <linux/dcache.h>
18#include <linux/module.h>
19#include <linux/net.h>
20#include <linux/kthread.h>
21#include <net/ip.h>
22
23#include <linux/smb_fs.h>
24#include <linux/smbno.h>
25#include <linux/smb_mount.h>
26
27#include <asm/system.h>
28#include <asm/uaccess.h>
29
30#include "smb_debug.h"
31#include "request.h"
32#include "proto.h"
33
34enum smbiod_state {
35 SMBIOD_DEAD,
36 SMBIOD_STARTING,
37 SMBIOD_RUNNING,
38};
39
40static enum smbiod_state smbiod_state = SMBIOD_DEAD;
41static struct task_struct *smbiod_thread;
42static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
43static LIST_HEAD(smb_servers);
44static DEFINE_SPINLOCK(servers_lock);
45
46#define SMBIOD_DATA_READY (1<<0)
47static unsigned long smbiod_flags;
48
49static int smbiod(void *);
50static int smbiod_start(void);
51
52/*
53 * called when there's work for us to do
54 */
55void smbiod_wake_up(void)
56{
57 if (smbiod_state == SMBIOD_DEAD)
58 return;
59 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
60 wake_up_interruptible(&smbiod_wait);
61}
62
63/*
64 * start smbiod if none is running
65 */
66static int smbiod_start(void)
67{
68 struct task_struct *tsk;
69 int err = 0;
70
71 if (smbiod_state != SMBIOD_DEAD)
72 return 0;
73 smbiod_state = SMBIOD_STARTING;
74 __module_get(THIS_MODULE);
75 spin_unlock(&servers_lock);
76 tsk = kthread_run(smbiod, NULL, "smbiod");
77 if (IS_ERR(tsk)) {
78 err = PTR_ERR(tsk);
79 module_put(THIS_MODULE);
80 }
81
82 spin_lock(&servers_lock);
83 if (err < 0) {
84 smbiod_state = SMBIOD_DEAD;
85 smbiod_thread = NULL;
86 } else {
87 smbiod_state = SMBIOD_RUNNING;
88 smbiod_thread = tsk;
89 }
90 return err;
91}
92
93/*
94 * register a server & start smbiod if necessary
95 */
96int smbiod_register_server(struct smb_sb_info *server)
97{
98 int ret;
99 spin_lock(&servers_lock);
100 list_add(&server->entry, &smb_servers);
101 VERBOSE("%p\n", server);
102 ret = smbiod_start();
103 spin_unlock(&servers_lock);
104 return ret;
105}
106
107/*
108 * Unregister a server
109 * Must be called with the server lock held.
110 */
111void smbiod_unregister_server(struct smb_sb_info *server)
112{
113 spin_lock(&servers_lock);
114 list_del_init(&server->entry);
115 VERBOSE("%p\n", server);
116 spin_unlock(&servers_lock);
117
118 smbiod_wake_up();
119 smbiod_flush(server);
120}
121
122void smbiod_flush(struct smb_sb_info *server)
123{
124 struct list_head *tmp, *n;
125 struct smb_request *req;
126
127 list_for_each_safe(tmp, n, &server->xmitq) {
128 req = list_entry(tmp, struct smb_request, rq_queue);
129 req->rq_errno = -EIO;
130 list_del_init(&req->rq_queue);
131 smb_rput(req);
132 wake_up_interruptible(&req->rq_wait);
133 }
134 list_for_each_safe(tmp, n, &server->recvq) {
135 req = list_entry(tmp, struct smb_request, rq_queue);
136 req->rq_errno = -EIO;
137 list_del_init(&req->rq_queue);
138 smb_rput(req);
139 wake_up_interruptible(&req->rq_wait);
140 }
141}
142
143/*
144 * Wake up smbmount and make it reconnect to the server.
145 * This must be called with the server locked.
146 *
147 * FIXME: add smbconnect version to this
148 */
149int smbiod_retry(struct smb_sb_info *server)
150{
151 struct list_head *head;
152 struct smb_request *req;
153 struct pid *pid = get_pid(server->conn_pid);
154 int result = 0;
155
156 VERBOSE("state: %d\n", server->state);
157 if (server->state == CONN_VALID || server->state == CONN_RETRYING)
158 goto out;
159
160 smb_invalidate_inodes(server);
161
162 /*
163 * Some requests are meaningless after a retry, so we abort them.
164 * One example is all requests using 'fileid', since the files are
165 * closed on retry.
166 */
167 head = server->xmitq.next;
168 while (head != &server->xmitq) {
169 req = list_entry(head, struct smb_request, rq_queue);
170 head = head->next;
171
172 req->rq_bytes_sent = 0;
173 if (req->rq_flags & SMB_REQ_NORETRY) {
174 VERBOSE("aborting request %p on xmitq\n", req);
175 req->rq_errno = -EIO;
176 list_del_init(&req->rq_queue);
177 smb_rput(req);
178 wake_up_interruptible(&req->rq_wait);
179 }
180 }
181
182 /*
183 * FIXME: test the code for retrying request we already sent
184 */
185 head = server->recvq.next;
186 while (head != &server->recvq) {
187 req = list_entry(head, struct smb_request, rq_queue);
188 head = head->next;
189#if 0
190 if (req->rq_flags & SMB_REQ_RETRY) {
191 /* must move the request to the xmitq */
192 VERBOSE("retrying request %p on recvq\n", req);
193 list_move(&req->rq_queue, &server->xmitq);
194 continue;
195 }
196#endif
197
198 VERBOSE("aborting request %p on recvq\n", req);
199 /* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */
200 req->rq_errno = -EIO;
201 list_del_init(&req->rq_queue);
202 smb_rput(req);
203 wake_up_interruptible(&req->rq_wait);
204 }
205
206 smb_close_socket(server);
207
208 if (!pid) {
209 /* FIXME: this is fatal, umount? */
210 printk(KERN_ERR "smb_retry: no connection process\n");
211 server->state = CONN_RETRIED;
212 goto out;
213 }
214
215 /*
216 * Change state so that only one retry per server will be started.
217 */
218 server->state = CONN_RETRYING;
219
220 /*
221 * Note: use the "priv" flag, as a user process may need to reconnect.
222 */
223 result = kill_pid(pid, SIGUSR1, 1);
224 if (result) {
225 /* FIXME: this is most likely fatal, umount? */
226 printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
227 goto out;
228 }
229 VERBOSE("signalled pid %d\n", pid_nr(pid));
230
231 /* FIXME: The retried requests should perhaps get a "time boost". */
232
233out:
234 put_pid(pid);
235 return result;
236}
237
238/*
239 * Currently handles lockingX packets.
240 */
241static void smbiod_handle_request(struct smb_sb_info *server)
242{
243 PARANOIA("smbiod got a request ... and we don't implement oplocks!\n");
244 server->rstate = SMB_RECV_DROP;
245}
246
247/*
248 * Do some IO for one server.
249 */
250static void smbiod_doio(struct smb_sb_info *server)
251{
252 int result;
253 int maxwork = 7;
254
255 if (server->state != CONN_VALID)
256 goto out;
257
258 do {
259 result = smb_request_recv(server);
260 if (result < 0) {
261 server->state = CONN_INVALID;
262 smbiod_retry(server);
263 goto out; /* reconnecting is slow */
264 } else if (server->rstate == SMB_RECV_REQUEST)
265 smbiod_handle_request(server);
266 } while (result > 0 && maxwork-- > 0);
267
268 /*
269 * If there is more to read, we want to be sure to wake up again.
270 */
271 if (server->state != CONN_VALID)
272 goto out;
273 if (smb_recv_available(server) > 0)
274 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
275
276 do {
277 result = smb_request_send_server(server);
278 if (result < 0) {
279 server->state = CONN_INVALID;
280 smbiod_retry(server);
281 goto out; /* reconnecting is slow */
282 }
283 } while (result > 0);
284
285 /*
286 * If the last request was not sent out, we want to wake up again.
287 */
288 if (!list_empty(&server->xmitq))
289 set_bit(SMBIOD_DATA_READY, &smbiod_flags);
290
291out:
292 return;
293}
294
295/*
296 * smbiod kernel thread
297 */
298static int smbiod(void *unused)
299{
300 VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
301
302 for (;;) {
303 struct smb_sb_info *server;
304 struct list_head *pos, *n;
305
306 /* FIXME: Use poll? */
307 wait_event_interruptible(smbiod_wait,
308 test_bit(SMBIOD_DATA_READY, &smbiod_flags));
309 if (signal_pending(current)) {
310 spin_lock(&servers_lock);
311 smbiod_state = SMBIOD_DEAD;
312 spin_unlock(&servers_lock);
313 break;
314 }
315
316 clear_bit(SMBIOD_DATA_READY, &smbiod_flags);
317
318 spin_lock(&servers_lock);
319 if (list_empty(&smb_servers)) {
320 smbiod_state = SMBIOD_DEAD;
321 spin_unlock(&servers_lock);
322 break;
323 }
324
325 list_for_each_safe(pos, n, &smb_servers) {
326 server = list_entry(pos, struct smb_sb_info, entry);
327 VERBOSE("checking server %p\n", server);
328
329 if (server->state == CONN_VALID) {
330 spin_unlock(&servers_lock);
331
332 smb_lock_server(server);
333 smbiod_doio(server);
334 smb_unlock_server(server);
335
336 spin_lock(&servers_lock);
337 }
338 }
339 spin_unlock(&servers_lock);
340 }
341
342 VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid);
343 module_put_and_exit(0);
344}
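
smbiod above sleeps until SMBIOD_DATA_READY is set, clears the bit, and then makes one pass over every registered server, so any number of wakeups arriving while it is busy collapse into a single extra pass. A hedged user-space analogue of that flag-and-wait shape, using a mutex and condition variable:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool data_ready;

/* Called from producers: cheap, idempotent wakeup. */
void wake_worker(void)
{
	pthread_mutex_lock(&lock);
	data_ready = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

/* Worker loop: sleep until flagged, clear the flag, do one pass. */
void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		while (!data_ready)
			pthread_cond_wait(&cond, &lock);
		data_ready = false;	/* later wakeups set it again */
		pthread_mutex_unlock(&lock);
		/* ... service all servers here ... */
	}
	return NULL;
}
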
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
deleted file mode 100644
index e37fe4deebd0..000000000000
--- a/fs/smbfs/sock.c
+++ /dev/null
@@ -1,386 +0,0 @@
1/*
2 * sock.c
3 *
4 * Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
5 * Copyright (C) 1997 by Volker Lendecke
6 *
7 * Please add a note about your changes to smbfs in the ChangeLog file.
8 */
9
10#include <linux/fs.h>
11#include <linux/time.h>
12#include <linux/errno.h>
13#include <linux/socket.h>
14#include <linux/fcntl.h>
15#include <linux/file.h>
16#include <linux/in.h>
17#include <linux/net.h>
18#include <linux/mm.h>
19#include <linux/netdevice.h>
20#include <linux/workqueue.h>
21#include <net/scm.h>
22#include <net/tcp_states.h>
23#include <net/ip.h>
24
25#include <linux/smb_fs.h>
26#include <linux/smb.h>
27#include <linux/smbno.h>
28
29#include <asm/uaccess.h>
30#include <asm/ioctls.h>
31
32#include "smb_debug.h"
33#include "proto.h"
34#include "request.h"
35
36
37static int
38_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags)
39{
40 struct kvec iov = {ubuf, size};
41 struct msghdr msg = {.msg_flags = flags};
42 msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL;
43 return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags);
44}
45
46/*
47 * Return the server this socket belongs to
48 */
49static struct smb_sb_info *
50server_from_socket(struct socket *socket)
51{
52 return socket->sk->sk_user_data;
53}
54
55/*
56 * Called when there is data on the socket.
57 */
58void
59smb_data_ready(struct sock *sk, int len)
60{
61 struct smb_sb_info *server = server_from_socket(sk->sk_socket);
62 void (*data_ready)(struct sock *, int) = server->data_ready;
63
64 data_ready(sk, len);
65 VERBOSE("(%p, %d)\n", sk, len);
66 smbiod_wake_up();
67}
68
69int
70smb_valid_socket(struct inode * inode)
71{
72 return (inode && S_ISSOCK(inode->i_mode) &&
73 SOCKET_I(inode)->type == SOCK_STREAM);
74}
75
76static struct socket *
77server_sock(struct smb_sb_info *server)
78{
79 struct file *file;
80
81 if (server && (file = server->sock_file))
82 {
83#ifdef SMBFS_PARANOIA
84 if (!smb_valid_socket(file->f_path.dentry->d_inode))
85 PARANOIA("bad socket!\n");
86#endif
87 return SOCKET_I(file->f_path.dentry->d_inode);
88 }
89 return NULL;
90}
91
92void
93smb_close_socket(struct smb_sb_info *server)
94{
95 struct file * file = server->sock_file;
96
97 if (file) {
98 struct socket *sock = server_sock(server);
99
100 VERBOSE("closing socket %p\n", sock);
101 sock->sk->sk_data_ready = server->data_ready;
102 server->sock_file = NULL;
103 fput(file);
104 }
105}
106
107static int
108smb_get_length(struct socket *socket, unsigned char *header)
109{
110 int result;
111
112 result = _recvfrom(socket, header, 4, MSG_PEEK);
113 if (result == -EAGAIN)
114 return -ENODATA;
115 if (result < 0) {
116 PARANOIA("recv error = %d\n", -result);
117 return result;
118 }
119 if (result < 4)
120 return -ENODATA;
121
122 switch (header[0]) {
123 case 0x00:
124 case 0x82:
125 break;
126
127 case 0x85:
128 DEBUG1("Got SESSION KEEP ALIVE\n");
129 _recvfrom(socket, header, 4, 0); /* read away */
130 return -ENODATA;
131
132 default:
133 PARANOIA("Invalid NBT packet, code=%x\n", header[0]);
134 return -EIO;
135 }
136
137 /* The length in the RFC NB header is the raw data length */
138 return smb_len(header);
139}
140
141int
142smb_recv_available(struct smb_sb_info *server)
143{
144 mm_segment_t oldfs;
145 int avail, err;
146 struct socket *sock = server_sock(server);
147
148 oldfs = get_fs();
149 set_fs(get_ds());
150 err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail);
151 set_fs(oldfs);
152 return (err >= 0) ? avail : err;
153}
154
155/*
156 * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc)
157 */
158static int
159smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount)
160{
161 struct kvec *iv = *data;
162 int i;
163 int len;
164
165 /*
166 * Eat any sent kvecs
167 */
168 while (iv->iov_len <= amount) {
169 amount -= iv->iov_len;
170 iv++;
171 (*num)--;
172 }
173
174 /*
175 * And chew down the partial one
176 */
177 vec[0].iov_len = iv->iov_len-amount;
178 vec[0].iov_base =((unsigned char *)iv->iov_base)+amount;
179 iv++;
180
181 len = vec[0].iov_len;
182
183 /*
184 * And copy any others
185 */
186 for (i = 1; i < *num; i++) {
187 vec[i] = *iv++;
188 len += vec[i].iov_len;
189 }
190
191 *data = vec;
192 return len;
193}
194
195/*
196 * smb_receive_header
197 * Only called by the smbiod thread.
198 */
199int
200smb_receive_header(struct smb_sb_info *server)
201{
202 struct socket *sock;
203 int result = 0;
204 unsigned char peek_buf[4];
205
206 result = -EIO;
207 sock = server_sock(server);
208 if (!sock)
209 goto out;
210 if (sock->sk->sk_state != TCP_ESTABLISHED)
211 goto out;
212
213 if (!server->smb_read) {
214 result = smb_get_length(sock, peek_buf);
215 if (result < 0) {
216 if (result == -ENODATA)
217 result = 0;
218 goto out;
219 }
220 server->smb_len = result + 4;
221
222 if (server->smb_len < SMB_HEADER_LEN) {
223 PARANOIA("short packet: %d\n", result);
224 server->rstate = SMB_RECV_DROP;
225 result = -EIO;
226 goto out;
227 }
228 if (server->smb_len > SMB_MAX_PACKET_SIZE) {
229 PARANOIA("long packet: %d\n", result);
230 server->rstate = SMB_RECV_DROP;
231 result = -EIO;
232 goto out;
233 }
234 }
235
236 result = _recvfrom(sock, server->header + server->smb_read,
237 SMB_HEADER_LEN - server->smb_read, 0);
238 VERBOSE("_recvfrom: %d\n", result);
239 if (result < 0) {
240 VERBOSE("receive error: %d\n", result);
241 goto out;
242 }
243 server->smb_read += result;
244
245 if (server->smb_read == SMB_HEADER_LEN)
246 server->rstate = SMB_RECV_HCOMPLETE;
247out:
248 return result;
249}
250
251static char drop_buffer[PAGE_SIZE];
252
253/*
254 * smb_receive_drop - read and throw away the data
255 * Only called by the smbiod thread.
256 *
257 * FIXME: we are in the kernel, could we just tell the socket that we want
258 * to drop stuff from the buffer?
259 */
260int
261smb_receive_drop(struct smb_sb_info *server)
262{
263 struct socket *sock;
264 unsigned int flags;
265 struct kvec iov;
266 struct msghdr msg;
267 int rlen = smb_len(server->header) - server->smb_read + 4;
268 int result = -EIO;
269
270 if (rlen > PAGE_SIZE)
271 rlen = PAGE_SIZE;
272
273 sock = server_sock(server);
274 if (!sock)
275 goto out;
276 if (sock->sk->sk_state != TCP_ESTABLISHED)
277 goto out;
278
279 flags = MSG_DONTWAIT | MSG_NOSIGNAL;
280 iov.iov_base = drop_buffer;
281 iov.iov_len = PAGE_SIZE;
282 msg.msg_flags = flags;
283 msg.msg_name = NULL;
284 msg.msg_namelen = 0;
285 msg.msg_control = NULL;
286
287 result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags);
288
289 VERBOSE("read: %d\n", result);
290 if (result < 0) {
291 VERBOSE("receive error: %d\n", result);
292 goto out;
293 }
294 server->smb_read += result;
295
296 if (server->smb_read >= server->smb_len)
297 server->rstate = SMB_RECV_END;
298
299out:
300 return result;
301}
302
303/*
304 * smb_receive
305 * Only called by the smbiod thread.
306 */
307int
308smb_receive(struct smb_sb_info *server, struct smb_request *req)
309{
310 struct socket *sock;
311 unsigned int flags;
312 struct kvec iov[4];
313 struct kvec *p = req->rq_iov;
314 size_t num = req->rq_iovlen;
315 struct msghdr msg;
316 int rlen;
317 int result = -EIO;
318
319 sock = server_sock(server);
320 if (!sock)
321 goto out;
322 if (sock->sk->sk_state != TCP_ESTABLISHED)
323 goto out;
324
325 flags = MSG_DONTWAIT | MSG_NOSIGNAL;
326 msg.msg_flags = flags;
327 msg.msg_name = NULL;
328 msg.msg_namelen = 0;
329 msg.msg_control = NULL;
330
331 /* Don't repeat bytes and count available buffer space */
332 rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd),
333 (req->rq_rlen - req->rq_bytes_recvd));
334
335 result = kernel_recvmsg(sock, &msg, p, num, rlen, flags);
336
337 VERBOSE("read: %d\n", result);
338 if (result < 0) {
339 VERBOSE("receive error: %d\n", result);
340 goto out;
341 }
342 req->rq_bytes_recvd += result;
343 server->smb_read += result;
344
345out:
346 return result;
347}
348
349/*
350 * Try to send an SMB request. This may return after sending only part of the
351 * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent.
352 *
353 * Parts of this were taken from xprt_sendmsg in net/sunrpc/xprt.c
354 */
355int
356smb_send_request(struct smb_request *req)
357{
358 struct smb_sb_info *server = req->rq_server;
359 struct socket *sock;
360 struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
361 int slen = req->rq_slen - req->rq_bytes_sent;
362 int result = -EIO;
363 struct kvec iov[4];
364 struct kvec *p = req->rq_iov;
365 size_t num = req->rq_iovlen;
366
367 sock = server_sock(server);
368 if (!sock)
369 goto out;
370 if (sock->sk->sk_state != TCP_ESTABLISHED)
371 goto out;
372
373 /* Don't repeat bytes */
374 if (req->rq_bytes_sent)
375 smb_move_iov(&p, &num, iov, req->rq_bytes_sent);
376
377 result = kernel_sendmsg(sock, &msg, p, num, slen);
378
379 if (result >= 0) {
380 req->rq_bytes_sent += result;
381 if (req->rq_bytes_sent >= req->rq_slen)
382 req->rq_flags |= SMB_REQ_TRANSMITTED;
383 }
384out:
385 return result;
386}
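
smb_move_iov() above is what lets smb_send_request() and smb_receive() resume after a short sendmsg()/recvmsg(): it rebuilds the vector to skip the bytes already transferred. A user-space sketch of the same advance; unlike the kernel helper, which copies into a scratch array to keep the request's kvecs intact, this version rewrites the vector in place and assumes done is less than the vector's total length:

#include <sys/uio.h>
#include <stddef.h>

/* Skip 'done' bytes of iov[0..*niov), rewriting in place. */
static struct iovec *iov_advance(struct iovec *iov, int *niov, size_t done)
{
	while (done >= iov->iov_len) {	/* drop fully-sent buffers */
		done -= iov->iov_len;
		iov++;
		(*niov)--;
	}
	/* trim the partially-sent one */
	iov->iov_base = (char *)iov->iov_base + done;
	iov->iov_len -= done;
	return iov;
}
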
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
deleted file mode 100644
index 00b2909bd469..000000000000
--- a/fs/smbfs/symlink.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * symlink.c
3 *
4 * Copyright (C) 2002 by John Newbigin
5 *
6 * Please add a note about your changes to smbfs in the ChangeLog file.
7 */
8
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/fcntl.h>
12#include <linux/stat.h>
13#include <linux/mm.h>
14#include <linux/slab.h>
15#include <linux/pagemap.h>
16#include <linux/net.h>
17#include <linux/namei.h>
18
19#include <asm/uaccess.h>
20#include <asm/system.h>
21
22#include <linux/smbno.h>
23#include <linux/smb_fs.h>
24
25#include "smb_debug.h"
26#include "proto.h"
27
28int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname)
29{
30 DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry));
31
32 return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname);
33}
34
35static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd)
36{
37 char *link = __getname();
38 DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry));
39
40 if (!link) {
41 link = ERR_PTR(-ENOMEM);
42 } else {
43 int len = smb_proc_read_link(server_from_dentry(dentry),
44 dentry, link, PATH_MAX - 1);
45 if (len < 0) {
46 __putname(link);
47 link = ERR_PTR(len);
48 } else {
49 link[len] = 0;
50 }
51 }
52 nd_set_link(nd, link);
53 return NULL;
54}
55
56static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
57{
58 char *s = nd_get_link(nd);
59 if (!IS_ERR(s))
60 __putname(s);
61}
62
63const struct inode_operations smb_link_inode_operations =
64{
65 .readlink = generic_readlink,
66 .follow_link = smb_follow_link,
67 .put_link = smb_put_link,
68};
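
smb_follow_link() above returns either a buffer or an error through the same pointer, using the kernel's ERR_PTR()/IS_ERR() encoding. A minimal user-space imitation of that convention; it relies, as the kernel does, on the top 4095 values of the address space never being valid pointers:

#include <errno.h>
#include <stdint.h>
#include <string.h>

#define MAX_ERRNO 4095

/* Error codes ride inside the pointer value itself. */
static inline void *err_ptr(long err) { return (void *)err; }
static inline long ptr_err(const void *p) { return (long)p; }
static inline int is_err(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

/* One return channel for both a buffer and an errno. */
static char *dup_target(const char *target)
{
	char *buf = target ? strdup(target) : NULL;

	if (!buf)
		return err_ptr(-ENOMEM);
	return buf;
}

Callers test the result with is_err() and recover the code with ptr_err(), which is exactly how follow_link hands the buffer to put_link above.
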
diff --git a/fs/splice.c b/fs/splice.c
index 8f1dfaecc8f0..50a5d978da16 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
682{ 682{
683 struct file *file = sd->u.file; 683 struct file *file = sd->u.file;
684 loff_t pos = sd->pos; 684 loff_t pos = sd->pos;
685 int ret, more; 685 int more;
686
687 ret = buf->ops->confirm(pipe, buf);
688 if (!ret) {
689 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
690 if (file->f_op && file->f_op->sendpage)
691 ret = file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
693 else
694 ret = -EINVAL;
695 }
696 686
697 return ret; 687 if (!likely(file->f_op && file->f_op->sendpage))
688 return -EINVAL;
689
690 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
691 return file->f_op->sendpage(file, buf->page, buf->offset,
692 sd->len, &pos, more);
698} 693}
699 694
700/* 695/*
@@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
727 void *fsdata; 722 void *fsdata;
728 int ret; 723 int ret;
729 724
730 /*
731 * make sure the data in this buffer is uptodate
732 */
733 ret = buf->ops->confirm(pipe, buf);
734 if (unlikely(ret))
735 return ret;
736
737 offset = sd->pos & ~PAGE_CACHE_MASK; 725 offset = sd->pos & ~PAGE_CACHE_MASK;
738 726
739 this_len = sd->len; 727 this_len = sd->len;
@@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
805 if (sd->len > sd->total_len) 793 if (sd->len > sd->total_len)
806 sd->len = sd->total_len; 794 sd->len = sd->total_len;
807 795
808 ret = actor(pipe, buf, sd); 796 ret = buf->ops->confirm(pipe, buf);
809 if (ret <= 0) { 797 if (unlikely(ret)) {
810 if (ret == -ENODATA) 798 if (ret == -ENODATA)
811 ret = 0; 799 ret = 0;
812 return ret; 800 return ret;
813 } 801 }
802
803 ret = actor(pipe, buf, sd);
804 if (ret <= 0)
805 return ret;
806
814 buf->offset += ret; 807 buf->offset += ret;
815 buf->len -= ret; 808 buf->len -= ret;
816 809
@@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1044 int ret; 1037 int ret;
1045 void *data; 1038 void *data;
1046 1039
1047 ret = buf->ops->confirm(pipe, buf);
1048 if (ret)
1049 return ret;
1050
1051 data = buf->ops->map(pipe, buf, 0); 1040 data = buf->ops->map(pipe, buf, 0);
1052 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); 1041 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1053 buf->ops->unmap(pipe, buf, data); 1042 buf->ops->unmap(pipe, buf, data);
@@ -1311,18 +1300,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1311static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1300static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1312 struct pipe_inode_info *opipe, 1301 struct pipe_inode_info *opipe,
1313 size_t len, unsigned int flags); 1302 size_t len, unsigned int flags);
1314/*
1315 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1316 * location, so checking ->i_pipe is not enough to verify that this is a
1317 * pipe.
1318 */
1319static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1320{
1321 if (S_ISFIFO(inode->i_mode))
1322 return inode->i_pipe;
1323
1324 return NULL;
1325}
1326 1303
1327/* 1304/*
1328 * Determine where to splice to/from. 1305 * Determine where to splice to/from.
@@ -1336,8 +1313,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1336 loff_t offset, *off; 1313 loff_t offset, *off;
1337 long ret; 1314 long ret;
1338 1315
1339 ipipe = pipe_info(in->f_path.dentry->d_inode); 1316 ipipe = get_pipe_info(in);
1340 opipe = pipe_info(out->f_path.dentry->d_inode); 1317 opipe = get_pipe_info(out);
1341 1318
1342 if (ipipe && opipe) { 1319 if (ipipe && opipe) {
1343 if (off_in || off_out) 1320 if (off_in || off_out)
@@ -1507,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1507 char *src; 1484 char *src;
1508 int ret; 1485 int ret;
1509 1486
1510 ret = buf->ops->confirm(pipe, buf);
1511 if (unlikely(ret))
1512 return ret;
1513
1514 /* 1487 /*
1515 * See if we can use the atomic maps, by prefaulting in the 1488 * See if we can use the atomic maps, by prefaulting in the
1516 * pages and doing an atomic copy 1489 * pages and doing an atomic copy
@@ -1555,7 +1528,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1555 int error; 1528 int error;
1556 long ret; 1529 long ret;
1557 1530
1558 pipe = pipe_info(file->f_path.dentry->d_inode); 1531 pipe = get_pipe_info(file);
1559 if (!pipe) 1532 if (!pipe)
1560 return -EBADF; 1533 return -EBADF;
1561 1534
@@ -1642,7 +1615,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1642 }; 1615 };
1643 long ret; 1616 long ret;
1644 1617
1645 pipe = pipe_info(file->f_path.dentry->d_inode); 1618 pipe = get_pipe_info(file);
1646 if (!pipe) 1619 if (!pipe)
1647 return -EBADF; 1620 return -EBADF;
1648 1621
@@ -2022,8 +1995,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
2022static long do_tee(struct file *in, struct file *out, size_t len, 1995static long do_tee(struct file *in, struct file *out, size_t len,
2023 unsigned int flags) 1996 unsigned int flags)
2024{ 1997{
2025 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 1998 struct pipe_inode_info *ipipe = get_pipe_info(in);
2026 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 1999 struct pipe_inode_info *opipe = get_pipe_info(out);
2027 int ret = -EINVAL; 2000 int ret = -EINVAL;
2028 2001
2029 /* 2002 /*
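
The splice.c hunks above all make the same move: the buf->ops->confirm() validity check is deleted from each actor (pipe_to_sendpage, pipe_to_file, write_pipe_buf, pipe_to_user) and performed once in splice_from_pipe_feed(), where -ENODATA is also translated to 0. A generic sketch of hoisting a shared precondition out of callbacks into the dispatcher (all names invented):

#include <errno.h>

struct buf;
typedef int (*actor_fn)(struct buf *b, void *ctx);

/*
 * The dispatcher validates once, then hands the buffer to the actor;
 * actors no longer need (and can no longer forget) their own check.
 */
static int feed_one(struct buf *b, int (*confirm)(struct buf *),
		    actor_fn actor, void *ctx)
{
	int ret = confirm(b);

	if (ret)
		return ret == -ENODATA ? 0 : ret;
	return actor(b, ctx);
}
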
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index e5f63da64d04..aa68a8a31518 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -29,7 +29,6 @@ config SQUASHFS
29config SQUASHFS_XATTR 29config SQUASHFS_XATTR
30 bool "Squashfs XATTR support" 30 bool "Squashfs XATTR support"
31 depends on SQUASHFS 31 depends on SQUASHFS
32 default n
33 help 32 help
34 Saying Y here includes support for extended attributes (xattrs). 33 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by 34 Xattrs are name:value pairs associated with inodes by
@@ -40,7 +39,6 @@ config SQUASHFS_XATTR
40config SQUASHFS_LZO 39config SQUASHFS_LZO
41 bool "Include support for LZO compressed file systems" 40 bool "Include support for LZO compressed file systems"
42 depends on SQUASHFS 41 depends on SQUASHFS
43 default n
44 select LZO_DECOMPRESS 42 select LZO_DECOMPRESS
45 help 43 help
46 Saying Y here includes support for reading Squashfs file systems 44 Saying Y here includes support for reading Squashfs file systems
@@ -53,10 +51,24 @@ config SQUASHFS_LZO
53 51
54 If unsure, say N. 52 If unsure, say N.
55 53
54config SQUASHFS_XZ
55 bool "Include support for XZ compressed file systems"
56 depends on SQUASHFS
57 select XZ_DEC
58 help
59 Saying Y here includes support for reading Squashfs file systems
60 compressed with XZ compression. XZ gives better compression than
61 the default zlib compression, at the expense of greater CPU and
62 memory overhead.
63
64 XZ is not the standard compression used in Squashfs and so most
65 file systems will be readable without selecting this option.
66
67 If unsure, say N.
68
56config SQUASHFS_EMBEDDED 69config SQUASHFS_EMBEDDED
57 bool "Additional option for memory-constrained systems" 70 bool "Additional option for memory-constrained systems"
58 depends on SQUASHFS 71 depends on SQUASHFS
59 default n
60 help 72 help
61 Saying Y here allows you to specify cache size. 73 Saying Y here allows you to specify cache size.
62 74
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7672bac8d328..cecf2bea07af 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o 8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o 9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
10squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 653c030eb840..8ab48bc2fa7d 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,7 +34,6 @@
34 34
35#include "squashfs_fs.h" 35#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
37#include "squashfs_fs_i.h"
38#include "squashfs.h" 37#include "squashfs.h"
39#include "decompressor.h" 38#include "decompressor.h"
40 39
@@ -64,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb,
64 *length = (unsigned char) bh->b_data[*offset] | 63 *length = (unsigned char) bh->b_data[*offset] |
65 (unsigned char) bh->b_data[*offset + 1] << 8; 64 (unsigned char) bh->b_data[*offset + 1] << 8;
66 *offset += 2; 65 *offset += 2;
66
67 if (*offset == msblk->devblksize) {
68 put_bh(bh);
69 bh = sb_bread(sb, ++(*cur_index));
70 if (bh == NULL)
71 return NULL;
72 *offset = 0;
73 }
67 } 74 }
68 75
69 return bh; 76 return bh;
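
The block.c addition above handles the case where the two little-endian length bytes end exactly at a device block boundary: once *offset reaches devblksize, the reader must release the current buffer and continue in the next block. A self-contained sketch of reading a possibly-straddling little-endian u16 (block I/O is stubbed with an in-memory array):

#include <stdint.h>

#define BLK_SIZE 4096

/* Fetch one byte; stand-in for sb_bread() plus b_data access. */
static uint8_t get_byte(const uint8_t blocks[][BLK_SIZE], int idx, int off)
{
	return blocks[idx][off];
}

/*
 * Read a little-endian u16 at (*idx, *off), stepping into the next
 * block whenever the offset runs off the end of the current one.
 */
static uint16_t read_le16(const uint8_t blocks[][BLK_SIZE],
			  int *idx, int *off)
{
	uint16_t v = get_byte(blocks, *idx, *off);

	if (++*off == BLK_SIZE) {	/* crossed into the next block */
		(*idx)++;
		*off = 0;
	}
	v |= (uint16_t)get_byte(blocks, *idx, *off) << 8;
	if (++*off == BLK_SIZE) {
		(*idx)++;
		*off = 0;
	}
	return v;
}
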
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 57314bee9059..26b15ae34d6f 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -55,7 +55,6 @@
55 55
56#include "squashfs_fs.h" 56#include "squashfs_fs.h"
57#include "squashfs_fs_sb.h" 57#include "squashfs_fs_sb.h"
58#include "squashfs_fs_i.h"
59#include "squashfs.h" 58#include "squashfs.h"
60 59
61/* 60/*
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 24af9ce9722f..a5940e54c4dd 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -27,7 +27,6 @@
27 27
28#include "squashfs_fs.h" 28#include "squashfs_fs.h"
29#include "squashfs_fs_sb.h" 29#include "squashfs_fs_sb.h"
30#include "squashfs_fs_i.h"
31#include "decompressor.h" 30#include "decompressor.h"
32#include "squashfs.h" 31#include "squashfs.h"
33 32
@@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
41}; 40};
42 41
43#ifndef CONFIG_SQUASHFS_LZO 42#ifndef CONFIG_SQUASHFS_LZO
44static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { 43static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
45 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
46}; 45};
47#endif 46#endif
48 47
48#ifndef CONFIG_SQUASHFS_XZ
49static const struct squashfs_decompressor squashfs_xz_comp_ops = {
50 NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
51};
52#endif
53
49static const struct squashfs_decompressor squashfs_unknown_comp_ops = { 54static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
50 NULL, NULL, NULL, 0, "unknown", 0 55 NULL, NULL, NULL, 0, "unknown", 0
51}; 56};
52 57
53static const struct squashfs_decompressor *decompressor[] = { 58static const struct squashfs_decompressor *decompressor[] = {
54 &squashfs_zlib_comp_ops, 59 &squashfs_zlib_comp_ops,
55 &squashfs_lzma_unsupported_comp_ops,
56#ifdef CONFIG_SQUASHFS_LZO
57 &squashfs_lzo_comp_ops, 60 &squashfs_lzo_comp_ops,
58#else 61 &squashfs_xz_comp_ops,
59 &squashfs_lzo_unsupported_comp_ops, 62 &squashfs_lzma_unsupported_comp_ops,
60#endif
61 &squashfs_unknown_comp_ops 63 &squashfs_unknown_comp_ops
62}; 64};
63 65
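
After this change each compressor exports one ops symbol regardless of the config: when support is compiled out, a local stub with NULL handlers takes the same name, so the decompressor[] table needs no #ifdefs and "not supported" is just an entry whose handlers are missing (the real table also carries an explicit supported flag). A condensed sketch of the pattern:

#include <stddef.h>
#include <stdio.h>

struct decompressor {
	int id;
	const char *name;
	int (*decompress)(const void *src, void *dst, size_t n);
};

/* Stub entry: same shape, no handler; in the real code the stub is
 * only compiled when support is configured out. */
static const struct decompressor lzo_ops = { 3, "lzo", NULL };
static const struct decompressor unknown_ops = { 0, "unknown", NULL };

static const struct decompressor *table[] = { &lzo_ops, &unknown_ops };

static const struct decompressor *lookup(int id)
{
	int i;

	for (i = 0; table[i] != &unknown_ops; i++)
		if (table[i]->id == id)
			break;
	return table[i];
}

int main(void)
{
	const struct decompressor *d = lookup(3);

	printf("%s supported: %s\n", d->name,
	       d->decompress ? "yes" : "no");
	return 0;
}
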
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 7425f80783f6..3b305a70f7aa 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, 52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
53 length, srclength, pages); 53 length, srclength, pages);
54} 54}
55
56#ifdef CONFIG_SQUASHFS_XZ
57extern const struct squashfs_decompressor squashfs_xz_comp_ops;
58#endif
59
60#ifdef CONFIG_SQUASHFS_LZO
61extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
62#endif
63
55#endif 64#endif
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 12b933ac6585..0dc340aa2be9 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -230,5 +230,6 @@ failed_read:
230 230
231const struct file_operations squashfs_dir_ops = { 231const struct file_operations squashfs_dir_ops = {
232 .read = generic_read_dir, 232 .read = generic_read_dir,
233 .readdir = squashfs_readdir 233 .readdir = squashfs_readdir,
234 .llseek = default_llseek,
234}; 235};
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7c90bbd6879d..7eef571443c6 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -39,7 +39,6 @@
39 39
40#include "squashfs_fs.h" 40#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h"
43#include "squashfs.h" 42#include "squashfs.h"
44 43
45/* 44/*
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index b7f64bcd2b70..d8f32452638e 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -37,7 +37,6 @@
37 37
38#include "squashfs_fs.h" 38#include "squashfs_fs.h"
39#include "squashfs_fs_sb.h" 39#include "squashfs_fs_sb.h"
40#include "squashfs_fs_i.h"
41#include "squashfs.h" 40#include "squashfs.h"
42 41
43/* 42/*
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 5d87789bf1c1..7da759e34c52 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 5d45569d5f72..ba729d808876 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -27,11 +27,6 @@
27 27
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) 28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
29 29
30static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
31{
32 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
33}
34
35/* block.c */ 30/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, 31extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int, int); 32 int, int);
@@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
104 99
105/* zlib_wrapper.c */ 100/* zlib_wrapper.c */
106extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 101extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
107
108/* lzo_wrapper.c */
109extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index c5137fc9ab11..39533feffd6d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -238,6 +238,7 @@ struct meta_index {
238#define ZLIB_COMPRESSION 1 238#define ZLIB_COMPRESSION 1
239#define LZMA_COMPRESSION 2 239#define LZMA_COMPRESSION 2
240#define LZO_COMPRESSION 3 240#define LZO_COMPRESSION 3
241#define XZ_COMPRESSION 4
241 242
242struct squashfs_super_block { 243struct squashfs_super_block {
243 __le32 s_magic; 244 __le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d3e3a37f28a1..359baefc01fc 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -45,4 +45,10 @@ struct squashfs_inode_info {
45 }; 45 };
46 struct inode vfs_inode; 46 struct inode vfs_inode;
47}; 47};
48
49
50static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
51{
52 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
53}
48#endif 54#endif
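
squashfs_i(), moved above into squashfs_fs_i.h next to the structure it operates on, recovers the containing squashfs_inode_info from the embedded VFS inode; the list_entry() it uses is just container_of(). The idiom, standalone:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { long ino; };

struct my_inode {
	int private_data;
	struct vfs_inode vfs;	/* embedded generic part */
};

static struct my_inode *MY_I(struct vfs_inode *inode)
{
	return container_of(inode, struct my_inode, vfs);
}

int main(void)
{
	struct my_inode mi = { .private_data = 7, .vfs = { .ino = 1 } };
	struct vfs_inode *v = &mi.vfs;	/* what the VFS hands back */

	printf("%d\n", MY_I(v)->private_data);	/* prints 7 */
	return 0;
}
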
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 88b4f8606652..20700b9f2b4c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,7 +30,6 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/mutex.h> 33#include <linux/mutex.h>
35#include <linux/pagemap.h> 34#include <linux/pagemap.h>
36#include <linux/init.h> 35#include <linux/init.h>
@@ -354,8 +353,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
354 353
355static void squashfs_put_super(struct super_block *sb) 354static void squashfs_put_super(struct super_block *sb)
356{ 355{
357 lock_kernel();
358
359 if (sb->s_fs_info) { 356 if (sb->s_fs_info) {
360 struct squashfs_sb_info *sbi = sb->s_fs_info; 357 struct squashfs_sb_info *sbi = sb->s_fs_info;
361 squashfs_cache_delete(sbi->block_cache); 358 squashfs_cache_delete(sbi->block_cache);
@@ -370,17 +367,13 @@ static void squashfs_put_super(struct super_block *sb)
370 kfree(sb->s_fs_info); 367 kfree(sb->s_fs_info);
371 sb->s_fs_info = NULL; 368 sb->s_fs_info = NULL;
372 } 369 }
373
374 unlock_kernel();
375} 370}
376 371
377 372
378static int squashfs_get_sb(struct file_system_type *fs_type, int flags, 373static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags,
379 const char *dev_name, void *data, 374 const char *dev_name, void *data)
380 struct vfsmount *mnt)
381{ 375{
382 return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super, 376 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
383 mnt);
384} 377}
385 378
386 379
@@ -447,16 +440,23 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb)
447} 440}
448 441
449 442
450static void squashfs_destroy_inode(struct inode *inode) 443static void squashfs_i_callback(struct rcu_head *head)
451{ 444{
445 struct inode *inode = container_of(head, struct inode, i_rcu);
446 INIT_LIST_HEAD(&inode->i_dentry);
452 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); 447 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
453} 448}
454 449
450static void squashfs_destroy_inode(struct inode *inode)
451{
452 call_rcu(&inode->i_rcu, squashfs_i_callback);
453}
454
455 455
456static struct file_system_type squashfs_fs_type = { 456static struct file_system_type squashfs_fs_type = {
457 .owner = THIS_MODULE, 457 .owner = THIS_MODULE,
458 .name = "squashfs", 458 .name = "squashfs",
459 .get_sb = squashfs_get_sb, 459 .mount = squashfs_mount,
460 .kill_sb = kill_block_super, 460 .kill_sb = kill_block_super,
461 .fs_flags = FS_REQUIRES_DEV 461 .fs_flags = FS_REQUIRES_DEV
462}; 462};
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 652b8541f9c6..3876c36699a1 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index,
158 strncmp(target, name, name_size) == 0) { 158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */ 159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) { 160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr; 161 __le64 xattr_val;
162 u64 xattr;
162 /* val is a reference to the real location */ 163 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start, 164 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val)); 165 &offset, sizeof(val));
165 if (err < 0) 166 if (err < 0)
166 goto failed; 167 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start, 168 err = squashfs_read_metadata(sb, &xattr_val,
168 &offset, sizeof(xattr)); 169 &start, &offset, sizeof(xattr_val));
169 if (err < 0) 170 if (err < 0)
170 goto failed; 171 goto failed;
171 xattr = le64_to_cpu(xattr); 172 xattr = le64_to_cpu(xattr_val);
172 start = SQUASHFS_XATTR_BLK(xattr) + 173 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table; 174 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr); 175 offset = SQUASHFS_XATTR_OFFSET(xattr);
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 49fe0d719fbf..b634efce4bde 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -25,7 +25,7 @@
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, 25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *); 26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, 27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 int *, unsigned long long *); 28 unsigned int *, unsigned long long *);
29#else 29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, 30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids) 31 u64 start, u64 *xattr_table_start, int *xattr_ids)
@@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
35} 35}
36 36
37static inline int squashfs_xattr_lookup(struct super_block *sb, 37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, int *size, 38 unsigned int index, int *count, unsigned int *size,
39 unsigned long long *xattr) 39 unsigned long long *xattr)
40{ 40{
41 return 0; 41 return 0;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index cfb41106098f..05385dbe1465 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -32,8 +32,8 @@
32 32
33#include "squashfs_fs.h" 33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h" 34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h" 35#include "squashfs.h"
36#include "xattr.h"
37 37
38/* 38/*
39 * Map xattr id using the xattr id look up table 39 * Map xattr id using the xattr id look up table
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
new file mode 100644
index 000000000000..c4eb40018256
--- /dev/null
+++ b/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,147 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xz_wrapper.c
22 */
23
24
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27#include <linux/slab.h>
28#include <linux/xz.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36struct squashfs_xz {
37 struct xz_dec *state;
38 struct xz_buf buf;
39};
40
41static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
42{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
44
45 struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL)
47 goto failed;
48
49 stream->state = xz_dec_init(XZ_PREALLOC, block_size);
50 if (stream->state == NULL)
51 goto failed;
52
53 return stream;
54
55failed:
56 ERROR("Failed to allocate xz workspace\n");
57 kfree(stream);
58 return NULL;
59}
60
61
62static void squashfs_xz_free(void *strm)
63{
64 struct squashfs_xz *stream = strm;
65
66 if (stream) {
67 xz_dec_end(stream->state);
68 kfree(stream);
69 }
70}
71
72
73static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
74 struct buffer_head **bh, int b, int offset, int length, int srclength,
75 int pages)
76{
77 enum xz_ret xz_err;
78 int avail, total = 0, k = 0, page = 0;
79 struct squashfs_xz *stream = msblk->stream;
80
81 mutex_lock(&msblk->read_data_mutex);
82
83 xz_dec_reset(stream->state);
84 stream->buf.in_pos = 0;
85 stream->buf.in_size = 0;
86 stream->buf.out_pos = 0;
87 stream->buf.out_size = PAGE_CACHE_SIZE;
88 stream->buf.out = buffer[page++];
89
90 do {
91 if (stream->buf.in_pos == stream->buf.in_size && k < b) {
92 avail = min(length, msblk->devblksize - offset);
93 length -= avail;
94 wait_on_buffer(bh[k]);
95 if (!buffer_uptodate(bh[k]))
96 goto release_mutex;
97
98 stream->buf.in = bh[k]->b_data + offset;
99 stream->buf.in_size = avail;
100 stream->buf.in_pos = 0;
101 offset = 0;
102 }
103
104 if (stream->buf.out_pos == stream->buf.out_size
105 && page < pages) {
106 stream->buf.out = buffer[page++];
107 stream->buf.out_pos = 0;
108 total += PAGE_CACHE_SIZE;
109 }
110
111 xz_err = xz_dec_run(stream->state, &stream->buf);
112
113 if (stream->buf.in_pos == stream->buf.in_size && k < b)
114 put_bh(bh[k++]);
115 } while (xz_err == XZ_OK);
116
117 if (xz_err != XZ_STREAM_END) {
118 ERROR("xz_dec_run error, data probably corrupt\n");
119 goto release_mutex;
120 }
121
122 if (k < b) {
123 ERROR("xz_uncompress error, input remaining\n");
124 goto release_mutex;
125 }
126
127 total += stream->buf.out_pos;
128 mutex_unlock(&msblk->read_data_mutex);
129 return total;
130
131release_mutex:
132 mutex_unlock(&msblk->read_data_mutex);
133
134 for (; k < b; k++)
135 put_bh(bh[k]);
136
137 return -EIO;
138}
139
140const struct squashfs_decompressor squashfs_xz_comp_ops = {
141 .init = squashfs_xz_init,
142 .free = squashfs_xz_free,
143 .decompress = squashfs_xz_uncompress,
144 .id = XZ_COMPRESSION,
145 .name = "xz",
146 .supported = 1
147};
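
squashfs_xz_uncompress() above drives xz_dec_run() in a loop, refilling xz_buf's input from buffer heads and switching output pages as each fills, until XZ_STREAM_END. The same streaming shape exists in user space in liblzma; a hedged analogue that decompresses stdin to stdout (build with -llzma):

#include <stdint.h>
#include <stdio.h>
#include <lzma.h>

int main(void)
{
	lzma_stream strm = LZMA_STREAM_INIT;
	uint8_t in[4096], out[4096];
	lzma_ret ret;

	if (lzma_stream_decoder(&strm, UINT64_MAX, 0) != LZMA_OK)
		return 1;

	strm.avail_in = 0;
	do {
		/* Refill input when the decoder has drained it. */
		if (strm.avail_in == 0 && !feof(stdin)) {
			strm.next_in = in;
			strm.avail_in = fread(in, 1, sizeof(in), stdin);
		}
		/* Fresh output window each pass, like switching pages. */
		strm.next_out = out;
		strm.avail_out = sizeof(out);
		ret = lzma_code(&strm, feof(stdin) ? LZMA_FINISH : LZMA_RUN);
		fwrite(out, 1, sizeof(out) - strm.avail_out, stdout);
	} while (ret == LZMA_OK);

	lzma_end(&strm);
	return ret == LZMA_STREAM_END ? 0 : 1;
}
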
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 7a603874e483..4661ae2b1cec 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -29,7 +29,6 @@
29 29
30#include "squashfs_fs.h" 30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 32#include "squashfs.h"
34#include "decompressor.h" 33#include "decompressor.h"
35 34
@@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
66 struct buffer_head **bh, int b, int offset, int length, int srclength, 65 struct buffer_head **bh, int b, int offset, int length, int srclength,
67 int pages) 66 int pages)
68{ 67{
69 int zlib_err = 0, zlib_init = 0; 68 int zlib_err, zlib_init = 0;
70 int avail, bytes, k = 0, page = 0; 69 int k = 0, page = 0;
71 z_stream *stream = msblk->stream; 70 z_stream *stream = msblk->stream;
72 71
73 mutex_lock(&msblk->read_data_mutex); 72 mutex_lock(&msblk->read_data_mutex);
@@ -75,21 +74,14 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
75 stream->avail_out = 0; 74 stream->avail_out = 0;
76 stream->avail_in = 0; 75 stream->avail_in = 0;
77 76
78 bytes = length;
79 do { 77 do {
80 if (stream->avail_in == 0 && k < b) { 78 if (stream->avail_in == 0 && k < b) {
81 avail = min(bytes, msblk->devblksize - offset); 79 int avail = min(length, msblk->devblksize - offset);
82 bytes -= avail; 80 length -= avail;
83 wait_on_buffer(bh[k]); 81 wait_on_buffer(bh[k]);
84 if (!buffer_uptodate(bh[k])) 82 if (!buffer_uptodate(bh[k]))
85 goto release_mutex; 83 goto release_mutex;
86 84
87 if (avail == 0) {
88 offset = 0;
89 put_bh(bh[k++]);
90 continue;
91 }
92
93 stream->next_in = bh[k]->b_data + offset; 85 stream->next_in = bh[k]->b_data + offset;
94 stream->avail_in = avail; 86 stream->avail_in = avail;
95 offset = 0; 87 offset = 0;
@@ -128,6 +120,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
128 goto release_mutex; 120 goto release_mutex;
129 } 121 }
130 122
123 if (k < b) {
124 ERROR("zlib_uncompress error, data remaining\n");
125 goto release_mutex;
126 }
127
131 length = stream->total_out; 128 length = stream->total_out;
132 mutex_unlock(&msblk->read_data_mutex); 129 mutex_unlock(&msblk->read_data_mutex);
133 return length; 130 return length;
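
The zlib_wrapper cleanup above drops the redundant bytes counter (length is decremented directly) and, like the xz path, now fails if the stream ends while input remains unconsumed. That leftover-input check in a one-shot user-space form, operating on plain buffers rather than buffer heads (build with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

/*
 * Inflate src[0..srclen) into dst in one shot. Fails both when the
 * stream never ends and when it ends early with input left over,
 * which is the "data remaining" corruption case checked above.
 */
static long inflate_exact(const unsigned char *src, unsigned long srclen,
			  unsigned char *dst, unsigned long dstlen)
{
	z_stream s;
	long total;
	int ok;

	memset(&s, 0, sizeof(s));
	if (inflateInit(&s) != Z_OK)
		return -1;

	s.next_in = (unsigned char *)src;
	s.avail_in = srclen;
	s.next_out = dst;
	s.avail_out = dstlen;

	ok = inflate(&s, Z_FINISH) == Z_STREAM_END && s.avail_in == 0;
	if (!ok)
		fprintf(stderr, "inflate error, data probably corrupt\n");
	total = (long)s.total_out;
	inflateEnd(&s);
	return ok ? total : -1;
}
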
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e213900..d5c61cf2b703 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
79 goto out; 79 goto out;
80 80
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 81 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 82 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
83 85
84 error = user_path_at(dfd, filename, lookup_flags, &path); 86 error = user_path_at(dfd, filename, lookup_flags, &path);
85 if (error) 87 if (error)
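
vfs_fstatat() now accepts AT_NO_AUTOMOUNT and turns it into LOOKUP_NO_AUTOMOUNT, alongside the existing AT_SYMLINK_NOFOLLOW handling. From user space these flags are passed straight to fstatat(); for example, examining a symlink itself rather than what it points to:

#include <fcntl.h>
#include <sys/stat.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2)
		return 1;
	/* AT_SYMLINK_NOFOLLOW: behave like lstat(), not stat(). */
	if (fstatat(AT_FDCWD, argv[1], &st, AT_SYMLINK_NOFOLLOW) != 0) {
		perror("fstatat");
		return 1;
	}
	printf("mode: %o, link? %s\n", st.st_mode & 07777,
	       S_ISLNK(st.st_mode) ? "yes" : "no");
	return 0;
}
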
diff --git a/fs/super.c b/fs/super.c
index 8819e3a7ff20..74e149efed81 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -30,6 +30,7 @@
30#include <linux/idr.h> 30#include <linux/idr.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/rculist_bl.h>
33#include "internal.h" 34#include "internal.h"
34 35
35 36
@@ -71,7 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71 INIT_LIST_HEAD(&s->s_files); 72 INIT_LIST_HEAD(&s->s_files);
72#endif 73#endif
73 INIT_LIST_HEAD(&s->s_instances); 74 INIT_LIST_HEAD(&s->s_instances);
74 INIT_HLIST_HEAD(&s->s_anon); 75 INIT_HLIST_BL_HEAD(&s->s_anon);
75 INIT_LIST_HEAD(&s->s_inodes); 76 INIT_LIST_HEAD(&s->s_inodes);
76 INIT_LIST_HEAD(&s->s_dentry_lru); 77 INIT_LIST_HEAD(&s->s_dentry_lru);
77 init_rwsem(&s->s_umount); 78 init_rwsem(&s->s_umount);
@@ -273,14 +274,14 @@ void generic_shutdown_super(struct super_block *sb)
273 get_fs_excl(); 274 get_fs_excl();
274 sb->s_flags &= ~MS_ACTIVE; 275 sb->s_flags &= ~MS_ACTIVE;
275 276
276 /* bad name - it should be evict_inodes() */ 277 fsnotify_unmount_inodes(&sb->s_inodes);
277 invalidate_inodes(sb); 278
279 evict_inodes(sb);
278 280
279 if (sop->put_super) 281 if (sop->put_super)
280 sop->put_super(sb); 282 sop->put_super(sb);
281 283
282 /* Forget any remaining inodes */ 284 if (!list_empty(&sb->s_inodes)) {
283 if (invalidate_inodes(sb)) {
284 printk("VFS: Busy inodes after unmount of %s. " 285 printk("VFS: Busy inodes after unmount of %s. "
285 "Self-destruct in 5 seconds. Have a nice day...\n", 286 "Self-destruct in 5 seconds. Have a nice day...\n",
286 sb->s_id); 287 sb->s_id);
@@ -715,15 +716,14 @@ static int ns_set_super(struct super_block *sb, void *data)
715 return set_anon_super(sb, NULL); 716 return set_anon_super(sb, NULL);
716} 717}
717 718
718int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, 719struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
719 int (*fill_super)(struct super_block *, void *, int), 720 void *data, int (*fill_super)(struct super_block *, void *, int))
720 struct vfsmount *mnt)
721{ 721{
722 struct super_block *sb; 722 struct super_block *sb;
723 723
724 sb = sget(fs_type, ns_test_super, ns_set_super, data); 724 sb = sget(fs_type, ns_test_super, ns_set_super, data);
725 if (IS_ERR(sb)) 725 if (IS_ERR(sb))
726 return PTR_ERR(sb); 726 return ERR_CAST(sb);
727 727
728 if (!sb->s_root) { 728 if (!sb->s_root) {
729 int err; 729 int err;
@@ -731,17 +731,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
731 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); 731 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
732 if (err) { 732 if (err) {
733 deactivate_locked_super(sb); 733 deactivate_locked_super(sb);
734 return err; 734 return ERR_PTR(err);
735 } 735 }
736 736
737 sb->s_flags |= MS_ACTIVE; 737 sb->s_flags |= MS_ACTIVE;
738 } 738 }
739 739
740 simple_set_mnt(mnt, sb); 740 return dget(sb->s_root);
741 return 0;
742} 741}
743 742
744EXPORT_SYMBOL(get_sb_ns); 743EXPORT_SYMBOL(mount_ns);
745 744
746#ifdef CONFIG_BLOCK 745#ifdef CONFIG_BLOCK
747static int set_bdev_super(struct super_block *s, void *data) 746static int set_bdev_super(struct super_block *s, void *data)
@@ -762,22 +761,21 @@ static int test_bdev_super(struct super_block *s, void *data)
762 return (void *)s->s_bdev == data; 761 return (void *)s->s_bdev == data;
763} 762}
764 763
765int get_sb_bdev(struct file_system_type *fs_type, 764struct dentry *mount_bdev(struct file_system_type *fs_type,
766 int flags, const char *dev_name, void *data, 765 int flags, const char *dev_name, void *data,
767 int (*fill_super)(struct super_block *, void *, int), 766 int (*fill_super)(struct super_block *, void *, int))
768 struct vfsmount *mnt)
769{ 767{
770 struct block_device *bdev; 768 struct block_device *bdev;
771 struct super_block *s; 769 struct super_block *s;
772 fmode_t mode = FMODE_READ; 770 fmode_t mode = FMODE_READ | FMODE_EXCL;
773 int error = 0; 771 int error = 0;
774 772
775 if (!(flags & MS_RDONLY)) 773 if (!(flags & MS_RDONLY))
776 mode |= FMODE_WRITE; 774 mode |= FMODE_WRITE;
777 775
778 bdev = open_bdev_exclusive(dev_name, mode, fs_type); 776 bdev = blkdev_get_by_path(dev_name, mode, fs_type);
779 if (IS_ERR(bdev)) 777 if (IS_ERR(bdev))
780 return PTR_ERR(bdev); 778 return ERR_CAST(bdev);
781 779
782 /* 780 /*
783 * once the super is inserted into the list by sget, s_umount 781 * once the super is inserted into the list by sget, s_umount
@@ -804,13 +802,13 @@ int get_sb_bdev(struct file_system_type *fs_type,
804 802
805 /* 803 /*
806 * s_umount nests inside bd_mutex during 804 * s_umount nests inside bd_mutex during
807 * __invalidate_device(). close_bdev_exclusive() 805 * __invalidate_device(). blkdev_put() acquires
808 * acquires bd_mutex and can't be called under 806 * bd_mutex and can't be called under s_umount. Drop
809 * s_umount. Drop s_umount temporarily. This is safe 807 * s_umount temporarily. This is safe as we're
810 * as we're holding an active reference. 808 * holding an active reference.
811 */ 809 */
812 up_write(&s->s_umount); 810 up_write(&s->s_umount);
813 close_bdev_exclusive(bdev, mode); 811 blkdev_put(bdev, mode);
814 down_write(&s->s_umount); 812 down_write(&s->s_umount);
815 } else { 813 } else {
816 char b[BDEVNAME_SIZE]; 814 char b[BDEVNAME_SIZE];
@@ -829,15 +827,30 @@ int get_sb_bdev(struct file_system_type *fs_type,
829 bdev->bd_super = s; 827 bdev->bd_super = s;
830 } 828 }
831 829
832 simple_set_mnt(mnt, s); 830 return dget(s->s_root);
833 return 0;
834 831
835error_s: 832error_s:
836 error = PTR_ERR(s); 833 error = PTR_ERR(s);
837error_bdev: 834error_bdev:
838 close_bdev_exclusive(bdev, mode); 835 blkdev_put(bdev, mode);
839error: 836error:
840 return error; 837 return ERR_PTR(error);
838}
839EXPORT_SYMBOL(mount_bdev);
840
841int get_sb_bdev(struct file_system_type *fs_type,
842 int flags, const char *dev_name, void *data,
843 int (*fill_super)(struct super_block *, void *, int),
844 struct vfsmount *mnt)
845{
846 struct dentry *root;
847
848 root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
849 if (IS_ERR(root))
850 return PTR_ERR(root);
851 mnt->mnt_root = root;
852 mnt->mnt_sb = root->d_sb;
853 return 0;
841} 854}
842 855
843EXPORT_SYMBOL(get_sb_bdev); 856EXPORT_SYMBOL(get_sb_bdev);
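
The super.c conversion keeps both APIs alive at once: mount_bdev()/mount_nodev()/mount_single() return a root dentry (or an ERR_PTR), and the old get_sb_*() entry points become thin shims that call the new function and fill in the vfsmount, so filesystems can be converted one by one. The shim shape, reduced to a sketch with simplified stand-in types:

#include <errno.h>

struct dentry { int unused; };
struct vfsmount { struct dentry *mnt_root; };

static struct dentry the_root;

/* New-style entry point: returns the root dentry. In the real code
 * this is mount_bdev() and errors travel as ERR_PTR values. */
static struct dentry *mount_new(void)
{
	return &the_root;
}

/* Old-style entry point kept as a shim over the new one, so callers
 * that still expect an int-and-vfsmount interface build unchanged. */
static int get_sb_old(struct vfsmount *mnt)
{
	struct dentry *root = mount_new();

	if (!root)
		return -ENOMEM;	/* sketch: real code uses IS_ERR/PTR_ERR */
	mnt->mnt_root = root;
	return 0;
}
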
@@ -850,35 +863,49 @@ void kill_block_super(struct super_block *sb)
850 bdev->bd_super = NULL; 863 bdev->bd_super = NULL;
851 generic_shutdown_super(sb); 864 generic_shutdown_super(sb);
852 sync_blockdev(bdev); 865 sync_blockdev(bdev);
853 close_bdev_exclusive(bdev, mode); 866 WARN_ON_ONCE(!(mode & FMODE_EXCL));
867 blkdev_put(bdev, mode | FMODE_EXCL);
854} 868}
855 869
856EXPORT_SYMBOL(kill_block_super); 870EXPORT_SYMBOL(kill_block_super);
857#endif 871#endif
858 872
859int get_sb_nodev(struct file_system_type *fs_type, 873struct dentry *mount_nodev(struct file_system_type *fs_type,
860 int flags, void *data, 874 int flags, void *data,
861 int (*fill_super)(struct super_block *, void *, int), 875 int (*fill_super)(struct super_block *, void *, int))
862 struct vfsmount *mnt)
863{ 876{
864 int error; 877 int error;
865 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); 878 struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
866 879
867 if (IS_ERR(s)) 880 if (IS_ERR(s))
868 return PTR_ERR(s); 881 return ERR_CAST(s);
869 882
870 s->s_flags = flags; 883 s->s_flags = flags;
871 884
872 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 885 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
873 if (error) { 886 if (error) {
874 deactivate_locked_super(s); 887 deactivate_locked_super(s);
875 return error; 888 return ERR_PTR(error);
876 } 889 }
877 s->s_flags |= MS_ACTIVE; 890 s->s_flags |= MS_ACTIVE;
878 simple_set_mnt(mnt, s); 891 return dget(s->s_root);
879 return 0;
880} 892}
893EXPORT_SYMBOL(mount_nodev);
894
895int get_sb_nodev(struct file_system_type *fs_type,
896 int flags, void *data,
897 int (*fill_super)(struct super_block *, void *, int),
898 struct vfsmount *mnt)
899{
900 struct dentry *root;
881 901
902 root = mount_nodev(fs_type, flags, data, fill_super);
903 if (IS_ERR(root))
904 return PTR_ERR(root);
905 mnt->mnt_root = root;
906 mnt->mnt_sb = root->d_sb;
907 return 0;
908}
882EXPORT_SYMBOL(get_sb_nodev); 909EXPORT_SYMBOL(get_sb_nodev);
883 910
884static int compare_single(struct super_block *s, void *p) 911static int compare_single(struct super_block *s, void *p)
@@ -886,29 +913,42 @@ static int compare_single(struct super_block *s, void *p)
886 return 1; 913 return 1;
887} 914}
888 915
889int get_sb_single(struct file_system_type *fs_type, 916struct dentry *mount_single(struct file_system_type *fs_type,
890 int flags, void *data, 917 int flags, void *data,
891 int (*fill_super)(struct super_block *, void *, int), 918 int (*fill_super)(struct super_block *, void *, int))
892 struct vfsmount *mnt)
893{ 919{
894 struct super_block *s; 920 struct super_block *s;
895 int error; 921 int error;
896 922
897 s = sget(fs_type, compare_single, set_anon_super, NULL); 923 s = sget(fs_type, compare_single, set_anon_super, NULL);
898 if (IS_ERR(s)) 924 if (IS_ERR(s))
899 return PTR_ERR(s); 925 return ERR_CAST(s);
900 if (!s->s_root) { 926 if (!s->s_root) {
901 s->s_flags = flags; 927 s->s_flags = flags;
902 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 928 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
903 if (error) { 929 if (error) {
904 deactivate_locked_super(s); 930 deactivate_locked_super(s);
905 return error; 931 return ERR_PTR(error);
906 } 932 }
907 s->s_flags |= MS_ACTIVE; 933 s->s_flags |= MS_ACTIVE;
908 } else { 934 } else {
909 do_remount_sb(s, flags, data, 0); 935 do_remount_sb(s, flags, data, 0);
910 } 936 }
911 simple_set_mnt(mnt, s); 937 return dget(s->s_root);
938}
939EXPORT_SYMBOL(mount_single);
940
941int get_sb_single(struct file_system_type *fs_type,
942 int flags, void *data,
943 int (*fill_super)(struct super_block *, void *, int),
944 struct vfsmount *mnt)
945{
946 struct dentry *root;
947 root = mount_single(fs_type, flags, data, fill_super);
948 if (IS_ERR(root))
949 return PTR_ERR(root);
950 mnt->mnt_root = root;
951 mnt->mnt_sb = root->d_sb;
912 return 0; 952 return 0;
913} 953}
914 954
@@ -918,6 +958,7 @@ struct vfsmount *
918vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) 958vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
919{ 959{
920 struct vfsmount *mnt; 960 struct vfsmount *mnt;
961 struct dentry *root;
921 char *secdata = NULL; 962 char *secdata = NULL;
922 int error; 963 int error;
923 964
@@ -942,9 +983,19 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
942 goto out_free_secdata; 983 goto out_free_secdata;
943 } 984 }
944 985
945 error = type->get_sb(type, flags, name, data, mnt); 986 if (type->mount) {
946 if (error < 0) 987 root = type->mount(type, flags, name, data);
947 goto out_free_secdata; 988 if (IS_ERR(root)) {
989 error = PTR_ERR(root);
990 goto out_free_secdata;
991 }
992 mnt->mnt_root = root;
993 mnt->mnt_sb = root->d_sb;
994 } else {
995 error = type->get_sb(type, flags, name, data, mnt);
996 if (error < 0)
997 goto out_free_secdata;
998 }
948 BUG_ON(!mnt->mnt_sb); 999 BUG_ON(!mnt->mnt_sb);
949 WARN_ON(!mnt->mnt_sb->s_bdi); 1000 WARN_ON(!mnt->mnt_sb->s_bdi);
950 mnt->mnt_sb->s_flags |= MS_BORN; 1001 mnt->mnt_sb->s_flags |= MS_BORN;
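The hunk above is the pivot of this series: vfs_kern_mount() now prefers the new ->mount() method, which hands back the root dentry directly, and falls back to the legacy ->get_sb() for filesystems not yet converted. For most filesystems the conversion is mechanical, as the sysv and sysfs hunks below illustrate. A minimal sketch of a converted filesystem, assuming a hypothetical in-memory "examplefs" (all examplefs_* names are illustrative, not part of this commit):

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* set sb->s_op, allocate the root inode, d_alloc_root(), ... */
		return 0;
	}

	static struct dentry *examplefs_mount(struct file_system_type *fs_type,
					      int flags, const char *dev_name,
					      void *data)
	{
		/* mount_nodev() returns the root dentry or an ERR_PTR() */
		return mount_nodev(fs_type, flags, data, examplefs_fill_super);
	}

	static struct file_system_type examplefs_type = {
		.owner	 = THIS_MODULE,
		.name	 = "examplefs",
		.mount	 = examplefs_mount,	/* was: .get_sb */
		.kill_sb = kill_anon_super,
	};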
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index f4b67588b9d6..8c41feacbac5 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,5 +1,5 @@
1config SYSFS 1config SYSFS
2 bool "sysfs file system support" if EMBEDDED 2 bool "sysfs file system support" if EXPERT
3 default y 3 default y
4 help 4 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to 5 The sysfs filesystem is a virtual filesystem that the kernel uses to
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 4e321f7353fa..a4759833d62d 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -179,30 +179,14 @@ static void bin_vma_open(struct vm_area_struct *vma)
179 struct bin_buffer *bb = file->private_data; 179 struct bin_buffer *bb = file->private_data;
180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
181 181
182 if (!bb->vm_ops || !bb->vm_ops->open) 182 if (!bb->vm_ops)
183 return;
184
185 if (!sysfs_get_active(attr_sd))
186 return;
187
188 bb->vm_ops->open(vma);
189
190 sysfs_put_active(attr_sd);
191}
192
193static void bin_vma_close(struct vm_area_struct *vma)
194{
195 struct file *file = vma->vm_file;
196 struct bin_buffer *bb = file->private_data;
197 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
198
199 if (!bb->vm_ops || !bb->vm_ops->close)
200 return; 183 return;
201 184
202 if (!sysfs_get_active(attr_sd)) 185 if (!sysfs_get_active(attr_sd))
203 return; 186 return;
204 187
205 bb->vm_ops->close(vma); 188 if (bb->vm_ops->open)
189 bb->vm_ops->open(vma);
206 190
207 sysfs_put_active(attr_sd); 191 sysfs_put_active(attr_sd);
208} 192}
@@ -214,13 +198,15 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
214 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 198 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
215 int ret; 199 int ret;
216 200
217 if (!bb->vm_ops || !bb->vm_ops->fault) 201 if (!bb->vm_ops)
218 return VM_FAULT_SIGBUS; 202 return VM_FAULT_SIGBUS;
219 203
220 if (!sysfs_get_active(attr_sd)) 204 if (!sysfs_get_active(attr_sd))
221 return VM_FAULT_SIGBUS; 205 return VM_FAULT_SIGBUS;
222 206
223 ret = bb->vm_ops->fault(vma, vmf); 207 ret = VM_FAULT_SIGBUS;
208 if (bb->vm_ops->fault)
209 ret = bb->vm_ops->fault(vma, vmf);
224 210
225 sysfs_put_active(attr_sd); 211 sysfs_put_active(attr_sd);
226 return ret; 212 return ret;
@@ -236,13 +222,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
236 if (!bb->vm_ops) 222 if (!bb->vm_ops)
237 return VM_FAULT_SIGBUS; 223 return VM_FAULT_SIGBUS;
238 224
239 if (!bb->vm_ops->page_mkwrite)
240 return 0;
241
242 if (!sysfs_get_active(attr_sd)) 225 if (!sysfs_get_active(attr_sd))
243 return VM_FAULT_SIGBUS; 226 return VM_FAULT_SIGBUS;
244 227
245 ret = bb->vm_ops->page_mkwrite(vma, vmf); 228 ret = 0;
229 if (bb->vm_ops->page_mkwrite)
230 ret = bb->vm_ops->page_mkwrite(vma, vmf);
246 231
247 sysfs_put_active(attr_sd); 232 sysfs_put_active(attr_sd);
248 return ret; 233 return ret;
@@ -256,13 +241,15 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
256 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 241 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
257 int ret; 242 int ret;
258 243
259 if (!bb->vm_ops || !bb->vm_ops->access) 244 if (!bb->vm_ops)
260 return -EINVAL; 245 return -EINVAL;
261 246
262 if (!sysfs_get_active(attr_sd)) 247 if (!sysfs_get_active(attr_sd))
263 return -EINVAL; 248 return -EINVAL;
264 249
265 ret = bb->vm_ops->access(vma, addr, buf, len, write); 250 ret = -EINVAL;
251 if (bb->vm_ops->access)
252 ret = bb->vm_ops->access(vma, addr, buf, len, write);
266 253
267 sysfs_put_active(attr_sd); 254 sysfs_put_active(attr_sd);
268 return ret; 255 return ret;
@@ -276,13 +263,15 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
276 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 263 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
277 int ret; 264 int ret;
278 265
279 if (!bb->vm_ops || !bb->vm_ops->set_policy) 266 if (!bb->vm_ops)
280 return 0; 267 return 0;
281 268
282 if (!sysfs_get_active(attr_sd)) 269 if (!sysfs_get_active(attr_sd))
283 return -EINVAL; 270 return -EINVAL;
284 271
285 ret = bb->vm_ops->set_policy(vma, new); 272 ret = 0;
273 if (bb->vm_ops->set_policy)
274 ret = bb->vm_ops->set_policy(vma, new);
286 275
287 sysfs_put_active(attr_sd); 276 sysfs_put_active(attr_sd);
288 return ret; 277 return ret;
@@ -296,13 +285,15 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
296 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 285 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
297 struct mempolicy *pol; 286 struct mempolicy *pol;
298 287
299 if (!bb->vm_ops || !bb->vm_ops->get_policy) 288 if (!bb->vm_ops)
300 return vma->vm_policy; 289 return vma->vm_policy;
301 290
302 if (!sysfs_get_active(attr_sd)) 291 if (!sysfs_get_active(attr_sd))
303 return vma->vm_policy; 292 return vma->vm_policy;
304 293
305 pol = bb->vm_ops->get_policy(vma, addr); 294 pol = vma->vm_policy;
295 if (bb->vm_ops->get_policy)
296 pol = bb->vm_ops->get_policy(vma, addr);
306 297
307 sysfs_put_active(attr_sd); 298 sysfs_put_active(attr_sd);
308 return pol; 299 return pol;
@@ -316,13 +307,15 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
316 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 307 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
317 int ret; 308 int ret;
318 309
319 if (!bb->vm_ops || !bb->vm_ops->migrate) 310 if (!bb->vm_ops)
320 return 0; 311 return 0;
321 312
322 if (!sysfs_get_active(attr_sd)) 313 if (!sysfs_get_active(attr_sd))
323 return 0; 314 return 0;
324 315
325 ret = bb->vm_ops->migrate(vma, from, to, flags); 316 ret = 0;
317 if (bb->vm_ops->migrate)
318 ret = bb->vm_ops->migrate(vma, from, to, flags);
326 319
327 sysfs_put_active(attr_sd); 320 sysfs_put_active(attr_sd);
328 return ret; 321 return ret;
@@ -331,7 +324,6 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
331 324
332static const struct vm_operations_struct bin_vm_ops = { 325static const struct vm_operations_struct bin_vm_ops = {
333 .open = bin_vma_open, 326 .open = bin_vma_open,
334 .close = bin_vma_close,
335 .fault = bin_fault, 327 .fault = bin_fault,
336 .page_mkwrite = bin_page_mkwrite, 328 .page_mkwrite = bin_page_mkwrite,
337 .access = bin_access, 329 .access = bin_access,
@@ -377,6 +369,14 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
377 if (bb->mmapped && bb->vm_ops != vma->vm_ops) 369 if (bb->mmapped && bb->vm_ops != vma->vm_ops)
378 goto out_put; 370 goto out_put;
379 371
372 /*
373 * It is not possible to successfully wrap close.
374 * So error if someone is trying to use close.
375 */
376 rc = -EINVAL;
377 if (vma->vm_ops && vma->vm_ops->close)
378 goto out_put;
379
380 rc = 0; 380 rc = 0;
381 bb->mmapped = 1; 381 bb->mmapped = 1;
382 bb->vm_ops = vma->vm_ops; 382 bb->vm_ops = vma->vm_ops;
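Because the side-by-side rendering above is dense, here is the post-patch form of one wrapper, consolidated for readability: the "is this op implemented?" test now happens inside the sysfs_get_active()/sysfs_put_active() window, with a per-operation fallback value, and the same skeleton repeats for page_mkwrite, access, set_policy, get_policy and migrate. ->close alone cannot be wrapped this way (there is no safe fallback once the attribute is gone), hence the new -EINVAL check in mmap() above.

	static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct file *file = vma->vm_file;
		struct bin_buffer *bb = file->private_data;
		struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
		int ret;

		if (!bb->vm_ops)
			return VM_FAULT_SIGBUS;

		if (!sysfs_get_active(attr_sd))
			return VM_FAULT_SIGBUS;	/* attribute is going away */

		ret = VM_FAULT_SIGBUS;		/* fallback if no ->fault */
		if (bb->vm_ops->fault)
			ret = bb->vm_ops->fault(vma, vmf);

		sysfs_put_active(attr_sd);
		return ret;
	}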
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7e54bac8c4b0..ea9120a830d8 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -231,7 +231,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
231 goto repeat; 231 goto repeat;
232} 232}
233 233
234static int sysfs_dentry_delete(struct dentry *dentry) 234static int sysfs_dentry_delete(const struct dentry *dentry)
235{ 235{
236 struct sysfs_dirent *sd = dentry->d_fsdata; 236 struct sysfs_dirent *sd = dentry->d_fsdata;
237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED); 237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
@@ -239,9 +239,13 @@ static int sysfs_dentry_delete(struct dentry *dentry)
239 239
240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) 240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
241{ 241{
242 struct sysfs_dirent *sd = dentry->d_fsdata; 242 struct sysfs_dirent *sd;
243 int is_dir; 243 int is_dir;
244 244
245 if (nd->flags & LOOKUP_RCU)
246 return -ECHILD;
247
248 sd = dentry->d_fsdata;
245 mutex_lock(&sysfs_mutex); 249 mutex_lock(&sysfs_mutex);
246 250
247 /* The sysfs dirent has been deleted */ 251 /* The sysfs dirent has been deleted */
@@ -701,7 +705,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
701 /* instantiate and hash dentry */ 705 /* instantiate and hash dentry */
702 ret = d_find_alias(inode); 706 ret = d_find_alias(inode);
703 if (!ret) { 707 if (!ret) {
704 dentry->d_op = &sysfs_dentry_ops; 708 d_set_d_op(dentry, &sysfs_dentry_ops);
705 dentry->d_fsdata = sysfs_get(sd); 709 dentry->d_fsdata = sysfs_get(sd);
706 d_add(dentry, inode); 710 d_add(dentry, inode);
707 } else { 711 } else {
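sysfs_dentry_revalidate() takes sysfs_mutex, which is not permitted under the lock-free RCU walk introduced by this series, so it now bails out with -ECHILD; the VFS then drops out of RCU-walk and retries the lookup in ref-walk mode, where sleeping is allowed. The same shape applies to any ->d_revalidate() that must sleep (a generic sketch, with the era's signature):

	static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
	{
		if (nd->flags & LOOKUP_RCU)
			return -ECHILD;	/* can't take mutexes in RCU-walk */

		/* slow path: take locks, compare against the backing object */
		return 1;		/* dentry is still valid */
	}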
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 23c1e598792a..c8769dc222d8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -148,6 +148,59 @@ void sysfs_remove_group(struct kobject * kobj,
148 sysfs_put(sd); 148 sysfs_put(sd);
149} 149}
150 150
151/**
152 * sysfs_merge_group - merge files into a pre-existing attribute group.
153 * @kobj: The kobject containing the group.
154 * @grp: The files to create and the attribute group they belong to.
155 *
156 * This function returns an error if the group doesn't exist or any of the
157 * files already exist in that group, in which case none of the new files
158 * are created.
159 */
160int sysfs_merge_group(struct kobject *kobj,
161 const struct attribute_group *grp)
162{
163 struct sysfs_dirent *dir_sd;
164 int error = 0;
165 struct attribute *const *attr;
166 int i;
167
168 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
169 if (!dir_sd)
170 return -ENOENT;
171
172 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
173 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
174 if (error) {
175 while (--i >= 0)
176 sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
177 }
178 sysfs_put(dir_sd);
179
180 return error;
181}
182EXPORT_SYMBOL_GPL(sysfs_merge_group);
183
184/**
185 * sysfs_unmerge_group - remove files from a pre-existing attribute group.
186 * @kobj: The kobject containing the group.
187 * @grp: The files to remove and the attribute group they belong to.
188 */
189void sysfs_unmerge_group(struct kobject *kobj,
190 const struct attribute_group *grp)
191{
192 struct sysfs_dirent *dir_sd;
193 struct attribute *const *attr;
194
195 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
196 if (dir_sd) {
197 for (attr = grp->attrs; *attr; ++attr)
198 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
199 sysfs_put(dir_sd);
200 }
201}
202EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
203
151 204
152EXPORT_SYMBOL_GPL(sysfs_create_group); 205EXPORT_SYMBOL_GPL(sysfs_create_group);
153EXPORT_SYMBOL_GPL(sysfs_update_group); 206EXPORT_SYMBOL_GPL(sysfs_update_group);
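sysfs_merge_group() is all-or-nothing: on any failure it removes the files it already created, and it returns -ENOENT if the named group does not exist at all. A hypothetical caller (all names illustrative), pairing the merge with sysfs_unmerge_group() on teardown:

	static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
				char *buf)
	{
		return sprintf(buf, "42\n");
	}
	static struct kobj_attribute foo_attr = __ATTR_RO(foo);

	static struct attribute *extra_attrs[] = {
		&foo_attr.attr,
		NULL,
	};

	static const struct attribute_group extra_group = {
		.name	= "stats",	/* group must already exist */
		.attrs	= extra_attrs,
	};

	static int example_add_extras(struct kobject *kobj)
	{
		return sysfs_merge_group(kobj, &extra_group);
	}

	static void example_remove_extras(struct kobject *kobj)
	{
		sysfs_unmerge_group(kobj, &extra_group);
	}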
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index cffb1fd8ba33..0a12eb89cd32 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -19,6 +19,7 @@
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sysfs.h>
22#include <linux/xattr.h> 23#include <linux/xattr.h>
23#include <linux/security.h> 24#include <linux/security.h>
24#include "sysfs.h" 25#include "sysfs.h"
@@ -348,13 +349,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
348 return -ENOENT; 349 return -ENOENT;
349} 350}
350 351
351int sysfs_permission(struct inode *inode, int mask) 352int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
352{ 353{
353 struct sysfs_dirent *sd = inode->i_private; 354 struct sysfs_dirent *sd;
355
356 if (flags & IPERM_FLAG_RCU)
357 return -ECHILD;
358
359 sd = inode->i_private;
354 360
355 mutex_lock(&sysfs_mutex); 361 mutex_lock(&sysfs_mutex);
356 sysfs_refresh_inode(sd, inode); 362 sysfs_refresh_inode(sd, inode);
357 mutex_unlock(&sysfs_mutex); 363 mutex_unlock(&sysfs_mutex);
358 364
359 return generic_permission(inode, mask, NULL); 365 return generic_permission(inode, mask, flags, NULL);
360} 366}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f2af22574c50..266895783b47 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,7 +23,7 @@
23#include "sysfs.h" 23#include "sysfs.h"
24 24
25 25
26static struct vfsmount *sysfs_mount; 26static struct vfsmount *sysfs_mnt;
27struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
28 28
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
@@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data)
95 return error; 95 return error;
96} 96}
97 97
98static int sysfs_get_sb(struct file_system_type *fs_type, 98static struct dentry *sysfs_mount(struct file_system_type *fs_type,
99 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data)
100{ 100{
101 struct sysfs_super_info *info; 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type; 102 enum kobj_ns_type type;
103 struct super_block *sb; 103 struct super_block *sb;
104 int error; 104 int error;
105 105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL); 106 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info) 107 if (!info)
109 goto out; 108 return ERR_PTR(-ENOMEM);
110 109
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) 110 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type); 111 info->ns[type] = kobj_ns_current(type);
@@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); 113 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info) 114 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info); 115 kfree(info);
117 if (IS_ERR(sb)) { 116 if (IS_ERR(sb))
118 error = PTR_ERR(sb); 117 return ERR_CAST(sb);
119 goto out;
120 }
121 if (!sb->s_root) { 118 if (!sb->s_root) {
122 sb->s_flags = flags; 119 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); 120 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) { 121 if (error) {
125 deactivate_locked_super(sb); 122 deactivate_locked_super(sb);
126 goto out; 123 return ERR_PTR(error);
127 } 124 }
128 sb->s_flags |= MS_ACTIVE; 125 sb->s_flags |= MS_ACTIVE;
129 } 126 }
130 127
131 simple_set_mnt(mnt, sb); 128 return dget(sb->s_root);
132 error = 0;
133out:
134 return error;
135} 129}
136 130
137static void sysfs_kill_sb(struct super_block *sb) 131static void sysfs_kill_sb(struct super_block *sb)
@@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb)
147 141
148static struct file_system_type sysfs_fs_type = { 142static struct file_system_type sysfs_fs_type = {
149 .name = "sysfs", 143 .name = "sysfs",
150 .get_sb = sysfs_get_sb, 144 .mount = sysfs_mount,
151 .kill_sb = sysfs_kill_sb, 145 .kill_sb = sysfs_kill_sb,
152}; 146};
153 147
@@ -189,11 +183,11 @@ int __init sysfs_init(void)
189 183
190 err = register_filesystem(&sysfs_fs_type); 184 err = register_filesystem(&sysfs_fs_type);
191 if (!err) { 185 if (!err) {
192 sysfs_mount = kern_mount(&sysfs_fs_type); 186 sysfs_mnt = kern_mount(&sysfs_fs_type);
193 if (IS_ERR(sysfs_mount)) { 187 if (IS_ERR(sysfs_mnt)) {
194 printk(KERN_ERR "sysfs: could not mount!\n"); 188 printk(KERN_ERR "sysfs: could not mount!\n");
195 err = PTR_ERR(sysfs_mount); 189 err = PTR_ERR(sysfs_mnt);
196 sysfs_mount = NULL; 190 sysfs_mnt = NULL;
197 unregister_filesystem(&sysfs_fs_type); 191 unregister_filesystem(&sysfs_fs_type);
198 goto out_err; 192 goto out_err;
199 } 193 }
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d9be60a2e956..3d28af31d863 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#include <linux/lockdep.h>
12#include <linux/kobject_ns.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14struct sysfs_open_dirent; 15struct sysfs_open_dirent;
@@ -200,7 +201,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
200struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); 201struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
201void sysfs_evict_inode(struct inode *inode); 202void sysfs_evict_inode(struct inode *inode);
202int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 203int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
203int sysfs_permission(struct inode *inode, int mask); 204int sysfs_permission(struct inode *inode, int mask, unsigned int flags);
204int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 205int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 206int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 207int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index de44d067b9e6..0630eb969a28 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
333 return &si->vfs_inode; 333 return &si->vfs_inode;
334} 334}
335 335
336static void sysv_destroy_inode(struct inode *inode) 336static void sysv_i_callback(struct rcu_head *head)
337{ 337{
338 struct inode *inode = container_of(head, struct inode, i_rcu);
339 INIT_LIST_HEAD(&inode->i_dentry);
338 kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); 340 kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
339} 341}
340 342
343static void sysv_destroy_inode(struct inode *inode)
344{
345 call_rcu(&inode->i_rcu, sysv_i_callback);
346}
347
341static void init_once(void *p) 348static void init_once(void *p)
342{ 349{
343 struct sysv_inode_info *si = (struct sysv_inode_info *)p; 350 struct sysv_inode_info *si = (struct sysv_inode_info *)p;
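sysv_destroy_inode() now defers the actual kmem_cache_free() through call_rcu(), so an RCU-walk lookup still dereferencing dentry->d_inode cannot see freed memory; the INIT_LIST_HEAD() keeps i_dentry sane for anyone peeking during the grace period. The same boilerplate lands in nearly every filesystem in this series; in generic form (example_* names are placeholders):

	static void example_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		INIT_LIST_HEAD(&inode->i_dentry);
		kmem_cache_free(example_inode_cachep, EXAMPLE_I(inode));
	}

	static void example_destroy_inode(struct inode *inode)
	{
		/* free only after an RCU grace period has elapsed */
		call_rcu(&inode->i_rcu, example_i_callback);
	}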
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8d..b427b1208c26 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,7 +27,8 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
27 return err; 27 return err;
28} 28}
29 29
30static int sysv_hash(struct dentry *dentry, struct qstr *qstr) 30static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
31 struct qstr *qstr)
31{ 32{
32 /* Truncate the name in place, avoids having to define a compare 33 /* Truncate the name in place, avoids having to define a compare
33 function. */ 34 function. */
@@ -47,7 +48,6 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
47 struct inode * inode = NULL; 48 struct inode * inode = NULL;
48 ino_t ino; 49 ino_t ino;
49 50
50 dentry->d_op = dir->i_sb->s_root->d_op;
51 if (dentry->d_name.len > SYSV_NAMELEN) 51 if (dentry->d_name.len > SYSV_NAMELEN)
52 return ERR_PTR(-ENAMETOOLONG); 52 return ERR_PTR(-ENAMETOOLONG);
53 ino = sysv_inode_by_name(dentry); 53 ino = sysv_inode_by_name(dentry);
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
126 126
127 inode->i_ctime = CURRENT_TIME_SEC; 127 inode->i_ctime = CURRENT_TIME_SEC;
128 inode_inc_link_count(inode); 128 inode_inc_link_count(inode);
129 atomic_inc(&inode->i_count); 129 ihold(inode);
130 130
131 return add_nondir(dentry, inode); 131 return add_nondir(dentry, inode);
132} 132}
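The open-coded atomic_inc(&inode->i_count) becomes ihold() here and throughout the series; the helper documents the intent and gives the VFS one place to assert that the count was already non-zero. A typical post-conversion ->link() excerpt, simplified (directory-entry insertion and error unwinding omitted; names are illustrative):

	static int example_link(struct dentry *old_dentry, struct inode *dir,
				struct dentry *dentry)
	{
		struct inode *inode = old_dentry->d_inode;

		inode->i_ctime = CURRENT_TIME_SEC;
		inode_inc_link_count(inode);
		ihold(inode);		/* reference for the new dentry */

		d_instantiate(dentry, inode);
		return 0;
	}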
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index a0b0cda6927e..f60c196913ea 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -332,6 +332,10 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; 332 sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
333 /* set up enough so that it can read an inode */ 333 /* set up enough so that it can read an inode */
334 sb->s_op = &sysv_sops; 334 sb->s_op = &sysv_sops;
335 if (sbi->s_forced_ro)
336 sb->s_flags |= MS_RDONLY;
337 if (sbi->s_truncate)
338 sb->s_d_op = &sysv_dentry_operations;
335 root_inode = sysv_iget(sb, SYSV_ROOT_INO); 339 root_inode = sysv_iget(sb, SYSV_ROOT_INO);
336 if (IS_ERR(root_inode)) { 340 if (IS_ERR(root_inode)) {
337 printk("SysV FS: get root inode failed\n"); 341 printk("SysV FS: get root inode failed\n");
@@ -343,10 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
343 printk("SysV FS: get root dentry failed\n"); 347 printk("SysV FS: get root dentry failed\n");
344 return 0; 348 return 0;
345 } 349 }
346 if (sbi->s_forced_ro)
347 sb->s_flags |= MS_RDONLY;
348 if (sbi->s_truncate)
349 sb->s_root->d_op = &sysv_dentry_operations;
350 return 1; 350 return 1;
351} 351}
352 352
@@ -526,23 +526,22 @@ failed:
526 526
527/* Every kernel module contains stuff like this. */ 527/* Every kernel module contains stuff like this. */
528 528
529static int sysv_get_sb(struct file_system_type *fs_type, 529static struct dentry *sysv_mount(struct file_system_type *fs_type,
530 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 530 int flags, const char *dev_name, void *data)
531{ 531{
532 return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super, 532 return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
533 mnt);
534} 533}
535 534
536static int v7_get_sb(struct file_system_type *fs_type, 535static struct dentry *v7_mount(struct file_system_type *fs_type,
537 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 536 int flags, const char *dev_name, void *data)
538{ 537{
539 return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt); 538 return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
540} 539}
541 540
542static struct file_system_type sysv_fs_type = { 541static struct file_system_type sysv_fs_type = {
543 .owner = THIS_MODULE, 542 .owner = THIS_MODULE,
544 .name = "sysv", 543 .name = "sysv",
545 .get_sb = sysv_get_sb, 544 .mount = sysv_mount,
546 .kill_sb = kill_block_super, 545 .kill_sb = kill_block_super,
547 .fs_flags = FS_REQUIRES_DEV, 546 .fs_flags = FS_REQUIRES_DEV,
548}; 547};
@@ -550,7 +549,7 @@ static struct file_system_type sysv_fs_type = {
550static struct file_system_type v7_fs_type = { 549static struct file_system_type v7_fs_type = {
551 .owner = THIS_MODULE, 550 .owner = THIS_MODULE,
552 .name = "v7", 551 .name = "v7",
553 .get_sb = v7_get_sb, 552 .mount = v7_mount,
554 .kill_sb = kill_block_super, 553 .kill_sb = kill_block_super,
555 .fs_flags = FS_REQUIRES_DEV, 554 .fs_flags = FS_REQUIRES_DEV,
556}; 555};
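The sysv hunks above also show the new sb->s_d_op convention: installing the default dentry operations on the superblock before any dentry exists lets d_alloc() attach them automatically, which is why the manual "dentry->d_op = dir->i_sb->s_root->d_op;" assignment disappears from sysv_lookup(). A sketch of the fill_super ordering (example_* names are illustrative):

	static int example_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct inode *root_inode;

		sb->s_op = &example_sops;
		/* install default dentry ops before any dentry is created */
		sb->s_d_op = &example_dentry_operations;

		root_inode = example_iget(sb, EXAMPLE_ROOT_INO);
		if (IS_ERR(root_inode))
			return PTR_ERR(root_inode);

		sb->s_root = d_alloc_root(root_inode);	/* inherits s_d_op */
		if (!sb->s_root) {
			iput(root_inode);
			return -ENOMEM;
		}
		return 0;
	}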
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b86ab8eff79a..8c4fc1425b3e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -144,6 +144,7 @@ static const struct file_operations timerfd_fops = {
144 .release = timerfd_release, 144 .release = timerfd_release,
145 .poll = timerfd_poll, 145 .poll = timerfd_poll,
146 .read = timerfd_read, 146 .read = timerfd_read,
147 .llseek = noop_llseek,
147}; 148};
148 149
149static struct file *timerfd_fget(int fd) 150static struct file *timerfd_fget(int fd)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 37fa7ed062d8..02429d81ca33 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -63,7 +63,9 @@ static int do_commit(struct ubifs_info *c)
63 struct ubifs_lp_stats lst; 63 struct ubifs_lp_stats lst;
64 64
65 dbg_cmt("start"); 65 dbg_cmt("start");
66 if (c->ro_media) { 66 ubifs_assert(!c->ro_media && !c->ro_mount);
67
68 if (c->ro_error) {
67 err = -EROFS; 69 err = -EROFS;
68 goto out_up; 70 goto out_up;
69 } 71 }
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68baa782f..0bee4dbffc31 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2239,6 +2239,162 @@ out_free:
2239 return err; 2239 return err;
2240} 2240}
2241 2241
2242/**
2243 * dbg_check_data_nodes_order - check that list of data nodes is sorted.
2244 * @c: UBIFS file-system description object
2245 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2246 *
2247 * This function returns zero if the list of data nodes is sorted correctly,
2248 * and %-EINVAL if not.
2249 */
2250int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
2251{
2252 struct list_head *cur;
2253 struct ubifs_scan_node *sa, *sb;
2254
2255 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2256 return 0;
2257
2258 for (cur = head->next; cur->next != head; cur = cur->next) {
2259 ino_t inuma, inumb;
2260 uint32_t blka, blkb;
2261
2262 cond_resched();
2263 sa = container_of(cur, struct ubifs_scan_node, list);
2264 sb = container_of(cur->next, struct ubifs_scan_node, list);
2265
2266 if (sa->type != UBIFS_DATA_NODE) {
2267 ubifs_err("bad node type %d", sa->type);
2268 dbg_dump_node(c, sa->node);
2269 return -EINVAL;
2270 }
2271 if (sb->type != UBIFS_DATA_NODE) {
2272 ubifs_err("bad node type %d", sb->type);
2273 dbg_dump_node(c, sb->node);
2274 return -EINVAL;
2275 }
2276
2277 inuma = key_inum(c, &sa->key);
2278 inumb = key_inum(c, &sb->key);
2279
2280 if (inuma < inumb)
2281 continue;
2282 if (inuma > inumb) {
2283 ubifs_err("larger inum %lu goes before inum %lu",
2284 (unsigned long)inuma, (unsigned long)inumb);
2285 goto error_dump;
2286 }
2287
2288 blka = key_block(c, &sa->key);
2289 blkb = key_block(c, &sb->key);
2290
2291 if (blka > blkb) {
2292 ubifs_err("larger block %u goes before %u", blka, blkb);
2293 goto error_dump;
2294 }
2295 if (blka == blkb) {
2296 ubifs_err("two data nodes for the same block");
2297 goto error_dump;
2298 }
2299 }
2300
2301 return 0;
2302
2303error_dump:
2304 dbg_dump_node(c, sa->node);
2305 dbg_dump_node(c, sb->node);
2306 return -EINVAL;
2307}
2308
2309/**
 2310 * dbg_check_nondata_nodes_order - check that list of non-data nodes is sorted.
2311 * @c: UBIFS file-system description object
2312 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2313 *
2314 * This function returns zero if the list of non-data nodes is sorted correctly,
2315 * and %-EINVAL if not.
2316 */
2317int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
2318{
2319 struct list_head *cur;
2320 struct ubifs_scan_node *sa, *sb;
2321
2322 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2323 return 0;
2324
2325 for (cur = head->next; cur->next != head; cur = cur->next) {
2326 ino_t inuma, inumb;
2327 uint32_t hasha, hashb;
2328
2329 cond_resched();
2330 sa = container_of(cur, struct ubifs_scan_node, list);
2331 sb = container_of(cur->next, struct ubifs_scan_node, list);
2332
2333 if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
2334 sa->type != UBIFS_XENT_NODE) {
2335 ubifs_err("bad node type %d", sa->type);
2336 dbg_dump_node(c, sa->node);
2337 return -EINVAL;
2338 }
 2339 if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE &&
 2340 sb->type != UBIFS_XENT_NODE) {
2341 ubifs_err("bad node type %d", sb->type);
2342 dbg_dump_node(c, sb->node);
2343 return -EINVAL;
2344 }
2345
2346 if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2347 ubifs_err("non-inode node goes before inode node");
2348 goto error_dump;
2349 }
2350
2351 if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
2352 continue;
2353
2354 if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2355 /* Inode nodes are sorted in descending size order */
2356 if (sa->len < sb->len) {
2357 ubifs_err("smaller inode node goes first");
2358 goto error_dump;
2359 }
2360 continue;
2361 }
2362
2363 /*
2364 * This is either a dentry or xentry, which should be sorted in
2365 * ascending (parent ino, hash) order.
2366 */
2367 inuma = key_inum(c, &sa->key);
2368 inumb = key_inum(c, &sb->key);
2369
2370 if (inuma < inumb)
2371 continue;
2372 if (inuma > inumb) {
2373 ubifs_err("larger inum %lu goes before inum %lu",
2374 (unsigned long)inuma, (unsigned long)inumb);
2375 goto error_dump;
2376 }
2377
2378 hasha = key_block(c, &sa->key);
2379 hashb = key_block(c, &sb->key);
2380
2381 if (hasha > hashb) {
2382 ubifs_err("larger hash %u goes before %u", hasha, hashb);
2383 goto error_dump;
2384 }
2385 }
2386
2387 return 0;
2388
2389error_dump:
2390 ubifs_msg("dumping first node");
2391 dbg_dump_node(c, sa->node);
2392 ubifs_msg("dumping second node");
2393 dbg_dump_node(c, sb->node);
2394 return -EINVAL;
2396}
2397
2242static int invocation_cnt; 2398static int invocation_cnt;
2243 2399
2244int dbg_force_in_the_gaps(void) 2400int dbg_force_in_the_gaps(void)
@@ -2625,6 +2781,7 @@ static const struct file_operations dfs_fops = {
2625 .open = open_debugfs_file, 2781 .open = open_debugfs_file,
2626 .write = write_debugfs_file, 2782 .write = write_debugfs_file,
2627 .owner = THIS_MODULE, 2783 .owner = THIS_MODULE,
2784 .llseek = default_llseek,
2628}; 2785};
2629 2786
2630/** 2787/**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 29d960101ea6..69ebe4729151 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -324,6 +324,8 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
324 int row, int col); 324 int row, int col);
325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, 325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
326 loff_t size); 326 loff_t size);
327int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
328int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
327 329
328/* Force the use of in-the-gaps method for testing */ 330/* Force the use of in-the-gaps method for testing */
329 331
@@ -465,6 +467,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
465#define dbg_check_lprops(c) 0 467#define dbg_check_lprops(c) 0
466#define dbg_check_lpt_nodes(c, cnode, row, col) 0 468#define dbg_check_lpt_nodes(c, cnode, row, col) 0
467#define dbg_check_inode_size(c, inode, size) 0 469#define dbg_check_inode_size(c, inode, size) 0
470#define dbg_check_data_nodes_order(c, head) 0
471#define dbg_check_nondata_nodes_order(c, head) 0
468#define dbg_force_in_the_gaps_enabled 0 472#define dbg_force_in_the_gaps_enabled 0
469#define dbg_force_in_the_gaps() 0 473#define dbg_force_in_the_gaps() 0
470#define dbg_failure_mode 0 474#define dbg_failure_mode 0
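debug.h follows the usual UBIFS convention for compiled-out debugging: each new check collapses to a constant 0 when CONFIG_UBIFS_FS_DEBUG is off, so callers such as sort_nodes() in gc.c below can invoke them unconditionally with no #ifdef, and an always-zero check costs nothing at run time. The shape, roughly:

	#ifdef CONFIG_UBIFS_FS_DEBUG
	int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
	int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
	#else
	#define dbg_check_data_nodes_order(c, head)	0
	#define dbg_check_nondata_nodes_order(c, head)	0
	#endif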
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce72213..14f64b689d7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
550 550
551 lock_2_inodes(dir, inode); 551 lock_2_inodes(dir, inode);
552 inc_nlink(inode); 552 inc_nlink(inode);
553 atomic_inc(&inode->i_count); 553 ihold(inode);
554 inode->i_ctime = ubifs_current_time(inode); 554 inode->i_ctime = ubifs_current_time(inode);
555 dir->i_size += sz_change; 555 dir->i_size += sz_change;
556 dir_ui->ui_size = dir->i_size; 556 dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 03ae894c45de..d77db7e36484 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -433,8 +433,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
433 struct page *page; 433 struct page *page;
434 434
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
436 ubifs_assert(!c->ro_media && !c->ro_mount);
436 437
437 if (unlikely(c->ro_media)) 438 if (unlikely(c->ro_error))
438 return -EROFS; 439 return -EROFS;
439 440
440 /* Try out the fast-path part first */ 441 /* Try out the fast-path part first */
@@ -1439,9 +1440,9 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vm
1439 1440
1440 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, 1441 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
1441 i_size_read(inode)); 1442 i_size_read(inode));
1442 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1443 ubifs_assert(!c->ro_media && !c->ro_mount);
1443 1444
1444 if (unlikely(c->ro_media)) 1445 if (unlikely(c->ro_error))
1445 return VM_FAULT_SIGBUS; /* -EROFS */ 1446 return VM_FAULT_SIGBUS; /* -EROFS */
1446 1447
1447 /* 1448 /*
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 918d1582ca05..151f10882820 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -125,10 +125,16 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
125 struct ubifs_scan_node *sa, *sb; 125 struct ubifs_scan_node *sa, *sb;
126 126
127 cond_resched(); 127 cond_resched();
128 if (a == b)
129 return 0;
130
128 sa = list_entry(a, struct ubifs_scan_node, list); 131 sa = list_entry(a, struct ubifs_scan_node, list);
129 sb = list_entry(b, struct ubifs_scan_node, list); 132 sb = list_entry(b, struct ubifs_scan_node, list);
133
130 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); 134 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
131 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); 135 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
136 ubifs_assert(sa->type == UBIFS_DATA_NODE);
137 ubifs_assert(sb->type == UBIFS_DATA_NODE);
132 138
133 inuma = key_inum(c, &sa->key); 139 inuma = key_inum(c, &sa->key);
134 inumb = key_inum(c, &sb->key); 140 inumb = key_inum(c, &sb->key);
@@ -157,28 +163,40 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
157 */ 163 */
158int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 164int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
159{ 165{
160 int typea, typeb;
161 ino_t inuma, inumb; 166 ino_t inuma, inumb;
162 struct ubifs_info *c = priv; 167 struct ubifs_info *c = priv;
163 struct ubifs_scan_node *sa, *sb; 168 struct ubifs_scan_node *sa, *sb;
164 169
165 cond_resched(); 170 cond_resched();
171 if (a == b)
172 return 0;
173
166 sa = list_entry(a, struct ubifs_scan_node, list); 174 sa = list_entry(a, struct ubifs_scan_node, list);
167 sb = list_entry(b, struct ubifs_scan_node, list); 175 sb = list_entry(b, struct ubifs_scan_node, list);
168 typea = key_type(c, &sa->key); 176
169 typeb = key_type(c, &sb->key); 177 ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY &&
170 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); 178 key_type(c, &sb->key) != UBIFS_DATA_KEY);
179 ubifs_assert(sa->type != UBIFS_DATA_NODE &&
180 sb->type != UBIFS_DATA_NODE);
171 181
172 /* Inodes go before directory entries */ 182 /* Inodes go before directory entries */
173 if (typea == UBIFS_INO_KEY) { 183 if (sa->type == UBIFS_INO_NODE) {
174 if (typeb == UBIFS_INO_KEY) 184 if (sb->type == UBIFS_INO_NODE)
175 return sb->len - sa->len; 185 return sb->len - sa->len;
176 return -1; 186 return -1;
177 } 187 }
178 if (typeb == UBIFS_INO_KEY) 188 if (sb->type == UBIFS_INO_NODE)
179 return 1; 189 return 1;
180 190
181 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); 191 ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY ||
192 key_type(c, &sa->key) == UBIFS_XENT_KEY);
193 ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY ||
194 key_type(c, &sb->key) == UBIFS_XENT_KEY);
195 ubifs_assert(sa->type == UBIFS_DENT_NODE ||
196 sa->type == UBIFS_XENT_NODE);
197 ubifs_assert(sb->type == UBIFS_DENT_NODE ||
198 sb->type == UBIFS_XENT_NODE);
199
182 inuma = key_inum(c, &sa->key); 200 inuma = key_inum(c, &sa->key);
183 inumb = key_inum(c, &sb->key); 201 inumb = key_inum(c, &sb->key);
184 202
@@ -224,17 +242,33 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
224static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, 242static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
225 struct list_head *nondata, int *min) 243 struct list_head *nondata, int *min)
226{ 244{
245 int err;
227 struct ubifs_scan_node *snod, *tmp; 246 struct ubifs_scan_node *snod, *tmp;
228 247
229 *min = INT_MAX; 248 *min = INT_MAX;
230 249
231 /* Separate data nodes and non-data nodes */ 250 /* Separate data nodes and non-data nodes */
232 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 251 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
233 int err; 252 ubifs_assert(snod->type == UBIFS_INO_NODE ||
253 snod->type == UBIFS_DATA_NODE ||
254 snod->type == UBIFS_DENT_NODE ||
255 snod->type == UBIFS_XENT_NODE ||
256 snod->type == UBIFS_TRUN_NODE);
257
258 if (snod->type != UBIFS_INO_NODE &&
259 snod->type != UBIFS_DATA_NODE &&
260 snod->type != UBIFS_DENT_NODE &&
261 snod->type != UBIFS_XENT_NODE) {
262 /* Probably truncation node, zap it */
263 list_del(&snod->list);
264 kfree(snod);
265 continue;
266 }
234 267
235 ubifs_assert(snod->type != UBIFS_IDX_NODE); 268 ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY ||
236 ubifs_assert(snod->type != UBIFS_REF_NODE); 269 key_type(c, &snod->key) == UBIFS_INO_KEY ||
237 ubifs_assert(snod->type != UBIFS_CS_NODE); 270 key_type(c, &snod->key) == UBIFS_DENT_KEY ||
271 key_type(c, &snod->key) == UBIFS_XENT_KEY);
238 272
239 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 273 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
240 snod->offs, 0); 274 snod->offs, 0);
@@ -258,6 +292,13 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
258 /* Sort data and non-data nodes */ 292 /* Sort data and non-data nodes */
259 list_sort(c, &sleb->nodes, &data_nodes_cmp); 293 list_sort(c, &sleb->nodes, &data_nodes_cmp);
260 list_sort(c, nondata, &nondata_nodes_cmp); 294 list_sort(c, nondata, &nondata_nodes_cmp);
295
296 err = dbg_check_data_nodes_order(c, &sleb->nodes);
297 if (err)
298 return err;
299 err = dbg_check_nondata_nodes_order(c, nondata);
300 if (err)
301 return err;
261 return 0; 302 return 0;
262} 303}
263 304
@@ -575,13 +616,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
575 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 616 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
576 617
577 ubifs_assert_cmt_locked(c); 618 ubifs_assert_cmt_locked(c);
619 ubifs_assert(!c->ro_media && !c->ro_mount);
578 620
579 if (ubifs_gc_should_commit(c)) 621 if (ubifs_gc_should_commit(c))
580 return -EAGAIN; 622 return -EAGAIN;
581 623
582 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 624 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
583 625
584 if (c->ro_media) { 626 if (c->ro_error) {
585 ret = -EROFS; 627 ret = -EROFS;
586 goto out_unlock; 628 goto out_unlock;
587 } 629 }
@@ -677,14 +719,12 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
677 719
678 ret = ubifs_garbage_collect_leb(c, &lp); 720 ret = ubifs_garbage_collect_leb(c, &lp);
679 if (ret < 0) { 721 if (ret < 0) {
680 if (ret == -EAGAIN || ret == -ENOSPC) { 722 if (ret == -EAGAIN) {
681 /* 723 /*
682 * These codes are not errors, so we have to 724 * This is not an error, so we have to return the
683 * return the LEB to lprops. But if the 725 * LEB to lprops. But if 'ubifs_return_leb()'
684 * 'ubifs_return_leb()' function fails, its 726 * fails, its failure code is propagated to the
685 * failure code is propagated to the caller 727 * caller instead of the original '-EAGAIN'.
686 * instead of the original '-EAGAIN' or
687 * '-ENOSPC'.
688 */ 728 */
689 err = ubifs_return_leb(c, lp.lnum); 729 err = ubifs_return_leb(c, lp.lnum);
690 if (err) 730 if (err)
@@ -774,8 +814,8 @@ out_unlock:
774out: 814out:
775 ubifs_assert(ret < 0); 815 ubifs_assert(ret < 0);
776 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); 816 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
777 ubifs_ro_mode(c, ret);
778 ubifs_wbuf_sync_nolock(wbuf); 817 ubifs_wbuf_sync_nolock(wbuf);
818 ubifs_ro_mode(c, ret);
779 mutex_unlock(&wbuf->io_mutex); 819 mutex_unlock(&wbuf->io_mutex);
780 ubifs_return_leb(c, lp.lnum); 820 ubifs_return_leb(c, lp.lnum);
781 return ret; 821 return ret;
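Both comparators above gain an early "if (a == b) return 0;": this kernel's list_sort() deliberately invokes cmp() with the same node as both arguments during long merges, solely so the callback gets a chance to cond_resched(), and the early return handles that call cheaply before any key is examined. The comparator contract in generic form (struct example_node is illustrative):

	struct example_node {
		struct list_head list;
		unsigned long key;
	};

	static int example_cmp(void *priv, struct list_head *a, struct list_head *b)
	{
		struct example_node *na, *nb;

		cond_resched();		/* cmp() is the only hook list_sort() gives us */
		if (a == b)
			return 0;	/* list_sort() may pass the same node twice */

		na = list_entry(a, struct example_node, list);
		nb = list_entry(b, struct example_node, list);

		/* negative: a sorts first; positive: b sorts first; 0: tie */
		if (na->key < nb->key)
			return -1;
		return na->key > nb->key ? 1 : 0;
	}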
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index bcf5a16f30bb..d82173182eeb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -61,8 +61,8 @@
61 */ 61 */
62void ubifs_ro_mode(struct ubifs_info *c, int err) 62void ubifs_ro_mode(struct ubifs_info *c, int err)
63{ 63{
64 if (!c->ro_media) { 64 if (!c->ro_error) {
65 c->ro_media = 1; 65 c->ro_error = 1;
66 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY; 67 c->vfs_sb->s_flags |= MS_RDONLY;
68 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
@@ -356,11 +356,11 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
356 356
357 dbg_io("LEB %d:%d, %d bytes, jhead %s", 357 dbg_io("LEB %d:%d, %d bytes, jhead %s",
358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); 358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
359 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
360 ubifs_assert(!(wbuf->avail & 7)); 359 ubifs_assert(!(wbuf->avail & 7));
361 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 360 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
361 ubifs_assert(!c->ro_media && !c->ro_mount);
362 362
363 if (c->ro_media) 363 if (c->ro_error)
364 return -EROFS; 364 return -EROFS;
365 365
366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); 366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
@@ -440,11 +440,12 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
440{ 440{
441 int err, i; 441 int err, i;
442 442
443 ubifs_assert(!c->ro_media && !c->ro_mount);
443 if (!c->need_wbuf_sync) 444 if (!c->need_wbuf_sync)
444 return 0; 445 return 0;
445 c->need_wbuf_sync = 0; 446 c->need_wbuf_sync = 0;
446 447
447 if (c->ro_media) { 448 if (c->ro_error) {
448 err = -EROFS; 449 err = -EROFS;
449 goto out_timers; 450 goto out_timers;
450 } 451 }
@@ -519,6 +520,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
519 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 520 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
520 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); 521 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
521 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 522 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
523 ubifs_assert(!c->ro_media && !c->ro_mount);
522 524
523 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 525 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
524 err = -ENOSPC; 526 err = -ENOSPC;
@@ -527,7 +529,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
527 529
528 cancel_wbuf_timer_nolock(wbuf); 530 cancel_wbuf_timer_nolock(wbuf);
529 531
530 if (c->ro_media) 532 if (c->ro_error)
531 return -EROFS; 533 return -EROFS;
532 534
533 if (aligned_len <= wbuf->avail) { 535 if (aligned_len <= wbuf->avail) {
@@ -663,8 +665,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
663 buf_len); 665 buf_len);
664 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 666 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
665 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); 667 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
668 ubifs_assert(!c->ro_media && !c->ro_mount);
666 669
667 if (c->ro_media) 670 if (c->ro_error)
668 return -EROFS; 671 return -EROFS;
669 672
670 ubifs_prepare_node(c, buf, len, 1); 673 ubifs_prepare_node(c, buf, len, 1);
@@ -815,7 +818,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
815 return 0; 818 return 0;
816 819
817out: 820out:
818 ubifs_err("bad node at LEB %d:%d", lnum, offs); 821 ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
822 ubi_is_mapped(c->ubi, lnum));
819 dbg_dump_node(c, buf); 823 dbg_dump_node(c, buf);
820 dbg_dump_stack(); 824 dbg_dump_stack();
821 return -EINVAL; 825 return -EINVAL;
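This series splits UBIFS's overloaded ro_media flag into three: c->ro_mount (the filesystem was mounted read-only), c->ro_media (the underlying UBI volume itself is read-only), and c->ro_error (forced read-only after an error). Write paths can only legitimately be reached on a writable mount over writable media, so they assert the first two and test only the last at run time, as the hunks above and below show. The resulting idiom:

	static int example_write_path(struct ubifs_info *c)
	{
		ubifs_assert(!c->ro_media && !c->ro_mount);
		if (c->ro_error)
			return -EROFS;	/* an earlier error made us R/O */

		/* ... proceed with the write ... */
		return 0;
	}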
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d321baeca68d..914f1bd89e57 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -122,11 +122,12 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len)
122 * better to try to allocate space at the ends of eraseblocks. This is 122 * better to try to allocate space at the ends of eraseblocks. This is
123 * what the squeeze parameter does. 123 * what the squeeze parameter does.
124 */ 124 */
125 ubifs_assert(!c->ro_media && !c->ro_mount);
125 squeeze = (jhead == BASEHD); 126 squeeze = (jhead == BASEHD);
126again: 127again:
127 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 128 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
128 129
129 if (c->ro_media) { 130 if (c->ro_error) {
130 err = -EROFS; 131 err = -EROFS;
131 goto out_unlock; 132 goto out_unlock;
132 } 133 }
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 0f530c684f0b..92a8491a8f8c 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -306,6 +306,20 @@ static inline void trun_key_init(const struct ubifs_info *c,
306} 306}
307 307
308/** 308/**
309 * invalid_key_init - initialize invalid node key.
310 * @c: UBIFS file-system description object
311 * @key: key to initialize
312 *
313 * This is a helper function which marks a @key object as invalid.
314 */
315static inline void invalid_key_init(const struct ubifs_info *c,
316 union ubifs_key *key)
317{
318 key->u32[0] = 0xDEADBEAF;
319 key->u32[1] = UBIFS_INVALID_KEY;
320}
321
322/**
309 * key_type - get key type. 323 * key_type - get key type.
310 * @c: UBIFS file-system description object 324 * @c: UBIFS file-system description object
311 * @key: key to get type of 325 * @key: key to get type of
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c345e125f42c..4d0cb1241460 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -159,7 +159,7 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
159 jhead = &c->jheads[bud->jhead]; 159 jhead = &c->jheads[bud->jhead];
160 list_add_tail(&bud->list, &jhead->buds_list); 160 list_add_tail(&bud->list, &jhead->buds_list);
161 } else 161 } else
162 ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY)); 162 ubifs_assert(c->replaying && c->ro_mount);
163 163
164 /* 164 /*
165 * Note, although this is a new bud, we anyway account this space now, 165 * Note, although this is a new bud, we anyway account this space now,
@@ -223,8 +223,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
223 } 223 }
224 224
225 mutex_lock(&c->log_mutex); 225 mutex_lock(&c->log_mutex);
226 226 ubifs_assert(!c->ro_media && !c->ro_mount);
227 if (c->ro_media) { 227 if (c->ro_error) {
228 err = -EROFS; 228 err = -EROFS;
229 goto out_unlock; 229 goto out_unlock;
230 } 230 }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 0084a33c4c69..72775d35b99e 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1363,6 +1363,7 @@ static int read_lsave(struct ubifs_info *c)
1363 goto out; 1363 goto out;
1364 for (i = 0; i < c->lsave_cnt; i++) { 1364 for (i = 0; i < c->lsave_cnt; i++) {
1365 int lnum = c->lsave[i]; 1365 int lnum = c->lsave[i];
1366 struct ubifs_lprops *lprops;
1366 1367
1367 /* 1368 /*
1368 * Due to automatic resizing, the values in the lsave table 1369 * Due to automatic resizing, the values in the lsave table
@@ -1370,7 +1371,11 @@ static int read_lsave(struct ubifs_info *c)
1370 */ 1371 */
1371 if (lnum >= c->leb_cnt) 1372 if (lnum >= c->leb_cnt)
1372 continue; 1373 continue;
1373 ubifs_lpt_lookup(c, lnum); 1374 lprops = ubifs_lpt_lookup(c, lnum);
1375 if (IS_ERR(lprops)) {
1376 err = PTR_ERR(lprops);
1377 goto out;
1378 }
1374 } 1379 }
1375out: 1380out:
1376 vfree(buf); 1381 vfree(buf);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index d12535b7fc78..5c90dec5db0b 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -705,6 +705,9 @@ static int make_tree_dirty(struct ubifs_info *c)
705 struct ubifs_pnode *pnode; 705 struct ubifs_pnode *pnode;
706 706
707 pnode = pnode_lookup(c, 0); 707 pnode = pnode_lookup(c, 0);
708 if (IS_ERR(pnode))
709 return PTR_ERR(pnode);
710
708 while (pnode) { 711 while (pnode) {
709 do_make_pnode_dirty(c, pnode); 712 do_make_pnode_dirty(c, pnode);
710 pnode = next_pnode_to_dirty(c, pnode); 713 pnode = next_pnode_to_dirty(c, pnode);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 28beaeedadc0..21f47afdacff 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -361,7 +361,8 @@ int ubifs_write_master(struct ubifs_info *c)
361{ 361{
362 int err, lnum, offs, len; 362 int err, lnum, offs, len;
363 363
364 if (c->ro_media) 364 ubifs_assert(!c->ro_media && !c->ro_mount);
365 if (c->ro_error)
365 return -EROFS; 366 return -EROFS;
366 367
367 lnum = UBIFS_MST_LNUM; 368 lnum = UBIFS_MST_LNUM;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4fa81d867e41..c3de04dc952a 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -132,7 +132,8 @@ static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
132{ 132{
133 int err; 133 int err;
134 134
135 if (c->ro_media) 135 ubifs_assert(!c->ro_media && !c->ro_mount);
136 if (c->ro_error)
136 return -EROFS; 137 return -EROFS;
137 err = ubi_leb_unmap(c->ubi, lnum); 138 err = ubi_leb_unmap(c->ubi, lnum);
138 if (err) { 139 if (err) {
@@ -159,7 +160,8 @@ static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
159{ 160{
160 int err; 161 int err;
161 162
162 if (c->ro_media) 163 ubifs_assert(!c->ro_media && !c->ro_mount);
164 if (c->ro_error)
163 return -EROFS; 165 return -EROFS;
164 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); 166 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
165 if (err) { 167 if (err) {
@@ -186,7 +188,8 @@ static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
186{ 188{
187 int err; 189 int err;
188 190
189 if (c->ro_media) 191 ubifs_assert(!c->ro_media && !c->ro_mount);
192 if (c->ro_error)
190 return -EROFS; 193 return -EROFS;
191 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); 194 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
192 if (err) { 195 if (err) {
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index daae9e1f5382..77e9b874b6c2 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -292,7 +292,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
292 292
293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); 293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
294 294
295 if ((c->vfs_sb->s_flags & MS_RDONLY)) { 295 if (c->ro_mount) {
296 /* Read-only mode. Keep a copy for switching to rw mode */ 296 /* Read-only mode. Keep a copy for switching to rw mode */
297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); 297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
298 if (!c->rcvrd_mst_node) { 298 if (!c->rcvrd_mst_node) {
@@ -469,7 +469,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
469 endpt = snod->offs + snod->len; 469 endpt = snod->offs + snod->len;
470 } 470 }
471 471
472 if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) { 472 if (c->ro_mount && !c->remounting_rw) {
473 /* Add to recovery list */ 473 /* Add to recovery list */
474 struct ubifs_unclean_leb *ucleb; 474 struct ubifs_unclean_leb *ucleb;
475 475
@@ -772,7 +772,8 @@ out_free:
772 * @sbuf: LEB-sized buffer to use 772 * @sbuf: LEB-sized buffer to use
773 * 773 *
774 * This function does a scan of a LEB, but caters for errors that might have 774 * This function does a scan of a LEB, but caters for errors that might have
775 * been caused by the unclean unmount from which we are attempting to recover. 775 * been caused by unclean reboots from which we are attempting to recover
776 * (assume that only the last log LEB can be corrupted by an unclean reboot).
776 * 777 *
777 * This function returns %0 on success and a negative error code on failure. 778 * This function returns %0 on success and a negative error code on failure.
778 */ 779 */
@@ -883,7 +884,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
883{ 884{
884 int err; 885 int err;
885 886
886 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); 887 ubifs_assert(!c->ro_mount || c->remounting_rw);
887 888
888 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); 889 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
889 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); 890 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
@@ -1461,7 +1462,7 @@ int ubifs_recover_size(struct ubifs_info *c)
1461 } 1462 }
1462 } 1463 }
1463 if (e->exists && e->i_size < e->d_size) { 1464 if (e->exists && e->i_size < e->d_size) {
1464 if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) { 1465 if (!e->inode && c->ro_mount) {
1465 /* Fix the inode size and pin it in memory */ 1466 /* Fix the inode size and pin it in memory */
1466 struct inode *inode; 1467 struct inode *inode;
1467 1468
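The recovery.c conversions are mechanical: every `c->vfs_sb->s_flags & MS_RDONLY` test becomes the cached `c->ro_mount` bit. The value of the cache is that it is snapshotted once at mount time and flipped only by UBIFS's own remount code (both shown in the super.c hunks later in this diff), so unlike sb->s_flags it cannot change underneath a recovery operation while a remount is in flight:

	/* In mount_ubifs(), per the super.c hunk below: */
	c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);

	/* Thereafter flipped only at the end of ubifs_remount_rw() and
	 * in ubifs_remount_ro(), under the remount paths' own locking. */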
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5c2d6d759a3e..eed0fcff8d73 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -627,8 +627,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
627 ubifs_assert(sleb->endpt - offs >= used); 627 ubifs_assert(sleb->endpt - offs >= used);
628 ubifs_assert(sleb->endpt % c->min_io_size == 0); 628 ubifs_assert(sleb->endpt % c->min_io_size == 0);
629 629
630 if (sleb->endpt + c->min_io_size <= c->leb_size && 630 if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount)
631 !(c->vfs_sb->s_flags & MS_RDONLY))
632 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, 631 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
633 sleb->endpt, UBI_SHORTTERM); 632 sleb->endpt, UBI_SHORTTERM);
634 633
@@ -840,6 +839,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
840 if (IS_ERR(sleb)) { 839 if (IS_ERR(sleb)) {
841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) 840 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 return PTR_ERR(sleb); 841 return PTR_ERR(sleb);
842 /*
843 * Note, the below function will recover this log LEB only if
844 * it is the last, because unclean reboots can possibly corrupt
845 * only the tail of the log.
846 */
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 847 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
844 if (IS_ERR(sleb)) 848 if (IS_ERR(sleb))
845 return PTR_ERR(sleb); 849 return PTR_ERR(sleb);
@@ -851,7 +855,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
851 } 855 }
852 856
853 node = sleb->buf; 857 node = sleb->buf;
854
855 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); 858 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
856 if (c->cs_sqnum == 0) { 859 if (c->cs_sqnum == 0) {
857 /* 860 /*
@@ -898,7 +901,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
898 } 901 }
899 902
900 list_for_each_entry(snod, &sleb->nodes, list) { 903 list_for_each_entry(snod, &sleb->nodes, list) {
901
902 cond_resched(); 904 cond_resched();
903 905
904 if (snod->sqnum >= SQNUM_WATERMARK) { 906 if (snod->sqnum >= SQNUM_WATERMARK) {
@@ -1011,7 +1013,6 @@ out:
1011int ubifs_replay_journal(struct ubifs_info *c) 1013int ubifs_replay_journal(struct ubifs_info *c)
1012{ 1014{
1013 int err, i, lnum, offs, free; 1015 int err, i, lnum, offs, free;
1014 void *sbuf = NULL;
1015 1016
1016 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); 1017 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
1017 1018
@@ -1026,14 +1027,8 @@ int ubifs_replay_journal(struct ubifs_info *c)
1026 return -EINVAL; 1027 return -EINVAL;
1027 } 1028 }
1028 1029
1029 sbuf = vmalloc(c->leb_size);
1030 if (!sbuf)
1031 return -ENOMEM;
1032
1033 dbg_mnt("start replaying the journal"); 1030 dbg_mnt("start replaying the journal");
1034
1035 c->replaying = 1; 1031 c->replaying = 1;
1036
1037 lnum = c->ltail_lnum = c->lhead_lnum; 1032 lnum = c->ltail_lnum = c->lhead_lnum;
1038 offs = c->lhead_offs; 1033 offs = c->lhead_offs;
1039 1034
@@ -1046,7 +1041,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
1046 lnum = UBIFS_LOG_LNUM; 1041 lnum = UBIFS_LOG_LNUM;
1047 offs = 0; 1042 offs = 0;
1048 } 1043 }
1049 err = replay_log_leb(c, lnum, offs, sbuf); 1044 err = replay_log_leb(c, lnum, offs, c->sbuf);
1050 if (err == 1) 1045 if (err == 1)
1051 /* We hit the end of the log */ 1046 /* We hit the end of the log */
1052 break; 1047 break;
@@ -1079,7 +1074,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
1079out: 1074out:
1080 destroy_replay_tree(c); 1075 destroy_replay_tree(c);
1081 destroy_bud_list(c); 1076 destroy_bud_list(c);
1082 vfree(sbuf);
1083 c->replaying = 0; 1077 c->replaying = 0;
1084 return err; 1078 return err;
1085} 1079}
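Journal replay no longer allocates its own LEB-sized scan buffer: the local vmalloc()/vfree() pair is dropped and replay_log_leb() scans into c->sbuf, the buffer mount_ubifs() already sets up (note the `if (!c->sbuf) goto out_free;` check in the super.c hunks below). Replay runs single-threaded during mount, so one scratch buffer suffices and a second LEB-sized allocation, potentially hundreds of KiB on large-erase-block flash, is saved. The resulting ownership, sketched:

	/* mount_ubifs(): one LEB-sized scratch buffer for the whole mount */
	c->sbuf = vmalloc(c->leb_size);
	if (!c->sbuf)
		goto out_free;
	/* ... other mount steps ... */
	err = ubifs_replay_journal(c);	/* scans log LEBs into c->sbuf */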
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 96cb62c8a9dd..bf31b4729e51 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -542,11 +542,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
542 * due to the unavailability of time-travelling equipment. 542 * due to the unavailability of time-travelling equipment.
543 */ 543 */
544 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 544 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
545 struct super_block *sb = c->vfs_sb; 545 ubifs_assert(!c->ro_media || c->ro_mount);
546 int mounting_ro = sb->s_flags & MS_RDONLY; 546 if (!c->ro_mount ||
547
548 ubifs_assert(!c->ro_media || mounting_ro);
549 if (!mounting_ro ||
550 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { 547 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
551 ubifs_err("on-flash format version is w%d/r%d, but " 548 ubifs_err("on-flash format version is w%d/r%d, but "
552 "software only supports up to version " 549 "software only supports up to version "
@@ -624,7 +621,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
624 c->old_leb_cnt = c->leb_cnt; 621 c->old_leb_cnt = c->leb_cnt;
625 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { 622 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
626 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); 623 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
627 if (c->vfs_sb->s_flags & MS_RDONLY) 624 if (c->ro_mount)
628 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", 625 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
629 c->old_leb_cnt, c->leb_cnt); 626 c->old_leb_cnt, c->leb_cnt);
630 else { 627 else {
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 96c525384191..3e1ee57dbeaa 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -197,7 +197,7 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
197 struct ubifs_ino_node *ino = buf; 197 struct ubifs_ino_node *ino = buf;
198 struct ubifs_scan_node *snod; 198 struct ubifs_scan_node *snod;
199 199
200 snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); 200 snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
201 if (!snod) 201 if (!snod)
202 return -ENOMEM; 202 return -ENOMEM;
203 203
@@ -212,13 +212,15 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
212 case UBIFS_DENT_NODE: 212 case UBIFS_DENT_NODE:
213 case UBIFS_XENT_NODE: 213 case UBIFS_XENT_NODE:
214 case UBIFS_DATA_NODE: 214 case UBIFS_DATA_NODE:
215 case UBIFS_TRUN_NODE:
216 /* 215 /*
217 * The key is in the same place in all keyed 216 * The key is in the same place in all keyed
218 * nodes. 217 * nodes.
219 */ 218 */
220 key_read(c, &ino->key, &snod->key); 219 key_read(c, &ino->key, &snod->key);
221 break; 220 break;
221 default:
222 invalid_key_init(c, &snod->key);
223 break;
222 } 224 }
223 list_add_tail(&snod->list, &sleb->nodes); 225 list_add_tail(&snod->list, &sleb->nodes);
224 sleb->nodes_cnt += 1; 226 sleb->nodes_cnt += 1;
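In scan.c, kzalloc() becomes kmalloc() because every scan node's key is now initialized explicitly: keyed node types still go through key_read(), while all other types, including the UBIFS_TRUN_NODE case removed above, get a poisoned key from invalid_key_init(). Any code that later treats such a node as keyed then trips the UBIFS_INVALID_KEY assertions added to tnc.c below. One plausible shape for the helper, assuming the simple key format in which the node type occupies the top bits of the second key word; the real body lives in fs/ubifs/key.h and both constants here are assumptions:

static inline void invalid_key_init(const struct ubifs_info *c,
				    union ubifs_key *key)
{
	key->u32[0] = 0xDEADBEAF;	/* assumed poison value */
	/* stamp an out-of-range type so key_type() >= UBIFS_INVALID_KEY */
	key->u32[1] = UBIFS_INVALID_KEY << UBIFS_S_KEY_BLOCK_BITS;
}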
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 0b201114a5ad..46961c003236 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -250,7 +250,7 @@ static int kick_a_thread(void)
250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); 250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
251 251
252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || 252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
253 c->ro_media) { 253 c->ro_mount || c->ro_error) {
254 mutex_unlock(&c->umount_mutex); 254 mutex_unlock(&c->umount_mutex);
255 continue; 255 continue;
256 } 256 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index cd5900b85d38..6e11c2975dcf 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
272 return &ui->vfs_inode; 272 return &ui->vfs_inode;
273}; 273};
274 274
275static void ubifs_i_callback(struct rcu_head *head)
276{
277 struct inode *inode = container_of(head, struct inode, i_rcu);
278 struct ubifs_inode *ui = ubifs_inode(inode);
279 INIT_LIST_HEAD(&inode->i_dentry);
280 kmem_cache_free(ubifs_inode_slab, ui);
281}
282
275static void ubifs_destroy_inode(struct inode *inode) 283static void ubifs_destroy_inode(struct inode *inode)
276{ 284{
277 struct ubifs_inode *ui = ubifs_inode(inode); 285 struct ubifs_inode *ui = ubifs_inode(inode);
278 286
279 kfree(ui->data); 287 kfree(ui->data);
280 kmem_cache_free(ubifs_inode_slab, inode); 288 call_rcu(&inode->i_rcu, ubifs_i_callback);
281} 289}
282 290
283/* 291/*
@@ -1137,11 +1145,11 @@ static int check_free_space(struct ubifs_info *c)
1137 */ 1145 */
1138static int mount_ubifs(struct ubifs_info *c) 1146static int mount_ubifs(struct ubifs_info *c)
1139{ 1147{
1140 struct super_block *sb = c->vfs_sb; 1148 int err;
1141 int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
1142 long long x; 1149 long long x;
1143 size_t sz; 1150 size_t sz;
1144 1151
1152 c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
1145 err = init_constants_early(c); 1153 err = init_constants_early(c);
1146 if (err) 1154 if (err)
1147 return err; 1155 return err;
@@ -1154,7 +1162,7 @@ static int mount_ubifs(struct ubifs_info *c)
1154 if (err) 1162 if (err)
1155 goto out_free; 1163 goto out_free;
1156 1164
1157 if (c->empty && (mounted_read_only || c->ro_media)) { 1165 if (c->empty && (c->ro_mount || c->ro_media)) {
1158 /* 1166 /*
1159 * This UBI volume is empty, and read-only, or the file system 1167 * This UBI volume is empty, and read-only, or the file system
1160 * is mounted read-only - we cannot format it. 1168 * is mounted read-only - we cannot format it.
@@ -1165,7 +1173,7 @@ static int mount_ubifs(struct ubifs_info *c)
1165 goto out_free; 1173 goto out_free;
1166 } 1174 }
1167 1175
1168 if (c->ro_media && !mounted_read_only) { 1176 if (c->ro_media && !c->ro_mount) {
1169 ubifs_err("cannot mount read-write - read-only media"); 1177 ubifs_err("cannot mount read-write - read-only media");
1170 err = -EROFS; 1178 err = -EROFS;
1171 goto out_free; 1179 goto out_free;
@@ -1185,7 +1193,7 @@ static int mount_ubifs(struct ubifs_info *c)
1185 if (!c->sbuf) 1193 if (!c->sbuf)
1186 goto out_free; 1194 goto out_free;
1187 1195
1188 if (!mounted_read_only) { 1196 if (!c->ro_mount) {
1189 c->ileb_buf = vmalloc(c->leb_size); 1197 c->ileb_buf = vmalloc(c->leb_size);
1190 if (!c->ileb_buf) 1198 if (!c->ileb_buf)
1191 goto out_free; 1199 goto out_free;
@@ -1228,7 +1236,7 @@ static int mount_ubifs(struct ubifs_info *c)
1228 } 1236 }
1229 1237
1230 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); 1238 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
1231 if (!mounted_read_only) { 1239 if (!c->ro_mount) {
1232 err = alloc_wbufs(c); 1240 err = alloc_wbufs(c);
1233 if (err) 1241 if (err)
1234 goto out_cbuf; 1242 goto out_cbuf;
@@ -1254,12 +1262,12 @@ static int mount_ubifs(struct ubifs_info *c)
1254 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1262 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1255 ubifs_msg("recovery needed"); 1263 ubifs_msg("recovery needed");
1256 c->need_recovery = 1; 1264 c->need_recovery = 1;
1257 if (!mounted_read_only) { 1265 if (!c->ro_mount) {
1258 err = ubifs_recover_inl_heads(c, c->sbuf); 1266 err = ubifs_recover_inl_heads(c, c->sbuf);
1259 if (err) 1267 if (err)
1260 goto out_master; 1268 goto out_master;
1261 } 1269 }
1262 } else if (!mounted_read_only) { 1270 } else if (!c->ro_mount) {
1263 /* 1271 /*
1264 * Set the "dirty" flag so that if we reboot uncleanly we 1272 * Set the "dirty" flag so that if we reboot uncleanly we
1265 * will notice this immediately on the next mount. 1273 * will notice this immediately on the next mount.
@@ -1270,7 +1278,7 @@ static int mount_ubifs(struct ubifs_info *c)
1270 goto out_master; 1278 goto out_master;
1271 } 1279 }
1272 1280
1273 err = ubifs_lpt_init(c, 1, !mounted_read_only); 1281 err = ubifs_lpt_init(c, 1, !c->ro_mount);
1274 if (err) 1282 if (err)
1275 goto out_lpt; 1283 goto out_lpt;
1276 1284
@@ -1285,11 +1293,11 @@ static int mount_ubifs(struct ubifs_info *c)
1285 /* Calculate 'min_idx_lebs' after journal replay */ 1293 /* Calculate 'min_idx_lebs' after journal replay */
1286 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 1294 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1287 1295
1288 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); 1296 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
1289 if (err) 1297 if (err)
1290 goto out_orphans; 1298 goto out_orphans;
1291 1299
1292 if (!mounted_read_only) { 1300 if (!c->ro_mount) {
1293 int lnum; 1301 int lnum;
1294 1302
1295 err = check_free_space(c); 1303 err = check_free_space(c);
@@ -1351,7 +1359,7 @@ static int mount_ubifs(struct ubifs_info *c)
1351 spin_unlock(&ubifs_infos_lock); 1359 spin_unlock(&ubifs_infos_lock);
1352 1360
1353 if (c->need_recovery) { 1361 if (c->need_recovery) {
1354 if (mounted_read_only) 1362 if (c->ro_mount)
1355 ubifs_msg("recovery deferred"); 1363 ubifs_msg("recovery deferred");
1356 else { 1364 else {
1357 c->need_recovery = 0; 1365 c->need_recovery = 0;
@@ -1378,7 +1386,7 @@ static int mount_ubifs(struct ubifs_info *c)
1378 1386
1379 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1387 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1380 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1388 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
1381 if (mounted_read_only) 1389 if (c->ro_mount)
1382 ubifs_msg("mounted read-only"); 1390 ubifs_msg("mounted read-only");
1383 x = (long long)c->main_lebs * c->leb_size; 1391 x = (long long)c->main_lebs * c->leb_size;
1384 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " 1392 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d "
@@ -1640,7 +1648,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1640 } 1648 }
1641 1649
1642 dbg_gen("re-mounted read-write"); 1650 dbg_gen("re-mounted read-write");
1643 c->vfs_sb->s_flags &= ~MS_RDONLY; 1651 c->ro_mount = 0;
1644 c->remounting_rw = 0; 1652 c->remounting_rw = 0;
1645 c->always_chk_crc = 0; 1653 c->always_chk_crc = 0;
1646 err = dbg_check_space_info(c); 1654 err = dbg_check_space_info(c);
@@ -1676,7 +1684,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1676 int i, err; 1684 int i, err;
1677 1685
1678 ubifs_assert(!c->need_recovery); 1686 ubifs_assert(!c->need_recovery);
1679 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); 1687 ubifs_assert(!c->ro_mount);
1680 1688
1681 mutex_lock(&c->umount_mutex); 1689 mutex_lock(&c->umount_mutex);
1682 if (c->bgt) { 1690 if (c->bgt) {
@@ -1686,10 +1694,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1686 1694
1687 dbg_save_space_info(c); 1695 dbg_save_space_info(c);
1688 1696
1689 for (i = 0; i < c->jhead_cnt; i++) { 1697 for (i = 0; i < c->jhead_cnt; i++)
1690 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1698 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1691 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1692 }
1693 1699
1694 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1700 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1695 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1701 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
@@ -1704,6 +1710,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1704 vfree(c->ileb_buf); 1710 vfree(c->ileb_buf);
1705 c->ileb_buf = NULL; 1711 c->ileb_buf = NULL;
1706 ubifs_lpt_free(c, 1); 1712 ubifs_lpt_free(c, 1);
1713 c->ro_mount = 1;
1707 err = dbg_check_space_info(c); 1714 err = dbg_check_space_info(c);
1708 if (err) 1715 if (err)
1709 ubifs_ro_mode(c, err); 1716 ubifs_ro_mode(c, err);
@@ -1735,7 +1742,7 @@ static void ubifs_put_super(struct super_block *sb)
1735 * the mutex is locked. 1742 * the mutex is locked.
1736 */ 1743 */
1737 mutex_lock(&c->umount_mutex); 1744 mutex_lock(&c->umount_mutex);
1738 if (!(c->vfs_sb->s_flags & MS_RDONLY)) { 1745 if (!c->ro_mount) {
1739 /* 1746 /*
1740 * First of all kill the background thread to make sure it does 1747 * First of all kill the background thread to make sure it does
1741 * not interfere with un-mounting and freeing resources. 1748 * not interfere with un-mounting and freeing resources.
@@ -1745,23 +1752,22 @@ static void ubifs_put_super(struct super_block *sb)
1745 c->bgt = NULL; 1752 c->bgt = NULL;
1746 } 1753 }
1747 1754
1748 /* Synchronize write-buffers */
1749 if (c->jheads)
1750 for (i = 0; i < c->jhead_cnt; i++)
1751 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1752
1753 /* 1755 /*
1754 * On fatal errors c->ro_media is set to 1, in which case we do 1756 * On fatal errors c->ro_error is set to 1, in which case we do
1755 * not write the master node. 1757 * not write the master node.
1756 */ 1758 */
1757 if (!c->ro_media) { 1759 if (!c->ro_error) {
1760 int err;
1761
1762 /* Synchronize write-buffers */
1763 for (i = 0; i < c->jhead_cnt; i++)
1764 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1765
1758 /* 1766 /*
1759 * We are being cleanly unmounted which means the 1767 * We are being cleanly unmounted which means the
1760 * orphans were killed - indicate this in the master 1768 * orphans were killed - indicate this in the master
1761 * node. Also save the reserved GC LEB number. 1769 * node. Also save the reserved GC LEB number.
1762 */ 1770 */
1763 int err;
1764
1765 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1771 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1766 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1772 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1767 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); 1773 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
@@ -1774,6 +1780,10 @@ static void ubifs_put_super(struct super_block *sb)
1774 */ 1780 */
1775 ubifs_err("failed to write master node, " 1781 ubifs_err("failed to write master node, "
1776 "error %d", err); 1782 "error %d", err);
1783 } else {
1784 for (i = 0; i < c->jhead_cnt; i++)
1785 /* Make sure write-buffer timers are canceled */
1786 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1777 } 1787 }
1778 } 1788 }
1779 1789
@@ -1797,17 +1807,21 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1797 return err; 1807 return err;
1798 } 1808 }
1799 1809
1800 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1810 if (c->ro_mount && !(*flags & MS_RDONLY)) {
1811 if (c->ro_error) {
1812 ubifs_msg("cannot re-mount R/W due to prior errors");
1813 return -EROFS;
1814 }
1801 if (c->ro_media) { 1815 if (c->ro_media) {
1802 ubifs_msg("cannot re-mount due to prior errors"); 1816 ubifs_msg("cannot re-mount R/W - UBI volume is R/O");
1803 return -EROFS; 1817 return -EROFS;
1804 } 1818 }
1805 err = ubifs_remount_rw(c); 1819 err = ubifs_remount_rw(c);
1806 if (err) 1820 if (err)
1807 return err; 1821 return err;
1808 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1822 } else if (!c->ro_mount && (*flags & MS_RDONLY)) {
1809 if (c->ro_media) { 1823 if (c->ro_error) {
1810 ubifs_msg("cannot re-mount due to prior errors"); 1824 ubifs_msg("cannot re-mount R/O due to prior errors");
1811 return -EROFS; 1825 return -EROFS;
1812 } 1826 }
1813 ubifs_remount_ro(c); 1827 ubifs_remount_ro(c);
@@ -2032,8 +2046,8 @@ static int sb_test(struct super_block *sb, void *data)
2032 return c->vi.cdev == *dev; 2046 return c->vi.cdev == *dev;
2033} 2047}
2034 2048
2035static int ubifs_get_sb(struct file_system_type *fs_type, int flags, 2049static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
2036 const char *name, void *data, struct vfsmount *mnt) 2050 const char *name, void *data)
2037{ 2051{
2038 struct ubi_volume_desc *ubi; 2052 struct ubi_volume_desc *ubi;
2039 struct ubi_volume_info vi; 2053 struct ubi_volume_info vi;
@@ -2049,9 +2063,9 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2049 */ 2063 */
2050 ubi = open_ubi(name, UBI_READONLY); 2064 ubi = open_ubi(name, UBI_READONLY);
2051 if (IS_ERR(ubi)) { 2065 if (IS_ERR(ubi)) {
2052 ubifs_err("cannot open \"%s\", error %d", 2066 dbg_err("cannot open \"%s\", error %d",
2053 name, (int)PTR_ERR(ubi)); 2067 name, (int)PTR_ERR(ubi));
2054 return PTR_ERR(ubi); 2068 return ERR_CAST(ubi);
2055 } 2069 }
2056 ubi_get_volume_info(ubi, &vi); 2070 ubi_get_volume_info(ubi, &vi);
2057 2071
@@ -2064,9 +2078,11 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2064 } 2078 }
2065 2079
2066 if (sb->s_root) { 2080 if (sb->s_root) {
2081 struct ubifs_info *c1 = sb->s_fs_info;
2082
2067 /* A new mount point for already mounted UBIFS */ 2083 /* A new mount point for already mounted UBIFS */
2068 dbg_gen("this ubi volume is already mounted"); 2084 dbg_gen("this ubi volume is already mounted");
2069 if ((flags ^ sb->s_flags) & MS_RDONLY) { 2085 if (!!(flags & MS_RDONLY) != c1->ro_mount) {
2070 err = -EBUSY; 2086 err = -EBUSY;
2071 goto out_deact; 2087 goto out_deact;
2072 } 2088 }
@@ -2087,20 +2103,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2087 /* 'fill_super()' opens ubi again so we must close it here */ 2103 /* 'fill_super()' opens ubi again so we must close it here */
2088 ubi_close_volume(ubi); 2104 ubi_close_volume(ubi);
2089 2105
2090 simple_set_mnt(mnt, sb); 2106 return dget(sb->s_root);
2091 return 0;
2092 2107
2093out_deact: 2108out_deact:
2094 deactivate_locked_super(sb); 2109 deactivate_locked_super(sb);
2095out_close: 2110out_close:
2096 ubi_close_volume(ubi); 2111 ubi_close_volume(ubi);
2097 return err; 2112 return ERR_PTR(err);
2098} 2113}
2099 2114
2100static struct file_system_type ubifs_fs_type = { 2115static struct file_system_type ubifs_fs_type = {
2101 .name = "ubifs", 2116 .name = "ubifs",
2102 .owner = THIS_MODULE, 2117 .owner = THIS_MODULE,
2103 .get_sb = ubifs_get_sb, 2118 .mount = ubifs_mount,
2104 .kill_sb = kill_anon_super, 2119 .kill_sb = kill_anon_super,
2105}; 2120};
2106 2121
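Two structural conversions land in super.c besides the R/O flag work. First, inode freeing becomes RCU-deferred: lockless path walk (RCU-walk) may still dereference an inode after its last reference is dropped, so ubifs_destroy_inode() now hands the final kmem_cache_free() to ubifs_i_callback() via call_rcu(), guaranteeing a full grace period before the slab object can be reused. Second, the .get_sb/simple_set_mnt() mount interface is replaced by the .mount method returning a dentry. The RCU pattern in its canonical generic form, with hypothetical "foofs" names:

static void foofs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
}

static void foofs_destroy_inode(struct inode *inode)
{
	/* RCU-walk may still be reading this inode; free it only after
	 * every CPU has passed through a quiescent state */
	call_rcu(&inode->i_rcu, foofs_i_callback);
}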
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2194915220e5..ad9cf0133622 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1177,6 +1177,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1177 unsigned long time = get_seconds(); 1177 unsigned long time = get_seconds();
1178 1178
1179 dbg_tnc("search key %s", DBGKEY(key)); 1179 dbg_tnc("search key %s", DBGKEY(key));
1180 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
1180 1181
1181 znode = c->zroot.znode; 1182 znode = c->zroot.znode;
1182 if (unlikely(!znode)) { 1183 if (unlikely(!znode)) {
@@ -2966,7 +2967,7 @@ static struct ubifs_znode *right_znode(struct ubifs_info *c,
2966 * 2967 *
2967 * This function searches an indexing node by its first key @key and its 2968 * This function searches an indexing node by its first key @key and its
2968 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing 2969 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
2969 * nodes it traverses to TNC. This function is called fro indexing nodes which 2970 * nodes it traverses to TNC. This function is called for indexing nodes which
2970 * were found on the media by scanning, for example when garbage-collecting or 2971 * were found on the media by scanning, for example when garbage-collecting or
2971 * when doing in-the-gaps commit. This means that the indexing node which is 2972 * when doing in-the-gaps commit. This means that the indexing node which is
2972 * looked for does not have to have exactly the same leftmost key @key, because 2973 * looked for does not have to have exactly the same leftmost key @key, because
@@ -2988,6 +2989,8 @@ static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
2988 struct ubifs_znode *znode, *zn; 2989 struct ubifs_znode *znode, *zn;
2989 int n, nn; 2990 int n, nn;
2990 2991
2992 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
2993
2991 /* 2994 /*
2992 * The arguments have probably been read off flash, so don't assume 2995 * The arguments have probably been read off flash, so don't assume
2993 * they are valid. 2996 * they are valid.
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0c9876b396dd..381d6b207a52 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -119,8 +119,12 @@
119 * in TNC. However, when replaying, it is handy to introduce fake "truncation" 119 * in TNC. However, when replaying, it is handy to introduce fake "truncation"
120 * keys for truncation nodes because the code becomes simpler. So we define 120 * keys for truncation nodes because the code becomes simpler. So we define
121 * %UBIFS_TRUN_KEY type. 121 * %UBIFS_TRUN_KEY type.
122 *
 123 * But otherwise, outside the journal replay scope, the truncation keys are
124 * invalid.
122 */ 125 */
123#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT 126#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
127#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT
124 128
125/* 129/*
126 * How much a directory entry/extended attribute entry adds to the parent/host 130 * How much a directory entry/extended attribute entry adds to the parent/host
@@ -1028,6 +1032,8 @@ struct ubifs_debug_info;
1028 * @max_leb_cnt: maximum count of logical eraseblocks 1032 * @max_leb_cnt: maximum count of logical eraseblocks
1029 * @old_leb_cnt: count of logical eraseblocks before re-size 1033 * @old_leb_cnt: count of logical eraseblocks before re-size
1030 * @ro_media: the underlying UBI volume is read-only 1034 * @ro_media: the underlying UBI volume is read-only
1035 * @ro_mount: the file-system was mounted as read-only
1036 * @ro_error: UBIFS switched to R/O mode because an error happened
1031 * 1037 *
1032 * @dirty_pg_cnt: number of dirty pages (not used) 1038 * @dirty_pg_cnt: number of dirty pages (not used)
1033 * @dirty_zn_cnt: number of dirty znodes 1039 * @dirty_zn_cnt: number of dirty znodes
@@ -1168,11 +1174,14 @@ struct ubifs_debug_info;
1168 * @replay_sqnum: sequence number of node currently being replayed 1174 * @replay_sqnum: sequence number of node currently being replayed
1169 * @need_recovery: file-system needs recovery 1175 * @need_recovery: file-system needs recovery
1170 * @replaying: set to %1 during journal replay 1176 * @replaying: set to %1 during journal replay
1171 * @unclean_leb_list: LEBs to recover when mounting ro to rw 1177 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
1172 * @rcvrd_mst_node: recovered master node to write when mounting ro to rw 1178 * mode
1179 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
1180 * FS to R/W mode
1173 * @size_tree: inode size information for recovery 1181 * @size_tree: inode size information for recovery
1174 * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) 1182 * @remounting_rw: set while re-mounting from R/O mode to R/W mode
1175 * @always_chk_crc: always check CRCs (while mounting and remounting rw) 1183 * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
1184 * mode)
1176 * @mount_opts: UBIFS-specific mount options 1185 * @mount_opts: UBIFS-specific mount options
1177 * 1186 *
1178 * @dbg: debugging-related information 1187 * @dbg: debugging-related information
@@ -1268,7 +1277,9 @@ struct ubifs_info {
1268 int leb_cnt; 1277 int leb_cnt;
1269 int max_leb_cnt; 1278 int max_leb_cnt;
1270 int old_leb_cnt; 1279 int old_leb_cnt;
1271 int ro_media; 1280 unsigned int ro_media:1;
1281 unsigned int ro_mount:1;
1282 unsigned int ro_error:1;
1272 1283
1273 atomic_long_t dirty_pg_cnt; 1284 atomic_long_t dirty_pg_cnt;
1274 atomic_long_t dirty_zn_cnt; 1285 atomic_long_t dirty_zn_cnt;
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b608efaa4cee..306ee39ef2c3 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -157,10 +157,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
157 udf_debug("bit %ld already set\n", bit + i); 157 udf_debug("bit %ld already set\n", bit + i);
158 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
159 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
160 } else {
161 udf_add_free_space(sb, sbi->s_partition, 1);
162 } 160 }
163 } 161 }
162 udf_add_free_space(sb, sbi->s_partition, count);
164 mark_buffer_dirty(bh); 163 mark_buffer_dirty(bh);
165 if (overflow) { 164 if (overflow) {
166 block += count; 165 block += count;
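The balloc.c hunk batches the free-space accounting: instead of calling udf_add_free_space(sb, partition, 1) once per cleared bit inside the loop, the whole range's count is credited once after the loop. This means fewer LVID counter updates, and the accounting no longer silently skips blocks whose bits were already set (a corruption case that is only logged). For reference, udf_add_free_space() amounts to roughly the following, a sketch assuming the LVID free-space table used elsewhere in fs/udf:

static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
{
	struct udf_sb_info *sbi = UDF_SB(sb);
	struct logicalVolIntegrityDesc *lvid;

	if (!sbi->s_lvid_bh)
		return;

	lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
	le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
	udf_updated_lvid(sb);
}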
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 51552bf50225..eb8bfe2b89a5 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,7 +30,6 @@
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35 34
36#include "udf_i.h" 35#include "udf_i.h"
@@ -190,18 +189,14 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
190 struct inode *dir = filp->f_path.dentry->d_inode; 189 struct inode *dir = filp->f_path.dentry->d_inode;
191 int result; 190 int result;
192 191
193 lock_kernel();
194
195 if (filp->f_pos == 0) { 192 if (filp->f_pos == 0) {
196 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { 193 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
197 unlock_kernel();
198 return 0; 194 return 0;
199 } 195 }
200 filp->f_pos++; 196 filp->f_pos++;
201 } 197 }
202 198
203 result = do_udf_readdir(dir, filp, filldir, dirent); 199 result = do_udf_readdir(dir, filp, filldir, dirent);
204 unlock_kernel();
205 return result; 200 return result;
206} 201}
207 202
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 66b9e7e7e4c5..89c78486cbbe 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -32,7 +32,6 @@
32#include <linux/string.h> /* memset */ 32#include <linux/string.h> /* memset */
33#include <linux/capability.h> 33#include <linux/capability.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
38#include <linux/aio.h> 37#include <linux/aio.h>
@@ -114,6 +113,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
114 size_t count = iocb->ki_left; 113 size_t count = iocb->ki_left;
115 struct udf_inode_info *iinfo = UDF_I(inode); 114 struct udf_inode_info *iinfo = UDF_I(inode);
116 115
116 down_write(&iinfo->i_data_sem);
117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 117 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
118 if (file->f_flags & O_APPEND) 118 if (file->f_flags & O_APPEND)
119 pos = inode->i_size; 119 pos = inode->i_size;
@@ -126,6 +126,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
126 udf_expand_file_adinicb(inode, pos + count, &err); 126 udf_expand_file_adinicb(inode, pos + count, &err);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 128 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem);
129 return err; 130 return err;
130 } 131 }
131 } else { 132 } else {
@@ -135,6 +136,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
135 iinfo->i_lenAlloc = inode->i_size; 136 iinfo->i_lenAlloc = inode->i_size;
136 } 137 }
137 } 138 }
139 up_write(&iinfo->i_data_sem);
138 140
139 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 141 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
140 if (retval > 0) 142 if (retval > 0)
@@ -149,8 +151,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
149 long old_block, new_block; 151 long old_block, new_block;
150 int result = -EINVAL; 152 int result = -EINVAL;
151 153
152 lock_kernel();
153
154 if (file_permission(filp, MAY_READ) != 0) { 154 if (file_permission(filp, MAY_READ) != 0) {
155 udf_debug("no permission to access inode %lu\n", inode->i_ino); 155 udf_debug("no permission to access inode %lu\n", inode->i_ino);
156 result = -EPERM; 156 result = -EPERM;
@@ -196,7 +196,6 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
196 } 196 }
197 197
198out: 198out:
199 unlock_kernel();
200 return result; 199 return result;
201} 200}
202 201
@@ -204,10 +203,10 @@ static int udf_release_file(struct inode *inode, struct file *filp)
204{ 203{
205 if (filp->f_mode & FMODE_WRITE) { 204 if (filp->f_mode & FMODE_WRITE) {
206 mutex_lock(&inode->i_mutex); 205 mutex_lock(&inode->i_mutex);
207 lock_kernel(); 206 down_write(&UDF_I(inode)->i_data_sem);
208 udf_discard_prealloc(inode); 207 udf_discard_prealloc(inode);
209 udf_truncate_tail_extent(inode); 208 udf_truncate_tail_extent(inode);
210 unlock_kernel(); 209 up_write(&UDF_I(inode)->i_data_sem);
211 mutex_unlock(&inode->i_mutex); 210 mutex_unlock(&inode->i_mutex);
212 } 211 }
213 return 0; 212 return 0;
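The udf hunks from here on replace the big kernel lock with per-inode locking: struct udf_inode_info gains an rw_semaphore, i_data_sem, guarding the in-ICB data, the extent information and i_alloc_type. Paths that may rewrite them, such as udf_file_aio_write() expanding in-ICB data above, take it exclusive; pure lookups, like udf_block_map() in the inode.c hunks below, take it shared. A sketch of a writer under the new rule; udf_shrink_example() and its field update are illustrative only:

static void udf_shrink_example(struct inode *inode, loff_t new_size)
{
	struct udf_inode_info *iinfo = UDF_I(inode);

	/* exclusive: we may rewrite in-ICB data or the extent list */
	down_write(&iinfo->i_data_sem);
	iinfo->i_lenAlloc = new_size;
	up_write(&iinfo->i_data_sem);
	mark_inode_dirty(inode);
}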
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 75d9304d0dc3..6fb7e0adcda0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -92,28 +92,19 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 mutex_lock(&sbi->s_alloc_mutex);
96 if (sbi->s_lvid_bh) { 95 if (sbi->s_lvid_bh) {
97 struct logicalVolIntegrityDesc *lvid = 96 struct logicalVolIntegrityDescImpUse *lvidiu;
98 (struct logicalVolIntegrityDesc *) 97
99 sbi->s_lvid_bh->b_data; 98 iinfo->i_unique = lvid_get_unique_id(sb);
100 struct logicalVolIntegrityDescImpUse *lvidiu = 99 mutex_lock(&sbi->s_alloc_mutex);
101 udf_sb_lvidiu(sbi); 100 lvidiu = udf_sb_lvidiu(sbi);
102 struct logicalVolHeaderDesc *lvhd;
103 uint64_t uniqueID;
104 lvhd = (struct logicalVolHeaderDesc *)
105 (lvid->logicalVolContentsUse);
106 if (S_ISDIR(mode)) 101 if (S_ISDIR(mode))
107 le32_add_cpu(&lvidiu->numDirs, 1); 102 le32_add_cpu(&lvidiu->numDirs, 1);
108 else 103 else
109 le32_add_cpu(&lvidiu->numFiles, 1); 104 le32_add_cpu(&lvidiu->numFiles, 1);
110 iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
111 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
112 uniqueID += 16;
113 lvhd->uniqueID = cpu_to_le64(uniqueID);
114 udf_updated_lvid(sb); 105 udf_updated_lvid(sb);
106 mutex_unlock(&sbi->s_alloc_mutex);
115 } 107 }
116 mutex_unlock(&sbi->s_alloc_mutex);
117 108
118 inode_init_owner(inode, dir, mode); 109 inode_init_owner(inode, dir, mode);
119 110
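udf_new_inode() loses its open-coded uniqueID sequence (load lvhd->uniqueID, increment, skip values whose low 32 bits are zero, store back) in favor of a shared lvid_get_unique_id() helper; udf_symlink() and udf_link() below drop identical copies. Reconstructed from the deleted logic, the helper must look roughly like this; the actual body lives in fs/udf/super.c and is an assumption here:

u64 lvid_get_unique_id(struct super_block *sb)
{
	struct udf_sb_info *sbi = UDF_SB(sb);
	struct buffer_head *bh = sbi->s_lvid_bh;
	struct logicalVolIntegrityDesc *lvid;
	struct logicalVolHeaderDesc *lvhd;
	u64 uniqueID, ret;

	if (!bh)
		return 0;

	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
	lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;

	mutex_lock(&sbi->s_alloc_mutex);
	ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
	if (!(++uniqueID & 0x00000000FFFFFFFFUL))
		uniqueID += 16;	/* IDs with zero low 32 bits are reserved */
	lvhd->uniqueID = cpu_to_le64(uniqueID);
	mutex_unlock(&sbi->s_alloc_mutex);
	mark_buffer_dirty(bh);

	return ret;
}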
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fc48f37aa2dd..c6a2e782b97b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -31,7 +31,6 @@
31 31
32#include "udfdecl.h" 32#include "udfdecl.h"
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/smp_lock.h>
35#include <linux/module.h> 34#include <linux/module.h>
36#include <linux/pagemap.h> 35#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
@@ -51,6 +50,7 @@ MODULE_LICENSE("GPL");
51static mode_t udf_convert_permissions(struct fileEntry *); 50static mode_t udf_convert_permissions(struct fileEntry *);
52static int udf_update_inode(struct inode *, int); 51static int udf_update_inode(struct inode *, int);
53static void udf_fill_inode(struct inode *, struct buffer_head *); 52static void udf_fill_inode(struct inode *, struct buffer_head *);
53static int udf_sync_inode(struct inode *inode);
54static int udf_alloc_i_data(struct inode *inode, size_t size); 54static int udf_alloc_i_data(struct inode *inode, size_t size);
55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 55static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
56 sector_t *, int *); 56 sector_t *, int *);
@@ -79,9 +79,7 @@ void udf_evict_inode(struct inode *inode)
79 want_delete = 1; 79 want_delete = 1;
80 inode->i_size = 0; 80 inode->i_size = 0;
81 udf_truncate(inode); 81 udf_truncate(inode);
82 lock_kernel();
83 udf_update_inode(inode, IS_SYNC(inode)); 82 udf_update_inode(inode, IS_SYNC(inode));
84 unlock_kernel();
85 } 83 }
86 invalidate_inode_buffers(inode); 84 invalidate_inode_buffers(inode);
87 end_writeback(inode); 85 end_writeback(inode);
@@ -97,9 +95,7 @@ void udf_evict_inode(struct inode *inode)
97 kfree(iinfo->i_ext.i_data); 95 kfree(iinfo->i_ext.i_data);
98 iinfo->i_ext.i_data = NULL; 96 iinfo->i_ext.i_data = NULL;
99 if (want_delete) { 97 if (want_delete) {
100 lock_kernel();
101 udf_free_inode(inode); 98 udf_free_inode(inode);
102 unlock_kernel();
103 } 99 }
104} 100}
105 101
@@ -302,10 +298,9 @@ static int udf_get_block(struct inode *inode, sector_t block,
302 err = -EIO; 298 err = -EIO;
303 new = 0; 299 new = 0;
304 bh = NULL; 300 bh = NULL;
305
306 lock_kernel();
307
308 iinfo = UDF_I(inode); 301 iinfo = UDF_I(inode);
302
303 down_write(&iinfo->i_data_sem);
309 if (block == iinfo->i_next_alloc_block + 1) { 304 if (block == iinfo->i_next_alloc_block + 1) {
310 iinfo->i_next_alloc_block++; 305 iinfo->i_next_alloc_block++;
311 iinfo->i_next_alloc_goal++; 306 iinfo->i_next_alloc_goal++;
@@ -324,7 +319,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
324 map_bh(bh_result, inode->i_sb, phys); 319 map_bh(bh_result, inode->i_sb, phys);
325 320
326abort: 321abort:
327 unlock_kernel(); 322 up_write(&iinfo->i_data_sem);
328 return err; 323 return err;
329} 324}
330 325
@@ -1022,16 +1017,16 @@ void udf_truncate(struct inode *inode)
1022 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1023 return; 1018 return;
1024 1019
1025 lock_kernel();
1026 iinfo = UDF_I(inode); 1020 iinfo = UDF_I(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1022 down_write(&iinfo->i_data_sem);
1028 if (inode->i_sb->s_blocksize < 1023 if (inode->i_sb->s_blocksize <
1029 (udf_file_entry_alloc_offset(inode) + 1024 (udf_file_entry_alloc_offset(inode) +
1030 inode->i_size)) { 1025 inode->i_size)) {
1031 udf_expand_file_adinicb(inode, inode->i_size, &err); 1026 udf_expand_file_adinicb(inode, inode->i_size, &err);
1032 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1033 inode->i_size = iinfo->i_lenAlloc; 1028 inode->i_size = iinfo->i_lenAlloc;
1034 unlock_kernel(); 1029 up_write(&iinfo->i_data_sem);
1035 return; 1030 return;
1036 } else 1031 } else
1037 udf_truncate_extents(inode); 1032 udf_truncate_extents(inode);
@@ -1042,10 +1037,13 @@ void udf_truncate(struct inode *inode)
1042 offset - udf_file_entry_alloc_offset(inode)); 1037 offset - udf_file_entry_alloc_offset(inode));
1043 iinfo->i_lenAlloc = inode->i_size; 1038 iinfo->i_lenAlloc = inode->i_size;
1044 } 1039 }
1040 up_write(&iinfo->i_data_sem);
1045 } else { 1041 } else {
1046 block_truncate_page(inode->i_mapping, inode->i_size, 1042 block_truncate_page(inode->i_mapping, inode->i_size,
1047 udf_get_block); 1043 udf_get_block);
1044 down_write(&iinfo->i_data_sem);
1048 udf_truncate_extents(inode); 1045 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem);
1049 } 1047 }
1050 1048
1051 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
@@ -1053,7 +1051,6 @@ void udf_truncate(struct inode *inode)
1053 udf_sync_inode(inode); 1051 udf_sync_inode(inode);
1054 else 1052 else
1055 mark_inode_dirty(inode); 1053 mark_inode_dirty(inode);
1056 unlock_kernel();
1057} 1054}
1058 1055
1059static void __udf_read_inode(struct inode *inode) 1056static void __udf_read_inode(struct inode *inode)
@@ -1202,6 +1199,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1202 return; 1199 return;
1203 } 1200 }
1204 1201
1202 read_lock(&sbi->s_cred_lock);
1205 inode->i_uid = le32_to_cpu(fe->uid); 1203 inode->i_uid = le32_to_cpu(fe->uid);
1206 if (inode->i_uid == -1 || 1204 if (inode->i_uid == -1 ||
1207 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || 1205 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) ||
@@ -1214,13 +1212,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1214 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) 1212 UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
1215 inode->i_gid = UDF_SB(inode->i_sb)->s_gid; 1213 inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
1216 1214
1217 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1218 if (!inode->i_nlink)
1219 inode->i_nlink = 1;
1220
1221 inode->i_size = le64_to_cpu(fe->informationLength);
1222 iinfo->i_lenExtents = inode->i_size;
1223
1224 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && 1215 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
1225 sbi->s_fmode != UDF_INVALID_MODE) 1216 sbi->s_fmode != UDF_INVALID_MODE)
1226 inode->i_mode = sbi->s_fmode; 1217 inode->i_mode = sbi->s_fmode;
@@ -1230,6 +1221,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1230 else 1221 else
1231 inode->i_mode = udf_convert_permissions(fe); 1222 inode->i_mode = udf_convert_permissions(fe);
1232 inode->i_mode &= ~sbi->s_umask; 1223 inode->i_mode &= ~sbi->s_umask;
1224 read_unlock(&sbi->s_cred_lock);
1225
1226 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1227 if (!inode->i_nlink)
1228 inode->i_nlink = 1;
1229
1230 inode->i_size = le64_to_cpu(fe->informationLength);
1231 iinfo->i_lenExtents = inode->i_size;
1233 1232
1234 if (iinfo->i_efe == 0) { 1233 if (iinfo->i_efe == 0) {
1235 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << 1234 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1373,16 +1372,10 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 1372
1374int udf_write_inode(struct inode *inode, struct writeback_control *wbc) 1373int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1375{ 1374{
1376 int ret; 1375 return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1377
1378 lock_kernel();
1379 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1380 unlock_kernel();
1381
1382 return ret;
1383} 1376}
1384 1377
1385int udf_sync_inode(struct inode *inode) 1378static int udf_sync_inode(struct inode *inode)
1386{ 1379{
1387 return udf_update_inode(inode, 1); 1380 return udf_update_inode(inode, 1);
1388} 1381}
@@ -2048,7 +2041,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2048 struct extent_position epos = {}; 2041 struct extent_position epos = {};
2049 int ret; 2042 int ret;
2050 2043
2051 lock_kernel(); 2044 down_read(&UDF_I(inode)->i_data_sem);
2052 2045
2053 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == 2046 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
2054 (EXT_RECORDED_ALLOCATED >> 30)) 2047 (EXT_RECORDED_ALLOCATED >> 30))
@@ -2056,7 +2049,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2056 else 2049 else
2057 ret = 0; 2050 ret = 0;
2058 2051
2059 unlock_kernel(); 2052 up_read(&UDF_I(inode)->i_data_sem);
2060 brelse(epos.bh); 2053 brelse(epos.bh);
2061 2054
2062 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) 2055 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193c..2be0f9eb86d2 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
@@ -228,10 +227,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
228 } 227 }
229 228
230 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && 229 if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) &&
231 isdotdot) { 230 isdotdot)
232 brelse(epos.bh); 231 goto out_ok;
233 return fi;
234 }
235 232
236 if (!lfi) 233 if (!lfi)
237 continue; 234 continue;
@@ -263,7 +260,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
263 if (dentry->d_name.len > UDF_NAME_LEN - 2) 260 if (dentry->d_name.len > UDF_NAME_LEN - 2)
264 return ERR_PTR(-ENAMETOOLONG); 261 return ERR_PTR(-ENAMETOOLONG);
265 262
266 lock_kernel();
267#ifdef UDF_RECOVERY 263#ifdef UDF_RECOVERY
268 /* temporary shorthand for specifying files by inode number */ 264 /* temporary shorthand for specifying files by inode number */
269 if (!strncmp(dentry->d_name.name, ".B=", 3)) { 265 if (!strncmp(dentry->d_name.name, ".B=", 3)) {
@@ -275,7 +271,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
275 }; 271 };
276 inode = udf_iget(dir->i_sb, lb); 272 inode = udf_iget(dir->i_sb, lb);
277 if (!inode) { 273 if (!inode) {
278 unlock_kernel();
279 return ERR_PTR(-EACCES); 274 return ERR_PTR(-EACCES);
280 } 275 }
281 } else 276 } else
@@ -291,11 +286,9 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
291 loc = lelb_to_cpu(cfi.icb.extLocation); 286 loc = lelb_to_cpu(cfi.icb.extLocation);
292 inode = udf_iget(dir->i_sb, &loc); 287 inode = udf_iget(dir->i_sb, &loc);
293 if (!inode) { 288 if (!inode) {
294 unlock_kernel();
295 return ERR_PTR(-EACCES); 289 return ERR_PTR(-EACCES);
296 } 290 }
297 } 291 }
298 unlock_kernel();
299 292
300 return d_splice_alias(inode, dentry); 293 return d_splice_alias(inode, dentry);
301} 294}
@@ -476,15 +469,19 @@ add:
476 f_pos >> dir->i_sb->s_blocksize_bits, 1, err); 469 f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
477 if (!fibh->ebh) 470 if (!fibh->ebh)
478 goto out_err; 471 goto out_err;
472 /* Extents could have been merged, invalidate our position */
473 brelse(epos.bh);
474 epos.bh = NULL;
475 epos.block = dinfo->i_location;
476 epos.offset = udf_file_entry_alloc_offset(dir);
479 477
480 if (!fibh->soffset) { 478 if (!fibh->soffset) {
481 if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == 479 /* Find the freshly allocated block */
482 (EXT_RECORDED_ALLOCATED >> 30)) { 480 while (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
483 block = eloc.logicalBlockNum + ((elen - 1) >> 481 (EXT_RECORDED_ALLOCATED >> 30))
482 ;
483 block = eloc.logicalBlockNum + ((elen - 1) >>
484 dir->i_sb->s_blocksize_bits); 484 dir->i_sb->s_blocksize_bits);
485 } else
486 block++;
487
488 brelse(fibh->sbh); 485 brelse(fibh->sbh);
489 fibh->sbh = fibh->ebh; 486 fibh->sbh = fibh->ebh;
490 fi = (struct fileIdentDesc *)(fibh->sbh->b_data); 487 fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
@@ -562,10 +559,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
562 int err; 559 int err;
563 struct udf_inode_info *iinfo; 560 struct udf_inode_info *iinfo;
564 561
565 lock_kernel();
566 inode = udf_new_inode(dir, mode, &err); 562 inode = udf_new_inode(dir, mode, &err);
567 if (!inode) { 563 if (!inode) {
568 unlock_kernel();
569 return err; 564 return err;
570 } 565 }
571 566
@@ -583,7 +578,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
583 inode->i_nlink--; 578 inode->i_nlink--;
584 mark_inode_dirty(inode); 579 mark_inode_dirty(inode);
585 iput(inode); 580 iput(inode);
586 unlock_kernel();
587 return err; 581 return err;
588 } 582 }
589 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 583 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -596,7 +590,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
596 if (fibh.sbh != fibh.ebh) 590 if (fibh.sbh != fibh.ebh)
597 brelse(fibh.ebh); 591 brelse(fibh.ebh);
598 brelse(fibh.sbh); 592 brelse(fibh.sbh);
599 unlock_kernel();
600 d_instantiate(dentry, inode); 593 d_instantiate(dentry, inode);
601 594
602 return 0; 595 return 0;
@@ -614,7 +607,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
614 if (!old_valid_dev(rdev)) 607 if (!old_valid_dev(rdev))
615 return -EINVAL; 608 return -EINVAL;
616 609
617 lock_kernel();
618 err = -EIO; 610 err = -EIO;
619 inode = udf_new_inode(dir, mode, &err); 611 inode = udf_new_inode(dir, mode, &err);
620 if (!inode) 612 if (!inode)
@@ -627,7 +619,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 inode->i_nlink--; 619 inode->i_nlink--;
628 mark_inode_dirty(inode); 620 mark_inode_dirty(inode);
629 iput(inode); 621 iput(inode);
630 unlock_kernel();
631 return err; 622 return err;
632 } 623 }
633 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 624 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -646,7 +637,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
646 err = 0; 637 err = 0;
647 638
648out: 639out:
649 unlock_kernel();
650 return err; 640 return err;
651} 641}
652 642
@@ -659,7 +649,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
659 struct udf_inode_info *dinfo = UDF_I(dir); 649 struct udf_inode_info *dinfo = UDF_I(dir);
660 struct udf_inode_info *iinfo; 650 struct udf_inode_info *iinfo;
661 651
662 lock_kernel();
663 err = -EMLINK; 652 err = -EMLINK;
664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
665 goto out; 654 goto out;
@@ -712,7 +701,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
712 err = 0; 701 err = 0;
713 702
714out: 703out:
715 unlock_kernel();
716 return err; 704 return err;
717} 705}
718 706
@@ -794,7 +782,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
794 struct kernel_lb_addr tloc; 782 struct kernel_lb_addr tloc;
795 783
796 retval = -ENOENT; 784 retval = -ENOENT;
797 lock_kernel();
798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 785 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
799 if (!fi) 786 if (!fi)
800 goto out; 787 goto out;
@@ -826,7 +813,6 @@ end_rmdir:
826 brelse(fibh.sbh); 813 brelse(fibh.sbh);
827 814
828out: 815out:
829 unlock_kernel();
830 return retval; 816 return retval;
831} 817}
832 818
@@ -840,7 +826,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
840 struct kernel_lb_addr tloc; 826 struct kernel_lb_addr tloc;
841 827
842 retval = -ENOENT; 828 retval = -ENOENT;
843 lock_kernel();
844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 829 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
845 if (!fi) 830 if (!fi)
846 goto out; 831 goto out;
@@ -870,7 +855,6 @@ end_unlink:
870 brelse(fibh.sbh); 855 brelse(fibh.sbh);
871 856
872out: 857out:
873 unlock_kernel();
874 return retval; 858 return retval;
875} 859}
876 860
@@ -890,21 +874,21 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
890 int block; 874 int block;
891 unsigned char *name = NULL; 875 unsigned char *name = NULL;
892 int namelen; 876 int namelen;
893 struct buffer_head *bh;
894 struct udf_inode_info *iinfo; 877 struct udf_inode_info *iinfo;
878 struct super_block *sb = dir->i_sb;
895 879
896 lock_kernel();
897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 880 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
898 if (!inode) 881 if (!inode)
899 goto out; 882 goto out;
900 883
884 iinfo = UDF_I(inode);
885 down_write(&iinfo->i_data_sem);
901 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 886 name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
902 if (!name) { 887 if (!name) {
903 err = -ENOMEM; 888 err = -ENOMEM;
904 goto out_no_entry; 889 goto out_no_entry;
905 } 890 }
906 891
907 iinfo = UDF_I(inode);
908 inode->i_data.a_ops = &udf_symlink_aops; 892 inode->i_data.a_ops = &udf_symlink_aops;
909 inode->i_op = &udf_symlink_inode_operations; 893 inode->i_op = &udf_symlink_inode_operations;
910 894
@@ -912,7 +896,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
912 struct kernel_lb_addr eloc; 896 struct kernel_lb_addr eloc;
913 uint32_t bsize; 897 uint32_t bsize;
914 898
915 block = udf_new_block(inode->i_sb, inode, 899 block = udf_new_block(sb, inode,
916 iinfo->i_location.partitionReferenceNum, 900 iinfo->i_location.partitionReferenceNum,
917 iinfo->i_location.logicalBlockNum, &err); 901 iinfo->i_location.logicalBlockNum, &err);
918 if (!block) 902 if (!block)
@@ -923,17 +907,17 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 eloc.logicalBlockNum = block; 907 eloc.logicalBlockNum = block;
924 eloc.partitionReferenceNum = 908 eloc.partitionReferenceNum =
925 iinfo->i_location.partitionReferenceNum; 909 iinfo->i_location.partitionReferenceNum;
926 bsize = inode->i_sb->s_blocksize; 910 bsize = sb->s_blocksize;
927 iinfo->i_lenExtents = bsize; 911 iinfo->i_lenExtents = bsize;
928 udf_add_aext(inode, &epos, &eloc, bsize, 0); 912 udf_add_aext(inode, &epos, &eloc, bsize, 0);
929 brelse(epos.bh); 913 brelse(epos.bh);
930 914
931 block = udf_get_pblock(inode->i_sb, block, 915 block = udf_get_pblock(sb, block,
932 iinfo->i_location.partitionReferenceNum, 916 iinfo->i_location.partitionReferenceNum,
933 0); 917 0);
934 epos.bh = udf_tgetblk(inode->i_sb, block); 918 epos.bh = udf_tgetblk(sb, block);
935 lock_buffer(epos.bh); 919 lock_buffer(epos.bh);
936 memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); 920 memset(epos.bh->b_data, 0x00, bsize);
937 set_buffer_uptodate(epos.bh); 921 set_buffer_uptodate(epos.bh);
938 unlock_buffer(epos.bh); 922 unlock_buffer(epos.bh);
939 mark_buffer_dirty_inode(epos.bh, inode); 923 mark_buffer_dirty_inode(epos.bh, inode);
@@ -941,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
941 } else 925 } else
942 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 926 ea = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
943 927
944 eoffset = inode->i_sb->s_blocksize - udf_ext0_offset(inode); 928 eoffset = sb->s_blocksize - udf_ext0_offset(inode);
945 pc = (struct pathComponent *)ea; 929 pc = (struct pathComponent *)ea;
946 930
947 if (*symname == '/') { 931 if (*symname == '/') {
@@ -981,7 +965,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
981 } 965 }
982 966
983 if (pc->componentType == 5) { 967 if (pc->componentType == 5) {
984 namelen = udf_put_filename(inode->i_sb, compstart, name, 968 namelen = udf_put_filename(sb, compstart, name,
985 symname - compstart); 969 symname - compstart);
986 if (!namelen) 970 if (!namelen)
987 goto out_no_entry; 971 goto out_no_entry;
@@ -1015,27 +999,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1015 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 999 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1016 if (!fi) 1000 if (!fi)
1017 goto out_no_entry; 1001 goto out_no_entry;
1018 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1002 cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
1019 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); 1003 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
1020 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1004 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1021 if (bh) {
1022 struct logicalVolIntegrityDesc *lvid =
1023 (struct logicalVolIntegrityDesc *)bh->b_data;
1024 struct logicalVolHeaderDesc *lvhd;
1025 uint64_t uniqueID;
1026 lvhd = (struct logicalVolHeaderDesc *)
1027 lvid->logicalVolContentsUse;
1028 uniqueID = le64_to_cpu(lvhd->uniqueID);
1029 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1005 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1030 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1006 cpu_to_le32(lvid_get_unique_id(sb));
1031 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1032 uniqueID += 16;
1033 lvhd->uniqueID = cpu_to_le64(uniqueID);
1034 mark_buffer_dirty(bh);
1035 } 1007 }
1036 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1008 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1037 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1009 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1038 mark_inode_dirty(dir); 1010 mark_inode_dirty(dir);
1011 up_write(&iinfo->i_data_sem);
1039 if (fibh.sbh != fibh.ebh) 1012 if (fibh.sbh != fibh.ebh)
1040 brelse(fibh.ebh); 1013 brelse(fibh.ebh);
1041 brelse(fibh.sbh); 1014 brelse(fibh.sbh);
@@ -1044,10 +1017,10 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1044 1017
1045out: 1018out:
1046 kfree(name); 1019 kfree(name);
1047 unlock_kernel();
1048 return err; 1020 return err;
1049 1021
1050out_no_entry: 1022out_no_entry:
1023 up_write(&iinfo->i_data_sem);
1051 inode_dec_link_count(inode); 1024 inode_dec_link_count(inode);
1052 iput(inode); 1025 iput(inode);
1053 goto out; 1026 goto out;
@@ -1060,36 +1033,20 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1060 struct udf_fileident_bh fibh; 1033 struct udf_fileident_bh fibh;
1061 struct fileIdentDesc cfi, *fi; 1034 struct fileIdentDesc cfi, *fi;
1062 int err; 1035 int err;
1063 struct buffer_head *bh;
1064 1036
1065 lock_kernel();
1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1067 unlock_kernel();
1068 return -EMLINK; 1038 return -EMLINK;
1069 } 1039 }
1070 1040
1071 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1072 if (!fi) { 1042 if (!fi) {
1073 unlock_kernel();
1074 return err; 1043 return err;
1075 } 1044 }
1076 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); 1045 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
1077 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); 1046 cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
1078 bh = UDF_SB(inode->i_sb)->s_lvid_bh; 1047 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1079 if (bh) {
1080 struct logicalVolIntegrityDesc *lvid =
1081 (struct logicalVolIntegrityDesc *)bh->b_data;
1082 struct logicalVolHeaderDesc *lvhd;
1083 uint64_t uniqueID;
1084 lvhd = (struct logicalVolHeaderDesc *)
1085 (lvid->logicalVolContentsUse);
1086 uniqueID = le64_to_cpu(lvhd->uniqueID);
1087 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = 1048 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1088 cpu_to_le32(uniqueID & 0x00000000FFFFFFFFUL); 1049 cpu_to_le32(lvid_get_unique_id(inode->i_sb));
1089 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
1090 uniqueID += 16;
1091 lvhd->uniqueID = cpu_to_le64(uniqueID);
1092 mark_buffer_dirty(bh);
1093 } 1050 }
1094 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1051 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1095 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1052 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
@@ -1101,9 +1058,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1101 inc_nlink(inode); 1058 inc_nlink(inode);
1102 inode->i_ctime = current_fs_time(inode->i_sb); 1059 inode->i_ctime = current_fs_time(inode->i_sb);
1103 mark_inode_dirty(inode); 1060 mark_inode_dirty(inode);
1104 atomic_inc(&inode->i_count); 1061 ihold(inode);
1105 d_instantiate(dentry, inode); 1062 d_instantiate(dentry, inode);
1106 unlock_kernel();
1107 1063
1108 return 0; 1064 return 0;
1109} 1065}
@@ -1124,7 +1080,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1124 struct kernel_lb_addr tloc; 1080 struct kernel_lb_addr tloc;
1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1081 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1126 1082
1127 lock_kernel();
1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1083 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1129 if (ofi) { 1084 if (ofi) {
1130 if (ofibh.sbh != ofibh.ebh) 1085 if (ofibh.sbh != ofibh.ebh)
@@ -1248,7 +1203,6 @@ end_rename:
1248 brelse(nfibh.ebh); 1203 brelse(nfibh.ebh);
1249 brelse(nfibh.sbh); 1204 brelse(nfibh.sbh);
1250 } 1205 }
1251 unlock_kernel();
1252 1206
1253 return retval; 1207 return retval;
1254} 1208}
@@ -1261,7 +1215,6 @@ static struct dentry *udf_get_parent(struct dentry *child)
1261 struct fileIdentDesc cfi; 1215 struct fileIdentDesc cfi;
1262 struct udf_fileident_bh fibh; 1216 struct udf_fileident_bh fibh;
1263 1217
1264 lock_kernel();
1265 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) 1218 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
1266 goto out_unlock; 1219 goto out_unlock;
1267 1220
@@ -1273,11 +1226,9 @@ static struct dentry *udf_get_parent(struct dentry *child)
1273 inode = udf_iget(child->d_inode->i_sb, &tloc); 1226 inode = udf_iget(child->d_inode->i_sb, &tloc);
1274 if (!inode) 1227 if (!inode)
1275 goto out_unlock; 1228 goto out_unlock;
1276 unlock_kernel();
1277 1229
1278 return d_obtain_alias(inode); 1230 return d_obtain_alias(inode);
1279out_unlock: 1231out_unlock:
1280 unlock_kernel();
1281 return ERR_PTR(-EACCES); 1232 return ERR_PTR(-EACCES);
1282} 1233}
1283 1234
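
A note on the namei.c hunks above: with the BKL gone, the in-ICB symlink body is protected by the new per-inode rwsem, so every exit path has to release it — hence the up_write() added on both the success path and the out_no_entry error path. The matching down_write() lands earlier in udf_symlink(), outside the hunks shown. A minimal sketch of the discipline (control flow condensed, the failure condition purely illustrative):

	down_write(&iinfo->i_data_sem);
	/* ... build path components in the in-ICB area ... */
	if (!namelen)				/* illustrative failure */
		goto out_no_entry;
	/* ... add the directory entry, stamp the unique ID ... */
	up_write(&iinfo->i_data_sem);		/* success path */
	return err;
out_no_entry:
	up_write(&iinfo->i_data_sem);		/* error path drops it too */
	inode_dec_link_count(inode);
	iput(inode);
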
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 745eb209be0c..a71090ea0e07 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mutex.h>
28 29
29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
30 uint16_t partition, uint32_t offset) 31 uint16_t partition, uint32_t offset)
@@ -159,7 +160,9 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
159 struct udf_sb_info *sbi = UDF_SB(sb); 160 struct udf_sb_info *sbi = UDF_SB(sb);
160 u16 reallocationTableLen; 161 u16 reallocationTableLen;
161 struct buffer_head *bh; 162 struct buffer_head *bh;
163 int ret = 0;
162 164
165 mutex_lock(&sbi->s_alloc_mutex);
163 for (i = 0; i < sbi->s_partitions; i++) { 166 for (i = 0; i < sbi->s_partitions; i++) {
164 struct udf_part_map *map = &sbi->s_partmaps[i]; 167 struct udf_part_map *map = &sbi->s_partmaps[i];
165 if (old_block > map->s_partition_root && 168 if (old_block > map->s_partition_root &&
@@ -175,8 +178,10 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
175 break; 178 break;
176 } 179 }
177 180
178 if (!st) 181 if (!st) {
179 return 1; 182 ret = 1;
183 goto out;
184 }
180 185
181 reallocationTableLen = 186 reallocationTableLen =
182 le16_to_cpu(st->reallocationTableLen); 187 le16_to_cpu(st->reallocationTableLen);
@@ -207,14 +212,16 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
207 ((old_block - 212 ((old_block -
208 map->s_partition_root) & 213 map->s_partition_root) &
209 (sdata->s_packet_len - 1)); 214 (sdata->s_packet_len - 1));
210 return 0; 215 ret = 0;
216 goto out;
211 } else if (origLoc == packet) { 217 } else if (origLoc == packet) {
212 *new_block = le32_to_cpu( 218 *new_block = le32_to_cpu(
213 entry->mappedLocation) + 219 entry->mappedLocation) +
214 ((old_block - 220 ((old_block -
215 map->s_partition_root) & 221 map->s_partition_root) &
216 (sdata->s_packet_len - 1)); 222 (sdata->s_packet_len - 1));
217 return 0; 223 ret = 0;
224 goto out;
218 } else if (origLoc > packet) 225 } else if (origLoc > packet)
219 break; 226 break;
220 } 227 }
@@ -251,20 +258,24 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
251 st->mapEntry[k].mappedLocation) + 258 st->mapEntry[k].mappedLocation) +
252 ((old_block - map->s_partition_root) & 259 ((old_block - map->s_partition_root) &
253 (sdata->s_packet_len - 1)); 260 (sdata->s_packet_len - 1));
254 return 0; 261 ret = 0;
262 goto out;
255 } 263 }
256 264
257 return 1; 265 ret = 1;
266 goto out;
258 } /* if old_block */ 267 } /* if old_block */
259 } 268 }
260 269
261 if (i == sbi->s_partitions) { 270 if (i == sbi->s_partitions) {
262 /* outside of partitions */ 271 /* outside of partitions */
263 /* for now, fail =) */ 272 /* for now, fail =) */
264 return 1; 273 ret = 1;
265 } 274 }
266 275
267 return 0; 276out:
277 mutex_unlock(&sbi->s_alloc_mutex);
278 return ret;
268} 279}
269 280
270static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, 281static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
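
The partition.c conversion above is the classic single-exit rewrite: udf_relocate_blocks() used to return directly from half a dozen branches, and each return becomes ret = ...; goto out; so that the newly taken s_alloc_mutex is released exactly once. The generic shape, with the condition purely illustrative:

	int relocate_like_op(struct udf_sb_info *sbi)
	{
		int ret = 0;

		mutex_lock(&sbi->s_alloc_mutex);
		if (!found_sparing_table) {	/* illustrative condition */
			ret = 1;
			goto out;
		}
		/* ... table walk and remapping under the mutex ... */
	out:
		mutex_unlock(&sbi->s_alloc_mutex);
		return ret;
	}
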
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 65412d84a45d..7b27b063ff6d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -48,7 +48,6 @@
48#include <linux/stat.h> 48#include <linux/stat.h>
49#include <linux/cdrom.h> 49#include <linux/cdrom.h>
50#include <linux/nls.h> 50#include <linux/nls.h>
51#include <linux/smp_lock.h>
52#include <linux/buffer_head.h> 51#include <linux/buffer_head.h>
53#include <linux/vfs.h> 52#include <linux/vfs.h>
54#include <linux/vmalloc.h> 53#include <linux/vmalloc.h>
@@ -107,17 +106,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
107} 106}
108 107
109/* UDF filesystem type */ 108/* UDF filesystem type */
110static int udf_get_sb(struct file_system_type *fs_type, 109static struct dentry *udf_mount(struct file_system_type *fs_type,
111 int flags, const char *dev_name, void *data, 110 int flags, const char *dev_name, void *data)
112 struct vfsmount *mnt)
113{ 111{
114 return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt); 112 return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
115} 113}
116 114
117static struct file_system_type udf_fstype = { 115static struct file_system_type udf_fstype = {
118 .owner = THIS_MODULE, 116 .owner = THIS_MODULE,
119 .name = "udf", 117 .name = "udf",
120 .get_sb = udf_get_sb, 118 .mount = udf_mount,
121 .kill_sb = kill_block_super, 119 .kill_sb = kill_block_super,
122 .fs_flags = FS_REQUIRES_DEV, 120 .fs_flags = FS_REQUIRES_DEV,
123}; 121};
@@ -136,15 +134,23 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
136 ei->i_next_alloc_block = 0; 134 ei->i_next_alloc_block = 0;
137 ei->i_next_alloc_goal = 0; 135 ei->i_next_alloc_goal = 0;
138 ei->i_strat4096 = 0; 136 ei->i_strat4096 = 0;
137 init_rwsem(&ei->i_data_sem);
139 138
140 return &ei->vfs_inode; 139 return &ei->vfs_inode;
141} 140}
142 141
143static void udf_destroy_inode(struct inode *inode) 142static void udf_i_callback(struct rcu_head *head)
144{ 143{
144 struct inode *inode = container_of(head, struct inode, i_rcu);
145 INIT_LIST_HEAD(&inode->i_dentry);
145 kmem_cache_free(udf_inode_cachep, UDF_I(inode)); 146 kmem_cache_free(udf_inode_cachep, UDF_I(inode));
146} 147}
147 148
149static void udf_destroy_inode(struct inode *inode)
150{
151 call_rcu(&inode->i_rcu, udf_i_callback);
152}
153
148static void init_once(void *foo) 154static void init_once(void *foo)
149{ 155{
150 struct udf_inode_info *ei = (struct udf_inode_info *)foo; 156 struct udf_inode_info *ei = (struct udf_inode_info *)foo;
@@ -568,13 +574,14 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 if (!udf_parse_options(options, &uopt, true)) 574 if (!udf_parse_options(options, &uopt, true))
569 return -EINVAL; 575 return -EINVAL;
570 576
571 lock_kernel(); 577 write_lock(&sbi->s_cred_lock);
572 sbi->s_flags = uopt.flags; 578 sbi->s_flags = uopt.flags;
573 sbi->s_uid = uopt.uid; 579 sbi->s_uid = uopt.uid;
574 sbi->s_gid = uopt.gid; 580 sbi->s_gid = uopt.gid;
575 sbi->s_umask = uopt.umask; 581 sbi->s_umask = uopt.umask;
576 sbi->s_fmode = uopt.fmode; 582 sbi->s_fmode = uopt.fmode;
577 sbi->s_dmode = uopt.dmode; 583 sbi->s_dmode = uopt.dmode;
584 write_unlock(&sbi->s_cred_lock);
578 585
579 if (sbi->s_lvid_bh) { 586 if (sbi->s_lvid_bh) {
580 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 587 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -591,7 +598,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
591 udf_open_lvid(sb); 598 udf_open_lvid(sb);
592 599
593out_unlock: 600out_unlock:
594 unlock_kernel();
595 return error; 601 return error;
596} 602}
597 603
@@ -960,9 +966,9 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
960 (sizeof(struct buffer_head *) * nr_groups); 966 (sizeof(struct buffer_head *) * nr_groups);
961 967
962 if (size <= PAGE_SIZE) 968 if (size <= PAGE_SIZE)
963 bitmap = kmalloc(size, GFP_KERNEL); 969 bitmap = kzalloc(size, GFP_KERNEL);
964 else 970 else
965 bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ 971 bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
966 972
967 if (bitmap == NULL) { 973 if (bitmap == NULL) {
968 udf_error(sb, __func__, 974 udf_error(sb, __func__,
@@ -971,7 +977,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
971 return NULL; 977 return NULL;
972 } 978 }
973 979
974 memset(bitmap, 0x00, size);
975 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); 980 bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
976 bitmap->s_nr_groups = nr_groups; 981 bitmap->s_nr_groups = nr_groups;
977 return bitmap; 982 return bitmap;
@@ -1775,6 +1780,8 @@ static void udf_open_lvid(struct super_block *sb)
1775 1780
1776 if (!bh) 1781 if (!bh)
1777 return; 1782 return;
1783
1784 mutex_lock(&sbi->s_alloc_mutex);
1778 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1785 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1779 lvidiu = udf_sb_lvidiu(sbi); 1786 lvidiu = udf_sb_lvidiu(sbi);
1780 1787
@@ -1791,6 +1798,7 @@ static void udf_open_lvid(struct super_block *sb)
1791 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1798 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1792 mark_buffer_dirty(bh); 1799 mark_buffer_dirty(bh);
1793 sbi->s_lvid_dirty = 0; 1800 sbi->s_lvid_dirty = 0;
1801 mutex_unlock(&sbi->s_alloc_mutex);
1794} 1802}
1795 1803
1796static void udf_close_lvid(struct super_block *sb) 1804static void udf_close_lvid(struct super_block *sb)
@@ -1803,6 +1811,7 @@ static void udf_close_lvid(struct super_block *sb)
1803 if (!bh) 1811 if (!bh)
1804 return; 1812 return;
1805 1813
1814 mutex_lock(&sbi->s_alloc_mutex);
1806 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1815 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1807 lvidiu = udf_sb_lvidiu(sbi); 1816 lvidiu = udf_sb_lvidiu(sbi);
1808 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1817 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1823,6 +1832,34 @@ static void udf_close_lvid(struct super_block *sb)
1823 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1832 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1824 mark_buffer_dirty(bh); 1833 mark_buffer_dirty(bh);
1825 sbi->s_lvid_dirty = 0; 1834 sbi->s_lvid_dirty = 0;
1835 mutex_unlock(&sbi->s_alloc_mutex);
1836}
1837
1838u64 lvid_get_unique_id(struct super_block *sb)
1839{
1840 struct buffer_head *bh;
1841 struct udf_sb_info *sbi = UDF_SB(sb);
1842 struct logicalVolIntegrityDesc *lvid;
1843 struct logicalVolHeaderDesc *lvhd;
1844 u64 uniqueID;
1845 u64 ret;
1846
1847 bh = sbi->s_lvid_bh;
1848 if (!bh)
1849 return 0;
1850
1851 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1852 lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;
1853
1854 mutex_lock(&sbi->s_alloc_mutex);
1855 ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
1856 if (!(++uniqueID & 0xFFFFFFFF))
1857 uniqueID += 16;
1858 lvhd->uniqueID = cpu_to_le64(uniqueID);
1859 mutex_unlock(&sbi->s_alloc_mutex);
1860 mark_buffer_dirty(bh);
1861
1862 return ret;
1826} 1863}
1827 1864
1828static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1865static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1926,6 +1963,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1926 sbi->s_fmode = uopt.fmode; 1963 sbi->s_fmode = uopt.fmode;
1927 sbi->s_dmode = uopt.dmode; 1964 sbi->s_dmode = uopt.dmode;
1928 sbi->s_nls_map = uopt.nls_map; 1965 sbi->s_nls_map = uopt.nls_map;
1966 rwlock_init(&sbi->s_cred_lock);
1929 1967
1930 if (uopt.session == 0xFFFFFFFF) 1968 if (uopt.session == 0xFFFFFFFF)
1931 sbi->s_session = udf_get_last_session(sb); 1969 sbi->s_session = udf_get_last_session(sb);
@@ -2093,8 +2131,6 @@ static void udf_put_super(struct super_block *sb)
2093 2131
2094 sbi = UDF_SB(sb); 2132 sbi = UDF_SB(sb);
2095 2133
2096 lock_kernel();
2097
2098 if (sbi->s_vat_inode) 2134 if (sbi->s_vat_inode)
2099 iput(sbi->s_vat_inode); 2135 iput(sbi->s_vat_inode);
2100 if (sbi->s_partitions) 2136 if (sbi->s_partitions)
@@ -2110,8 +2146,6 @@ static void udf_put_super(struct super_block *sb)
2110 kfree(sbi->s_partmaps); 2146 kfree(sbi->s_partmaps);
2111 kfree(sb->s_fs_info); 2147 kfree(sb->s_fs_info);
2112 sb->s_fs_info = NULL; 2148 sb->s_fs_info = NULL;
2113
2114 unlock_kernel();
2115} 2149}
2116 2150
2117static int udf_sync_fs(struct super_block *sb, int wait) 2151static int udf_sync_fs(struct super_block *sb, int wait)
@@ -2174,8 +2208,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2174 uint16_t ident; 2208 uint16_t ident;
2175 struct spaceBitmapDesc *bm; 2209 struct spaceBitmapDesc *bm;
2176 2210
2177 lock_kernel();
2178
2179 loc.logicalBlockNum = bitmap->s_extPosition; 2211 loc.logicalBlockNum = bitmap->s_extPosition;
2180 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2212 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2181 bh = udf_read_ptagged(sb, &loc, 0, &ident); 2213 bh = udf_read_ptagged(sb, &loc, 0, &ident);
@@ -2212,10 +2244,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2212 } 2244 }
2213 } 2245 }
2214 brelse(bh); 2246 brelse(bh);
2215
2216out: 2247out:
2217 unlock_kernel();
2218
2219 return accum; 2248 return accum;
2220} 2249}
2221 2250
@@ -2228,8 +2257,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2228 int8_t etype; 2257 int8_t etype;
2229 struct extent_position epos; 2258 struct extent_position epos;
2230 2259
2231 lock_kernel(); 2260 mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
2232
2233 epos.block = UDF_I(table)->i_location; 2261 epos.block = UDF_I(table)->i_location;
2234 epos.offset = sizeof(struct unallocSpaceEntry); 2262 epos.offset = sizeof(struct unallocSpaceEntry);
2235 epos.bh = NULL; 2263 epos.bh = NULL;
@@ -2238,8 +2266,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2238 accum += (elen >> table->i_sb->s_blocksize_bits); 2266 accum += (elen >> table->i_sb->s_blocksize_bits);
2239 2267
2240 brelse(epos.bh); 2268 brelse(epos.bh);
2241 2269 mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
2242 unlock_kernel();
2243 2270
2244 return accum; 2271 return accum;
2245} 2272}
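
One detail of the new lvid_get_unique_id() helper worth spelling out: UDF reserves unique IDs 0-15, so when the incremented counter's low 32 bits wrap to zero the helper skips ahead by 16 before storing the value back. A worked example of the wrap, following the code above:

	/*
	 * stored lvhd->uniqueID  : 0x00000001FFFFFFFF
	 * value returned         : 0x00000001FFFFFFFF
	 * ++uniqueID             : 0x0000000200000000  (low 32 bits == 0)
	 * += 16                  : 0x0000000200000010
	 * stored for next caller : 0x0000000200000010
	 */

Callers in namei.c then keep only the low 32 bits, via cpu_to_le32(), for the directory entry.
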
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 16064787d2b7..b1d4488b0f14 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -27,7 +27,6 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/pagemap.h> 29#include <linux/pagemap.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include "udf_i.h" 31#include "udf_i.h"
33 32
@@ -78,13 +77,16 @@ static int udf_symlink_filler(struct file *file, struct page *page)
78 int err = -EIO; 77 int err = -EIO;
79 unsigned char *p = kmap(page); 78 unsigned char *p = kmap(page);
80 struct udf_inode_info *iinfo; 79 struct udf_inode_info *iinfo;
80 uint32_t pos;
81 81
82 lock_kernel();
83 iinfo = UDF_I(inode); 82 iinfo = UDF_I(inode);
83 pos = udf_block_map(inode, 0);
84
85 down_read(&iinfo->i_data_sem);
84 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 86 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
85 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr; 87 symlink = iinfo->i_ext.i_data + iinfo->i_lenEAttr;
86 } else { 88 } else {
87 bh = sb_bread(inode->i_sb, udf_block_map(inode, 0)); 89 bh = sb_bread(inode->i_sb, pos);
88 90
89 if (!bh) 91 if (!bh)
90 goto out; 92 goto out;
@@ -95,14 +97,14 @@ static int udf_symlink_filler(struct file *file, struct page *page)
95 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); 97 udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
96 brelse(bh); 98 brelse(bh);
97 99
98 unlock_kernel(); 100 up_read(&iinfo->i_data_sem);
99 SetPageUptodate(page); 101 SetPageUptodate(page);
100 kunmap(page); 102 kunmap(page);
101 unlock_page(page); 103 unlock_page(page);
102 return 0; 104 return 0;
103 105
104out: 106out:
105 unlock_kernel(); 107 up_read(&iinfo->i_data_sem);
106 SetPageError(page); 108 SetPageError(page);
107 kunmap(page); 109 kunmap(page);
108 unlock_page(page); 110 unlock_page(page);
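
Why the symlink filler now resolves the block address before taking the lock: udf_block_map() is not shown in this excerpt, but elsewhere in this series it acquires i_data_sem itself, so calling it with the semaphore already read-held could deadlock once a writer queues between the two read acquisitions. Hoisting the call keeps the locking non-recursive:

	pos = udf_block_map(inode, 0);	/* may take i_data_sem internally */
	down_read(&iinfo->i_data_sem);	/* now taken exactly once here */
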
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index e58d1de41073..d1bd31ea724e 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,18 @@
1#ifndef _UDF_I_H 1#ifndef _UDF_I_H
2#define _UDF_I_H 2#define _UDF_I_H
3 3
4/*
5 * The i_data_sem and i_mutex serve for protection of allocation information
6 * of regular files and symlinks. This includes all extents belonging to
7 * the file/symlink, whether the data live in-inode or in external data
8 * blocks, preallocation, goal block information... When extents are read,
9 * either i_mutex or i_data_sem must be held (holding i_data_sem for
10 * reading is enough). When extents are changed, i_data_sem must be held
11 * for writing and i_mutex must be held as well.
12 *
13 * For directories i_mutex is used for all the necessary protection.
14 */
15
4struct udf_inode_info { 16struct udf_inode_info {
5 struct timespec i_crtime; 17 struct timespec i_crtime;
6 /* Physical address of inode */ 18 /* Physical address of inode */
@@ -21,6 +33,7 @@ struct udf_inode_info {
21 struct long_ad *i_lad; 33 struct long_ad *i_lad;
22 __u8 *i_data; 34 __u8 *i_data;
23 } i_ext; 35 } i_ext;
36 struct rw_semaphore i_data_sem;
24 struct inode vfs_inode; 37 struct inode vfs_inode;
25}; 38};
26 39
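
The comment above translates into the following caller-side discipline (a sketch; the field names follow the struct, the surrounding operations are illustrative):

	/* reading extents: either lock is sufficient */
	down_read(&iinfo->i_data_sem);
	/* ... walk extents ... */
	up_read(&iinfo->i_data_sem);

	/* changing extents: i_mutex plus the rwsem held for writing */
	mutex_lock(&inode->i_mutex);
	down_write(&iinfo->i_data_sem);
	/* ... modify extents / in-ICB data ... */
	up_write(&iinfo->i_data_sem);
	mutex_unlock(&inode->i_mutex);
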
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index d113b72c2768..4858c191242b 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -2,6 +2,7 @@
2#define __LINUX_UDF_SB_H 2#define __LINUX_UDF_SB_H
3 3
4#include <linux/mutex.h> 4#include <linux/mutex.h>
5#include <linux/bitops.h>
5 6
6/* Since UDF 2.01 is ISO 13346 based... */ 7/* Since UDF 2.01 is ISO 13346 based... */
7#define UDF_SUPER_MAGIC 0x15013346 8#define UDF_SUPER_MAGIC 0x15013346
@@ -128,6 +129,8 @@ struct udf_sb_info {
128 uid_t s_uid; 129 uid_t s_uid;
129 mode_t s_fmode; 130 mode_t s_fmode;
130 mode_t s_dmode; 131 mode_t s_dmode;
132 /* Lock protecting consistency of above permission settings */
133 rwlock_t s_cred_lock;
131 134
132 /* Root Info */ 135 /* Root Info */
133 struct timespec s_record_time; 136 struct timespec s_record_time;
@@ -139,7 +142,7 @@ struct udf_sb_info {
139 __u16 s_udfrev; 142 __u16 s_udfrev;
140 143
141 /* Miscellaneous flags */ 144 /* Miscellaneous flags */
142 __u32 s_flags; 145 unsigned long s_flags;
143 146
144 /* Encoding info */ 147 /* Encoding info */
145 struct nls_table *s_nls_map; 148 struct nls_table *s_nls_map;
@@ -161,8 +164,19 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi);
161 164
162int udf_compute_nr_groups(struct super_block *sb, u32 partition); 165int udf_compute_nr_groups(struct super_block *sb, u32 partition);
163 166
164#define UDF_QUERY_FLAG(X,Y) ( UDF_SB(X)->s_flags & ( 1 << (Y) ) ) 167static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag)
165#define UDF_SET_FLAG(X,Y) ( UDF_SB(X)->s_flags |= ( 1 << (Y) ) ) 168{
166#define UDF_CLEAR_FLAG(X,Y) ( UDF_SB(X)->s_flags &= ~( 1 << (Y) ) ) 169 return test_bit(flag, &UDF_SB(sb)->s_flags);
170}
171
172static inline void UDF_SET_FLAG(struct super_block *sb, int flag)
173{
174 set_bit(flag, &UDF_SB(sb)->s_flags);
175}
176
177static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag)
178{
179 clear_bit(flag, &UDF_SB(sb)->s_flags);
180}
167 181
168#endif /* __LINUX_UDF_SB_H */ 182#endif /* __LINUX_UDF_SB_H */
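
The macro-to-inline conversion above is not merely cosmetic: test_bit()/set_bit()/clear_bit() are atomic and operate on unsigned long words, which is why the adjacent hunk widens s_flags from __u32 to unsigned long; the old |=/&= macros were plain read-modify-write sequences that could lose concurrent flag updates. Call sites stay unchanged, e.g. (flag name for illustration):

	if (UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
		/* ... strict-conformance handling ... */;
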
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 6995ab1f4305..eba48209f9f3 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -111,6 +111,8 @@ struct extent_position {
111}; 111};
112 112
113/* super.c */ 113/* super.c */
114
115__attribute__((format(printf, 3, 4)))
114extern void udf_warning(struct super_block *, const char *, const char *, ...); 116extern void udf_warning(struct super_block *, const char *, const char *, ...);
115static inline void udf_updated_lvid(struct super_block *sb) 117static inline void udf_updated_lvid(struct super_block *sb)
116{ 118{
@@ -123,6 +125,7 @@ static inline void udf_updated_lvid(struct super_block *sb)
123 sb->s_dirt = 1; 125 sb->s_dirt = 1;
124 UDF_SB(sb)->s_lvid_dirty = 1; 126 UDF_SB(sb)->s_lvid_dirty = 1;
125} 127}
128extern u64 lvid_get_unique_id(struct super_block *sb);
126 129
127/* namei.c */ 130/* namei.c */
128extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 131extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -133,7 +136,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134/* inode.c */ 137/* inode.c */
135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
136extern int udf_sync_inode(struct inode *);
137extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern void udf_expand_file_adinicb(struct inode *, int, int *);
138extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
139extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
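
Annotating udf_warning() with format(printf, 3, 4) lets the compiler check the varargs against the format string: argument 3 is the format, and the checked arguments start at 4. A mismatch that previously compiled silently now draws a warning, e.g. (illustrative call):

	udf_warning(sb, __func__, "bad ident %d", "not-an-int");
	/* gcc: format '%d' expects argument of type 'int' */
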
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768a..30c8f223253d 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
4 help 5 help
5 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD, 6 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
6 OpenBSD and NeXTstep) use a file system called UFS. Some System V 7 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb3..12f39b9e4437 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
180 180
181 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
182 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
183 atomic_inc(&inode->i_count); 183 ihold(inode);
184 184
185 error = ufs_add_nondir(dentry, inode); 185 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 186 unlock_kernel();
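
The atomic_inc(&inode->i_count) -> ihold() substitution recurs throughout this series. ihold() is the VFS helper for taking an extra reference on an inode the caller already holds pinned; per fs/inode.c of this era it is roughly:

	void ihold(struct inode *inode)
	{
		WARN_ON(atomic_inc_return(&inode->i_count) < 2);
	}

so on top of the increment it warns if the caller raced with the final iput().
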
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index d510c1b91817..2c61ac5d4e48 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -696,6 +696,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
696 unsigned maxsymlen; 696 unsigned maxsymlen;
697 int ret = -EINVAL; 697 int ret = -EINVAL;
698 698
699 lock_kernel();
700
699 uspi = NULL; 701 uspi = NULL;
700 ubh = NULL; 702 ubh = NULL;
701 flags = 0; 703 flags = 0;
@@ -1163,6 +1165,7 @@ magic_found:
1163 goto failed; 1165 goto failed;
1164 1166
1165 UFSD("EXIT\n"); 1167 UFSD("EXIT\n");
1168 unlock_kernel();
1166 return 0; 1169 return 0;
1167 1170
1168dalloc_failed: 1171dalloc_failed:
@@ -1174,10 +1177,12 @@ failed:
1174 kfree(sbi); 1177 kfree(sbi);
1175 sb->s_fs_info = NULL; 1178 sb->s_fs_info = NULL;
1176 UFSD("EXIT (FAILED)\n"); 1179 UFSD("EXIT (FAILED)\n");
1180 unlock_kernel();
1177 return ret; 1181 return ret;
1178 1182
1179failed_nomem: 1183failed_nomem:
1180 UFSD("EXIT (NOMEM)\n"); 1184 UFSD("EXIT (NOMEM)\n");
1185 unlock_kernel();
1181 return -ENOMEM; 1186 return -ENOMEM;
1182} 1187}
1183 1188
@@ -1407,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
1407 return &ei->vfs_inode; 1412 return &ei->vfs_inode;
1408} 1413}
1409 1414
1410static void ufs_destroy_inode(struct inode *inode) 1415static void ufs_i_callback(struct rcu_head *head)
1411{ 1416{
1417 struct inode *inode = container_of(head, struct inode, i_rcu);
1418 INIT_LIST_HEAD(&inode->i_dentry);
1412 kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); 1419 kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
1413} 1420}
1414 1421
1422static void ufs_destroy_inode(struct inode *inode)
1423{
1424 call_rcu(&inode->i_rcu, ufs_i_callback);
1425}
1426
1415static void init_once(void *foo) 1427static void init_once(void *foo)
1416{ 1428{
1417 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; 1429 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
@@ -1449,16 +1461,16 @@ static const struct super_operations ufs_super_ops = {
1449 .show_options = ufs_show_options, 1461 .show_options = ufs_show_options,
1450}; 1462};
1451 1463
1452static int ufs_get_sb(struct file_system_type *fs_type, 1464static struct dentry *ufs_mount(struct file_system_type *fs_type,
1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1465 int flags, const char *dev_name, void *data)
1454{ 1466{
1455 return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt); 1467 return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
1456} 1468}
1457 1469
1458static struct file_system_type ufs_fs_type = { 1470static struct file_system_type ufs_fs_type = {
1459 .owner = THIS_MODULE, 1471 .owner = THIS_MODULE,
1460 .name = "ufs", 1472 .name = "ufs",
1461 .get_sb = ufs_get_sb, 1473 .mount = ufs_mount,
1462 .kill_sb = kill_block_super, 1474 .kill_sb = kill_block_super,
1463 .fs_flags = FS_REQUIRES_DEV, 1475 .fs_flags = FS_REQUIRES_DEV,
1464}; 1476};
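
Both the udf and ufs hunks apply the same RCU-delayed inode freeing, which lets RCU-walk path lookups keep dereferencing an inode while it is being torn down. Stripped to the recurring pattern (the foo_* names are placeholders):

	static void foo_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		INIT_LIST_HEAD(&inode->i_dentry);
		kmem_cache_free(foo_inode_cachep, FOO_I(inode));
	}

	static void foo_destroy_inode(struct inode *inode)
	{
		call_rcu(&inode->i_rcu, foo_i_callback);
	}
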
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f09..6100ec0fa1d4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
22config XFS_QUOTA 22config XFS_QUOTA
23 bool "XFS Quota support" 23 bool "XFS Quota support"
24 depends on XFS_FS 24 depends on XFS_FS
25 select QUOTACTL
25 help 26 help
26 If you say Y here, you will be able to set limits for disk usage on 27 If you say Y here, you will be able to set limits for disk usage on
27 a per user and/or a per group basis under XFS. XFS considers quota 28 a per user and/or a per group basis under XFS. XFS considers quota
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..faca44997099 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
98 kmem.o \ 98 kmem.o \
99 xfs_aops.o \ 99 xfs_aops.o \
100 xfs_buf.o \ 100 xfs_buf.o \
101 xfs_discard.o \
101 xfs_export.o \ 102 xfs_export.o \
102 xfs_file.o \ 103 xfs_file.o \
103 xfs_fs_subr.o \ 104 xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3d..39f4f809bb68 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
219} 219}
220 220
221int 221int
222xfs_check_acl(struct inode *inode, int mask) 222xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
223{ 223{
224 struct xfs_inode *ip = XFS_I(inode); 224 struct xfs_inode *ip;
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 ip = XFS_I(inode);
228 trace_xfs_check_acl(ip); 229 trace_xfs_check_acl(ip);
229 230
230 /* 231 /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
234 if (!XFS_IFORK_Q(ip)) 235 if (!XFS_IFORK_Q(ip))
235 return -EAGAIN; 236 return -EAGAIN;
236 237
238 if (flags & IPERM_FLAG_RCU) {
239 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
240 return -ECHILD;
241 return -EAGAIN;
242 }
243
237 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); 244 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
238 if (IS_ERR(acl)) 245 if (IS_ERR(acl))
239 return PTR_ERR(acl); 246 return PTR_ERR(acl);
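
The IPERM_FLAG_RCU branch above encodes the may-not-sleep contract of RCU path walk. Summarized as a decision table (return values exactly as in the hunk):

	/*
	 * no ACL fork on the inode         -> -EAGAIN (generic perm check)
	 * RCU walk, no cached negative ACL -> -ECHILD (VFS retries ref-walk)
	 * RCU walk, cached negative ACL    -> -EAGAIN (nothing to read, safe)
	 * ref walk                         -> fetch the ACL and check it
	 */
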
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..ec7bbb5645b6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
330
331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
327 342
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
382 WRITE_SYNC_PLUG : WRITE, bio); 417 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 418}
386 419
387STATIC struct bio * 420STATIC struct bio *
388xfs_alloc_ioend_bio( 421xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 422 struct buffer_head *bh)
390{ 423{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 424 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 425 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 426
399 ASSERT(bio->bi_private == NULL); 427 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 428 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 429 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 430 return bio;
404} 431}
405 432
@@ -470,9 +497,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 497 /* Pass 1 - start writeback */
471 do { 498 do {
472 next = ioend->io_list; 499 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 501 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 502 } while ((ioend = next) != NULL);
477 503
478 /* Pass 2 - submit I/O */ 504 /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 626 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 627 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 628
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 629 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 630 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 631 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 632 clear_buffer_unwritten(bh);
609} 633}
610 634
611/* 635/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 636 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 637 * or delayed allocate extent.
716 */ 638 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 653 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 654 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 655 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 656 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 657 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 658 acceptable = (type == IO_OVERWRITE);
737 else 659 else
738 break; 660 break;
739 } while ((bh = bh->b_this_page) != head); 661 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
758 loff_t tindex, 680 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 681 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 682 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 683 struct writeback_control *wbc)
762 int all_bh)
763{ 684{
764 struct buffer_head *bh, *head; 685 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 686 xfs_off_t end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
814 continue; 735 continue;
815 } 736 }
816 737
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 738 if (buffer_unwritten(bh) || buffer_delay(bh) ||
739 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 740 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 741 type = IO_UNWRITTEN;
742 else if (buffer_delay(bh))
743 type = IO_DELALLOC;
820 else 744 else
821 type = IO_DELAY; 745 type = IO_OVERWRITE;
822 746
823 if (!xfs_imap_valid(inode, imap, offset)) { 747 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 748 done = 1;
825 continue; 749 continue;
826 } 750 }
827 751
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 752 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 753 if (type != IO_OVERWRITE)
830 754 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 755 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 756 ioendp, done);
834 757
835 page_dirty--; 758 page_dirty--;
836 count++; 759 count++;
837 } else { 760 } else {
838 type = IO_NEW; 761 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 762 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 763 } while (offset += len, (bh = bh->b_this_page) != head);
850 764
@@ -876,7 +790,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 790 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 791 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 792 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 793 pgoff_t tlast)
881{ 794{
882 struct pagevec pvec; 795 struct pagevec pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
891 804
892 for (i = 0; i < pagevec_count(&pvec); i++) { 805 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 806 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 807 imap, ioendp, wbc);
895 if (done) 808 if (done)
896 break; 809 break;
897 } 810 }
@@ -934,9 +847,8 @@ xfs_aops_discard_page(
934 struct xfs_inode *ip = XFS_I(inode); 847 struct xfs_inode *ip = XFS_I(inode);
935 struct buffer_head *bh, *head; 848 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 849 loff_t offset = page_offset(page);
937 ssize_t len = 1 << inode->i_blkbits;
938 850
939 if (!xfs_is_delayed_page(page, IO_DELAY)) 851 if (!xfs_is_delayed_page(page, IO_DELALLOC))
940 goto out_invalidate; 852 goto out_invalidate;
941 853
942 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 854 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -949,58 +861,14 @@ xfs_aops_discard_page(
949 xfs_ilock(ip, XFS_ILOCK_EXCL); 861 xfs_ilock(ip, XFS_ILOCK_EXCL);
950 bh = head = page_buffers(page); 862 bh = head = page_buffers(page);
951 do { 863 do {
952 int done;
953 xfs_fileoff_t offset_fsb;
954 xfs_bmbt_irec_t imap;
955 int nimaps = 1;
956 int error; 864 int error;
957 xfs_fsblock_t firstblock; 865 xfs_fileoff_t start_fsb;
958 xfs_bmap_free_t flist;
959 866
960 if (!buffer_delay(bh)) 867 if (!buffer_delay(bh))
961 goto next_buffer; 868 goto next_buffer;
962 869
963 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 870 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
964 871 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
965 /*
966 * Map the range first and check that it is a delalloc extent
967 * before trying to unmap the range. Otherwise we will be
968 * trying to remove a real extent (which requires a
969 * transaction) or a hole, which is probably a bad idea...
970 */
971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
973 &nimaps, NULL);
974
975 if (error) {
976 /* something screwed, just bail */
977 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
978 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
979 "page discard failed delalloc mapping lookup.");
980 }
981 break;
982 }
983 if (!nimaps) {
984 /* nothing there */
985 goto next_buffer;
986 }
987 if (imap.br_startblock != DELAYSTARTBLOCK) {
988 /* been converted, ignore */
989 goto next_buffer;
990 }
991 WARN_ON(imap.br_blockcount == 0);
992
993 /*
994 * Note: while we initialise the firstblock/flist pair, they
995 * should never be used because blocks should never be
996 * allocated or freed for a delalloc extent and hence we need
997 * don't cancel or finish them after the xfs_bunmapi() call.
998 */
999 xfs_bmap_init(&flist, &firstblock);
1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
1001 &flist, &done);
1002
1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
1004 if (error) { 872 if (error) {
1005 /* something screwed, just bail */ 873 /* something screwed, just bail */
1006 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 874 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +878,7 @@ xfs_aops_discard_page(
1010 break; 878 break;
1011 } 879 }
1012next_buffer: 880next_buffer:
1013 offset += len; 881 offset += 1 << inode->i_blkbits;
1014 882
1015 } while ((bh = bh->b_this_page) != head); 883 } while ((bh = bh->b_this_page) != head);
1016 884
@@ -1047,10 +915,10 @@ xfs_vm_writepage(
1047 unsigned int type; 915 unsigned int type;
1048 __uint64_t end_offset; 916 __uint64_t end_offset;
1049 pgoff_t end_index, last_index; 917 pgoff_t end_index, last_index;
1050 ssize_t size, len; 918 ssize_t len;
1051 int flags, err, imap_valid = 0, uptodate = 1; 919 int err, imap_valid = 0, uptodate = 1;
1052 int count = 0; 920 int count = 0;
1053 int all_bh = 0; 921 int nonblocking = 0;
1054 922
1055 trace_xfs_writepage(inode, page, 0); 923 trace_xfs_writepage(inode, page, 0);
1056 924
@@ -1101,110 +969,78 @@ xfs_vm_writepage(
1101 969
1102 bh = head = page_buffers(page); 970 bh = head = page_buffers(page);
1103 offset = page_offset(page); 971 offset = page_offset(page);
1104 flags = BMAPI_READ; 972 type = IO_OVERWRITE;
1105 type = IO_NEW; 973
974 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
975 nonblocking = 1;
1106 976
1107 do { 977 do {
978 int new_ioend = 0;
979
1108 if (offset >= end_offset) 980 if (offset >= end_offset)
1109 break; 981 break;
1110 if (!buffer_uptodate(bh)) 982 if (!buffer_uptodate(bh))
1111 uptodate = 0; 983 uptodate = 0;
1112 984
1113 /* 985 /*
1114 * A hole may still be marked uptodate because discard_buffer 986 * set_page_dirty dirties all buffers in a page, independent
1115 * leaves the flag set. 987 * of their state. The dirty state however is entirely
988 * meaningless for holes (!mapped && uptodate), so skip
989 * buffers covering holes here.
1116 */ 990 */
1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) { 991 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0; 992 imap_valid = 0;
1120 continue; 993 continue;
1121 } 994 }
1122 995
1123 if (imap_valid) 996 if (buffer_unwritten(bh)) {
1124 imap_valid = xfs_imap_valid(inode, &imap, offset); 997 if (type != IO_UNWRITTEN) {
1125
1126 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1127 int new_ioend = 0;
1128
1129 /*
1130 * Make sure we don't use a read-only iomap
1131 */
1132 if (flags == BMAPI_READ)
1133 imap_valid = 0;
1134
1135 if (buffer_unwritten(bh)) {
1136 type = IO_UNWRITTEN; 998 type = IO_UNWRITTEN;
1137 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 999 imap_valid = 0;
1138 } else if (buffer_delay(bh)) {
1139 type = IO_DELAY;
1140 flags = BMAPI_ALLOCATE;
1141
1142 if (wbc->sync_mode == WB_SYNC_NONE &&
1143 wbc->nonblocking)
1144 flags |= BMAPI_TRYLOCK;
1145 }
1146
1147 if (!imap_valid) {
1148 /*
1149 * If we didn't have a valid mapping then we
1150 * need to ensure that we put the new mapping
1151 * in a new ioend structure. This needs to be
1152 * done to ensure that the ioends correctly
1153 * reflect the block mappings at io completion
1154 * for unwritten extent conversion.
1155 */
1156 new_ioend = 1;
1157 err = xfs_map_blocks(inode, offset, len,
1158 &imap, flags);
1159 if (err)
1160 goto error;
1161 imap_valid = xfs_imap_valid(inode, &imap,
1162 offset);
1163 } 1000 }
1164 if (imap_valid) { 1001 } else if (buffer_delay(bh)) {
1165 xfs_map_at_offset(inode, bh, &imap, offset); 1002 if (type != IO_DELALLOC) {
1166 xfs_add_to_ioend(inode, bh, offset, type, 1003 type = IO_DELALLOC;
1167 &ioend, new_ioend); 1004 imap_valid = 0;
1168 count++;
1169 } 1005 }
1170 } else if (buffer_uptodate(bh)) { 1006 } else if (buffer_uptodate(bh)) {
1171 /* 1007 if (type != IO_OVERWRITE) {
1172 * we got here because the buffer is already mapped. 1008 type = IO_OVERWRITE;
1173 * That means it must already have extents allocated 1009 imap_valid = 0;
1174 * underneath it. Map the extent by reading it.
1175 */
1176 if (!imap_valid || flags != BMAPI_READ) {
1177 flags = BMAPI_READ;
1178 size = xfs_probe_cluster(inode, page, bh, head);
1179 err = xfs_map_blocks(inode, offset, size,
1180 &imap, flags);
1181 if (err)
1182 goto error;
1183 imap_valid = xfs_imap_valid(inode, &imap,
1184 offset);
1185 } 1010 }
1011 } else {
1012 if (PageUptodate(page)) {
1013 ASSERT(buffer_mapped(bh));
1014 imap_valid = 0;
1015 }
1016 continue;
1017 }
1186 1018
1019 if (imap_valid)
1020 imap_valid = xfs_imap_valid(inode, &imap, offset);
1021 if (!imap_valid) {
1187 /* 1022 /*
1188 * We set the type to IO_NEW in case we are doing a 1023 * If we didn't have a valid mapping then we need to
1189 * small write at EOF that is extending the file but 1024 * put the new mapping into a separate ioend structure.
1190 * without needing an allocation. We need to update the 1025 * This ensures non-contiguous extents always have
1191 * file size on I/O completion in this case so it is 1026 * separate ioends, which is particularly important
1192 * the same case as having just allocated a new extent 1027 * for unwritten extent conversion at I/O completion
1193 * that we are writing into for the first time. 1028 * time.
1194 */ 1029 */
1195 type = IO_NEW; 1030 new_ioend = 1;
1196 if (trylock_buffer(bh)) { 1031 err = xfs_map_blocks(inode, offset, &imap, type,
1197 if (imap_valid) 1032 nonblocking);
1198 all_bh = 1; 1033 if (err)
1199 xfs_add_to_ioend(inode, bh, offset, type, 1034 goto error;
1200 &ioend, !imap_valid); 1035 imap_valid = xfs_imap_valid(inode, &imap, offset);
1201 count++; 1036 }
1202 } else { 1037 if (imap_valid) {
1203 imap_valid = 0; 1038 lock_buffer(bh);
1204 } 1039 if (type != IO_OVERWRITE)
1205 } else if (PageUptodate(page)) { 1040 xfs_map_at_offset(inode, bh, &imap, offset);
1206 ASSERT(buffer_mapped(bh)); 1041 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1207 imap_valid = 0; 1042 new_ioend);
1043 count++;
1208 } 1044 }
1209 1045
1210 if (!iohead) 1046 if (!iohead)
@@ -1233,7 +1069,7 @@ xfs_vm_writepage(
1233 end_index = last_index; 1069 end_index = last_index;
1234 1070
1235 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1071 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1236 wbc, all_bh, end_index); 1072 wbc, end_index);
1237 } 1073 }
1238 1074
1239 if (iohead) 1075 if (iohead)
@@ -1302,13 +1138,19 @@ __xfs_get_blocks(
1302 int create, 1138 int create,
1303 int direct) 1139 int direct)
1304{ 1140{
1305 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1141 struct xfs_inode *ip = XFS_I(inode);
1142 struct xfs_mount *mp = ip->i_mount;
1143 xfs_fileoff_t offset_fsb, end_fsb;
1144 int error = 0;
1145 int lockmode = 0;
1306 struct xfs_bmbt_irec imap; 1146 struct xfs_bmbt_irec imap;
1147 int nimaps = 1;
1307 xfs_off_t offset; 1148 xfs_off_t offset;
1308 ssize_t size; 1149 ssize_t size;
1309 int nimap = 1;
1310 int new = 0; 1150 int new = 0;
1311 int error; 1151
1152 if (XFS_FORCED_SHUTDOWN(mp))
1153 return -XFS_ERROR(EIO);
1312 1154
1313 offset = (xfs_off_t)iblock << inode->i_blkbits; 1155 offset = (xfs_off_t)iblock << inode->i_blkbits;
1314 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1156 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1317,15 +1159,45 @@ __xfs_get_blocks(
1317 if (!create && direct && offset >= i_size_read(inode)) 1159 if (!create && direct && offset >= i_size_read(inode))
1318 return 0; 1160 return 0;
1319 1161
1320 if (direct && create) 1162 if (create) {
1321 flags |= BMAPI_DIRECT; 1163 lockmode = XFS_ILOCK_EXCL;
1164 xfs_ilock(ip, lockmode);
1165 } else {
1166 lockmode = xfs_ilock_map_shared(ip);
1167 }
1168
1169 ASSERT(offset <= mp->m_maxioffset);
1170 if (offset + size > mp->m_maxioffset)
1171 size = mp->m_maxioffset - offset;
1172 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1173 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1322 1174
1323 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1175 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1324 &new); 1176 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1325 if (error) 1177 if (error)
1326 return -error; 1178 goto out_unlock;
1327 if (nimap == 0) 1179
1328 return 0; 1180 if (create &&
1181 (!nimaps ||
1182 (imap.br_startblock == HOLESTARTBLOCK ||
1183 imap.br_startblock == DELAYSTARTBLOCK))) {
1184 if (direct) {
1185 error = xfs_iomap_write_direct(ip, offset, size,
1186 &imap, nimaps);
1187 } else {
1188 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1189 }
1190 if (error)
1191 goto out_unlock;
1192
1193 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1194 } else if (nimaps) {
1195 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1196 } else {
1197 trace_xfs_get_blocks_notfound(ip, offset, size);
1198 goto out_unlock;
1199 }
1200 xfs_iunlock(ip, lockmode);
1329 1201
1330 if (imap.br_startblock != HOLESTARTBLOCK && 1202 if (imap.br_startblock != HOLESTARTBLOCK &&
1331 imap.br_startblock != DELAYSTARTBLOCK) { 1203 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1392,6 +1264,10 @@ __xfs_get_blocks(
1392 } 1264 }
1393 1265
1394 return 0; 1266 return 0;
1267
1268out_unlock:
1269 xfs_iunlock(ip, lockmode);
1270 return -error;
1395} 1271}
1396 1272
1397int 1273int
@@ -1479,7 +1355,7 @@ xfs_vm_direct_IO(
1479 ssize_t ret; 1355 ssize_t ret;
1480 1356
1481 if (rw & WRITE) { 1357 if (rw & WRITE) {
1482 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1358 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1483 1359
1484 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1360 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1485 offset, nr_segs, 1361 offset, nr_segs,
@@ -1505,11 +1381,42 @@ xfs_vm_write_failed(
 	struct inode	*inode = mapping->host;
 
 	if (to > inode->i_size) {
-		struct iattr	ia = {
-			.ia_valid	= ATTR_SIZE | ATTR_FORCE,
-			.ia_size	= inode->i_size,
-		};
-		xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
+		/*
+		 * punch out the delalloc blocks we have already allocated. We
+		 * don't call xfs_setattr() to do this as we may be in the
+		 * middle of a multi-iovec write and so the vfs inode->i_size
+		 * will not match the xfs ip->i_size and so it will zero too
+		 * much. Hence we just truncate the page cache to zero what is
+		 * necessary and punch the delalloc blocks directly.
+		 */
+		struct xfs_inode	*ip = XFS_I(inode);
+		xfs_fileoff_t		start_fsb;
+		xfs_fileoff_t		end_fsb;
+		int			error;
+
+		truncate_pagecache(inode, to, inode->i_size);
+
+		/*
+		 * Check if there are any blocks that are outside of i_size
+		 * that need to be trimmed back.
+		 */
+		start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+		end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+		if (end_fsb <= start_fsb)
+			return;
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+						      end_fsb - start_fsb);
+		if (error) {
+			/* something screwed, just bail */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+			"xfs_vm_write_failed: unable to clean up ino %lld",
+						ip->i_ino);
+			}
+		}
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 }
 
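The start_fsb/end_fsb arithmetic above rounds byte offsets up to whole filesystem blocks; a worked example, assuming a 4096-byte block size (values illustrative only):

	/*
	 * Assuming 4096-byte blocks:
	 *
	 *	inode->i_size = 10000  =>  start_fsb = XFS_B_TO_FSB(10000) + 1 = 4
	 *	to            = 20000  =>  end_fsb   = XFS_B_TO_FSB(20000)     = 5
	 *
	 * so xfs_bmap_punch_delalloc_range() punches one block (fsb 4), the
	 * only delalloc block wholly beyond the in-memory file size.
	 */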
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
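The table above is laid out in the { value, "name" } pair form that the tracing code's __print_symbolic() consumes; a hypothetical tracepoint fragment (a sketch, not part of the patch) would decode an ioend type as:

	TP_printk("ioend type %s",
		  __print_symbolic(__entry->type, XFS_IO_TYPES))

Note the first entry maps IO_DIRECT (0) to an empty string, so direct I/O ioends print blank rather than a type name.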
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
168} 163}
169 164
170/* 165/*
171 * Internal xfs_buf_t object manipulation 166 * xfs_buf_lru_add - add a buffer to the LRU.
167 *
168 * The LRU takes a new reference to the buffer so that it will only be freed
169 * once the shrinker takes the buffer off the LRU.
172 */ 170 */
171STATIC void
172xfs_buf_lru_add(
173 struct xfs_buf *bp)
174{
175 struct xfs_buftarg *btp = bp->b_target;
176
177 spin_lock(&btp->bt_lru_lock);
178 if (list_empty(&bp->b_lru)) {
179 atomic_inc(&bp->b_hold);
180 list_add_tail(&bp->b_lru, &btp->bt_lru);
181 btp->bt_lru_nr++;
182 }
183 spin_unlock(&btp->bt_lru_lock);
184}
185
186/*
187 * xfs_buf_lru_del - remove a buffer from the LRU
188 *
189 * The unlocked check is safe here because it only occurs when there are no
190 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
191 * to optimise the shrinker removing the buffer from the LRU and calling
192 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock.
194 */
195STATIC void
196xfs_buf_lru_del(
197 struct xfs_buf *bp)
198{
199 struct xfs_buftarg *btp = bp->b_target;
200
201 if (list_empty(&bp->b_lru))
202 return;
203
204 spin_lock(&btp->bt_lru_lock);
205 if (!list_empty(&bp->b_lru)) {
206 list_del_init(&bp->b_lru);
207 btp->bt_lru_nr--;
208 }
209 spin_unlock(&btp->bt_lru_lock);
210}
211
212/*
213 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
214 * b_lru_ref count so that the buffer is freed immediately when the buffer
215 * reference count falls to zero. If the buffer is already on the LRU, we need
216 * to remove the reference that LRU holds on the buffer.
217 *
218 * This prevents build-up of stale buffers on the LRU.
219 */
220void
221xfs_buf_stale(
222 struct xfs_buf *bp)
223{
224 bp->b_flags |= XBF_STALE;
225 atomic_set(&(bp)->b_lru_ref, 0);
226 if (!list_empty(&bp->b_lru)) {
227 struct xfs_buftarg *btp = bp->b_target;
228
229 spin_lock(&btp->bt_lru_lock);
230 if (!list_empty(&bp->b_lru)) {
231 list_del_init(&bp->b_lru);
232 btp->bt_lru_nr--;
233 atomic_dec(&bp->b_hold);
234 }
235 spin_unlock(&btp->bt_lru_lock);
236 }
237 ASSERT(atomic_read(&bp->b_hold) >= 1);
238}
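Taken with xfs_buf_lru_add() above and the shrinker added later in this patch, b_lru_ref acts as a small generation count. The lifetime below is a hedged sketch inferred from the comments; exact pass counts depend on shrinker pressure:

	/*
	 * Sketch: expected b_lru_ref decay for a cached buffer.
	 *
	 *	_xfs_buf_initialize()	b_lru_ref = 1
	 *	xfs_buf_set_ref(bp, 2)	b_lru_ref = 2	(marked hot by a lookup)
	 *	shrinker pass		b_lru_ref = 1	(rotated to LRU tail)
	 *	shrinker pass		b_lru_ref = 0	(rotated once more)
	 *	shrinker pass		reclaimed via xfs_buf_rele()
	 *	xfs_buf_stale()		short-circuits all of the above to 0
	 */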
173 239
174STATIC void 240STATIC void
175_xfs_buf_initialize( 241_xfs_buf_initialize(
@@ -186,10 +252,12 @@ _xfs_buf_initialize(
186 252
187 memset(bp, 0, sizeof(xfs_buf_t)); 253 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 254 atomic_set(&bp->b_hold, 1);
255 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 258 INIT_LIST_HEAD(&bp->b_list);
191 INIT_LIST_HEAD(&bp->b_hash_list); 259 RB_CLEAR_NODE(&bp->b_rbnode);
192 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 260 sema_init(&bp->b_sema, 0); /* held, no waiters */
193 XB_SET_OWNER(bp); 261 XB_SET_OWNER(bp);
194 bp->b_target = target; 262 bp->b_target = target;
195 bp->b_file_offset = range_base; 263 bp->b_file_offset = range_base;
@@ -262,7 +330,7 @@ xfs_buf_free(
262{ 330{
263 trace_xfs_buf_free(bp, _RET_IP_); 331 trace_xfs_buf_free(bp, _RET_IP_);
264 332
265 ASSERT(list_empty(&bp->b_hash_list)); 333 ASSERT(list_empty(&bp->b_lru));
266 334
267 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
268 uint i; 336 uint i;
@@ -339,7 +407,6 @@ _xfs_buf_lookup_pages(
339 __func__, gfp_mask); 407 __func__, gfp_mask);
340 408
341 XFS_STATS_INC(xb_page_retries); 409 XFS_STATS_INC(xb_page_retries);
342 xfsbufd_wakeup(NULL, 0, gfp_mask);
343 congestion_wait(BLK_RW_ASYNC, HZ/50); 410 congestion_wait(BLK_RW_ASYNC, HZ/50);
344 goto retry; 411 goto retry;
345 } 412 }
@@ -422,8 +489,10 @@ _xfs_buf_find(
 {
 	xfs_off_t		range_base;
 	size_t			range_length;
-	xfs_bufhash_t		*hash;
-	xfs_buf_t		*bp, *n;
+	struct xfs_perag	*pag;
+	struct rb_node		**rbp;
+	struct rb_node		*parent;
+	xfs_buf_t		*bp;
 
 	range_base = (ioff << BBSHIFT);
 	range_length = (isize << BBSHIFT);
@@ -432,14 +501,37 @@ _xfs_buf_find(
 	ASSERT(!(range_length < (1 << btp->bt_sshift)));
 	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-	spin_lock(&hash->bh_lock);
-
-	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-		ASSERT(btp == bp->b_target);
-		if (bp->b_file_offset == range_base &&
-		    bp->b_buffer_length == range_length) {
+	/* get tree root */
+	pag = xfs_perag_get(btp->bt_mount,
+				xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+	/* walk tree */
+	spin_lock(&pag->pag_buf_lock);
+	rbp = &pag->pag_buf_tree.rb_node;
+	parent = NULL;
+	bp = NULL;
+	while (*rbp) {
+		parent = *rbp;
+		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+		if (range_base < bp->b_file_offset)
+			rbp = &(*rbp)->rb_left;
+		else if (range_base > bp->b_file_offset)
+			rbp = &(*rbp)->rb_right;
+		else {
+			/*
+			 * found a block offset match. If the range doesn't
+			 * match, the only way this is allowed is if the buffer
+			 * in the cache is stale and the transaction that made
+			 * it stale has not yet committed. i.e. we are
+			 * reallocating a busy extent. Skip this buffer and
+			 * continue searching to the right for an exact match.
+			 */
+			if (bp->b_buffer_length != range_length) {
+				ASSERT(bp->b_flags & XBF_STALE);
+				rbp = &(*rbp)->rb_right;
+				continue;
+			}
 			atomic_inc(&bp->b_hold);
 			goto found;
 		}
@@ -449,41 +541,32 @@ _xfs_buf_find(
 	if (new_bp) {
 		_xfs_buf_initialize(new_bp, btp, range_base,
 					range_length, flags);
-		new_bp->b_hash = hash;
-		list_add(&new_bp->b_hash_list, &hash->bh_list);
+		rb_link_node(&new_bp->b_rbnode, parent, rbp);
+		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+		/* the buffer keeps the perag reference until it is freed */
+		new_bp->b_pag = pag;
+		spin_unlock(&pag->pag_buf_lock);
 	} else {
 		XFS_STATS_INC(xb_miss_locked);
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
 	}
-
-	spin_unlock(&hash->bh_lock);
 	return new_bp;
 
 found:
-	spin_unlock(&hash->bh_lock);
+	spin_unlock(&pag->pag_buf_lock);
+	xfs_perag_put(pag);
 
-	/* Attempt to get the semaphore without sleeping,
-	 * if this does not work then we need to drop the
-	 * spinlock and do a hard attempt on the semaphore.
-	 */
-	if (down_trylock(&bp->b_sema)) {
+	if (xfs_buf_cond_lock(bp)) {
+		/* failed, so wait for the lock if requested. */
 		if (!(flags & XBF_TRYLOCK)) {
-			/* wait for buffer ownership */
 			xfs_buf_lock(bp);
 			XFS_STATS_INC(xb_get_locked_waited);
 		} else {
-			/* We asked for a trylock and failed, no need
-			 * to look at file offset and length here, we
-			 * know that this buffer at least overlaps our
-			 * buffer and is locked, therefore our buffer
-			 * either does not exist, or is this buffer.
-			 */
 			xfs_buf_rele(bp);
 			XFS_STATS_INC(xb_busy_locked);
 			return NULL;
 		}
-	} else {
-		/* trylock worked */
-		XB_SET_OWNER(bp);
 	}
 
 	if (bp->b_flags & XBF_STALE) {
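Apart from the stale-buffer skip, the walk above is the standard <linux/rbtree.h> insert-or-find idiom; stripped of the XFS specifics it looks like this generic sketch (hypothetical struct item, not part of the patch):

	/*
	 * Generic rbtree insert-or-find, assuming a hypothetical
	 * 'struct item { struct rb_node node; u64 key; }'.
	 */
	static struct item *
	item_find_or_insert(struct rb_root *root, struct item *new)
	{
		struct rb_node	**link = &root->rb_node;
		struct rb_node	*parent = NULL;

		while (*link) {
			struct item *it = rb_entry(*link, struct item, node);

			parent = *link;
			if (new->key < it->key)
				link = &(*link)->rb_left;
			else if (new->key > it->key)
				link = &(*link)->rb_right;
			else
				return it;		/* found existing entry */
		}

		rb_link_node(&new->node, parent, link);	/* not found: link it in */
		rb_insert_color(&new->node, root);	/* rebalance/recolour */
		return new;
	}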
@@ -625,8 +708,7 @@ void
625xfs_buf_readahead( 708xfs_buf_readahead(
626 xfs_buftarg_t *target, 709 xfs_buftarg_t *target,
627 xfs_off_t ioff, 710 xfs_off_t ioff,
628 size_t isize, 711 size_t isize)
629 xfs_buf_flags_t flags)
630{ 712{
631 struct backing_dev_info *bdi; 713 struct backing_dev_info *bdi;
632 714
@@ -634,8 +716,42 @@ xfs_buf_readahead(
634 if (bdi_read_congested(bdi)) 716 if (bdi_read_congested(bdi))
635 return; 717 return;
636 718
637 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 719 xfs_buf_read(target, ioff, isize,
638 xfs_buf_read(target, ioff, isize, flags); 720 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
721}
722
723/*
724 * Read an uncached buffer from disk. Allocates and returns a locked
725 * buffer containing the disk contents or nothing.
726 */
727struct xfs_buf *
728xfs_buf_read_uncached(
729 struct xfs_mount *mp,
730 struct xfs_buftarg *target,
731 xfs_daddr_t daddr,
732 size_t length,
733 int flags)
734{
735 xfs_buf_t *bp;
736 int error;
737
738 bp = xfs_buf_get_uncached(target, length, flags);
739 if (!bp)
740 return NULL;
741
742 /* set up the buffer for a read IO */
743 xfs_buf_lock(bp);
744 XFS_BUF_SET_ADDR(bp, daddr);
745 XFS_BUF_READ(bp);
746 XFS_BUF_BUSY(bp);
747
748 xfsbdstrat(mp, bp);
749 error = xfs_buf_iowait(bp);
750 if (error || bp->b_error) {
751 xfs_buf_relse(bp);
752 return NULL;
753 }
754 return bp;
639} 755}
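A hypothetical caller would use the new helper as in this sketch (daddr and len are illustrative; XFS uses positive errnos internally):

	/* Sketch: read one uncached metadata block and release it. */
	struct xfs_buf	*bp;

	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, daddr, len, 0);
	if (!bp)
		return EIO;	/* I/O error or allocation failure */
	/* ... examine bp->b_addr ... */
	xfs_buf_relse(bp);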
640 756
641xfs_buf_t * 757xfs_buf_t *
@@ -707,9 +823,10 @@ xfs_buf_associate_memory(
707} 823}
708 824
709xfs_buf_t * 825xfs_buf_t *
710xfs_buf_get_noaddr( 826xfs_buf_get_uncached(
827 struct xfs_buftarg *target,
711 size_t len, 828 size_t len,
712 xfs_buftarg_t *target) 829 int flags)
713{ 830{
714 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 831 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
715 int error, i; 832 int error, i;
@@ -725,7 +842,7 @@ xfs_buf_get_noaddr(
725 goto fail_free_buf; 842 goto fail_free_buf;
726 843
727 for (i = 0; i < page_count; i++) { 844 for (i = 0; i < page_count; i++) {
728 bp->b_pages[i] = alloc_page(GFP_KERNEL); 845 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
729 if (!bp->b_pages[i]) 846 if (!bp->b_pages[i])
730 goto fail_free_mem; 847 goto fail_free_mem;
731 } 848 }
@@ -740,7 +857,7 @@ xfs_buf_get_noaddr(
740 857
741 xfs_buf_unlock(bp); 858 xfs_buf_unlock(bp);
742 859
743 trace_xfs_buf_get_noaddr(bp, _RET_IP_); 860 trace_xfs_buf_get_uncached(bp, _RET_IP_);
744 return bp; 861 return bp;
745 862
746 fail_free_mem: 863 fail_free_mem:
@@ -774,29 +891,32 @@ void
 xfs_buf_rele(
 	xfs_buf_t		*bp)
 {
-	xfs_bufhash_t		*hash = bp->b_hash;
+	struct xfs_perag	*pag = bp->b_pag;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	if (unlikely(!hash)) {
-		ASSERT(!bp->b_relse);
+	if (!pag) {
+		ASSERT(list_empty(&bp->b_lru));
+		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
 			xfs_buf_free(bp);
 		return;
 	}
 
+	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
+
 	ASSERT(atomic_read(&bp->b_hold) > 0);
-	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
-		if (bp->b_relse) {
-			atomic_inc(&bp->b_hold);
-			spin_unlock(&hash->bh_lock);
-			(*(bp->b_relse)) (bp);
-		} else if (bp->b_flags & XBF_FS_MANAGED) {
-			spin_unlock(&hash->bh_lock);
+	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
+		if (!(bp->b_flags & XBF_STALE) &&
+		    atomic_read(&bp->b_lru_ref)) {
+			xfs_buf_lru_add(bp);
+			spin_unlock(&pag->pag_buf_lock);
 		} else {
+			xfs_buf_lru_del(bp);
 			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-			list_del_init(&bp->b_hash_list);
-			spin_unlock(&hash->bh_lock);
+			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+			spin_unlock(&pag->pag_buf_lock);
+			xfs_perag_put(pag);
 			xfs_buf_free(bp);
 		}
 	}
@@ -814,10 +934,18 @@ xfs_buf_rele(
814 */ 934 */
815 935
816/* 936/*
817 * Locks a buffer object, if it is not already locked. 937 * Locks a buffer object, if it is not already locked. Note that this in
818 * Note that this in no way locks the underlying pages, so it is only 938 * no way locks the underlying pages, so it is only useful for
819 * useful for synchronizing concurrent use of buffer objects, not for 939 * synchronizing concurrent use of buffer objects, not for synchronizing
820 * synchronizing independent access to the underlying pages. 940 * independent access to the underlying pages.
941 *
942 * If we come across a stale, pinned, locked buffer, we know that we are
943 * being asked to lock a buffer that has been reallocated. Because it is
944 * pinned, we know that the log has not been pushed to disk and hence it
945 * will still be locked. Rather than continuing to have trylock attempts
946 * fail until someone else pushes the log, push it ourselves before
947 * returning. This means that the xfsaild will not get stuck trying
948 * to push on stale inode buffers.
821 */ 949 */
822int 950int
823xfs_buf_cond_lock( 951xfs_buf_cond_lock(
@@ -828,6 +956,8 @@ xfs_buf_cond_lock(
828 locked = down_trylock(&bp->b_sema) == 0; 956 locked = down_trylock(&bp->b_sema) == 0;
829 if (locked) 957 if (locked)
830 XB_SET_OWNER(bp); 958 XB_SET_OWNER(bp);
959 else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
960 xfs_log_force(bp->b_target->bt_mount, 0);
831 961
832 trace_xfs_buf_cond_lock(bp, _RET_IP_); 962 trace_xfs_buf_cond_lock(bp, _RET_IP_);
833 return locked ? 0 : -EBUSY; 963 return locked ? 0 : -EBUSY;
@@ -859,7 +989,7 @@ xfs_buf_lock(
859 trace_xfs_buf_lock(bp, _RET_IP_); 989 trace_xfs_buf_lock(bp, _RET_IP_);
860 990
861 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 991 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
862 xfs_log_force(bp->b_mount, 0); 992 xfs_log_force(bp->b_target->bt_mount, 0);
863 if (atomic_read(&bp->b_io_remaining)) 993 if (atomic_read(&bp->b_io_remaining))
864 blk_run_address_space(bp->b_target->bt_mapping); 994 blk_run_address_space(bp->b_target->bt_mapping);
865 down(&bp->b_sema); 995 down(&bp->b_sema);
@@ -924,19 +1054,7 @@ xfs_buf_iodone_work(
924 xfs_buf_t *bp = 1054 xfs_buf_t *bp =
925 container_of(work, xfs_buf_t, b_iodone_work); 1055 container_of(work, xfs_buf_t, b_iodone_work);
926 1056
927 /* 1057 if (bp->b_iodone)
928 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
929 * ordered flag and reissue them. Because we can't tell the higher
930 * layers directly that they should not issue ordered I/O anymore, they
931 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
932 */
933 if ((bp->b_error == EOPNOTSUPP) &&
934 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
935 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
936 bp->b_flags &= ~XBF_ORDERED;
937 bp->b_flags |= _XFS_BARRIER_FAILED;
938 xfs_buf_iorequest(bp);
939 } else if (bp->b_iodone)
940 (*(bp->b_iodone))(bp); 1058 (*(bp->b_iodone))(bp);
941 else if (bp->b_flags & XBF_ASYNC) 1059 else if (bp->b_flags & XBF_ASYNC)
942 xfs_buf_relse(bp); 1060 xfs_buf_relse(bp);
@@ -982,7 +1100,6 @@ xfs_bwrite(
982{ 1100{
983 int error; 1101 int error;
984 1102
985 bp->b_mount = mp;
986 bp->b_flags |= XBF_WRITE; 1103 bp->b_flags |= XBF_WRITE;
987 bp->b_flags &= ~(XBF_ASYNC | XBF_READ); 1104 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
988 1105
@@ -1003,8 +1120,6 @@ xfs_bdwrite(
1003{ 1120{
1004 trace_xfs_buf_bdwrite(bp, _RET_IP_); 1121 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1005 1122
1006 bp->b_mount = mp;
1007
1008 bp->b_flags &= ~XBF_READ; 1123 bp->b_flags &= ~XBF_READ;
1009 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); 1124 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1010 1125
@@ -1013,7 +1128,7 @@ xfs_bdwrite(
1013 1128
1014/* 1129/*
1015 * Called when we want to stop a buffer from getting written or read. 1130 * Called when we want to stop a buffer from getting written or read.
1016 * We attach the EIO error, muck with its flags, and call biodone 1131 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1017 * so that the proper iodone callbacks get called. 1132 * so that the proper iodone callbacks get called.
1018 */ 1133 */
1019STATIC int 1134STATIC int
@@ -1030,21 +1145,21 @@ xfs_bioerror(
1030 XFS_BUF_ERROR(bp, EIO); 1145 XFS_BUF_ERROR(bp, EIO);
1031 1146
1032 /* 1147 /*
1033 * We're calling biodone, so delete XBF_DONE flag. 1148 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1034 */ 1149 */
1035 XFS_BUF_UNREAD(bp); 1150 XFS_BUF_UNREAD(bp);
1036 XFS_BUF_UNDELAYWRITE(bp); 1151 XFS_BUF_UNDELAYWRITE(bp);
1037 XFS_BUF_UNDONE(bp); 1152 XFS_BUF_UNDONE(bp);
1038 XFS_BUF_STALE(bp); 1153 XFS_BUF_STALE(bp);
1039 1154
1040 xfs_biodone(bp); 1155 xfs_buf_ioend(bp, 0);
1041 1156
1042 return EIO; 1157 return EIO;
1043} 1158}
1044 1159
1045/* 1160/*
1046 * Same as xfs_bioerror, except that we are releasing the buffer 1161 * Same as xfs_bioerror, except that we are releasing the buffer
1047 * here ourselves, and avoiding the biodone call. 1162 * here ourselves, and avoiding the xfs_buf_ioend call.
1048 * This is meant for userdata errors; metadata bufs come with 1163 * This is meant for userdata errors; metadata bufs come with
1049 * iodone functions attached, so that we can track down errors. 1164 * iodone functions attached, so that we can track down errors.
1050 */ 1165 */
@@ -1093,7 +1208,7 @@ int
1093xfs_bdstrat_cb( 1208xfs_bdstrat_cb(
1094 struct xfs_buf *bp) 1209 struct xfs_buf *bp)
1095{ 1210{
1096 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { 1211 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1097 trace_xfs_bdstrat_shut(bp, _RET_IP_); 1212 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1098 /* 1213 /*
1099 * Metadata write that didn't get logged but 1214 * Metadata write that didn't get logged but
@@ -1195,7 +1310,7 @@ _xfs_buf_ioapply(
1195 1310
1196 if (bp->b_flags & XBF_ORDERED) { 1311 if (bp->b_flags & XBF_ORDERED) {
1197 ASSERT(!(bp->b_flags & XBF_READ)); 1312 ASSERT(!(bp->b_flags & XBF_READ));
1198 rw = WRITE_BARRIER; 1313 rw = WRITE_FLUSH_FUA;
1199 } else if (bp->b_flags & XBF_LOG_BUFFER) { 1314 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1200 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1315 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1201 bp->b_flags &= ~_XBF_RUN_QUEUES; 1316 bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1394,89 +1509,84 @@ xfs_buf_iomove(
  */
 
 /*
- * Wait for any bufs with callbacks that have been submitted but
- * have not yet returned... walk the hash list for the target.
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
  */
 void
 xfs_wait_buftarg(
-	xfs_buftarg_t	*btp)
+	struct xfs_buftarg	*btp)
 {
-	xfs_buf_t	*bp, *n;
-	xfs_bufhash_t	*hash;
-	uint		i;
+	struct xfs_buf		*bp;
 
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		hash = &btp->bt_hash[i];
-again:
-		spin_lock(&hash->bh_lock);
-		list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-			ASSERT(btp == bp->b_target);
-			if (!(bp->b_flags & XBF_FS_MANAGED)) {
-				spin_unlock(&hash->bh_lock);
-				/*
-				 * Catch superblock reference count leaks
-				 * immediately
-				 */
-				BUG_ON(bp->b_bn == 0);
-				delay(100);
-				goto again;
-			}
+restart:
+	spin_lock(&btp->bt_lru_lock);
+	while (!list_empty(&btp->bt_lru)) {
+		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+		if (atomic_read(&bp->b_hold) > 1) {
+			spin_unlock(&btp->bt_lru_lock);
+			delay(100);
+			goto restart;
 		}
-		spin_unlock(&hash->bh_lock);
+		/*
+		 * clear the LRU reference count so the buffer doesn't get
+		 * ignored in xfs_buf_rele().
+		 */
+		atomic_set(&bp->b_lru_ref, 0);
+		spin_unlock(&btp->bt_lru_lock);
+		xfs_buf_rele(bp);
+		spin_lock(&btp->bt_lru_lock);
 	}
+	spin_unlock(&btp->bt_lru_lock);
 }
 
-/*
- * Allocate buffer hash table for a given target.
- * For devices containing metadata (i.e. not the log/realtime devices)
- * we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-	xfs_buftarg_t		*btp,
-	int			external)
+int
+xfs_buftarg_shrink(
+	struct shrinker		*shrink,
+	int			nr_to_scan,
+	gfp_t			mask)
 {
-	unsigned int		i;
+	struct xfs_buftarg	*btp = container_of(shrink,
+					struct xfs_buftarg, bt_shrinker);
+	struct xfs_buf		*bp;
+	LIST_HEAD(dispose);
 
-	btp->bt_hashshift = external ? 3 : 12;	/* 8 or 4096 buckets */
-	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-					 sizeof(xfs_bufhash_t));
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		spin_lock_init(&btp->bt_hash[i].bh_lock);
-		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
-	}
-}
+	if (!nr_to_scan)
+		return btp->bt_lru_nr;
 
-STATIC void
-xfs_free_bufhash(
-	xfs_buftarg_t		*btp)
-{
-	kmem_free_large(btp->bt_hash);
-	btp->bt_hash = NULL;
-}
+	spin_lock(&btp->bt_lru_lock);
+	while (!list_empty(&btp->bt_lru)) {
+		if (nr_to_scan-- <= 0)
+			break;
 
-/*
- * buftarg list for delwrite queue processing
- */
-static LIST_HEAD(xfs_buftarg_list);
-static DEFINE_SPINLOCK(xfs_buftarg_lock);
+		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
 
-STATIC void
-xfs_register_buftarg(
-	xfs_buftarg_t		*btp)
-{
-	spin_lock(&xfs_buftarg_lock);
-	list_add(&btp->bt_list, &xfs_buftarg_list);
-	spin_unlock(&xfs_buftarg_lock);
-}
+		/*
+		 * Decrement the b_lru_ref count unless the value is already
+		 * zero. If the value is already zero, we need to reclaim the
+		 * buffer, otherwise it gets another trip through the LRU.
+		 */
+		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+			list_move_tail(&bp->b_lru, &btp->bt_lru);
+			continue;
+		}
 
-STATIC void
-xfs_unregister_buftarg(
-	xfs_buftarg_t		*btp)
-{
-	spin_lock(&xfs_buftarg_lock);
-	list_del(&btp->bt_list);
-	spin_unlock(&xfs_buftarg_lock);
+		/*
+		 * remove the buffer from the LRU now to avoid needing another
+		 * lock round trip inside xfs_buf_rele().
+		 */
+		list_move(&bp->b_lru, &dispose);
+		btp->bt_lru_nr--;
+	}
+	spin_unlock(&btp->bt_lru_lock);
+
+	while (!list_empty(&dispose)) {
+		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+		list_del_init(&bp->b_lru);
+		xfs_buf_rele(bp);
+	}
+
+	return btp->bt_lru_nr;
 }
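This follows the shrinker contract of this kernel generation: nr_to_scan == 0 is a population query, a non-zero value asks the callback to age that many objects, and the return value is the remaining population. It is wired up per buftarg as in this sketch, mirroring the registration in xfs_alloc_buftarg() later in the patch:

	/* Per-buftarg shrinker registration (as done in xfs_alloc_buftarg). */
	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&btp->bt_shrinker);
	/* ... paired with unregister_shrinker() in xfs_free_buftarg(). */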
1481 1591
1482void 1592void
@@ -1484,18 +1594,14 @@ xfs_free_buftarg(
1484 struct xfs_mount *mp, 1594 struct xfs_mount *mp,
1485 struct xfs_buftarg *btp) 1595 struct xfs_buftarg *btp)
1486{ 1596{
1597 unregister_shrinker(&btp->bt_shrinker);
1598
1487 xfs_flush_buftarg(btp, 1); 1599 xfs_flush_buftarg(btp, 1);
1488 if (mp->m_flags & XFS_MOUNT_BARRIER) 1600 if (mp->m_flags & XFS_MOUNT_BARRIER)
1489 xfs_blkdev_issue_flush(btp); 1601 xfs_blkdev_issue_flush(btp);
1490 xfs_free_bufhash(btp);
1491 iput(btp->bt_mapping->host); 1602 iput(btp->bt_mapping->host);
1492 1603
1493 /* Unregister the buftarg first so that we don't get a
1494 * wakeup finding a non-existent task
1495 */
1496 xfs_unregister_buftarg(btp);
1497 kthread_stop(btp->bt_task); 1604 kthread_stop(btp->bt_task);
1498
1499 kmem_free(btp); 1605 kmem_free(btp);
1500} 1606}
1501 1607
@@ -1572,6 +1678,7 @@ xfs_mapping_buftarg(
1572 XFS_BUFTARG_NAME(btp)); 1678 XFS_BUFTARG_NAME(btp));
1573 return ENOMEM; 1679 return ENOMEM;
1574 } 1680 }
1681 inode->i_ino = get_next_ino();
1575 inode->i_mode = S_IFBLK; 1682 inode->i_mode = S_IFBLK;
1576 inode->i_bdev = bdev; 1683 inode->i_bdev = bdev;
1577 inode->i_rdev = bdev->bd_dev; 1684 inode->i_rdev = bdev->bd_dev;
@@ -1591,24 +1698,18 @@ xfs_alloc_delwrite_queue(
1591 xfs_buftarg_t *btp, 1698 xfs_buftarg_t *btp,
1592 const char *fsname) 1699 const char *fsname)
1593{ 1700{
1594 int error = 0;
1595
1596 INIT_LIST_HEAD(&btp->bt_list);
1597 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1701 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1598 spin_lock_init(&btp->bt_delwrite_lock); 1702 spin_lock_init(&btp->bt_delwrite_lock);
1599 btp->bt_flags = 0; 1703 btp->bt_flags = 0;
1600 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1704 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1601 if (IS_ERR(btp->bt_task)) { 1705 if (IS_ERR(btp->bt_task))
1602 error = PTR_ERR(btp->bt_task); 1706 return PTR_ERR(btp->bt_task);
1603 goto out_error; 1707 return 0;
1604 }
1605 xfs_register_buftarg(btp);
1606out_error:
1607 return error;
1608} 1708}
1609 1709
1610xfs_buftarg_t * 1710xfs_buftarg_t *
1611xfs_alloc_buftarg( 1711xfs_alloc_buftarg(
1712 struct xfs_mount *mp,
1612 struct block_device *bdev, 1713 struct block_device *bdev,
1613 int external, 1714 int external,
1614 const char *fsname) 1715 const char *fsname)
@@ -1617,15 +1718,20 @@ xfs_alloc_buftarg(
1617 1718
1618 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1719 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1619 1720
1721 btp->bt_mount = mp;
1620 btp->bt_dev = bdev->bd_dev; 1722 btp->bt_dev = bdev->bd_dev;
1621 btp->bt_bdev = bdev; 1723 btp->bt_bdev = bdev;
1724 INIT_LIST_HEAD(&btp->bt_lru);
1725 spin_lock_init(&btp->bt_lru_lock);
1622 if (xfs_setsize_buftarg_early(btp, bdev)) 1726 if (xfs_setsize_buftarg_early(btp, bdev))
1623 goto error; 1727 goto error;
1624 if (xfs_mapping_buftarg(btp, bdev)) 1728 if (xfs_mapping_buftarg(btp, bdev))
1625 goto error; 1729 goto error;
1626 if (xfs_alloc_delwrite_queue(btp, fsname)) 1730 if (xfs_alloc_delwrite_queue(btp, fsname))
1627 goto error; 1731 goto error;
1628 xfs_alloc_bufhash(btp, external); 1732 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1733 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1734 register_shrinker(&btp->bt_shrinker);
1629 return btp; 1735 return btp;
1630 1736
1631error: 1737error:
@@ -1730,27 +1836,6 @@ xfs_buf_runall_queues(
1730 flush_workqueue(queue); 1836 flush_workqueue(queue);
1731} 1837}
1732 1838
1733STATIC int
1734xfsbufd_wakeup(
1735 struct shrinker *shrink,
1736 int priority,
1737 gfp_t mask)
1738{
1739 xfs_buftarg_t *btp;
1740
1741 spin_lock(&xfs_buftarg_lock);
1742 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1743 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1744 continue;
1745 if (list_empty(&btp->bt_delwrite_queue))
1746 continue;
1747 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1748 wake_up_process(btp->bt_task);
1749 }
1750 spin_unlock(&xfs_buftarg_lock);
1751 return 0;
1752}
1753
1754/* 1839/*
1755 * Move as many buffers as specified to the supplied list 1840 * Move as many buffers as specified to the supplied list
1756 * idicating if we skipped any buffers to prevent deadlocks. 1841 * idicating if we skipped any buffers to prevent deadlocks.
@@ -1771,7 +1856,6 @@ xfs_buf_delwri_split(
1771 INIT_LIST_HEAD(list); 1856 INIT_LIST_HEAD(list);
1772 spin_lock(dwlk); 1857 spin_lock(dwlk);
1773 list_for_each_entry_safe(bp, n, dwq, b_list) { 1858 list_for_each_entry_safe(bp, n, dwq, b_list) {
1774 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1775 ASSERT(bp->b_flags & XBF_DELWRI); 1859 ASSERT(bp->b_flags & XBF_DELWRI);
1776 1860
1777 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { 1861 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1785,6 +1869,7 @@ xfs_buf_delwri_split(
1785 _XBF_RUN_QUEUES); 1869 _XBF_RUN_QUEUES);
1786 bp->b_flags |= XBF_WRITE; 1870 bp->b_flags |= XBF_WRITE;
1787 list_move_tail(&bp->b_list, list); 1871 list_move_tail(&bp->b_list, list);
1872 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1788 } else 1873 } else
1789 skipped++; 1874 skipped++;
1790 } 1875 }
@@ -1916,7 +2001,7 @@ xfs_flush_buftarg(
1916 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2001 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1917 2002
1918 list_del_init(&bp->b_list); 2003 list_del_init(&bp->b_list);
1919 xfs_iowait(bp); 2004 xfs_buf_iowait(bp);
1920 xfs_buf_relse(bp); 2005 xfs_buf_relse(bp);
1921 } 2006 }
1922 } 2007 }
@@ -1933,7 +2018,7 @@ xfs_buf_init(void)
1933 goto out; 2018 goto out;
1934 2019
1935 xfslogd_workqueue = alloc_workqueue("xfslogd", 2020 xfslogd_workqueue = alloc_workqueue("xfslogd",
1936 WQ_RESCUER | WQ_HIGHPRI, 1); 2021 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1937 if (!xfslogd_workqueue) 2022 if (!xfslogd_workqueue)
1938 goto out_free_buf_zone; 2023 goto out_free_buf_zone;
1939 2024
@@ -1945,7 +2030,6 @@ xfs_buf_init(void)
1945 if (!xfsconvertd_workqueue) 2030 if (!xfsconvertd_workqueue)
1946 goto out_destroy_xfsdatad_workqueue; 2031 goto out_destroy_xfsdatad_workqueue;
1947 2032
1948 register_shrinker(&xfs_buf_shake);
1949 return 0; 2033 return 0;
1950 2034
1951 out_destroy_xfsdatad_workqueue: 2035 out_destroy_xfsdatad_workqueue:
@@ -1961,7 +2045,6 @@ xfs_buf_init(void)
1961void 2045void
1962xfs_buf_terminate(void) 2046xfs_buf_terminate(void)
1963{ 2047{
1964 unregister_shrinker(&xfs_buf_shake);
1965 destroy_workqueue(xfsconvertd_workqueue); 2048 destroy_workqueue(xfsconvertd_workqueue);
1966 destroy_workqueue(xfsdatad_workqueue); 2049 destroy_workqueue(xfsdatad_workqueue);
1967 destroy_workqueue(xfslogd_workqueue); 2050 destroy_workqueue(xfslogd_workqueue);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614f0b92..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ 52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ 53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
55#define XBF_ORDERED (1 << 11)/* use ordered writes */ 54#define XBF_ORDERED (1 << 11)/* use ordered writes */
56#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ 55#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
57#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ 56#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
@@ -86,14 +85,6 @@ typedef enum {
86 */ 85 */
87#define _XBF_PAGE_LOCKED (1 << 22) 86#define _XBF_PAGE_LOCKED (1 << 22)
88 87
89/*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95#define _XFS_BARRIER_FAILED (1 << 23)
96
97typedef unsigned int xfs_buf_flags_t; 88typedef unsigned int xfs_buf_flags_t;
98 89
99#define XFS_BUF_FLAGS \ 90#define XFS_BUF_FLAGS \
@@ -104,7 +95,6 @@ typedef unsigned int xfs_buf_flags_t;
104 { XBF_DONE, "DONE" }, \ 95 { XBF_DONE, "DONE" }, \
105 { XBF_DELWRI, "DELWRI" }, \ 96 { XBF_DELWRI, "DELWRI" }, \
106 { XBF_STALE, "STALE" }, \ 97 { XBF_STALE, "STALE" }, \
107 { XBF_FS_MANAGED, "FS_MANAGED" }, \
108 { XBF_ORDERED, "ORDERED" }, \ 98 { XBF_ORDERED, "ORDERED" }, \
109 { XBF_READ_AHEAD, "READ_AHEAD" }, \ 99 { XBF_READ_AHEAD, "READ_AHEAD" }, \
110 { XBF_LOCK, "LOCK" }, /* should never be set */\ 100 { XBF_LOCK, "LOCK" }, /* should never be set */\
@@ -114,8 +104,7 @@ typedef unsigned int xfs_buf_flags_t;
114 { _XBF_PAGES, "PAGES" }, \ 104 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ 107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119 108
120 109
121typedef enum { 110typedef enum {
@@ -132,20 +121,22 @@ typedef struct xfs_buftarg {
132 dev_t bt_dev; 121 dev_t bt_dev;
133 struct block_device *bt_bdev; 122 struct block_device *bt_bdev;
134 struct address_space *bt_mapping; 123 struct address_space *bt_mapping;
124 struct xfs_mount *bt_mount;
135 unsigned int bt_bsize; 125 unsigned int bt_bsize;
136 unsigned int bt_sshift; 126 unsigned int bt_sshift;
137 size_t bt_smask; 127 size_t bt_smask;
138 128
139 /* per device buffer hash table */
140 uint bt_hashshift;
141 xfs_bufhash_t *bt_hash;
142
143 /* per device delwri queue */ 129 /* per device delwri queue */
144 struct task_struct *bt_task; 130 struct task_struct *bt_task;
145 struct list_head bt_list;
146 struct list_head bt_delwrite_queue; 131 struct list_head bt_delwrite_queue;
147 spinlock_t bt_delwrite_lock; 132 spinlock_t bt_delwrite_lock;
148 unsigned long bt_flags; 133 unsigned long bt_flags;
134
135 /* LRU control structures */
136 struct shrinker bt_shrinker;
137 struct list_head bt_lru;
138 spinlock_t bt_lru_lock;
139 unsigned int bt_lru_nr;
149} xfs_buftarg_t; 140} xfs_buftarg_t;
150 141
151/* 142/*
@@ -161,40 +152,46 @@ typedef struct xfs_buftarg {
 
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 
 #define XB_PAGES	2
 
 typedef struct xfs_buf {
+	/*
+	 * first cacheline holds all the fields needed for an uncontended cache
+	 * hit to be fully processed. The semaphore straddles the cacheline
+	 * boundary, but the counter and lock sits on the first cacheline,
+	 * which is the only bit that is touched if we hit the semaphore
+	 * fast-path on locking.
+	 */
+	struct rb_node		b_rbnode;	/* rbtree node */
+	xfs_off_t		b_file_offset;	/* offset in file */
+	size_t			b_buffer_length;/* size of buffer in bytes */
+	atomic_t		b_hold;		/* reference count */
+	atomic_t		b_lru_ref;	/* lru reclaim ref count */
+	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
-	unsigned long		b_queuetime;	/* time buffer was queued */
-	atomic_t		b_pin_count;	/* pin count */
+
+	struct list_head	b_lru;		/* lru list */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
-	xfs_buf_flags_t		b_flags;	/* status flags */
-	struct list_head	b_hash_list;	/* hash table list */
-	xfs_bufhash_t		*b_hash;	/* hash table list start */
+	struct xfs_perag	*b_pag;		/* contains rbtree root */
 	xfs_buftarg_t		*b_target;	/* buffer target (device) */
-	atomic_t		b_hold;		/* reference count */
 	xfs_daddr_t		b_bn;		/* block number for I/O */
-	xfs_off_t		b_file_offset;	/* offset in file */
-	size_t			b_buffer_length;/* size of buffer in bytes */
 	size_t			b_count_desired;/* desired transfer size */
 	void			*b_addr;	/* virtual address of buffer */
 	struct work_struct	b_iodone_work;
-	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
-	xfs_buf_relse_t		b_relse;	/* releasing function */
 	struct completion	b_iowait;	/* queue for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
-	struct xfs_mount	*b_mount;
-	unsigned short		b_error;	/* error code on I/O */
-	unsigned int		b_page_count;	/* size of page array */
-	unsigned int		b_offset;	/* page offset in first page */
 	struct page		**b_pages;	/* array of page pointers */
 	struct page		*b_page_array[XB_PAGES]; /* inline pages */
+	unsigned long		b_queuetime;	/* time buffer was queued */
+	atomic_t		b_pin_count;	/* pin count */
+	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
+	unsigned int		b_page_count;	/* size of page array */
+	unsigned int		b_offset;	/* page offset in first page */
+	unsigned short		b_error;	/* error code on I/O */
 #ifdef XFS_BUF_LOCK_TRACKING
 	int			b_last_holder;
 #endif
@@ -213,11 +210,13 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
213 xfs_buf_flags_t); 210 xfs_buf_flags_t);
214 211
215extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 212extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
216extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 213extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
217extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 214extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
218extern void xfs_buf_hold(xfs_buf_t *); 215extern void xfs_buf_hold(xfs_buf_t *);
219extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, 216extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
220 xfs_buf_flags_t); 217struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
218 struct xfs_buftarg *target,
219 xfs_daddr_t daddr, size_t length, int flags);
221 220
222/* Releasing Buffers */ 221/* Releasing Buffers */
223extern void xfs_buf_free(xfs_buf_t *); 222extern void xfs_buf_free(xfs_buf_t *);
@@ -242,6 +241,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
242extern int xfs_buf_iowait(xfs_buf_t *); 241extern int xfs_buf_iowait(xfs_buf_t *);
243extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 242extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
244 xfs_buf_rw_t); 243 xfs_buf_rw_t);
244#define xfs_buf_zero(bp, off, len) \
245 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
245 246
246static inline int xfs_buf_geterror(xfs_buf_t *bp) 247static inline int xfs_buf_geterror(xfs_buf_t *bp)
247{ 248{
@@ -267,7 +268,8 @@ extern void xfs_buf_terminate(void);
267#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 268#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
268 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 269 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
269 270
270#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 271void xfs_buf_stale(struct xfs_buf *bp);
272#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
271#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 273#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
272#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 274#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
273#define XFS_BUF_SUPER_STALE(bp) do { \ 275#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -276,8 +278,6 @@ extern void xfs_buf_terminate(void);
276 XFS_BUF_DONE(bp); \ 278 XFS_BUF_DONE(bp); \
277 } while (0) 279 } while (0)
278 280
279#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
280
281#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 281#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
282#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 282#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
283#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 283#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
@@ -320,7 +320,6 @@ extern void xfs_buf_terminate(void);
320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
322#define XFS_BUF_SET_START(bp) do { } while (0) 322#define XFS_BUF_SET_START(bp) do { } while (0)
323#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
324 323
325#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 324#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
326#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 325#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -333,9 +332,15 @@ extern void xfs_buf_terminate(void);
333#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 332#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
334#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 333#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
335 334
336#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 335static inline void
336xfs_buf_set_ref(
337 struct xfs_buf *bp,
338 int lru_ref)
339{
340 atomic_set(&bp->b_lru_ref, lru_ref);
341}
342#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
337#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 343#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
338#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
339 344
340#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 345#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
341 346
@@ -351,30 +356,15 @@ extern void xfs_buf_terminate(void);
351 356
352static inline void xfs_buf_relse(xfs_buf_t *bp) 357static inline void xfs_buf_relse(xfs_buf_t *bp)
353{ 358{
354 if (!bp->b_relse) 359 xfs_buf_unlock(bp);
355 xfs_buf_unlock(bp);
356 xfs_buf_rele(bp); 360 xfs_buf_rele(bp);
357} 361}
358 362
359#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
360
361#define xfs_biomove(bp, off, len, data, rw) \
362 xfs_buf_iomove((bp), (off), (len), (data), \
363 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
364
365#define xfs_biozero(bp, off, len) \
366 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
367
368#define xfs_iowait(bp) xfs_buf_iowait(bp)
369
370#define xfs_baread(target, rablkno, ralen) \
371 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
372
373
374/* 363/*
375 * Handling of buftargs. 364 * Handling of buftargs.
376 */ 365 */
377extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); 366extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
367 struct block_device *, int, const char *);
378extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 368extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
379extern void xfs_wait_buftarg(xfs_buftarg_t *); 369extern void xfs_wait_buftarg(xfs_buftarg_t *);
380extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 370extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b6091..000000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CRED_H__
19#define __XFS_CRED_H__
20
21#include <linux/capability.h>
22
23/*
24 * Credentials
25 */
26typedef const struct cred cred_t;
27
28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..05201ae719e5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
99 * supposed to discard skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (copy_from_user(&range, urange, sizeof(range)))
156 return -XFS_ERROR(EFAULT);
157
158 /*
159 * Truncating down the len isn't actually quite correct, but using
160 * XFS_B_TO_FSB would mean we trivially get overflows for values
161 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
162 * used by the fstrim application. In the end it really doesn't
163 * matter as trimming blocks is an advisory interface.
164 */
165 start = XFS_B_TO_FSBT(mp, range.start);
166 len = XFS_B_TO_FSBT(mp, range.len);
167 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
168
169 start_agno = XFS_FSB_TO_AGNO(mp, start);
170 if (start_agno >= mp->m_sb.sb_agcount)
171 return -XFS_ERROR(EINVAL);
172
173 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
174 if (end_agno >= mp->m_sb.sb_agcount)
175 end_agno = mp->m_sb.sb_agcount - 1;
176
177 for (agno = start_agno; agno <= end_agno; agno++) {
178 error = -xfs_trim_extents(mp, agno, start, len, minlen,
179 &blocks_trimmed);
180 if (error)
181 last_error = error;
182 }
183
184 if (last_error)
185 return last_error;
186
187 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
188 if (copy_to_user(urange, &range, sizeof(range)))
189 return -XFS_ERROR(EFAULT);
190 return 0;
191}
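xfs_ioc_trim() backs the generic FITRIM ioctl, so userspace reaches it as in this hedged sketch (error handling trimmed for brevity):

	/* Userspace sketch: trim all free space on a mounted XFS filesystem. */
	#include <fcntl.h>
	#include <limits.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* struct fstrim_range, FITRIM */

	static int
	trim_fs(int fd)
	{
		struct fstrim_range range = {
			.start	= 0,
			.len	= ULLONG_MAX,	/* whole device, as fstrim does */
			.minlen	= 0,	/* kernel raises this to the discard granularity */
		};

		if (ioctl(fd, FITRIM, &range) < 0)
			return -1;
		/* on success, range.len holds the number of bytes trimmed */
		return 0;
	}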
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
7
8#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..fc0114da7fdd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we only allocate inodes that do not fit into 32 bits, any
78 * large enough filesystem may contain them, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..a55c1b46b219 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
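The helpers keep the i_mutex-before-ilock ordering in one place. The direct I/O read path below uses the exclusive-then-demote pattern, sketched here in isolation (illustrative, mirroring the hunk that follows):

	/* Sketch: exclusive flush, then demote for the actual I/O. */
	xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);	/* takes i_mutex, then the iolock */
	/* ... flush and invalidate the page cache ... */
	xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); /* drops i_mutex, keeps shared iolock */
	/* ... perform the read ... */
	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);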
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +297,21 @@ xfs_file_aio_read(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (unlikely(ioflags & IO_ISDIRECT))
-		mutex_lock(&inode->i_mutex);
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
 	if (unlikely(ioflags & IO_ISDIRECT)) {
+		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
 		if (inode->i_mapping->nrpages) {
 			ret = -xfs_flushinval_pages(ip,
 					(iocb->ki_pos & PAGE_CACHE_MASK),
 					-1, FI_REMAPF_LOCKED);
+			if (ret) {
+				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+				return ret;
+			}
 		}
-		mutex_unlock(&inode->i_mutex);
-		if (ret) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return ret;
-		}
-	}
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+	} else
+		xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 
 	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
 
@@ -285,7 +319,7 @@ xfs_file_aio_read(
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
 }
 
@@ -309,7 +343,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 343 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 344 return -EIO;
311 345
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 346 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 347
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 349
@@ -317,10 +351,61 @@ xfs_file_splice_read(
317 if (ret > 0) 351 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 352 XFS_STATS_ADD(xs_read_bytes, ret);
319 353
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 354 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 355 return ret;
322} 356}
323 357
358STATIC void
359xfs_aio_write_isize_update(
360 struct inode *inode,
361 loff_t *ppos,
362 ssize_t bytes_written)
363{
364 struct xfs_inode *ip = XFS_I(inode);
365 xfs_fsize_t isize = i_size_read(inode);
366
367 if (bytes_written > 0)
368 XFS_STATS_ADD(xs_write_bytes, bytes_written);
369
370 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
371 *ppos > isize))
372 *ppos = isize;
373
374 if (*ppos > ip->i_size) {
375 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
376 if (*ppos > ip->i_size)
377 ip->i_size = *ppos;
378 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
379 }
380}
381
382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occurred. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back.
387 */
388STATIC void
389xfs_aio_write_newsize_update(
390 struct xfs_inode *ip)
391{
392 if (ip->i_new_size) {
393 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
394 ip->i_new_size = 0;
395 if (ip->i_d.di_size > ip->i_size)
396 ip->i_d.di_size = ip->i_size;
397 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
398 }
399}
400
401/*
402 * xfs_file_splice_write() does not use xfs_rw_ilock() because
403 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
404 * could cause lock inversions between the aio_write path and the splice path
405 * if someone is doing concurrent splice(2) based writes and write(2) based
406 * writes to the same inode. The only real way to fix this is to re-implement
407 * the generic code here with correct locking orders.
408 */
324STATIC ssize_t 409STATIC ssize_t
325xfs_file_splice_write( 410xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 411 struct pipe_inode_info *pipe,
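
The factored-out xfs_aio_write_isize_update() grows the in-core size with a check/lock/recheck pattern. Reduced to a sketch, with a pthread mutex standing in for the ILOCK (the kernel tolerates the unlocked read of ip->i_size; portable userspace code would want an atomic):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;
static long long i_size;    /* stands in for ip->i_size */

/* Cheap unlocked test so the common non-extending write never takes
 * the lock; recheck under the lock to sort out racing extenders. */
static void update_isize(long long pos)
{
	if (pos > i_size) {
		pthread_mutex_lock(&ilock);
		if (pos > i_size)
			i_size = pos;
		pthread_mutex_unlock(&ilock);
	}
}

int main(void)
{
	update_isize(4096);
	update_isize(1024);                 /* no-op: not extending */
	printf("i_size = %lld\n", i_size);  /* 4096 */
	return 0;
}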
@@ -331,7 +416,7 @@ xfs_file_splice_write(
331{ 416{
332 struct inode *inode = outfilp->f_mapping->host; 417 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 418 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 419 xfs_fsize_t new_size;
335 int ioflags = 0; 420 int ioflags = 0;
336 ssize_t ret; 421 ssize_t ret;
337 422
@@ -355,27 +440,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 440 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 441
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 442 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360
361 isize = i_size_read(inode);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 443
372 if (ip->i_new_size) { 444 xfs_aio_write_isize_update(inode, ppos, ret);
373 xfs_ilock(ip, XFS_ILOCK_EXCL); 445 xfs_aio_write_newsize_update(ip);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 447 return ret;
381} 448}
@@ -562,247 +629,314 @@ out_lock:
562 return error; 629 return error;
563} 630}
564 631
632/*
633 * Common pre-write limit and setup checks.
634 *
635 * Returns with iolock held according to @iolock.
636 */
565STATIC ssize_t 637STATIC ssize_t
566xfs_file_aio_write( 638xfs_file_aio_write_checks(
567 struct kiocb *iocb, 639 struct file *file,
568 const struct iovec *iovp, 640 loff_t *pos,
569 unsigned long nr_segs, 641 size_t *count,
570 loff_t pos) 642 int *iolock)
571{ 643{
572 struct file *file = iocb->ki_filp; 644 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 645 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 646 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 647 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 648
584 XFS_STATS_INC(xs_write_calls); 649 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
650 if (error) {
651 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
652 *iolock = 0;
653 return error;
654 }
585 655
586 BUG_ON(iocb->ki_pos != pos); 656 new_size = *pos + *count;
657 if (new_size > ip->i_size)
658 ip->i_new_size = new_size;
587 659
588 if (unlikely(file->f_flags & O_DIRECT)) 660 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 661 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME) 662
591 ioflags |= IO_INVIS; 663 /*
664 * If the offset is beyond the size of the file, we need to zero any
665 * blocks that fall between the existing EOF and the start of this
666 * write.
667 */
668 if (*pos > ip->i_size)
669 error = -xfs_zero_eof(ip, *pos, ip->i_size);
592 670
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 672 if (error)
595 return error; 673 return error;
596 674
597 count = ocount; 675 /*
598 if (count == 0) 676 * If we're writing the file then make sure to clear the setuid and
599 return 0; 677 * setgid bits if the process is not being run by root. This keeps
600 678 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 679 */
680 return file_remove_suid(file);
602 681
603 if (XFS_FORCED_SHUTDOWN(mp)) 682}
604 return -EIO;
605 683
606relock: 684/*
607 if (ioflags & IO_ISDIRECT) { 685 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 686 *
609 need_i_mutex = 0; 687 * Lock the inode appropriately to prepare for and issue a direct IO write.
610 } else { 688 * By separating it from the buffered write path we remove all the tricky to
611 iolock = XFS_IOLOCK_EXCL; 689 * follow locking changes and looping.
612 need_i_mutex = 1; 690 *
613 mutex_lock(&inode->i_mutex); 691 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
692 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
693 * pages are flushed out.
694 *
695 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
696 * allowing them to be done in parallel with reads and other direct IO writes.
697 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
698 * needs to do sub-block zeroing and that requires serialisation against other
699 * direct IOs to the same block. In this case we need to serialise the
700 * submission of the unaligned IOs so that we don't get racing block zeroing in
701 * the dio layer. To avoid the problem with aio, we also need to wait for
702 * outstanding IOs to complete so that unwritten extent conversion is completed
703 * before we try to map the overlapping block. This is currently implemented by
704 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
705 *
706 * Returns with locks held as indicated by @iolock and errors indicated by
707 * negative return values.
708 */
709STATIC ssize_t
710xfs_file_dio_aio_write(
711 struct kiocb *iocb,
712 const struct iovec *iovp,
713 unsigned long nr_segs,
714 loff_t pos,
715 size_t ocount,
716 int *iolock)
717{
718 struct file *file = iocb->ki_filp;
719 struct address_space *mapping = file->f_mapping;
720 struct inode *inode = mapping->host;
721 struct xfs_inode *ip = XFS_I(inode);
722 struct xfs_mount *mp = ip->i_mount;
723 ssize_t ret = 0;
724 size_t count = ocount;
725 int unaligned_io = 0;
726 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
727 mp->m_rtdev_targp : mp->m_ddev_targp;
728
729 *iolock = 0;
730 if ((pos & target->bt_smask) || (count & target->bt_smask))
731 return -XFS_ERROR(EINVAL);
732
733 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
734 unaligned_io = 1;
735
736 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
737 *iolock = XFS_IOLOCK_EXCL;
738 else
739 *iolock = XFS_IOLOCK_SHARED;
740 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
741
742 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
743 if (ret)
744 return ret;
745
746 if (mapping->nrpages) {
747 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
748 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
749 FI_REMAPF_LOCKED);
750 if (ret)
751 return ret;
614 } 752 }
615 753
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 754 /*
617 755 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 756 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 757 */
620 S_ISBLK(inode->i_mode)); 758 if (unaligned_io)
621 if (error) { 759 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 760 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 761 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
762 *iolock = XFS_IOLOCK_SHARED;
624 } 763 }
625 764
626 if (ioflags & IO_ISDIRECT) { 765 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 766 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 767 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 768
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 769 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 770 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 771 return ret;
634 } 772}
635 773
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 774STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 775xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 776 struct kiocb *iocb,
639 need_i_mutex = 1; 777 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 778 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 779 loff_t pos,
642 goto start; 780 size_t ocount,
643 } 781 int *iolock)
644 } 782{
783 struct file *file = iocb->ki_filp;
784 struct address_space *mapping = file->f_mapping;
785 struct inode *inode = mapping->host;
786 struct xfs_inode *ip = XFS_I(inode);
787 ssize_t ret;
788 int enospc = 0;
789 size_t count = ocount;
645 790
646 new_size = pos + count; 791 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 792 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 793
650 if (likely(!(ioflags & IO_INVIS))) 794 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 795 if (ret)
796 return ret;
652 797
798 /* We can write back this queue in page reclaim */
799 current->backing_dev_info = mapping->backing_dev_info;
800
801write_retry:
802 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
803 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
804 pos, &iocb->ki_pos, count, ret);
653 /* 805 /*
654 * If the offset is beyond the size of the file, we have a couple 806 * If we just got an ENOSPC, flush the inode now that we aren't holding any
655 * of things to do. First, if there is already space allocated 807 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 808 */
661 809 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 810 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 811 if (ret)
664 if (error) { 812 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 813 enospc = 1;
666 goto out_unlock_internal; 814 goto write_retry;
667 }
668 } 815 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 816 current->backing_dev_info = NULL;
817 return ret;
818}
670 819
671 /* 820STATIC ssize_t
672 * If we're writing the file then make sure to clear the 821xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 822 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 823 const struct iovec *iovp,
675 * setgid binaries. 824 unsigned long nr_segs,
676 */ 825 loff_t pos)
677 error = -file_remove_suid(file); 826{
678 if (unlikely(error)) 827 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 828 struct address_space *mapping = file->f_mapping;
829 struct inode *inode = mapping->host;
830 struct xfs_inode *ip = XFS_I(inode);
831 ssize_t ret;
832 int iolock;
833 size_t ocount = 0;
680 834
681 /* We can write back this queue in page reclaim */ 835 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 836
684 if ((ioflags & IO_ISDIRECT)) { 837 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 838
694 if (need_i_mutex) { 839 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 840 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 841 return ret;
697 mutex_unlock(&inode->i_mutex);
698 842
699 iolock = XFS_IOLOCK_SHARED; 843 if (ocount == 0)
700 need_i_mutex = 0; 844 return 0;
701 }
702 845
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 846 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 847
707 /* 848 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 849 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 850
714 pos += ret; 851 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 852 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
853 ocount, &iolock);
854 else
855 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
856 ocount, &iolock);
716 857
717 ioflags &= ~IO_ISDIRECT; 858 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 859
725write_retry: 860 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 861 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 862
743 current->backing_dev_info = NULL; 863 /* Handle various SYNC-type writes */
864 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
865 loff_t end = pos + ret - 1;
866 int error, error2;
744 867
745 isize = i_size_read(inode); 868 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 869 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 870 xfs_rw_ilock(ip, iolock);
748 871
749 if (iocb->ki_pos > ip->i_size) { 872 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 873 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 874 if (error)
752 ip->i_size = iocb->ki_pos; 875 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 876 else if (error2)
877 ret = error2;
754 } 878 }
755 879
756 error = -ret; 880out_unlock:
757 if (ret <= 0) 881 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 882 xfs_rw_iunlock(ip, iolock);
883 return ret;
884}
759 885
760 XFS_STATS_ADD(xs_write_bytes, ret); 886STATIC long
887xfs_file_fallocate(
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
892{
893 struct inode *inode = file->f_path.dentry->d_inode;
894 long error;
895 loff_t new_size = 0;
896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP;
761 899
762 /* Handle various SYNC-type writes */ 900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 901 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 902
767 xfs_iunlock(ip, iolock); 903 bf.l_whence = 0;
768 if (need_i_mutex) 904 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 905 bf.l_len = len;
770 906
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 907 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 908
778 error2 = -xfs_file_fsync(file, 909 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 910 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 911
781 error = error2; 912 /* check the new inode size is valid before allocating */
913 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
914 offset + len > i_size_read(inode)) {
915 new_size = offset + len;
916 error = inode_newsize_ok(inode, new_size);
917 if (error)
918 goto out_unlock;
782 } 919 }
783 920
784 out_unlock_internal: 921 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
785 if (ip->i_new_size) { 922 if (error)
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 923 goto out_unlock;
787 ip->i_new_size = 0; 924
788 /* 925 /* Change file size if needed */
789 * If this was a direct or synchronous I/O that failed (such 926 if (new_size) {
790 * as ENOSPC) then part of the I/O may have been written to 927 struct iattr iattr;
791 * disk before the error occured. In this case the on-disk 928
792 * file size may have been adjusted beyond the in-memory file 929 iattr.ia_valid = ATTR_SIZE;
793 * size and now needs to be truncated back. 930 iattr.ia_size = new_size;
794 */ 931 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
795 if (ip->i_d.di_size > ip->i_size)
796 ip->i_d.di_size = ip->i_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL);
798 } 932 }
799 xfs_iunlock(ip, iolock); 933
800 out_unlock_mutex: 934out_unlock:
801 if (need_i_mutex) 935 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 936 return error;
803 return -error;
804} 937}
805 938
939
806STATIC int 940STATIC int
807xfs_file_open( 941xfs_file_open(
808 struct inode *inode, 942 struct inode *inode,
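
The heart of the new direct IO path is the iolock-mode decision described in the long comment above xfs_file_dio_aio_write(). A standalone sketch of just that decision (the struct fields are illustrative stand-ins for the inode and mount state the kernel consults):

#include <stdio.h>

struct dio {
	unsigned long long pos, count;
	unsigned long long blockmask;   /* fs block size - 1 */
	int nrpages;                    /* cached pages present? */
	int extending;                  /* pos > ip->i_size? */
};

/* Shared iolock for the common aligned case; exclusive when sub-block
 * zeroing, cached pages or EOF extension demand serialisation. */
static int needs_excl_iolock(const struct dio *io)
{
	int unaligned = (io->pos & io->blockmask) ||
			((io->pos + io->count) & io->blockmask);

	return unaligned || io->nrpages || io->extending;
}

int main(void)
{
	struct dio io = { 4096, 4096, 4095, 0, 0 };
	printf("%s\n", needs_excl_iolock(&io) ? "EXCL" : "SHARED");
	return 0;
}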
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1055 .open = xfs_file_open,
922 .release = xfs_file_release, 1056 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1057 .fsync = xfs_file_fsync,
1058 .fallocate = xfs_file_fallocate,
924}; 1059};
925 1060
926const struct file_operations xfs_dir_file_operations = { 1061const struct file_operations xfs_dir_file_operations = {
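
With .fallocate now wired into xfs_file_operations, userspace reaches the XFS_IOC_RESVSP/XFS_IOC_UNRESVSP paths above through the plain fallocate(2) system call. A minimal caller (exact hole-punch semantics vary by kernel version):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* preallocate 1MiB without changing i_size -> XFS_IOC_RESVSP */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("fallocate");

	/* punch a hole in the first 64KiB -> XFS_IOC_UNRESVSP */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 64 << 10) < 0)
		perror("fallocate punch");

	close(fd);
	return 0;
}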
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1f279b012f94..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -32,10 +32,9 @@ xfs_tosspages(
32 xfs_off_t last, 32 xfs_off_t last,
33 int fiopt) 33 int fiopt)
34{ 34{
35 struct address_space *mapping = VFS_I(ip)->i_mapping; 35 /* can't toss partial tail pages, so mask them out */
36 36 last &= ~(PAGE_SIZE - 1);
37 if (mapping->nrpages) 37 truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
38 truncate_inode_pages(mapping, first);
39} 38}
40 39
41int 40int
@@ -50,12 +49,11 @@ xfs_flushinval_pages(
50 49
51 trace_xfs_pagecache_inval(ip, first, last); 50 trace_xfs_pagecache_inval(ip, first, last);
52 51
53 if (mapping->nrpages) { 52 xfs_iflags_clear(ip, XFS_ITRUNCATED);
54 xfs_iflags_clear(ip, XFS_ITRUNCATED); 53 ret = filemap_write_and_wait_range(mapping, first,
55 ret = filemap_write_and_wait(mapping); 54 last == -1 ? LLONG_MAX : last);
56 if (!ret) 55 if (!ret)
57 truncate_inode_pages(mapping, first); 56 truncate_inode_pages_range(mapping, first, last);
58 }
59 return -ret; 57 return -ret;
60} 58}
61 59
@@ -71,10 +69,9 @@ xfs_flush_pages(
71 int ret = 0; 69 int ret = 0;
72 int ret2; 70 int ret2;
73 71
74 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
75 xfs_iflags_clear(ip, XFS_ITRUNCATED); 73 ret = -filemap_fdatawrite_range(mapping, first,
76 ret = -filemap_fdatawrite(mapping); 74 last == -1 ? LLONG_MAX : last);
77 }
78 if (flags & XBF_ASYNC) 75 if (flags & XBF_ASYNC)
79 return ret; 76 return ret;
80 ret2 = xfs_wait_on_pages(ip, first, last); 77 ret2 = xfs_wait_on_pages(ip, first, last);
@@ -91,7 +88,9 @@ xfs_wait_on_pages(
91{ 88{
92 struct address_space *mapping = VFS_I(ip)->i_mapping; 89 struct address_space *mapping = VFS_I(ip)->i_mapping;
93 90
94 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
95 return -filemap_fdatawait(mapping); 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last);
94 }
96 return 0; 95 return 0;
97} 96}
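
All three helpers now share the convention that last == -1 means "to end of file", which the filemap range APIs express as LLONG_MAX, and xfs_tosspages() additionally rounds the end down to a page boundary. A toy illustration (the PAGE_SIZE value is illustrative):

#include <limits.h>
#include <stdio.h>

#define PAGE_SIZE 4096LL   /* illustrative */

/* -1 -> "to end of file" for the filemap range interfaces */
static long long range_end(long long last)
{
	return last == -1 ? LLONG_MAX : last;
}

int main(void)
{
	long long last = 10000;

	/* xfs_tosspages(): partial tail pages can't be tossed, so
	 * mask the end down to a page boundary first. */
	last &= ~(PAGE_SIZE - 1);
	printf("toss up to %lld, flush up to %lld\n",
	       last - 1, range_end(-1));
	return 0;
}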
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02e..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_cred.h"
20#include "xfs_sysctl.h" 19#include "xfs_sysctl.h"
21 20
22/* 21/*
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061c..000000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_GLOBALS_H__
19#define __XFS_GLOBALS_H__
20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22
23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 3b9e626f7cd1..f5e2a19e0f8e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -416,7 +417,7 @@ xfs_attrlist_by_handle(
416 if (IS_ERR(dentry)) 417 if (IS_ERR(dentry))
417 return PTR_ERR(dentry); 418 return PTR_ERR(dentry);
418 419
419 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 420 kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
420 if (!kbuf) 421 if (!kbuf)
421 goto out_dput; 422 goto out_dput;
422 423
@@ -790,7 +791,7 @@ xfs_ioc_fsgetxattr(
790 xfs_ilock(ip, XFS_ILOCK_SHARED); 791 xfs_ilock(ip, XFS_ILOCK_SHARED);
791 fa.fsx_xflags = xfs_ip2xflags(ip); 792 fa.fsx_xflags = xfs_ip2xflags(ip);
792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 793 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
793 fa.fsx_projid = ip->i_d.di_projid; 794 fa.fsx_projid = xfs_get_projid(ip);
794 795
795 if (attr) { 796 if (attr) {
796 if (ip->i_afp) { 797 if (ip->i_afp) {
@@ -909,10 +910,10 @@ xfs_ioctl_setattr(
909 return XFS_ERROR(EIO); 910 return XFS_ERROR(EIO);
910 911
911 /* 912 /*
912 * Disallow 32bit project ids because on-disk structure 913 * Disallow 32bit project ids when projid32bit feature is not enabled.
913 * is 16bit only.
914 */ 914 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) 915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
916 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
916 return XFS_ERROR(EINVAL); 917 return XFS_ERROR(EINVAL);
917 918
918 /* 919 /*
@@ -961,7 +962,7 @@ xfs_ioctl_setattr(
961 if (mask & FSX_PROJID) { 962 if (mask & FSX_PROJID) {
962 if (XFS_IS_QUOTA_RUNNING(mp) && 963 if (XFS_IS_QUOTA_RUNNING(mp) &&
963 XFS_IS_PQUOTA_ON(mp) && 964 XFS_IS_PQUOTA_ON(mp) &&
964 ip->i_d.di_projid != fa->fsx_projid) { 965 xfs_get_projid(ip) != fa->fsx_projid) {
965 ASSERT(tp); 966 ASSERT(tp);
966 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 967 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
967 capable(CAP_FOWNER) ? 968 capable(CAP_FOWNER) ?
@@ -984,10 +985,22 @@ xfs_ioctl_setattr(
984 985
985 /* 986 /*
986 * Extent size must be a multiple of the appropriate block 987 * Extent size must be a multiple of the appropriate block
987 * size, if set at all. 988 * size, if set at all. It must also be smaller than the
989 * maximum extent size supported by the filesystem.
990 *
991 * Also, for non-realtime files, limit the extent size hint to
992 * half the size of the AGs in the filesystem so alignment
993 * doesn't result in extents larger than an AG.
988 */ 994 */
989 if (fa->fsx_extsize != 0) { 995 if (fa->fsx_extsize != 0) {
990 xfs_extlen_t size; 996 xfs_extlen_t size;
997 xfs_fsblock_t extsize_fsb;
998
999 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1000 if (extsize_fsb > MAXEXTLEN) {
1001 code = XFS_ERROR(EINVAL);
1002 goto error_return;
1003 }
991 1004
992 if (XFS_IS_REALTIME_INODE(ip) || 1005 if (XFS_IS_REALTIME_INODE(ip) ||
993 ((mask & FSX_XFLAGS) && 1006 ((mask & FSX_XFLAGS) &&
@@ -996,6 +1009,10 @@ xfs_ioctl_setattr(
996 mp->m_sb.sb_blocklog; 1009 mp->m_sb.sb_blocklog;
997 } else { 1010 } else {
998 size = mp->m_sb.sb_blocksize; 1011 size = mp->m_sb.sb_blocksize;
1012 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1013 code = XFS_ERROR(EINVAL);
1014 goto error_return;
1015 }
999 } 1016 }
1000 1017
1001 if (fa->fsx_extsize % size) { 1018 if (fa->fsx_extsize % size) {
@@ -1063,12 +1080,12 @@ xfs_ioctl_setattr(
1063 * Change the ownerships and register quota modifications 1080 * Change the ownerships and register quota modifications
1064 * in the transaction. 1081 * in the transaction.
1065 */ 1082 */
1066 if (ip->i_d.di_projid != fa->fsx_projid) { 1083 if (xfs_get_projid(ip) != fa->fsx_projid) {
1067 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { 1084 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1068 olddquot = xfs_qm_vop_chown(tp, ip, 1085 olddquot = xfs_qm_vop_chown(tp, ip,
1069 &ip->i_gdquot, gdqp); 1086 &ip->i_gdquot, gdqp);
1070 } 1087 }
1071 ip->i_d.di_projid = fa->fsx_projid; 1088 xfs_set_projid(ip, fa->fsx_projid);
1072 1089
1073 /* 1090 /*
1074 * We may have to rev the inode as well as 1091 * We may have to rev the inode as well as
@@ -1088,8 +1105,8 @@ xfs_ioctl_setattr(
1088 xfs_diflags_to_linux(ip); 1105 xfs_diflags_to_linux(ip);
1089 } 1106 }
1090 1107
1108 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1091 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1109 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1092 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1093 1110
1094 XFS_STATS_INC(xs_ig_attrchg); 1111 XFS_STATS_INC(xs_ig_attrchg);
1095 1112
@@ -1294,6 +1311,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1311 trace_xfs_file_ioctl(ip);
1295 1312
1296 switch (cmd) { 1313 switch (cmd) {
1314 case FITRIM:
1315 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1316 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1317 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1318 case XFS_IOC_RESVSP:
@@ -1301,7 +1320,8 @@ xfs_file_ioctl(
1301 case XFS_IOC_ALLOCSP64: 1320 case XFS_IOC_ALLOCSP64:
1302 case XFS_IOC_FREESP64: 1321 case XFS_IOC_FREESP64:
1303 case XFS_IOC_RESVSP64: 1322 case XFS_IOC_RESVSP64:
1304 case XFS_IOC_UNRESVSP64: { 1323 case XFS_IOC_UNRESVSP64:
1324 case XFS_IOC_ZERO_RANGE: {
1305 xfs_flock64_t bf; 1325 xfs_flock64_t bf;
1306 1326
1307 if (copy_from_user(&bf, arg, sizeof(bf))) 1327 if (copy_from_user(&bf, arg, sizeof(bf)))
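
The new extent size hint checks in xfs_ioctl_setattr() can be collected into one predicate. A simplified sketch: it divides where the kernel uses the round-up XFS_B_TO_FSB conversion, ignores the realtime extent size, and the MAXEXTLEN value mirrors the on-disk 21-bit extent length limit:

#include <stdint.h>

#define MAXEXTLEN 2097151   /* 2^21 - 1 blocks */

static int extsize_valid(uint32_t extsize_bytes, uint32_t blocksize,
			 uint32_t agblocks, int realtime)
{
	uint64_t extsize_fsb = extsize_bytes / blocksize;

	if (extsize_bytes == 0)
		return 1;                 /* no hint: nothing to check */
	if (extsize_fsb > MAXEXTLEN)
		return 0;                 /* larger than any extent */
	if (!realtime && extsize_fsb > agblocks / 2)
		return 0;                 /* must fit inside an AG */
	return (extsize_bytes % blocksize) == 0;  /* whole blocks only */
}

int main(void)
{
	/* 1MiB hint on a 4KiB-block filesystem with 1M-block AGs */
	return !extsize_valid(1 << 20, 4096, 1 << 20, 0);
}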
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 6c83f7f62dc9..b3486dfa5520 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) || 164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
165 get_user(bstat->bs_extents, &bstat32->bs_extents) || 165 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
166 get_user(bstat->bs_gen, &bstat32->bs_gen) || 166 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
167 get_user(bstat->bs_projid, &bstat32->bs_projid) || 167 get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
168 get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
168 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || 169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
169 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || 170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
170 get_user(bstat->bs_aextents, &bstat32->bs_aextents)) 171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
218 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
219 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
220 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
222 put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
221 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 223 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
222 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 224 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
223 put_user(buffer->bs_aextents, &p32->bs_aextents)) 225 put_user(buffer->bs_aextents, &p32->bs_aextents))
@@ -574,6 +576,7 @@ xfs_file_compat_ioctl(
574 case XFS_IOC_FSGEOMETRY_V1: 576 case XFS_IOC_FSGEOMETRY_V1:
575 case XFS_IOC_FSGROWFSDATA: 577 case XFS_IOC_FSGROWFSDATA:
576 case XFS_IOC_FSGROWFSRT: 578 case XFS_IOC_FSGROWFSRT:
579 case XFS_IOC_ZERO_RANGE:
577 return xfs_file_ioctl(filp, cmd, p); 580 return xfs_file_ioctl(filp, cmd, p);
578#else 581#else
579 case XFS_IOC_ALLOCSP_32: 582 case XFS_IOC_ALLOCSP_32:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0d..08b605792a99 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
65 __s32 bs_extsize; /* extent size */ 65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */ 66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */ 67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */ 68 __u16 bs_projid_lo; /* lower part of project id */
69 unsigned char bs_pad[14]; /* pad space, unused */ 69#define bs_projid bs_projid_lo /* (previously just bs_projid) */
70 __u16 bs_projid_hi; /* high part of project id */
71 unsigned char bs_pad[12]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */ 72 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */ 73 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */ 74 __u16 bs_aextents; /* attribute number of extents */
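
Splitting bs_projid into lo/hi halves keeps old binaries working (the low word sits exactly where the whole id used to be) while extending project ids to 32 bits. A sketch of how the halves combine, in the spirit of the xfs_get_projid()/xfs_set_projid() helpers referenced in the xfs_ioctl.c hunks above (this is a stand-in, not the kernel source):

#include <stdint.h>

typedef uint32_t prid_t;

struct di_projid {
	uint16_t di_projid_lo;  /* was the whole di_projid */
	uint16_t di_projid_hi;  /* stolen from the old pad space */
};

static prid_t get_projid(const struct di_projid *d)
{
	return (prid_t)d->di_projid_hi << 16 | d->di_projid_lo;
}

static void set_projid(struct di_projid *d, prid_t projid)
{
	d->di_projid_hi = (uint16_t)(projid >> 16);
	d->di_projid_lo = (uint16_t)(projid & 0xffff);
}

int main(void)
{
	struct di_projid d;

	set_projid(&d, 0x00012345);
	return get_projid(&d) == 0x00012345 ? 0 : 1;
}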
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b1fc2a6bfe83..bd5727852fd6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -95,41 +94,6 @@ xfs_mark_inode_dirty(
95} 94}
96 95
97/* 96/*
98 * Change the requested timestamp in the given inode.
99 * We don't lock across timestamp updates, and we don't log them but
100 * we do record the fact that there is dirty information in core.
101 */
102void
103xfs_ichgtime(
104 xfs_inode_t *ip,
105 int flags)
106{
107 struct inode *inode = VFS_I(ip);
108 timespec_t tv;
109 int sync_it = 0;
110
111 tv = current_fs_time(inode->i_sb);
112
113 if ((flags & XFS_ICHGTIME_MOD) &&
114 !timespec_equal(&inode->i_mtime, &tv)) {
115 inode->i_mtime = tv;
116 sync_it = 1;
117 }
118 if ((flags & XFS_ICHGTIME_CHG) &&
119 !timespec_equal(&inode->i_ctime, &tv)) {
120 inode->i_ctime = tv;
121 sync_it = 1;
122 }
123
124 /*
125 * Update complete - now make sure everyone knows that the inode
126 * is dirty.
127 */
128 if (sync_it)
129 xfs_mark_inode_dirty_sync(ip);
130}
131
132/*
133 * Hook in SELinux. This is not quite correct yet, what we really need 97 * Hook in SELinux. This is not quite correct yet, what we really need
134 * here (as we do for default ACLs) is a mechanism by which creation of 98 * here (as we do for default ACLs) is a mechanism by which creation of
135 * these attrs can be journalled at inode creation time (along with the 99 * these attrs can be journalled at inode creation time (along with the
@@ -224,7 +188,7 @@ xfs_vn_mknod(
224 } 188 }
225 189
226 xfs_dentry_to_name(&name, dentry); 190 xfs_dentry_to_name(&name, dentry);
227 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); 191 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
228 if (unlikely(error)) 192 if (unlikely(error))
229 goto out_free_acl; 193 goto out_free_acl;
230 194
@@ -352,7 +316,7 @@ xfs_vn_link(
352 if (unlikely(error)) 316 if (unlikely(error))
353 return -error; 317 return -error;
354 318
355 atomic_inc(&inode->i_count); 319 ihold(inode);
356 d_instantiate(dentry, inode); 320 d_instantiate(dentry, inode);
357 return 0; 321 return 0;
358} 322}
@@ -397,7 +361,7 @@ xfs_vn_symlink(
397 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); 361 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
398 xfs_dentry_to_name(&name, dentry); 362 xfs_dentry_to_name(&name, dentry);
399 363
400 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); 364 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
401 if (unlikely(error)) 365 if (unlikely(error))
402 goto out; 366 goto out;
403 367
@@ -540,58 +504,6 @@ xfs_vn_setattr(
540 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 504 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
541} 505}
542 506
543STATIC long
544xfs_vn_fallocate(
545 struct inode *inode,
546 int mode,
547 loff_t offset,
548 loff_t len)
549{
550 long error;
551 loff_t new_size = 0;
552 xfs_flock64_t bf;
553 xfs_inode_t *ip = XFS_I(inode);
554
555 /* preallocation on directories not yet supported */
556 error = -ENODEV;
557 if (S_ISDIR(inode->i_mode))
558 goto out_error;
559
560 bf.l_whence = 0;
561 bf.l_start = offset;
562 bf.l_len = len;
563
564 xfs_ilock(ip, XFS_IOLOCK_EXCL);
565
566 /* check the new inode size is valid before allocating */
567 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
568 offset + len > i_size_read(inode)) {
569 new_size = offset + len;
570 error = inode_newsize_ok(inode, new_size);
571 if (error)
572 goto out_unlock;
573 }
574
575 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
576 0, XFS_ATTR_NOLOCK);
577 if (error)
578 goto out_unlock;
579
580 /* Change file size if needed */
581 if (new_size) {
582 struct iattr iattr;
583
584 iattr.ia_valid = ATTR_SIZE;
585 iattr.ia_size = new_size;
586 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
587 }
588
589out_unlock:
590 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
591out_error:
592 return error;
593}
594
595#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 507#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
596 508
597/* 509/*
@@ -685,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
685 .getxattr = generic_getxattr, 597 .getxattr = generic_getxattr,
686 .removexattr = generic_removexattr, 598 .removexattr = generic_removexattr,
687 .listxattr = xfs_vn_listxattr, 599 .listxattr = xfs_vn_listxattr,
688 .fallocate = xfs_vn_fallocate,
689 .fiemap = xfs_vn_fiemap, 600 .fiemap = xfs_vn_fiemap,
690}; 601};
691 602
@@ -795,7 +706,10 @@ xfs_setup_inode(
795 706
796 inode->i_ino = ip->i_ino; 707 inode->i_ino = ip->i_ino;
797 inode->i_state = I_NEW; 708 inode->i_state = I_NEW;
798 inode_add_to_lists(ip->i_mount->m_super, inode); 709
710 inode_sb_list_add(inode);
711 /* make the inode look hashed for the writeback code */
712 hlist_add_fake(&inode->i_hash);
799 713
800 inode->i_mode = ip->i_d.di_mode; 714 inode->i_mode = ip->i_d.di_mode;
801 inode->i_nlink = ip->i_d.di_nlink; 715 inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 2fa0bd9ebc7f..096494997747 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h> 42#include <support/debug.h>
@@ -71,6 +70,7 @@
71#include <linux/random.h> 70#include <linux/random.h>
72#include <linux/ctype.h> 71#include <linux/ctype.h>
73#include <linux/writeback.h> 72#include <linux/writeback.h>
73#include <linux/capability.h>
74 74
75#include <asm/page.h> 75#include <asm/page.h>
76#include <asm/div64.h> 76#include <asm/div64.h>
@@ -79,14 +79,12 @@
79#include <asm/byteorder.h> 79#include <asm/byteorder.h>
80#include <asm/unaligned.h> 80#include <asm/unaligned.h>
81 81
82#include <xfs_cred.h>
83#include <xfs_vnode.h> 82#include <xfs_vnode.h>
84#include <xfs_stats.h> 83#include <xfs_stats.h>
85#include <xfs_sysctl.h> 84#include <xfs_sysctl.h>
86#include <xfs_iops.h> 85#include <xfs_iops.h>
87#include <xfs_aops.h> 86#include <xfs_aops.h>
88#include <xfs_super.h> 87#include <xfs_super.h>
89#include <xfs_globals.h>
90#include <xfs_buf.h> 88#include <xfs_buf.h>
91 89
92/* 90/*
@@ -144,7 +142,7 @@
144#define SYNCHRONIZE() barrier() 142#define SYNCHRONIZE() barrier()
145#define __return_address __builtin_return_address(0) 143#define __return_address __builtin_return_address(0)
146 144
147#define dfltprid 0 145#define XFS_PROJID_DEFAULT 0
148#define MAXPATHLEN 1024 146#define MAXPATHLEN 1024
149 147
150#define MIN(a,b) (min(a,b)) 148#define MIN(a,b) (min(a,b))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e07974955b..9731898083ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -44,7 +44,6 @@
44#include "xfs_buf_item.h" 44#include "xfs_buf_item.h"
45#include "xfs_utils.h" 45#include "xfs_utils.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_version.h"
48#include "xfs_log_priv.h" 47#include "xfs_log_priv.h"
49#include "xfs_trans_priv.h" 48#include "xfs_trans_priv.h"
50#include "xfs_filestream.h" 49#include "xfs_filestream.h"
@@ -354,9 +353,6 @@ xfs_parseargs(
354 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 353 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
355 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 354 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
356 mp->m_flags |= XFS_MOUNT_DELAYLOG; 355 mp->m_flags |= XFS_MOUNT_DELAYLOG;
357 cmn_err(CE_WARN,
358 "Enabling EXPERIMENTAL delayed logging feature "
359 "- use at your own risk.\n");
360 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
361 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
362 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
@@ -577,7 +573,7 @@ xfs_max_file_offset(
577 573
578 /* Figure out maximum filesize, on Linux this can depend on 574 /* Figure out maximum filesize, on Linux this can depend on
579 * the filesystem blocksize (on 32 bit platforms). 575 * the filesystem blocksize (on 32 bit platforms).
580 * __block_prepare_write does this in an [unsigned] long... 576 * __block_write_begin does this in an [unsigned] long...
581 * page->index << (PAGE_CACHE_SHIFT - bbits) 577 * page->index << (PAGE_CACHE_SHIFT - bbits)
582 * So, for page sized blocks (4K on 32 bit platforms), 578 * So, for page sized blocks (4K on 32 bit platforms),
583 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is 579 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -610,7 +606,8 @@ xfs_blkdev_get(
610{ 606{
611 int error = 0; 607 int error = 0;
612 608
613 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 609 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
610 mp);
614 if (IS_ERR(*bdevp)) { 611 if (IS_ERR(*bdevp)) {
615 error = PTR_ERR(*bdevp); 612 error = PTR_ERR(*bdevp);
616 printk("XFS: Invalid device [%s], error=%d\n", name, error); 613 printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -624,7 +621,7 @@ xfs_blkdev_put(
624 struct block_device *bdev) 621 struct block_device *bdev)
625{ 622{
626 if (bdev) 623 if (bdev)
627 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 624 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
628} 625}
629 626
630/* 627/*
@@ -645,7 +642,7 @@ xfs_barrier_test(
645 XFS_BUF_ORDERED(sbp); 642 XFS_BUF_ORDERED(sbp);
646 643
647 xfsbdstrat(mp, sbp); 644 xfsbdstrat(mp, sbp);
648 error = xfs_iowait(sbp); 645 error = xfs_buf_iowait(sbp);
649 646
650 /* 647 /*
651 * Clear all the flags we set and possible error state in the 648 * Clear all the flags we set and possible error state in the
@@ -693,8 +690,7 @@ void
693xfs_blkdev_issue_flush( 690xfs_blkdev_issue_flush(
694 xfs_buftarg_t *buftarg) 691 xfs_buftarg_t *buftarg)
695{ 692{
696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, 693 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
697 BLKDEV_IFL_WAIT);
698} 694}
699 695
700STATIC void 696STATIC void
@@ -758,18 +754,20 @@ xfs_open_devices(
758 * Setup xfs_mount buffer target pointers 754 * Setup xfs_mount buffer target pointers
759 */ 755 */
760 error = ENOMEM; 756 error = ENOMEM;
761 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); 757 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
762 if (!mp->m_ddev_targp) 758 if (!mp->m_ddev_targp)
763 goto out_close_rtdev; 759 goto out_close_rtdev;
764 760
765 if (rtdev) { 761 if (rtdev) {
766 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); 762 mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
763 mp->m_fsname);
767 if (!mp->m_rtdev_targp) 764 if (!mp->m_rtdev_targp)
768 goto out_free_ddev_targ; 765 goto out_free_ddev_targ;
769 } 766 }
770 767
771 if (logdev && logdev != ddev) { 768 if (logdev && logdev != ddev) {
772 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); 769 mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
770 mp->m_fsname);
773 if (!mp->m_logdev_targp) 771 if (!mp->m_logdev_targp)
774 goto out_free_rtdev_targ; 772 goto out_free_rtdev_targ;
775 } else { 773 } else {
@@ -837,8 +835,11 @@ xfsaild_wakeup(
837 struct xfs_ail *ailp, 835 struct xfs_ail *ailp,
838 xfs_lsn_t threshold_lsn) 836 xfs_lsn_t threshold_lsn)
839{ 837{
840 ailp->xa_target = threshold_lsn; 838 /* only ever move the target forwards */
841 wake_up_process(ailp->xa_task); 839 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842 }
842} 843}
843 844
844STATIC int 845STATIC int
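
The xfsaild_wakeup() change is a monotonic-target rule: a caller presenting an older LSN must not drag the push target backwards. Sketch, with XFS_LSN_CMP reduced to a plain signed compare:

#include <stdint.h>
#include <stdio.h>

static int64_t xa_target;

/* only ever move the target forwards */
static int push_target(int64_t threshold_lsn)
{
	if (threshold_lsn > xa_target) {
		xa_target = threshold_lsn;
		return 1;   /* caller wakes the aild thread */
	}
	return 0;           /* stale wakeup: nothing to do */
}

int main(void)
{
	printf("%d %d\n", push_target(100), push_target(50)); /* 1 0 */
	return 0;
}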
@@ -850,8 +851,17 @@ xfsaild(
850 long tout = 0; /* milliseconds */ 851 long tout = 0; /* milliseconds */
851 852
852 while (!kthread_should_stop()) { 853 while (!kthread_should_stop()) {
853 schedule_timeout_interruptible(tout ? 854 /*
854 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); 855 * for short sleeps indicating congestion, don't allow us to
856 * get woken early. Otherwise all we do is bang on the AIL lock
857 * without making progress.
858 */
859 if (tout && tout <= 20)
860 __set_current_state(TASK_KILLABLE);
861 else
862 __set_current_state(TASK_INTERRUPTIBLE);
863 schedule_timeout(tout ?
864 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
855 865
856 /* swsusp */ 866 /* swsusp */
857 try_to_freeze(); 867 try_to_freeze();
@@ -938,7 +948,7 @@ out_reclaim:
938 * Slab object creation initialisation for the XFS inode. 948 * Slab object creation initialisation for the XFS inode.
939 * This covers only the idempotent fields in the XFS inode; 949 * This covers only the idempotent fields in the XFS inode;
940 * all other fields need to be initialised on allocation 950 * all other fields need to be initialised on allocation
941 * from the slab. This avoids the need to repeatedly intialise 951 * from the slab. This avoids the need to repeatedly initialise
942 * fields in the xfs inode that are left in the initialise state 952 * fields in the xfs inode that are left in the initialise state
943 * when freeing the inode. 953 * when freeing the inode.
944 */ 954 */
@@ -972,12 +982,7 @@ xfs_fs_inode_init_once(
972 982
973/* 983/*
974 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that 984 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
975 * we catch unlogged VFS level updates to the inode. Care must be taken 985 * we catch unlogged VFS level updates to the inode.
976 * here - the transaction code calls mark_inode_dirty_sync() to mark the
977 * VFS inode dirty in a transaction and clears the i_update_core field;
978 * it must clear the field after calling mark_inode_dirty_sync() to
979 * correctly indicate that the dirty state has been propagated into the
980 * inode log item.
981 * 986 *
982 * We need the barrier() to maintain correct ordering between unlogged 987 * We need the barrier() to maintain correct ordering between unlogged
983 * updates and the transaction commit code that clears the i_update_core 988 * updates and the transaction commit code that clears the i_update_core
@@ -1126,6 +1131,8 @@ xfs_fs_evict_inode(
1126 */ 1131 */
1127 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1132 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1128 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1133 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1134 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1135 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1129 1136
1130 xfs_inactive(ip); 1137 xfs_inactive(ip);
1131} 1138}
@@ -1407,7 +1414,7 @@ xfs_fs_freeze(
1407 1414
1408 xfs_save_resvblks(mp); 1415 xfs_save_resvblks(mp);
1409 xfs_quiesce_attr(mp); 1416 xfs_quiesce_attr(mp);
1410 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1417 return -xfs_fs_log_dummy(mp);
1411} 1418}
1412 1419
1413STATIC int 1420STATIC int
@@ -1521,8 +1528,9 @@ xfs_fs_fill_super(
1521 if (error) 1528 if (error)
1522 goto out_free_fsname; 1529 goto out_free_fsname;
1523 1530
1524 if (xfs_icsb_init_counters(mp)) 1531 error = xfs_icsb_init_counters(mp);
1525 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1532 if (error)
1533 goto out_close_devices;
1526 1534
1527 error = xfs_readsb(mp, flags); 1535 error = xfs_readsb(mp, flags);
1528 if (error) 1536 if (error)
@@ -1583,6 +1591,7 @@ xfs_fs_fill_super(
1583 xfs_freesb(mp); 1591 xfs_freesb(mp);
1584 out_destroy_counters: 1592 out_destroy_counters:
1585 xfs_icsb_destroy_counters(mp); 1593 xfs_icsb_destroy_counters(mp);
1594 out_close_devices:
1586 xfs_close_devices(mp); 1595 xfs_close_devices(mp);
1587 out_free_fsname: 1596 out_free_fsname:
1588 xfs_free_fsname(mp); 1597 xfs_free_fsname(mp);
@@ -1612,16 +1621,14 @@ xfs_fs_fill_super(
1612 goto out_free_sb; 1621 goto out_free_sb;
1613} 1622}
1614 1623
1615STATIC int 1624STATIC struct dentry *
1616xfs_fs_get_sb( 1625xfs_fs_mount(
1617 struct file_system_type *fs_type, 1626 struct file_system_type *fs_type,
1618 int flags, 1627 int flags,
1619 const char *dev_name, 1628 const char *dev_name,
1620 void *data, 1629 void *data)
1621 struct vfsmount *mnt)
1622{ 1630{
1623 return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, 1631 return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
1624 mnt);
1625} 1632}
1626 1633
1627static const struct super_operations xfs_super_operations = { 1634static const struct super_operations xfs_super_operations = {
@@ -1642,7 +1649,7 @@ static const struct super_operations xfs_super_operations = {
1642static struct file_system_type xfs_fs_type = { 1649static struct file_system_type xfs_fs_type = {
1643 .owner = THIS_MODULE, 1650 .owner = THIS_MODULE,
1644 .name = "xfs", 1651 .name = "xfs",
1645 .get_sb = xfs_fs_get_sb, 1652 .mount = xfs_fs_mount,
1646 .kill_sb = kill_block_super, 1653 .kill_sb = kill_block_super,
1647 .fs_flags = FS_REQUIRES_DEV, 1654 .fs_flags = FS_REQUIRES_DEV,
1648}; 1655};
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 1ef4a4d2d997..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -62,6 +62,7 @@ extern void xfs_qm_exit(void);
62# define XFS_DBG_STRING "no debug" 62# define XFS_DBG_STRING "no debug"
63#endif 63#endif
64 64
65#define XFS_VERSION_STRING "SGI XFS"
65#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 66#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
66 XFS_SECURITY_STRING \ 67 XFS_SECURITY_STRING \
67 XFS_REALTIME_STRING \ 68 XFS_REALTIME_STRING \
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 81976ffed7d6..e22f0057d21f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -39,42 +39,59 @@
39#include <linux/kthread.h> 39#include <linux/kthread.h>
40#include <linux/freezer.h> 40#include <linux/freezer.h>
41 41
42/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between
45 * lookup reduction and stack usage. This is in the reclaim path, so we can't
46 * be too greedy.
47 */
48#define XFS_LOOKUP_BATCH 32
42 49
43STATIC xfs_inode_t * 50STATIC int
44xfs_inode_ag_lookup( 51xfs_inode_ag_walk_grab(
45 struct xfs_mount *mp, 52 struct xfs_inode *ip)
46 struct xfs_perag *pag,
47 uint32_t *first_index,
48 int tag)
49{ 53{
50 int nr_found; 54 struct inode *inode = VFS_I(ip);
51 struct xfs_inode *ip; 55
56 ASSERT(rcu_read_lock_held());
52 57
53 /* 58 /*
54 * use a gang lookup to find the next inode in the tree 59 * check for stale RCU freed inode
55 * as the tree is sparse and a gang lookup walks to find 60 *
56 * the number of objects requested. 61 * If the inode has been reallocated, it doesn't matter if it's not in
62 * the AG we are walking - we are walking for writeback, so if it
63 * passes all the "valid inode" checks and is dirty, then we'll write
64 * it back anyway. If it has been reallocated and still being
65 * initialised, the XFS_INEW check below will catch it.
57 */ 66 */
58 if (tag == XFS_ICI_NO_TAG) { 67 spin_lock(&ip->i_flags_lock);
59 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 68 if (!ip->i_ino)
60 (void **)&ip, *first_index, 1); 69 goto out_unlock_noent;
61 } else { 70
62 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 71 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
63 (void **)&ip, *first_index, 1, tag); 72 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
73 goto out_unlock_noent;
74 spin_unlock(&ip->i_flags_lock);
75
76 /* nothing to sync during shutdown */
77 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
78 return EFSCORRUPTED;
79
80 /* If we can't grab the inode, it must be on its way to reclaim. */
81 if (!igrab(inode))
82 return ENOENT;
83
84 if (is_bad_inode(inode)) {
85 IRELE(ip);
86 return ENOENT;
64 } 87 }
65 if (!nr_found)
66 return NULL;
67 88
68 /* 89 /* inode is valid */
69 * Update the index for the next lookup. Catch overflows 90 return 0;
70 * into the next AG range which can occur if we have inodes 91
71 * in the last block of the AG and we are currently 92out_unlock_noent:
72 * pointing to the last inode. 93 spin_unlock(&ip->i_flags_lock);
73 */ 94 return ENOENT;
74 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
75 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
76 return NULL;
77 return ip;
78} 95}
79 96
80STATIC int 97STATIC int
@@ -83,49 +100,83 @@ xfs_inode_ag_walk(
83 struct xfs_perag *pag, 100 struct xfs_perag *pag,
84 int (*execute)(struct xfs_inode *ip, 101 int (*execute)(struct xfs_inode *ip,
85 struct xfs_perag *pag, int flags), 102 struct xfs_perag *pag, int flags),
86 int flags, 103 int flags)
87 int tag,
88 int exclusive,
89 int *nr_to_scan)
90{ 104{
91 uint32_t first_index; 105 uint32_t first_index;
92 int last_error = 0; 106 int last_error = 0;
93 int skipped; 107 int skipped;
108 int done;
109 int nr_found;
94 110
95restart: 111restart:
112 done = 0;
96 skipped = 0; 113 skipped = 0;
97 first_index = 0; 114 first_index = 0;
115 nr_found = 0;
98 do { 116 do {
117 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
99 int error = 0; 118 int error = 0;
100 xfs_inode_t *ip; 119 int i;
101 120
102 if (exclusive) 121 rcu_read_lock();
103 write_lock(&pag->pag_ici_lock); 122 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
104 else 123 (void **)batch, first_index,
105 read_lock(&pag->pag_ici_lock); 124 XFS_LOOKUP_BATCH);
106 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 125 if (!nr_found) {
107 if (!ip) { 126 rcu_read_unlock();
108 if (exclusive)
109 write_unlock(&pag->pag_ici_lock);
110 else
111 read_unlock(&pag->pag_ici_lock);
112 break; 127 break;
113 } 128 }
114 129
115 /* execute releases pag->pag_ici_lock */ 130 /*
116 error = execute(ip, pag, flags); 131 * Grab the inodes before we drop the lock. if we found
117 if (error == EAGAIN) { 132 * nothing, nr == 0 and the loop will be skipped.
118 skipped++; 133 */
119 continue; 134 for (i = 0; i < nr_found; i++) {
135 struct xfs_inode *ip = batch[i];
136
137 if (done || xfs_inode_ag_walk_grab(ip))
138 batch[i] = NULL;
139
140 /*
141 * Update the index for the next lookup. Catch
142 * overflows into the next AG range which can occur if
143 * we have inodes in the last block of the AG and we
144 * are currently pointing to the last inode.
145 *
146 * Because we may see inodes that are from the wrong AG
147 * due to RCU freeing and reallocation, only update the
148 * index if it lies in this AG. It was a race that led
149 * us to see this inode, so another lookup from the
150 * same index will not find it again.
151 */
152 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
153 continue;
154 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
155 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
156 done = 1;
157 }
158
159 /* unlock now we've grabbed the inodes. */
160 rcu_read_unlock();
161
162 for (i = 0; i < nr_found; i++) {
163 if (!batch[i])
164 continue;
165 error = execute(batch[i], pag, flags);
166 IRELE(batch[i]);
167 if (error == EAGAIN) {
168 skipped++;
169 continue;
170 }
171 if (error && last_error != EFSCORRUPTED)
172 last_error = error;
120 } 173 }
121 if (error)
122 last_error = error;
123 174
124 /* bail out if the filesystem is corrupted. */ 175 /* bail out if the filesystem is corrupted. */
125 if (error == EFSCORRUPTED) 176 if (error == EFSCORRUPTED)
126 break; 177 break;
127 178
128 } while ((*nr_to_scan)--); 179 } while (nr_found && !done);
129 180
130 if (skipped) { 181 if (skipped) {
131 delay(1); 182 delay(1);
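
The rewritten walk batches radix tree lookups under rcu_read_lock(), grabs references while still under it, then drops the lock before doing any real work. A compilable toy of just that control flow (a plain array stands in for the radix tree, and the RCU calls are no-op stand-ins; reference counting and the wrong-AG check are elided):

#include <stdio.h>

#define LOOKUP_BATCH 4
#define NOBJ 10

#define rcu_read_lock()    ((void)0)   /* stand-in */
#define rcu_read_unlock()  ((void)0)   /* stand-in */

/* "gang lookup": fill batch[] with up to max indices >= first */
static int lookup_batch(int *batch, unsigned first, int max)
{
	int nr = 0;

	for (unsigned i = first; i < NOBJ && nr < max; i++)
		batch[nr++] = i;
	return nr;
}

int main(void)
{
	unsigned first_index = 0;
	int nr;

	do {
		int batch[LOOKUP_BATCH], i;

		rcu_read_lock();
		nr = lookup_batch(batch, first_index, LOOKUP_BATCH);
		if (!nr) {
			rcu_read_unlock();
			break;
		}
		/* advance the cursor while still "under RCU" */
		first_index = batch[nr - 1] + 1;
		rcu_read_unlock();

		for (i = 0; i < nr; i++)   /* work done outside the lock */
			printf("processing inode %d\n", batch[i]);
	} while (nr);
	return 0;
}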
@@ -134,110 +185,32 @@ restart:
134 return last_error; 185 return last_error;
135} 186}
136 187
137/*
138 * Select the next per-ag structure to iterate during the walk. The reclaim
139 * walk is optimised only to walk AGs with reclaimable inodes in them.
140 */
141static struct xfs_perag *
142xfs_inode_ag_iter_next_pag(
143 struct xfs_mount *mp,
144 xfs_agnumber_t *first,
145 int tag)
146{
147 struct xfs_perag *pag = NULL;
148
149 if (tag == XFS_ICI_RECLAIM_TAG) {
150 int found;
151 int ref;
152
153 spin_lock(&mp->m_perag_lock);
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, *first, 1, tag);
156 if (found <= 0) {
157 spin_unlock(&mp->m_perag_lock);
158 return NULL;
159 }
160 *first = pag->pag_agno + 1;
161 /* open coded pag reference increment */
162 ref = atomic_inc_return(&pag->pag_ref);
163 spin_unlock(&mp->m_perag_lock);
164 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
165 } else {
166 pag = xfs_perag_get(mp, *first);
167 (*first)++;
168 }
169 return pag;
170}
171
172int 188int
173xfs_inode_ag_iterator( 189xfs_inode_ag_iterator(
174 struct xfs_mount *mp, 190 struct xfs_mount *mp,
175 int (*execute)(struct xfs_inode *ip, 191 int (*execute)(struct xfs_inode *ip,
176 struct xfs_perag *pag, int flags), 192 struct xfs_perag *pag, int flags),
177 int flags, 193 int flags)
178 int tag,
179 int exclusive,
180 int *nr_to_scan)
181{ 194{
182 struct xfs_perag *pag; 195 struct xfs_perag *pag;
183 int error = 0; 196 int error = 0;
184 int last_error = 0; 197 int last_error = 0;
185 xfs_agnumber_t ag; 198 xfs_agnumber_t ag;
186 int nr;
187 199
188 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
189 ag = 0; 200 ag = 0;
190 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { 201 while ((pag = xfs_perag_get(mp, ag))) {
191 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 202 ag = pag->pag_agno + 1;
192 exclusive, &nr); 203 error = xfs_inode_ag_walk(mp, pag, execute, flags);
193 xfs_perag_put(pag); 204 xfs_perag_put(pag);
194 if (error) { 205 if (error) {
195 last_error = error; 206 last_error = error;
196 if (error == EFSCORRUPTED) 207 if (error == EFSCORRUPTED)
197 break; 208 break;
198 } 209 }
199 if (nr <= 0)
200 break;
201 } 210 }
202 if (nr_to_scan)
203 *nr_to_scan = nr;
204 return XFS_ERROR(last_error); 211 return XFS_ERROR(last_error);
205} 212}
206 213
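With the tag, exclusivity and scan-count parameters gone, the iterator above reduces to a plain cursor walk: fetch the perag at the cursor, advance the cursor past it, run the callback, release. The same idiom in miniature (get_entry() and put_entry() are hypothetical stand-ins for xfs_perag_get()/xfs_perag_put(); get_entry() is assumed to return NULL once the index runs past the last entry):

    struct pag { unsigned int agno; /* ... */ };

    struct pag *get_entry(unsigned int index);      /* NULL past the end */
    void put_entry(struct pag *p);

    static void walk_all(void)
    {
            struct pag *p;
            unsigned int cursor = 0;

            while ((p = get_entry(cursor))) {
                    cursor = p->agno + 1;   /* advance past this entry */
                    /* ... operate on p ... */
                    put_entry(p);
            }
    }
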
207/* must be called with pag_ici_lock held and releases it */
208int
209xfs_sync_inode_valid(
210 struct xfs_inode *ip,
211 struct xfs_perag *pag)
212{
213 struct inode *inode = VFS_I(ip);
214 int error = EFSCORRUPTED;
215
216 /* nothing to sync during shutdown */
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 goto out_unlock;
219
220 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
221 error = ENOENT;
222 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
223 goto out_unlock;
224
225 /* If we can't grab the inode, it must be on its way to reclaim. */
226 if (!igrab(inode))
227 goto out_unlock;
228
229 if (is_bad_inode(inode)) {
230 IRELE(ip);
231 goto out_unlock;
232 }
233
234 /* inode is valid */
235 error = 0;
236out_unlock:
237 read_unlock(&pag->pag_ici_lock);
238 return error;
239}
240
241STATIC int 214STATIC int
242xfs_sync_inode_data( 215xfs_sync_inode_data(
243 struct xfs_inode *ip, 216 struct xfs_inode *ip,
@@ -248,10 +221,6 @@ xfs_sync_inode_data(
248 struct address_space *mapping = inode->i_mapping; 221 struct address_space *mapping = inode->i_mapping;
249 int error = 0; 222 int error = 0;
250 223
251 error = xfs_sync_inode_valid(ip, pag);
252 if (error)
253 return error;
254
255 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 224 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
256 goto out_wait; 225 goto out_wait;
257 226
@@ -268,7 +237,6 @@ xfs_sync_inode_data(
268 out_wait: 237 out_wait:
269 if (flags & SYNC_WAIT) 238 if (flags & SYNC_WAIT)
270 xfs_ioend_wait(ip); 239 xfs_ioend_wait(ip);
271 IRELE(ip);
272 return error; 240 return error;
273} 241}
274 242
@@ -280,10 +248,6 @@ xfs_sync_inode_attr(
280{ 248{
281 int error = 0; 249 int error = 0;
282 250
283 error = xfs_sync_inode_valid(ip, pag);
284 if (error)
285 return error;
286
287 xfs_ilock(ip, XFS_ILOCK_SHARED); 251 xfs_ilock(ip, XFS_ILOCK_SHARED);
288 if (xfs_inode_clean(ip)) 252 if (xfs_inode_clean(ip))
289 goto out_unlock; 253 goto out_unlock;
@@ -302,7 +266,6 @@ xfs_sync_inode_attr(
302 266
303 out_unlock: 267 out_unlock:
304 xfs_iunlock(ip, XFS_ILOCK_SHARED); 268 xfs_iunlock(ip, XFS_ILOCK_SHARED);
305 IRELE(ip);
306 return error; 269 return error;
307} 270}
308 271
@@ -318,8 +281,7 @@ xfs_sync_data(
318 281
319 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 282 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
320 283
321 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 284 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
322 XFS_ICI_NO_TAG, 0, NULL);
323 if (error) 285 if (error)
324 return XFS_ERROR(error); 286 return XFS_ERROR(error);
325 287
@@ -337,8 +299,7 @@ xfs_sync_attr(
337{ 299{
338 ASSERT((flags & ~SYNC_WAIT) == 0); 300 ASSERT((flags & ~SYNC_WAIT) == 0);
339 301
340 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 302 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
341 XFS_ICI_NO_TAG, 0, NULL);
342} 303}
343 304
344STATIC int 305STATIC int
@@ -401,7 +362,7 @@ xfs_quiesce_data(
401 362
402 /* mark the log as covered if needed */ 363 /* mark the log as covered if needed */
403 if (xfs_log_need_covered(mp)) 364 if (xfs_log_need_covered(mp))
404 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 365 error2 = xfs_fs_log_dummy(mp);
405 366
406 /* flush data-only devices */ 367 /* flush data-only devices */
407 if (mp->m_rtdev_targp) 368 if (mp->m_rtdev_targp)
@@ -542,13 +503,14 @@ xfs_sync_worker(
542 int error; 503 int error;
543 504
544 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
545 xfs_log_force(mp, 0);
546 xfs_reclaim_inodes(mp, 0);
547 /* dgc: errors ignored here */ 506 /* dgc: errors ignored here */
548 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
549 if (mp->m_super->s_frozen == SB_UNFROZEN && 507 if (mp->m_super->s_frozen == SB_UNFROZEN &&
550 xfs_log_need_covered(mp)) 508 xfs_log_need_covered(mp))
551 error = xfs_fs_log_dummy(mp, 0); 509 error = xfs_fs_log_dummy(mp);
510 else
511 xfs_log_force(mp, 0);
512 xfs_reclaim_inodes(mp, 0);
513 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
552 } 514 }
553 mp->m_sync_seq++; 515 mp->m_sync_seq++;
554 wake_up(&mp->m_wait_single_sync_task); 516 wake_up(&mp->m_wait_single_sync_task);
@@ -659,12 +621,12 @@ xfs_inode_set_reclaim_tag(
659 struct xfs_perag *pag; 621 struct xfs_perag *pag;
660 622
661 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 623 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
662 write_lock(&pag->pag_ici_lock); 624 spin_lock(&pag->pag_ici_lock);
663 spin_lock(&ip->i_flags_lock); 625 spin_lock(&ip->i_flags_lock);
664 __xfs_inode_set_reclaim_tag(pag, ip); 626 __xfs_inode_set_reclaim_tag(pag, ip);
665 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 627 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
666 spin_unlock(&ip->i_flags_lock); 628 spin_unlock(&ip->i_flags_lock);
667 write_unlock(&pag->pag_ici_lock); 629 spin_unlock(&pag->pag_ici_lock);
668 xfs_perag_put(pag); 630 xfs_perag_put(pag);
669} 631}
670 632
@@ -698,6 +660,53 @@ __xfs_inode_clear_reclaim_tag(
698} 660}
699 661
700/* 662/*
663 * Grab the inode for reclaim exclusively.
664 * Return 0 if we grabbed it, non-zero otherwise.
665 */
666STATIC int
667xfs_reclaim_inode_grab(
668 struct xfs_inode *ip,
669 int flags)
670{
671 ASSERT(rcu_read_lock_held());
672
673 /* quick check for stale RCU freed inode */
674 if (!ip->i_ino)
675 return 1;
676
677 /*
678 * Do some unlocked checks first to avoid unnecessary lock traffic.
679 * The first is a flush lock check, the second is an already-in-reclaim
680 * check. Only do these checks if we are not going to block on locks.
681 */
682 if ((flags & SYNC_TRYLOCK) &&
683 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
684 return 1;
685 }
686
687 /*
688 * The radix tree lock here protects a thread in xfs_iget from racing
689 * with us starting reclaim on the inode. Once we have the
690 * XFS_IRECLAIM flag set it will not touch us.
691 *
692 * Due to RCU lookup, we may find inodes that have been freed and only
693 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
694 * aren't candidates for reclaim at all, so we must check that
695 * XFS_IRECLAIMABLE is set before proceeding to reclaim.
696 */
697 spin_lock(&ip->i_flags_lock);
698 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
699 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
700 /* not a reclaim candidate. */
701 spin_unlock(&ip->i_flags_lock);
702 return 1;
703 }
704 __xfs_iflags_set(ip, XFS_IRECLAIM);
705 spin_unlock(&ip->i_flags_lock);
706 return 0;
707}
708
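xfs_reclaim_inode_grab() is an instance of the unlocked-check-then-locked-recheck pattern: cheap tests without the lock weed out obvious non-candidates, and the verdict is revalidated under i_flags_lock before XFS_IRECLAIM is set. A userspace sketch of the same shape, with pthreads standing in for the kernel spinlock (types, flag names and the racy fast-path test are all hypothetical):

    #include <pthread.h>

    #define OBJ_CLAIMABLE   0x1
    #define OBJ_CLAIMED     0x2

    struct obj {
            pthread_mutex_t lock;
            int flags;
    };

    /* Return 0 if we claimed the object exclusively, nonzero otherwise. */
    static int try_claim(struct obj *o)
    {
            /* unlocked fast path: skip obvious non-candidates cheaply */
            if (o->flags & OBJ_CLAIMED)
                    return 1;

            pthread_mutex_lock(&o->lock);
            /* recheck under the lock; the unlocked test may have raced */
            if (!(o->flags & OBJ_CLAIMABLE) || (o->flags & OBJ_CLAIMED)) {
                    pthread_mutex_unlock(&o->lock);
                    return 1;
            }
            o->flags |= OBJ_CLAIMED;        /* now exclusive */
            pthread_mutex_unlock(&o->lock);
            return 0;
    }
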
709/*
701 * Inodes in different states need to be treated differently, and the return 710 * Inodes in different states need to be treated differently, and the return
702 * value of xfs_iflush is not sufficient to get this right. The following table 711 * value of xfs_iflush is not sufficient to get this right. The following table
703 * lists the inode states and the reclaim actions necessary for non-blocking 712 * lists the inode states and the reclaim actions necessary for non-blocking
@@ -755,23 +764,6 @@ xfs_reclaim_inode(
755{ 764{
756 int error = 0; 765 int error = 0;
757 766
758 /*
759 * The radix tree lock here protects a thread in xfs_iget from racing
760 * with us starting reclaim on the inode. Once we have the
761 * XFS_IRECLAIM flag set it will not touch us.
762 */
763 spin_lock(&ip->i_flags_lock);
764 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
765 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
766 /* ignore as it is already under reclaim */
767 spin_unlock(&ip->i_flags_lock);
768 write_unlock(&pag->pag_ici_lock);
769 return 0;
770 }
771 __xfs_iflags_set(ip, XFS_IRECLAIM);
772 spin_unlock(&ip->i_flags_lock);
773 write_unlock(&pag->pag_ici_lock);
774
775 xfs_ilock(ip, XFS_ILOCK_EXCL); 767 xfs_ilock(ip, XFS_ILOCK_EXCL);
776 if (!xfs_iflock_nowait(ip)) { 768 if (!xfs_iflock_nowait(ip)) {
777 if (!(sync_mode & SYNC_WAIT)) 769 if (!(sync_mode & SYNC_WAIT))
@@ -842,12 +834,12 @@ reclaim:
842 * added to the tree assert that it's been there before to catch 834 * added to the tree assert that it's been there before to catch
843 * problems with the inode life time early on. 835 * problems with the inode life time early on.
844 */ 836 */
845 write_lock(&pag->pag_ici_lock); 837 spin_lock(&pag->pag_ici_lock);
846 if (!radix_tree_delete(&pag->pag_ici_root, 838 if (!radix_tree_delete(&pag->pag_ici_root,
847 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 839 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
848 ASSERT(0); 840 ASSERT(0);
849 __xfs_inode_clear_reclaim(pag, ip); 841 __xfs_inode_clear_reclaim(pag, ip);
850 write_unlock(&pag->pag_ici_lock); 842 spin_unlock(&pag->pag_ici_lock);
851 843
852 /* 844 /*
853 * Here we do an (almost) spurious inode lock in order to coordinate 845 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -868,13 +860,137 @@ reclaim:
868 860
869} 861}
870 862
863/*
864 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
865 * corrupted, we still want to try to reclaim all the inodes. If we don't,
866 * then a shutdown during the unmount-time reclaim walk would leak all the
867 * unreclaimed inodes.
868 */
869int
870xfs_reclaim_inodes_ag(
871 struct xfs_mount *mp,
872 int flags,
873 int *nr_to_scan)
874{
875 struct xfs_perag *pag;
876 int error = 0;
877 int last_error = 0;
878 xfs_agnumber_t ag;
879 int trylock = flags & SYNC_TRYLOCK;
880 int skipped;
881
882restart:
883 ag = 0;
884 skipped = 0;
885 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
886 unsigned long first_index = 0;
887 int done = 0;
888 int nr_found = 0;
889
890 ag = pag->pag_agno + 1;
891
892 if (trylock) {
893 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
894 skipped++;
895 xfs_perag_put(pag);
896 continue;
897 }
898 first_index = pag->pag_ici_reclaim_cursor;
899 } else
900 mutex_lock(&pag->pag_ici_reclaim_lock);
901
902 do {
903 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
904 int i;
905
906 rcu_read_lock();
907 nr_found = radix_tree_gang_lookup_tag(
908 &pag->pag_ici_root,
909 (void **)batch, first_index,
910 XFS_LOOKUP_BATCH,
911 XFS_ICI_RECLAIM_TAG);
912 if (!nr_found) {
913 rcu_read_unlock();
914 break;
915 }
916
917 /*
918 * Grab the inodes before we drop the lock. If we found
919 * nothing, nr_found == 0 and the loop will be skipped.
920 */
921 for (i = 0; i < nr_found; i++) {
922 struct xfs_inode *ip = batch[i];
923
924 if (done || xfs_reclaim_inode_grab(ip, flags))
925 batch[i] = NULL;
926
927 /*
928 * Update the index for the next lookup. Catch
929 * overflows into the next AG range which can
930 * occur if we have inodes in the last block of
931 * the AG and we are currently pointing to the
932 * last inode.
933 *
934 * Because we may see inodes that are from the
935 * wrong AG due to RCU freeing and
936 * reallocation, only update the index if it
937 * lies in this AG. It was a race that led us
938 * to see this inode, so another lookup from
939 * the same index will not find it again.
940 */
941 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
942 pag->pag_agno)
943 continue;
944 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
945 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
946 done = 1;
947 }
948
949 /* unlock now that we've grabbed the inodes. */
950 rcu_read_unlock();
951
952 for (i = 0; i < nr_found; i++) {
953 if (!batch[i])
954 continue;
955 error = xfs_reclaim_inode(batch[i], pag, flags);
956 if (error && last_error != EFSCORRUPTED)
957 last_error = error;
958 }
959
960 *nr_to_scan -= XFS_LOOKUP_BATCH;
961
962 } while (nr_found && !done && *nr_to_scan > 0);
963
964 if (trylock && !done)
965 pag->pag_ici_reclaim_cursor = first_index;
966 else
967 pag->pag_ici_reclaim_cursor = 0;
968 mutex_unlock(&pag->pag_ici_reclaim_lock);
969 xfs_perag_put(pag);
970 }
971
972 /*
973 * If we skipped any AG and we still have scan count remaining, do
974 * another pass, this time using blocking reclaim semantics (i.e.
975 * waiting on the reclaim locks and ignoring the reclaim cursors). This
976 * ensures that when we have more reclaimers than AGs we block rather
977 * than spin trying to execute reclaim.
978 */
979 if (trylock && skipped && *nr_to_scan > 0) {
980 trylock = 0;
981 goto restart;
982 }
983 return XFS_ERROR(last_error);
984}
985
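The restart logic at the bottom of xfs_reclaim_inodes_ag() degrades from trylock to blocking semantics once a non-blocking pass has skipped AGs while scan budget remains. Stripped of the per-AG details, the control flow is roughly (struct mount and walk_ags() are hypothetical stand-ins, not the kernel's):

    struct mount;

    /* walk every AG once; returns the number of AGs skipped on this pass */
    int walk_ags(struct mount *mp, int trylock, int *nr_to_scan);

    static void reclaim_two_pass(struct mount *mp, int *nr_to_scan)
    {
            int trylock = 1;        /* pass 1: skip contended AGs */
            int skipped;

    restart:
            skipped = walk_ags(mp, trylock, nr_to_scan);
            if (trylock && skipped && *nr_to_scan > 0) {
                    trylock = 0;    /* pass 2: block on the AG locks */
                    goto restart;
            }
    }
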
871int 986int
872xfs_reclaim_inodes( 987xfs_reclaim_inodes(
873 xfs_mount_t *mp, 988 xfs_mount_t *mp,
874 int mode) 989 int mode)
875{ 990{
876 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 991 int nr_to_scan = INT_MAX;
877 XFS_ICI_RECLAIM_TAG, 1, NULL); 992
993 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
878} 994}
879 995
880/* 996/*
@@ -896,17 +1012,16 @@ xfs_reclaim_inode_shrink(
896 if (!(gfp_mask & __GFP_FS)) 1012 if (!(gfp_mask & __GFP_FS))
897 return -1; 1013 return -1;
898 1014
899 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, 1015 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
900 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 1016 /* terminate if we don't exhaust the scan */
901 /* if we don't exhaust the scan, don't bother coming back */
902 if (nr_to_scan > 0) 1017 if (nr_to_scan > 0)
903 return -1; 1018 return -1;
904 } 1019 }
905 1020
906 reclaimable = 0; 1021 reclaimable = 0;
907 ag = 0; 1022 ag = 0;
908 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, 1023 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
909 XFS_ICI_RECLAIM_TAG))) { 1024 ag = pag->pag_agno + 1;
910 reclaimable += pag->pag_ici_reclaimable; 1025 reclaimable += pag->pag_ici_reclaimable;
911 xfs_perag_put(pag); 1026 xfs_perag_put(pag);
912 } 1027 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index fe78726196f8..32ba6628290c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -47,10 +47,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
48 struct xfs_inode *ip); 48 struct xfs_inode *ip);
49 49
50int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 50int xfs_sync_inode_grab(struct xfs_inode *ip);
51int xfs_inode_ag_iterator(struct xfs_mount *mp, 51int xfs_inode_ag_iterator(struct xfs_mount *mp,
52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
53 int flags, int tag, int write_lock, int *nr_to_scan); 53 int flags);
54 54
55void xfs_inode_shrinker_register(struct xfs_mount *mp); 55void xfs_inode_shrinker_register(struct xfs_mount *mp);
56void xfs_inode_shrinker_unregister(struct xfs_mount *mp); 56void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee3cee097e7e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \
124 unsigned long caller_ip), \ 124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip)) 125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get); 126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); 127DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
128DEFINE_PERAG_REF_EVENT(xfs_perag_put); 128DEFINE_PERAG_REF_EVENT(xfs_perag_put);
129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
@@ -325,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
325DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
326DEFINE_BUF_EVENT(xfs_buf_cond_lock); 326DEFINE_BUF_EVENT(xfs_buf_cond_lock);
327DEFINE_BUF_EVENT(xfs_buf_unlock); 327DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
329DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
330DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
331DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
332DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
333DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
334DEFINE_BUF_EVENT(xfs_buf_get_noaddr); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
335DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
336DEFINE_BUF_EVENT(xfs_buf_item_relse); 335DEFINE_BUF_EVENT(xfs_buf_item_relse);
337DEFINE_BUF_EVENT(xfs_buf_item_iodone); 336DEFINE_BUF_EVENT(xfs_buf_item_iodone);
@@ -767,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
767 __field(int, curr_res) 766 __field(int, curr_res)
768 __field(int, unit_res) 767 __field(int, unit_res)
769 __field(unsigned int, flags) 768 __field(unsigned int, flags)
770 __field(void *, reserve_headq) 769 __field(int, reserveq)
771 __field(void *, write_headq) 770 __field(int, writeq)
772 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
773 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
774 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -785,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
785 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
786 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
787 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
788 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
789 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
790 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
791 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
792 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
793 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
794 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
795 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
796 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
797 ), 798 ),
798 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
799 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
800 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
801 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
802 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
803 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -808,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
808 __entry->curr_res, 809 __entry->curr_res,
809 __entry->unit_res, 810 __entry->unit_res,
810 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
811 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
812 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
813 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
814 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
815 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -836,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
838DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -843,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
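The loggrant events above now read each grant head with xlog_crack_grant_head() instead of separate cycle/bytes fields. A sketch of the presumed packing — cycle in the upper 32 bits, byte offset in the lower 32 — consistent with the head being a single atomic64 that TP_fast_assign splits into two ints (this layout is an assumption about the helper, not a quotation of it):

    #include <stdint.h>

    static void crack_grant_head(int64_t head, int *cycle, int *bytes)
    {
            *cycle = (int)(head >> 32);             /* high word: log cycle */
            *bytes = (int)(head & 0xffffffff);      /* low word: byte offset */
    }
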
@@ -936,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
936DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
937DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
938 941
939DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
940 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
941 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
942 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
943 TP_STRUCT__entry( 946 TP_STRUCT__entry(
944 __field(dev_t, dev) 947 __field(dev_t, dev)
945 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -947,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
947 __field(loff_t, new_size) 950 __field(loff_t, new_size)
948 __field(loff_t, offset) 951 __field(loff_t, offset)
949 __field(size_t, count) 952 __field(size_t, count)
950 __field(int, flags) 953 __field(int, type)
951 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
952 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
953 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -959,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
959 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
960 __entry->offset = offset; 963 __entry->offset = offset;
961 __entry->count = count; 964 __entry->count = count;
962 __entry->flags = flags; 965 __entry->type = type;
963 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
964 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
965 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
966 ), 969 ),
967 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
968 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
969 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
970 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
971 __entry->ino, 974 __entry->ino,
@@ -973,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
973 __entry->new_size, 976 __entry->new_size,
974 __entry->offset, 977 __entry->offset,
975 __entry->count, 978 __entry->count,
976 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
977 __entry->startoff, 980 __entry->startoff,
978 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
979 __entry->blockcount) 982 __entry->blockcount)
980) 983)
981 984
982#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
983DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
984 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
985 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
986 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
987DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
988DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
989DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
990 994
991DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
992 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1023,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1023 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1024DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1025DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1026 1031
1027 1032
1028TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1421,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1421 TP_PROTO(struct xfs_alloc_arg *args), \ 1426 TP_PROTO(struct xfs_alloc_arg *args), \
1422 TP_ARGS(args)) 1427 TP_ARGS(args))
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1428DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1429DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1424DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1430DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1431DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1426DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1432DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
@@ -1753,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1754DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1755 1761
1762DECLARE_EVENT_CLASS(xfs_discard_class,
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1764 xfs_agblock_t agbno, xfs_extlen_t len),
1765 TP_ARGS(mp, agno, agbno, len),
1766 TP_STRUCT__entry(
1767 __field(dev_t, dev)
1768 __field(xfs_agnumber_t, agno)
1769 __field(xfs_agblock_t, agbno)
1770 __field(xfs_extlen_t, len)
1771 ),
1772 TP_fast_assign(
1773 __entry->dev = mp->m_super->s_dev;
1774 __entry->agno = agno;
1775 __entry->agbno = agbno;
1776 __entry->len = len;
1777 ),
1778 TP_printk("dev %d:%d agno %u agbno %u len %u\n",
1779 MAJOR(__entry->dev), MINOR(__entry->dev),
1780 __entry->agno,
1781 __entry->agbno,
1782 __entry->len)
1783)
1784
1785#define DEFINE_DISCARD_EVENT(name) \
1786DEFINE_EVENT(xfs_discard_class, name, \
1787 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1788 xfs_agblock_t agbno, xfs_extlen_t len), \
1789 TP_ARGS(mp, agno, agbno, len))
1790DEFINE_DISCARD_EVENT(xfs_discard_extent);
1791DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1792DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1793DEFINE_DISCARD_EVENT(xfs_discard_busy);
1794
1756#endif /* _TRACE_XFS_H */ 1795#endif /* _TRACE_XFS_H */
1757 1796
1758#undef TRACE_INCLUDE_PATH 1797#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563a..000000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VERSION_H__
19#define __XFS_VERSION_H__
20
21/*
22 * Dummy file that can contain a timestamp to put into the
23 * XFS init string, to help users keep track of what they're
24 * running
25 */
26
27#define XFS_VERSION_STRING "SGI XFS"
28
29#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e1a2f6800e01..d22aa3103106 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
@@ -463,87 +462,68 @@ xfs_qm_dqtobp(
463 uint flags) 462 uint flags)
464{ 463{
465 xfs_bmbt_irec_t map; 464 xfs_bmbt_irec_t map;
466 int nmaps, error; 465 int nmaps = 1, error;
467 xfs_buf_t *bp; 466 xfs_buf_t *bp;
468 xfs_inode_t *quotip; 467 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
469 xfs_mount_t *mp; 468 xfs_mount_t *mp = dqp->q_mount;
470 xfs_disk_dquot_t *ddq; 469 xfs_disk_dquot_t *ddq;
471 xfs_dqid_t id; 470 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
472 boolean_t newdquot;
473 xfs_trans_t *tp = (tpp ? *tpp : NULL); 471 xfs_trans_t *tp = (tpp ? *tpp : NULL);
474 472
475 mp = dqp->q_mount; 473 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
476 id = be32_to_cpu(dqp->q_core.d_id);
477 nmaps = 1;
478 newdquot = B_FALSE;
479 474
480 /* 475 xfs_ilock(quotip, XFS_ILOCK_SHARED);
481 * If we don't know where the dquot lives, find out. 476 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
482 */
483 if (dqp->q_blkno == (xfs_daddr_t) 0) {
484 /* We use the id as an index */
485 dqp->q_fileoffset = (xfs_fileoff_t)id /
486 mp->m_quotainfo->qi_dqperchunk;
487 nmaps = 1;
488 quotip = XFS_DQ_TO_QIP(dqp);
489 xfs_ilock(quotip, XFS_ILOCK_SHARED);
490 /* 477 /*
491 * Return if this type of quotas is turned off while we didn't 478 * Return if this type of quota was turned off while we
492 * have an inode lock 479 * didn't have the quota inode lock.
493 */ 480 */
494 if (XFS_IS_THIS_QUOTA_OFF(dqp)) { 481 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
495 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 482 return ESRCH;
496 return (ESRCH); 483 }
497 } 484
485 /*
486 * Find the block map; no allocations yet
487 */
488 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
489 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
490 NULL, 0, &map, &nmaps, NULL);
491
492 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
493 if (error)
494 return error;
495
496 ASSERT(nmaps == 1);
497 ASSERT(map.br_blockcount == 1);
498
499 /*
500 * Offset of the dquot in the (fixed-size) dquot chunk.
501 */
502 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
503 sizeof(xfs_dqblk_t);
504
505 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
506 if (map.br_startblock == HOLESTARTBLOCK) {
498 /* 507 /*
499 * Find the block map; no allocations yet 508 * We don't allocate unless we're asked to
500 */ 509 */
501 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 510 if (!(flags & XFS_QMOPT_DQALLOC))
502 XFS_DQUOT_CLUSTER_SIZE_FSB, 511 return ENOENT;
503 XFS_BMAPI_METADATA,
504 NULL, 0, &map, &nmaps, NULL);
505 512
506 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 513 ASSERT(tp);
514 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
515 dqp->q_fileoffset, &bp);
507 if (error) 516 if (error)
508 return (error); 517 return error;
509 ASSERT(nmaps == 1); 518 tp = *tpp;
510 ASSERT(map.br_blockcount == 1); 519 } else {
520 trace_xfs_dqtobp_read(dqp);
511 521
512 /* 522 /*
513 * offset of dquot in the (fixed sized) dquot chunk. 523 * store the blkno etc so that we don't have to do the
524 * mapping all the time
514 */ 525 */
515 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * 526 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
516 sizeof(xfs_dqblk_t);
517 if (map.br_startblock == HOLESTARTBLOCK) {
518 /*
519 * We don't allocate unless we're asked to
520 */
521 if (!(flags & XFS_QMOPT_DQALLOC))
522 return (ENOENT);
523
524 ASSERT(tp);
525 if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
526 dqp->q_fileoffset, &bp)))
527 return (error);
528 tp = *tpp;
529 newdquot = B_TRUE;
530 } else {
531 /*
532 * store the blkno etc so that we don't have to do the
533 * mapping all the time
534 */
535 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
536 }
537 }
538 ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
539 ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
540
541 /*
542 * Read in the buffer, unless we've just done the allocation
543 * (in which case we already have the buf).
544 */
545 if (!newdquot) {
546 trace_xfs_dqtobp_read(dqp);
547 527
548 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 528 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
549 dqp->q_blkno, 529 dqp->q_blkno,
@@ -552,13 +532,14 @@ xfs_qm_dqtobp(
552 if (error || !bp) 532 if (error || !bp)
553 return XFS_ERROR(error); 533 return XFS_ERROR(error);
554 } 534 }
535
555 ASSERT(XFS_BUF_ISBUSY(bp)); 536 ASSERT(XFS_BUF_ISBUSY(bp));
556 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 537 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
557 538
558 /* 539 /*
559 * calculate the location of the dquot inside the buffer. 540 * calculate the location of the dquot inside the buffer.
560 */ 541 */
561 ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); 542 ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
562 543
563 /* 544 /*
564 * A simple sanity check in case we got a corrupted dquot... 545 * A simple sanity check in case we got a corrupted dquot...
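The rewritten xfs_qm_dqtobp() derives the dquot's location purely from its id: the quota-file block is id / qi_dqperchunk, and the byte offset within that chunk is (id % qi_dqperchunk) * sizeof(xfs_dqblk_t). Schematically, with the per-mount values passed in as parameters (this helper is illustrative, not kernel code):

    #include <stddef.h>
    #include <stdint.h>

    static void dquot_location(uint32_t id, uint32_t dqperchunk,
                               size_t dqblk_size,
                               uint64_t *fileoffset, size_t *bufoffset)
    {
            *fileoffset = id / dqperchunk;                  /* chunk index */
            *bufoffset = (id % dqperchunk) * dqblk_size;    /* offset in chunk */
    }

For example, assuming a dqperchunk of 30, id 65 lands in chunk 2 as record 5 (zero-based) within that chunk.
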
@@ -1176,18 +1157,18 @@ xfs_qm_dqflush(
1176 xfs_dquot_t *dqp, 1157 xfs_dquot_t *dqp,
1177 uint flags) 1158 uint flags)
1178{ 1159{
1179 xfs_mount_t *mp; 1160 struct xfs_mount *mp = dqp->q_mount;
1180 xfs_buf_t *bp; 1161 struct xfs_buf *bp;
1181 xfs_disk_dquot_t *ddqp; 1162 struct xfs_disk_dquot *ddqp;
1182 int error; 1163 int error;
1183 1164
1184 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1165 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1185 ASSERT(!completion_done(&dqp->q_flush)); 1166 ASSERT(!completion_done(&dqp->q_flush));
1167
1186 trace_xfs_dqflush(dqp); 1168 trace_xfs_dqflush(dqp);
1187 1169
1188 /* 1170 /*
1189 * If not dirty, or it's pinned and we are not supposed to 1171 * If not dirty, or it's pinned and we are not supposed to block, nada.
1190 * block, nada.
1191 */ 1172 */
1192 if (!XFS_DQ_IS_DIRTY(dqp) || 1173 if (!XFS_DQ_IS_DIRTY(dqp) ||
1193 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { 1174 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
@@ -1201,40 +1182,46 @@ xfs_qm_dqflush(
1201 * down forcibly. If that's the case we must not write this dquot 1182 * down forcibly. If that's the case we must not write this dquot
1202 * to disk, because the log record didn't make it to disk! 1183 * to disk, because the log record didn't make it to disk!
1203 */ 1184 */
1204 if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { 1185 if (XFS_FORCED_SHUTDOWN(mp)) {
1205 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1186 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1206 xfs_dqfunlock(dqp); 1187 xfs_dqfunlock(dqp);
1207 return XFS_ERROR(EIO); 1188 return XFS_ERROR(EIO);
1208 } 1189 }
1209 1190
1210 /* 1191 /*
1211 * Get the buffer containing the on-disk dquot 1192 * Get the buffer containing the on-disk dquot
1212 * We don't need a transaction envelope because we know that the
1213 * ondisk-dquot has already been allocated for.
1214 */ 1193 */
1215 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { 1194 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
1195 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1196 if (error) {
1216 ASSERT(error != ENOENT); 1197 ASSERT(error != ENOENT);
1217 /*
1218 * Quotas could have gotten turned off (ESRCH)
1219 */
1220 xfs_dqfunlock(dqp); 1198 xfs_dqfunlock(dqp);
1221 return (error); 1199 return error;
1222 } 1200 }
1223 1201
1224 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 1202 /*
1225 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1203 * Calculate the location of the dquot inside the buffer.
1226 xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE); 1204 */
1205 ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
1206
1207 /*
1208 * A simple sanity check in case we got a corrupted dquot...
1209 */
1210 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1211 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
1212 xfs_buf_relse(bp);
1213 xfs_dqfunlock(dqp);
1214 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1227 return XFS_ERROR(EIO); 1215 return XFS_ERROR(EIO);
1228 } 1216 }
1229 1217
1230 /* This is the only portion of data that needs to persist */ 1218 /* This is the only portion of data that needs to persist */
1231 memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); 1219 memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
1232 1220
1233 /* 1221 /*
1234 * Clear the dirty field and remember the flush lsn for later use. 1222 * Clear the dirty field and remember the flush lsn for later use.
1235 */ 1223 */
1236 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1224 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1237 mp = dqp->q_mount;
1238 1225
1239 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, 1226 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1240 &dqp->q_logitem.qli_item.li_lsn); 1227 &dqp->q_logitem.qli_item.li_lsn);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9a92407109a1..206a2815ced6 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,8 +55,6 @@ uint ndquot;
55kmem_zone_t *qm_dqzone; 55kmem_zone_t *qm_dqzone;
56kmem_zone_t *qm_dqtrxzone; 56kmem_zone_t *qm_dqtrxzone;
57 57
58static cred_t xfs_zerocr;
59
60STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 58STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
61STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 59STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
62 60
@@ -837,7 +835,7 @@ xfs_qm_dqattach_locked(
837 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, 835 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
838 flags & XFS_QMOPT_DQALLOC, 836 flags & XFS_QMOPT_DQALLOC,
839 ip->i_udquot, &ip->i_gdquot) : 837 ip->i_udquot, &ip->i_gdquot) :
840 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, 838 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
841 flags & XFS_QMOPT_DQALLOC, 839 flags & XFS_QMOPT_DQALLOC,
842 ip->i_udquot, &ip->i_gdquot); 840 ip->i_udquot, &ip->i_gdquot);
843 /* 841 /*
@@ -1199,87 +1197,6 @@ xfs_qm_list_destroy(
1199 mutex_destroy(&(list->qh_lock)); 1197 mutex_destroy(&(list->qh_lock));
1200} 1198}
1201 1199
1202
1203/*
1204 * Stripped down version of dqattach. This doesn't attach, or even look at the
1205 * dquots attached to the inode. The rationale is that there won't be any
1206 * attached at the time this is called from quotacheck.
1207 */
1208STATIC int
1209xfs_qm_dqget_noattach(
1210 xfs_inode_t *ip,
1211 xfs_dquot_t **O_udqpp,
1212 xfs_dquot_t **O_gdqpp)
1213{
1214 int error;
1215 xfs_mount_t *mp;
1216 xfs_dquot_t *udqp, *gdqp;
1217
1218 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1219 mp = ip->i_mount;
1220 udqp = NULL;
1221 gdqp = NULL;
1222
1223 if (XFS_IS_UQUOTA_ON(mp)) {
1224 ASSERT(ip->i_udquot == NULL);
1225 /*
1226 * We want the dquot allocated if it doesn't exist.
1227 */
1228 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
1229 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
1230 &udqp))) {
1231 /*
1232 * Shouldn't be able to turn off quotas here.
1233 */
1234 ASSERT(error != ESRCH);
1235 ASSERT(error != ENOENT);
1236 return error;
1237 }
1238 ASSERT(udqp);
1239 }
1240
1241 if (XFS_IS_OQUOTA_ON(mp)) {
1242 ASSERT(ip->i_gdquot == NULL);
1243 if (udqp)
1244 xfs_dqunlock(udqp);
1245 error = XFS_IS_GQUOTA_ON(mp) ?
1246 xfs_qm_dqget(mp, ip,
1247 ip->i_d.di_gid, XFS_DQ_GROUP,
1248 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1249 &gdqp) :
1250 xfs_qm_dqget(mp, ip,
1251 ip->i_d.di_projid, XFS_DQ_PROJ,
1252 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1253 &gdqp);
1254 if (error) {
1255 if (udqp)
1256 xfs_qm_dqrele(udqp);
1257 ASSERT(error != ESRCH);
1258 ASSERT(error != ENOENT);
1259 return error;
1260 }
1261 ASSERT(gdqp);
1262
1263 /* Reacquire the locks in the right order */
1264 if (udqp) {
1265 if (! xfs_qm_dqlock_nowait(udqp)) {
1266 xfs_dqunlock(gdqp);
1267 xfs_dqlock(udqp);
1268 xfs_dqlock(gdqp);
1269 }
1270 }
1271 }
1272
1273 *O_udqpp = udqp;
1274 *O_gdqpp = gdqp;
1275
1276#ifdef QUOTADEBUG
1277 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
1278 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
1279#endif
1280 return 0;
1281}
1282
1283/* 1200/*
1284 * Create an inode and return with a reference already taken, but unlocked 1201 * Create an inode and return with a reference already taken, but unlocked
1285 * This is how we create quota inodes 1202 * This is how we create quota inodes
@@ -1305,8 +1222,8 @@ xfs_qm_qino_alloc(
1305 return error; 1222 return error;
1306 } 1223 }
1307 1224
1308 if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 1225 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
1309 &xfs_zerocr, 0, 1, ip, &committed))) { 1226 if (error) {
1310 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1227 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1311 XFS_TRANS_ABORT); 1228 XFS_TRANS_ABORT);
1312 return error; 1229 return error;
@@ -1516,7 +1433,7 @@ xfs_qm_dqiterate(
1516 rablkcnt = map[i+1].br_blockcount; 1433 rablkcnt = map[i+1].br_blockcount;
1517 rablkno = map[i+1].br_startblock; 1434 rablkno = map[i+1].br_startblock;
1518 while (rablkcnt--) { 1435 while (rablkcnt--) {
1519 xfs_baread(mp->m_ddev_targp, 1436 xfs_buf_readahead(mp->m_ddev_targp,
1520 XFS_FSB_TO_DADDR(mp, rablkno), 1437 XFS_FSB_TO_DADDR(mp, rablkno),
1521 mp->m_quotainfo->qi_dqchunklen); 1438 mp->m_quotainfo->qi_dqchunklen);
1522 rablkno++; 1439 rablkno++;
@@ -1546,18 +1463,34 @@ xfs_qm_dqiterate(
1546 1463
1547/* 1464/*
1548 * Called by dqusage_adjust in doing a quotacheck. 1465 * Called by dqusage_adjust in doing a quotacheck.
1549 * Given the inode, and a dquot (either USR or GRP, doesn't matter), 1466 *
1550 * this updates its incore copy as well as the buffer copy. This is 1467 * Given the inode and a dquot id, this updates both the incore dquot as well
1551 * so that once the quotacheck is done, we can just log all the buffers, 1468 * as the buffer copy. This is so that once the quotacheck is done, we can
1552 * as opposed to logging numerous updates to individual dquots. 1469 * just log all the buffers, as opposed to logging numerous updates to
1470 * individual dquots.
1553 */ 1471 */
1554STATIC void 1472STATIC int
1555xfs_qm_quotacheck_dqadjust( 1473xfs_qm_quotacheck_dqadjust(
1556 xfs_dquot_t *dqp, 1474 struct xfs_inode *ip,
1475 xfs_dqid_t id,
1476 uint type,
1557 xfs_qcnt_t nblks, 1477 xfs_qcnt_t nblks,
1558 xfs_qcnt_t rtblks) 1478 xfs_qcnt_t rtblks)
1559{ 1479{
1560 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1480 struct xfs_mount *mp = ip->i_mount;
1481 struct xfs_dquot *dqp;
1482 int error;
1483
1484 error = xfs_qm_dqget(mp, ip, id, type,
1485 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
1486 if (error) {
1487 /*
1488 * Shouldn't be able to turn off quotas here.
1489 */
1490 ASSERT(error != ESRCH);
1491 ASSERT(error != ENOENT);
1492 return error;
1493 }
1561 1494
1562 trace_xfs_dqadjust(dqp); 1495 trace_xfs_dqadjust(dqp);
1563 1496
@@ -1582,11 +1515,13 @@ xfs_qm_quotacheck_dqadjust(
1582 * There are no timers for the default values set in the root dquot. 1515 * There are no timers for the default values set in the root dquot.
1583 */ 1516 */
1584 if (dqp->q_core.d_id) { 1517 if (dqp->q_core.d_id) {
1585 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1518 xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
1586 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1519 xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
1587 } 1520 }
1588 1521
1589 dqp->dq_flags |= XFS_DQ_DIRTY; 1522 dqp->dq_flags |= XFS_DQ_DIRTY;
1523 xfs_qm_dqput(dqp);
1524 return 0;
1590} 1525}
1591 1526
1592STATIC int 1527STATIC int
@@ -1629,8 +1564,7 @@ xfs_qm_dqusage_adjust(
1629 int *res) /* result code value */ 1564 int *res) /* result code value */
1630{ 1565{
1631 xfs_inode_t *ip; 1566 xfs_inode_t *ip;
1632 xfs_dquot_t *udqp, *gdqp; 1567 xfs_qcnt_t nblks, rtblks = 0;
1633 xfs_qcnt_t nblks, rtblks;
1634 int error; 1568 int error;
1635 1569
1636 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1570 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1650,51 +1584,24 @@ xfs_qm_dqusage_adjust(
1650 * the case in all other instances. It's OK that we do this because 1584 * the case in all other instances. It's OK that we do this because
1651 * quotacheck is done only at mount time. 1585 * quotacheck is done only at mount time.
1652 */ 1586 */
1653 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { 1587 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
1588 if (error) {
1654 *res = BULKSTAT_RV_NOTHING; 1589 *res = BULKSTAT_RV_NOTHING;
1655 return error; 1590 return error;
1656 } 1591 }
1657 1592
1658 /* 1593 ASSERT(ip->i_delayed_blks == 0);
1659 * Obtain the locked dquots. In case of an error (eg. allocation
1660 * fails for ENOSPC), we return the negative of the error number
1661 * to bulkstat, so that it can get propagated to quotacheck() and
1662 * making us disable quotas for the file system.
1663 */
1664 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1665 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1666 IRELE(ip);
1667 *res = BULKSTAT_RV_GIVEUP;
1668 return error;
1669 }
1670 1594
1671 rtblks = 0; 1595 if (XFS_IS_REALTIME_INODE(ip)) {
1672 if (! XFS_IS_REALTIME_INODE(ip)) {
1673 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
1674 } else {
1675 /* 1596 /*
1676 * Walk thru the extent list and count the realtime blocks. 1597 * Walk thru the extent list and count the realtime blocks.
1677 */ 1598 */
1678 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { 1599 error = xfs_qm_get_rtblks(ip, &rtblks);
1679 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1600 if (error)
1680 IRELE(ip); 1601 goto error0;
1681 if (udqp)
1682 xfs_qm_dqput(udqp);
1683 if (gdqp)
1684 xfs_qm_dqput(gdqp);
1685 *res = BULKSTAT_RV_GIVEUP;
1686 return error;
1687 }
1688 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1689 } 1602 }
1690 ASSERT(ip->i_delayed_blks == 0);
1691 1603
1692 /* 1604 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1693 * We can't release the inode while holding its dquot locks.
1694 * The inode can go into inactive and might try to acquire the dquotlocks.
1695 * So, just unlock here and do a vn_rele at the end.
1696 */
1697 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1698 1605
1699 /* 1606 /*
1700 * Add the (disk blocks and inode) resources occupied by this 1607 * Add the (disk blocks and inode) resources occupied by this
@@ -1709,26 +1616,36 @@ xfs_qm_dqusage_adjust(
1709 * and quotaoffs don't race. (Quotachecks happen at mount time only). 1616 * and quotaoffs don't race. (Quotachecks happen at mount time only).
1710 */ 1617 */
1711 if (XFS_IS_UQUOTA_ON(mp)) { 1618 if (XFS_IS_UQUOTA_ON(mp)) {
1712 ASSERT(udqp); 1619 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
1713 xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); 1620 XFS_DQ_USER, nblks, rtblks);
1714 xfs_qm_dqput(udqp); 1621 if (error)
1622 goto error0;
1715 } 1623 }
1716 if (XFS_IS_OQUOTA_ON(mp)) { 1624
1717 ASSERT(gdqp); 1625 if (XFS_IS_GQUOTA_ON(mp)) {
1718 xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); 1626 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
1719 xfs_qm_dqput(gdqp); 1627 XFS_DQ_GROUP, nblks, rtblks);
1628 if (error)
1629 goto error0;
1720 } 1630 }
1721 /*
1722 * Now release the inode. This will send it to 'inactive', and
1723 * possibly even free blocks.
1724 */
1725 IRELE(ip);
1726 1631
1727 /* 1632 if (XFS_IS_PQUOTA_ON(mp)) {
1728 * Goto next inode. 1633 error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
1729 */ 1634 XFS_DQ_PROJ, nblks, rtblks);
1635 if (error)
1636 goto error0;
1637 }
1638
1639 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1640 IRELE(ip);
1730 *res = BULKSTAT_RV_DIDONE; 1641 *res = BULKSTAT_RV_DIDONE;
1731 return 0; 1642 return 0;
1643
1644error0:
1645 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1646 IRELE(ip);
1647 *res = BULKSTAT_RV_GIVEUP;
1648 return error;
1732} 1649}
1733 1650
1734/* 1651/*
@@ -1946,12 +1863,14 @@ xfs_qm_dqreclaim_one(void)
1946 xfs_dquot_t *dqpout; 1863 xfs_dquot_t *dqpout;
1947 xfs_dquot_t *dqp; 1864 xfs_dquot_t *dqp;
1948 int restarts; 1865 int restarts;
1866 int startagain;
1949 1867
1950 restarts = 0; 1868 restarts = 0;
1951 dqpout = NULL; 1869 dqpout = NULL;
1952 1870
1953 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1871 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1954startagain: 1872again:
1873 startagain = 0;
1955 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1874 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1956 1875
1957 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1876 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1968,13 +1887,10 @@ startagain:
1968 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1887 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1969 1888
1970 trace_xfs_dqreclaim_want(dqp); 1889 trace_xfs_dqreclaim_want(dqp);
1971
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1890 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1977 goto startagain; 1891 restarts++;
1892 startagain = 1;
1893 goto dqunlock;
1978 } 1894 }
1979 1895
1980 /* 1896 /*
@@ -1989,23 +1905,20 @@ startagain:
1989 ASSERT(list_empty(&dqp->q_mplist)); 1905 ASSERT(list_empty(&dqp->q_mplist));
1990 list_del_init(&dqp->q_freelist); 1906 list_del_init(&dqp->q_freelist);
1991 xfs_Gqm->qm_dqfrlist_cnt--; 1907 xfs_Gqm->qm_dqfrlist_cnt--;
1992 xfs_dqunlock(dqp);
1993 dqpout = dqp; 1908 dqpout = dqp;
1994 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1909 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1995 break; 1910 goto dqunlock;
1996 } 1911 }
1997 1912
1998 ASSERT(dqp->q_hash); 1913 ASSERT(dqp->q_hash);
1999 ASSERT(!list_empty(&dqp->q_mplist)); 1914 ASSERT(!list_empty(&dqp->q_mplist));
2000 1915
2001 /* 1916 /*
2002 * Try to grab the flush lock. If this dquot is in the process of 1917 * Try to grab the flush lock. If this dquot is in the process
2003 * getting flushed to disk, we don't want to reclaim it. 1918 * of getting flushed to disk, we don't want to reclaim it.
2004 */ 1919 */
2005 if (!xfs_dqflock_nowait(dqp)) { 1920 if (!xfs_dqflock_nowait(dqp))
2006 xfs_dqunlock(dqp); 1921 goto dqunlock;
2007 continue;
2008 }
2009 1922
2010 /* 1923 /*
2011 * We have the flush lock so we know that this is not in the 1924 * We have the flush lock so we know that this is not in the
@@ -2027,8 +1940,7 @@ startagain:
2027 xfs_fs_cmn_err(CE_WARN, mp, 1940 xfs_fs_cmn_err(CE_WARN, mp,
2028 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1941 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2029 } 1942 }
2030 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1943 goto dqunlock;
2031 continue;
2032 } 1944 }
2033 1945
2034 /* 1946 /*
@@ -2050,13 +1962,8 @@ startagain:
2050 */ 1962 */
2051 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1963 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
2052 restarts++; 1964 restarts++;
2053 mutex_unlock(&dqp->q_hash->qh_lock); 1965 startagain = 1;
2054 xfs_dqfunlock(dqp); 1966 goto qhunlock;
2055 xfs_dqunlock(dqp);
2056 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2057 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
2058 return NULL;
2059 goto startagain;
2060 } 1967 }
2061 1968
2062 ASSERT(dqp->q_nrefs == 0); 1969 ASSERT(dqp->q_nrefs == 0);
@@ -2069,14 +1976,20 @@ startagain:
2069 xfs_Gqm->qm_dqfrlist_cnt--; 1976 xfs_Gqm->qm_dqfrlist_cnt--;
2070 dqpout = dqp; 1977 dqpout = dqp;
2071 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1978 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1979qhunlock:
2072 mutex_unlock(&dqp->q_hash->qh_lock); 1980 mutex_unlock(&dqp->q_hash->qh_lock);
2073dqfunlock: 1981dqfunlock:
2074 xfs_dqfunlock(dqp); 1982 xfs_dqfunlock(dqp);
1983dqunlock:
2075 xfs_dqunlock(dqp); 1984 xfs_dqunlock(dqp);
2076 if (dqpout) 1985 if (dqpout)
2077 break; 1986 break;
2078 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1987 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2079 return NULL; 1988 break;
1989 if (startagain) {
1990 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1991 goto again;
1992 }
2080 } 1993 }
2081 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1994 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2082 return dqpout; 1995 return dqpout;
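The reclaim hunks above collapse four duplicated unlock sequences into a single exit path: the qhunlock, dqfunlock and dqunlock labels release locks in reverse acquisition order, and the restart decision moves after the unlocks. A minimal userspace sketch of that stacked-label pattern, with illustrative lock and function names (not the XFS ones):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t item_lock = PTHREAD_MUTEX_INITIALIZER;

int reclaim_one(bool need_hash, bool want_item)
{
	int ret = -1;

	pthread_mutex_lock(&list_lock);
	pthread_mutex_lock(&item_lock);

	if (!want_item)
		goto out_item;		/* unwind from here down */

	if (need_hash) {
		pthread_mutex_lock(&hash_lock);
		ret = 0;		/* claimed the item */
		goto out_hash;		/* unwind all three locks */
	}

	ret = 0;
	goto out_item;

out_hash:
	pthread_mutex_unlock(&hash_lock);
out_item:
	pthread_mutex_unlock(&item_lock);
	pthread_mutex_unlock(&list_lock);
	return ret;
}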
@@ -2224,7 +2137,7 @@ xfs_qm_write_sb_changes(
2224 2137
2225 2138
2226/* 2139/*
2227 * Given an inode, a uid and gid (from cred_t) make sure that we have 2140 * Given an inode, a uid, gid and prid make sure that we have
2228 * allocated relevant dquot(s) on disk, and that we won't exceed inode 2141 * allocated relevant dquot(s) on disk, and that we won't exceed inode
2229 * quotas by creating this file. 2142 * quotas by creating this file.
2230 * This also attaches dquot(s) to the given inode after locking it, 2143 * This also attaches dquot(s) to the given inode after locking it,
@@ -2332,7 +2245,7 @@ xfs_qm_vop_dqalloc(
2332 xfs_dqunlock(gq); 2245 xfs_dqunlock(gq);
2333 } 2246 }
2334 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 2247 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
2335 if (ip->i_d.di_projid != prid) { 2248 if (xfs_get_projid(ip) != prid) {
2336 xfs_iunlock(ip, lockflags); 2249 xfs_iunlock(ip, lockflags);
2337 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 2250 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
2338 XFS_DQ_PROJ, 2251 XFS_DQ_PROJ,
@@ -2454,7 +2367,7 @@ xfs_qm_vop_chown_reserve(
2454 } 2367 }
2455 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 2368 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
2456 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 2369 if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
2457 ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) 2370 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
2458 prjflags = XFS_QMOPT_ENOSPC; 2371 prjflags = XFS_QMOPT_ENOSPC;
2459 2372
2460 if (prjflags || 2373 if (prjflags ||
@@ -2558,7 +2471,7 @@ xfs_qm_vop_create_dqattach(
2558 ip->i_gdquot = gdqp; 2471 ip->i_gdquot = gdqp;
2559 ASSERT(XFS_IS_OQUOTA_ON(mp)); 2472 ASSERT(XFS_IS_OQUOTA_ON(mp));
2560 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 2473 ASSERT((XFS_IS_GQUOTA_ON(mp) ?
2561 ip->i_d.di_gid : ip->i_d.di_projid) == 2474 ip->i_d.di_gid : xfs_get_projid(ip)) ==
2562 be32_to_cpu(gdqp->q_core.d_id)); 2475 be32_to_cpu(gdqp->q_core.d_id));
2563 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2476 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
2564 } 2477 }
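Several hunks in this file replace direct ip->i_d.di_projid reads with xfs_get_projid(ip). The accessor reassembles a 32-bit project ID from two 16-bit on-disk halves, so callers stop depending on the raw field layout. A sketch of the idea, with illustrative struct and field names:

#include <stdint.h>

struct demo_icdinode {
	uint16_t	di_projid_hi;	/* high 16 bits of project id */
	uint16_t	di_projid_lo;	/* low 16 bits of project id */
};

static inline uint32_t demo_get_projid(const struct demo_icdinode *d)
{
	return ((uint32_t)d->di_projid_hi << 16) | d->di_projid_lo;
}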
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bea02d786c5d..45b5cb1788ab 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -81,7 +81,7 @@ xfs_qm_statvfs(
81 xfs_mount_t *mp = ip->i_mount; 81 xfs_mount_t *mp = ip->i_mount;
82 xfs_dquot_t *dqp; 82 xfs_dquot_t *dqp;
83 83
84 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { 84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); 85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
86 xfs_qm_dqput(dqp); 86 xfs_qm_dqput(dqp);
87 } 87 }
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 45e5849df238..bdebc183223e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -276,7 +276,7 @@ xfs_qm_scall_trunc_qfile(
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
279 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 279 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
281 281
282out_unlock: 282out_unlock:
@@ -875,21 +875,14 @@ xfs_dqrele_inode(
875 struct xfs_perag *pag, 875 struct xfs_perag *pag,
876 int flags) 876 int flags)
877{ 877{
878 int error;
879
880 /* skip quota inodes */ 878 /* skip quota inodes */
881 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 879 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
882 ip == ip->i_mount->m_quotainfo->qi_gquotaip) { 880 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
883 ASSERT(ip->i_udquot == NULL); 881 ASSERT(ip->i_udquot == NULL);
884 ASSERT(ip->i_gdquot == NULL); 882 ASSERT(ip->i_gdquot == NULL);
885 read_unlock(&pag->pag_ici_lock);
886 return 0; 883 return 0;
887 } 884 }
888 885
889 error = xfs_sync_inode_valid(ip, pag);
890 if (error)
891 return error;
892
893 xfs_ilock(ip, XFS_ILOCK_EXCL); 886 xfs_ilock(ip, XFS_ILOCK_EXCL);
894 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 887 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
895 xfs_qm_dqrele(ip->i_udquot); 888 xfs_qm_dqrele(ip->i_udquot);
@@ -900,8 +893,6 @@ xfs_dqrele_inode(
900 ip->i_gdquot = NULL; 893 ip->i_gdquot = NULL;
901 } 894 }
902 xfs_iunlock(ip, XFS_ILOCK_EXCL); 895 xfs_iunlock(ip, XFS_ILOCK_EXCL);
903
904 IRELE(ip);
905 return 0; 896 return 0;
906} 897}
907 898
@@ -918,8 +909,7 @@ xfs_qm_dqrele_all_inodes(
918 uint flags) 909 uint flags)
919{ 910{
920 ASSERT(mp->m_quotainfo); 911 ASSERT(mp->m_quotainfo);
921 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, 912 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
922 XFS_ICI_NO_TAG, 0, NULL);
923} 913}
924 914
925/*------------------------------------------------------------------------*/ 915/*------------------------------------------------------------------------*/
@@ -1175,7 +1165,7 @@ xfs_qm_internalqcheck_adjust(
1175 } 1165 }
1176 xfs_qm_internalqcheck_get_dquots(mp, 1166 xfs_qm_internalqcheck_get_dquots(mp,
1177 (xfs_dqid_t) ip->i_d.di_uid, 1167 (xfs_dqid_t) ip->i_d.di_uid,
1178 (xfs_dqid_t) ip->i_d.di_projid, 1168 (xfs_dqid_t) xfs_get_projid(ip),
1179 (xfs_dqid_t) ip->i_d.di_gid, 1169 (xfs_dqid_t) ip->i_d.di_gid,
1180 &ud, &gd); 1170 &ud, &gd);
1181 if (XFS_IS_UQUOTA_ON(mp)) { 1171 if (XFS_IS_UQUOTA_ON(mp)) {
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a47..0df88897ef84 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,86 +25,78 @@
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_error.h" 26#include "xfs_error.h"
27 27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void 28void
40cmn_err(register int level, char *fmt, ...) 29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
41{ 33{
42 char *fp = fmt; 34 struct va_format vaf;
43 int len; 35 va_list args;
44 ulong flags; 36
45 va_list ap; 37 va_start(args, fmt);
46 38 vaf.fmt = fmt;
47 level &= XFS_ERR_MASK; 39 vaf.va = &args;
48 if (level > XFS_MAX_ERR_LEVEL) 40
49 level = XFS_MAX_ERR_LEVEL; 41 printk("%s%pV", lvl, &vaf);
50 spin_lock_irqsave(&xfs_err_lock,flags); 42 va_end(args);
51 va_start(ap, fmt); 43
52 if (*fmt == '!') fp++; 44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62} 45}
63 46
64void 47void
65xfs_fs_vcmn_err( 48xfs_fs_cmn_err(
66 int level, 49 const char *lvl,
67 struct xfs_mount *mp, 50 struct xfs_mount *mp,
68 char *fmt, 51 const char *fmt,
69 va_list ap) 52 ...)
70{ 53{
71 unsigned long flags; 54 struct va_format vaf;
72 int len = 0; 55 va_list args;
73 56
74 level &= XFS_ERR_MASK; 57 va_start(args, fmt);
75 if (level > XFS_MAX_ERR_LEVEL) 58 vaf.fmt = fmt;
76 level = XFS_MAX_ERR_LEVEL; 59 vaf.va = &args;
77 60
78 spin_lock_irqsave(&xfs_err_lock,flags); 61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
79 63
80 if (mp) { 64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); 65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
82 79
83 /* 80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
84 * Skip the printk if we can't print anything useful 81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
85 * due to an over-long device name. 82 do_panic = 1;
86 */
87 if (len >= sizeof(message))
88 goto out;
89 } 83 }
90 84
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); 85 va_start(args, fmt);
92 if (len >= sizeof(message)) 86 vaf.fmt = fmt;
93 len = sizeof(message) - 1; 87 vaf.va = &args;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96 88
97 printk("%s%s\n", err_level[level], message); 89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
98 out: 90 va_end(args);
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100 91
101 BUG_ON(level == CE_PANIC); 92 BUG_ON(do_panic);
102} 93}
103 94
104void 95void
105assfail(char *expr, char *file, int line) 96assfail(char *expr, char *file, int line)
106{ 97{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); 98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
108 BUG(); 100 BUG();
109} 101}
110 102
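The rewritten cmn_err() above forwards the caller's format and arguments to printk() in one call via struct va_format and the kernel's %pV extension, dropping the static buffer and spinlock. A userspace approximation of the same forwarding, using vfprintf() since plain C has no %pV:

#include <stdarg.h>
#include <stdio.h>

static void cmn_err_sketch(const char *lvl, const char *fmt, ...)
{
	va_list args;

	fputs(lvl, stderr);		/* e.g. "<3>" for an error level */
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);	/* single pass, no fixed-size buffer */
	va_end(args);
}

int main(void)
{
	cmn_err_sketch("<4>", "dquot %p flush failed\n", (void *)0);
	return 0;
}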
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4f..05699f67d475 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
20 20
21#include <stdarg.h> 21#include <stdarg.h>
22 22
23#define CE_DEBUG 7 /* debug */ 23struct xfs_mount;
24#define CE_CONT 6 /* continuation */ 24
25#define CE_NOTE 5 /* notice */ 25#define CE_DEBUG KERN_DEBUG
26#define CE_WARN 4 /* warning */ 26#define CE_CONT KERN_INFO
27#define CE_ALERT 1 /* alert */ 27#define CE_NOTE KERN_NOTICE
28#define CE_PANIC 0 /* panic */ 28#define CE_WARN KERN_WARNING
29 29#define CE_ALERT KERN_ALERT
30extern void cmn_err(int, char *, ...) 30#define CE_PANIC KERN_EMERG
31 __attribute__ ((format (printf, 2, 3))); 31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
32extern void assfail(char *expr, char *f, int l); 39extern void assfail(char *expr, char *f, int l);
33 40
34#define ASSERT_ALWAYS(expr) \ 41#define ASSERT_ALWAYS(expr) \
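The prototypes above keep (and extend) the printf-style format attribute, so the compiler type-checks arguments against the format string at every call site. A small sketch of how the attribute is applied and what it buys; index 2 names the format-string parameter, index 3 the first checked argument:

#include <stdarg.h>
#include <stdio.h>

void log_msg(const char *lvl, const char *fmt, ...)
	__attribute__ ((format (printf, 2, 3)));

void log_msg(const char *lvl, const char *fmt, ...)
{
	va_list ap;

	(void)lvl;
	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
}

/* log_msg("<6>", "%s", 42); -- would now draw a -Wformat warning */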
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d7..11dd72070cbb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
43 43
44#ifdef CONFIG_XFS_POSIX_ACL 44#ifdef CONFIG_XFS_POSIX_ACL
45extern int xfs_check_acl(struct inode *inode, int mask); 45extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); 46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); 47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
48extern int xfs_acl_chmod(struct inode *inode); 48extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4eed4ed..58632cc17f2d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,9 +227,18 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 rwlock_t pag_ici_lock; /* incore inode lock */ 230 spinlock_t pag_ici_lock; /* incore inode cache lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
233 struct mutex pag_ici_reclaim_lock; /* serialisation point */
234 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
235
236 /* buffer cache index */
237 spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
238 struct rb_root pag_buf_tree; /* ordered tree of active buffers */
239
240 /* for rcu-safe freeing */
241 struct rcu_head rcu_head;
233#endif 242#endif
234 int pagb_count; /* pagb slots in use */ 243 int pagb_count; /* pagb slots in use */
235} xfs_perag_t; 244} xfs_perag_t;
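Among the new fields, the rcu_head lets a perag be freed only after concurrent RCU readers are done with it. A kernel-style sketch of that deferred-free pattern; the struct and helper names here are made up, while call_rcu() and container_of() are the real APIs:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_pag {
	int			agno;
	struct rcu_head		rcu_head;	/* for rcu-safe freeing */
};

static void demo_pag_free_rcu(struct rcu_head *head)
{
	struct demo_pag *pag = container_of(head, struct demo_pag, rcu_head);

	kfree(pag);
}

static void demo_pag_put(struct demo_pag *pag)
{
	/* readers may still hold an rcu_read_lock()ed pointer to pag */
	call_rcu(&pag->rcu_head, demo_pag_free_rcu);
}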
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index af168faccc7a..f3227984a9bf 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44static int
45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
46 xfs_agblock_t bno, xfs_extlen_t len);
47
48/* 44/*
49 * Prototypes for per-ag allocation routines 45 * Prototypes for per-ag allocation routines
50 */ 46 */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
94 * Lookup the first record less than or equal to [bno, len] 90 * Lookup the first record less than or equal to [bno, len]
95 * in the btree given by cur. 91 * in the btree given by cur.
96 */ 92 */
97STATIC int /* error */ 93int /* error */
98xfs_alloc_lookup_le( 94xfs_alloc_lookup_le(
99 struct xfs_btree_cur *cur, /* btree cursor */ 95 struct xfs_btree_cur *cur, /* btree cursor */
100 xfs_agblock_t bno, /* starting block of extent */ 96 xfs_agblock_t bno, /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
127/* 123/*
128 * Get the data from the pointed-to record. 124 * Get the data from the pointed-to record.
129 */ 125 */
130STATIC int /* error */ 126int /* error */
131xfs_alloc_get_rec( 127xfs_alloc_get_rec(
132 struct xfs_btree_cur *cur, /* btree cursor */ 128 struct xfs_btree_cur *cur, /* btree cursor */
133 xfs_agblock_t *bno, /* output: starting block of extent */ 129 xfs_agblock_t *bno, /* output: starting block of extent */
@@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact(
577 xfs_extlen_t rlen; /* length of returned extent */ 573 xfs_extlen_t rlen; /* length of returned extent */
578 574
579 ASSERT(args->alignment == 1); 575 ASSERT(args->alignment == 1);
576
580 /* 577 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 578 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 579 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 580 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 581 args->agno, XFS_BTNUM_BNO);
582
585 /* 583 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 584 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 585 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 586 * if any free block does.
589 */ 587 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 588 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
589 if (error)
591 goto error0; 590 goto error0;
592 if (!i) { 591 if (!i)
593 /* 592 goto not_found;
594 * Didn't find it, return null. 593
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 594 /*
601 * Grab the freespace record. 595 * Grab the freespace record.
602 */ 596 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 597 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
598 if (error)
604 goto error0; 599 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 601 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 602 minend = args->agbno + args->minlen;
608 maxend = args->agbno + args->maxlen; 603 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen; 604 fend = fbno + flen;
605
610 /* 606 /*
611 * Give up if the freespace isn't long enough for the minimum request. 607 * Give up if the freespace isn't long enough for the minimum request.
612 */ 608 */
613 if (fend < minend) { 609 if (fend < minend)
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 610 goto not_found;
615 args->agbno = NULLAGBLOCK; 611
616 return 0;
617 }
618 /* 612 /*
619 * End of extent will be smaller of the freespace end and the 613 * End of extent will be smaller of the freespace end and the
620 * maximal requested end. 614 * maximal requested end.
621 */ 615 *
622 end = XFS_AGBLOCK_MIN(fend, maxend);
623 /*
624 * Fix the length according to mod and prod if given. 616 * Fix the length according to mod and prod if given.
625 */ 617 */
618 end = XFS_AGBLOCK_MIN(fend, maxend);
626 args->len = end - args->agbno; 619 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 620 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 621 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 622 goto not_found;
630 return 0; 623
631 }
632 rlen = args->len; 624 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 625 ASSERT(args->agbno + rlen <= fend);
634 end = args->agbno + rlen; 626 end = args->agbno + rlen;
627
635 /* 628 /*
636 * We are allocating agbno for rlen [agbno .. end] 629 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 630 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 633 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 634 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 635 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 636 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 637 args->len, XFSA_FIXUP_BNO_OK);
638 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 639 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 640 goto error0;
647 } 641 }
642
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 643 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 644 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 645
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 646 args->wasfromfl = 0;
647 trace_xfs_alloc_exact_done(args);
648 return 0;
649
650not_found:
651 /* Didn't find it, return null. */
652 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
653 args->agbno = NULLAGBLOCK;
654 trace_xfs_alloc_exact_notfound(args);
653 return 0; 655 return 0;
654 656
655error0: 657error0:
@@ -659,6 +661,95 @@ error0:
659} 661}
660 662
661/* 663/*
664 * Search the btree in a given direction via the search cursor and compare
665 * the records found against the good extent we've already found.
666 */
667STATIC int
668xfs_alloc_find_best_extent(
669 struct xfs_alloc_arg *args, /* allocation argument structure */
670 struct xfs_btree_cur **gcur, /* good cursor */
671 struct xfs_btree_cur **scur, /* searching cursor */
672 xfs_agblock_t gdiff, /* difference for search comparison */
673 xfs_agblock_t *sbno, /* extent found by search */
674 xfs_extlen_t *slen,
675 xfs_extlen_t *slena, /* aligned length */
676 int dir) /* 0 = search right, 1 = search left */
677{
678 xfs_agblock_t bno;
679 xfs_agblock_t new;
680 xfs_agblock_t sdiff;
681 int error;
682 int i;
683
684 /* The good extent is perfect, no need to search. */
685 if (!gdiff)
686 goto out_use_good;
687
688 /*
689 * Look until we find a better one, run out of space or run off the end.
690 */
691 do {
692 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
693 if (error)
694 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
697 args->minlen, &bno, slena);
698
699 /*
700 * The good extent is closer than this one.
701 */
702 if (!dir) {
703 if (bno >= args->agbno + gdiff)
704 goto out_use_good;
705 } else {
706 if (bno <= args->agbno - gdiff)
707 goto out_use_good;
708 }
709
710 /*
711 * Same distance, compare length and pick the best.
712 */
713 if (*slena >= args->minlen) {
714 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
715 xfs_alloc_fix_len(args);
716
717 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
718 args->alignment, *sbno,
719 *slen, &new);
720
721 /*
722 * Choose closer size and invalidate other cursor.
723 */
724 if (sdiff < gdiff)
725 goto out_use_search;
726 goto out_use_good;
727 }
728
729 if (!dir)
730 error = xfs_btree_increment(*scur, 0, &i);
731 else
732 error = xfs_btree_decrement(*scur, 0, &i);
733 if (error)
734 goto error0;
735 } while (i);
736
737out_use_good:
738 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
739 *scur = NULL;
740 return 0;
741
742out_use_search:
743 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
744 *gcur = NULL;
745 return 0;
746
747error0:
748 /* caller invalidates cursors */
749 return error;
750}
751
752/*
662 * Allocate a variable extent near bno in the allocation group agno. 753 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 754 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 755 * and of the form k * prod + mod unless there's nothing that large.
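The new xfs_alloc_find_best_extent() above factors the duplicated left/right searches into one helper: walk the by-bno btree away from the target until a candidate beats the extent already in hand, or provably cannot. A simplified userspace sketch over a sorted array, with illustrative names:

#include <stdlib.h>

/* returns the index of a better candidate, or -1 to keep the current best */
static int find_best(const long *bno, const long *len, int n,
		     int start, long target, long minlen,
		     long best_diff, int dir /* +1 right, -1 left */)
{
	for (int i = start; i >= 0 && i < n; i += dir) {
		long diff = labs(bno[i] - target);

		if (diff >= best_diff)	/* sorted: nothing closer ahead */
			return -1;	/* keep the extent we already have */
		if (len[i] >= minlen)
			return i;	/* closer and long enough: take it */
		/* closer but too short: keep walking */
	}
	return -1;			/* ran off the end */
}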
@@ -675,7 +766,7 @@ xfs_alloc_ag_vextent_near(
675 xfs_agblock_t gtbnoa; /* aligned ... */ 766 xfs_agblock_t gtbnoa; /* aligned ... */
676 xfs_extlen_t gtdiff; /* difference to right side entry */ 767 xfs_extlen_t gtdiff; /* difference to right side entry */
677 xfs_extlen_t gtlen; /* length of right side entry */ 768 xfs_extlen_t gtlen; /* length of right side entry */
678 xfs_extlen_t gtlena; /* aligned ... */ 769 xfs_extlen_t gtlena = 0; /* aligned ... */
679 xfs_agblock_t gtnew; /* useful start bno of right side */ 770 xfs_agblock_t gtnew; /* useful start bno of right side */
680 int error; /* error code */ 771 int error; /* error code */
681 int i; /* result code, temporary */ 772 int i; /* result code, temporary */
@@ -684,7 +775,7 @@ xfs_alloc_ag_vextent_near(
684 xfs_agblock_t ltbnoa; /* aligned ... */ 775 xfs_agblock_t ltbnoa; /* aligned ... */
685 xfs_extlen_t ltdiff; /* difference to left side entry */ 776 xfs_extlen_t ltdiff; /* difference to left side entry */
686 xfs_extlen_t ltlen; /* length of left side entry */ 777 xfs_extlen_t ltlen; /* length of left side entry */
687 xfs_extlen_t ltlena; /* aligned ... */ 778 xfs_extlen_t ltlena = 0; /* aligned ... */
688 xfs_agblock_t ltnew; /* useful start bno of left side */ 779 xfs_agblock_t ltnew; /* useful start bno of left side */
689 xfs_extlen_t rlen; /* length of returned extent */ 780 xfs_extlen_t rlen; /* length of returned extent */
690#if defined(DEBUG) && defined(__KERNEL__) 781#if defined(DEBUG) && defined(__KERNEL__)
@@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near(
925 } 1016 }
926 } 1017 }
927 } while (bno_cur_lt || bno_cur_gt); 1018 } while (bno_cur_lt || bno_cur_gt);
1019
928 /* 1020 /*
929 * Got both cursors still active, need to find better entry. 1021 * Got both cursors still active, need to find better entry.
930 */ 1022 */
931 if (bno_cur_lt && bno_cur_gt) { 1023 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1024 if (ltlena >= args->minlen) {
936 /* 1025 /*
937 * Fix up the length. 1026 * Left side is good, look for a right side entry.
938 */ 1027 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1028 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1029 xfs_alloc_fix_len(args);
941 rlen = args->len; 1030 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
943 args->alignment, ltbno, ltlen, &ltnew); 1031 args->alignment, ltbno, ltlen, &ltnew);
1032
1033 error = xfs_alloc_find_best_extent(args,
1034 &bno_cur_lt, &bno_cur_gt,
1035 ltdiff, &gtbno, &gtlen, &gtlena,
1036 0 /* search right */);
1037 } else {
1038 ASSERT(gtlena >= args->minlen);
1039
944 /* 1040 /*
945 * Not perfect. 1041 * Right side is good, look for a left side entry.
946 */
947 if (ltdiff) {
948 /*
949 * Look until we find a better one, run out of
950 * space, or run off the end.
951 */
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /*
1034 * Fix up the length.
1035 */ 1042 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1043 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1044 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1045 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1040 args->alignment, gtbno, gtlen, &gtnew); 1046 args->alignment, gtbno, gtlen, &gtnew);
1041 /* 1047
1042 * Right side entry isn't perfect. 1048 error = xfs_alloc_find_best_extent(args,
1043 */ 1049 &bno_cur_gt, &bno_cur_lt,
1044 if (gtdiff) { 1050 gtdiff, &ltbno, &ltlen, &ltlena,
1045 /* 1051 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1052 }
1053
1054 if (error)
1055 goto error0;
1124 } 1056 }
1057
1125 /* 1058 /*
1126 * If we couldn't get anything, give up. 1059 * If we couldn't get anything, give up.
1127 */ 1060 */
@@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near(
1130 args->agbno = NULLAGBLOCK; 1063 args->agbno = NULLAGBLOCK;
1131 return 0; 1064 return 0;
1132 } 1065 }
1066
1133 /* 1067 /*
1134 * At this point we have selected a freespace entry, either to the 1068 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1069 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1080 j = 1;
1147 } else 1081 } else
1148 j = 0; 1082 j = 0;
1083
1149 /* 1084 /*
1150 * Fix up the length and compute the useful address. 1085 * Fix up the length and compute the useful address.
1151 */ 1086 */
@@ -2676,7 +2611,7 @@ restart:
2676 * will require a synchronous transaction, but it can still be 2611 * will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial or exact match. 2612 * used to distinguish between a partial or exact match.
2678 */ 2613 */
2679static int 2614int
2680xfs_alloc_busy_search( 2615xfs_alloc_busy_search(
2681 struct xfs_mount *mp, 2616 struct xfs_mount *mp,
2682 xfs_agnumber_t agno, 2617 xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..d0b3bc72005b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
74#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
75 76
76/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
79 * allocation maximum size to the size the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
86 * The AG headers are sector sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
93/*
77 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
78 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
79 * down several levels of the stack. 96 * down several levels of the stack.
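A toy computation of the reservation the new macro encodes, assuming 512-byte sectors and 4096-byte blocks: the four sector-sized AG headers round up to one filesystem block, plus seven single-block structures (the bno, cnt and ino btree roots and four AGFL blocks). The geometry values are examples only:

#include <stdio.h>

int main(void)
{
	unsigned int agblocks   = 1048576;	/* blocks per AG */
	unsigned int sectsize   = 512;
	unsigned int blocksize  = 4096;
	/* four sector-sized headers, rounded up to whole fs blocks */
	unsigned int hdr_blocks = (4 * sectsize + blocksize - 1) / blocksize;

	printf("max usable blocks: %u\n", agblocks - hdr_blocks - 7);
	return 0;
}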
@@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 135 struct xfs_perag *pag);
119 136
120#ifdef __KERNEL__ 137#ifdef __KERNEL__
121
122void 138void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
124 xfs_agnumber_t agno, 140 xfs_agblock_t bno, xfs_extlen_t len);
125 xfs_agblock_t bno,
126 xfs_extlen_t len);
127 141
128void 142void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
130 144
145int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
147 xfs_agblock_t bno, xfs_extlen_t len);
131#endif /* __KERNEL__ */ 148#endif /* __KERNEL__ */
132 149
133/* 150/*
@@ -205,4 +222,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 222 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 223 xfs_extlen_t len); /* length of extent */
207 224
225int /* error */
226xfs_alloc_lookup_le(
227 struct xfs_btree_cur *cur, /* btree cursor */
228 xfs_agblock_t bno, /* starting block of extent */
229 xfs_extlen_t len, /* length of extent */
230 int *stat); /* success/failure */
231
232int /* error */
233xfs_alloc_get_rec(
234 struct xfs_btree_cur *cur, /* btree cursor */
235 xfs_agblock_t *bno, /* output: starting block of extent */
236 xfs_extlen_t *len, /* output: length of extent */
237 int *stat); /* output: success/failure */
238
208#endif /* __XFS_ALLOC_H__ */ 239#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 97f7328967fd..3916925e2584 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -280,38 +280,6 @@ xfs_allocbt_key_diff(
280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
281} 281}
282 282
283STATIC int
284xfs_allocbt_kill_root(
285 struct xfs_btree_cur *cur,
286 struct xfs_buf *bp,
287 int level,
288 union xfs_btree_ptr *newroot)
289{
290 int error;
291
292 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
293 XFS_BTREE_STATS_INC(cur, killroot);
294
295 /*
296 * Update the root pointer, decreasing the level by 1 and then
297 * free the old root.
298 */
299 xfs_allocbt_set_root(cur, newroot, -1);
300 error = xfs_allocbt_free_block(cur, bp);
301 if (error) {
302 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
303 return error;
304 }
305
306 XFS_BTREE_STATS_INC(cur, free);
307
308 xfs_btree_setbuf(cur, level, NULL);
309 cur->bc_nlevels--;
310
311 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
312 return 0;
313}
314
315#ifdef DEBUG 283#ifdef DEBUG
316STATIC int 284STATIC int
317xfs_allocbt_keys_inorder( 285xfs_allocbt_keys_inorder(
@@ -423,7 +391,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
423 391
424 .dup_cursor = xfs_allocbt_dup_cursor, 392 .dup_cursor = xfs_allocbt_dup_cursor,
425 .set_root = xfs_allocbt_set_root, 393 .set_root = xfs_allocbt_set_root,
426 .kill_root = xfs_allocbt_kill_root,
427 .alloc_block = xfs_allocbt_alloc_block, 394 .alloc_block = xfs_allocbt_alloc_block,
428 .free_block = xfs_allocbt_free_block, 395 .free_block = xfs_allocbt_free_block,
429 .update_lastrec = xfs_allocbt_update_lastrec, 396 .update_lastrec = xfs_allocbt_update_lastrec,
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index c2568242a901..c86375378810 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -355,16 +355,15 @@ xfs_attr_set_int(
355 if (mp->m_flags & XFS_MOUNT_WSYNC) { 355 if (mp->m_flags & XFS_MOUNT_WSYNC) {
356 xfs_trans_set_sync(args.trans); 356 xfs_trans_set_sync(args.trans);
357 } 357 }
358
359 if (!error && (flags & ATTR_KERNOTIME) == 0) {
360 xfs_trans_ichgtime(args.trans, dp,
361 XFS_ICHGTIME_CHG);
362 }
358 err2 = xfs_trans_commit(args.trans, 363 err2 = xfs_trans_commit(args.trans,
359 XFS_TRANS_RELEASE_LOG_RES); 364 XFS_TRANS_RELEASE_LOG_RES);
360 xfs_iunlock(dp, XFS_ILOCK_EXCL); 365 xfs_iunlock(dp, XFS_ILOCK_EXCL);
361 366
362 /*
363 * Hit the inode change time.
364 */
365 if (!error && (flags & ATTR_KERNOTIME) == 0) {
366 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
367 }
368 return(error == 0 ? err2 : error); 367 return(error == 0 ? err2 : error);
369 } 368 }
370 369
@@ -420,6 +419,9 @@ xfs_attr_set_int(
420 xfs_trans_set_sync(args.trans); 419 xfs_trans_set_sync(args.trans);
421 } 420 }
422 421
422 if ((flags & ATTR_KERNOTIME) == 0)
423 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
424
423 /* 425 /*
424 * Commit the last in the sequence of transactions. 426 * Commit the last in the sequence of transactions.
425 */ 427 */
@@ -427,13 +429,6 @@ xfs_attr_set_int(
427 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 429 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
428 xfs_iunlock(dp, XFS_ILOCK_EXCL); 430 xfs_iunlock(dp, XFS_ILOCK_EXCL);
429 431
430 /*
431 * Hit the inode change time.
432 */
433 if (!error && (flags & ATTR_KERNOTIME) == 0) {
434 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
435 }
436
437 return(error); 432 return(error);
438 433
439out: 434out:
@@ -567,6 +562,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
567 xfs_trans_set_sync(args.trans); 562 xfs_trans_set_sync(args.trans);
568 } 563 }
569 564
565 if ((flags & ATTR_KERNOTIME) == 0)
566 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
567
570 /* 568 /*
571 * Commit the last in the sequence of transactions. 569 * Commit the last in the sequence of transactions.
572 */ 570 */
@@ -574,13 +572,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
574 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 572 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
575 xfs_iunlock(dp, XFS_ILOCK_EXCL); 573 xfs_iunlock(dp, XFS_ILOCK_EXCL);
576 574
577 /*
578 * Hit the inode change time.
579 */
580 if (!error && (flags & ATTR_KERNOTIME) == 0) {
581 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
582 }
583
584 return(error); 575 return(error);
585 576
586out: 577out:
@@ -1995,7 +1986,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1995 1986
1996 tmp = (valuelen < XFS_BUF_SIZE(bp)) 1987 tmp = (valuelen < XFS_BUF_SIZE(bp))
1997 ? valuelen : XFS_BUF_SIZE(bp); 1988 ? valuelen : XFS_BUF_SIZE(bp);
1998 xfs_biomove(bp, 0, tmp, dst, XBF_READ); 1989 xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
1999 xfs_buf_relse(bp); 1990 xfs_buf_relse(bp);
2000 dst += tmp; 1991 dst += tmp;
2001 valuelen -= tmp; 1992 valuelen -= tmp;
@@ -2125,9 +2116,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2125 2116
2126 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2117 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2127 XFS_BUF_SIZE(bp); 2118 XFS_BUF_SIZE(bp);
2128 xfs_biomove(bp, 0, tmp, src, XBF_WRITE); 2119 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
2129 if (tmp < XFS_BUF_SIZE(bp)) 2120 if (tmp < XFS_BUF_SIZE(bp))
2130 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2121 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2131 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2122 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
2132 return (error); 2123 return (error);
2133 } 2124 }
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb6..71e90dc2aeb1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
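Both hunks above add KM_NOFS to allocations made while holding inode locks in the attribute-list path; without it, a KM_SLEEP allocation could recurse into filesystem reclaim and deadlock on those same locks. The equivalent raw-allocator spelling, as a kernel-style sketch:

#include <linux/slab.h>

static void *attr_value_buf(size_t valuelen)
{
	/* may sleep, but the allocator will not recurse into fs reclaim */
	return kmalloc(valuelen, GFP_NOFS);
}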
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f90dadd5a968..dc3afd7739ff 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -614,7 +614,7 @@ xfs_bmap_add_extent(
614 nblks += cur->bc_private.b.allocated; 614 nblks += cur->bc_private.b.allocated;
615 ASSERT(nblks <= da_old); 615 ASSERT(nblks <= da_old);
616 if (nblks < da_old) 616 if (nblks < da_old)
617 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 617 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
618 (int64_t)(da_old - nblks), rsvd); 618 (int64_t)(da_old - nblks), rsvd);
619 } 619 }
620 /* 620 /*
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1038 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1039 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1040 * This case is avoided almost all the time.
1041 *
1042 * We start with a delayed allocation:
1043 *
1044 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1045 * PREV @ idx
1046 *
1047 * and we are allocating:
1048 * +rrrrrrrrrrrrrrrrr+
1049 * new
1050 *
1051 * and we set it up for insertion as:
1052 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1053 * new
1054 * PREV @ idx LEFT RIGHT
1055 * inserted at idx + 1
1041 */ 1056 */
1042 temp = new->br_startoff - PREV.br_startoff; 1057 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state;
1063 RIGHT.br_startblock = nullstartblock(
1064 (int)xfs_bmap_worst_indlen(ip, temp2));
1065 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
1052 ip->i_df.if_lastex = idx + 1; 1069 ip->i_df.if_lastex = idx + 1;
1053 ip->i_d.di_nextents++; 1070 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1071 if (cur == NULL)
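A standalone sketch of the three-way split pictured in the new comment above: the real extent is carved out of the middle of the delayed extent, the left remainder truncates PREV in place, and the right remainder is inserted after the new record. Plain structs and illustrative values, not the XFS types:

#include <assert.h>
#include <stdio.h>

struct ext { long off, len; };

int main(void)
{
	struct ext prev = { 100, 50 };	/* delayed extent */
	struct ext new  = { 120, 10 };	/* real allocation inside it */
	struct ext left, right;

	assert(new.off > prev.off &&
	       new.off + new.len < prev.off + prev.len);

	left.off  = prev.off;
	left.len  = new.off - prev.off;			/* 20 */
	right.off = new.off + new.len;			/* 130 */
	right.len = prev.off + prev.len - right.off;	/* 20 */

	printf("left [%ld,%ld) new [%ld,%ld) right [%ld,%ld)\n",
	       left.off, left.off + left.len,
	       new.off, new.off + new.len,
	       right.off, right.off + right.len);
	return 0;
}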
@@ -1079,7 +1096,8 @@ xfs_bmap_add_extent_delay_real(
1079 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 1096 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
1080 (cur ? cur->bc_private.b.allocated : 0)); 1097 (cur ? cur->bc_private.b.allocated : 0));
1081 if (diff > 0 && 1098 if (diff > 0 &&
1082 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { 1099 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1100 -((int64_t)diff), rsvd)) {
1083 /* 1101 /*
1084 * Ick gross gag me with a spoon. 1102 * Ick gross gag me with a spoon.
1085 */ 1103 */
@@ -1089,16 +1107,18 @@ xfs_bmap_add_extent_delay_real(
1089 temp--; 1107 temp--;
1090 diff--; 1108 diff--;
1091 if (!diff || 1109 if (!diff ||
1092 !xfs_mod_incore_sb(ip->i_mount, 1110 !xfs_icsb_modify_counters(ip->i_mount,
1093 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1111 XFS_SBS_FDBLOCKS,
1112 -((int64_t)diff), rsvd))
1094 break; 1113 break;
1095 } 1114 }
1096 if (temp2) { 1115 if (temp2) {
1097 temp2--; 1116 temp2--;
1098 diff--; 1117 diff--;
1099 if (!diff || 1118 if (!diff ||
1100 !xfs_mod_incore_sb(ip->i_mount, 1119 !xfs_icsb_modify_counters(ip->i_mount,
1101 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1120 XFS_SBS_FDBLOCKS,
1121 -((int64_t)diff), rsvd))
1102 break; 1122 break;
1103 } 1123 }
1104 } 1124 }
@@ -1766,7 +1786,7 @@ xfs_bmap_add_extent_hole_delay(
1766 } 1786 }
1767 if (oldlen != newlen) { 1787 if (oldlen != newlen) {
1768 ASSERT(oldlen > newlen); 1788 ASSERT(oldlen > newlen);
1769 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 1789 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1770 (int64_t)(oldlen - newlen), rsvd); 1790 (int64_t)(oldlen - newlen), rsvd);
1771 /* 1791 /*
1772 * Nothing to do for disk quota accounting here. 1792 * Nothing to do for disk quota accounting here.
@@ -2427,7 +2447,7 @@ xfs_bmap_btalloc_nullfb(
2427 startag = ag = 0; 2447 startag = ag = 0;
2428 2448
2429 pag = xfs_perag_get(mp, ag); 2449 pag = xfs_perag_get(mp, ag);
2430 while (*blen < ap->alen) { 2450 while (*blen < args->maxlen) {
2431 if (!pag->pagf_init) { 2451 if (!pag->pagf_init) {
2432 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2452 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2433 XFS_ALLOC_FLAG_TRYLOCK); 2453 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2449,7 +2469,7 @@ xfs_bmap_btalloc_nullfb(
2449 notinit = 1; 2469 notinit = 1;
2450 2470
2451 if (xfs_inode_is_filestream(ap->ip)) { 2471 if (xfs_inode_is_filestream(ap->ip)) {
2452 if (*blen >= ap->alen) 2472 if (*blen >= args->maxlen)
2453 break; 2473 break;
2454 2474
2455 if (ap->userdata) { 2475 if (ap->userdata) {
@@ -2495,14 +2515,14 @@ xfs_bmap_btalloc_nullfb(
2495 * If the best seen length is less than the request 2515 * If the best seen length is less than the request
2496 * length, use the best as the minimum. 2516 * length, use the best as the minimum.
2497 */ 2517 */
2498 else if (*blen < ap->alen) 2518 else if (*blen < args->maxlen)
2499 args->minlen = *blen; 2519 args->minlen = *blen;
2500 /* 2520 /*
2501 * Otherwise we've seen an extent as big as alen, 2521 * Otherwise we've seen an extent as big as maxlen,
2502 * use that as the minimum. 2522 * use that as the minimum.
2503 */ 2523 */
2504 else 2524 else
2505 args->minlen = ap->alen; 2525 args->minlen = args->maxlen;
2506 2526
2507 /* 2527 /*
2508 * set the failure fallback case to look in the selected 2528 * set the failure fallback case to look in the selected
@@ -2570,7 +2590,9 @@ xfs_bmap_btalloc(
2570 args.tp = ap->tp; 2590 args.tp = ap->tp;
2571 args.mp = mp; 2591 args.mp = mp;
2572 args.fsbno = ap->rval; 2592 args.fsbno = ap->rval;
2573 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2593
2594 /* Trim the allocation back to the maximum an AG can fit. */
2595 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2574 args.firstblock = ap->firstblock; 2596 args.firstblock = ap->firstblock;
2575 blen = 0; 2597 blen = 0;
2576 if (nullfb) { 2598 if (nullfb) {
@@ -2618,7 +2640,7 @@ xfs_bmap_btalloc(
2618 /* 2640 /*
2619 * Adjust for alignment 2641 * Adjust for alignment
2620 */ 2642 */
2621 if (blen > args.alignment && blen <= ap->alen) 2643 if (blen > args.alignment && blen <= args.maxlen)
2622 args.minlen = blen - args.alignment; 2644 args.minlen = blen - args.alignment;
2623 args.minalignslop = 0; 2645 args.minalignslop = 0;
2624 } else { 2646 } else {
@@ -2637,7 +2659,7 @@ xfs_bmap_btalloc(
2637 * of minlen+alignment+slop doesn't go up 2659 * of minlen+alignment+slop doesn't go up
2638 * between the calls. 2660 * between the calls.
2639 */ 2661 */
2640 if (blen > mp->m_dalign && blen <= ap->alen) 2662 if (blen > mp->m_dalign && blen <= args.maxlen)
2641 nextminlen = blen - mp->m_dalign; 2663 nextminlen = blen - mp->m_dalign;
2642 else 2664 else
2643 nextminlen = args.minlen; 2665 nextminlen = args.minlen;
@@ -3111,9 +3133,10 @@ xfs_bmap_del_extent(
3111 * Nothing to do for disk quota accounting here. 3133 * Nothing to do for disk quota accounting here.
3112 */ 3134 */
3113 ASSERT(da_old >= da_new); 3135 ASSERT(da_old >= da_new);
3114 if (da_old > da_new) 3136 if (da_old > da_new) {
3115 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), 3137 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3116 rsvd); 3138 (int64_t)(da_old - da_new), rsvd);
3139 }
3117done: 3140done:
3118 *logflagsp = flags; 3141 *logflagsp = flags;
3119 return error; 3142 return error;
@@ -4481,6 +4504,16 @@ xfs_bmapi(
4481 /* Figure out the extent size, adjust alen */ 4504 /* Figure out the extent size, adjust alen */
4482 extsz = xfs_get_extsz_hint(ip); 4505 extsz = xfs_get_extsz_hint(ip);
4483 if (extsz) { 4506 if (extsz) {
4507 /*
4508 * make sure we don't exceed a single
4509 * extent length when we align the
 4510 * extent by reducing the length we
 4511 * are going to allocate by the maximum
 4512 * amount extent size alignment may
4513 * require.
4514 */
4515 alen = XFS_FILBLKS_MIN(len,
4516 MAXEXTLEN - (2 * extsz - 1));
4484 error = xfs_bmap_extsize_align(mp, 4517 error = xfs_bmap_extsize_align(mp,
4485 &got, &prev, extsz, 4518 &got, &prev, extsz,
4486 rt, eof, 4519 rt, eof,
@@ -4526,13 +4559,13 @@ xfs_bmapi(
4526 -((int64_t)extsz), (flags & 4559 -((int64_t)extsz), (flags &
4527 XFS_BMAPI_RSVBLOCKS)); 4560 XFS_BMAPI_RSVBLOCKS));
4528 } else { 4561 } else {
4529 error = xfs_mod_incore_sb(mp, 4562 error = xfs_icsb_modify_counters(mp,
4530 XFS_SBS_FDBLOCKS, 4563 XFS_SBS_FDBLOCKS,
4531 -((int64_t)alen), (flags & 4564 -((int64_t)alen), (flags &
4532 XFS_BMAPI_RSVBLOCKS)); 4565 XFS_BMAPI_RSVBLOCKS));
4533 } 4566 }
4534 if (!error) { 4567 if (!error) {
4535 error = xfs_mod_incore_sb(mp, 4568 error = xfs_icsb_modify_counters(mp,
4536 XFS_SBS_FDBLOCKS, 4569 XFS_SBS_FDBLOCKS,
4537 -((int64_t)indlen), (flags & 4570 -((int64_t)indlen), (flags &
4538 XFS_BMAPI_RSVBLOCKS)); 4571 XFS_BMAPI_RSVBLOCKS));
@@ -4542,7 +4575,7 @@ xfs_bmapi(
4542 (int64_t)extsz, (flags & 4575 (int64_t)extsz, (flags &
4543 XFS_BMAPI_RSVBLOCKS)); 4576 XFS_BMAPI_RSVBLOCKS));
4544 else if (error) 4577 else if (error)
4545 xfs_mod_incore_sb(mp, 4578 xfs_icsb_modify_counters(mp,
4546 XFS_SBS_FDBLOCKS, 4579 XFS_SBS_FDBLOCKS,
4547 (int64_t)alen, (flags & 4580 (int64_t)alen, (flags &
4548 XFS_BMAPI_RSVBLOCKS)); 4581 XFS_BMAPI_RSVBLOCKS));
@@ -4744,8 +4777,12 @@ xfs_bmapi(
4744 * Check if writing previously allocated but 4777 * Check if writing previously allocated but
4745 * unwritten extents. 4778 * unwritten extents.
4746 */ 4779 */
4747 if (wr && mval->br_state == XFS_EXT_UNWRITTEN && 4780 if (wr &&
4748 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { 4781 ((mval->br_state == XFS_EXT_UNWRITTEN &&
4782 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
4783 (mval->br_state == XFS_EXT_NORM &&
4784 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
4785 (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
4749 /* 4786 /*
4750 * Modify (by adding) the state flag, if writing. 4787 * Modify (by adding) the state flag, if writing.
4751 */ 4788 */
@@ -4757,7 +4794,9 @@ xfs_bmapi(
4757 *firstblock; 4794 *firstblock;
4758 cur->bc_private.b.flist = flist; 4795 cur->bc_private.b.flist = flist;
4759 } 4796 }
4760 mval->br_state = XFS_EXT_NORM; 4797 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4798 ? XFS_EXT_NORM
4799 : XFS_EXT_UNWRITTEN;
4761 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4800 error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
4762 firstblock, flist, &tmp_logflags, 4801 firstblock, flist, &tmp_logflags,
4763 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4802 whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
@@ -5200,7 +5239,7 @@ xfs_bunmapi(
5200 ip, -((long)del.br_blockcount), 0, 5239 ip, -((long)del.br_blockcount), 0,
5201 XFS_QMOPT_RES_RTBLKS); 5240 XFS_QMOPT_RES_RTBLKS);
5202 } else { 5241 } else {
5203 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, 5242 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5204 (int64_t)del.br_blockcount, rsvd); 5243 (int64_t)del.br_blockcount, rsvd);
5205 (void)xfs_trans_reserve_quota_nblks(NULL, 5244 (void)xfs_trans_reserve_quota_nblks(NULL,
5206 ip, -((long)del.br_blockcount), 0, 5245 ip, -((long)del.br_blockcount), 0,
@@ -5461,8 +5500,13 @@ xfs_getbmap(
5461 if (error) 5500 if (error)
5462 goto out_unlock_iolock; 5501 goto out_unlock_iolock;
5463 } 5502 }
5464 5503 /*
5465 ASSERT(ip->i_delayed_blks == 0); 5504 * even after flushing the inode, there can still be delalloc
5505 * blocks on the inode beyond EOF due to speculative
5506 * preallocation. These are not removed until the release
5507 * function is called or the inode is inactivated. Hence we
5508 * cannot assert here that ip->i_delayed_blks == 0.
5509 */
5466 } 5510 }
5467 5511
5468 lock = xfs_ilock_map_shared(ip); 5512 lock = xfs_ilock_map_shared(ip);
@@ -6060,3 +6104,79 @@ xfs_bmap_disk_count_leaves(
6060 *count += xfs_bmbt_disk_get_blockcount(frp); 6104 *count += xfs_bmbt_disk_get_blockcount(frp);
6061 } 6105 }
6062} 6106}
6107
6108/*
 6109 * dead simple method of punching delayed allocation blocks from a range in
 6110 * the inode. Walks a block at a time so it will be slow, but is only executed
 6111 * in rare error cases so the overhead is not critical. This will always punch
6112 * both the start and end blocks, even if the ranges only partially overlap
6113 * them, so it is up to the caller to ensure that partial blocks are not
6114 * passed in.
6115 */
6116int
6117xfs_bmap_punch_delalloc_range(
6118 struct xfs_inode *ip,
6119 xfs_fileoff_t start_fsb,
6120 xfs_fileoff_t length)
6121{
6122 xfs_fileoff_t remaining = length;
6123 int error = 0;
6124
6125 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
6126
6127 do {
6128 int done;
6129 xfs_bmbt_irec_t imap;
6130 int nimaps = 1;
6131 xfs_fsblock_t firstblock;
6132 xfs_bmap_free_t flist;
6133
6134 /*
6135 * Map the range first and check that it is a delalloc extent
6136 * before trying to unmap the range. Otherwise we will be
6137 * trying to remove a real extent (which requires a
6138 * transaction) or a hole, which is probably a bad idea...
6139 */
6140 error = xfs_bmapi(NULL, ip, start_fsb, 1,
6141 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
6142 &nimaps, NULL);
6143
6144 if (error) {
6145 /* something screwed, just bail */
6146 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6147 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
6148 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6149 ip->i_ino, start_fsb);
6150 }
6151 break;
6152 }
6153 if (!nimaps) {
6154 /* nothing there */
6155 goto next_block;
6156 }
6157 if (imap.br_startblock != DELAYSTARTBLOCK) {
6158 /* been converted, ignore */
6159 goto next_block;
6160 }
6161 WARN_ON(imap.br_blockcount == 0);
6162
6163 /*
6164 * Note: while we initialise the firstblock/flist pair, they
6165 * should never be used because blocks should never be
6166 * allocated or freed for a delalloc extent and hence we need
6167 * don't cancel or finish them after the xfs_bunmapi() call.
6168 */
6169 xfs_bmap_init(&flist, &firstblock);
6170 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
6171 &flist, &done);
6172 if (error)
6173 break;
6174
6175 ASSERT(!flist.xbf_count && !flist.xbf_first);
6176next_block:
6177 start_fsb++;
6178 remaining--;
6179 } while(remaining > 0);
6180
6181 return error;
6182}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index b13569a6179b..3651191daea1 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -74,9 +74,12 @@ typedef struct xfs_bmap_free
74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ 74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
75 /* combine contig. space */ 75 /* combine contig. space */
76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ 76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
77#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */ 77/*
78 /* need write cache flushing and no */ 78 * unwritten extent conversion - this needs write cache flushing and no additional
79 /* additional allocation alignments */ 79 * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
80 * from written to unwritten, otherwise convert from unwritten to written.
81 */
82#define XFS_BMAPI_CONVERT 0x200
80 83
81#define XFS_BMAPI_FLAGS \ 84#define XFS_BMAPI_FLAGS \
82 { XFS_BMAPI_WRITE, "WRITE" }, \ 85 { XFS_BMAPI_WRITE, "WRITE" }, \
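A small sketch of the conversion-direction decode that the xfs_bmapi() hunk earlier in this diff performs with this flag: CONVERT together with PREALLOC means written to unwritten, while an unwritten extent with neither PREALLOC nor DELAY set converts to written. The flag values below are illustrative:

#include <stdio.h>

#define DELAY    0x002
#define PREALLOC 0x040
#define CONVERT  0x200

static const char *conv_dir(unsigned flags, int is_unwritten)
{
	if (is_unwritten && !(flags & (PREALLOC | DELAY)))
		return "unwritten -> written";
	if (!is_unwritten &&
	    (flags & (PREALLOC | CONVERT)) == (PREALLOC | CONVERT))
		return "written -> unwritten";
	return "no conversion";
}

int main(void)
{
	printf("%s\n", conv_dir(PREALLOC | CONVERT, 0));
	printf("%s\n", conv_dir(0, 1));
	return 0;
}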
@@ -391,6 +394,11 @@ xfs_bmap_count_blocks(
391 int whichfork, 394 int whichfork,
392 int *count); 395 int *count);
393 396
397int
398xfs_bmap_punch_delalloc_range(
399 struct xfs_inode *ip,
400 xfs_fileoff_t start_fsb,
401 xfs_fileoff_t length);
394#endif /* __KERNEL__ */ 402#endif /* __KERNEL__ */
395 403
396#endif /* __XFS_BMAP_H__ */ 404#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 829af92f0fba..2f9e97c128a0 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -217,7 +217,7 @@ xfs_btree_del_cursor(
217 */ 217 */
218 for (i = 0; i < cur->bc_nlevels; i++) { 218 for (i = 0; i < cur->bc_nlevels; i++) {
219 if (cur->bc_bufs[i]) 219 if (cur->bc_bufs[i])
220 xfs_btree_setbuf(cur, i, NULL); 220 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
221 else if (!error) 221 else if (!error)
222 break; 222 break;
223 } 223 }
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
634 return error; 634 return error;
635 } 635 }
636 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 636 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
637 if (bp != NULL) { 637 if (bp)
638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
639 }
640 *bpp = bp; 639 *bpp = bp;
641 return 0; 640 return 0;
642} 641}
@@ -656,7 +655,7 @@ xfs_btree_reada_bufl(
656 655
657 ASSERT(fsbno != NULLFSBLOCK); 656 ASSERT(fsbno != NULLFSBLOCK);
658 d = XFS_FSB_TO_DADDR(mp, fsbno); 657 d = XFS_FSB_TO_DADDR(mp, fsbno);
659 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 658 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
660} 659}
661 660
662/* 661/*
@@ -676,7 +675,7 @@ xfs_btree_reada_bufs(
676 ASSERT(agno != NULLAGNUMBER); 675 ASSERT(agno != NULLAGNUMBER);
677 ASSERT(agbno != NULLAGBLOCK); 676 ASSERT(agbno != NULLAGBLOCK);
678 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 677 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
679 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 678 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
680} 679}
681 680
682STATIC int 681STATIC int
@@ -763,22 +762,19 @@ xfs_btree_readahead(
763 * Set the buffer for level "lev" in the cursor to bp, releasing 762 * Set the buffer for level "lev" in the cursor to bp, releasing
764 * any previous buffer. 763 * any previous buffer.
765 */ 764 */
766void 765STATIC void
767xfs_btree_setbuf( 766xfs_btree_setbuf(
768 xfs_btree_cur_t *cur, /* btree cursor */ 767 xfs_btree_cur_t *cur, /* btree cursor */
769 int lev, /* level in btree */ 768 int lev, /* level in btree */
770 xfs_buf_t *bp) /* new buffer to set */ 769 xfs_buf_t *bp) /* new buffer to set */
771{ 770{
772 struct xfs_btree_block *b; /* btree block */ 771 struct xfs_btree_block *b; /* btree block */
773 xfs_buf_t *obp; /* old buffer pointer */
774 772
775 obp = cur->bc_bufs[lev]; 773 if (cur->bc_bufs[lev])
776 if (obp) 774 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
777 xfs_trans_brelse(cur->bc_tp, obp);
778 cur->bc_bufs[lev] = bp; 775 cur->bc_bufs[lev] = bp;
779 cur->bc_ra[lev] = 0; 776 cur->bc_ra[lev] = 0;
780 if (!bp) 777
781 return;
782 b = XFS_BUF_TO_BLOCK(bp); 778 b = XFS_BUF_TO_BLOCK(bp);
783 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 779 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
784 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) 780 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
@@ -947,13 +943,13 @@ xfs_btree_set_refs(
947 switch (cur->bc_btnum) { 943 switch (cur->bc_btnum) {
948 case XFS_BTNUM_BNO: 944 case XFS_BTNUM_BNO:
949 case XFS_BTNUM_CNT: 945 case XFS_BTNUM_CNT:
950 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); 946 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
951 break; 947 break;
952 case XFS_BTNUM_INO: 948 case XFS_BTNUM_INO:
953 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); 949 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
954 break; 950 break;
955 case XFS_BTNUM_BMAP: 951 case XFS_BTNUM_BMAP:
956 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); 952 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
957 break; 953 break;
958 default: 954 default:
959 ASSERT(0); 955 ASSERT(0);
@@ -3011,6 +3007,43 @@ out0:
3011 return 0; 3007 return 0;
3012} 3008}
3013 3009
3010/*
 3011 * Kill the current root node, and replace it with its only child node.
3012 */
3013STATIC int
3014xfs_btree_kill_root(
3015 struct xfs_btree_cur *cur,
3016 struct xfs_buf *bp,
3017 int level,
3018 union xfs_btree_ptr *newroot)
3019{
3020 int error;
3021
3022 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3023 XFS_BTREE_STATS_INC(cur, killroot);
3024
3025 /*
3026 * Update the root pointer, decreasing the level by 1 and then
3027 * free the old root.
3028 */
3029 cur->bc_ops->set_root(cur, newroot, -1);
3030
3031 error = cur->bc_ops->free_block(cur, bp);
3032 if (error) {
3033 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3034 return error;
3035 }
3036
3037 XFS_BTREE_STATS_INC(cur, free);
3038
3039 cur->bc_bufs[level] = NULL;
3040 cur->bc_ra[level] = 0;
3041 cur->bc_nlevels--;
3042
3043 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3044 return 0;
3045}
3046
3014STATIC int 3047STATIC int
3015xfs_btree_dec_cursor( 3048xfs_btree_dec_cursor(
3016 struct xfs_btree_cur *cur, 3049 struct xfs_btree_cur *cur,
@@ -3195,7 +3228,7 @@ xfs_btree_delrec(
3195 * Make it the new root of the btree. 3228 * Make it the new root of the btree.
3196 */ 3229 */
3197 pp = xfs_btree_ptr_addr(cur, 1, block); 3230 pp = xfs_btree_ptr_addr(cur, 1, block);
3198 error = cur->bc_ops->kill_root(cur, bp, level, pp); 3231 error = xfs_btree_kill_root(cur, bp, level, pp);
3199 if (error) 3232 if (error)
3200 goto error0; 3233 goto error0;
3201 } else if (level > 0) { 3234 } else if (level > 0) {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7fa07062bdda..82fafc66bd1f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -152,9 +152,7 @@ struct xfs_btree_ops {
152 152
153 /* update btree root pointer */ 153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur, 154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change); 155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158 156
159 /* block allocation / freeing */ 157 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur, 158 int (*alloc_block)(struct xfs_btree_cur *cur,
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs(
399 xfs_agblock_t agbno, /* allocation group block number */ 397 xfs_agblock_t agbno, /* allocation group block number */
400 xfs_extlen_t count); /* count of filesystem blocks */ 398 xfs_extlen_t count); /* count of filesystem blocks */
401 399
402/*
403 * Set the buffer for level "lev" in the cursor to bp, releasing
404 * any previous buffer.
405 */
406void
407xfs_btree_setbuf(
408 xfs_btree_cur_t *cur, /* btree cursor */
409 int lev, /* level in btree */
410 struct xfs_buf *bp); /* new buffer to set */
411
412 400
413/* 401/*
414 * Common btree core entry points. 402 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1b09d7a280df..6f8c21ce0d6d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,8 +141,7 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 141#define xfs_buf_item_log_check(x)
142#endif 142#endif
143 143
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 144STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
146 145
147/* 146/*
148 * This returns the number of log iovecs needed to log the 147 * This returns the number of log iovecs needed to log the
@@ -428,13 +427,15 @@ xfs_buf_item_unpin(
428 427
429 if (remove) { 428 if (remove) {
430 /* 429 /*
431 * We have to remove the log item from the transaction 430 * If we are in a transaction context, we have to
432 * as we are about to release our reference to the 431 * remove the log item from the transaction as we are
433 * buffer. If we don't, the unlock that occurs later 432 * about to release our reference to the buffer. If we
434 * in xfs_trans_uncommit() will ry to reference the 433 * don't, the unlock that occurs later in
434 * xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on. 435 * buffer which we no longer have a hold on.
436 */ 436 */
437 xfs_trans_del_item(lip); 437 if (lip->li_desc)
438 xfs_trans_del_item(lip);
438 439
439 /* 440 /*
440 * Since the transaction no longer refers to the buffer, 441 * Since the transaction no longer refers to the buffer,
@@ -450,7 +451,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 451 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 452 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 453 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 454 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 455 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 456 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 457 } else {
@@ -692,8 +693,7 @@ xfs_buf_item_init(
692 * the first. If we do already have one, there is 693 * the first. If we do already have one, there is
693 * nothing to do here so return. 694 * nothing to do here so return.
694 */ 695 */
695 if (bp->b_mount != mp) 696 ASSERT(bp->b_target->bt_mount == mp);
696 bp->b_mount = mp;
697 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 697 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
698 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 698 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
699 if (lip->li_type == XFS_LI_BUF) { 699 if (lip->li_type == XFS_LI_BUF) {
@@ -919,15 +919,26 @@ xfs_buf_attach_iodone(
919 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 919 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
920} 920}
921 921
922/*
923 * We can have many callbacks on a buffer. Running the callbacks individually
924 * can cause a lot of contention on the AIL lock, so we allow for a single
 925 * callback to scan the remaining lip->li_bio_list for other items with
 926 * the same type and callback, and process them all in the first call.
927 *
928 * As a result, the loop walking the callback list below will also modify the
 929 * list. It removes the first item from the list and then runs the callback.
930 * The loop then restarts from the new head of the list. This allows the
931 * callback to scan and modify the list attached to the buffer and we don't
932 * have to care about maintaining a next item pointer.
933 */
922STATIC void 934STATIC void
923xfs_buf_do_callbacks( 935xfs_buf_do_callbacks(
924 xfs_buf_t *bp, 936 struct xfs_buf *bp)
925 xfs_log_item_t *lip)
926{ 937{
927 xfs_log_item_t *nlip; 938 struct xfs_log_item *lip;
928 939
929 while (lip != NULL) { 940 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
930 nlip = lip->li_bio_list; 941 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
931 ASSERT(lip->li_cb != NULL); 942 ASSERT(lip->li_cb != NULL);
932 /* 943 /*
933 * Clear the next pointer so we don't have any 944 * Clear the next pointer so we don't have any
@@ -937,7 +948,6 @@ xfs_buf_do_callbacks(
937 */ 948 */
938 lip->li_bio_list = NULL; 949 lip->li_bio_list = NULL;
939 lip->li_cb(bp, lip); 950 lip->li_cb(bp, lip);
940 lip = nlip;
941 } 951 }
942} 952}
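The list walk above is intentionally restartable: it never caches a next pointer, so a callback is free to consume further entries itself. A stand-alone sketch of the same pop-the-head pattern, using illustrative types rather than the XFS ones:

#include <stddef.h>

struct cb_item {
	struct cb_item	*next;
	void		(*cb)(struct cb_item *item);
};

static void
run_callbacks(struct cb_item **headp)
{
	struct cb_item	*item;

	while ((item = *headp) != NULL) {
		*headp = item->next;	/* pop the head first... */
		item->next = NULL;	/* ...so the callback may requeue it */
		item->cb(item);		/* may also consume more of *headp */
	}
}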
943 953
@@ -950,128 +960,76 @@ xfs_buf_do_callbacks(
950 */ 960 */
951void 961void
952xfs_buf_iodone_callbacks( 962xfs_buf_iodone_callbacks(
953 xfs_buf_t *bp) 963 struct xfs_buf *bp)
954{ 964{
955 xfs_log_item_t *lip; 965 struct xfs_log_item *lip = bp->b_fspriv;
956 static ulong lasttime; 966 struct xfs_mount *mp = lip->li_mountp;
957 static xfs_buftarg_t *lasttarg; 967 static ulong lasttime;
958 xfs_mount_t *mp; 968 static xfs_buftarg_t *lasttarg;
959 969
960 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 970 if (likely(!XFS_BUF_GETERROR(bp)))
961 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 971 goto do_callbacks;
962 972
963 if (XFS_BUF_GETERROR(bp) != 0) { 973 /*
964 /* 974 * If we've already decided to shutdown the filesystem because of
965 * If we've already decided to shutdown the filesystem 975 * I/O errors, there's no point in giving this a retry.
966 * because of IO errors, there's no point in giving this 976 */
967 * a retry. 977 if (XFS_FORCED_SHUTDOWN(mp)) {
968 */ 978 XFS_BUF_SUPER_STALE(bp);
969 mp = lip->li_mountp; 979 trace_xfs_buf_item_iodone(bp, _RET_IP_);
970 if (XFS_FORCED_SHUTDOWN(mp)) { 980 goto do_callbacks;
971 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 981 }
972 XFS_BUF_SUPER_STALE(bp);
973 trace_xfs_buf_item_iodone(bp, _RET_IP_);
974 xfs_buf_do_callbacks(bp, lip);
975 XFS_BUF_SET_FSPRIVATE(bp, NULL);
976 XFS_BUF_CLR_IODONE_FUNC(bp);
977 xfs_biodone(bp);
978 return;
979 }
980 982
981 if ((XFS_BUF_TARGET(bp) != lasttarg) || 983 if (XFS_BUF_TARGET(bp) != lasttarg ||
982 (time_after(jiffies, (lasttime + 5*HZ)))) { 984 time_after(jiffies, (lasttime + 5*HZ))) {
983 lasttime = jiffies; 985 lasttime = jiffies;
984 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 986 cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
985 " block 0x%llx in %s", 987 " block 0x%llx in %s",
986 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 988 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
987 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 989 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
988 } 990 }
989 lasttarg = XFS_BUF_TARGET(bp); 991 lasttarg = XFS_BUF_TARGET(bp);
990 992
991 if (XFS_BUF_ISASYNC(bp)) { 993 /*
 992	 /* 994	 * If the write was asynchronous then no one will be looking for the
993 * If the write was asynchronous then noone will be 995 * error. Clear the error state and write the buffer out again.
994 * looking for the error. Clear the error state 996 *
995 * and write the buffer out again delayed write. 997 * During sync or umount we'll write all pending buffers again
996 * 998 * synchronous, which will catch these errors if they keep hanging
997 * XXXsup This is OK, so long as we catch these 999 * around.
998 * before we start the umount; we don't want these 1000 */
999 * DELWRI metadata bufs to be hanging around. 1001 if (XFS_BUF_ISASYNC(bp)) {
1000 */ 1002 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1001 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1003
1002 1004 if (!XFS_BUF_ISSTALE(bp)) {
1003 if (!(XFS_BUF_ISSTALE(bp))) { 1005 XFS_BUF_DELAYWRITE(bp);
1004 XFS_BUF_DELAYWRITE(bp);
1005 XFS_BUF_DONE(bp);
1006 XFS_BUF_SET_START(bp);
1007 }
1008 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1009 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1010 xfs_buf_relse(bp);
1011 } else {
1012 /*
1013 * If the write of the buffer was not asynchronous,
1014 * then we want to make sure to return the error
1015 * to the caller of bwrite(). Because of this we
1016 * cannot clear the B_ERROR state at this point.
1017 * Instead we install a callback function that
1018 * will be called when the buffer is released, and
1019 * that routine will clear the error state and
1020 * set the buffer to be written out again after
1021 * some delay.
1022 */
1023 /* We actually overwrite the existing b-relse
1024 function at times, but we're gonna be shutting down
1025 anyway. */
1026 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1027 XFS_BUF_DONE(bp); 1006 XFS_BUF_DONE(bp);
1028 XFS_BUF_FINISH_IOWAIT(bp); 1007 XFS_BUF_SET_START(bp);
1029 } 1008 }
1009 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1010 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1011 xfs_buf_relse(bp);
1030 return; 1012 return;
1031 } 1013 }
1032 1014
1033 xfs_buf_do_callbacks(bp, lip); 1015 /*
1034 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1016 * If the write of the buffer was synchronous, we want to make
1035 XFS_BUF_CLR_IODONE_FUNC(bp); 1017 * sure to return the error to the caller of xfs_bwrite().
1036 xfs_biodone(bp); 1018 */
1037}
1038
1039/*
1040 * This is a callback routine attached to a buffer which gets an error
1041 * when being written out synchronously.
1042 */
1043STATIC void
1044xfs_buf_error_relse(
1045 xfs_buf_t *bp)
1046{
1047 xfs_log_item_t *lip;
1048 xfs_mount_t *mp;
1049
1050 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1051 mp = (xfs_mount_t *)lip->li_mountp;
1052 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1053
1054 XFS_BUF_STALE(bp); 1019 XFS_BUF_STALE(bp);
1055 XFS_BUF_DONE(bp); 1020 XFS_BUF_DONE(bp);
1056 XFS_BUF_UNDELAYWRITE(bp); 1021 XFS_BUF_UNDELAYWRITE(bp);
1057 XFS_BUF_ERROR(bp,0);
1058 1022
1059 trace_xfs_buf_error_relse(bp, _RET_IP_); 1023 trace_xfs_buf_error_relse(bp, _RET_IP_);
1024 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1060 1025
1061 if (! XFS_FORCED_SHUTDOWN(mp)) 1026do_callbacks:
1062 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1027 xfs_buf_do_callbacks(bp);
1063 /*
1064 * We have to unpin the pinned buffers so do the
1065 * callbacks.
1066 */
1067 xfs_buf_do_callbacks(bp, lip);
1068 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1028 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1069 XFS_BUF_CLR_IODONE_FUNC(bp); 1029 XFS_BUF_CLR_IODONE_FUNC(bp);
1070 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1030 xfs_buf_ioend(bp, 0);
1071 xfs_buf_relse(bp);
1072} 1031}
1073 1032
1074
1075/* 1033/*
1076 * This is the iodone() function for buffers which have been 1034 * This is the iodone() function for buffers which have been
1077 * logged. It is called when they are eventually flushed out. 1035 * logged. It is called when they are eventually flushed out.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c7..b6ecd2061e7c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 30fa0e206fba..1c00bedb3175 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2042,7 +2042,7 @@ xfs_da_do_buf(
2042 mappedbno, nmapped, 0, &bp); 2042 mappedbno, nmapped, 0, &bp);
2043 break; 2043 break;
2044 case 3: 2044 case 3:
2045 xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); 2045 xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
2046 error = 0; 2046 error = 0;
2047 bp = NULL; 2047 bp = NULL;
2048 break; 2048 break;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a22..e60490bc00a6 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
377 ip->i_d.di_format = tip->i_d.di_format; 377 ip->i_d.di_format = tip->i_d.di_format;
378 tip->i_d.di_format = tmp; 378 tip->i_d.di_format = tmp;
379 379
380 /*
381 * The extents in the source inode could still contain speculative
382 * preallocation beyond EOF (e.g. the file is open but not modified
383 * while defrag is in progress). In that case, we need to copy over the
384 * number of delalloc blocks the data fork in the source inode is
 385	 * tracking beyond EOF so that we don't underrun the i_delayed_blks
 386	 * counter on that inode when the fork is truncated away as the
 387	 * temporary inode is unlinked.
388 */
389 ASSERT(tip->i_delayed_blks == 0);
390 tip->i_delayed_blks = ip->i_delayed_blks;
391 ip->i_delayed_blks = 0;
392
380 ilf_fields = XFS_ILOG_CORE; 393 ilf_fields = XFS_ILOG_CORE;
381 394
382 switch(ip->i_d.di_format) { 395 switch(ip->i_d.di_format) {
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5b153b2e6a3..dffba9ba0db6 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -49,8 +49,9 @@ typedef struct xfs_dinode {
49 __be32 di_uid; /* owner's user id */ 49 __be32 di_uid; /* owner's user id */
50 __be32 di_gid; /* owner's group id */ 50 __be32 di_gid; /* owner's group id */
51 __be32 di_nlink; /* number of links to file */ 51 __be32 di_nlink; /* number of links to file */
52 __be16 di_projid; /* owner's project id */ 52 __be16 di_projid_lo; /* lower part of owner's project id */
 53	__u8 di_pad[8]; /* unused, zeroed space */ 53	__be16 di_projid_hi; /* higher part of owner's project id */
54 __u8 di_pad[6]; /* unused, zeroed space */
54 __be16 di_flushiter; /* incremented on flush */ 55 __be16 di_flushiter; /* incremented on flush */
55 xfs_timestamp_t di_atime; /* time last accessed */ 56 xfs_timestamp_t di_atime; /* time last accessed */
56 xfs_timestamp_t di_mtime; /* time last modified */ 57 xfs_timestamp_t di_mtime; /* time last modified */
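With the project id now split across two 16 bit on-disk fields, the full 32 bit value has to be reassembled in core. xfs_set_projid() is used later in this diff; the get/set pair sketched here follows that model and is written out for illustration only:

static inline prid_t
xfs_get_projid(struct xfs_inode *ip)
{
	/* hi:lo concatenation of the two on-disk halves */
	return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
}

static inline void
xfs_set_projid(struct xfs_inode *ip, prid_t projid)
{
	ip->i_d.di_projid_hi = (__uint16_t)(projid >> 16);
	ip->i_d.di_projid_lo = (__uint16_t)(projid & 0xffff);
}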
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 504be8640e91..ae891223be90 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -961,7 +961,7 @@ xfs_dir2_leaf_getdents(
961 if (i > ra_current && 961 if (i > ra_current &&
962 map[ra_index].br_blockcount >= 962 map[ra_index].br_blockcount >=
963 mp->m_dirblkfsbs) { 963 mp->m_dirblkfsbs) {
964 xfs_baread(mp->m_ddev_targp, 964 xfs_buf_readahead(mp->m_ddev_targp,
965 XFS_FSB_TO_DADDR(mp, 965 XFS_FSB_TO_DADDR(mp,
966 map[ra_index].br_startblock + 966 map[ra_index].br_startblock +
967 ra_offset), 967 ra_offset),
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed9990267661..4c7db74a05f7 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
58int xfs_etest[XFS_NUM_INJECT_ERROR]; 58int xfs_etest[XFS_NUM_INJECT_ERROR];
59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; 59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; 60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
61int xfs_error_test_active;
61 62
62int 63int
63xfs_error_test(int error_tag, int *fsidp, char *expression, 64xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
108 len = strlen(mp->m_fsname); 109 len = strlen(mp->m_fsname);
109 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); 110 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
110 strcpy(xfs_etest_fsname[i], mp->m_fsname); 111 strcpy(xfs_etest_fsname[i], mp->m_fsname);
112 xfs_error_test_active++;
111 return 0; 113 return 0;
112 } 114 }
113 } 115 }
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
137 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
138 kmem_free(xfs_etest_fsname[i]); 140 kmem_free(xfs_etest_fsname[i]);
139 xfs_etest_fsname[i] = NULL; 141 xfs_etest_fsname[i] = NULL;
142 xfs_error_test_active--;
140 } 143 }
141 } 144 }
142 145
@@ -149,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
149} 152}
150#endif /* DEBUG */ 153#endif /* DEBUG */
151 154
152
153void
154xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
155{
156 va_list ap;
157
158 va_start(ap, fmt);
159 xfs_fs_vcmn_err(level, mp, fmt, ap);
160 va_end(ap);
161}
162
163void
164xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
165{
166 va_list ap;
167
168#ifdef DEBUG
169 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
170#endif
171
172 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
173 && (level & CE_ALERT)) {
174 level &= ~CE_ALERT;
175 level |= CE_PANIC;
176 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
177 }
178 va_start(ap, fmt);
179 xfs_fs_vcmn_err(level, mp, fmt, ap);
180 va_end(ap);
181}
182
183void 155void
184xfs_error_report( 156xfs_error_report(
185 const char *tag, 157 const char *tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb82..10dce5475f02 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,16 +127,17 @@ extern void xfs_corruption_error(const char *tag, int level,
127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT 127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
128 128
129#ifdef DEBUG 129#ifdef DEBUG
130extern int xfs_error_test_active;
130extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); 131extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
131 132
132#define XFS_NUM_INJECT_ERROR 10 133#define XFS_NUM_INJECT_ERROR 10
133#define XFS_TEST_ERROR(expr, mp, tag, rf) \ 134#define XFS_TEST_ERROR(expr, mp, tag, rf) \
134 ((expr) || \ 135 ((expr) || (xfs_error_test_active && \
135 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
136 (rf))) 137 (rf))))
137 138
138extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
139extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
140#else 141#else
141#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
142#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -161,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
161 162
162struct xfs_mount; 163struct xfs_mount;
163 164
164extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
165 char *fmt, va_list ap)
166 __attribute__ ((format (printf, 3, 0)));
167extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
168 char *fmt, ...)
169 __attribute__ ((format (printf, 4, 5)));
170extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
171 __attribute__ ((format (printf, 3, 4)));
172
173extern void xfs_hex_dump(void *p, int length); 165extern void xfs_hex_dump(void *p, int length);
174 166
175#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ 167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
176 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) 168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
177 169
178#define xfs_fs_mount_cmn_err(f, fmt, args...) \ 170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
179 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) 171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
180 175
181#endif /* __XFS_ERROR_H__ */ 176#endif /* __XFS_ERROR_H__ */
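Two small idioms are at work in this hunk. XFS_TEST_ERROR now short-circuits on the global xfs_error_test_active counter, so when no error tags are armed the check costs one load and a branch instead of a function call; and xfs_fs_mount_cmn_err is rewritten as do { } while (0), the standard way to make a multi-statement macro behave as a single statement after if/else. A condensed stand-alone sketch of the counter gating, with illustrative names only:

static int inject_active;	/* bumped/dropped as tags are armed/cleared */

static int
inject_hit(int tag)
{
	/* slow path: look the tag up in the armed table, randomise, ... */
	return 0;
}

/* common case: inject_active == 0, so inject_hit() is never called */
#define FAIL_ON(expr, tag) \
	((expr) || (inject_active && inject_hit(tag)))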
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf562..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
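The handoff above rests on test_and_clear_bit() being atomic: of two callers racing on the same bit, exactly one observes it set. Matching the polarity of __xfs_efi_release(), the caller that finds the bit already clear is the last one in and performs the teardown. A minimal sketch with illustrative types; obj_free() is a hypothetical helper:

#include <linux/bitops.h>

#define OBJ_COMMITTED	0	/* bit number, as with XFS_EFI_COMMITTED */

struct obj {
	unsigned long	flags;
};

static void
release_once(struct obj *o)
{
	if (!test_and_clear_bit(OBJ_COMMITTED, &o->flags))
		obj_free(o);	/* hypothetical final teardown */
	/* else: committed state consumed; the racing path does the free */
}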
71
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,15 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114 138
115 spin_lock(&ailp->xa_lock); 139 if (remove) {
116 if (efip->efi_flags & XFS_EFI_CANCELED) { 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
117 if (remove) 141 if (lip->li_desc)
118 xfs_trans_del_item(lip); 142 xfs_trans_del_item(lip);
119
120 /* xfs_trans_ail_delete() drops the AIL lock. */
121 xfs_trans_ail_delete(ailp, lip);
122 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
123 } else { 144 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 145 }
146 __xfs_efi_release(efip);
127} 147}
128 148
129/* 149/*
@@ -152,16 +172,20 @@ xfs_efi_item_unlock(
152} 172}
153 173
154/* 174/*
155 * The EFI is logged only once and cannot be moved in the log, so 175 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 176 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 177 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 178 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
179 * when processing the EFD.
159 */ 180 */
160STATIC xfs_lsn_t 181STATIC xfs_lsn_t
161xfs_efi_item_committed( 182xfs_efi_item_committed(
162 struct xfs_log_item *lip, 183 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 184 xfs_lsn_t lsn)
164{ 185{
186 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
187
188 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 189 return lsn;
166} 190}
167 191
@@ -230,6 +254,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 254 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 255 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 256 efip->efi_format.efi_id = (__psint_t)(void*)efip;
257 atomic_set(&efip->efi_next_extent, 0);
233 258
234 return efip; 259 return efip;
235} 260}
@@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 314}
290 315
291/* 316/*
292 * This is called by the efd item code below to release references to 317 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 318 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 319 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 320 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 321 */
302void 322void
303xfs_efi_release(xfs_efi_log_item_t *efip, 323xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 324 uint nextents)
305{ 325{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 326 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 327 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 328 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 329}
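The rewrite replaces an AIL-lock-protected counter with a lockless countdown: atomic_sub_and_test() returns true only for the subtraction that takes the count to zero, so exactly one EFD path triggers the final release. A stand-alone sketch with illustrative types; counted_free() is hypothetical:

#include <linux/atomic.h>

struct counted {
	atomic_t	remaining;	/* extents still to be accounted */
};

static void
put_extents(struct counted *c, unsigned int nr)
{
	/* true only for the subtraction that reaches zero */
	if (atomic_sub_and_test(nr, &c->remaining))
		counted_free(c);	/* hypothetical final teardown */
}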
324 330
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 331static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf64..375f68e42531 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce5699..9124425b7f2f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
744 * If the file's parent directory is known, take its iolock in exclusive 744 * If the file's parent directory is known, take its iolock in exclusive
745 * mode to prevent two sibling files from racing each other to migrate 745 * mode to prevent two sibling files from racing each other to migrate
746 * themselves and their parent to different AGs. 746 * themselves and their parent to different AGs.
747 *
748 * Note that we lock the parent directory iolock inside the child
749 * iolock here. That's fine as we never hold both parent and child
750 * iolock in any other place. This is different from the ilock,
751 * which requires locking of the child after the parent for namespace
752 * operations.
747 */ 753 */
748 if (pip) 754 if (pip)
749 xfs_ilock(pip, XFS_IOLOCK_EXCL); 755 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
750 756
751 /* 757 /*
752 * A new AG needs to be found for the file. If the file's parent 758 * A new AG needs to be found for the file. If the file's parent
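XFS_IOLOCK_PARENT encodes a lockdep nesting subclass, which is what lets the child-then-parent iolock order described in the comment pass lockdep even though both locks belong to one class. The same annotation on a plain rw_semaphore, as a generic sketch with illustrative names:

#include <linux/rwsem.h>

enum { SUBCLASS_CHILD = 0, SUBCLASS_PARENT = 1 };

static void
lock_child_then_parent(struct rw_semaphore *child,
		       struct rw_semaphore *parent)
{
	down_write(child);				/* subclass 0 */
	down_write_nested(parent, SUBCLASS_PARENT);	/* distinct subclass */
	/* ... work that needs both held, e.g. AG migration ... */
	up_write(parent);
	up_write(child);
}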
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 87c2e9d02288..8f6fc1a96386 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -293,9 +293,11 @@ typedef struct xfs_bstat {
293 __s32 bs_extsize; /* extent size */ 293 __s32 bs_extsize; /* extent size */
294 __s32 bs_extents; /* number of extents */ 294 __s32 bs_extents; /* number of extents */
295 __u32 bs_gen; /* generation count */ 295 __u32 bs_gen; /* generation count */
296 __u16 bs_projid; /* project id */ 296 __u16 bs_projid_lo; /* lower part of project id */
297#define bs_projid bs_projid_lo /* (previously just bs_projid) */
297 __u16 bs_forkoff; /* inode fork offset in bytes */ 298 __u16 bs_forkoff; /* inode fork offset in bytes */
298 unsigned char bs_pad[12]; /* pad space, unused */ 299 __u16 bs_projid_hi; /* higher part of project id */
300 unsigned char bs_pad[10]; /* pad space, unused */
299 __u32 bs_dmevmask; /* DMIG event mask */ 301 __u32 bs_dmevmask; /* DMIG event mask */
300 __u16 bs_dmstate; /* DMIG state info */ 302 __u16 bs_dmstate; /* DMIG state info */
301 __u16 bs_aextents; /* attribute number of extents */ 303 __u16 bs_aextents; /* attribute number of extents */
@@ -448,6 +450,7 @@ typedef struct xfs_handle {
448/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ 450/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
449/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ 451/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
450#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) 452#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
453#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
451 454
452/* 455/*
453 * ioctl commands that replace IRIX syssgi()'s 456 * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 43b1d5699335..cec89dd5d7d2 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -144,12 +144,11 @@ xfs_growfs_data_private(
144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) 144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
145 return error; 145 return error;
146 dpct = pct - mp->m_sb.sb_imax_pct; 146 dpct = pct - mp->m_sb.sb_imax_pct;
147 error = xfs_read_buf(mp, mp->m_ddev_targp, 147 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
149 XFS_FSS_TO_BB(mp, 1), 0, &bp); 149 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
150 if (error) 150 if (!bp)
151 return error; 151 return EIO;
152 ASSERT(bp);
153 xfs_buf_relse(bp); 152 xfs_buf_relse(bp);
154 153
155 new = nb; /* use new as a temporary here */ 154 new = nb; /* use new as a temporary here */
@@ -375,6 +374,7 @@ xfs_growfs_data_private(
375 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
376 } else 375 } else
377 mp->m_maxicount = 0; 376 mp->m_maxicount = 0;
377 xfs_set_low_space_thresholds(mp);
378 378
379 /* update secondary superblocks. */ 379 /* update secondary superblocks. */
380 for (agno = 1; agno < nagcount; agno++) { 380 for (agno = 1; agno < nagcount; agno++) {
@@ -597,7 +597,8 @@ out:
597 * the extra reserve blocks from the reserve..... 597 * the extra reserve blocks from the reserve.....
598 */ 598 */
599 int error; 599 int error;
600 error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); 600 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
601 fdblks_delta, 0);
601 if (error == ENOSPC) 602 if (error == ENOSPC)
602 goto retry; 603 goto retry;
603 } 604 }
@@ -611,12 +612,13 @@ out:
611 * 612 *
612 * We cannot use an inode here for this - that will push dirty state back up 613 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from 614 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead. 615 * making progress. Hence we log a field in the superblock instead and use a
616 * synchronous transaction to ensure the superblock is immediately unpinned
617 * and can be written back.
615 */ 618 */
616int 619int
617xfs_fs_log_dummy( 620xfs_fs_log_dummy(
618 xfs_mount_t *mp, 621 xfs_mount_t *mp)
619 int flags)
620{ 622{
621 xfs_trans_t *tp; 623 xfs_trans_t *tp;
622 int error; 624 int error;
@@ -631,8 +633,7 @@ xfs_fs_log_dummy(
631 633
632 /* log the UUID because it is an unchanging field */ 634 /* log the UUID because it is an unchanging field */
633 xfs_mod_sb(tp, XFS_SB_UUID); 635 xfs_mod_sb(tp, XFS_SB_UUID);
634 if (flags & SYNC_WAIT) 636 xfs_trans_set_sync(tp);
635 xfs_trans_set_sync(tp);
636 return xfs_trans_commit(tp, 0); 637 return xfs_trans_commit(tp, 0);
637} 638}
638 639
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5371d2dc360e..0626a32c3447 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -212,7 +212,7 @@ xfs_ialloc_inode_init(
212 * to log a whole cluster of inodes instead of all the 212 * to log a whole cluster of inodes instead of all the
213 * individual transactions causing a lot of log traffic. 213 * individual transactions causing a lot of log traffic.
214 */ 214 */
215 xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); 215 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
216 for (i = 0; i < ninodes; i++) { 216 for (i = 0; i < ninodes; i++) {
217 int ioffset = i << mp->m_sb.sb_inodelog; 217 int ioffset = i << mp->m_sb.sb_inodelog;
218 uint isize = sizeof(struct xfs_dinode); 218 uint isize = sizeof(struct xfs_dinode);
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index d352862cefa0..16921f55c542 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -183,38 +183,6 @@ xfs_inobt_key_diff(
183 cur->bc_rec.i.ir_startino; 183 cur->bc_rec.i.ir_startino;
184} 184}
185 185
186STATIC int
187xfs_inobt_kill_root(
188 struct xfs_btree_cur *cur,
189 struct xfs_buf *bp,
190 int level,
191 union xfs_btree_ptr *newroot)
192{
193 int error;
194
195 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
196 XFS_BTREE_STATS_INC(cur, killroot);
197
198 /*
199 * Update the root pointer, decreasing the level by 1 and then
200 * free the old root.
201 */
202 xfs_inobt_set_root(cur, newroot, -1);
203 error = xfs_inobt_free_block(cur, bp);
204 if (error) {
205 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
206 return error;
207 }
208
209 XFS_BTREE_STATS_INC(cur, free);
210
211 cur->bc_bufs[level] = NULL;
212 cur->bc_nlevels--;
213
214 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
215 return 0;
216}
217
218#ifdef DEBUG 186#ifdef DEBUG
219STATIC int 187STATIC int
220xfs_inobt_keys_inorder( 188xfs_inobt_keys_inorder(
@@ -309,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
309 277
310 .dup_cursor = xfs_inobt_dup_cursor, 278 .dup_cursor = xfs_inobt_dup_cursor,
311 .set_root = xfs_inobt_set_root, 279 .set_root = xfs_inobt_set_root,
312 .kill_root = xfs_inobt_kill_root,
313 .alloc_block = xfs_inobt_alloc_block, 280 .alloc_block = xfs_inobt_alloc_block,
314 .free_block = xfs_inobt_free_block, 281 .free_block = xfs_inobt_free_block,
315 .get_minrecs = xfs_inobt_get_minrecs, 282 .get_minrecs = xfs_inobt_get_minrecs,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b1ecc6f97ade..cb9b6d1469f7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
 48 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
49 * guarantee the locks are considered the same when there are multiple lock
 50 * initialisation sites. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,12 +99,20 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
105STATIC void
106xfs_inode_free_callback(
107 struct rcu_head *head)
108{
109 struct inode *inode = container_of(head, struct inode, i_rcu);
110 struct xfs_inode *ip = XFS_I(inode);
111
112 INIT_LIST_HEAD(&inode->i_dentry);
113 kmem_zone_free(xfs_inode_zone, ip);
114}
115
94void 116void
95xfs_inode_free( 117xfs_inode_free(
96 struct xfs_inode *ip) 118 struct xfs_inode *ip)
@@ -134,7 +156,18 @@ xfs_inode_free(
134 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
135 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
136 158
137 kmem_zone_free(xfs_inode_zone, ip); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138} 171}
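The invalidate-then-call_rcu() protocol above only works because every lookup path revalidates under the same spinlock, as the xfs_iget_cache_hit() and xfs_ifree_cluster() hunks below do. A condensed sketch of both halves with illustrative types; cached_free_cb() is a hypothetical RCU callback:

#include <linux/spinlock.h>
#include <linux/rcupdate.h>

struct cached {
	spinlock_t	lock;
	unsigned long	key;	/* 0 means "being freed", like ip->i_ino */
	struct rcu_head	rcu;
};

/* free side: invalidate the key before the grace period starts */
static void
cached_free(struct cached *c)
{
	spin_lock(&c->lock);
	c->key = 0;		/* lookups must reject us from now on */
	spin_unlock(&c->lock);
	call_rcu(&c->rcu, cached_free_cb);	/* hypothetical callback */
}

/* lookup side: a stale pointer is legal until the grace period ends */
static bool
cached_valid(struct cached *c, unsigned long key)
{
	bool ok;

	spin_lock(&c->lock);
	ok = (c->key == key);	/* freed or reallocated otherwise */
	spin_unlock(&c->lock);
	return ok;
}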
139 172
140/* 173/*
@@ -144,14 +177,29 @@ static int
144xfs_iget_cache_hit( 177xfs_iget_cache_hit(
145 struct xfs_perag *pag, 178 struct xfs_perag *pag,
146 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
147 int flags, 181 int flags,
148 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
149{ 183{
150 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
151 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
152 int error; 186 int error;
153 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
154 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
155 203
156 /* 204 /*
157 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -194,7 +242,7 @@ xfs_iget_cache_hit(
194 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
195 243
196 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
197 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
198 246
199 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
200 if (error) { 248 if (error) {
@@ -202,7 +250,7 @@ xfs_iget_cache_hit(
202 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
203 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
204 */ 252 */
205 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
206 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
207 255
208 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~XFS_INEW;
@@ -212,14 +260,20 @@ xfs_iget_cache_hit(
212 goto out_error; 260 goto out_error;
213 } 261 }
214 262
215 write_lock(&pag->pag_ici_lock); 263 spin_lock(&pag->pag_ici_lock);
216 spin_lock(&ip->i_flags_lock); 264 spin_lock(&ip->i_flags_lock);
217 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 265 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
218 ip->i_flags |= XFS_INEW; 266 ip->i_flags |= XFS_INEW;
219 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 267 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
220 inode->i_state = I_NEW; 268 inode->i_state = I_NEW;
269
270 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
271 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
272 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
273 &xfs_iolock_active, "xfs_iolock_active");
274
221 spin_unlock(&ip->i_flags_lock); 275 spin_unlock(&ip->i_flags_lock);
222 write_unlock(&pag->pag_ici_lock); 276 spin_unlock(&pag->pag_ici_lock);
223 } else { 277 } else {
224 /* If the VFS inode is being torn down, pause and try again. */ 278 /* If the VFS inode is being torn down, pause and try again. */
225 if (!igrab(inode)) { 279 if (!igrab(inode)) {
@@ -230,7 +284,7 @@ xfs_iget_cache_hit(
230 284
231 /* We've got a live one. */ 285 /* We've got a live one. */
232 spin_unlock(&ip->i_flags_lock); 286 spin_unlock(&ip->i_flags_lock);
233 read_unlock(&pag->pag_ici_lock); 287 rcu_read_unlock();
234 trace_xfs_iget_hit(ip); 288 trace_xfs_iget_hit(ip);
235 } 289 }
236 290
@@ -244,7 +298,7 @@ xfs_iget_cache_hit(
244 298
245out_error: 299out_error:
246 spin_unlock(&ip->i_flags_lock); 300 spin_unlock(&ip->i_flags_lock);
247 read_unlock(&pag->pag_ici_lock); 301 rcu_read_unlock();
248 return error; 302 return error;
249} 303}
250 304
@@ -297,7 +351,7 @@ xfs_iget_cache_miss(
297 BUG(); 351 BUG();
298 } 352 }
299 353
300 write_lock(&pag->pag_ici_lock); 354 spin_lock(&pag->pag_ici_lock);
301 355
302 /* insert the new inode */ 356 /* insert the new inode */
303 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 357 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -312,14 +366,14 @@ xfs_iget_cache_miss(
312 ip->i_udquot = ip->i_gdquot = NULL; 366 ip->i_udquot = ip->i_gdquot = NULL;
313 xfs_iflags_set(ip, XFS_INEW); 367 xfs_iflags_set(ip, XFS_INEW);
314 368
315 write_unlock(&pag->pag_ici_lock); 369 spin_unlock(&pag->pag_ici_lock);
316 radix_tree_preload_end(); 370 radix_tree_preload_end();
317 371
318 *ipp = ip; 372 *ipp = ip;
319 return 0; 373 return 0;
320 374
321out_preload_end: 375out_preload_end:
322 write_unlock(&pag->pag_ici_lock); 376 spin_unlock(&pag->pag_ici_lock);
323 radix_tree_preload_end(); 377 radix_tree_preload_end();
324 if (lock_flags) 378 if (lock_flags)
325 xfs_iunlock(ip, lock_flags); 379 xfs_iunlock(ip, lock_flags);
@@ -365,8 +419,8 @@ xfs_iget(
365 xfs_perag_t *pag; 419 xfs_perag_t *pag;
366 xfs_agino_t agino; 420 xfs_agino_t agino;
367 421
368 /* the radix tree exists only in inode capable AGs */ 422 /* reject inode numbers outside existing AGs */
369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) 423 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
370 return EINVAL; 424 return EINVAL;
371 425
372 /* get the perag structure and ensure that it's inode capable */ 426 /* get the perag structure and ensure that it's inode capable */
@@ -375,15 +429,15 @@ xfs_iget(
375 429
376again: 430again:
377 error = 0; 431 error = 0;
378 read_lock(&pag->pag_ici_lock); 432 rcu_read_lock();
379 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 433 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
380 434
381 if (ip) { 435 if (ip) {
382 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 436 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
383 if (error) 437 if (error)
384 goto out_error_or_again; 438 goto out_error_or_again;
385 } else { 439 } else {
386 read_unlock(&pag->pag_ici_lock); 440 rcu_read_unlock();
387 XFS_STATS_INC(xs_ig_missed); 441 XFS_STATS_INC(xs_ig_missed);
388 442
389 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 443 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34798f391c49..be7cf625421f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -660,7 +660,8 @@ xfs_dinode_from_disk(
660 to->di_uid = be32_to_cpu(from->di_uid); 660 to->di_uid = be32_to_cpu(from->di_uid);
661 to->di_gid = be32_to_cpu(from->di_gid); 661 to->di_gid = be32_to_cpu(from->di_gid);
662 to->di_nlink = be32_to_cpu(from->di_nlink); 662 to->di_nlink = be32_to_cpu(from->di_nlink);
663 to->di_projid = be16_to_cpu(from->di_projid); 663 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
664 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
664 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 665 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
665 to->di_flushiter = be16_to_cpu(from->di_flushiter); 666 to->di_flushiter = be16_to_cpu(from->di_flushiter);
666 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); 667 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
@@ -695,7 +696,8 @@ xfs_dinode_to_disk(
695 to->di_uid = cpu_to_be32(from->di_uid); 696 to->di_uid = cpu_to_be32(from->di_uid);
696 to->di_gid = cpu_to_be32(from->di_gid); 697 to->di_gid = cpu_to_be32(from->di_gid);
697 to->di_nlink = cpu_to_be32(from->di_nlink); 698 to->di_nlink = cpu_to_be32(from->di_nlink);
698 to->di_projid = cpu_to_be16(from->di_projid); 699 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
700 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
699 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 701 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
700 to->di_flushiter = cpu_to_be16(from->di_flushiter); 702 to->di_flushiter = cpu_to_be16(from->di_flushiter);
701 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 703 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
@@ -874,7 +876,7 @@ xfs_iread(
874 if (ip->i_d.di_version == 1) { 876 if (ip->i_d.di_version == 1) {
875 ip->i_d.di_nlink = ip->i_d.di_onlink; 877 ip->i_d.di_nlink = ip->i_d.di_onlink;
876 ip->i_d.di_onlink = 0; 878 ip->i_d.di_onlink = 0;
877 ip->i_d.di_projid = 0; 879 xfs_set_projid(ip, 0);
878 } 880 }
879 881
880 ip->i_delayed_blks = 0; 882 ip->i_delayed_blks = 0;
@@ -885,7 +887,7 @@ xfs_iread(
885 * around for a while. This helps to keep recently accessed 887 * around for a while. This helps to keep recently accessed
886 * meta-data in-core longer. 888 * meta-data in-core longer.
887 */ 889 */
888 XFS_BUF_SET_REF(bp, XFS_INO_REF); 890 xfs_buf_set_ref(bp, XFS_INO_REF);
889 891
890 /* 892 /*
891 * Use xfs_trans_brelse() to release the buffer containing the 893 * Use xfs_trans_brelse() to release the buffer containing the
@@ -982,8 +984,7 @@ xfs_ialloc(
982 mode_t mode, 984 mode_t mode,
983 xfs_nlink_t nlink, 985 xfs_nlink_t nlink,
984 xfs_dev_t rdev, 986 xfs_dev_t rdev,
985 cred_t *cr, 987 prid_t prid,
986 xfs_prid_t prid,
987 int okalloc, 988 int okalloc,
988 xfs_buf_t **ialloc_context, 989 xfs_buf_t **ialloc_context,
989 boolean_t *call_again, 990 boolean_t *call_again,
@@ -1027,7 +1028,7 @@ xfs_ialloc(
1027 ASSERT(ip->i_d.di_nlink == nlink); 1028 ASSERT(ip->i_d.di_nlink == nlink);
1028 ip->i_d.di_uid = current_fsuid(); 1029 ip->i_d.di_uid = current_fsuid();
1029 ip->i_d.di_gid = current_fsgid(); 1030 ip->i_d.di_gid = current_fsgid();
1030 ip->i_d.di_projid = prid; 1031 xfs_set_projid(ip, prid);
1031 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1032 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1032 1033
1033 /* 1034 /*
@@ -1999,17 +2000,33 @@ xfs_ifree_cluster(
1999 */ 2000 */
2000 for (i = 0; i < ninodes; i++) { 2001 for (i = 0; i < ninodes; i++) {
2001retry: 2002retry:
2002 read_lock(&pag->pag_ici_lock); 2003 rcu_read_lock();
2003 ip = radix_tree_lookup(&pag->pag_ici_root, 2004 ip = radix_tree_lookup(&pag->pag_ici_root,
2004 XFS_INO_TO_AGINO(mp, (inum + i))); 2005 XFS_INO_TO_AGINO(mp, (inum + i)));
2005 2006
2006 /* Inode not in memory or stale, nothing to do */ 2007 /* Inode not in memory, nothing to do */
2007 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2008 if (!ip) {
2008 read_unlock(&pag->pag_ici_lock); 2009 rcu_read_unlock();
2009 continue; 2010 continue;
2010 } 2011 }
2011 2012
2012 /* 2013 /*
2014 * because this is an RCU protected lookup, we could
2015 * find a recently freed or even reallocated inode
2016 * during the lookup. We need to check under the
2017 * i_flags_lock for a valid inode here. Skip it if it
2018 * is not valid, the wrong inode or stale.
2019 */
2020 spin_lock(&ip->i_flags_lock);
2021 if (ip->i_ino != inum + i ||
2022 __xfs_iflags_test(ip, XFS_ISTALE)) {
2023 spin_unlock(&ip->i_flags_lock);
2024 rcu_read_unlock();
2025 continue;
2026 }
2027 spin_unlock(&ip->i_flags_lock);
2028
2029 /*
2013 * Don't try to lock/unlock the current inode, but we 2030 * Don't try to lock/unlock the current inode, but we
2014 * _cannot_ skip the other inodes that we did not find 2031 * _cannot_ skip the other inodes that we did not find
2015 * in the list attached to the buffer and are not 2032 * in the list attached to the buffer and are not
@@ -2018,11 +2035,11 @@ retry:
2018 */ 2035 */
2019 if (ip != free_ip && 2036 if (ip != free_ip &&
2020 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2037 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2021 read_unlock(&pag->pag_ici_lock); 2038 rcu_read_unlock();
2022 delay(1); 2039 delay(1);
2023 goto retry; 2040 goto retry;
2024 } 2041 }
2025 read_unlock(&pag->pag_ici_lock); 2042 rcu_read_unlock();
2026 2043
2027 xfs_iflock(ip); 2044 xfs_iflock(ip);
2028 xfs_iflags_set(ip, XFS_ISTALE); 2045 xfs_iflags_set(ip, XFS_ISTALE);
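Both RCU conversions in this patch follow the same shape: an unlocked radix tree lookup under rcu_read_lock(), then revalidation of the candidate inode under its i_flags_lock before anything trusts it. Condensed to the bare idiom (an illustrative sketch of the code above; target_ino stands in for inum + i):

        rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root,
                               XFS_INO_TO_AGINO(mp, target_ino));
        if (!ip) {
                rcu_read_unlock();      /* not in cache: nothing to do */
                continue;
        }

        /*
         * The inode may have been freed, or freed and reallocated, between
         * the lookup and here; only i_flags_lock makes the identity check
         * and the stale check stable.
         */
        spin_lock(&ip->i_flags_lock);
        if (ip->i_ino != target_ino || __xfs_iflags_test(ip, XFS_ISTALE)) {
                spin_unlock(&ip->i_flags_lock);
                rcu_read_unlock();      /* wrong, recycled or dying inode */
                continue;
        }
        spin_unlock(&ip->i_flags_lock);
        /* ... take the real inode locks, then drop rcu_read_lock() ... */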
@@ -2628,7 +2645,7 @@ xfs_iflush_cluster(
2628 2645
2629 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2646 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2630 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2647 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2631 read_lock(&pag->pag_ici_lock); 2648 rcu_read_lock();
2632 /* really need a gang lookup range call here */ 2649 /* really need a gang lookup range call here */
2633 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2650 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2634 first_index, inodes_per_cluster); 2651 first_index, inodes_per_cluster);
@@ -2639,9 +2656,21 @@ xfs_iflush_cluster(
2639 iq = ilist[i]; 2656 iq = ilist[i];
2640 if (iq == ip) 2657 if (iq == ip)
2641 continue; 2658 continue;
2642 /* if the inode lies outside this cluster, we're done. */ 2659
2643 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2660 /*
2644 break; 2661 * because this is an RCU protected lookup, we could find a
2662 * recently freed or even reallocated inode during the lookup.
2663 * We need to check under the i_flags_lock for a valid inode
2664 * here. Skip it if it is not valid or the wrong inode.
2665 */
 2666 spin_lock(&iq->i_flags_lock);
 2667 if (!iq->i_ino ||
 2668 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
 2669 spin_unlock(&iq->i_flags_lock);
 2670 continue;
 2671 }
 2672 spin_unlock(&iq->i_flags_lock);
2673
2645 /* 2674 /*
2646 * Do an un-protected check to see if the inode is dirty and 2675 * Do an un-protected check to see if the inode is dirty and
2647 * is a candidate for flushing. These checks will be repeated 2676 * is a candidate for flushing. These checks will be repeated
@@ -2691,7 +2720,7 @@ xfs_iflush_cluster(
2691 } 2720 }
2692 2721
2693out_free: 2722out_free:
2694 read_unlock(&pag->pag_ici_lock); 2723 rcu_read_unlock();
2695 kmem_free(ilist); 2724 kmem_free(ilist);
2696out_put: 2725out_put:
2697 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
@@ -2703,7 +2732,7 @@ cluster_corrupt_out:
2703 * Corruption detected in the clustering loop. Invalidate the 2732 * Corruption detected in the clustering loop. Invalidate the
2704 * inode buffer and shut down the filesystem. 2733 * inode buffer and shut down the filesystem.
2705 */ 2734 */
2706 read_unlock(&pag->pag_ici_lock); 2735 rcu_read_unlock();
2707 /* 2736 /*
2708 * Clean up the buffer. If it was B_DELWRI, just release it -- 2737 * Clean up the buffer. If it was B_DELWRI, just release it --
2709 * brelse can handle it with no problems. If not, shut down the 2738 * brelse can handle it with no problems. If not, shut down the
@@ -2725,7 +2754,7 @@ cluster_corrupt_out:
2725 XFS_BUF_UNDONE(bp); 2754 XFS_BUF_UNDONE(bp);
2726 XFS_BUF_STALE(bp); 2755 XFS_BUF_STALE(bp);
2727 XFS_BUF_ERROR(bp,EIO); 2756 XFS_BUF_ERROR(bp,EIO);
2728 xfs_biodone(bp); 2757 xfs_buf_ioend(bp, 0);
2729 } else { 2758 } else {
2730 XFS_BUF_STALE(bp); 2759 XFS_BUF_STALE(bp);
2731 xfs_buf_relse(bp); 2760 xfs_buf_relse(bp);
@@ -3008,7 +3037,7 @@ xfs_iflush_int(
3008 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3037 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3009 memset(&(dip->di_pad[0]), 0, 3038 memset(&(dip->di_pad[0]), 0,
3010 sizeof(dip->di_pad)); 3039 sizeof(dip->di_pad));
3011 ASSERT(ip->i_d.di_projid == 0); 3040 ASSERT(xfs_get_projid(ip) == 0);
3012 } 3041 }
3013 } 3042 }
3014 3043
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0898c5417d12..5c95fa8ec11d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -134,8 +134,9 @@ typedef struct xfs_icdinode {
134 __uint32_t di_uid; /* owner's user id */ 134 __uint32_t di_uid; /* owner's user id */
135 __uint32_t di_gid; /* owner's group id */ 135 __uint32_t di_gid; /* owner's group id */
136 __uint32_t di_nlink; /* number of links to file */ 136 __uint32_t di_nlink; /* number of links to file */
137 __uint16_t di_projid; /* owner's project id */ 137 __uint16_t di_projid_lo; /* lower part of owner's project id */
138 __uint8_t di_pad[8]; /* unused, zeroed space */ 138 __uint16_t di_projid_hi; /* higher part of owner's project id */
139 __uint8_t di_pad[6]; /* unused, zeroed space */
139 __uint16_t di_flushiter; /* incremented on flush */ 140 __uint16_t di_flushiter; /* incremented on flush */
140 xfs_ictimestamp_t di_atime; /* time last accessed */ 141 xfs_ictimestamp_t di_atime; /* time last accessed */
141 xfs_ictimestamp_t di_mtime; /* time last modified */ 142 xfs_ictimestamp_t di_mtime; /* time last modified */
@@ -212,7 +213,6 @@ typedef struct xfs_icdinode {
212#ifdef __KERNEL__ 213#ifdef __KERNEL__
213 214
214struct bhv_desc; 215struct bhv_desc;
215struct cred;
216struct xfs_buf; 216struct xfs_buf;
217struct xfs_bmap_free; 217struct xfs_bmap_free;
218struct xfs_bmbt_irec; 218struct xfs_bmbt_irec;
@@ -335,6 +335,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
335} 335}
336 336
337/* 337/*
338 * Project quota id helpers (previously projid was 16bit only
 339 * and using two 16bit values to hold the new 32bit projid was chosen
340 * to retain compatibility with "old" filesystems).
341 */
342static inline prid_t
343xfs_get_projid(struct xfs_inode *ip)
344{
345 return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
346}
347
348static inline void
349xfs_set_projid(struct xfs_inode *ip,
350 prid_t projid)
351{
352 ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
353 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
354}
355
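The two helpers are exact inverses, so a 32-bit project id survives the hi/lo split unchanged; keeping the low half in the old di_projid slot (with di_pad shrinking from 8 to 6 bytes to make room for the high half) is what preserves the on-disk layout. A minimal standalone round-trip check, with plain C99 types standing in for prid_t and __uint16_t:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t projid = 0x00012345;              /* example 32-bit project id */
        uint16_t hi = (uint16_t)(projid >> 16);    /* di_projid_hi: 0x0001 */
        uint16_t lo = (uint16_t)(projid & 0xffff); /* di_projid_lo: 0x2345 */

        /* xfs_get_projid() equivalent: recombine the two halves */
        assert(((uint32_t)hi << 16 | lo) == projid);
        return 0;
}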
356/*
338 * Manage the i_flush queue embedded in the inode. This completion 357 * Manage the i_flush queue embedded in the inode. This completion
339 * queue synchronizes processes attempting to flush the in-core 358 * queue synchronizes processes attempting to flush the in-core
340 * inode back to disk. 359 * inode back to disk.
@@ -357,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
357/* 376/*
358 * In-core inode flags. 377 * In-core inode flags.
359 */ 378 */
360#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 379#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
361#define XFS_ISTALE 0x0002 /* inode has been staled */ 380#define XFS_ISTALE 0x0002 /* inode has been staled */
362#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
363#define XFS_INEW 0x0008 /* inode has just been allocated */ 382#define XFS_INEW 0x0008 /* inode has just been allocated */
364#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
365#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
385#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
366 386
367/* 387/*
368 * Flags for inode locking. 388 * Flags for inode locking.
@@ -419,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
419#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 439#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 440#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
421 441
442extern struct lock_class_key xfs_iolock_reclaimable;
443
422/* 444/*
423 * Flags for xfs_itruncate_start(). 445 * Flags for xfs_itruncate_start().
424 */ 446 */
@@ -456,8 +478,8 @@ void xfs_inode_free(struct xfs_inode *ip);
456 * xfs_inode.c prototypes. 478 * xfs_inode.c prototypes.
457 */ 479 */
458int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 480int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
459 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, 481 xfs_nlink_t, xfs_dev_t, prid_t, int,
460 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 482 struct xfs_buf **, boolean_t *, xfs_inode_t **);
461 483
462uint xfs_ip2xflags(struct xfs_inode *); 484uint xfs_ip2xflags(struct xfs_inode *);
463uint xfs_dic2xflags(struct xfs_dinode *); 485uint xfs_dic2xflags(struct xfs_dinode *);
@@ -471,7 +493,6 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
471void xfs_iext_realloc(xfs_inode_t *, int, int); 493void xfs_iext_realloc(xfs_inode_t *, int, int);
472void xfs_iunpin_wait(xfs_inode_t *); 494void xfs_iunpin_wait(xfs_inode_t *);
473int xfs_iflush(xfs_inode_t *, uint); 495int xfs_iflush(xfs_inode_t *, uint);
474void xfs_ichgtime(xfs_inode_t *, int);
475void xfs_lock_inodes(xfs_inode_t **, int, uint); 496void xfs_lock_inodes(xfs_inode_t **, int, uint);
476void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 497void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
477 498
@@ -482,7 +503,7 @@ void xfs_mark_inode_dirty_sync(xfs_inode_t *);
482#define IHOLD(ip) \ 503#define IHOLD(ip) \
483do { \ 504do { \
484 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 505 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
485 atomic_inc(&(VFS_I(ip)->i_count)); \ 506 ihold(VFS_I(ip)); \
486 trace_xfs_ihold(ip, _THIS_IP_); \ 507 trace_xfs_ihold(ip, _THIS_IP_); \
487} while (0) 508} while (0)
488 509
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fe00777e2796..fd4f398bd6f1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -223,15 +223,6 @@ xfs_inode_item_format(
223 nvecs = 1; 223 nvecs = 1;
224 224
225 /* 225 /*
226 * Make sure the linux inode is dirty. We do this before
227 * clearing i_update_core as the VFS will call back into
228 * XFS here and set i_update_core, so we need to dirty the
229 * inode first so that the ordering of i_update_core and
230 * unlogged modifications still works as described below.
231 */
232 xfs_mark_inode_dirty_sync(ip);
233
234 /*
235 * Clear i_update_core if the timestamps (or any other 226 * Clear i_update_core if the timestamps (or any other
236 * non-transactional modification) need flushing/logging 227 * non-transactional modification) need flushing/logging
237 * and we're about to log them with the rest of the core. 228 * and we're about to log them with the rest of the core.
@@ -666,18 +657,37 @@ xfs_inode_item_unlock(
666} 657}
667 658
668/* 659/*
669 * This is called to find out where the oldest active copy of the 660 * This is called to find out where the oldest active copy of the inode log
670 * inode log item in the on disk log resides now that the last log 661 * item in the on disk log resides now that the last log write of it completed
671 * write of it completed at the given lsn. Since we always re-log 662 * at the given lsn. Since we always re-log all dirty data in an inode, the
672 * all dirty data in an inode, the latest copy in the on disk log 663 * latest copy in the on disk log is the only one that matters. Therefore,
673 * is the only one that matters. Therefore, simply return the 664 * simply return the given lsn.
674 * given lsn. 665 *
666 * If the inode has been marked stale because the cluster is being freed, we
667 * don't want to (re-)insert this inode into the AIL. There is a race condition
668 * where the cluster buffer may be unpinned before the inode is inserted into
669 * the AIL during transaction committed processing. If the buffer is unpinned
670 * before the inode item has been committed and inserted, then it is possible
 671 * for the buffer to be written and its IO to complete before the inode is inserted
672 * into the AIL. In that case, we'd be inserting a clean, stale inode into the
673 * AIL which will never get removed. It will, however, get reclaimed which
 674 * triggers an assert in xfs_inode_free() complaining about freeing an inode
675 * still in the AIL.
676 *
677 * To avoid this, return a lower LSN than the one passed in so that the
678 * transaction committed code will not move the inode forward in the AIL but
679 * will still unpin it properly.
675 */ 680 */
676STATIC xfs_lsn_t 681STATIC xfs_lsn_t
677xfs_inode_item_committed( 682xfs_inode_item_committed(
678 struct xfs_log_item *lip, 683 struct xfs_log_item *lip,
679 xfs_lsn_t lsn) 684 xfs_lsn_t lsn)
680{ 685{
686 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
687 struct xfs_inode *ip = iip->ili_inode;
688
689 if (xfs_iflags_test(ip, XFS_ISTALE))
690 return lsn - 1;
681 return lsn; 691 return lsn;
682} 692}
683 693
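The lsn - 1 return works because transaction-committed processing only moves an item forward in the AIL when the returned LSN is newer than the item's current position. A simplified model of that caller (a hypothetical shape inferred from the comment above, not the actual xfs_trans code):

        /* after the commit record reaches the log ... */
        item_lsn = IOP_COMMITTED(lip, commit_lsn);  /* lsn - 1 if stale */

        spin_lock(&ailp->xa_lock);
        if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
                xfs_trans_ail_update(ailp, lip, item_lsn); /* move forward */
        else
                spin_unlock(&ailp->xa_lock);    /* stale: leave the AIL alone */
        IOP_UNPIN(lip);                         /* the unpin still happens */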
@@ -832,15 +842,64 @@ xfs_inode_item_destroy(
832 * flushed to disk. It is responsible for removing the inode item 842 * flushed to disk. It is responsible for removing the inode item
833 * from the AIL if it has not been re-logged, and unlocking the inode's 843 * from the AIL if it has not been re-logged, and unlocking the inode's
834 * flush lock. 844 * flush lock.
845 *
846 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
847 * list for other inodes that will run this function. We remove them from the
848 * buffer list so we can process all the inode IO completions in one AIL lock
849 * traversal.
835 */ 850 */
836void 851void
837xfs_iflush_done( 852xfs_iflush_done(
838 struct xfs_buf *bp, 853 struct xfs_buf *bp,
839 struct xfs_log_item *lip) 854 struct xfs_log_item *lip)
840{ 855{
841 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 856 struct xfs_inode_log_item *iip;
842 xfs_inode_t *ip = iip->ili_inode; 857 struct xfs_log_item *blip;
858 struct xfs_log_item *next;
859 struct xfs_log_item *prev;
843 struct xfs_ail *ailp = lip->li_ailp; 860 struct xfs_ail *ailp = lip->li_ailp;
861 int need_ail = 0;
862
863 /*
864 * Scan the buffer IO completions for other inodes being completed and
865 * attach them to the current inode log item.
866 */
867 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
868 prev = NULL;
869 while (blip != NULL) {
 870 if (blip->li_cb != xfs_iflush_done) {
871 prev = blip;
872 blip = blip->li_bio_list;
873 continue;
874 }
875
876 /* remove from list */
877 next = blip->li_bio_list;
878 if (!prev) {
879 XFS_BUF_SET_FSPRIVATE(bp, next);
880 } else {
881 prev->li_bio_list = next;
882 }
883
884 /* add to current list */
885 blip->li_bio_list = lip->li_bio_list;
886 lip->li_bio_list = blip;
887
888 /*
889 * while we have the item, do the unlocked check for needing
890 * the AIL lock.
891 */
892 iip = INODE_ITEM(blip);
893 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
894 need_ail++;
895
896 blip = next;
897 }
898
899 /* make sure we capture the state of the initial inode. */
900 iip = INODE_ITEM(lip);
901 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
902 need_ail++;
844 903
845 /* 904 /*
846 * We only want to pull the item from the AIL if it is 905 * We only want to pull the item from the AIL if it is
@@ -851,28 +910,37 @@ xfs_iflush_done(
851 * the lock since it's cheaper, and then we recheck while 910 * the lock since it's cheaper, and then we recheck while
852 * holding the lock before removing the inode from the AIL. 911 * holding the lock before removing the inode from the AIL.
853 */ 912 */
854 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 913 if (need_ail) {
914 struct xfs_log_item *log_items[need_ail];
915 int i = 0;
855 spin_lock(&ailp->xa_lock); 916 spin_lock(&ailp->xa_lock);
856 if (lip->li_lsn == iip->ili_flush_lsn) { 917 for (blip = lip; blip; blip = blip->li_bio_list) {
857 /* xfs_trans_ail_delete() drops the AIL lock. */ 918 iip = INODE_ITEM(blip);
858 xfs_trans_ail_delete(ailp, lip); 919 if (iip->ili_logged &&
859 } else { 920 blip->li_lsn == iip->ili_flush_lsn) {
860 spin_unlock(&ailp->xa_lock); 921 log_items[i++] = blip;
922 }
923 ASSERT(i <= need_ail);
861 } 924 }
925 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
926 xfs_trans_ail_delete_bulk(ailp, log_items, i);
862 } 927 }
863 928
864 iip->ili_logged = 0;
865 929
866 /* 930 /*
 867 * Clear the ili_last_fields bits now that we know that the 931 * clean up and unlock the flush lock now that we are done. We can clear the
868 * data corresponding to them is safely on disk. 932 * ili_last_fields bits now that we know that the data corresponding to
933 * them is safely on disk.
869 */ 934 */
870 iip->ili_last_fields = 0; 935 for (blip = lip; blip; blip = next) {
936 next = blip->li_bio_list;
937 blip->li_bio_list = NULL;
871 938
872 /* 939 iip = INODE_ITEM(blip);
873 * Release the inode's flush lock since we're done with it. 940 iip->ili_logged = 0;
874 */ 941 iip->ili_last_fields = 0;
875 xfs_ifunlock(ip); 942 xfs_ifunlock(iip->ili_inode);
943 }
876} 944}
877 945
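The walk above unlinks every inode-flush completion from the buffer's singly linked li_bio_list and collects it on the current log item, so a single AIL traversal can retire all of them. A standalone model of that unlink-and-collect pass (generic node type, not the xfs_log_item layout):

#include <assert.h>
#include <stddef.h>

struct item {
        struct item *next;
        int matches;            /* stands in for li_cb == xfs_iflush_done */
};

/* Unlink every matching item from *headp and collect it on a private list. */
static struct item *steal_matching(struct item **headp)
{
        struct item *it = *headp, *prev = NULL, *stolen = NULL;

        while (it) {
                struct item *next = it->next;

                if (!it->matches) {
                        prev = it;              /* leave it on the buffer list */
                } else {
                        if (prev)               /* unlink from the buffer list */
                                prev->next = next;
                        else
                                *headp = next;
                        it->next = stolen;      /* push onto the private list */
                        stolen = it;
                }
                it = next;
        }
        return stolen;
}

int main(void)
{
        struct item c = { NULL, 1 }, b = { &c, 0 }, a = { &b, 1 };
        struct item *head = &a;
        struct item *got = steal_matching(&head);

        assert(head == &b && b.next == NULL);   /* only b stays behind */
        assert(got == &c && got->next == &a);   /* c and a collected */
        return 0;
}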
878/* 946/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369f..8a0f044750c3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
 272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
 327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
347 rounddown_pow_of_two(alloc_blocks));
348
349 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
350 freesp = mp->m_sb.sb_fdblocks;
351 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
352 shift = 2;
353 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
354 shift++;
355 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
356 shift++;
357 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
358 shift++;
359 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
360 shift++;
361 }
362 if (shift)
363 alloc_blocks >>= shift;
364 }
365
366 if (alloc_blocks < mp->m_writeio_blocks)
367 alloc_blocks = mp->m_writeio_blocks;
368
369 return alloc_blocks;
370}
371
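To make the scaling concrete, here is a standalone model of the sizing logic: preallocation starts at the largest power of two at or below the file size and is throttled as free space falls through the low-space thresholds. The free-space percentage and the omitted MAXEXTLEN cap are simplifications, not the mount-structure fields:

#include <stdint.h>
#include <stdio.h>

/* largest power of two <= v, for v >= 1 (models rounddown_pow_of_two()) */
static uint64_t rounddown_pow2(uint64_t v)
{
        uint64_t r = 1;

        while (r * 2 <= v)
                r *= 2;
        return r;
}

int main(void)
{
        uint64_t isize_blocks = 200000; /* i_size in fs blocks (example) */
        double free_pct = 3.0;          /* free space left in the filesystem */
        uint64_t alloc_blocks = rounddown_pow2(isize_blocks + 1); /* 131072 */
        int shift = 0;

        /* first threshold quarters the preallocation, each one after halves it */
        if (free_pct < 5) shift = 2;
        if (free_pct < 4) shift++;
        if (free_pct < 3) shift++;
        if (free_pct < 2) shift++;
        if (free_pct < 1) shift++;

        alloc_blocks >>= shift;         /* 131072 >> 3 = 16384 blocks */
        printf("preallocate %llu blocks\n", (unsigned long long)alloc_blocks);
        return 0;
}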
372int
439xfs_iomap_write_delay( 373xfs_iomap_write_delay(
440 xfs_inode_t *ip, 374 xfs_inode_t *ip,
441 xfs_off_t offset, 375 xfs_off_t offset,
442 size_t count, 376 size_t count,
443 int ioflag, 377 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 378{
447 xfs_mount_t *mp = ip->i_mount; 379 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 380 xfs_fileoff_t offset_fsb;
@@ -469,16 +401,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 401 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 402 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 403
404
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 405 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 406 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 407 if (error)
475 return error; 408 return error;
476 409
477retry: 410retry:
478 if (prealloc) { 411 if (prealloc) {
412 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
413
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 414 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 415 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 416 last_fsb = ioalign + alloc_blocks;
482 } else { 417 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 418 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 419 }
@@ -496,22 +431,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 431 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 432 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 433 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 434 switch (error) {
435 case 0:
436 case ENOSPC:
437 case EDQUOT:
438 break;
439 default:
500 return XFS_ERROR(error); 440 return XFS_ERROR(error);
441 }
501 442
502 /* 443 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 444 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
 504 * then we must have run out of space - flush all other inodes with 445 * ENOSPC, flush all other inodes with delalloc blocks to free up
 505 * delalloc blocks and retry without EOF preallocation. 446 * some of the excess reserved metadata space. For both cases, retry
 447 * without EOF preallocation.
506 */ 448 */
507 if (nimaps == 0) { 449 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 450 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 451 if (flushed)
510 return XFS_ERROR(ENOSPC); 452 return XFS_ERROR(error ? error : ENOSPC);
511 453
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 454 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 455 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 456 xfs_flush_inodes(ip);
457 xfs_ilock(ip, XFS_ILOCK_EXCL);
458 }
515 459
516 flushed = 1; 460 flushed = 1;
517 error = 0; 461 error = 0;
@@ -523,8 +467,6 @@ retry:
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
524 468
525 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 470 return 0;
529} 471}
530 472
@@ -538,13 +480,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 480 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 481 * guarantee is that whatever we allocate fills the required range.
540 */ 482 */
541STATIC int 483int
542xfs_iomap_write_allocate( 484xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 485 xfs_inode_t *ip,
544 xfs_off_t offset, 486 xfs_off_t offset,
545 size_t count, 487 size_t count,
546 xfs_bmbt_irec_t *imap, 488 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 489{
549 xfs_mount_t *mp = ip->i_mount; 490 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 491 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +498,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 498 int error = 0;
558 int nres; 499 int nres;
559 500
560 *retmap = 0;
561
562 /* 501 /*
563 * Make sure that the dquots are there. 502 * Make sure that the dquots are there.
564 */ 503 */
@@ -680,7 +619,6 @@ xfs_iomap_write_allocate(
680 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 621 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 622 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 623 return 0;
686 } 624 }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50d..80615760959a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7e3626e5925c..dc1882adaf54 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -92,7 +92,8 @@ xfs_bulkstat_one_int(
92 * further change. 92 * further change.
93 */ 93 */
94 buf->bs_nlink = dic->di_nlink; 94 buf->bs_nlink = dic->di_nlink;
95 buf->bs_projid = dic->di_projid; 95 buf->bs_projid_lo = dic->di_projid_lo;
96 buf->bs_projid_hi = dic->di_projid_hi;
96 buf->bs_ino = ino; 97 buf->bs_ino = ino;
97 buf->bs_mode = dic->di_mode; 98 buf->bs_mode = dic->di_mode;
98 buf->bs_uid = dic->di_uid; 99 buf->bs_uid = dic->di_uid;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 33f718f92a48..ae6fef1ff563 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
117{
118 if (tic == tic->t_next) {
119 *qp = NULL;
120 } else {
121 *qp = tic->t_next;
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 111
126 tic->t_next = tic->t_prev = NULL; 112 space -= bytes;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 113 if (space < 0) {
114 space += log->l_logsize;
115 cycle--;
116 }
117
118 old = head_val;
119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138 132
139 log->l_grant_reserve_bytes -= bytes; 133 do {
140 if ((log)->l_grant_reserve_bytes < 0) { 134 int tmp;
141 log->l_grant_reserve_bytes += log->l_logsize; 135 int cycle, space;
142 log->l_grant_reserve_cycle--;
143 }
144 136
145} 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
146 138
147static void 139 tmp = log->l_logsize - space;
148xlog_grant_add_space_write(struct log *log, int bytes) 140 if (tmp > bytes)
149{ 141 space += bytes;
150 int tmp = log->l_logsize - log->l_grant_write_bytes; 142 else {
151 if (tmp > bytes) 143 space = bytes - tmp;
152 log->l_grant_write_bytes += bytes; 144 cycle++;
153 else { 145 }
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158
159static void
160xlog_grant_add_space_reserve(struct log *log, int bytes)
161{
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes;
163 if (tmp > bytes)
164 log->l_grant_reserve_bytes += bytes;
165 else {
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
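Both loops above depend on the grant head packing its cycle and byte offset into a single 64-bit value so that atomic64_cmpxchg() can update the pair in one shot, retrying on contention instead of taking the old grant lock. A standalone model using C11 atomics; the high-32/low-32 packing is an assumption read off the crack/assign helper names:

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

#define LOGSIZE 1048576                 /* example log size in bytes */

/* assumed packing: cycle in the high 32 bits, space in the low 32 */
static int64_t assign_head(int cycle, int space)
{
        return ((int64_t)cycle << 32) | (uint32_t)space;
}

static void crack_head(int64_t val, int *cycle, int *space)
{
        *cycle = (int)(val >> 32);
        *space = (int)(val & 0xffffffff);
}

/* lockless "subtract bytes, borrow a cycle on underflow" update loop */
static void grant_sub_space(_Atomic int64_t *head, int bytes)
{
        int64_t head_val = atomic_load(head);
        int64_t expected;

        do {
                int cycle, space;

                crack_head(head_val, &cycle, &space);
                space -= bytes;
                if (space < 0) {        /* wrapped past the start of the log */
                        space += LOGSIZE;
                        cycle--;
                }
                expected = head_val;
                /* on failure, head_val is reloaded and we recompute */
                atomic_compare_exchange_strong(head, &head_val,
                                               assign_head(cycle, space));
        } while (head_val != expected);
}

int main(void)
{
        _Atomic int64_t head = assign_head(5, 100);
        int cycle, space;

        grant_sub_space(&head, 200);    /* 100 - 200 wraps */
        crack_head(atomic_load(&head), &cycle, &space);
        assert(cycle == 4 && space == LOGSIZE - 100);
        return 0;
}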
178static void 153static void
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -402,7 +377,7 @@ xfs_log_mount(
402 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
403 else { 378 else {
404 cmn_err(CE_NOTE, 379 cmn_err(CE_NOTE,
405 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
406 mp->m_fsname); 381 mp->m_fsname);
407 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
408 } 383 }
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 559 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 560 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 561 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 562 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 563 &log->l_icloglock);
589 } else { 564 } else {
590 spin_unlock(&log->l_icloglock); 565 spin_unlock(&log->l_icloglock);
591 } 566 }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 600 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 601 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 602
628 sv_wait(&iclog->ic_force_wait, PMEM, 603 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 604 &log->l_icloglock);
630 } else { 605 } else {
631 spin_unlock(&log->l_icloglock); 606 spin_unlock(&log->l_icloglock);
632 } 607 }
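sv_wait() atomically queued the sleeper and dropped the spinlock; its replacement must keep the same queue-then-unlock ordering, or a wakeup arriving between the unlock and the sleep would be lost. A plausible shape for the xlog_wait() helper used here (an assumption based on the call sites; the helper itself is not part of this hunk):

static inline void
xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
{
        DECLARE_WAITQUEUE(wait, current);

        add_wait_queue_exclusive(wq, &wait);    /* queue before unlocking */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        spin_unlock(lock);                      /* no lost-wakeup window now */
        schedule();
        remove_wait_queue(wq, &wait);
}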
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 678{
704 xlog_ticket_t *tic; 679 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 680 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 681 int need_bytes, free_bytes;
707 682
708 if (XLOG_FORCED_SHUTDOWN(log)) 683 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 684 return;
710 685
711 if (tail_lsn == 0) { 686 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 687 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717 688
718 spin_lock(&log->l_grant_lock); 689 /* tail_lsn == 1 implies that we weren't passed a valid value. */
719 690 if (tail_lsn != 1)
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 691 atomic64_set(&log->l_tail_lsn, tail_lsn);
721 * tail_lsn.
722 */
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 692
727 if ((tic = log->l_write_headq)) { 693 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 694#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 695 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 696 panic("Recovery problem");
731#endif 697#endif
732 cycle = log->l_grant_write_cycle; 698 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 699 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 700 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 701 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 702
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 703 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 704 break;
740 tail_lsn = 0; 705 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 706 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 707 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 708 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 709 }
710 spin_unlock(&log->l_grant_write_lock);
745 } 711 }
746 if ((tic = log->l_reserve_headq)) { 712
713 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 714#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 715 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 716 panic("Recovery problem");
750#endif 717#endif
751 cycle = log->l_grant_reserve_cycle; 718 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 719 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 720 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 721 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 722 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 723 else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 726 break;
761 tail_lsn = 0; 727 tail_lsn = 0;
762 free_bytes -= need_bytes; 728 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 729 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 730 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 731 }
732 spin_unlock(&log->l_grant_reserve_lock);
766 } 733 }
767 spin_unlock(&log->l_grant_lock); 734}
768} /* xfs_log_move_tail */
769 735
770/* 736/*
771 * Determine if we have a transaction that has gone to disk 737 * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 797 * We may be holding the log iclog lock upon entering this routine.
832 */ 798 */
833xfs_lsn_t 799xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 800xlog_assign_tail_lsn(
801 struct xfs_mount *mp)
835{ 802{
836 xfs_lsn_t tail_lsn; 803 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 804 struct log *log = mp->m_log;
838 805
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 806 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 807 if (!tail_lsn)
841 if (tail_lsn != 0) { 808 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 809
810 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 811 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 812}
850
851 813
852/* 814/*
853 * Return the space in the log between the tail and the head. The head 815 * Return the space in the log between the tail and the head. The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 826 * result is that we return the size of the log as the amount of space left.
865 */ 827 */
866STATIC int 828STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 829xlog_space_left(
868{ 830 struct log *log,
869 int free_bytes; 831 atomic64_t *head)
870 int tail_bytes; 832{
871 int tail_cycle; 833 int free_bytes;
872 834 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 835 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 836 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 837 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 838
877 } else if ((tail_cycle + 1) < cycle) { 839 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
840 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
841 tail_bytes = BBTOB(tail_bytes);
842 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
843 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
844 else if (tail_cycle + 1 < head_cycle)
878 return 0; 845 return 0;
879 } else if (tail_cycle < cycle) { 846 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 847 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 848 free_bytes = tail_bytes - head_bytes;
882 } else { 849 } else {
883 /* 850 /*
884 * The reservation head is behind the tail. 851 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
889 "xlog_space_left: head behind tail\n" 856 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 857 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 858 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 859 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 860 ASSERT(0);
894 free_bytes = log->l_logsize; 861 free_bytes = log->l_logsize;
895 } 862 }
896 return free_bytes; 863 return free_bytes;
897} /* xlog_space_left */ 864}
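The three live cases above, worked through in a standalone model with a 1 MB log (the fourth case, head behind tail, is treated as corruption and falls back to the full log size):

#include <stdio.h>

/* bytes of log space free between the tail and the given head */
static int space_left(int logsize, int tail_cycle, int tail_bytes,
                      int head_cycle, int head_bytes)
{
        if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
                return logsize - (head_bytes - tail_bytes); /* same lap */
        if (tail_cycle + 1 < head_cycle)
                return 0;                       /* head lapped the tail */
        return tail_bytes - head_bytes;         /* head exactly one lap ahead */
}

int main(void)
{
        int L = 1 << 20;                        /* 1 MB log */

        /* same cycle: head 60 KB past the tail, so L - 60 KB is free */
        printf("%d\n", space_left(L, 5, 4096, 5, 65536));       /* 987136 */
        /* head one cycle ahead: only the gap back to the tail is free */
        printf("%d\n", space_left(L, 5, 65536, 6, 4096));       /* 61440 */
        return 0;
}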
898 865
899 866
900/* 867/*
@@ -917,19 +884,6 @@ xlog_iodone(xfs_buf_t *bp)
917 l = iclog->ic_log; 884 l = iclog->ic_log;
918 885
919 /* 886 /*
920 * If the _XFS_BARRIER_FAILED flag was set by a lower
921 * layer, it means the underlying device no longer supports
922 * barrier I/O. Warn loudly and turn off barriers.
923 */
924 if (bp->b_flags & _XFS_BARRIER_FAILED) {
925 bp->b_flags &= ~_XFS_BARRIER_FAILED;
926 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
927 xfs_fs_cmn_err(CE_WARN, l->l_mp,
928 "xlog_iodone: Barriers are no longer supported"
929 " by device. Disabling barriers\n");
930 }
931
932 /*
933 * Race to shutdown the filesystem if we see an error. 887 * Race to shutdown the filesystem if we see an error.
934 */ 888 */
935 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, 889 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
@@ -1060,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp,
1060 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1014 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1061 1015
1062 log->l_prev_block = -1; 1016 log->l_prev_block = -1;
1063 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1064 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1017 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1065 log->l_last_sync_lsn = log->l_tail_lsn; 1018 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1019 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1066 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1020 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1067 log->l_grant_reserve_cycle = 1; 1021 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1068 log->l_grant_write_cycle = 1; 1022 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1023 INIT_LIST_HEAD(&log->l_reserveq);
1024 INIT_LIST_HEAD(&log->l_writeq);
1025 spin_lock_init(&log->l_grant_reserve_lock);
1026 spin_lock_init(&log->l_grant_write_lock);
1069 1027
1070 error = EFSCORRUPTED; 1028 error = EFSCORRUPTED;
1071 if (xfs_sb_version_hassector(&mp->m_sb)) { 1029 if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1107,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1107 log->l_xbuf = bp; 1065 log->l_xbuf = bp;
1108 1066
1109 spin_lock_init(&log->l_icloglock); 1067 spin_lock_init(&log->l_icloglock);
1110 spin_lock_init(&log->l_grant_lock); 1068 init_waitqueue_head(&log->l_flush_wait);
1111 sv_init(&log->l_flush_wait, 0, "flush_wait");
1112 1069
1113 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1070 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1114 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1071 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1131,7 +1088,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1131 iclog->ic_prev = prev_iclog; 1088 iclog->ic_prev = prev_iclog;
1132 prev_iclog = iclog; 1089 prev_iclog = iclog;
1133 1090
1134 bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); 1091 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1092 log->l_iclog_size, 0);
1135 if (!bp) 1093 if (!bp)
1136 goto out_free_iclog; 1094 goto out_free_iclog;
1137 if (!XFS_BUF_CPSEMA(bp)) 1095 if (!XFS_BUF_CPSEMA(bp))
@@ -1163,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1163 1121
1164 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1122 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1165 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1123 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1166 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1124 init_waitqueue_head(&iclog->ic_force_wait);
1167 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1125 init_waitqueue_head(&iclog->ic_write_wait);
1168 1126
1169 iclogp = &iclog->ic_next; 1127 iclogp = &iclog->ic_next;
1170 } 1128 }
@@ -1179,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1179out_free_iclog: 1137out_free_iclog:
1180 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1138 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1181 prev_iclog = iclog->ic_next; 1139 prev_iclog = iclog->ic_next;
1182 if (iclog->ic_bp) { 1140 if (iclog->ic_bp)
1183 sv_destroy(&iclog->ic_force_wait);
1184 sv_destroy(&iclog->ic_write_wait);
1185 xfs_buf_free(iclog->ic_bp); 1141 xfs_buf_free(iclog->ic_bp);
1186 }
1187 kmem_free(iclog); 1142 kmem_free(iclog);
1188 } 1143 }
1189 spinlock_destroy(&log->l_icloglock); 1144 spinlock_destroy(&log->l_icloglock);
1190 spinlock_destroy(&log->l_grant_lock);
1191 xfs_buf_free(log->l_xbuf); 1145 xfs_buf_free(log->l_xbuf);
1192out_free_log: 1146out_free_log:
1193 kmem_free(log); 1147 kmem_free(log);
@@ -1235,61 +1189,60 @@ xlog_commit_record(
1235 * water mark. In this manner, we would be creating a low water mark. 1189 * water mark. In this manner, we would be creating a low water mark.
1236 */ 1190 */
1237STATIC void 1191STATIC void
1238xlog_grant_push_ail(xfs_mount_t *mp, 1192xlog_grant_push_ail(
1239 int need_bytes) 1193 struct log *log,
1194 int need_bytes)
1240{ 1195{
1241 xlog_t *log = mp->m_log; /* pointer to the log */ 1196 xfs_lsn_t threshold_lsn = 0;
1242 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1197 xfs_lsn_t last_sync_lsn;
1243 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1198 int free_blocks;
1244 int free_blocks; /* free blocks left to write to */ 1199 int free_bytes;
1245 int free_bytes; /* free bytes left to write to */ 1200 int threshold_block;
1246 int threshold_block; /* block in lsn we'd like to be at */ 1201 int threshold_cycle;
1247 int threshold_cycle; /* lsn cycle we'd like to be at */ 1202 int free_threshold;
1248 int free_threshold; 1203
1249 1204 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1250 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1205
1251 1206 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1252 spin_lock(&log->l_grant_lock); 1207 free_blocks = BTOBBT(free_bytes);
1253 free_bytes = xlog_space_left(log, 1208
1254 log->l_grant_reserve_cycle, 1209 /*
1255 log->l_grant_reserve_bytes); 1210 * Set the threshold for the minimum number of free blocks in the
1256 tail_lsn = log->l_tail_lsn; 1211 * log to the maximum of what the caller needs, one quarter of the
1257 free_blocks = BTOBBT(free_bytes); 1212 * log, and 256 blocks.
1258 1213 */
1259 /* 1214 free_threshold = BTOBB(need_bytes);
1260 * Set the threshold for the minimum number of free blocks in the 1215 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1261 * log to the maximum of what the caller needs, one quarter of the 1216 free_threshold = MAX(free_threshold, 256);
1262 * log, and 256 blocks. 1217 if (free_blocks >= free_threshold)
1263 */ 1218 return;
1264 free_threshold = BTOBB(need_bytes); 1219
1265 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1220 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1266 free_threshold = MAX(free_threshold, 256); 1221 &threshold_block);
1267 if (free_blocks < free_threshold) { 1222 threshold_block += free_threshold;
1268 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1269 threshold_cycle = CYCLE_LSN(tail_lsn);
1270 if (threshold_block >= log->l_logBBsize) { 1223 if (threshold_block >= log->l_logBBsize) {
1271 threshold_block -= log->l_logBBsize; 1224 threshold_block -= log->l_logBBsize;
1272 threshold_cycle += 1; 1225 threshold_cycle += 1;
1273 } 1226 }
1274 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1227 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1228 threshold_block);
1229 /*
1230 * Don't pass in an lsn greater than the lsn of the last
1231 * log record known to be on disk. Use a snapshot of the last sync lsn
1232 * so that it doesn't change between the compare and the set.
1233 */
1234 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1235 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1236 threshold_lsn = last_sync_lsn;
1275 1237
1276 /* Don't pass in an lsn greater than the lsn of the last 1238 /*
1277 * log record known to be on disk. 1239 * Get the transaction layer to kick the dirty buffers out to
1240 * disk asynchronously. No point in trying to do this if
1241 * the filesystem is shutting down.
1278 */ 1242 */
1279 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1243 if (!XLOG_FORCED_SHUTDOWN(log))
1280 threshold_lsn = log->l_last_sync_lsn; 1244 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1281 } 1245}
1282 spin_unlock(&log->l_grant_lock);
1283
1284 /*
1285 * Get the transaction layer to kick the dirty buffers out to
1286 * disk asynchronously. No point in trying to do this if
1287 * the filesystem is shutting down.
1288 */
1289 if (threshold_lsn &&
1290 !XLOG_FORCED_SHUTDOWN(log))
1291 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1292} /* xlog_grant_push_ail */
1293 1246
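The wrap-around arithmetic in xlog_grant_push_ail() is easy to misread in diff form. Below is a minimal standalone sketch of the same computation; the function, values, and printf output are illustrative only and are not part of the patch.

#include <stdio.h>

/*
 * Standalone sketch of xlog_grant_push_ail()'s threshold arithmetic.
 * The log is a circular buffer of log_size blocks; pushing the tail
 * block forward by free_threshold may wrap past the physical end, in
 * which case the block index wraps and the cycle count advances.
 */
static void push_target(int log_size, int tail_cycle, int tail_block,
			int free_threshold)
{
	int threshold_cycle = tail_cycle;
	int threshold_block = tail_block + free_threshold;

	if (threshold_block >= log_size) {
		threshold_block -= log_size;	/* wrapped past the end */
		threshold_cycle += 1;		/* so we are one cycle on */
	}
	printf("push AIL to cycle %d, block %d\n",
	       threshold_cycle, threshold_block);
}

int main(void)
{
	push_target(1000, 5, 900, 250);	/* wraps: cycle 6, block 150 */
	return 0;
}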
1294/* 1247/*
1295 * The bdstrat callback function for log bufs. This gives us a central 1248 * The bdstrat callback function for log bufs. This gives us a central
@@ -1309,7 +1262,7 @@ xlog_bdstrat(
1309 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1262 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1310 XFS_BUF_ERROR(bp, EIO); 1263 XFS_BUF_ERROR(bp, EIO);
1311 XFS_BUF_STALE(bp); 1264 XFS_BUF_STALE(bp);
1312 xfs_biodone(bp); 1265 xfs_buf_ioend(bp, 0);
1313 /* 1266 /*
1314 * It would seem logical to return EIO here, but we rely on 1267 * It would seem logical to return EIO here, but we rely on
1315 * the log state machine to propagate I/O errors instead of 1268 * the log state machine to propagate I/O errors instead of
@@ -1384,9 +1337,8 @@ xlog_sync(xlog_t *log,
1384 roundoff < BBTOB(1))); 1337 roundoff < BBTOB(1)));
1385 1338
1386 /* move grant heads by roundoff in sync */ 1339 /* move grant heads by roundoff in sync */
1387 spin_lock(&log->l_grant_lock); 1340 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1388 xlog_grant_add_space(log, roundoff); 1341 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1389 spin_unlock(&log->l_grant_lock);
1390 1342
1391 /* put cycle number in every block */ 1343 /* put cycle number in every block */
1392 xlog_pack_data(log, iclog, roundoff); 1344 xlog_pack_data(log, iclog, roundoff);
@@ -1501,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
1501 1453
1502 iclog = log->l_iclog; 1454 iclog = log->l_iclog;
1503 for (i=0; i<log->l_iclog_bufs; i++) { 1455 for (i=0; i<log->l_iclog_bufs; i++) {
1504 sv_destroy(&iclog->ic_force_wait);
1505 sv_destroy(&iclog->ic_write_wait);
1506 xfs_buf_free(iclog->ic_bp); 1456 xfs_buf_free(iclog->ic_bp);
1507 next_iclog = iclog->ic_next; 1457 next_iclog = iclog->ic_next;
1508 kmem_free(iclog); 1458 kmem_free(iclog);
1509 iclog = next_iclog; 1459 iclog = next_iclog;
1510 } 1460 }
1511 spinlock_destroy(&log->l_icloglock); 1461 spinlock_destroy(&log->l_icloglock);
1512 spinlock_destroy(&log->l_grant_lock);
1513 1462
1514 xfs_buf_free(log->l_xbuf); 1463 xfs_buf_free(log->l_xbuf);
1515 log->l_mp->m_log = NULL; 1464 log->l_mp->m_log = NULL;
@@ -2244,7 +2193,7 @@ xlog_state_do_callback(
2244 lowest_lsn = xlog_get_lowest_lsn(log); 2193 lowest_lsn = xlog_get_lowest_lsn(log);
2245 if (lowest_lsn && 2194 if (lowest_lsn &&
2246 XFS_LSN_CMP(lowest_lsn, 2195 XFS_LSN_CMP(lowest_lsn,
2247 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2196 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2248 iclog = iclog->ic_next; 2197 iclog = iclog->ic_next;
2249 continue; /* Leave this iclog for 2198 continue; /* Leave this iclog for
2250 * another thread */ 2199 * another thread */
@@ -2252,23 +2201,21 @@ xlog_state_do_callback(
2252 2201
2253 iclog->ic_state = XLOG_STATE_CALLBACK; 2202 iclog->ic_state = XLOG_STATE_CALLBACK;
2254 2203
2255 spin_unlock(&log->l_icloglock);
2256 2204
2257 /* l_last_sync_lsn field protected by 2205 /*
2258 * l_grant_lock. Don't worry about iclog's lsn. 2206 * update the last_sync_lsn before we drop the
2259 * No one else can be here except us. 2207 * icloglock to ensure we are the only one that
2208 * can update it.
2260 */ 2209 */
2261 spin_lock(&log->l_grant_lock); 2210 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2262 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2211 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2263 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2212 atomic64_set(&log->l_last_sync_lsn,
2264 log->l_last_sync_lsn = 2213 be64_to_cpu(iclog->ic_header.h_lsn));
2265 be64_to_cpu(iclog->ic_header.h_lsn);
2266 spin_unlock(&log->l_grant_lock);
2267 2214
2268 } else { 2215 } else
2269 spin_unlock(&log->l_icloglock);
2270 ioerrors++; 2216 ioerrors++;
2271 } 2217
2218 spin_unlock(&log->l_icloglock);
2272 2219
2273 /* 2220 /*
2274 * Keep processing entries in the callback list until 2221 * Keep processing entries in the callback list until
@@ -2309,7 +2256,7 @@ xlog_state_do_callback(
2309 xlog_state_clean_log(log); 2256 xlog_state_clean_log(log);
2310 2257
2311 /* wake up threads waiting in xfs_log_force() */ 2258 /* wake up threads waiting in xfs_log_force() */
2312 sv_broadcast(&iclog->ic_force_wait); 2259 wake_up_all(&iclog->ic_force_wait);
2313 2260
2314 iclog = iclog->ic_next; 2261 iclog = iclog->ic_next;
2315 } while (first_iclog != iclog); 2262 } while (first_iclog != iclog);
@@ -2356,7 +2303,7 @@ xlog_state_do_callback(
2356 spin_unlock(&log->l_icloglock); 2303 spin_unlock(&log->l_icloglock);
2357 2304
2358 if (wake) 2305 if (wake)
2359 sv_broadcast(&log->l_flush_wait); 2306 wake_up_all(&log->l_flush_wait);
2360} 2307}
2361 2308
2362 2309
@@ -2407,7 +2354,7 @@ xlog_state_done_syncing(
2407 * iclog buffer, we wake them all, one will get to do the 2354 * iclog buffer, we wake them all, one will get to do the
2408 * I/O, the others get to wait for the result. 2355 * I/O, the others get to wait for the result.
2409 */ 2356 */
2410 sv_broadcast(&iclog->ic_write_wait); 2357 wake_up_all(&iclog->ic_write_wait);
2411 spin_unlock(&log->l_icloglock); 2358 spin_unlock(&log->l_icloglock);
2412 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2359 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2413} /* xlog_state_done_syncing */ 2360} /* xlog_state_done_syncing */
@@ -2456,7 +2403,7 @@ restart:
2456 XFS_STATS_INC(xs_log_noiclogs); 2403 XFS_STATS_INC(xs_log_noiclogs);
2457 2404
2458 /* Wait for log writes to have flushed */ 2405 /* Wait for log writes to have flushed */
2459 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2406 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2460 goto restart; 2407 goto restart;
2461 } 2408 }
2462 2409
@@ -2539,6 +2486,18 @@ restart:
2539 * 2486 *
2540 * Once a ticket gets put onto the reserveq, it will only return after 2487 * Once a ticket gets put onto the reserveq, it will only return after
2541 * the needed reservation is satisfied. 2488 * the needed reservation is satisfied.
2489 *
2490 * This function is structured so that it has a lock free fast path. This is
2491 * necessary because every new transaction reservation will come through this
2492 * path. Hence any lock will be globally hot if we take it unconditionally on
2493 * every pass.
2494 *
2495 * As tickets are only ever moved on and off the reserveq under the
2496 * l_grant_reserve_lock, we only need to take that lock if we are going
2497 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2498 * ticket was never added to the reserveq because the t_queue list head will be
2499 * empty and we hold the only reference to it so it can safely be checked
2500 * unlocked.
2542 */ 2501 */
2543STATIC int 2502STATIC int
2544xlog_grant_log_space(xlog_t *log, 2503xlog_grant_log_space(xlog_t *log,
@@ -2546,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log,
2546{ 2505{
2547 int free_bytes; 2506 int free_bytes;
2548 int need_bytes; 2507 int need_bytes;
2549#ifdef DEBUG
2550 xfs_lsn_t tail_lsn;
2551#endif
2552
2553 2508
2554#ifdef DEBUG 2509#ifdef DEBUG
2555 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2510 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2556 panic("grant Recovery problem"); 2511 panic("grant Recovery problem");
2557#endif 2512#endif
2558 2513
2559 /* Is there space or do we need to sleep? */
2560 spin_lock(&log->l_grant_lock);
2561
2562 trace_xfs_log_grant_enter(log, tic); 2514 trace_xfs_log_grant_enter(log, tic);
2563 2515
2516 need_bytes = tic->t_unit_res;
2517 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2518 need_bytes *= tic->t_ocnt;
2519
2564 /* something is already sleeping; insert new transaction at end */ 2520 /* something is already sleeping; insert new transaction at end */
2565 if (log->l_reserve_headq) { 2521 if (!list_empty_careful(&log->l_reserveq)) {
2566 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2522 spin_lock(&log->l_grant_reserve_lock);
2523 /* recheck the queue now we are locked */
2524 if (list_empty(&log->l_reserveq)) {
2525 spin_unlock(&log->l_grant_reserve_lock);
2526 goto redo;
2527 }
2528 list_add_tail(&tic->t_queue, &log->l_reserveq);
2567 2529
2568 trace_xfs_log_grant_sleep1(log, tic); 2530 trace_xfs_log_grant_sleep1(log, tic);
2569 2531
@@ -2575,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log,
2575 goto error_return; 2537 goto error_return;
2576 2538
2577 XFS_STATS_INC(xs_sleep_logspace); 2539 XFS_STATS_INC(xs_sleep_logspace);
2578 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2540 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2541
2579 /* 2542 /*
2580 * If we got an error, and the filesystem is shutting down, 2543 * If we got an error, and the filesystem is shutting down,
2581 * we'll catch it down below. So just continue... 2544 * we'll catch it down below. So just continue...
2582 */ 2545 */
2583 trace_xfs_log_grant_wake1(log, tic); 2546 trace_xfs_log_grant_wake1(log, tic);
2584 spin_lock(&log->l_grant_lock);
2585 } 2547 }
2586 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2587 need_bytes = tic->t_unit_res*tic->t_ocnt;
2588 else
2589 need_bytes = tic->t_unit_res;
2590 2548
2591redo: 2549redo:
2592 if (XLOG_FORCED_SHUTDOWN(log)) 2550 if (XLOG_FORCED_SHUTDOWN(log))
2593 goto error_return; 2551 goto error_return_unlocked;
2594 2552
2595 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2553 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2596 log->l_grant_reserve_bytes);
2597 if (free_bytes < need_bytes) { 2554 if (free_bytes < need_bytes) {
2598 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2555 spin_lock(&log->l_grant_reserve_lock);
2599 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2556 if (list_empty(&tic->t_queue))
2557 list_add_tail(&tic->t_queue, &log->l_reserveq);
2600 2558
2601 trace_xfs_log_grant_sleep2(log, tic); 2559 trace_xfs_log_grant_sleep2(log, tic);
2602 2560
2603 spin_unlock(&log->l_grant_lock);
2604 xlog_grant_push_ail(log->l_mp, need_bytes);
2605 spin_lock(&log->l_grant_lock);
2606
2607 XFS_STATS_INC(xs_sleep_logspace);
2608 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2609
2610 spin_lock(&log->l_grant_lock);
2611 if (XLOG_FORCED_SHUTDOWN(log)) 2561 if (XLOG_FORCED_SHUTDOWN(log))
2612 goto error_return; 2562 goto error_return;
2613 2563
2614 trace_xfs_log_grant_wake2(log, tic); 2564 xlog_grant_push_ail(log, need_bytes);
2615 2565
2566 XFS_STATS_INC(xs_sleep_logspace);
2567 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2568
2569 trace_xfs_log_grant_wake2(log, tic);
2616 goto redo; 2570 goto redo;
2617 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2571 }
2618 xlog_del_ticketq(&log->l_reserve_headq, tic);
2619 2572
2620 /* we've got enough space */ 2573 if (!list_empty(&tic->t_queue)) {
2621 xlog_grant_add_space(log, need_bytes); 2574 spin_lock(&log->l_grant_reserve_lock);
2622#ifdef DEBUG 2575 list_del_init(&tic->t_queue);
2623 tail_lsn = log->l_tail_lsn; 2576 spin_unlock(&log->l_grant_reserve_lock);
2624 /*
2626 * Check to make sure the grant write head didn't just overlap the
2626 * tail. If the cycles are the same, we can't be overlapping.
2627 * Otherwise, make sure that the cycles differ by exactly one and
2628 * check the byte count.
2629 */
2630 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2631 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2632 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2633 } 2577 }
2634#endif 2578
2579 /* we've got enough space */
2580 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2581 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2635 trace_xfs_log_grant_exit(log, tic); 2582 trace_xfs_log_grant_exit(log, tic);
2636 xlog_verify_grant_head(log, 1); 2583 xlog_verify_grant_tail(log);
2637 spin_unlock(&log->l_grant_lock);
2638 return 0; 2584 return 0;
2639 2585
2640 error_return: 2586error_return_unlocked:
2641 if (tic->t_flags & XLOG_TIC_IN_Q) 2587 spin_lock(&log->l_grant_reserve_lock);
2642 xlog_del_ticketq(&log->l_reserve_headq, tic); 2588error_return:
2643 2589 list_del_init(&tic->t_queue);
2590 spin_unlock(&log->l_grant_reserve_lock);
2644 trace_xfs_log_grant_error(log, tic); 2591 trace_xfs_log_grant_error(log, tic);
2645 2592
2646 /* 2593 /*
@@ -2650,7 +2597,6 @@ redo:
2650 */ 2597 */
2651 tic->t_curr_res = 0; 2598 tic->t_curr_res = 0;
2652 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2599 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2653 spin_unlock(&log->l_grant_lock);
2654 return XFS_ERROR(EIO); 2600 return XFS_ERROR(EIO);
2655} /* xlog_grant_log_space */ 2601} /* xlog_grant_log_space */
2656 2602
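The fast-path structure described in the comment block before xlog_grant_log_space() is the core of this hunk. The fragment below condenses it, assuming the same helpers the patch introduces (xlog_space_left(), xlog_wait(), xlog_grant_add_space()); it drops the tracing, shutdown checks, and FIFO ordering, so treat it as a sketch of the locking pattern, not the committed function.

static int reserve_space_sketch(struct log *log, struct xlog_ticket *tic,
				int need_bytes)
{
	for (;;) {
		/* lock-free fast path: enough space, no queue traffic */
		if (xlog_space_left(log, &log->l_grant_reserve_head) >=
		    need_bytes)
			break;

		spin_lock(&log->l_grant_reserve_lock);
		/* recheck under the lock so a racing wakeup isn't missed */
		if (xlog_space_left(log, &log->l_grant_reserve_head) >=
		    need_bytes) {
			spin_unlock(&log->l_grant_reserve_lock);
			break;
		}
		if (list_empty(&tic->t_queue))
			list_add_tail(&tic->t_queue, &log->l_reserveq);

		/* xlog_wait() drops the lock once we are safely queued */
		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
	}

	/* only the ticket owner dequeues itself, so this test is safe */
	if (!list_empty(&tic->t_queue)) {
		spin_lock(&log->l_grant_reserve_lock);
		list_del_init(&tic->t_queue);
		spin_unlock(&log->l_grant_reserve_lock);
	}

	xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
	return 0;
}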
@@ -2658,17 +2604,14 @@ redo:
2658/* 2604/*
2659 * Replenish the byte reservation required by moving the grant write head. 2605 * Replenish the byte reservation required by moving the grant write head.
2660 * 2606 *
2661 * 2607 * Similar to xlog_grant_log_space, the function is structured to have a lock
2608 * free fast path.
2662 */ 2609 */
2663STATIC int 2610STATIC int
2664xlog_regrant_write_log_space(xlog_t *log, 2611xlog_regrant_write_log_space(xlog_t *log,
2665 xlog_ticket_t *tic) 2612 xlog_ticket_t *tic)
2666{ 2613{
2667 int free_bytes, need_bytes; 2614 int free_bytes, need_bytes;
2668 xlog_ticket_t *ntic;
2669#ifdef DEBUG
2670 xfs_lsn_t tail_lsn;
2671#endif
2672 2615
2673 tic->t_curr_res = tic->t_unit_res; 2616 tic->t_curr_res = tic->t_unit_res;
2674 xlog_tic_reset_res(tic); 2617 xlog_tic_reset_res(tic);
@@ -2681,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2681 panic("regrant Recovery problem"); 2624 panic("regrant Recovery problem");
2682#endif 2625#endif
2683 2626
2684 spin_lock(&log->l_grant_lock);
2685
2686 trace_xfs_log_regrant_write_enter(log, tic); 2627 trace_xfs_log_regrant_write_enter(log, tic);
2687
2688 if (XLOG_FORCED_SHUTDOWN(log)) 2628 if (XLOG_FORCED_SHUTDOWN(log))
2689 goto error_return; 2629 goto error_return_unlocked;
2690 2630
2691 /* If there are other waiters on the queue then give them a 2631 /* If there are other waiters on the queue then give them a
2692 * chance at logspace before us. Wake up the first waiters, 2632 * chance at logspace before us. Wake up the first waiters,
@@ -2695,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2695 * this transaction. 2635 * this transaction.
2696 */ 2636 */
2697 need_bytes = tic->t_unit_res; 2637 need_bytes = tic->t_unit_res;
2698 if ((ntic = log->l_write_headq)) { 2638 if (!list_empty_careful(&log->l_writeq)) {
2699 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2639 struct xlog_ticket *ntic;
2700 log->l_grant_write_bytes); 2640
2701 do { 2641 spin_lock(&log->l_grant_write_lock);
2642 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2643 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2702 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2644 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2703 2645
2704 if (free_bytes < ntic->t_unit_res) 2646 if (free_bytes < ntic->t_unit_res)
2705 break; 2647 break;
2706 free_bytes -= ntic->t_unit_res; 2648 free_bytes -= ntic->t_unit_res;
2707 sv_signal(&ntic->t_wait); 2649 wake_up(&ntic->t_wait);
2708 ntic = ntic->t_next; 2650 }
2709 } while (ntic != log->l_write_headq);
2710
2711 if (ntic != log->l_write_headq) {
2712 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2713 xlog_ins_ticketq(&log->l_write_headq, tic);
2714 2651
2652 if (ntic != list_first_entry(&log->l_writeq,
2653 struct xlog_ticket, t_queue)) {
2654 if (list_empty(&tic->t_queue))
2655 list_add_tail(&tic->t_queue, &log->l_writeq);
2715 trace_xfs_log_regrant_write_sleep1(log, tic); 2656 trace_xfs_log_regrant_write_sleep1(log, tic);
2716 2657
2717 spin_unlock(&log->l_grant_lock); 2658 xlog_grant_push_ail(log, need_bytes);
2718 xlog_grant_push_ail(log->l_mp, need_bytes);
2719 spin_lock(&log->l_grant_lock);
2720 2659
2721 XFS_STATS_INC(xs_sleep_logspace); 2660 XFS_STATS_INC(xs_sleep_logspace);
2722 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2661 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2723 &log->l_grant_lock, s);
2724
2725 /* If we're shutting down, this tic is already
2726 * off the queue */
2727 spin_lock(&log->l_grant_lock);
2728 if (XLOG_FORCED_SHUTDOWN(log))
2729 goto error_return;
2730
2731 trace_xfs_log_regrant_write_wake1(log, tic); 2662 trace_xfs_log_regrant_write_wake1(log, tic);
2732 } 2663 } else
2664 spin_unlock(&log->l_grant_write_lock);
2733 } 2665 }
2734 2666
2735redo: 2667redo:
2736 if (XLOG_FORCED_SHUTDOWN(log)) 2668 if (XLOG_FORCED_SHUTDOWN(log))
2737 goto error_return; 2669 goto error_return_unlocked;
2738 2670
2739 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2671 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2740 log->l_grant_write_bytes);
2741 if (free_bytes < need_bytes) { 2672 if (free_bytes < need_bytes) {
2742 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2673 spin_lock(&log->l_grant_write_lock);
2743 xlog_ins_ticketq(&log->l_write_headq, tic); 2674 if (list_empty(&tic->t_queue))
2744 spin_unlock(&log->l_grant_lock); 2675 list_add_tail(&tic->t_queue, &log->l_writeq);
2745 xlog_grant_push_ail(log->l_mp, need_bytes);
2746 spin_lock(&log->l_grant_lock);
2747
2748 XFS_STATS_INC(xs_sleep_logspace);
2749 trace_xfs_log_regrant_write_sleep2(log, tic);
2750
2751 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2752 2676
2753 /* If we're shutting down, this tic is already off the queue */
2754 spin_lock(&log->l_grant_lock);
2755 if (XLOG_FORCED_SHUTDOWN(log)) 2677 if (XLOG_FORCED_SHUTDOWN(log))
2756 goto error_return; 2678 goto error_return;
2757 2679
2680 xlog_grant_push_ail(log, need_bytes);
2681
2682 XFS_STATS_INC(xs_sleep_logspace);
2683 trace_xfs_log_regrant_write_sleep2(log, tic);
2684 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2685
2758 trace_xfs_log_regrant_write_wake2(log, tic); 2686 trace_xfs_log_regrant_write_wake2(log, tic);
2759 goto redo; 2687 goto redo;
2760 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2688 }
2761 xlog_del_ticketq(&log->l_write_headq, tic);
2762 2689
2763 /* we've got enough space */ 2690 if (!list_empty(&tic->t_queue)) {
2764 xlog_grant_add_space_write(log, need_bytes); 2691 spin_lock(&log->l_grant_write_lock);
2765#ifdef DEBUG 2692 list_del_init(&tic->t_queue);
2766 tail_lsn = log->l_tail_lsn; 2693 spin_unlock(&log->l_grant_write_lock);
2767 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2768 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2769 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2770 } 2694 }
2771#endif
2772 2695
2696 /* we've got enough space */
2697 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2773 trace_xfs_log_regrant_write_exit(log, tic); 2698 trace_xfs_log_regrant_write_exit(log, tic);
2774 2699 xlog_verify_grant_tail(log);
2775 xlog_verify_grant_head(log, 1);
2776 spin_unlock(&log->l_grant_lock);
2777 return 0; 2700 return 0;
2778 2701
2779 2702
2703 error_return_unlocked:
2704 spin_lock(&log->l_grant_write_lock);
2780 error_return: 2705 error_return:
2781 if (tic->t_flags & XLOG_TIC_IN_Q) 2706 list_del_init(&tic->t_queue);
2782 xlog_del_ticketq(&log->l_reserve_headq, tic); 2707 spin_unlock(&log->l_grant_write_lock);
2783
2784 trace_xfs_log_regrant_write_error(log, tic); 2708 trace_xfs_log_regrant_write_error(log, tic);
2785 2709
2786 /* 2710 /*
@@ -2790,7 +2714,6 @@ redo:
2790 */ 2714 */
2791 tic->t_curr_res = 0; 2715 tic->t_curr_res = 0;
2792 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2716 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2793 spin_unlock(&log->l_grant_lock);
2794 return XFS_ERROR(EIO); 2717 return XFS_ERROR(EIO);
2795} /* xlog_regrant_write_log_space */ 2718} /* xlog_regrant_write_log_space */
2796 2719
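The wake-in-order loop near the top of xlog_regrant_write_log_space() is worth isolating: waiters are woken head-first, and only while the remaining space still covers each waiter's unit reservation, which keeps grants FIFO. A condensed sketch using the patch's own names:

static void wake_fitting_waiters(struct log *log)
{
	struct xlog_ticket	*ntic;
	int			free_bytes;

	spin_lock(&log->l_grant_write_lock);
	free_bytes = xlog_space_left(log, &log->l_grant_write_head);
	list_for_each_entry(ntic, &log->l_writeq, t_queue) {
		if (free_bytes < ntic->t_unit_res)
			break;		/* head waiter no longer fits: stop */
		free_bytes -= ntic->t_unit_res;
		wake_up(&ntic->t_wait);
	}
	spin_unlock(&log->l_grant_write_lock);
}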
@@ -2811,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2811 if (ticket->t_cnt > 0) 2734 if (ticket->t_cnt > 0)
2812 ticket->t_cnt--; 2735 ticket->t_cnt--;
2813 2736
2814 spin_lock(&log->l_grant_lock); 2737 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2815 xlog_grant_sub_space(log, ticket->t_curr_res); 2738 ticket->t_curr_res);
2739 xlog_grant_sub_space(log, &log->l_grant_write_head,
2740 ticket->t_curr_res);
2816 ticket->t_curr_res = ticket->t_unit_res; 2741 ticket->t_curr_res = ticket->t_unit_res;
2817 xlog_tic_reset_res(ticket); 2742 xlog_tic_reset_res(ticket);
2818 2743
2819 trace_xfs_log_regrant_reserve_sub(log, ticket); 2744 trace_xfs_log_regrant_reserve_sub(log, ticket);
2820 2745
2821 xlog_verify_grant_head(log, 1);
2822
2823 /* just return if we still have some of the pre-reserved space */ 2746 /* just return if we still have some of the pre-reserved space */
2824 if (ticket->t_cnt > 0) { 2747 if (ticket->t_cnt > 0)
2825 spin_unlock(&log->l_grant_lock);
2826 return; 2748 return;
2827 }
2828 2749
2829 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2750 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2751 ticket->t_unit_res);
2830 2752
2831 trace_xfs_log_regrant_reserve_exit(log, ticket); 2753 trace_xfs_log_regrant_reserve_exit(log, ticket);
2832 2754
2833 xlog_verify_grant_head(log, 0);
2834 spin_unlock(&log->l_grant_lock);
2835 ticket->t_curr_res = ticket->t_unit_res; 2755 ticket->t_curr_res = ticket->t_unit_res;
2836 xlog_tic_reset_res(ticket); 2756 xlog_tic_reset_res(ticket);
2837} /* xlog_regrant_reserve_log_space */ 2757} /* xlog_regrant_reserve_log_space */
@@ -2855,28 +2775,29 @@ STATIC void
2855xlog_ungrant_log_space(xlog_t *log, 2775xlog_ungrant_log_space(xlog_t *log,
2856 xlog_ticket_t *ticket) 2776 xlog_ticket_t *ticket)
2857{ 2777{
2778 int bytes;
2779
2858 if (ticket->t_cnt > 0) 2780 if (ticket->t_cnt > 0)
2859 ticket->t_cnt--; 2781 ticket->t_cnt--;
2860 2782
2861 spin_lock(&log->l_grant_lock);
2862 trace_xfs_log_ungrant_enter(log, ticket); 2783 trace_xfs_log_ungrant_enter(log, ticket);
2863
2864 xlog_grant_sub_space(log, ticket->t_curr_res);
2865
2866 trace_xfs_log_ungrant_sub(log, ticket); 2784 trace_xfs_log_ungrant_sub(log, ticket);
2867 2785
2868 /* If this is a permanent reservation ticket, we may be able to free 2786 /*
2787 * If this is a permanent reservation ticket, we may be able to free
2869 * up more space based on the remaining count. 2788 * up more space based on the remaining count.
2870 */ 2789 */
2790 bytes = ticket->t_curr_res;
2871 if (ticket->t_cnt > 0) { 2791 if (ticket->t_cnt > 0) {
2872 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2792 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2873 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2793 bytes += ticket->t_unit_res*ticket->t_cnt;
2874 } 2794 }
2875 2795
2796 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2797 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2798
2876 trace_xfs_log_ungrant_exit(log, ticket); 2799 trace_xfs_log_ungrant_exit(log, ticket);
2877 2800
2878 xlog_verify_grant_head(log, 1);
2879 spin_unlock(&log->l_grant_lock);
2880 xfs_log_move_tail(log->l_mp, 1); 2801 xfs_log_move_tail(log->l_mp, 1);
2881} /* xlog_ungrant_log_space */ 2802} /* xlog_ungrant_log_space */
2882 2803
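xlog_grant_add_space() and xlog_grant_sub_space() now take an atomic64_t head, but their new bodies fall outside the hunks shown here. Given the crack/assign helpers added to xfs_log_priv.h below, a plausible lock-free shape is a compare-and-exchange retry loop over the packed cycle/space value. The following is an assumption-labelled sketch, not the committed code.

static void grant_head_add_sketch(struct log *log, atomic64_t *head,
				  int bytes)
{
	int64_t	head_val = atomic64_read(head);
	int64_t	old, new;

	do {
		int	cycle, space;

		xlog_crack_grant_head_val(head_val, &cycle, &space);

		space += bytes;
		if (space > BBTOB(log->l_logBBsize)) {
			space -= BBTOB(log->l_logBBsize);
			cycle++;	/* wrapped the physical log */
		}

		old = head_val;
		new = xlog_assign_grant_head_val(cycle, space);

		/* retry if another CPU changed the head underneath us */
		head_val = atomic64_cmpxchg(head, old, new);
	} while (head_val != old);
}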
@@ -2913,11 +2834,11 @@ xlog_state_release_iclog(
2913 2834
2914 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2835 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2915 /* update tail before writing to iclog */ 2836 /* update tail before writing to iclog */
2916 xlog_assign_tail_lsn(log->l_mp); 2837 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2917 sync++; 2838 sync++;
2918 iclog->ic_state = XLOG_STATE_SYNCING; 2839 iclog->ic_state = XLOG_STATE_SYNCING;
2919 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2840 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2920 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2841 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2921 /* cycle incremented when incrementing curr_block */ 2842 /* cycle incremented when incrementing curr_block */
2922 } 2843 }
2923 spin_unlock(&log->l_icloglock); 2844 spin_unlock(&log->l_icloglock);
@@ -3100,7 +3021,7 @@ maybe_sleep:
3100 return XFS_ERROR(EIO); 3021 return XFS_ERROR(EIO);
3101 } 3022 }
3102 XFS_STATS_INC(xs_log_force_sleep); 3023 XFS_STATS_INC(xs_log_force_sleep);
3103 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3024 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3104 /* 3025 /*
3105 * No need to grab the log lock here since we're 3026 * No need to grab the log lock here since we're
3106 * only deciding whether or not to return EIO 3027 * only deciding whether or not to return EIO
@@ -3218,8 +3139,8 @@ try_again:
3218 3139
3219 XFS_STATS_INC(xs_log_force_sleep); 3140 XFS_STATS_INC(xs_log_force_sleep);
3220 3141
3221 sv_wait(&iclog->ic_prev->ic_write_wait, 3142 xlog_wait(&iclog->ic_prev->ic_write_wait,
3222 PSWP, &log->l_icloglock, s); 3143 &log->l_icloglock);
3223 if (log_flushed) 3144 if (log_flushed)
3224 *log_flushed = 1; 3145 *log_flushed = 1;
3225 already_slept = 1; 3146 already_slept = 1;
@@ -3247,7 +3168,7 @@ try_again:
3247 return XFS_ERROR(EIO); 3168 return XFS_ERROR(EIO);
3248 } 3169 }
3249 XFS_STATS_INC(xs_log_force_sleep); 3170 XFS_STATS_INC(xs_log_force_sleep);
3250 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3171 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3251 /* 3172 /*
3252 * No need to grab the log lock here since we're 3173 * No need to grab the log lock here since we're
3253 * only deciding whether or not to return EIO 3174 * only deciding whether or not to return EIO
@@ -3322,10 +3243,8 @@ xfs_log_ticket_put(
3322 xlog_ticket_t *ticket) 3243 xlog_ticket_t *ticket)
3323{ 3244{
3324 ASSERT(atomic_read(&ticket->t_ref) > 0); 3245 ASSERT(atomic_read(&ticket->t_ref) > 0);
3325 if (atomic_dec_and_test(&ticket->t_ref)) { 3246 if (atomic_dec_and_test(&ticket->t_ref))
3326 sv_destroy(&ticket->t_wait);
3327 kmem_zone_free(xfs_log_ticket_zone, ticket); 3247 kmem_zone_free(xfs_log_ticket_zone, ticket);
3328 }
3329} 3248}
3330 3249
3331xlog_ticket_t * 3250xlog_ticket_t *
@@ -3447,6 +3366,7 @@ xlog_ticket_alloc(
3447 } 3366 }
3448 3367
3449 atomic_set(&tic->t_ref, 1); 3368 atomic_set(&tic->t_ref, 1);
3369 INIT_LIST_HEAD(&tic->t_queue);
3450 tic->t_unit_res = unit_bytes; 3370 tic->t_unit_res = unit_bytes;
3451 tic->t_curr_res = unit_bytes; 3371 tic->t_curr_res = unit_bytes;
3452 tic->t_cnt = cnt; 3372 tic->t_cnt = cnt;
@@ -3457,7 +3377,7 @@ xlog_ticket_alloc(
3457 tic->t_trans_type = 0; 3377 tic->t_trans_type = 0;
3458 if (xflags & XFS_LOG_PERM_RESERV) 3378 if (xflags & XFS_LOG_PERM_RESERV)
3459 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3379 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3460 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3380 init_waitqueue_head(&tic->t_wait);
3461 3381
3462 xlog_tic_reset_res(tic); 3382 xlog_tic_reset_res(tic);
3463 3383
@@ -3496,18 +3416,25 @@ xlog_verify_dest_ptr(
3496} 3416}
3497 3417
3498STATIC void 3418STATIC void
3499xlog_verify_grant_head(xlog_t *log, int equals) 3419xlog_verify_grant_tail(
3420 struct log *log)
3500{ 3421{
3501 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3422 int tail_cycle, tail_blocks;
3502 if (equals) 3423 int cycle, space;
3503 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3424
3504 else 3425 /*
3505 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3426 * Check to make sure the grant write head didn't just overlap the
3506 } else { 3427 * tail. If the cycles are the same, we can't be overlapping.
3507 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3428 * Otherwise, make sure that the cycles differ by exactly one and
3508 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3429 * check the byte count.
3509 } 3430 */
3510} /* xlog_verify_grant_head */ 3431 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3432 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3433 if (tail_cycle != cycle) {
3434 ASSERT(cycle - 1 == tail_cycle);
3435 ASSERT(space <= BBTOB(tail_blocks));
3436 }
3437}
3511 3438
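A worked instance of the invariant xlog_verify_grant_tail() checks, with illustrative numbers: in a 1000-block log whose tail sits at cycle 4, block 900, a write head that has wrapped to cycle 5 may hold at most BBTOB(900) bytes before it would overwrite the tail.

static void verify_grant_tail_example(void)
{
	int	tail_cycle = 4, tail_blocks = 900;	/* illustrative */
	int	cycle = 5, space = BBTOB(650);		/* head has wrapped */

	if (tail_cycle != cycle) {
		ASSERT(cycle - 1 == tail_cycle);	/* differ by one */
		ASSERT(space <= BBTOB(tail_blocks));	/* 650 <= 900: ok */
	}
}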
3512/* check if it will fit */ 3439/* check if it will fit */
3513STATIC void 3440STATIC void
@@ -3728,12 +3655,10 @@ xfs_log_force_umount(
3728 xlog_cil_force(log); 3655 xlog_cil_force(log);
3729 3656
3730 /* 3657 /*
3731 * We must hold both the GRANT lock and the LOG lock, 3658 * mark the filesystem and the log as being in a shutdown state and wake
3732 * before we mark the filesystem SHUTDOWN and wake 3659 * everybody up to tell them the bad news.
3733 * everybody up to tell the bad news.
3734 */ 3660 */
3735 spin_lock(&log->l_icloglock); 3661 spin_lock(&log->l_icloglock);
3736 spin_lock(&log->l_grant_lock);
3737 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3662 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3738 if (mp->m_sb_bp) 3663 if (mp->m_sb_bp)
3739 XFS_BUF_DONE(mp->m_sb_bp); 3664 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3754,27 +3679,21 @@ xfs_log_force_umount(
3754 spin_unlock(&log->l_icloglock); 3679 spin_unlock(&log->l_icloglock);
3755 3680
3756 /* 3681 /*
3757 * We don't want anybody waiting for log reservations 3682 * We don't want anybody waiting for log reservations after this. That
3758 * after this. That means we have to wake up everybody 3683 * means we have to wake up everybody queued up on reserveq as well as
3759 * queued up on reserve_headq as well as write_headq. 3684 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3760 * In addition, we make sure in xlog_{re}grant_log_space 3685 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3761 * that we don't enqueue anything once the SHUTDOWN flag 3686 * action is protected by the grant locks.
3762 * is set, and this action is protected by the GRANTLOCK.
3763 */ 3687 */
3764 if ((tic = log->l_reserve_headq)) { 3688 spin_lock(&log->l_grant_reserve_lock);
3765 do { 3689 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3766 sv_signal(&tic->t_wait); 3690 wake_up(&tic->t_wait);
3767 tic = tic->t_next; 3691 spin_unlock(&log->l_grant_reserve_lock);
3768 } while (tic != log->l_reserve_headq); 3692
3769 } 3693 spin_lock(&log->l_grant_write_lock);
3770 3694 list_for_each_entry(tic, &log->l_writeq, t_queue)
3771 if ((tic = log->l_write_headq)) { 3695 wake_up(&tic->t_wait);
3772 do { 3696 spin_unlock(&log->l_grant_write_lock);
3773 sv_signal(&tic->t_wait);
3774 tic = tic->t_next;
3775 } while (tic != log->l_write_headq);
3776 }
3777 spin_unlock(&log->l_grant_lock);
3778 3697
3779 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3698 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3780 ASSERT(!logerror); 3699 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..3bd3291ef8d2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7e206fc1fa36..9ca59be08977 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 64 init_waitqueue_head(&cil->xc_commit_wait);
65 65
66 INIT_LIST_HEAD(&ctx->committing); 66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 67 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -146,102 +146,6 @@ xlog_cil_init_post_recovery(
146} 146}
147 147
148/* 148/*
149 * Insert the log item into the CIL and calculate the difference in space
150 * consumed by the item. Add the space to the checkpoint ticket and calculate
151 * if the change requires additional log metadata. If it does, take that space
152 * as well. Remove the amount of space we added to the checkpoint ticket from
153 * the current transaction ticket so that the accounting works out correctly.
154 *
155 * If this is the first time the item is being placed into the CIL in this
156 * context, pin it so it can't be written to disk until the CIL is flushed to
157 * the iclog and the iclog written to disk.
158 */
159static void
160xlog_cil_insert(
161 struct log *log,
162 struct xlog_ticket *ticket,
163 struct xfs_log_item *item,
164 struct xfs_log_vec *lv)
165{
166 struct xfs_cil *cil = log->l_cilp;
167 struct xfs_log_vec *old = lv->lv_item->li_lv;
168 struct xfs_cil_ctx *ctx = cil->xc_ctx;
169 int len;
170 int diff_iovecs;
171 int iclog_space;
172
173 if (old) {
174 /* existing lv on log item, space used is a delta */
175 ASSERT(!list_empty(&item->li_cil));
176 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
177
178 len = lv->lv_buf_len - old->lv_buf_len;
179 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
180 kmem_free(old->lv_buf);
181 kmem_free(old);
182 } else {
183 /* new lv, must pin the log item */
184 ASSERT(!lv->lv_item->li_lv);
185 ASSERT(list_empty(&item->li_cil));
186
187 len = lv->lv_buf_len;
188 diff_iovecs = lv->lv_niovecs;
189 IOP_PIN(lv->lv_item);
190
191 }
192 len += diff_iovecs * sizeof(xlog_op_header_t);
193
194 /* attach new log vector to log item */
195 lv->lv_item->li_lv = lv;
196
197 spin_lock(&cil->xc_cil_lock);
198 list_move_tail(&item->li_cil, &cil->xc_cil);
199 ctx->nvecs += diff_iovecs;
200
201 /*
202 * If this is the first time the item is being committed to the CIL,
203 * store the sequence number on the log item so we can tell
204 * in future commits whether this is the first checkpoint the item is
205 * being committed into.
206 */
207 if (!item->li_seq)
208 item->li_seq = ctx->sequence;
209
210 /*
211 * Now transfer enough transaction reservation to the context ticket
212 * for the checkpoint. The context ticket is special - the unit
213 * reservation has to grow as well as the current reservation as we
214 * steal from tickets so we can correctly determine the space used
215 * during the transaction commit.
216 */
217 if (ctx->ticket->t_curr_res == 0) {
218 /* first commit in checkpoint, steal the header reservation */
219 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
220 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
221 ticket->t_curr_res -= ctx->ticket->t_unit_res;
222 }
223
224 /* do we need space for more log record headers? */
225 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
226 if (len > 0 && (ctx->space_used / iclog_space !=
227 (ctx->space_used + len) / iclog_space)) {
228 int hdrs;
229
230 hdrs = (len + iclog_space - 1) / iclog_space;
231 /* need to take into account split region headers, too */
232 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
233 ctx->ticket->t_unit_res += hdrs;
234 ctx->ticket->t_curr_res += hdrs;
235 ticket->t_curr_res -= hdrs;
236 ASSERT(ticket->t_curr_res >= len);
237 }
238 ticket->t_curr_res -= len;
239 ctx->space_used += len;
240
241 spin_unlock(&cil->xc_cil_lock);
242}
243
244/*
245 * Format log items into flat buffers 149 * Format log items into flat buffers
246 * 150 *
247 * For delayed logging, we need to hold a formatted buffer containing all the 151 * For delayed logging, we need to hold a formatted buffer containing all the
@@ -286,7 +190,7 @@ xlog_cil_format_items(
286 len += lv->lv_iovecp[index].i_len; 190 len += lv->lv_iovecp[index].i_len;
287 191
288 lv->lv_buf_len = len; 192 lv->lv_buf_len = len;
289 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); 193 lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
290 ptr = lv->lv_buf; 194 ptr = lv->lv_buf;
291 195
292 for (index = 0; index < lv->lv_niovecs; index++) { 196 for (index = 0; index < lv->lv_niovecs; index++) {
@@ -300,21 +204,136 @@ xlog_cil_format_items(
300 } 204 }
301} 205}
302 206
207/*
208 * Prepare the log item for insertion into the CIL. Calculate the difference in
209 * log space and vectors it will consume, and if it is a new item pin it as
210 * well.
211 */
212STATIC void
213xfs_cil_prepare_item(
214 struct log *log,
215 struct xfs_log_vec *lv,
216 int *len,
217 int *diff_iovecs)
218{
219 struct xfs_log_vec *old = lv->lv_item->li_lv;
220
221 if (old) {
222 /* existing lv on log item, space used is a delta */
223 ASSERT(!list_empty(&lv->lv_item->li_cil));
224 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
225
226 *len += lv->lv_buf_len - old->lv_buf_len;
227 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
228 kmem_free(old->lv_buf);
229 kmem_free(old);
230 } else {
231 /* new lv, must pin the log item */
232 ASSERT(!lv->lv_item->li_lv);
233 ASSERT(list_empty(&lv->lv_item->li_cil));
234
235 *len += lv->lv_buf_len;
236 *diff_iovecs += lv->lv_niovecs;
237 IOP_PIN(lv->lv_item);
238
239 }
240
241 /* attach new log vector to log item */
242 lv->lv_item->li_lv = lv;
243
244 /*
245 * If this is the first time the item is being committed to the
246 * CIL, store the sequence number on the log item so we can
247 * tell in future commits whether this is the first checkpoint
248 * the item is being committed into.
249 */
250 if (!lv->lv_item->li_seq)
251 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
252}
253
254/*
255 * Insert the log items into the CIL and calculate the difference in space
256 * consumed by the items. Add the space to the checkpoint ticket and calculate
257 * if the change requires additional log metadata. If it does, take that space
258 * as well. Remove the amount of space we added to the checkpoint ticket from
259 * the current transaction ticket so that the accounting works out correctly.
260 */
303static void 261static void
304xlog_cil_insert_items( 262xlog_cil_insert_items(
305 struct log *log, 263 struct log *log,
306 struct xfs_log_vec *log_vector, 264 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket, 265 struct xlog_ticket *ticket)
308 xfs_lsn_t *start_lsn)
309{ 266{
310 struct xfs_log_vec *lv; 267 struct xfs_cil *cil = log->l_cilp;
311 268 struct xfs_cil_ctx *ctx = cil->xc_ctx;
312 if (start_lsn) 269 struct xfs_log_vec *lv;
313 *start_lsn = log->l_cilp->xc_ctx->sequence; 270 int len = 0;
271 int diff_iovecs = 0;
272 int iclog_space;
314 273
315 ASSERT(log_vector); 274 ASSERT(log_vector);
275
276 /*
277 * Do all the accounting aggregation and switching of log vectors
278 * around in a loop separate from the insertion of items into the CIL.
279 * Then we can do a separate loop to update the CIL within a single
280 * lock/unlock pair. This reduces the number of round trips on the CIL
281 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
282 * hold time for the transaction commit.
283 *
284 * If this is the first time the item is being placed into the CIL in
285 * this context, pin it so it can't be written to disk until the CIL is
286 * flushed to the iclog and the iclog written to disk.
287 *
288 * We can do this safely because the context can't checkpoint until we
289 * are done so it doesn't matter exactly how we update the CIL.
290 */
291 for (lv = log_vector; lv; lv = lv->lv_next)
292 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
293
294 /* account for space used by new iovec headers */
295 len += diff_iovecs * sizeof(xlog_op_header_t);
296
297 spin_lock(&cil->xc_cil_lock);
298
299 /* move the items to the tail of the CIL */
316 for (lv = log_vector; lv; lv = lv->lv_next) 300 for (lv = log_vector; lv; lv = lv->lv_next)
317 xlog_cil_insert(log, ticket, lv->lv_item, lv); 301 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
302
303 ctx->nvecs += diff_iovecs;
304
305 /*
306 * Now transfer enough transaction reservation to the context ticket
307 * for the checkpoint. The context ticket is special - the unit
308 * reservation has to grow as well as the current reservation as we
309 * steal from tickets so we can correctly determine the space used
310 * during the transaction commit.
311 */
312 if (ctx->ticket->t_curr_res == 0) {
313 /* first commit in checkpoint, steal the header reservation */
314 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
315 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
316 ticket->t_curr_res -= ctx->ticket->t_unit_res;
317 }
318
319 /* do we need space for more log record headers? */
320 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
321 if (len > 0 && (ctx->space_used / iclog_space !=
322 (ctx->space_used + len) / iclog_space)) {
323 int hdrs;
324
325 hdrs = (len + iclog_space - 1) / iclog_space;
326 /* need to take into account split region headers, too */
327 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
328 ctx->ticket->t_unit_res += hdrs;
329 ctx->ticket->t_curr_res += hdrs;
330 ticket->t_curr_res -= hdrs;
331 ASSERT(ticket->t_curr_res >= len);
332 }
333 ticket->t_curr_res -= len;
334 ctx->space_used += len;
335
336 spin_unlock(&cil->xc_cil_lock);
318} 337}
319 338
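The two-pass restructuring above is the whole point of this hunk: aggregate per-item accounting without the lock, then apply the totals and splice the items in one critical section, cutting CIL lock round trips from O(nr_logvectors) to O(1). A generic miniature of the pattern follows; the types here are invented for illustration.

struct demo_item {
	struct list_head	list;
	int			len;
};

static void insert_items_sketch(struct list_head *items, spinlock_t *lock,
				struct list_head *cil, int *space_used)
{
	struct demo_item	*it;
	int			len = 0;

	/* pass 1: aggregate with no lock held */
	list_for_each_entry(it, items, list)
		len += it->len;

	/* pass 2: one lock/unlock pair applies everything */
	spin_lock(lock);
	list_splice_tail_init(items, cil);
	*space_used += len;
	spin_unlock(lock);
}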
320static void 339static void
@@ -342,15 +361,10 @@ xlog_cil_committed(
342 int abort) 361 int abort)
343{ 362{
344 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
345 struct xfs_log_vec *lv;
346 int abortflag = abort ? XFS_LI_ABORTED : 0;
347 struct xfs_busy_extent *busyp, *n; 364 struct xfs_busy_extent *busyp, *n;
348 365
349 /* unpin all the log items */ 366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
350 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 367 ctx->start_lsn, abort);
351 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
352 abortflag);
353 }
354 368
355 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
356 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -529,7 +543,7 @@ xlog_cil_push(
529 543
530 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); 544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
531 if (error) 545 if (error)
532 goto out_abort; 546 goto out_abort_free_ticket;
533 547
534 /* 548 /*
535 * now that we've written the checkpoint into the log, strictly 549 * now that we've written the checkpoint into the log, strictly
@@ -549,14 +563,15 @@ restart:
549 * It is still being pushed! Wait for the push to 563 * It is still being pushed! Wait for the push to
550 * complete, then start again from the beginning. 564 * complete, then start again from the beginning.
551 */ 565 */
552 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 566 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
553 goto restart; 567 goto restart;
554 } 568 }
555 } 569 }
556 spin_unlock(&cil->xc_cil_lock); 570 spin_unlock(&cil->xc_cil_lock);
557 571
572 /* xfs_log_done always frees the ticket on error. */
558 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 573 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
559 if (error || commit_lsn == -1) 574 if (commit_lsn == -1)
560 goto out_abort; 575 goto out_abort;
561 576
562 /* attach all the transactions w/ busy extents to iclog */ 577 /* attach all the transactions w/ busy extents to iclog */
@@ -573,7 +588,7 @@ restart:
573 */ 588 */
574 spin_lock(&cil->xc_cil_lock); 589 spin_lock(&cil->xc_cil_lock);
575 ctx->commit_lsn = commit_lsn; 590 ctx->commit_lsn = commit_lsn;
576 sv_broadcast(&cil->xc_commit_wait); 591 wake_up_all(&cil->xc_commit_wait);
577 spin_unlock(&cil->xc_cil_lock); 592 spin_unlock(&cil->xc_cil_lock);
578 593
579 /* release the hounds! */ 594 /* release the hounds! */
@@ -586,6 +601,8 @@ out_free_ticket:
586 kmem_free(new_ctx); 601 kmem_free(new_ctx);
587 return 0; 602 return 0;
588 603
604out_abort_free_ticket:
605 xfs_log_ticket_put(tic);
589out_abort: 606out_abort:
590 xlog_cil_committed(ctx, XFS_LI_ABORTED); 607 xlog_cil_committed(ctx, XFS_LI_ABORTED);
591 return XFS_ERROR(EIO); 608 return XFS_ERROR(EIO);
@@ -608,7 +625,7 @@ out_abort:
608 * background commit, returns without it held once background commits are 625 * background commit, returns without it held once background commits are
609 * allowed again. 626 * allowed again.
610 */ 627 */
611int 628void
612xfs_log_commit_cil( 629xfs_log_commit_cil(
613 struct xfs_mount *mp, 630 struct xfs_mount *mp,
614 struct xfs_trans *tp, 631 struct xfs_trans *tp,
@@ -623,11 +640,6 @@ xfs_log_commit_cil(
623 if (flags & XFS_TRANS_RELEASE_LOG_RES) 640 if (flags & XFS_TRANS_RELEASE_LOG_RES)
624 log_flags = XFS_LOG_REL_PERM_RESERV; 641 log_flags = XFS_LOG_REL_PERM_RESERV;
625 642
626 if (XLOG_FORCED_SHUTDOWN(log)) {
627 xlog_cil_free_logvec(log_vector);
628 return XFS_ERROR(EIO);
629 }
630
631 /* 643 /*
632 * do all the hard work of formatting items (including memory 644 * do all the hard work of formatting items (including memory
633 * allocation) outside the CIL context lock. This prevents stalling CIL 645 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -638,7 +650,10 @@ xfs_log_commit_cil(
638 650
639 /* lock out background commit */ 651 /* lock out background commit */
640 down_read(&log->l_cilp->xc_ctx_lock); 652 down_read(&log->l_cilp->xc_ctx_lock);
641 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); 653 if (commit_lsn)
654 *commit_lsn = log->l_cilp->xc_ctx->sequence;
655
656 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
642 657
643 /* check we didn't blow the reservation */ 658 /* check we didn't blow the reservation */
644 if (tp->t_ticket->t_curr_res < 0) 659 if (tp->t_ticket->t_curr_res < 0)
@@ -684,7 +699,6 @@ xfs_log_commit_cil(
684 */ 699 */
685 if (push) 700 if (push)
686 xlog_cil_push(log, 0); 701 xlog_cil_push(log, 0);
687 return 0;
688} 702}
689 703
690/* 704/*
@@ -735,7 +749,7 @@ restart:
735 * It is still being pushed! Wait for the push to 749 * It is still being pushed! Wait for the push to
736 * complete, then start again from the beginning. 750 * complete, then start again from the beginning.
737 */ 751 */
738 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 752 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
739 goto restart; 753 goto restart;
740 } 754 }
741 if (ctx->sequence != sequence) 755 if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617f..d5f8be8f4bf6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 131 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 132#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 133#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 134
138#define XLOG_TIC_FLAGS \ 135#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 136 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 137 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 138
143#endif /* __KERNEL__ */ 139#endif /* __KERNEL__ */
144 140
@@ -244,9 +240,8 @@ typedef struct xlog_res {
244} xlog_res_t; 240} xlog_res_t;
245 241
246typedef struct xlog_ticket { 242typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 243 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 244 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 245 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 246 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 247 int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 348 * and move everything else out to subsequent cachelines.
354 */ 349 */
355typedef struct xlog_in_core { 350typedef struct xlog_in_core {
356 sv_t ic_force_wait; 351 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 352 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 353 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 354 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 355 struct xfs_buf *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 416 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 417 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 418 struct list_head xc_committing;
424 sv_t xc_commit_wait; 419 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 420 xfs_lsn_t xc_current_sequence;
426}; 421};
427 422
@@ -491,7 +486,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 486 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 487 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 488 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 489 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 490 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 491 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 492 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 498 int l_logBBsize; /* size of log in BB chunks */
504 499
505 /* The following block of fields are changed while holding icloglock */ 500 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 501 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 502 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 503 int l_covered_state;/* state of "covering disk
509 * log entries" */ 504 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 505 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 506 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 507 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 508 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 509 * block increment */
518 int l_curr_block; /* current logical log block */ 510 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 511 int l_prev_block; /* previous logical log block */
520 512
521 /* The following block of fields are changed while holding grant_lock */ 513 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 514 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 515 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 516 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 517 * cacheline.
526 int l_grant_reserve_bytes; 518 */
527 int l_grant_write_cycle; 519 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 520 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
521 /* lsn of 1st LR with unflushed buffers */
522 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
523
524 /*
525 * ticket grant locks, queues and accounting have their own cachelines
526 * as these are quite hot and can be operated on concurrently.
527 */
528 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
529 struct list_head l_reserveq;
530 atomic64_t l_grant_reserve_head;
531
532 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
533 struct list_head l_writeq;
534 atomic64_t l_grant_write_head;
529 535
530 /* The following field are used for debugging; need to hold icloglock */ 536 /* The following field are used for debugging; need to hold icloglock */
531#ifdef DEBUG 537#ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
534 540
535} xlog_t; 541} xlog_t;
536 542
543#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
544 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
545
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 546#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 547
539/* common routines */ 548/* common routines */
@@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 571 xlog_in_core_t **commit_iclog, uint flags);
563 572
564/* 573/*
574 * When we crack an atomic LSN, we sample it first so that the value will not
575 * change while we are cracking it into the component values. This means we
576 * will always get consistent component values to work from. This should always
577 * be used to sample and crack LSNs that are stored and updated in atomic
578 * variables.
579 */
580static inline void
581xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
582{
583 xfs_lsn_t val = atomic64_read(lsn);
584
585 *cycle = CYCLE_LSN(val);
586 *block = BLOCK_LSN(val);
587}
588
589/*
590 * Calculate and assign a value to an atomic LSN variable from component pieces.
591 */
592static inline void
593xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
594{
595 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
596}
597
598/*
599 * When we crack the grant head, we sample it first so that the value will not
600 * change while we are cracking it into the component values. This means we
601 * will always get consistent component values to work from.
602 */
603static inline void
604xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
605{
606 *cycle = val >> 32;
607 *space = val & 0xffffffff;
608}
609
610static inline void
611xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
612{
613 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
614}
615
616static inline int64_t
617xlog_assign_grant_head_val(int cycle, int space)
618{
619 return ((int64_t)cycle << 32) | space;
620}
621
622static inline void
623xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
624{
625 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
626}
627
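Example use of the grant-head helpers just defined: crack the head into one consistent (cycle, space) sample, adjust, and repack. Note the read-modify-write below is not atomic as a whole; concurrent updaters need the cmpxchg-style loop sketched earlier or external serialisation. Illustrative fragment only.

static void grant_head_usage_example(atomic64_t *head)
{
	int	cycle, space;

	xlog_crack_grant_head(head, &cycle, &space);	/* one atomic sample */
	space += 512;					/* adjust components */
	xlog_assign_grant_head(head, cycle, space);	/* repack atomically */
}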
628/*
565 * Committed Item List interfaces 629 * Committed Item List interfaces
566 */ 630 */
567int xlog_cil_init(struct log *log); 631int xlog_cil_init(struct log *log);
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
585 */ 649 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 650#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 651
652/*
653 * Wrapper function for waiting on a wait queue serialised against wakeups
654 * by a spinlock. This matches the semantics of all the wait queues used in the
655 * log code.
656 */
657static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
658{
659 DECLARE_WAITQUEUE(wait, current);
660
661 add_wait_queue_exclusive(wq, &wait);
662 __set_current_state(TASK_UNINTERRUPTIBLE);
663 spin_unlock(lock);
664 schedule();
665 remove_wait_queue(wq, &wait);
666}
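The contract for xlog_wait(): call it with the waker's spinlock held, so the task is queued and marked TASK_UNINTERRUPTIBLE before the lock drops; a wakeup issued under that lock can therefore never be lost. It returns with the lock released. A hypothetical caller is sketched below (log_needs_flush() is an invented predicate).

static void wait_for_flush_example(struct log *log)
{
	spin_lock(&log->l_icloglock);
	while (log_needs_flush(log)) {		/* invented predicate */
		xlog_wait(&log->l_flush_wait, &log->l_icloglock);
		spin_lock(&log->l_icloglock);	/* xlog_wait() dropped it */
	}
	spin_unlock(&log->l_icloglock);
}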
588#endif /* __KERNEL__ */ 667#endif /* __KERNEL__ */
589 668
590#endif /* __XFS_LOG_PRIV_H__ */ 669#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6f3f5fa37acf..aa0ebb776903 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
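
The table's intended semantics, implemented later in this patch by xlog_recover_buffer_pass1() and xlog_check_buffer_cancelled(), can be summarised in a standalone userspace sketch (simplified to one bucket with plain pointers instead of list_head; not XFS code):

/* Userspace sketch (not XFS code) of the cancel-table semantics: pass 1
 * adds a record per cancel item, bumping the refcount on duplicates;
 * pass 2 drops one reference per matching cancel item and frees the
 * record when the last reference goes away. */
#include <assert.h>
#include <stdlib.h>

struct cancel_rec {
	long blkno;
	unsigned len;
	int refcount;
	struct cancel_rec *next;
};

static struct cancel_rec *bucket;	/* one bucket stands in for the table */

static void pass1_add(long blkno, unsigned len)
{
	struct cancel_rec *rec;

	for (rec = bucket; rec; rec = rec->next) {
		if (rec->blkno == blkno && rec->len == len) {
			rec->refcount++;
			return;
		}
	}
	rec = malloc(sizeof(*rec));
	rec->blkno = blkno;
	rec->len = len;
	rec->refcount = 1;
	rec->next = bucket;
	bucket = rec;
}

static int pass2_cancelled(long blkno, unsigned len, int is_cancel_item)
{
	struct cancel_rec **prev = &bucket, *rec;

	for (rec = bucket; rec; prev = &rec->next, rec = rec->next) {
		if (rec->blkno != blkno || rec->len != len)
			continue;
		if (is_cancel_item && --rec->refcount == 0) {
			*prev = rec->next;
			free(rec);
		}
		return 1;	/* matched: do not replay this buffer */
	}
	return 0;		/* no match: replay the buffer */
}

int main(void)
{
	pass1_add(100, 8);
	pass1_add(100, 8);			/* duplicate bumps refcount */
	assert(pass2_cancelled(100, 8, 1));	/* drops refcount to 1 */
	assert(pass2_cancelled(100, 8, 1));	/* last ref, record freed */
	assert(!pass2_cancelled(100, 8, 0));	/* table is empty again */
	return 0;
}
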
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
@@ -107,7 +118,8 @@ xlog_get_bp(
107 nbblks += log->l_sectBBsize; 118 nbblks += log->l_sectBBsize;
108 nbblks = round_up(nbblks, log->l_sectBBsize); 119 nbblks = round_up(nbblks, log->l_sectBBsize);
109 120
110 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 121 return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
122 BBTOB(nbblks), 0);
111} 123}
112 124
113STATIC void 125STATIC void
@@ -167,7 +179,7 @@ xlog_bread_noalign(
167 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 179 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
168 180
169 xfsbdstrat(log->l_mp, bp); 181 xfsbdstrat(log->l_mp, bp);
170 error = xfs_iowait(bp); 182 error = xfs_buf_iowait(bp);
171 if (error) 183 if (error)
172 xfs_ioerror_alert("xlog_bread", log->l_mp, 184 xfs_ioerror_alert("xlog_bread", log->l_mp,
173 bp, XFS_BUF_ADDR(bp)); 185 bp, XFS_BUF_ADDR(bp));
@@ -321,12 +333,13 @@ xlog_recover_iodone(
321 * this during recovery. One strike! 333 * this during recovery. One strike!
322 */ 334 */
323 xfs_ioerror_alert("xlog_recover_iodone", 335 xfs_ioerror_alert("xlog_recover_iodone",
324 bp->b_mount, bp, XFS_BUF_ADDR(bp)); 336 bp->b_target->bt_mount, bp,
325 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 337 XFS_BUF_ADDR(bp));
338 xfs_force_shutdown(bp->b_target->bt_mount,
339 SHUTDOWN_META_IO_ERROR);
326 } 340 }
327 bp->b_mount = NULL;
328 XFS_BUF_CLR_IODONE_FUNC(bp); 341 XFS_BUF_CLR_IODONE_FUNC(bp);
329 xfs_biodone(bp); 342 xfs_buf_ioend(bp, 0);
330} 343}
331 344
332/* 345/*
@@ -923,12 +936,12 @@ xlog_find_tail(
923 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 936 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
924 if (found == 2) 937 if (found == 2)
925 log->l_curr_cycle++; 938 log->l_curr_cycle++;
926 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 939 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
927 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 940 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
928 log->l_grant_reserve_cycle = log->l_curr_cycle; 941 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
929 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 942 BBTOB(log->l_curr_block));
930 log->l_grant_write_cycle = log->l_curr_cycle; 943 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
931 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 944 BBTOB(log->l_curr_block));
932 945
933 /* 946 /*
934 * Look for unmount record. If we find it, then we know there 947 * Look for unmount record. If we find it, then we know there
@@ -958,7 +971,7 @@ xlog_find_tail(
958 } 971 }
959 after_umount_blk = (i + hblks + (int) 972 after_umount_blk = (i + hblks + (int)
960 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 973 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
961 tail_lsn = log->l_tail_lsn; 974 tail_lsn = atomic64_read(&log->l_tail_lsn);
962 if (*head_blk == after_umount_blk && 975 if (*head_blk == after_umount_blk &&
963 be32_to_cpu(rhead->h_num_logops) == 1) { 976 be32_to_cpu(rhead->h_num_logops) == 1) {
964 umount_data_blk = (i + hblks) % log->l_logBBsize; 977 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -973,12 +986,10 @@ xlog_find_tail(
973 * log records will point recovery to after the 986 * log records will point recovery to after the
974 * current unmount record. 987 * current unmount record.
975 */ 988 */
976 log->l_tail_lsn = 989 xlog_assign_atomic_lsn(&log->l_tail_lsn,
977 xlog_assign_lsn(log->l_curr_cycle, 990 log->l_curr_cycle, after_umount_blk);
978 after_umount_blk); 991 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
979 log->l_last_sync_lsn = 992 log->l_curr_cycle, after_umount_blk);
980 xlog_assign_lsn(log->l_curr_cycle,
981 after_umount_blk);
982 *tail_blk = after_umount_blk; 993 *tail_blk = after_umount_blk;
983 994
984 /* 995 /*
@@ -1603,82 +1614,45 @@ xlog_recover_reorder_trans(
1603 * record in the table to tell us how many times we expect to see this 1614 * record in the table to tell us how many times we expect to see this
1604 * record during the second pass. 1615 * record during the second pass.
1605 */ 1616 */
1606STATIC void 1617STATIC int
1607xlog_recover_do_buffer_pass1( 1618xlog_recover_buffer_pass1(
1608 xlog_t *log, 1619 struct log *log,
1609 xfs_buf_log_format_t *buf_f) 1620 xlog_recover_item_t *item)
1610{ 1621{
1611 xfs_buf_cancel_t *bcp; 1622 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1612 xfs_buf_cancel_t *nextp; 1623 struct list_head *bucket;
1613 xfs_buf_cancel_t *prevp; 1624 struct xfs_buf_cancel *bcp;
1614 xfs_buf_cancel_t **bucket;
1615 xfs_daddr_t blkno = 0;
1616 uint len = 0;
1617 ushort flags = 0;
1618
1619 switch (buf_f->blf_type) {
1620 case XFS_LI_BUF:
1621 blkno = buf_f->blf_blkno;
1622 len = buf_f->blf_len;
1623 flags = buf_f->blf_flags;
1624 break;
1625 }
1626 1625
1627 /* 1626 /*
1628 * If this isn't a cancel buffer item, then just return. 1627 * If this isn't a cancel buffer item, then just return.
1629 */ 1628 */
1630 if (!(flags & XFS_BLF_CANCEL)) { 1629 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1631 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1630 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1632 return; 1631 return 0;
1633 }
1634
1635 /*
1636 * Insert an xfs_buf_cancel record into the hash table of
1637 * them. If there is already an identical record, bump
1638 * its reference count.
1639 */
1640 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1641 XLOG_BC_TABLE_SIZE];
1642 /*
1643 * If the hash bucket is empty then just insert a new record into
1644 * the bucket.
1645 */
1646 if (*bucket == NULL) {
1647 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1648 KM_SLEEP);
1649 bcp->bc_blkno = blkno;
1650 bcp->bc_len = len;
1651 bcp->bc_refcount = 1;
1652 bcp->bc_next = NULL;
1653 *bucket = bcp;
1654 return;
1655 } 1632 }
1656 1633
1657 /* 1634 /*
1658 * The hash bucket is not empty, so search for duplicates of our 1635 * Insert an xfs_buf_cancel record into the hash table of them.
1659 * record. If we find one them just bump its refcount. If not 1636 * If there is already an identical record, bump its reference count.
1660 * then add us at the end of the list.
1661 */ 1637 */
1662 prevp = NULL; 1638 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1663 nextp = *bucket; 1639 list_for_each_entry(bcp, bucket, bc_list) {
1664 while (nextp != NULL) { 1640 if (bcp->bc_blkno == buf_f->blf_blkno &&
1665 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1641 bcp->bc_len == buf_f->blf_len) {
1666 nextp->bc_refcount++; 1642 bcp->bc_refcount++;
1667 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1643 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1668 return; 1644 return 0;
1669 } 1645 }
1670 prevp = nextp; 1646 }
1671 nextp = nextp->bc_next; 1647
1672 } 1648 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1673 ASSERT(prevp != NULL); 1649 bcp->bc_blkno = buf_f->blf_blkno;
1674 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1650 bcp->bc_len = buf_f->blf_len;
1675 KM_SLEEP);
1676 bcp->bc_blkno = blkno;
1677 bcp->bc_len = len;
1678 bcp->bc_refcount = 1; 1651 bcp->bc_refcount = 1;
1679 bcp->bc_next = NULL; 1652 list_add_tail(&bcp->bc_list, bucket);
1680 prevp->bc_next = bcp; 1653
1681 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1654 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1655 return 0;
1682} 1656}
1683 1657
1684/* 1658/*
@@ -1696,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
1696 */ 1670 */
1697STATIC int 1671STATIC int
1698xlog_check_buffer_cancelled( 1672xlog_check_buffer_cancelled(
1699 xlog_t *log, 1673 struct log *log,
1700 xfs_daddr_t blkno, 1674 xfs_daddr_t blkno,
1701 uint len, 1675 uint len,
1702 ushort flags) 1676 ushort flags)
1703{ 1677{
1704 xfs_buf_cancel_t *bcp; 1678 struct list_head *bucket;
1705 xfs_buf_cancel_t *prevp; 1679 struct xfs_buf_cancel *bcp;
1706 xfs_buf_cancel_t **bucket;
1707 1680
1708 if (log->l_buf_cancel_table == NULL) { 1681 if (log->l_buf_cancel_table == NULL) {
1709 /* 1682 /*
@@ -1714,128 +1687,70 @@ xlog_check_buffer_cancelled(
1714 return 0; 1687 return 0;
1715 } 1688 }
1716 1689
1717 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1718 XLOG_BC_TABLE_SIZE];
1719 bcp = *bucket;
1720 if (bcp == NULL) {
1721 /*
1722 * There is no corresponding entry in the table built
1723 * in pass one, so this buffer has not been cancelled.
1724 */
1725 ASSERT(!(flags & XFS_BLF_CANCEL));
1726 return 0;
1727 }
1728
1729 /* 1690 /*
1730 * Search for an entry in the buffer cancel table that 1691 * Search for an entry in the cancel table that matches our buffer.
1731 * matches our buffer.
1732 */ 1692 */
1733 prevp = NULL; 1693 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1734 while (bcp != NULL) { 1694 list_for_each_entry(bcp, bucket, bc_list) {
1735 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1695 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1736 /* 1696 goto found;
1737 * We've go a match, so return 1 so that the
1738 * recovery of this buffer is cancelled.
1739 * If this buffer is actually a buffer cancel
1740 * log item, then decrement the refcount on the
1741 * one in the table and remove it if this is the
1742 * last reference.
1743 */
1744 if (flags & XFS_BLF_CANCEL) {
1745 bcp->bc_refcount--;
1746 if (bcp->bc_refcount == 0) {
1747 if (prevp == NULL) {
1748 *bucket = bcp->bc_next;
1749 } else {
1750 prevp->bc_next = bcp->bc_next;
1751 }
1752 kmem_free(bcp);
1753 }
1754 }
1755 return 1;
1756 }
1757 prevp = bcp;
1758 bcp = bcp->bc_next;
1759 } 1697 }
1698
1760 /* 1699 /*
1761 * We didn't find a corresponding entry in the table, so 1700 * We didn't find a corresponding entry in the table, so return 0 so
1762 * return 0 so that the buffer is NOT cancelled. 1701 * that the buffer is NOT cancelled.
1763 */ 1702 */
1764 ASSERT(!(flags & XFS_BLF_CANCEL)); 1703 ASSERT(!(flags & XFS_BLF_CANCEL));
1765 return 0; 1704 return 0;
1766}
1767 1705
1768STATIC int 1706found:
1769xlog_recover_do_buffer_pass2( 1707 /*
1770 xlog_t *log, 1708 * We've got a match, so return 1 so that the recovery of this buffer
1771 xfs_buf_log_format_t *buf_f) 1709 * is cancelled. If this buffer is actually a buffer cancel log
1772{ 1710 * item, then decrement the refcount on the one in the table and
1773 xfs_daddr_t blkno = 0; 1711 * remove it if this is the last reference.
1774 ushort flags = 0; 1712 */
1775 uint len = 0; 1713 if (flags & XFS_BLF_CANCEL) {
1776 1714 if (--bcp->bc_refcount == 0) {
1777 switch (buf_f->blf_type) { 1715 list_del(&bcp->bc_list);
1778 case XFS_LI_BUF: 1716 kmem_free(bcp);
1779 blkno = buf_f->blf_blkno; 1717 }
1780 flags = buf_f->blf_flags;
1781 len = buf_f->blf_len;
1782 break;
1783 } 1718 }
1784 1719 return 1;
1785 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1786} 1720}
1787 1721
1788/* 1722/*
1789 * Perform recovery for a buffer full of inodes. In these buffers, 1723 * Perform recovery for a buffer full of inodes. In these buffers, the only
1790 * the only data which should be recovered is that which corresponds 1724 * data which should be recovered is that which corresponds to the
1791 * to the di_next_unlinked pointers in the on disk inode structures. 1725 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1792 * The rest of the data for the inodes is always logged through the 1726 * data for the inodes is always logged through the inodes themselves rather
1793 * inodes themselves rather than the inode buffer and is recovered 1727 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1794 * in xlog_recover_do_inode_trans().
1795 * 1728 *
1796 * The only time when buffers full of inodes are fully recovered is 1729 * The only time when buffers full of inodes are fully recovered is when the
1797 * when the buffer is full of newly allocated inodes. In this case 1730 * buffer is full of newly allocated inodes. In this case the buffer will
1798 * the buffer will not be marked as an inode buffer and so will be 1731 * not be marked as an inode buffer and so will be sent to
1799 * sent to xlog_recover_do_reg_buffer() below during recovery. 1732 * xlog_recover_do_reg_buffer() below during recovery.
1800 */ 1733 */
1801STATIC int 1734STATIC int
1802xlog_recover_do_inode_buffer( 1735xlog_recover_do_inode_buffer(
1803 xfs_mount_t *mp, 1736 struct xfs_mount *mp,
1804 xlog_recover_item_t *item, 1737 xlog_recover_item_t *item,
1805 xfs_buf_t *bp, 1738 struct xfs_buf *bp,
1806 xfs_buf_log_format_t *buf_f) 1739 xfs_buf_log_format_t *buf_f)
1807{ 1740{
1808 int i; 1741 int i;
1809 int item_index; 1742 int item_index = 0;
1810 int bit; 1743 int bit = 0;
1811 int nbits; 1744 int nbits = 0;
1812 int reg_buf_offset; 1745 int reg_buf_offset = 0;
1813 int reg_buf_bytes; 1746 int reg_buf_bytes = 0;
1814 int next_unlinked_offset; 1747 int next_unlinked_offset;
1815 int inodes_per_buf; 1748 int inodes_per_buf;
1816 xfs_agino_t *logged_nextp; 1749 xfs_agino_t *logged_nextp;
1817 xfs_agino_t *buffer_nextp; 1750 xfs_agino_t *buffer_nextp;
1818 unsigned int *data_map = NULL;
1819 unsigned int map_size = 0;
1820 1751
1821 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1752 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1822 1753
1823 switch (buf_f->blf_type) {
1824 case XFS_LI_BUF:
1825 data_map = buf_f->blf_data_map;
1826 map_size = buf_f->blf_map_size;
1827 break;
1828 }
1829 /*
1830 * Set the variables corresponding to the current region to
1831 * 0 so that we'll initialize them on the first pass through
1832 * the loop.
1833 */
1834 reg_buf_offset = 0;
1835 reg_buf_bytes = 0;
1836 bit = 0;
1837 nbits = 0;
1838 item_index = 0;
1839 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1754 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1840 for (i = 0; i < inodes_per_buf; i++) { 1755 for (i = 0; i < inodes_per_buf; i++) {
1841 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1756 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1850,18 +1765,18 @@ xlog_recover_do_inode_buffer(
1850 * the current di_next_unlinked field. 1765 * the current di_next_unlinked field.
1851 */ 1766 */
1852 bit += nbits; 1767 bit += nbits;
1853 bit = xfs_next_bit(data_map, map_size, bit); 1768 bit = xfs_next_bit(buf_f->blf_data_map,
1769 buf_f->blf_map_size, bit);
1854 1770
1855 /* 1771 /*
1856 * If there are no more logged regions in the 1772 * If there are no more logged regions in the
1857 * buffer, then we're done. 1773 * buffer, then we're done.
1858 */ 1774 */
1859 if (bit == -1) { 1775 if (bit == -1)
1860 return 0; 1776 return 0;
1861 }
1862 1777
1863 nbits = xfs_contig_bits(data_map, map_size, 1778 nbits = xfs_contig_bits(buf_f->blf_data_map,
1864 bit); 1779 buf_f->blf_map_size, bit);
1865 ASSERT(nbits > 0); 1780 ASSERT(nbits > 0);
1866 reg_buf_offset = bit << XFS_BLF_SHIFT; 1781 reg_buf_offset = bit << XFS_BLF_SHIFT;
1867 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1782 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1873,9 +1788,8 @@ xlog_recover_do_inode_buffer(
1873 * di_next_unlinked field, then move on to the next 1788 * di_next_unlinked field, then move on to the next
1874 * di_next_unlinked field. 1789 * di_next_unlinked field.
1875 */ 1790 */
1876 if (next_unlinked_offset < reg_buf_offset) { 1791 if (next_unlinked_offset < reg_buf_offset)
1877 continue; 1792 continue;
1878 }
1879 1793
1880 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1794 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1881 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1795 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1911,36 +1825,29 @@ xlog_recover_do_inode_buffer(
1911 * given buffer. The bitmap in the buf log format structure indicates 1825 * given buffer. The bitmap in the buf log format structure indicates
1912 * where to place the logged data. 1826 * where to place the logged data.
1913 */ 1827 */
1914/*ARGSUSED*/
1915STATIC void 1828STATIC void
1916xlog_recover_do_reg_buffer( 1829xlog_recover_do_reg_buffer(
1917 struct xfs_mount *mp, 1830 struct xfs_mount *mp,
1918 xlog_recover_item_t *item, 1831 xlog_recover_item_t *item,
1919 xfs_buf_t *bp, 1832 struct xfs_buf *bp,
1920 xfs_buf_log_format_t *buf_f) 1833 xfs_buf_log_format_t *buf_f)
1921{ 1834{
1922 int i; 1835 int i;
1923 int bit; 1836 int bit;
1924 int nbits; 1837 int nbits;
1925 unsigned int *data_map = NULL;
1926 unsigned int map_size = 0;
1927 int error; 1838 int error;
1928 1839
1929 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1840 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1930 1841
1931 switch (buf_f->blf_type) {
1932 case XFS_LI_BUF:
1933 data_map = buf_f->blf_data_map;
1934 map_size = buf_f->blf_map_size;
1935 break;
1936 }
1937 bit = 0; 1842 bit = 0;
1938 i = 1; /* 0 is the buf format structure */ 1843 i = 1; /* 0 is the buf format structure */
1939 while (1) { 1844 while (1) {
1940 bit = xfs_next_bit(data_map, map_size, bit); 1845 bit = xfs_next_bit(buf_f->blf_data_map,
1846 buf_f->blf_map_size, bit);
1941 if (bit == -1) 1847 if (bit == -1)
1942 break; 1848 break;
1943 nbits = xfs_contig_bits(data_map, map_size, bit); 1849 nbits = xfs_contig_bits(buf_f->blf_data_map,
1850 buf_f->blf_map_size, bit);
1944 ASSERT(nbits > 0); 1851 ASSERT(nbits > 0);
1945 ASSERT(item->ri_buf[i].i_addr != NULL); 1852 ASSERT(item->ri_buf[i].i_addr != NULL);
1946 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1853 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
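
Both the inode-buffer and regular-buffer recovery paths walk the dirty-region bitmap the same way: find the next set bit, extend it to a contiguous run, and scale the run by the chunk size to get a byte region. A small userspace sketch of that walk (assuming 128-byte chunks, the usual value behind XFS_BLF_SHIFT; these are stand-ins for the real xfs_next_bit()/xfs_contig_bits() helpers):

/* Userspace sketch (not the XFS helpers) of the bitmap walk above. */
#include <stdio.h>

#define NBITS 32
#define CHUNK_SHIFT 7	/* assumed analogue of XFS_BLF_SHIFT: 128-byte chunks */

static int next_bit(unsigned int map, int start)
{
	for (int i = start; i < NBITS; i++)
		if (map & (1u << i))
			return i;
	return -1;
}

static int contig_bits(unsigned int map, int start)
{
	int n = 0;

	while (start + n < NBITS && (map & (1u << (start + n))))
		n++;
	return n;
}

int main(void)
{
	unsigned int map = 0x0000f0c0;	/* set bits: 6-7 and 12-15 */
	int bit = 0, nbits = 0;

	while ((bit = next_bit(map, bit + nbits)) != -1) {
		nbits = contig_bits(map, bit);
		printf("region at byte %d, %d bytes\n",
		       bit << CHUNK_SHIFT, nbits << CHUNK_SHIFT);
	}
	return 0;
}
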
@@ -2174,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
2174 * for more details on the implementation of the table of cancel records. 2081 * for more details on the implementation of the table of cancel records.
2175 */ 2082 */
2176STATIC int 2083STATIC int
2177xlog_recover_do_buffer_trans( 2084xlog_recover_buffer_pass2(
2178 xlog_t *log, 2085 xlog_t *log,
2179 xlog_recover_item_t *item, 2086 xlog_recover_item_t *item)
2180 int pass)
2181{ 2087{
2182 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2088 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2183 xfs_mount_t *mp; 2089 xfs_mount_t *mp = log->l_mp;
2184 xfs_buf_t *bp; 2090 xfs_buf_t *bp;
2185 int error; 2091 int error;
2186 int cancel;
2187 xfs_daddr_t blkno;
2188 int len;
2189 ushort flags;
2190 uint buf_flags; 2092 uint buf_flags;
2191 2093
2192 if (pass == XLOG_RECOVER_PASS1) { 2094 /*
2193 /* 2095 * In this pass we only want to recover all the buffers which have
2194 * In this pass we're only looking for buf items 2096 * not been cancelled and are not cancellation buffers themselves.
2195 * with the XFS_BLF_CANCEL bit set. 2097 */
2196 */ 2098 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2197 xlog_recover_do_buffer_pass1(log, buf_f); 2099 buf_f->blf_len, buf_f->blf_flags)) {
2100 trace_xfs_log_recover_buf_cancel(log, buf_f);
2198 return 0; 2101 return 0;
2199 } else {
2200 /*
2201 * In this pass we want to recover all the buffers
2202 * which have not been cancelled and are not
2203 * cancellation buffers themselves. The routine
2204 * we call here will tell us whether or not to
2205 * continue with the replay of this buffer.
2206 */
2207 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2208 if (cancel) {
2209 trace_xfs_log_recover_buf_cancel(log, buf_f);
2210 return 0;
2211 }
2212 } 2102 }
2103
2213 trace_xfs_log_recover_buf_recover(log, buf_f); 2104 trace_xfs_log_recover_buf_recover(log, buf_f);
2214 switch (buf_f->blf_type) {
2215 case XFS_LI_BUF:
2216 blkno = buf_f->blf_blkno;
2217 len = buf_f->blf_len;
2218 flags = buf_f->blf_flags;
2219 break;
2220 default:
2221 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2222 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2223 buf_f->blf_type, log->l_mp->m_logname ?
2224 log->l_mp->m_logname : "internal");
2225 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2226 XFS_ERRLEVEL_LOW, log->l_mp);
2227 return XFS_ERROR(EFSCORRUPTED);
2228 }
2229 2105
2230 mp = log->l_mp;
2231 buf_flags = XBF_LOCK; 2106 buf_flags = XBF_LOCK;
2232 if (!(flags & XFS_BLF_INODE_BUF)) 2107 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2233 buf_flags |= XBF_MAPPED; 2108 buf_flags |= XBF_MAPPED;
2234 2109
2235 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2110 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2111 buf_flags);
2236 if (XFS_BUF_ISERROR(bp)) { 2112 if (XFS_BUF_ISERROR(bp)) {
2237 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2113 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2238 bp, blkno); 2114 bp, buf_f->blf_blkno);
2239 error = XFS_BUF_GETERROR(bp); 2115 error = XFS_BUF_GETERROR(bp);
2240 xfs_buf_relse(bp); 2116 xfs_buf_relse(bp);
2241 return error; 2117 return error;
2242 } 2118 }
2243 2119
2244 error = 0; 2120 error = 0;
2245 if (flags & XFS_BLF_INODE_BUF) { 2121 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2246 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2122 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2247 } else if (flags & 2123 } else if (buf_f->blf_flags &
2248 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2124 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2249 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2125 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2250 } else { 2126 } else {
@@ -2275,8 +2151,7 @@ xlog_recover_do_buffer_trans(
2275 XFS_BUF_STALE(bp); 2151 XFS_BUF_STALE(bp);
2276 error = xfs_bwrite(mp, bp); 2152 error = xfs_bwrite(mp, bp);
2277 } else { 2153 } else {
2278 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2154 ASSERT(bp->b_target->bt_mount == mp);
2279 bp->b_mount = mp;
2280 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2155 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2281 xfs_bdwrite(mp, bp); 2156 xfs_bdwrite(mp, bp);
2282 } 2157 }
@@ -2285,16 +2160,14 @@ xlog_recover_do_buffer_trans(
2285} 2160}
2286 2161
2287STATIC int 2162STATIC int
2288xlog_recover_do_inode_trans( 2163xlog_recover_inode_pass2(
2289 xlog_t *log, 2164 xlog_t *log,
2290 xlog_recover_item_t *item, 2165 xlog_recover_item_t *item)
2291 int pass)
2292{ 2166{
2293 xfs_inode_log_format_t *in_f; 2167 xfs_inode_log_format_t *in_f;
2294 xfs_mount_t *mp; 2168 xfs_mount_t *mp = log->l_mp;
2295 xfs_buf_t *bp; 2169 xfs_buf_t *bp;
2296 xfs_dinode_t *dip; 2170 xfs_dinode_t *dip;
2297 xfs_ino_t ino;
2298 int len; 2171 int len;
2299 xfs_caddr_t src; 2172 xfs_caddr_t src;
2300 xfs_caddr_t dest; 2173 xfs_caddr_t dest;
@@ -2304,10 +2177,6 @@ xlog_recover_do_inode_trans(
2304 xfs_icdinode_t *dicp; 2177 xfs_icdinode_t *dicp;
2305 int need_free = 0; 2178 int need_free = 0;
2306 2179
2307 if (pass == XLOG_RECOVER_PASS1) {
2308 return 0;
2309 }
2310
2311 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2180 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2312 in_f = item->ri_buf[0].i_addr; 2181 in_f = item->ri_buf[0].i_addr;
2313 } else { 2182 } else {
@@ -2317,8 +2186,6 @@ xlog_recover_do_inode_trans(
2317 if (error) 2186 if (error)
2318 goto error; 2187 goto error;
2319 } 2188 }
2320 ino = in_f->ilf_ino;
2321 mp = log->l_mp;
2322 2189
2323 /* 2190 /*
2324 * Inode buffers can be freed, look out for it, 2191 * Inode buffers can be freed, look out for it,
@@ -2353,8 +2220,8 @@ xlog_recover_do_inode_trans(
2353 xfs_buf_relse(bp); 2220 xfs_buf_relse(bp);
2354 xfs_fs_cmn_err(CE_ALERT, mp, 2221 xfs_fs_cmn_err(CE_ALERT, mp,
2355 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2356 dip, bp, ino); 2223 dip, bp, in_f->ilf_ino);
2357 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2358 XFS_ERRLEVEL_LOW, mp); 2225 XFS_ERRLEVEL_LOW, mp);
2359 error = EFSCORRUPTED; 2226 error = EFSCORRUPTED;
2360 goto error; 2227 goto error;
@@ -2364,8 +2231,8 @@ xlog_recover_do_inode_trans(
2364 xfs_buf_relse(bp); 2231 xfs_buf_relse(bp);
2365 xfs_fs_cmn_err(CE_ALERT, mp, 2232 xfs_fs_cmn_err(CE_ALERT, mp,
2366 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2367 item, ino); 2234 item, in_f->ilf_ino);
2368 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2369 XFS_ERRLEVEL_LOW, mp); 2236 XFS_ERRLEVEL_LOW, mp);
2370 error = EFSCORRUPTED; 2237 error = EFSCORRUPTED;
2371 goto error; 2238 goto error;
@@ -2393,12 +2260,12 @@ xlog_recover_do_inode_trans(
2393 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2260 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2394 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2261 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2395 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2262 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2396 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2397 XFS_ERRLEVEL_LOW, mp, dicp); 2264 XFS_ERRLEVEL_LOW, mp, dicp);
2398 xfs_buf_relse(bp); 2265 xfs_buf_relse(bp);
2399 xfs_fs_cmn_err(CE_ALERT, mp, 2266 xfs_fs_cmn_err(CE_ALERT, mp,
2400 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2401 item, dip, bp, ino); 2268 item, dip, bp, in_f->ilf_ino);
2402 error = EFSCORRUPTED; 2269 error = EFSCORRUPTED;
2403 goto error; 2270 goto error;
2404 } 2271 }
@@ -2406,40 +2273,40 @@ xlog_recover_do_inode_trans(
2406 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2273 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2407 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2274 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2408 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2275 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2409 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2410 XFS_ERRLEVEL_LOW, mp, dicp); 2277 XFS_ERRLEVEL_LOW, mp, dicp);
2411 xfs_buf_relse(bp); 2278 xfs_buf_relse(bp);
2412 xfs_fs_cmn_err(CE_ALERT, mp, 2279 xfs_fs_cmn_err(CE_ALERT, mp,
2413 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2414 item, dip, bp, ino); 2281 item, dip, bp, in_f->ilf_ino);
2415 error = EFSCORRUPTED; 2282 error = EFSCORRUPTED;
2416 goto error; 2283 goto error;
2417 } 2284 }
2418 } 2285 }
2419 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2286 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2420 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2421 XFS_ERRLEVEL_LOW, mp, dicp); 2288 XFS_ERRLEVEL_LOW, mp, dicp);
2422 xfs_buf_relse(bp); 2289 xfs_buf_relse(bp);
2423 xfs_fs_cmn_err(CE_ALERT, mp, 2290 xfs_fs_cmn_err(CE_ALERT, mp,
2424 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2425 item, dip, bp, ino, 2292 item, dip, bp, in_f->ilf_ino,
2426 dicp->di_nextents + dicp->di_anextents, 2293 dicp->di_nextents + dicp->di_anextents,
2427 dicp->di_nblocks); 2294 dicp->di_nblocks);
2428 error = EFSCORRUPTED; 2295 error = EFSCORRUPTED;
2429 goto error; 2296 goto error;
2430 } 2297 }
2431 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2298 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2432 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2433 XFS_ERRLEVEL_LOW, mp, dicp); 2300 XFS_ERRLEVEL_LOW, mp, dicp);
2434 xfs_buf_relse(bp); 2301 xfs_buf_relse(bp);
2435 xfs_fs_cmn_err(CE_ALERT, mp, 2302 xfs_fs_cmn_err(CE_ALERT, mp,
2436 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2437 item, dip, bp, ino, dicp->di_forkoff); 2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2438 error = EFSCORRUPTED; 2305 error = EFSCORRUPTED;
2439 goto error; 2306 goto error;
2440 } 2307 }
2441 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2308 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2442 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2443 XFS_ERRLEVEL_LOW, mp, dicp); 2310 XFS_ERRLEVEL_LOW, mp, dicp);
2444 xfs_buf_relse(bp); 2311 xfs_buf_relse(bp);
2445 xfs_fs_cmn_err(CE_ALERT, mp, 2312 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2531,7 +2398,7 @@ xlog_recover_do_inode_trans(
2531 break; 2398 break;
2532 2399
2533 default: 2400 default:
2534 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
2535 ASSERT(0); 2402 ASSERT(0);
2536 xfs_buf_relse(bp); 2403 xfs_buf_relse(bp);
2537 error = EIO; 2404 error = EIO;
@@ -2540,8 +2407,7 @@ xlog_recover_do_inode_trans(
2540 } 2407 }
2541 2408
2542write_inode_buffer: 2409write_inode_buffer:
2543 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2410 ASSERT(bp->b_target->bt_mount == mp);
2544 bp->b_mount = mp;
2545 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2411 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2546 xfs_bdwrite(mp, bp); 2412 xfs_bdwrite(mp, bp);
2547error: 2413error:
@@ -2556,18 +2422,11 @@ error:
2556 * of that type. 2422 * of that type.
2557 */ 2423 */
2558STATIC int 2424STATIC int
2559xlog_recover_do_quotaoff_trans( 2425xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2426 xlog_t *log,
2561 xlog_recover_item_t *item, 2427 xlog_recover_item_t *item)
2562 int pass)
2563{ 2428{
2564 xfs_qoff_logformat_t *qoff_f; 2429 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2430 ASSERT(qoff_f);
2572 2431
2573 /* 2432 /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2447 * Recover a dquot record
2589 */ 2448 */
2590STATIC int 2449STATIC int
2591xlog_recover_do_dquot_trans( 2450xlog_recover_dquot_pass2(
2592 xlog_t *log, 2451 xlog_t *log,
2593 xlog_recover_item_t *item, 2452 xlog_recover_item_t *item)
2594 int pass)
2595{ 2453{
2596 xfs_mount_t *mp; 2454 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2455 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2456 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2457 int error;
2600 xfs_dq_logformat_t *dq_f; 2458 xfs_dq_logformat_t *dq_f;
2601 uint type; 2459 uint type;
2602 2460
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2461
2608 /* 2462 /*
2609 * Filesystems are required to send in quota flags at mount time. 2463 * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
2647 if ((error = xfs_qm_dqcheck(recddq, 2501 if ((error = xfs_qm_dqcheck(recddq,
2648 dq_f->qlf_id, 2502 dq_f->qlf_id,
2649 0, XFS_QMOPT_DOWARN, 2503 0, XFS_QMOPT_DOWARN,
2650 "xlog_recover_do_dquot_trans (log copy)"))) { 2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2651 return XFS_ERROR(EIO); 2505 return XFS_ERROR(EIO);
2652 } 2506 }
2653 ASSERT(dq_f->qlf_len == 1); 2507 ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
2670 * minimal initialization then. 2524 * minimal initialization then.
2671 */ 2525 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2527 "xlog_recover_dquot_pass2")) {
2674 xfs_buf_relse(bp); 2528 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2529 return XFS_ERROR(EIO);
2676 } 2530 }
@@ -2678,8 +2532,7 @@ xlog_recover_do_dquot_trans(
2678 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2532 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2679 2533
2680 ASSERT(dq_f->qlf_size == 2); 2534 ASSERT(dq_f->qlf_size == 2);
2681 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2535 ASSERT(bp->b_target->bt_mount == mp);
2682 bp->b_mount = mp;
2683 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2536 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2684 xfs_bdwrite(mp, bp); 2537 xfs_bdwrite(mp, bp);
2685 2538
@@ -2694,38 +2547,31 @@ xlog_recover_do_dquot_trans(
2694 * LSN. 2547 * LSN.
2695 */ 2548 */
2696STATIC int 2549STATIC int
2697xlog_recover_do_efi_trans( 2550xlog_recover_efi_pass2(
2698 xlog_t *log, 2551 xlog_t *log,
2699 xlog_recover_item_t *item, 2552 xlog_recover_item_t *item,
2700 xfs_lsn_t lsn, 2553 xfs_lsn_t lsn)
2701 int pass)
2702{ 2554{
2703 int error; 2555 int error;
2704 xfs_mount_t *mp; 2556 xfs_mount_t *mp = log->l_mp;
2705 xfs_efi_log_item_t *efip; 2557 xfs_efi_log_item_t *efip;
2706 xfs_efi_log_format_t *efi_formatp; 2558 xfs_efi_log_format_t *efi_formatp;
2707 2559
2708 if (pass == XLOG_RECOVER_PASS1) {
2709 return 0;
2710 }
2711
2712 efi_formatp = item->ri_buf[0].i_addr; 2560 efi_formatp = item->ri_buf[0].i_addr;
2713 2561
2714 mp = log->l_mp;
2715 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2562 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2716 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2563 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2717 &(efip->efi_format)))) { 2564 &(efip->efi_format)))) {
2718 xfs_efi_item_free(efip); 2565 xfs_efi_item_free(efip);
2719 return error; 2566 return error;
2720 } 2567 }
2721 efip->efi_next_extent = efi_formatp->efi_nextents; 2568 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2722 efip->efi_flags |= XFS_EFI_COMMITTED;
2723 2569
2724 spin_lock(&log->l_ailp->xa_lock); 2570 spin_lock(&log->l_ailp->xa_lock);
2725 /* 2571 /*
2726 * xfs_trans_ail_update() drops the AIL lock. 2572 * xfs_trans_ail_update() drops the AIL lock.
2727 */ 2573 */
2728 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2574 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2729 return 0; 2575 return 0;
2730} 2576}
2731 2577
@@ -2738,11 +2584,10 @@ xlog_recover_do_efi_trans(
2738 * efd format structure. If we find it, we remove the efi from the 2584 * efd format structure. If we find it, we remove the efi from the
2739 * AIL and free it. 2585 * AIL and free it.
2740 */ 2586 */
2741STATIC void 2587STATIC int
2742xlog_recover_do_efd_trans( 2588xlog_recover_efd_pass2(
2743 xlog_t *log, 2589 xlog_t *log,
2744 xlog_recover_item_t *item, 2590 xlog_recover_item_t *item)
2745 int pass)
2746{ 2591{
2747 xfs_efd_log_format_t *efd_formatp; 2592 xfs_efd_log_format_t *efd_formatp;
2748 xfs_efi_log_item_t *efip = NULL; 2593 xfs_efi_log_item_t *efip = NULL;
@@ -2751,10 +2596,6 @@ xlog_recover_do_efd_trans(
2751 struct xfs_ail_cursor cur; 2596 struct xfs_ail_cursor cur;
2752 struct xfs_ail *ailp = log->l_ailp; 2597 struct xfs_ail *ailp = log->l_ailp;
2753 2598
2754 if (pass == XLOG_RECOVER_PASS1) {
2755 return;
2756 }
2757
2758 efd_formatp = item->ri_buf[0].i_addr; 2599 efd_formatp = item->ri_buf[0].i_addr;
2759 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2600 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2760 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2601 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2786,62 +2627,6 @@ xlog_recover_do_efd_trans(
2786 } 2627 }
2787 xfs_trans_ail_cursor_done(ailp, &cur); 2628 xfs_trans_ail_cursor_done(ailp, &cur);
2788 spin_unlock(&ailp->xa_lock); 2629 spin_unlock(&ailp->xa_lock);
2789}
2790
2791/*
2792 * Perform the transaction
2793 *
2794 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2795 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2796 */
2797STATIC int
2798xlog_recover_do_trans(
2799 xlog_t *log,
2800 xlog_recover_t *trans,
2801 int pass)
2802{
2803 int error = 0;
2804 xlog_recover_item_t *item;
2805
2806 error = xlog_recover_reorder_trans(log, trans, pass);
2807 if (error)
2808 return error;
2809
2810 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2811 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2812 switch (ITEM_TYPE(item)) {
2813 case XFS_LI_BUF:
2814 error = xlog_recover_do_buffer_trans(log, item, pass);
2815 break;
2816 case XFS_LI_INODE:
2817 error = xlog_recover_do_inode_trans(log, item, pass);
2818 break;
2819 case XFS_LI_EFI:
2820 error = xlog_recover_do_efi_trans(log, item,
2821 trans->r_lsn, pass);
2822 break;
2823 case XFS_LI_EFD:
2824 xlog_recover_do_efd_trans(log, item, pass);
2825 error = 0;
2826 break;
2827 case XFS_LI_DQUOT:
2828 error = xlog_recover_do_dquot_trans(log, item, pass);
2829 break;
2830 case XFS_LI_QUOTAOFF:
2831 error = xlog_recover_do_quotaoff_trans(log, item,
2832 pass);
2833 break;
2834 default:
2835 xlog_warn(
2836 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2837 ASSERT(0);
2838 error = XFS_ERROR(EIO);
2839 break;
2840 }
2841
2842 if (error)
2843 return error;
2844 }
2845 2630
2846 return 0; 2631 return 0;
2847} 2632}
@@ -2853,7 +2638,7 @@ xlog_recover_do_trans(
2853 */ 2638 */
2854STATIC void 2639STATIC void
2855xlog_recover_free_trans( 2640xlog_recover_free_trans(
2856 xlog_recover_t *trans) 2641 struct xlog_recover *trans)
2857{ 2642{
2858 xlog_recover_item_t *item, *n; 2643 xlog_recover_item_t *item, *n;
2859 int i; 2644 int i;
@@ -2872,17 +2657,95 @@ xlog_recover_free_trans(
2872} 2657}
2873 2658
2874STATIC int 2659STATIC int
2660xlog_recover_commit_pass1(
2661 struct log *log,
2662 struct xlog_recover *trans,
2663 xlog_recover_item_t *item)
2664{
2665 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2666
2667 switch (ITEM_TYPE(item)) {
2668 case XFS_LI_BUF:
2669 return xlog_recover_buffer_pass1(log, item);
2670 case XFS_LI_QUOTAOFF:
2671 return xlog_recover_quotaoff_pass1(log, item);
2672 case XFS_LI_INODE:
2673 case XFS_LI_EFI:
2674 case XFS_LI_EFD:
2675 case XFS_LI_DQUOT:
2676 /* nothing to do in pass 1 */
2677 return 0;
2678 default:
2679 xlog_warn(
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1",
2681 ITEM_TYPE(item));
2682 ASSERT(0);
2683 return XFS_ERROR(EIO);
2684 }
2685}
2686
2687STATIC int
2688xlog_recover_commit_pass2(
2689 struct log *log,
2690 struct xlog_recover *trans,
2691 xlog_recover_item_t *item)
2692{
2693 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2694
2695 switch (ITEM_TYPE(item)) {
2696 case XFS_LI_BUF:
2697 return xlog_recover_buffer_pass2(log, item);
2698 case XFS_LI_INODE:
2699 return xlog_recover_inode_pass2(log, item);
2700 case XFS_LI_EFI:
2701 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2702 case XFS_LI_EFD:
2703 return xlog_recover_efd_pass2(log, item);
2704 case XFS_LI_DQUOT:
2705 return xlog_recover_dquot_pass2(log, item);
2706 case XFS_LI_QUOTAOFF:
2707 /* nothing to do in pass2 */
2708 return 0;
2709 default:
2710 xlog_warn(
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2",
2712 ITEM_TYPE(item));
2713 ASSERT(0);
2714 return XFS_ERROR(EIO);
2715 }
2716}
2717
2718/*
2719 * Perform the transaction.
2720 *
2721 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2722 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2723 */
2724STATIC int
2875xlog_recover_commit_trans( 2725xlog_recover_commit_trans(
2876 xlog_t *log, 2726 struct log *log,
2877 xlog_recover_t *trans, 2727 struct xlog_recover *trans,
2878 int pass) 2728 int pass)
2879{ 2729{
2880 int error; 2730 int error = 0;
2731 xlog_recover_item_t *item;
2881 2732
2882 hlist_del(&trans->r_list); 2733 hlist_del(&trans->r_list);
2883 if ((error = xlog_recover_do_trans(log, trans, pass))) 2734
2735 error = xlog_recover_reorder_trans(log, trans, pass);
2736 if (error)
2884 return error; 2737 return error;
2885 xlog_recover_free_trans(trans); /* no error */ 2738
2739 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2740 if (pass == XLOG_RECOVER_PASS1)
2741 error = xlog_recover_commit_pass1(log, trans, item);
2742 else
2743 error = xlog_recover_commit_pass2(log, trans, item);
2744 if (error)
2745 return error;
2746 }
2747
2748 xlog_recover_free_trans(trans);
2886 return 0; 2749 return 0;
2887} 2750}
2888 2751
@@ -3012,7 +2875,7 @@ xlog_recover_process_efi(
3012 xfs_extent_t *extp; 2875 xfs_extent_t *extp;
3013 xfs_fsblock_t startblock_fsb; 2876 xfs_fsblock_t startblock_fsb;
3014 2877
3015 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2878 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3016 2879
3017 /* 2880 /*
3018 * First check the validity of the extents described by the 2881 * First check the validity of the extents described by the
@@ -3051,7 +2914,7 @@ xlog_recover_process_efi(
3051 extp->ext_len); 2914 extp->ext_len);
3052 } 2915 }
3053 2916
3054 efip->efi_flags |= XFS_EFI_RECOVERED; 2917 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3055 error = xfs_trans_commit(tp, 0); 2918 error = xfs_trans_commit(tp, 0);
3056 return error; 2919 return error;
3057 2920
@@ -3108,7 +2971,7 @@ xlog_recover_process_efis(
3108 * Skip EFIs that we've already processed. 2971 * Skip EFIs that we've already processed.
3109 */ 2972 */
3110 efip = (xfs_efi_log_item_t *)lip; 2973 efip = (xfs_efi_log_item_t *)lip;
3111 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2974 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3112 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2975 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3113 continue; 2976 continue;
3114 } 2977 }
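
The switch from "efi_flags |= ..." to set_bit()/test_bit() matters because a plain read-modify-write can silently lose a concurrent update to another bit in the same word. A minimal userspace illustration with C11 atomics (not kernel code; atomic_fetch_or plays the role of the kernel's set_bit()):

/* Userspace sketch of the flag change above (not kernel code): a plain
 * "flags |= bit" is a non-atomic read-modify-write; atomic_fetch_or,
 * like set_bit(), cannot lose a racing update to the same word. */
#include <stdatomic.h>
#include <stdbool.h>

#define EFI_RECOVERED 0	/* bit number, standing in for XFS_EFI_RECOVERED */

static _Atomic unsigned long efi_flags;

static void set_recovered(void)
{
	atomic_fetch_or(&efi_flags, 1UL << EFI_RECOVERED);
}

static bool test_recovered(void)
{
	return atomic_load(&efi_flags) & (1UL << EFI_RECOVERED);
}

int main(void)
{
	set_recovered();
	return test_recovered() ? 0 : 1;
}
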
@@ -3725,7 +3588,7 @@ xlog_do_log_recovery(
3725 xfs_daddr_t head_blk, 3588 xfs_daddr_t head_blk,
3726 xfs_daddr_t tail_blk) 3589 xfs_daddr_t tail_blk)
3727{ 3590{
3728 int error; 3591 int error, i;
3729 3592
3730 ASSERT(head_blk != tail_blk); 3593 ASSERT(head_blk != tail_blk);
3731 3594
@@ -3733,10 +3596,12 @@ xlog_do_log_recovery(
3733 * First do a pass to find all of the cancelled buf log items. 3596 * First do a pass to find all of the cancelled buf log items.
3734 * Store them in the buf_cancel_table for use in the second pass. 3597 * Store them in the buf_cancel_table for use in the second pass.
3735 */ 3598 */
3736 log->l_buf_cancel_table = 3599 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3737 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3600 sizeof(struct list_head),
3738 sizeof(xfs_buf_cancel_t*),
3739 KM_SLEEP); 3601 KM_SLEEP);
3602 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3603 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3604
3740 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3605 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3741 XLOG_RECOVER_PASS1); 3606 XLOG_RECOVER_PASS1);
3742 if (error != 0) { 3607 if (error != 0) {
@@ -3755,7 +3620,7 @@ xlog_do_log_recovery(
3755 int i; 3620 int i;
3756 3621
3757 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3622 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3758 ASSERT(log->l_buf_cancel_table[i] == NULL); 3623 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3759 } 3624 }
3760#endif /* DEBUG */ 3625#endif /* DEBUG */
3761 3626
@@ -3817,7 +3682,7 @@ xlog_do_recover(
3817 XFS_BUF_READ(bp); 3682 XFS_BUF_READ(bp);
3818 XFS_BUF_UNASYNC(bp); 3683 XFS_BUF_UNASYNC(bp);
3819 xfsbdstrat(log->l_mp, bp); 3684 xfsbdstrat(log->l_mp, bp);
3820 error = xfs_iowait(bp); 3685 error = xfs_buf_iowait(bp);
3821 if (error) { 3686 if (error) {
3822 xfs_ioerror_alert("xlog_do_recover", 3687 xfs_ioerror_alert("xlog_do_recover",
3823 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3688 log->l_mp, bp, XFS_BUF_ADDR(bp));
@@ -3935,7 +3800,7 @@ xlog_recover_finish(
3935 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3936 } else { 3801 } else {
3937 cmn_err(CE_DEBUG, 3802 cmn_err(CE_DEBUG,
3938 "!Ending clean XFS mount for filesystem: %s\n", 3803 "Ending clean XFS mount for filesystem: %s\n",
3939 log->l_mp->m_fsname); 3804 log->l_mp->m_fsname);
3940 } 3805 }
3941 return 0; 3806 return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aeb9d72ebf6e..d447aef84bc3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -52,16 +52,11 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
52 int); 52 int);
53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, 53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
54 int); 54 int);
55STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
56 int64_t, int);
57STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); 55STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
58
59#else 56#else
60 57
61#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) 58#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
62#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) 59#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
63#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0)
64
65#endif 60#endif
66 61
67static const struct { 62static const struct {
@@ -199,6 +194,8 @@ xfs_uuid_unmount(
199 194
200/* 195/*
201 * Reference counting access wrappers to the perag structures. 196 * Reference counting access wrappers to the perag structures.
 197 * Because we never free per-ag structures, the only thing we
 198 * have to protect against is modification of the tree structure itself.
202 */ 199 */
203struct xfs_perag * 200struct xfs_perag *
204xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) 201xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -206,19 +203,43 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
206 struct xfs_perag *pag; 203 struct xfs_perag *pag;
207 int ref = 0; 204 int ref = 0;
208 205
209 spin_lock(&mp->m_perag_lock); 206 rcu_read_lock();
210 pag = radix_tree_lookup(&mp->m_perag_tree, agno); 207 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
211 if (pag) { 208 if (pag) {
212 ASSERT(atomic_read(&pag->pag_ref) >= 0); 209 ASSERT(atomic_read(&pag->pag_ref) >= 0);
213 /* catch leaks in the positive direction during testing */
214 ASSERT(atomic_read(&pag->pag_ref) < 1000);
215 ref = atomic_inc_return(&pag->pag_ref); 210 ref = atomic_inc_return(&pag->pag_ref);
216 } 211 }
217 spin_unlock(&mp->m_perag_lock); 212 rcu_read_unlock();
218 trace_xfs_perag_get(mp, agno, ref, _RET_IP_); 213 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
219 return pag; 214 return pag;
220} 215}
221 216
217/*
218 * search from @first to find the next perag with the given tag set.
219 */
220struct xfs_perag *
221xfs_perag_get_tag(
222 struct xfs_mount *mp,
223 xfs_agnumber_t first,
224 int tag)
225{
226 struct xfs_perag *pag;
227 int found;
228 int ref;
229
230 rcu_read_lock();
231 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
232 (void **)&pag, first, 1, tag);
233 if (found <= 0) {
234 rcu_read_unlock();
235 return NULL;
236 }
237 ref = atomic_inc_return(&pag->pag_ref);
238 rcu_read_unlock();
239 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
240 return pag;
241}
242
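
Both lookup functions follow the same shape: find the object lock-free on the read side and pin it with a reference before returning, leaving m_perag_lock to serialise only modifications of the tree. A userspace sketch of that shape (C11 atomics standing in for the RCU-protected radix tree; hypothetical names, not XFS code):

/* Userspace sketch (not XFS code): lock-free lookup plus refcount pin.
 * Safe only if frees are deferred until no reader can still see the
 * pointer, which is what call_rcu() guarantees in the code below. */
#include <stdatomic.h>
#include <stdio.h>

struct obj {
	atomic_int ref;
	int id;
};

#define NR_SLOTS 4
static _Atomic(struct obj *) table[NR_SLOTS];	/* the "tree" */

static struct obj *obj_get(unsigned slot)
{
	struct obj *obj = atomic_load(&table[slot]);

	if (obj)
		atomic_fetch_add(&obj->ref, 1);	/* pin before use */
	return obj;
}

static void obj_put(struct obj *obj)
{
	atomic_fetch_sub(&obj->ref, 1);
}

int main(void)
{
	static struct obj o = { .id = 42 };
	struct obj *p;

	atomic_store(&table[0], &o);
	p = obj_get(0);
	printf("got obj %d, ref %d\n", p->id, atomic_load(&p->ref));
	obj_put(p);
	return 0;
}
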
222void 243void
223xfs_perag_put(struct xfs_perag *pag) 244xfs_perag_put(struct xfs_perag *pag)
224{ 245{
@@ -229,10 +250,18 @@ xfs_perag_put(struct xfs_perag *pag)
229 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); 250 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
230} 251}
231 252
253STATIC void
254__xfs_free_perag(
255 struct rcu_head *head)
256{
257 struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
258
259 ASSERT(atomic_read(&pag->pag_ref) == 0);
260 kmem_free(pag);
261}
262
232/* 263/*
233 * Free up the resources associated with a mount structure. Assume that 264 * Free up the per-ag resources associated with the mount structure.
234 * the structure was initially zeroed, so we can tell which fields got
235 * initialized.
236 */ 265 */
237STATIC void 266STATIC void
238xfs_free_perag( 267xfs_free_perag(
@@ -244,10 +273,10 @@ xfs_free_perag(
244 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 273 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
245 spin_lock(&mp->m_perag_lock); 274 spin_lock(&mp->m_perag_lock);
246 pag = radix_tree_delete(&mp->m_perag_tree, agno); 275 pag = radix_tree_delete(&mp->m_perag_tree, agno);
276 spin_unlock(&mp->m_perag_lock);
247 ASSERT(pag); 277 ASSERT(pag);
248 ASSERT(atomic_read(&pag->pag_ref) == 0); 278 ASSERT(atomic_read(&pag->pag_ref) == 0);
249 spin_unlock(&mp->m_perag_lock); 279 call_rcu(&pag->rcu_head, __xfs_free_perag);
250 kmem_free(pag);
251 } 280 }
252} 281}
253 282
@@ -443,8 +472,11 @@ xfs_initialize_perag(
443 goto out_unwind; 472 goto out_unwind;
444 pag->pag_agno = index; 473 pag->pag_agno = index;
445 pag->pag_mount = mp; 474 pag->pag_mount = mp;
446 rwlock_init(&pag->pag_ici_lock); 475 spin_lock_init(&pag->pag_ici_lock);
476 mutex_init(&pag->pag_ici_reclaim_lock);
447 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
478 spin_lock_init(&pag->pag_buf_lock);
479 pag->pag_buf_tree = RB_ROOT;
448 480
449 if (radix_tree_preload(GFP_NOFS)) 481 if (radix_tree_preload(GFP_NOFS))
450 goto out_unwind; 482 goto out_unwind;
@@ -639,7 +671,6 @@ int
639xfs_readsb(xfs_mount_t *mp, int flags) 671xfs_readsb(xfs_mount_t *mp, int flags)
640{ 672{
641 unsigned int sector_size; 673 unsigned int sector_size;
642 unsigned int extra_flags;
643 xfs_buf_t *bp; 674 xfs_buf_t *bp;
644 int error; 675 int error;
645 676
@@ -652,28 +683,24 @@ xfs_readsb(xfs_mount_t *mp, int flags)
652 * access to the superblock. 683 * access to the superblock.
653 */ 684 */
654 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 685 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
655 extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
656 686
657 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 687reread:
658 extra_flags); 688 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
659 if (!bp || XFS_BUF_ISERROR(bp)) { 689 XFS_SB_DADDR, sector_size, 0);
660 xfs_fs_mount_cmn_err(flags, "SB read failed"); 690 if (!bp) {
661 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; 691 xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
662 goto fail; 692 return EIO;
663 } 693 }
664 ASSERT(XFS_BUF_ISBUSY(bp));
665 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
666 694
667 /* 695 /*
668 * Initialize the mount structure from the superblock. 696 * Initialize the mount structure from the superblock.
669 * But first do some basic consistency checking. 697 * But first do some basic consistency checking.
670 */ 698 */
671 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 699 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
672
673 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 700 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
674 if (error) { 701 if (error) {
675 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 702 xfs_fs_mount_cmn_err(flags, "SB validate failed");
676 goto fail; 703 goto release_buf;
677 } 704 }
678 705
679 /* 706 /*
@@ -684,7 +711,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
684 "device supports only %u byte sectors (not %u)", 711 "device supports only %u byte sectors (not %u)",
685 sector_size, mp->m_sb.sb_sectsize); 712 sector_size, mp->m_sb.sb_sectsize);
686 error = ENOSYS; 713 error = ENOSYS;
687 goto fail; 714 goto release_buf;
688 } 715 }
689 716
690 /* 717 /*
@@ -692,33 +719,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
692 * re-read the superblock so the buffer is correctly sized. 719 * re-read the superblock so the buffer is correctly sized.
693 */ 720 */
694 if (sector_size < mp->m_sb.sb_sectsize) { 721 if (sector_size < mp->m_sb.sb_sectsize) {
695 XFS_BUF_UNMANAGE(bp);
696 xfs_buf_relse(bp); 722 xfs_buf_relse(bp);
697 sector_size = mp->m_sb.sb_sectsize; 723 sector_size = mp->m_sb.sb_sectsize;
698 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, 724 goto reread;
699 BTOBB(sector_size), extra_flags);
700 if (!bp || XFS_BUF_ISERROR(bp)) {
701 xfs_fs_mount_cmn_err(flags, "SB re-read failed");
702 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
703 goto fail;
704 }
705 ASSERT(XFS_BUF_ISBUSY(bp));
706 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
707 } 725 }
708 726
709 /* Initialize per-cpu counters */ 727 /* Initialize per-cpu counters */
710 xfs_icsb_reinit_counters(mp); 728 xfs_icsb_reinit_counters(mp);
711 729
712 mp->m_sb_bp = bp; 730 mp->m_sb_bp = bp;
713 xfs_buf_relse(bp); 731 xfs_buf_unlock(bp);
714 ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
715 return 0; 732 return 0;
716 733
717 fail: 734release_buf:
718 if (bp) { 735 xfs_buf_relse(bp);
719 XFS_BUF_UNMANAGE(bp);
720 xfs_buf_relse(bp);
721 }
722 return error; 736 return error;
723} 737}
724 738
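
The reread loop above replaces the old open-coded second read: read the superblock with a provisional sector size, and if the superblock reports a larger one, release the buffer and go around again at the correct size. A standalone userspace sketch of the pattern (hypothetical helpers and sizes, not XFS code):

/* Userspace sketch of the "reread" pattern above (not XFS code). */
#include <stdio.h>
#include <stdlib.h>

struct buf { unsigned size; };

static unsigned device_sb_sectsize = 4096;	/* pretend on-disk value */

static struct buf *read_sb(unsigned sector_size)
{
	struct buf *bp = malloc(sizeof(*bp));

	bp->size = sector_size;
	return bp;
}

static struct buf *read_superblock(unsigned dev_sector_size)
{
	unsigned sector_size = dev_sector_size;
	struct buf *bp;

reread:
	bp = read_sb(sector_size);
	if (bp->size < device_sb_sectsize) {
		/* buffer too small for the real sector size: try again */
		sector_size = device_sb_sectsize;
		free(bp);
		goto reread;
	}
	return bp;
}

int main(void)
{
	struct buf *bp = read_superblock(512);

	printf("superblock buffer sized %u bytes\n", bp->size);
	free(bp);
	return 0;
}
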
@@ -961,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
961} 975}
962 976
963/* 977/*
978 * precalculate the low space thresholds for dynamic speculative preallocation.
979 */
980void
981xfs_set_low_space_thresholds(
982 struct xfs_mount *mp)
983{
984 int i;
985
986 for (i = 0; i < XFS_LOWSP_MAX; i++) {
987 __uint64_t space = mp->m_sb.sb_dblocks;
988
989 do_div(space, 100);
990 mp->m_low_space[i] = space * (i + 1);
991 }
992}
993
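
Each threshold is simply (i + 1) percent of the data block count, with the division done first exactly as do_div() does it above. A quick userspace check of the arithmetic (XFS_LOWSP_MAX assumed to be 5; not XFS code):

/* Userspace sketch of the threshold precalculation (not XFS code). */
#include <stdint.h>
#include <stdio.h>

#define LOWSP_MAX 5	/* assumed analogue of XFS_LOWSP_MAX */

int main(void)
{
	uint64_t dblocks = 26214400ULL;	/* e.g. 100 GiB of 4k blocks */
	uint64_t low_space[LOWSP_MAX];

	for (int i = 0; i < LOWSP_MAX; i++) {
		uint64_t space = dblocks / 100;	/* do_div(space, 100) */

		low_space[i] = space * (i + 1);
		printf("threshold %d%%: %llu blocks\n", i + 1,
		       (unsigned long long)low_space[i]);
	}
	return 0;
}
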
994
995/*
964 * Set whether we're using inode alignment. 996 * Set whether we're using inode alignment.
965 */ 997 */
966STATIC void 998STATIC void
@@ -991,42 +1023,35 @@ xfs_check_sizes(xfs_mount_t *mp)
991{ 1023{
992 xfs_buf_t *bp; 1024 xfs_buf_t *bp;
993 xfs_daddr_t d; 1025 xfs_daddr_t d;
994 int error;
995 1026
996 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
997 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
998 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1029 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
999 return XFS_ERROR(EFBIG); 1030 return XFS_ERROR(EFBIG);
1000 } 1031 }
1001 error = xfs_read_buf(mp, mp->m_ddev_targp, 1032 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1002 d - XFS_FSS_TO_BB(mp, 1), 1033 d - XFS_FSS_TO_BB(mp, 1),
1003 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1034 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1004 if (!error) { 1035 if (!bp) {
1005 xfs_buf_relse(bp); 1036 cmn_err(CE_WARN, "XFS: last sector read failed");
1006 } else { 1037 return EIO;
1007 cmn_err(CE_WARN, "XFS: size check 2 failed");
1008 if (error == ENOSPC)
1009 error = XFS_ERROR(EFBIG);
1010 return error;
1011 } 1038 }
1039 xfs_buf_relse(bp);
1012 1040
1013 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1041 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1014 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1042 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1015 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1043 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1016 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1044 cmn_err(CE_WARN, "XFS: log size mismatch detected");
1017 return XFS_ERROR(EFBIG); 1045 return XFS_ERROR(EFBIG);
1018 } 1046 }
1019 error = xfs_read_buf(mp, mp->m_logdev_targp, 1047 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1020 d - XFS_FSB_TO_BB(mp, 1), 1048 d - XFS_FSB_TO_BB(mp, 1),
1021 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1049 XFS_FSB_TO_B(mp, 1), 0);
1022 if (!error) { 1050 if (!bp) {
1023 xfs_buf_relse(bp); 1051 cmn_err(CE_WARN, "XFS: log device read failed");
1024 } else { 1052 return EIO;
1025 cmn_err(CE_WARN, "XFS: size check 3 failed");
1026 if (error == ENOSPC)
1027 error = XFS_ERROR(EFBIG);
1028 return error;
1029 } 1053 }
1054 xfs_buf_relse(bp);
1030 } 1055 }
1031 return 0; 1056 return 0;
1032} 1057}
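
The uncached reads above boil down to proving the device is as large as the superblock claims by touching its last sector. A userspace analogue against an ordinary file (illustrative only, not XFS code):

/* Userspace sketch of the size check above (not XFS code): verify a
 * device or file really has the claimed size by reading its very last
 * 512-byte sector. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int check_size(const char *path, off_t claimed_size)
{
	char sector[512];
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	if (pread(fd, sector, sizeof(sector),
		  claimed_size - (off_t)sizeof(sector)) != sizeof(sector)) {
		fprintf(stderr, "last sector read failed\n");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(int argc, char **argv)
{
	if (argc == 3)
		return check_size(argv[1], atoll(argv[2])) ? 1 : 0;
	return 0;
}
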
@@ -1189,6 +1214,9 @@ xfs_mountfs(
1189 */ 1214 */
1190 xfs_set_rw_sizes(mp); 1215 xfs_set_rw_sizes(mp);
1191 1216
1217 /* set the low space thresholds for dynamic preallocation */
1218 xfs_set_low_space_thresholds(mp);
1219
1192 /* 1220 /*
1193 * Set the inode cluster size. 1221 * Set the inode cluster size.
1194 * This may still be overridden by the file system 1222 * This may still be overridden by the file system
@@ -1601,7 +1629,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1601 XFS_BUF_UNASYNC(sbp); 1629 XFS_BUF_UNASYNC(sbp);
1602 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); 1630 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1603 xfsbdstrat(mp, sbp); 1631 xfsbdstrat(mp, sbp);
1604 error = xfs_iowait(sbp); 1632 error = xfs_buf_iowait(sbp);
1605 if (error) 1633 if (error)
1606 xfs_ioerror_alert("xfs_unmountfs_writesb", 1634 xfs_ioerror_alert("xfs_unmountfs_writesb",
1607 mp, sbp, XFS_BUF_ADDR(sbp)); 1635 mp, sbp, XFS_BUF_ADDR(sbp));
@@ -1832,135 +1860,72 @@ xfs_mod_incore_sb_unlocked(
1832 */ 1860 */
1833int 1861int
1834xfs_mod_incore_sb( 1862xfs_mod_incore_sb(
1835 xfs_mount_t *mp, 1863 struct xfs_mount *mp,
1836 xfs_sb_field_t field, 1864 xfs_sb_field_t field,
1837 int64_t delta, 1865 int64_t delta,
1838 int rsvd) 1866 int rsvd)
1839{ 1867{
1840 int status; 1868 int status;
1841 1869
1842 /* check for per-cpu counters */
1843 switch (field) {
1844#ifdef HAVE_PERCPU_SB 1870#ifdef HAVE_PERCPU_SB
1845 case XFS_SBS_ICOUNT: 1871 ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
1846 case XFS_SBS_IFREE:
1847 case XFS_SBS_FDBLOCKS:
1848 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1849 status = xfs_icsb_modify_counters(mp, field,
1850 delta, rsvd);
1851 break;
1852 }
1853 /* FALLTHROUGH */
1854#endif 1872#endif
1855 default: 1873 spin_lock(&mp->m_sb_lock);
1856 spin_lock(&mp->m_sb_lock); 1874 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1857 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1875 spin_unlock(&mp->m_sb_lock);
1858 spin_unlock(&mp->m_sb_lock);
1859 break;
1860 }
1861 1876
1862 return status; 1877 return status;
1863} 1878}
1864 1879
1865/* 1880/*
1866 * xfs_mod_incore_sb_batch() is used to change more than one field 1881 * Change more than one field in the in-core superblock structure at a time.
1867 * in the in-core superblock structure at a time. This modification 1882 *
1868 * is protected by a lock internal to this module. The fields and 1883 * The fields and changes to those fields are specified in the array of
1869 * changes to those fields are specified in the array of xfs_mod_sb 1884 * xfs_mod_sb structures passed in. Either all of the specified deltas
1870 * structures passed in. 1885 * will be applied or none of them will. If any modified field dips below 0,
1886 * then all modifications will be backed out and EINVAL will be returned.
1871 * 1887 *
1872 * Either all of the specified deltas will be applied or none of 1888 * Note that this function may not be used for the superblock values that
1873 * them will. If any modified field dips below 0, then all modifications 1889 * are tracked with the in-memory per-cpu counters - a direct call to
1874 * will be backed out and EINVAL will be returned. 1890 * xfs_icsb_modify_counters is required for these.
1875 */ 1891 */
1876int 1892int
1877xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) 1893xfs_mod_incore_sb_batch(
1894 struct xfs_mount *mp,
1895 xfs_mod_sb_t *msb,
1896 uint nmsb,
1897 int rsvd)
1878{ 1898{
1879 int status=0; 1899 xfs_mod_sb_t *msbp = &msb[0];
1880 xfs_mod_sb_t *msbp; 1900 int error = 0;
1881 1901
1882 /* 1902 /*
1883 * Loop through the array of mod structures and apply each 1903 * Loop through the array of mod structures and apply each individually.
1884 * individually. If any fail, then back out all those 1904 * If any fail, then back out all those which have already been applied.
1885 * which have already been applied. Do all of this within 1905 * Do all of this within the scope of the m_sb_lock so that all of the
1886 * the scope of the m_sb_lock so that all of the changes will 1906 * changes will be atomic.
1887 * be atomic.
1888 */ 1907 */
1889 spin_lock(&mp->m_sb_lock); 1908 spin_lock(&mp->m_sb_lock);
1890 msbp = &msb[0];
1891 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { 1909 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
1892 /* 1910 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1893 * Apply the delta at index n. If it fails, break 1911 msbp->msb_field > XFS_SBS_FDBLOCKS);
1894 * from the loop so we'll fall into the undo loop
1895 * below.
1896 */
1897 switch (msbp->msb_field) {
1898#ifdef HAVE_PERCPU_SB
1899 case XFS_SBS_ICOUNT:
1900 case XFS_SBS_IFREE:
1901 case XFS_SBS_FDBLOCKS:
1902 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1903 spin_unlock(&mp->m_sb_lock);
1904 status = xfs_icsb_modify_counters(mp,
1905 msbp->msb_field,
1906 msbp->msb_delta, rsvd);
1907 spin_lock(&mp->m_sb_lock);
1908 break;
1909 }
1910 /* FALLTHROUGH */
1911#endif
1912 default:
1913 status = xfs_mod_incore_sb_unlocked(mp,
1914 msbp->msb_field,
1915 msbp->msb_delta, rsvd);
1916 break;
1917 }
1918 1912
1919 if (status != 0) { 1913 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1920 break; 1914 msbp->msb_delta, rsvd);
1921 } 1915 if (error)
1916 goto unwind;
1922 } 1917 }
1918 spin_unlock(&mp->m_sb_lock);
1919 return 0;
1923 1920
1924 /* 1921unwind:
1925 * If we didn't complete the loop above, then back out 1922 while (--msbp >= msb) {
1926 * any changes made to the superblock. If you add code 1923 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1927 * between the loop above and here, make sure that you 1924 -msbp->msb_delta, rsvd);
1928 * preserve the value of status. Loop back until 1925 ASSERT(error == 0);
1929 * we step below the beginning of the array. Make sure
1930 * we don't touch anything back there.
1931 */
1932 if (status != 0) {
1933 msbp--;
1934 while (msbp >= msb) {
1935 switch (msbp->msb_field) {
1936#ifdef HAVE_PERCPU_SB
1937 case XFS_SBS_ICOUNT:
1938 case XFS_SBS_IFREE:
1939 case XFS_SBS_FDBLOCKS:
1940 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1941 spin_unlock(&mp->m_sb_lock);
1942 status = xfs_icsb_modify_counters(mp,
1943 msbp->msb_field,
1944 -(msbp->msb_delta),
1945 rsvd);
1946 spin_lock(&mp->m_sb_lock);
1947 break;
1948 }
1949 /* FALLTHROUGH */
1950#endif
1951 default:
1952 status = xfs_mod_incore_sb_unlocked(mp,
1953 msbp->msb_field,
1954 -(msbp->msb_delta),
1955 rsvd);
1956 break;
1957 }
1958 ASSERT(status == 0);
1959 msbp--;
1960 }
1961 } 1926 }
1962 spin_unlock(&mp->m_sb_lock); 1927 spin_unlock(&mp->m_sb_lock);
1963 return status; 1928 return error;
1964} 1929}
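The rewritten batch function reduces to a reusable apply-or-unwind idiom: walk the array forward applying deltas, and on the first failure walk the same pointer backwards applying the negated deltas. A generic sketch of the idiom (all types and names hypothetical):

#include <errno.h>

struct mod {
	long *counter;
	long delta;
};

/* Apply a delta, refusing to let the counter go negative. */
static int apply(struct mod *m, long delta)
{
	if (*m->counter + delta < 0)
		return -EINVAL;
	*m->counter += delta;
	return 0;
}

/* Apply all deltas or none: on failure, back out in reverse order. */
static int apply_batch(struct mod *mods, unsigned n)
{
	struct mod *m;
	int error = 0;

	for (m = mods; m < mods + n; m++) {
		error = apply(m, m->delta);
		if (error)
			goto unwind;
	}
	return 0;

unwind:
	while (--m >= mods)
		apply(m, -m->delta);
	return error;
}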
1965 1930
1966/* 1931/*
@@ -1998,18 +1963,13 @@ xfs_getsb(
1998 */ 1963 */
1999void 1964void
2000xfs_freesb( 1965xfs_freesb(
2001 xfs_mount_t *mp) 1966 struct xfs_mount *mp)
2002{ 1967{
2003 xfs_buf_t *bp; 1968 struct xfs_buf *bp = mp->m_sb_bp;
2004 1969
2005 /* 1970 xfs_buf_lock(bp);
2006 * Use xfs_getsb() so that the buffer will be locked
2007 * when we call xfs_buf_relse().
2008 */
2009 bp = xfs_getsb(mp, 0);
2010 XFS_BUF_UNMANAGE(bp);
2011 xfs_buf_relse(bp);
2012 mp->m_sb_bp = NULL; 1971 mp->m_sb_bp = NULL;
1972 xfs_buf_relse(bp);
2013} 1973}
2014 1974
2015/* 1975/*
@@ -2496,7 +2456,7 @@ xfs_icsb_balance_counter(
2496 spin_unlock(&mp->m_sb_lock); 2456 spin_unlock(&mp->m_sb_lock);
2497} 2457}
2498 2458
2499STATIC int 2459int
2500xfs_icsb_modify_counters( 2460xfs_icsb_modify_counters(
2501 xfs_mount_t *mp, 2461 xfs_mount_t *mp,
2502 xfs_sb_field_t field, 2462 xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 622da2179a57..a62e8971539d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations {
53 53
54#include "xfs_sync.h" 54#include "xfs_sync.h"
55 55
56struct cred;
57struct log; 56struct log;
58struct xfs_mount_args; 57struct xfs_mount_args;
59struct xfs_inode; 58struct xfs_inode;
@@ -91,6 +90,8 @@ extern void xfs_icsb_reinit_counters(struct xfs_mount *);
91extern void xfs_icsb_destroy_counters(struct xfs_mount *); 90extern void xfs_icsb_destroy_counters(struct xfs_mount *);
92extern void xfs_icsb_sync_counters(struct xfs_mount *, int); 91extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
93extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); 92extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
93extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
94 int64_t, int);
94 95
95#else 96#else
96#define xfs_icsb_init_counters(mp) (0) 97#define xfs_icsb_init_counters(mp) (0)
@@ -98,8 +99,20 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
98#define xfs_icsb_reinit_counters(mp) do { } while (0) 99#define xfs_icsb_reinit_counters(mp) do { } while (0)
99#define xfs_icsb_sync_counters(mp, flags) do { } while (0) 100#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
100#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 101#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
102#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
103 xfs_mod_incore_sb(mp, field, delta, rsvd)
101#endif 104#endif
102 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
103typedef struct xfs_mount { 116typedef struct xfs_mount {
104 struct super_block *m_super; 117 struct super_block *m_super;
105 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -199,6 +212,8 @@ typedef struct xfs_mount {
199 __int64_t m_update_flags; /* sb flags we need to update 212 __int64_t m_update_flags; /* sb flags we need to update
200 on the next remount,rw */ 213 on the next remount,rw */
201 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
215 int64_t m_low_space[XFS_LOWSP_MAX];
216 /* low free space thresholds */
202} xfs_mount_t; 217} xfs_mount_t;
203 218
204/* 219/*
@@ -232,8 +247,6 @@ typedef struct xfs_mount {
232#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ 247#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
233#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred 248#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
234 * I/O size in stat() */ 249 * I/O size in stat() */
235#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock
236 counters */
237#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams 250#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
238 allocator */ 251 allocator */
239#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ 252#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
@@ -327,6 +340,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
327 * perag get/put wrappers for ref counting 340 * perag get/put wrappers for ref counting
328 */ 341 */
329struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); 342struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
343struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
344 int tag);
330void xfs_perag_put(struct xfs_perag *pag); 345void xfs_perag_put(struct xfs_perag *pag);
331 346
332/* 347/*
@@ -376,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
376 391
377extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 392extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
378 393
394extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395
379#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
380 397
381extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2b..edfa178bafb6 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
408 spin_lock(&mru->lock); 408 spin_lock(&mru->lock);
409 if (mru->queued) { 409 if (mru->queued) {
410 spin_unlock(&mru->lock); 410 spin_unlock(&mru->lock);
411 cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); 411 cancel_delayed_work_sync(&mru->work);
412 spin_lock(&mru->lock); 412 spin_lock(&mru->lock);
413 } 413 }
414 414
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd6..9bb6eda4cd21 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) 346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
347#define xfs_trans_apply_dquot_deltas(tp) 347#define xfs_trans_apply_dquot_deltas(tp)
348#define xfs_trans_unreserve_and_mod_dquots(tp) 348#define xfs_trans_unreserve_and_mod_dquots(tp)
349#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) 349static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
350#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) 350 struct xfs_inode *ip, long nblks, long ninos, uint flags)
351{
352 return 0;
353}
354static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
355 struct xfs_mount *mp, struct xfs_dquot *udqp,
 356 struct xfs_dquot *gdqp, long nblks, long ninos, uint flags)
357{
358 return 0;
359}
351#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 360#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
352#define xfs_qm_vop_rename_dqattach(it) (0) 361#define xfs_qm_vop_rename_dqattach(it) (0)
353#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 362#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
357#define xfs_qm_dqdetach(ip) 366#define xfs_qm_dqdetach(ip)
358#define xfs_qm_dqrele(d) 367#define xfs_qm_dqrele(d)
359#define xfs_qm_statvfs(ip, s) 368#define xfs_qm_statvfs(ip, s)
360#define xfs_qm_sync(mp, fl) (0) 369static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
370{
371 return 0;
372}
361#define xfs_qm_newmount(mp, a, b) (0) 373#define xfs_qm_newmount(mp, a, b) (0)
362#define xfs_qm_mount_quotas(mp) 374#define xfs_qm_mount_quotas(mp)
363#define xfs_qm_unmount(mp) 375#define xfs_qm_unmount(mp)
364#define xfs_qm_unmount_quotas(mp) (0) 376#define xfs_qm_unmount_quotas(mp)
365#endif /* CONFIG_XFS_QUOTA */ 377#endif /* CONFIG_XFS_QUOTA */
366 378
367#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 379#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
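Converting the CONFIG_XFS_QUOTA=n stubs from "(0)" macros to static inlines matters because a macro discards its arguments unevaluated and unchecked, while an inline keeps type checking and marks the arguments as used. A toy demonstration of the difference (hypothetical names):

/* Macro stub: arguments vanish before the compiler sees them. */
#define quota_reserve_macro(tp, nblks)	(0)

/* Inline stub: arguments are still parsed, type-checked and "used". */
static inline int quota_reserve_inline(void *tp, long nblks)
{
	return 0;
}

int example(void *tp)
{
	long nblks = 16;

	/* The misspelled identifier below still compiles - the macro never
	 * evaluates its arguments. The inline call would reject it. */
	return quota_reserve_macro(tp, nblks_typo) +
	       quota_reserve_inline(tp, nblks);
}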
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
deleted file mode 100644
index 2dec79edb510..000000000000
--- a/fs/xfs/xfs_refcache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_REFCACHE_H__
19#define __XFS_REFCACHE_H__
20
21#ifdef HAVE_REFCACHE
22/*
23 * Maximum size (in inodes) for the NFS reference cache
24 */
25#define XFS_REFCACHE_SIZE_MAX 512
26
27struct xfs_inode;
28struct xfs_mount;
29
30extern void xfs_refcache_insert(struct xfs_inode *);
31extern void xfs_refcache_purge_ip(struct xfs_inode *);
32extern void xfs_refcache_purge_mp(struct xfs_mount *);
33extern void xfs_refcache_purge_some(struct xfs_mount *);
34extern void xfs_refcache_resize(int);
35extern void xfs_refcache_destroy(void);
36
37extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
38
39#else
40
41#define xfs_refcache_insert(ip) do { } while (0)
42#define xfs_refcache_purge_ip(ip) do { } while (0)
43#define xfs_refcache_purge_mp(mp) do { } while (0)
44#define xfs_refcache_purge_some(mp) do { } while (0)
45#define xfs_refcache_resize(size) do { } while (0)
46#define xfs_refcache_destroy() do { } while (0)
47
48#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags)
49
50#endif
51
52#endif /* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 8fca957200df..77a59891734e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -183,7 +183,7 @@ xfs_rename(
183 * tree quota mechanism would be circumvented. 183 * tree quota mechanism would be circumvented.
184 */ 184 */
185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
186 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { 186 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
187 error = XFS_ERROR(EXDEV); 187 error = XFS_ERROR(EXDEV);
188 goto error_return; 188 goto error_return;
189 } 189 }
@@ -211,7 +211,9 @@ xfs_rename(
211 goto error_return; 211 goto error_return;
212 if (error) 212 if (error)
213 goto abort_return; 213 goto abort_return;
214 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 214
215 xfs_trans_ichgtime(tp, target_dp,
216 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
215 217
216 if (new_parent && src_is_directory) { 218 if (new_parent && src_is_directory) {
217 error = xfs_bumplink(tp, target_dp); 219 error = xfs_bumplink(tp, target_dp);
@@ -249,7 +251,9 @@ xfs_rename(
249 &first_block, &free_list, spaceres); 251 &first_block, &free_list, spaceres);
250 if (error) 252 if (error)
251 goto abort_return; 253 goto abort_return;
252 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 254
255 xfs_trans_ichgtime(tp, target_dp,
256 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
253 257
254 /* 258 /*
255 * Decrement the link count on the target since the target 259 * Decrement the link count on the target since the target
@@ -292,7 +296,8 @@ xfs_rename(
292 * inode isn't really being changed, but old unix file systems did 296 * inode isn't really being changed, but old unix file systems did
293 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
294 */ 298 */
295 xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
300 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
296 301
297 /* 302 /*
298 * Adjust the link count on src_dp. This is necessary when 303 * Adjust the link count on src_dp. This is necessary when
@@ -315,7 +320,7 @@ xfs_rename(
315 if (error) 320 if (error)
316 goto abort_return; 321 goto abort_return;
317 322
318 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 323 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
319 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 324 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
320 if (new_parent) 325 if (new_parent)
321 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 326 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 891260fea11e..12a191385310 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -39,6 +39,7 @@
39#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_buf.h"
42 43
43 44
44/* 45/*
@@ -1883,13 +1884,13 @@ xfs_growfs_rt(
1883 /* 1884 /*
1884 * Read in the last block of the device, make sure it exists. 1885 * Read in the last block of the device, make sure it exists.
1885 */ 1886 */
1886 error = xfs_read_buf(mp, mp->m_rtdev_targp, 1887 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
1887 XFS_FSB_TO_BB(mp, nrblocks - 1), 1888 XFS_FSB_TO_BB(mp, nrblocks - 1),
1888 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1889 XFS_FSB_TO_B(mp, 1), 0);
1889 if (error) 1890 if (!bp)
1890 return error; 1891 return EIO;
1891 ASSERT(bp);
1892 xfs_buf_relse(bp); 1892 xfs_buf_relse(bp);
1893
1893 /* 1894 /*
1894 * Calculate new parameters. These are the final values to be reached. 1895 * Calculate new parameters. These are the final values to be reached.
1895 */ 1896 */
@@ -2215,7 +2216,6 @@ xfs_rtmount_init(
2215{ 2216{
2216 xfs_buf_t *bp; /* buffer for last block of subvolume */ 2217 xfs_buf_t *bp; /* buffer for last block of subvolume */
2217 xfs_daddr_t d; /* address of last block of subvolume */ 2218 xfs_daddr_t d; /* address of last block of subvolume */
2218 int error; /* error return value */
2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */ 2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */
2220 2220
2221 sbp = &mp->m_sb; 2221 sbp = &mp->m_sb;
@@ -2242,15 +2242,12 @@ xfs_rtmount_init(
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2242 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2243 return XFS_ERROR(EFBIG);
2244 } 2244 }
2245 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2245 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
2246 d - XFS_FSB_TO_BB(mp, 1), 2246 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_BB(mp, 1), 0, &bp); 2247 XFS_FSB_TO_B(mp, 1), 0);
2248 if (error) { 2248 if (!bp) {
2249 cmn_err(CE_WARN, 2249 cmn_err(CE_WARN, "XFS: realtime device size check failed");
2250 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2250 return EIO;
2251 if (error == ENOSPC)
2252 return XFS_ERROR(EFBIG);
2253 return error;
2254 } 2251 }
2255 xfs_buf_relse(bp); 2252 xfs_buf_relse(bp);
2256 return 0; 2253 return 0;
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1b017c657494..1eb2ba586814 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -80,10 +80,12 @@ struct xfs_mount;
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
83 84
84#define XFS_SB_VERSION2_OKREALFBITS \ 85#define XFS_SB_VERSION2_OKREALFBITS \
85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
86 XFS_SB_VERSION2_ATTR2BIT) 87 XFS_SB_VERSION2_ATTR2BIT | \
88 XFS_SB_VERSION2_PROJID32BIT)
87#define XFS_SB_VERSION2_OKSASHFBITS \ 89#define XFS_SB_VERSION2_OKSASHFBITS \
88 (0) 90 (0)
89#define XFS_SB_VERSION2_OKREALBITS \ 91#define XFS_SB_VERSION2_OKREALBITS \
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
495 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; 497 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
496} 498}
497 499
500static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
501{
502 return xfs_sb_version_hasmorebits(sbp) &&
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504}
505
498/* 506/*
499 * end of superblock version macros 507 * end of superblock version macros
500 */ 508 */
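The new PROJID32BIT bit follows the usual features2 pattern: define a flag, add it to the OKREALFBITS mask so mount-time validation accepts it, and expose a has-predicate that also requires the MOREBITS version bit. A reduced sketch of that pattern (all names hypothetical, not the real XFS layout):

#include <stdbool.h>
#include <stdint.h>

#define VERSION_MOREBITS	(1u << 15)	/* features2 field is valid */
#define FEAT2_LAZYSBCOUNT	0x00000002u
#define FEAT2_PROJID32BIT	0x00000080u
#define FEAT2_KNOWN		(FEAT2_LAZYSBCOUNT | FEAT2_PROJID32BIT)

struct sb {
	uint16_t versionnum;
	uint32_t features2;
};

/* Reject superblocks advertising feature bits this code doesn't know. */
bool sb_features_ok(const struct sb *sb)
{
	if (!(sb->versionnum & VERSION_MOREBITS))
		return true;			/* no features2 to validate */
	return (sb->features2 & ~FEAT2_KNOWN) == 0;
}

bool sb_has_projid32bit(const struct sb *sb)
{
	return (sb->versionnum & VERSION_MOREBITS) &&
	       (sb->features2 & FEAT2_PROJID32BIT);
}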
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1c47edaea0d2..76922793f64f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -696,7 +696,7 @@ xfs_trans_reserve(
696 * fail if the count would go below zero. 696 * fail if the count would go below zero.
697 */ 697 */
698 if (blocks > 0) { 698 if (blocks > 0) {
699 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 699 error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
700 -((int64_t)blocks), rsvd); 700 -((int64_t)blocks), rsvd);
701 if (error != 0) { 701 if (error != 0) {
702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -767,7 +767,7 @@ undo_log:
767 767
768undo_blocks: 768undo_blocks:
769 if (blocks > 0) { 769 if (blocks > 0) {
770 (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 770 xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
771 (int64_t)blocks, rsvd); 771 (int64_t)blocks, rsvd);
772 tp->t_blk_res = 0; 772 tp->t_blk_res = 0;
773 } 773 }
@@ -1009,7 +1009,7 @@ void
1009xfs_trans_unreserve_and_mod_sb( 1009xfs_trans_unreserve_and_mod_sb(
1010 xfs_trans_t *tp) 1010 xfs_trans_t *tp)
1011{ 1011{
1012 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ 1012 xfs_mod_sb_t msb[9]; /* If you add cases, add entries */
1013 xfs_mod_sb_t *msbp; 1013 xfs_mod_sb_t *msbp;
1014 xfs_mount_t *mp = tp->t_mountp; 1014 xfs_mount_t *mp = tp->t_mountp;
1015 /* REFERENCED */ 1015 /* REFERENCED */
@@ -1017,55 +1017,61 @@ xfs_trans_unreserve_and_mod_sb(
1017 int rsvd; 1017 int rsvd;
1018 int64_t blkdelta = 0; 1018 int64_t blkdelta = 0;
1019 int64_t rtxdelta = 0; 1019 int64_t rtxdelta = 0;
1020 int64_t idelta = 0;
1021 int64_t ifreedelta = 0;
1020 1022
1021 msbp = msb; 1023 msbp = msb;
1022 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 1024 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
1023 1025
1024 /* calculate free blocks delta */ 1026 /* calculate deltas */
1025 if (tp->t_blk_res > 0) 1027 if (tp->t_blk_res > 0)
1026 blkdelta = tp->t_blk_res; 1028 blkdelta = tp->t_blk_res;
1027
1028 if ((tp->t_fdblocks_delta != 0) && 1029 if ((tp->t_fdblocks_delta != 0) &&
1029 (xfs_sb_version_haslazysbcount(&mp->m_sb) || 1030 (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1030 (tp->t_flags & XFS_TRANS_SB_DIRTY))) 1031 (tp->t_flags & XFS_TRANS_SB_DIRTY)))
1031 blkdelta += tp->t_fdblocks_delta; 1032 blkdelta += tp->t_fdblocks_delta;
1032 1033
1033 if (blkdelta != 0) {
1034 msbp->msb_field = XFS_SBS_FDBLOCKS;
1035 msbp->msb_delta = blkdelta;
1036 msbp++;
1037 }
1038
1039 /* calculate free realtime extents delta */
1040 if (tp->t_rtx_res > 0) 1034 if (tp->t_rtx_res > 0)
1041 rtxdelta = tp->t_rtx_res; 1035 rtxdelta = tp->t_rtx_res;
1042
1043 if ((tp->t_frextents_delta != 0) && 1036 if ((tp->t_frextents_delta != 0) &&
1044 (tp->t_flags & XFS_TRANS_SB_DIRTY)) 1037 (tp->t_flags & XFS_TRANS_SB_DIRTY))
1045 rtxdelta += tp->t_frextents_delta; 1038 rtxdelta += tp->t_frextents_delta;
1046 1039
1040 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1041 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1042 idelta = tp->t_icount_delta;
1043 ifreedelta = tp->t_ifree_delta;
1044 }
1045
1046 /* apply the per-cpu counters */
1047 if (blkdelta) {
1048 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
1049 blkdelta, rsvd);
1050 if (error)
1051 goto out;
1052 }
1053
1054 if (idelta) {
1055 error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
1056 idelta, rsvd);
1057 if (error)
1058 goto out_undo_fdblocks;
1059 }
1060
1061 if (ifreedelta) {
1062 error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
1063 ifreedelta, rsvd);
1064 if (error)
1065 goto out_undo_icount;
1066 }
1067
1068 /* apply remaining deltas */
1047 if (rtxdelta != 0) { 1069 if (rtxdelta != 0) {
1048 msbp->msb_field = XFS_SBS_FREXTENTS; 1070 msbp->msb_field = XFS_SBS_FREXTENTS;
1049 msbp->msb_delta = rtxdelta; 1071 msbp->msb_delta = rtxdelta;
1050 msbp++; 1072 msbp++;
1051 } 1073 }
1052 1074
1053 /* apply remaining deltas */
1054
1055 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1056 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1057 if (tp->t_icount_delta != 0) {
1058 msbp->msb_field = XFS_SBS_ICOUNT;
1059 msbp->msb_delta = tp->t_icount_delta;
1060 msbp++;
1061 }
1062 if (tp->t_ifree_delta != 0) {
1063 msbp->msb_field = XFS_SBS_IFREE;
1064 msbp->msb_delta = tp->t_ifree_delta;
1065 msbp++;
1066 }
1067 }
1068
1069 if (tp->t_flags & XFS_TRANS_SB_DIRTY) { 1075 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
1070 if (tp->t_dblocks_delta != 0) { 1076 if (tp->t_dblocks_delta != 0) {
1071 msbp->msb_field = XFS_SBS_DBLOCKS; 1077 msbp->msb_field = XFS_SBS_DBLOCKS;
@@ -1115,8 +1121,24 @@ xfs_trans_unreserve_and_mod_sb(
1115 if (msbp > msb) { 1121 if (msbp > msb) {
1116 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, 1122 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
1117 (uint)(msbp - msb), rsvd); 1123 (uint)(msbp - msb), rsvd);
1118 ASSERT(error == 0); 1124 if (error)
1125 goto out_undo_ifreecount;
1119 } 1126 }
1127
1128 return;
1129
1130out_undo_ifreecount:
1131 if (ifreedelta)
1132 xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
1133out_undo_icount:
1134 if (idelta)
1135 xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
1136out_undo_fdblocks:
1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out:
1140 ASSERT(error == 0);
1141 return;
1120} 1142}
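The reworked function applies the per-cpu counter deltas one at a time and rolls back in reverse through a chain of goto labels, the standard kernel error-unwind pattern. A generic sketch (hypothetical counters):

#include <errno.h>

static long a_count, b_count, c_count;

static int mod(long *ctr, long delta)
{
	if (*ctr + delta < 0)
		return -ENOSPC;
	*ctr += delta;
	return 0;
}

/* Apply three deltas; on failure undo, in reverse, only what succeeded. */
static int apply_three(long da, long db, long dc)
{
	int error;

	error = mod(&a_count, da);
	if (error)
		goto out;
	error = mod(&b_count, db);
	if (error)
		goto out_undo_a;
	error = mod(&c_count, dc);
	if (error)
		goto out_undo_b;
	return 0;

out_undo_b:
	mod(&b_count, -db);
out_undo_a:
	mod(&a_count, -da);
out:
	return error;
}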
1121 1143
1122/* 1144/*
@@ -1328,7 +1350,7 @@ xfs_trans_fill_vecs(
1328 * they could be immediately flushed and we'd have to race with the flusher 1350 * they could be immediately flushed and we'd have to race with the flusher
1329 * trying to pull the item from the AIL as we add it. 1351 * trying to pull the item from the AIL as we add it.
1330 */ 1352 */
1331void 1353static void
1332xfs_trans_item_committed( 1354xfs_trans_item_committed(
1333 struct xfs_log_item *lip, 1355 struct xfs_log_item *lip,
1334 xfs_lsn_t commit_lsn, 1356 xfs_lsn_t commit_lsn,
@@ -1389,15 +1411,12 @@ xfs_trans_item_committed(
1389 */ 1411 */
1390STATIC void 1412STATIC void
1391xfs_trans_committed( 1413xfs_trans_committed(
1392 struct xfs_trans *tp, 1414 void *arg,
1393 int abortflag) 1415 int abortflag)
1394{ 1416{
1417 struct xfs_trans *tp = arg;
1395 struct xfs_log_item_desc *lidp, *next; 1418 struct xfs_log_item_desc *lidp, *next;
1396 1419
1397 /* Call the transaction's completion callback if there is one. */
1398 if (tp->t_callback != NULL)
1399 tp->t_callback(tp, tp->t_callarg);
1400
1401 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { 1420 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1402 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); 1421 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
1403 xfs_trans_free_item_desc(lidp); 1422 xfs_trans_free_item_desc(lidp);
@@ -1406,21 +1425,120 @@ xfs_trans_committed(
1406 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
1407} 1426}
1408 1427
1428static inline void
1429xfs_log_item_batch_insert(
1430 struct xfs_ail *ailp,
1431 struct xfs_log_item **log_items,
1432 int nr_items,
1433 xfs_lsn_t commit_lsn)
1434{
1435 int i;
1436
1437 spin_lock(&ailp->xa_lock);
1438 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1439 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1440
1441 for (i = 0; i < nr_items; i++)
1442 IOP_UNPIN(log_items[i], 0);
1443}
1444
1445/*
1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic.
1449 *
1450 * If we are called with the aborted flag set, it is because a log write during
1451 * a CIL checkpoint commit has failed. In this case, all the items in the
 1452 * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which
1453 * means that checkpoint commit abort handling is treated exactly the same
1454 * as an iclog write error even though we haven't started any IO yet. Hence in
1455 * this case all we need to do is IOP_COMMITTED processing, followed by an
1456 * IOP_UNPIN(aborted) call.
1457 */
1458void
1459xfs_trans_committed_bulk(
1460 struct xfs_ail *ailp,
1461 struct xfs_log_vec *log_vector,
1462 xfs_lsn_t commit_lsn,
1463 int aborted)
1464{
1465#define LOG_ITEM_BATCH_SIZE 32
1466 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1467 struct xfs_log_vec *lv;
1468 int i = 0;
1469
1470 /* unpin all the log items */
 1471 for (lv = log_vector; lv; lv = lv->lv_next) {
1472 struct xfs_log_item *lip = lv->lv_item;
1473 xfs_lsn_t item_lsn;
1474
1475 if (aborted)
1476 lip->li_flags |= XFS_LI_ABORTED;
1477 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1478
1479 /* item_lsn of -1 means the item was freed */
1480 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1481 continue;
1482
1483 /*
1484 * if we are aborting the operation, no point in inserting the
1485 * object into the AIL as we are in a shutdown situation.
1486 */
1487 if (aborted) {
1488 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1489 IOP_UNPIN(lip, 1);
1490 continue;
1491 }
1492
1493 if (item_lsn != commit_lsn) {
1494
1495 /*
 1496 * Not a bulk update candidate due to an unusual item_lsn.
1497 * Push into AIL immediately, rechecking the lsn once
1498 * we have the ail lock. Then unpin the item.
1499 */
1500 spin_lock(&ailp->xa_lock);
1501 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1502 xfs_trans_ail_update(ailp, lip, item_lsn);
1503 else
1504 spin_unlock(&ailp->xa_lock);
1505 IOP_UNPIN(lip, 0);
1506 continue;
1507 }
1508
1509 /* Item is a candidate for bulk AIL insert. */
1510 log_items[i++] = lv->lv_item;
1511 if (i >= LOG_ITEM_BATCH_SIZE) {
1512 xfs_log_item_batch_insert(ailp, log_items,
1513 LOG_ITEM_BATCH_SIZE, commit_lsn);
1514 i = 0;
1515 }
1516 }
1517
1518 /* make sure we insert the remainder! */
1519 if (i)
1520 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1521}
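The loop above shows a general batching shape: accumulate item pointers into a fixed-size array, flush a full batch under one lock round trip, and remember to flush the remainder at the end. A reduced sketch of that skeleton (hypothetical item type):

#define BATCH_SIZE 32	/* mirrors LOG_ITEM_BATCH_SIZE above */

struct item { int id; };

/* Stand-in for xfs_log_item_batch_insert: one lock round trip per call. */
static void process_batch(struct item **batch, int n)
{
	/* lock(); handle batch[0..n-1]; unlock(); */
	(void)batch;
	(void)n;
}

/* Accumulate pointers, flush full batches, then flush the remainder. */
static void process_all(struct item **items, int count)
{
	struct item *batch[BATCH_SIZE];
	int i, n = 0;

	for (i = 0; i < count; i++) {
		batch[n++] = items[i];
		if (n >= BATCH_SIZE) {
			process_batch(batch, BATCH_SIZE);
			n = 0;
		}
	}
	if (n)	/* make sure we flush the remainder! */
		process_batch(batch, n);
}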
1522
1409/* 1523/*
1410 * Called from the trans_commit code when we notice that 1524 * Called from the trans_commit code when we notice that the filesystem is in
1411 * the filesystem is in the middle of a forced shutdown. 1525 * the middle of a forced shutdown.
1526 *
1527 * When we are called here, we have already pinned all the items in the
1528 * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
1529 * so we can simply walk the items in the transaction, unpin them with an abort
1530 * flag and then free the items. Note that unpinning the items can result in
1531 * them being freed immediately, so we need to use a safe list traversal method
1532 * here.
1412 */ 1533 */
1413STATIC void 1534STATIC void
1414xfs_trans_uncommit( 1535xfs_trans_uncommit(
1415 struct xfs_trans *tp, 1536 struct xfs_trans *tp,
1416 uint flags) 1537 uint flags)
1417{ 1538{
1418 struct xfs_log_item_desc *lidp; 1539 struct xfs_log_item_desc *lidp, *n;
1419 1540
1420 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 1541 list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
1421 /*
1422 * Unpin all but those that aren't dirty.
1423 */
1424 if (lidp->lid_flags & XFS_LID_DIRTY) 1542 if (lidp->lid_flags & XFS_LID_DIRTY)
1425 IOP_UNPIN(lidp->lid_item, 1); 1543 IOP_UNPIN(lidp->lid_item, 1);
1426 } 1544 }
@@ -1525,7 +1643,7 @@ xfs_trans_commit_iclog(
1525 * running in simulation mode (the log is explicitly turned 1643 * running in simulation mode (the log is explicitly turned
1526 * off). 1644 * off).
1527 */ 1645 */
1528 tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; 1646 tp->t_logcb.cb_func = xfs_trans_committed;
1529 tp->t_logcb.cb_arg = tp; 1647 tp->t_logcb.cb_arg = tp;
1530 1648
1531 /* 1649 /*
@@ -1637,7 +1755,6 @@ xfs_trans_commit_cil(
1637 int flags) 1755 int flags)
1638{ 1756{
1639 struct xfs_log_vec *log_vector; 1757 struct xfs_log_vec *log_vector;
1640 int error;
1641 1758
1642 /* 1759 /*
1643 * Get each log item to allocate a vector structure for 1760 * Get each log item to allocate a vector structure for
@@ -1648,9 +1765,7 @@ xfs_trans_commit_cil(
1648 if (!log_vector) 1765 if (!log_vector)
1649 return ENOMEM; 1766 return ENOMEM;
1650 1767
1651 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); 1768 xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1652 if (error)
1653 return error;
1654 1769
1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1770 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1656 xfs_trans_free(tp); 1771 xfs_trans_free(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c13c0f97b494..c2042b736b81 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
@@ -399,8 +399,6 @@ typedef struct xfs_trans {
399 * transaction. */ 399 * transaction. */
400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */ 400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ 401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
402 xfs_trans_callback_t t_callback; /* transaction callback */
403 void *t_callarg; /* callback arg */
404 unsigned int t_flags; /* misc flags */ 402 unsigned int t_flags; /* misc flags */
405 int64_t t_icount_delta; /* superblock icount change */ 403 int64_t t_icount_delta; /* superblock icount change */
406 int64_t t_ifree_delta; /* superblock ifree change */ 404 int64_t t_ifree_delta; /* superblock ifree change */
@@ -473,6 +471,7 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
473void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
474int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, 472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
475 xfs_ino_t , uint, uint, struct xfs_inode **); 473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
476void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
477void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
478void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 477void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff7..c5bbbc45db91 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); 32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); 33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); 34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 35
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 449 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 450} /* xfs_trans_unlocked_item */
451 451
452
453/* 452/*
 454 * Update the position of the item in the AIL with the new 453 * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 454 *
 456 * it to its new position by removing it and re-adding it. 455 * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
456 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
457 * be added. Otherwise, it will be repositioned by removing it and re-adding
458 * it to the AIL. If we move the first item in the AIL, update the log tail to
459 * match the new minimum LSN in the AIL.
457 * 460 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 461 * This function takes the AIL lock once to execute the update operations on
 459 * we move in the AIL is the minimum one, update the tail lsn in the 462 * all the items in the array, instead of taking the AIL lock once per item
 460 * log manager. 463 * updated. As a result, once we have the AIL lock, we need to check each log
464 * item LSN to confirm it needs to be moved forward in the AIL.
461 * 465 *
462 * This function must be called with the AIL lock held. The lock 466 * To optimise the insert operation, we delete all the items from the AIL in
463 * is dropped before returning. 467 * the first pass, moving them into a temporary list, then splice the temporary
468 * list into the correct position in the AIL. This avoids needing to do an
469 * insert operation on every item.
470 *
471 * This function must be called with the AIL lock held. The lock is dropped
472 * before returning.
464 */ 473 */
465void 474void
466xfs_trans_ail_update( 475xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 476 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 477 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 478 int nr_items,
479 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 480{
471 xfs_log_item_t *dlip = NULL; 481 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 482 xfs_lsn_t tail_lsn;
483 int mlip_changed = 0;
484 int i;
485 LIST_HEAD(tmp);
474 486
475 mlip = xfs_ail_min(ailp); 487 mlip = xfs_ail_min(ailp);
476 488
477 if (lip->li_flags & XFS_LI_IN_AIL) { 489 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 490 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 491 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 492 /* check if we really need to move the item */
481 } else { 493 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 494 continue;
495
496 xfs_ail_delete(ailp, lip);
497 if (mlip == lip)
498 mlip_changed = 1;
499 } else {
500 lip->li_flags |= XFS_LI_IN_AIL;
501 }
502 lip->li_lsn = lsn;
503 list_add(&lip->li_ail, &tmp);
483 } 504 }
484 505
485 lip->li_lsn = lsn; 506 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 507
488 if (mlip == dlip) { 508 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 509 spin_unlock(&ailp->xa_lock);
510 return;
502 } 511 }
503 512
504 513 /*
505} /* xfs_trans_update_ail */ 514 * It is not safe to access mlip after the AIL lock is dropped, so we
515 * must get a copy of li_lsn before we do so. This is especially
516 * important on 32-bit platforms where accessing and updating 64-bit
517 * values like li_lsn is not atomic.
518 */
519 mlip = xfs_ail_min(ailp);
520 tail_lsn = mlip->li_lsn;
521 spin_unlock(&ailp->xa_lock);
522 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
523}
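The delete-then-splice trick described in the comment above - unlink every item onto a temporary list, then insert the whole list at a single position found by one backwards scan - can be sketched with a self-contained circular doubly linked list (names illustrative; the kernel uses the list.h helpers for this):

#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_head(struct list_head *n, struct list_head *at)
{
	n->next = at->next;
	n->prev = at;
	at->next->prev = n;
	at->next = n;
}

static void list_del_entry(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

/* Splice list "from" in right after "at", then reset "from". */
static void list_splice_after(struct list_head *from, struct list_head *at)
{
	struct list_head *first = from->next, *last = from->prev;

	if (first == from)
		return;			/* nothing to splice */
	first->prev = at;
	last->next = at->next;
	at->next->prev = last;
	at->next = first;
	list_init(from);
}

struct item {
	struct list_head ail;
	long lsn;
};

/* Reposition a batch: unlink each item onto a temporary list, then find
 * the splice point once by scanning backwards, as xfs_ail_splice() does. */
static void move_items(struct list_head *ail, struct item **items, int n,
		       long lsn)
{
	struct list_head tmp, *pos;
	int i;

	list_init(&tmp);
	for (i = 0; i < n; i++) {
		list_del_entry(&items[i]->ail);
		items[i]->lsn = lsn;
		list_add_head(&items[i]->ail, &tmp);
	}

	for (pos = ail->prev; pos != ail; pos = pos->prev) {
		struct item *it = (struct item *)((char *)pos -
					offsetof(struct item, ail));
		if (it->lsn <= lsn)
			break;
	}
	list_splice_after(&tmp, pos);
}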
506 524
507/* 525/*
508 * Delete the given item from the AIL. It must already be in 526 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL.
510 * 527 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 528 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
 512 * we delete in the AIL is the minimum one, update the tail lsn in the 529 * be removed from the AIL. The caller is already holding the AIL lock, and has
 513 * log manager. 530 * done all the checks necessary to ensure the items passed in via @log_items are
531 * ready for deletion. This includes checking that the items are in the AIL.
514 * 532 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 533 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
516 * bump the AIL's generation count to indicate that the tree 534 * flag from the item and reset the item's lsn to 0. If we remove the first
517 * has changed. 535 * item in the AIL, update the log tail to match the new minimum LSN in the
536 * AIL.
518 * 537 *
519 * This function must be called with the AIL lock held. The lock 538 * This function will not drop the AIL lock until all items are removed from
520 * is dropped before returning. 539 * the AIL to minimise the amount of lock traffic on the AIL. This does not
540 * greatly increase the AIL hold time, but does significantly reduce the amount
541 * of traffic on the lock, especially during IO completion.
542 *
543 * This function must be called with the AIL lock held. The lock is dropped
544 * before returning.
521 */ 545 */
522void 546void
523xfs_trans_ail_delete( 547xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 548 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 549 struct xfs_log_item **log_items,
550 int nr_items) __releases(ailp->xa_lock)
526{ 551{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 552 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 553 xfs_lsn_t tail_lsn;
554 int mlip_changed = 0;
555 int i;
530 556
531 if (lip->li_flags & XFS_LI_IN_AIL) { 557 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536
537 558
538 lip->li_flags &= ~XFS_LI_IN_AIL; 559 for (i = 0; i < nr_items; i++) {
539 lip->li_lsn = 0; 560 struct xfs_log_item *lip = log_items[i];
561 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
562 struct xfs_mount *mp = ailp->xa_mount;
540 563
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 564 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 return;
555 } 572 }
573
574 xfs_ail_delete(ailp, lip);
575 lip->li_flags &= ~XFS_LI_IN_AIL;
576 lip->li_lsn = 0;
577 if (mlip == lip)
578 mlip_changed = 1;
556 } 579 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 580
581 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 582 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 583 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 584 }
572}
573
574 585
586 /*
587 * It is not safe to access mlip after the AIL lock is dropped, so we
588 * must get a copy of li_lsn before we do so. This is especially
589 * important on 32-bit platforms where accessing and updating 64-bit
590 * values like li_lsn is not atomic. It is possible we've emptied the
591 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
592 */
593 mlip = xfs_ail_min(ailp);
594 tail_lsn = mlip ? mlip->li_lsn : 0;
595 spin_unlock(&ailp->xa_lock);
596 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
597}
575 598
576/* 599/*
577 * The active item list (AIL) is a doubly linked list of log 600 * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
623} 646}
624 647
625/* 648/*
 626 * Insert the given log item into the AIL. 649 * Splice the log item list into the AIL at the given LSN.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */ 650 */
631STATIC void 651STATIC void
632xfs_ail_insert( 652xfs_ail_splice(
633 struct xfs_ail *ailp, 653 struct xfs_ail *ailp,
634 xfs_log_item_t *lip) 654 struct list_head *list,
635/* ARGSUSED */ 655 xfs_lsn_t lsn)
636{ 656{
637 xfs_log_item_t *next_lip; 657 xfs_log_item_t *next_lip;
638 658
@@ -640,39 +660,33 @@ xfs_ail_insert(
640 * If the list is empty, just insert the item. 660 * If the list is empty, just insert the item.
641 */ 661 */
642 if (list_empty(&ailp->xa_ail)) { 662 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail); 663 list_splice(list, &ailp->xa_ail);
644 return; 664 return;
645 } 665 }
646 666
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { 667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) 668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
649 break; 669 break;
650 } 670 }
651 671
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) || 672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); 673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656 674
657 xfs_ail_check(ailp, lip); 675 list_splice_init(list, &next_lip->li_ail);
658 return; 676 return;
659} 677}
660 678
661/* 679/*
 662 * Delete the given item from the AIL. Return a pointer to the item. 680 * Delete the given item from the AIL.
663 */ 681 */
664/*ARGSUSED*/ 682STATIC void
665STATIC xfs_log_item_t *
666xfs_ail_delete( 683xfs_ail_delete(
667 struct xfs_ail *ailp, 684 struct xfs_ail *ailp,
668 xfs_log_item_t *lip) 685 xfs_log_item_t *lip)
669/* ARGSUSED */
670{ 686{
671 xfs_ail_check(ailp, lip); 687 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail); 688 list_del(&lip->li_ail);
674 689 xfs_trans_ail_cursor_clear(ailp, lip);
675 return lip;
676} 690}
677 691
678/* 692/*
@@ -682,7 +696,6 @@ xfs_ail_delete(
682STATIC xfs_log_item_t * 696STATIC xfs_log_item_t *
683xfs_ail_min( 697xfs_ail_min(
684 struct xfs_ail *ailp) 698 struct xfs_ail *ailp)
685/* ARGSUSED */
686{ 699{
687 if (list_empty(&ailp->xa_ail)) 700 if (list_empty(&ailp->xa_ail))
688 return NULL; 701 return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
699xfs_ail_next( 712xfs_ail_next(
700 struct xfs_ail *ailp, 713 struct xfs_ail *ailp,
701 xfs_log_item_t *lip) 714 xfs_log_item_t *lip)
702/* ARGSUSED */
703{ 715{
704 if (lip->li_ail.next == &ailp->xa_ail) 716 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL; 717 return NULL;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 90af025e6839..c47918c302a5 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -336,7 +336,7 @@ xfs_trans_read_buf(
336 ASSERT(!XFS_BUF_ISASYNC(bp)); 336 ASSERT(!XFS_BUF_ISASYNC(bp));
337 XFS_BUF_READ(bp); 337 XFS_BUF_READ(bp);
338 xfsbdstrat(tp->t_mountp, bp); 338 xfsbdstrat(tp->t_mountp, bp);
339 error = xfs_iowait(bp); 339 error = xfs_buf_iowait(bp);
340 if (error) { 340 if (error) {
341 xfs_ioerror_alert("xfs_trans_read_buf", mp, 341 xfs_ioerror_alert("xfs_trans_read_buf", mp,
342 bp, blkno); 342 bp, blkno);
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa70..f7590f5badea 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
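The change above replaces a lock-protected "read index, use it, increment" sequence with atomic_inc_return() - 1. With C11 atomics the subtraction disappears, because atomic_fetch_add returns the pre-increment value. A hedged user-space sketch of the same slot-claiming idea (hypothetical names):

#include <stdatomic.h>
#include <stdio.h>

#define NEXTENTS 16

static atomic_int next_extent;
static int extents[NEXTENTS];

/* Claim the next free slot; safe against concurrent callers. */
static int claim_slot(void)
{
	/* fetch_add returns the old value, so it is already a 0-based
	 * index; the kernel's atomic_inc_return() returns the new value,
	 * hence the "- 1" in the patch above. */
	return atomic_fetch_add(&next_extent, 1);
}

int main(void)
{
	int idx = claim_slot();

	if (idx < NEXTENTS)
		extents[idx] = 42;
	printf("claimed slot %d\n", idx);
	return 0;
}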
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdc53a1050c5..ccb34532768b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -118,6 +118,36 @@ xfs_trans_ijoin_ref(
118} 118}
119 119
120/* 120/*
121 * Transactional inode timestamp update. Requires the inode to be locked and
122 * joined to the transaction supplied. Relies on the transaction subsystem to
123 * track dirty state and update/writeback the inode accordingly.
124 */
125void
126xfs_trans_ichgtime(
127 struct xfs_trans *tp,
128 struct xfs_inode *ip,
129 int flags)
130{
131 struct inode *inode = VFS_I(ip);
132 timespec_t tv;
133
134 ASSERT(tp);
135 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
136 ASSERT(ip->i_transp == tp);
137
138 tv = current_fs_time(inode->i_sb);
139
140 if ((flags & XFS_ICHGTIME_MOD) &&
141 !timespec_equal(&inode->i_mtime, &tv)) {
142 inode->i_mtime = tv;
143 }
144 if ((flags & XFS_ICHGTIME_CHG) &&
145 !timespec_equal(&inode->i_ctime, &tv)) {
146 inode->i_ctime = tv;
147 }
148}
149
150/*
121 * This is called to mark the fields indicated in fieldmask as needing 151 * This is called to mark the fields indicated in fieldmask as needing
122 * to be logged when the transaction is committed. The inode must 152 * to be logged when the transaction is committed. The inode must
123 * already be associated with the given transaction. 153 * already be associated with the given transaction.
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de5..35162c238fa3 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -73,12 +75,29 @@ struct xfs_ail {
73/* 75/*
74 * From xfs_trans_ail.c 76 * From xfs_trans_ail.c
75 */ 77 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 79 struct xfs_log_item **log_items, int nr_items,
78 __releases(ailp->xa_lock); 80 xfs_lsn_t lsn) __releases(ailp->xa_lock);
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 81static inline void
80 struct xfs_log_item *lip) 82xfs_trans_ail_update(
81 __releases(ailp->xa_lock); 83 struct xfs_ail *ailp,
84 struct xfs_log_item *lip,
85 xfs_lsn_t lsn) __releases(ailp->xa_lock)
86{
87 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
88}
89
90void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
91 struct xfs_log_item **log_items, int nr_items)
92 __releases(ailp->xa_lock);
93static inline void
94xfs_trans_ail_delete(
95 struct xfs_ail *ailp,
96 xfs_log_item_t *lip) __releases(ailp->xa_lock)
97{
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99}
100
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
83void xfs_trans_unlocked_item(struct xfs_ail *, 102void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 103 xfs_log_item_t *);
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 320775295e32..26d1867d8156 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ 73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ 74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */ 76typedef __uint32_t xlog_tid_t; /* transaction ID type */
79 77
80/* 78/*
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index b7d5769d2df0..8b32d1a4c5a1 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -56,7 +56,6 @@ xfs_dir_ialloc(
 	mode_t		mode,
 	xfs_nlink_t	nlink,
 	xfs_dev_t	rdev,
-	cred_t		*credp,
 	prid_t		prid,		/* project id */
 	int		okalloc,	/* ok to allocate new space */
 	xfs_inode_t	**ipp,		/* pointer to inode; it will be
@@ -93,7 +92,7 @@ xfs_dir_ialloc(
 	 * transaction commit so that no other process can steal
 	 * the inode(s) that we've just allocated.
 	 */
-	code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 			  &ialloc_context, &call_again, &ip);
 
 	/*
@@ -197,7 +196,7 @@ xfs_dir_ialloc(
 	 * other allocations in this allocation group,
 	 * this call should always succeed.
 	 */
-	code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
 			  okalloc, &ialloc_context, &call_again, &ip);
 
 	/*
@@ -235,7 +234,7 @@ xfs_droplink(
 {
 	int	error;
 
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT (ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink--;
@@ -299,7 +298,7 @@ xfs_bumplink(
 {
 	if (ip->i_d.di_nlink >= XFS_MAXLINK)
 		return XFS_ERROR(EMLINK);
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 
 	ASSERT(ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink++;
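Throughout this series, xfs_ichgtime(ip, ...) becomes xfs_trans_ichgtime(tp, ip, ...): the timestamp update now takes the transaction handle, so the change is logged with, and commits or aborts with, the surrounding transaction. A toy sketch of the calling convention only; the structures are stand-ins, not the kernel's.

/*
 * Sketch: threading a transaction handle through a timestamp update
 * so the change is recorded against the transaction.
 */
#include <stdio.h>
#include <time.h>

struct toy_inode { time_t ctime; };
struct toy_trans { int logged_items; };

static void trans_ichgtime(struct toy_trans *tp, struct toy_inode *ip)
{
	ip->ctime = time(NULL);
	tp->logged_items++;	/* the update now lives or dies with tp */
}

int main(void)
{
	struct toy_trans tp = { 0 };
	struct toy_inode ip = { 0 };

	trans_ichgtime(&tp, &ip);
	printf("logged items: %d\n", tp.logged_items);
	return 0;
}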
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f55b9678264f..456fca314933 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -19,8 +19,7 @@
 #define	__XFS_UTILS_H__
 
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
-				xfs_dev_t, cred_t *, prid_t, int,
-				xfs_inode_t **, int *);
+				xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
 extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
 extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
 extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4c7c7bfb2b2f..d8e6f8cd6f0c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -114,7 +114,7 @@ xfs_setattr(
 	 */
 	ASSERT(udqp == NULL);
 	ASSERT(gdqp == NULL);
-	code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
+	code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
 				  qflags, &udqp, &gdqp);
 	if (code)
 		return code;
@@ -184,8 +184,11 @@ xfs_setattr(
 	    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		lock_flags &= ~XFS_ILOCK_EXCL;
-		if (mask & ATTR_CTIME)
-			xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		if (mask & ATTR_CTIME) {
+			inode->i_mtime = inode->i_ctime =
+					current_fs_time(inode->i_sb);
+			xfs_mark_inode_dirty_sync(ip);
+		}
 		code = 0;
 		goto error_return;
 	}
@@ -961,29 +964,48 @@ xfs_release(
 		xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
 	}
 
-	if (ip->i_d.di_nlink != 0) {
-		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-		     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-		       ip->i_delayed_blks > 0)) &&
-		     (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
-		    (!(ip->i_d.di_flags &
-				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
-
-			/*
-			 * If we can't get the iolock just skip truncating
-			 * the blocks past EOF because we could deadlock
-			 * with the mmap_sem otherwise. We'll get another
-			 * chance to drop them once the last reference to
-			 * the inode is dropped, so we'll never leak blocks
-			 * permanently.
-			 */
-			error = xfs_free_eofblocks(mp, ip,
-						XFS_FREE_EOF_TRYLOCK);
-			if (error)
-				return error;
-		}
-	}
+	if (ip->i_d.di_nlink == 0)
+		return 0;
 
+	if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+	     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
+	       ip->i_delayed_blks > 0)) &&
+	     (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
+	    (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+
+		/*
+		 * If we can't get the iolock just skip truncating the blocks
+		 * past EOF because we could deadlock with the mmap_sem
+		 * otherwise. We'll get another chance to drop them once the
+		 * last reference to the inode is dropped, so we'll never leak
+		 * blocks permanently.
+		 *
+		 * Further, if the inode is being opened, written and
+		 * closed frequently and we have delayed allocation blocks
+		 * outstanding (e.g. streaming writes from the NFS server),
+		 * truncating the blocks past EOF will cause fragmentation to
+		 * occur.
+		 *
+		 * In this case don't do the truncation, either, but we have to
+		 * be careful how we detect this case. Blocks beyond EOF show
+		 * up as i_delayed_blks even when the inode is clean, so we
+		 * need to truncate them away first before checking for a dirty
+		 * release. Hence on the first dirty close we will still remove
+		 * the speculative allocation, but after that we will leave it
+		 * in place.
+		 */
+		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+			return 0;
+
+		error = xfs_free_eofblocks(mp, ip,
+					   XFS_FREE_EOF_TRYLOCK);
+		if (error)
+			return error;
 
+		/* delalloc blocks after truncation means it really is dirty */
+		if (ip->i_delayed_blks)
+			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+	}
 	return 0;
 }
 
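The rewritten xfs_release() comment above spells out the policy: speculative EOF blocks are trimmed on the first dirty close, and if delayed-allocation blocks remain afterwards the inode is flagged XFS_IDIRTY_RELEASE so later closes leave the preallocation alone, avoiding fragmentation for open-write-close workloads such as NFS streaming writes. Below is a userspace sketch of that first-dirty-close state machine, with toy fields in place of the real inode.

/*
 * Sketch: trim EOF blocks once; if delalloc remains after trimming,
 * flag the inode so later releases keep the preallocation.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
	long delayed_blks;	/* outstanding delalloc blocks */
	bool dirty_release;	/* models XFS_IDIRTY_RELEASE */
};

static void free_eofblocks(struct toy_inode *ip)
{
	/*
	 * Trim blocks past EOF.  The toy leaves delayed_blks nonzero to
	 * model a streaming writer that redirties the file right away.
	 */
	printf("truncating blocks past EOF\n");
}

static void release(struct toy_inode *ip)
{
	if (ip->dirty_release)
		return;			/* leave preallocation in place */
	free_eofblocks(ip);
	if (ip->delayed_blks)		/* still dirty after trimming */
		ip->dirty_release = true;
}

int main(void)
{
	struct toy_inode ip = { .delayed_blks = 8 };
	release(&ip);	/* first close: truncates, then sets the flag */
	release(&ip);	/* later closes: no truncation, no fragmentation */
	return 0;
}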
@@ -1253,8 +1275,7 @@ xfs_create(
 	struct xfs_name		*name,
 	mode_t			mode,
 	xfs_dev_t		rdev,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	int			is_dir = S_ISDIR(mode);
 	struct xfs_mount	*mp = dp->i_mount;
@@ -1266,7 +1287,7 @@ xfs_create(
 	boolean_t		unlock_dp_on_error = B_FALSE;
 	uint			cancel_flags;
 	int			committed;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	uint			resblks;
@@ -1279,9 +1300,9 @@ xfs_create(
 		return XFS_ERROR(EIO);
 
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -1360,7 +1381,7 @@ xfs_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 			       prid, resblks > 0, &ip, &committed);
 	if (error) {
 		if (error == ENOSPC)
@@ -1391,7 +1412,7 @@ xfs_create(
 		ASSERT(error != ENOSPC);
 		goto out_trans_abort;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	if (is_dir) {
@@ -1742,7 +1763,7 @@ xfs_remove(
 		ASSERT(error != ENOENT);
 		goto out_bmap_cancel;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	if (is_dir) {
 		/*
@@ -1880,7 +1901,7 @@ xfs_link(
 	 * the tree quota mechanism could be circumvented.
 	 */
 	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
+		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
 		error = XFS_ERROR(EXDEV);
 		goto error_return;
 	}
@@ -1895,7 +1916,7 @@ xfs_link(
 				   &first_block, &free_list, resblks);
 	if (error)
 		goto abort_return;
-	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
 	error = xfs_bumplink(tp, sip);
@@ -1933,8 +1954,7 @@ xfs_symlink(
 	struct xfs_name		*link_name,
 	const char		*target_path,
 	mode_t			mode,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t		*tp;
@@ -1955,7 +1975,7 @@ xfs_symlink(
 	int			byte_cnt;
 	int			n;
 	xfs_buf_t		*bp;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
 
@@ -1978,9 +1998,9 @@ xfs_symlink(
 
 	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = (xfs_prid_t)dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -2046,8 +2066,8 @@ xfs_symlink(
 	/*
 	 * Allocate an inode for the symlink.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
-			       1, 0, credp, prid, resblks > 0, &ip, NULL);
+	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+			       prid, resblks > 0, &ip, NULL);
 	if (error) {
 		if (error == ENOSPC)
 			goto error_return;
@@ -2129,7 +2149,7 @@ xfs_symlink(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	/*
@@ -2272,7 +2292,7 @@ xfs_alloc_file_space(
 	count = len;
 	imapp = &imaps[0];
 	nimaps = 1;
-	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+	bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
@@ -2431,9 +2451,9 @@ xfs_zero_remaining_bytes(
 	if (endoff > ip->i_size)
 		endoff = ip->i_size;
 
-	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
-				XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp);
+	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+					mp->m_rtdev_targp : mp->m_ddev_targp,
+				  mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
 	if (!bp)
 		return XFS_ERROR(ENOMEM);
 
@@ -2459,7 +2479,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2472,7 +2492,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2711,6 +2731,7 @@ xfs_change_file_space(
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
+	int		prealloc_type;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
@@ -2753,12 +2774,17 @@ xfs_change_file_space(
 	 * size to be changed.
 	 */
 	setprealloc = clrprealloc = 0;
+	prealloc_type = XFS_BMAPI_PREALLOC;
 
 	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+		prealloc_type |= XFS_BMAPI_CONVERT;
+		xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
+		/* FALLTHRU */
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-						1, attr_flags);
+						prealloc_type, attr_flags);
 		if (error)
 			return error;
 		setprealloc = 1;
@@ -2827,7 +2853,7 @@ xfs_change_file_space(
 		if (ip->i_d.di_mode & S_IXGRP)
 			ip->i_d.di_mode &= ~S_ISGID;
 
-		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	}
 	if (setprealloc)
 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
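The XFS_IOC_ZERO_RANGE case added above ORs an extra conversion flag into prealloc_type and tosses the cached pages, then deliberately falls through into the shared RESVSP allocation path, which now receives prealloc_type instead of a hard-coded 1. A compact sketch of that flag-accumulating fallthrough; the command and flag values are made up for illustration.

/*
 * Sketch: ZERO_RANGE adds a conversion flag, then shares the RESVSP
 * allocation path via an intentional switch fallthrough.
 */
#include <stdio.h>

enum { CMD_RESVSP, CMD_ZERO_RANGE };
#define FLAG_PREALLOC	0x1
#define FLAG_CONVERT	0x2

static void alloc_file_space(int flags)
{
	printf("allocating with flags 0x%x\n", flags);
}

static void change_file_space(int cmd)
{
	int prealloc_type = FLAG_PREALLOC;

	switch (cmd) {
	case CMD_ZERO_RANGE:
		prealloc_type |= FLAG_CONVERT;	/* also convert existing extents */
		/* FALLTHRU */
	case CMD_RESVSP:
		alloc_file_space(prealloc_type);
		break;
	}
}

int main(void)
{
	change_file_space(CMD_RESVSP);	   /* flags 0x1 */
	change_file_space(CMD_ZERO_RANGE); /* flags 0x3 */
	return 0;
}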
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d8dfa8d0dadd..f6702927eee4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,7 +2,6 @@
 #define _XFS_VNODEOPS_H 1
 
 struct attrlist_cursor_kern;
-struct cred;
 struct file;
 struct iattr;
 struct inode;
@@ -26,7 +25,7 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode **ipp, struct xfs_name *ci_name);
 int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
-		xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp);
+		xfs_dev_t rdev, struct xfs_inode **ipp);
 int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -34,8 +33,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
 		xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-		const char *target_path, mode_t mode, struct xfs_inode **ipp,
-		cred_t *credp);
+		const char *target_path, mode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);